From c9e9d5d74fcfed3789189d79879797f505ab0c0b Mon Sep 17 00:00:00 2001
From: gabemontero
Date: Thu, 25 Jul 2024 12:01:00 -0400
Subject: [PATCH] SRVKP-4532: factor k8s throttling into task panel

Also:
- fix tekton-config.yaml yamllint error
- widen controller restart query
- make osp pruner cfg consistent with konflux prod

rh-pre-commit.version: 2.3.0
rh-pre-commit.check-secrets: ENABLED
---
 .../pipeline-service-dashboard.json        |  6 +++---
 .../openshift-pipelines/tekton-config.yaml | 19 +++++++++----------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/operator/gitops/argocd/grafana/dashboards/pipeline-service-dashboard.json b/operator/gitops/argocd/grafana/dashboards/pipeline-service-dashboard.json
index 59b46431f..918b559d9 100644
--- a/operator/gitops/argocd/grafana/dashboards/pipeline-service-dashboard.json
+++ b/operator/gitops/argocd/grafana/dashboards/pipeline-service-dashboard.json
@@ -227,7 +227,7 @@
       "type": "stat"
     },
     {
-      "description": "The number of times the pipelines controller has restarted",
+      "description": "The number of times any of the pipelines controllers have restarted",
       "fieldConfig": {
         "defaults": {
           "color": {
@@ -281,7 +281,7 @@
       "targets": [
         {
           "editorMode": "code",
-          "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"openshift-pipelines\", pod=~\"tekton-pipelines-controller-.*\"}[2m]))",
+          "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"openshift-pipelines\", pod=~\"tekton-.*\"}[2m]))",
          "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -409,7 +409,7 @@
       "targets": [
         {
           "editorMode": "code",
-          "expr": "sum(increase(taskrun_pod_create_not_attempted_or_pending_count[2m]))",
+          "expr": "sum(increase(taskrun_pod_create_not_attempted_or_pending_count[2m])) - sum(increase(tekton_pipelines_controller_running_taskruns_throttled_by_quota[2m])) - sum(increase(tekton_pipelines_controller_running_taskruns_throttled_by_node[2m]))",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
diff --git a/operator/gitops/argocd/pipeline-service/openshift-pipelines/tekton-config.yaml b/operator/gitops/argocd/pipeline-service/openshift-pipelines/tekton-config.yaml
index 32429e2cc..643050aad 100644
--- a/operator/gitops/argocd/pipeline-service/openshift-pipelines/tekton-config.yaml
+++ b/operator/gitops/argocd/pipeline-service/openshift-pipelines/tekton-config.yaml
@@ -190,13 +190,12 @@ spec:
       kube-api-qps: 50
       kube-api-burst: 50
   pruner:
-      # The load on prod-rh01 is to the point now where tekton-results
-      # can fall too far behind. Until the watcher's log storage is rewritten
-      # etc with SRVKP-4347 or if we risk adding more processing power (threads,qps,burst)
-      # to the mem leak version of the watcher, we need the OSP pruner as a backup.
-      # a bit of an adjustment, we will prune once an hour now per https://crontab.guru/every-1-hour
-      # to line up with typical timeout settings.
-      keep: 10
-      resources:
-        - pipelinerun
-      schedule: 0 * * * *
+    # The load on prod-rh01 is to the point now where tekton-results
+    # can fall too far behind. Until the watcher's log storage is rewritten
+    # etc with SRVKP-4347 or if we risk adding more processing power (threads,qps,burst)
+    # to the mem leak version of the watcher, we need the OSP pruner as a backup.
+    disable: false
+    keep-since: 60
+    resources:
+      - pipelinerun
+    schedule: "*/10 * * * *"
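
Note for reviewers: the new expression for the TaskRun "not attempted or
pending" panel is a single line in the dashboard JSON; it is reflowed here
purely for readability (the JSON in the diff above is what ships):

    sum(increase(taskrun_pod_create_not_attempted_or_pending_count[2m]))
    - sum(increase(tekton_pipelines_controller_running_taskruns_throttled_by_quota[2m]))
    - sum(increase(tekton_pipelines_controller_running_taskruns_throttled_by_node[2m]))

Per the subject line, the two subtracted sums factor out TaskRuns the
controller reports as throttled by resource quota or by node constraints, so
the panel is left showing pending pods that k8s throttling does not already
explain.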
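
For reference, a sketch of how the pruner stanza in tekton-config.yaml reads
once this patch is applied (assembled from the + lines above; exact
indentation follows the file's surrounding spec layout):

    pruner:
      # The load on prod-rh01 is to the point now where tekton-results
      # can fall too far behind. Until the watcher's log storage is rewritten
      # etc with SRVKP-4347 or if we risk adding more processing power (threads,qps,burst)
      # to the mem leak version of the watcher, we need the OSP pruner as a backup.
      disable: false
      keep-since: 60
      resources:
        - pipelinerun
      schedule: "*/10 * * * *"

Assuming keep-since is expressed in minutes, as with tkn's --keep-since flag,
this keeps roughly the last hour of PipelineRuns while the prune job itself
fires every 10 minutes per the cron schedule, in line with the "consistent
with konflux prod" note in the commit message.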