From 8d70de80f475ec26f28105112314f0e5f6435aa9 Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Wed, 8 Jan 2020 16:06:59 +0100 Subject: [PATCH 1/4] Add Thanos Ruler alerts Signed-off-by: Kemal Akkoyun --- examples/alerts/alerts.md | 91 ++++++++--- examples/alerts/alerts.yaml | 77 +++++++++ examples/dashboards/ruler.json | 198 ++++++++++++++++++++++-- mixin/thanos/alerts/alerts.libsonnet | 1 + mixin/thanos/alerts/ruler.libsonnet | 121 +++++++++++++++ mixin/thanos/dashboards/ruler.libsonnet | 17 ++ 6 files changed, 467 insertions(+), 38 deletions(-) create mode 100644 mixin/thanos/alerts/ruler.libsonnet diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 14367defa7..991642d45a 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -64,39 +64,84 @@ rules: For Thanos ruler we run some alerts in local Prometheus, to make sure that Thanos Rule is working: -[//]: # "TODO(kakkoyun): Generate rule rules using thanos-mixin." - +[embedmd]:# (../tmp/thanos-ruler.rules.yaml yaml) ```yaml -- alert: ThanosRuleIsDown - expr: up{app="thanos-ruler"} == 0 or absent(up{app="thanos-ruler"}) +name: thanos-ruler.rules +rules: +- alert: ThanosRulerQueueIsDroppingAlerts + annotations: + message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts. + expr: | + sum by (job) (thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0 for: 5m labels: - team: TEAM + severity: critical +- alert: ThanosRulerSenderIsFailingAlerts annotations: - summary: Thanos Rule is down - impact: Alerts are not working - action: 'check {{ $labels.kubernetes_pod_name }} pod in {{ $labels.kubernetes_namespace}} namespace' - dashboard: RULE_DASHBOARD -- alert: ThanosRuleIsDroppingAlerts - expr: rate(thanos_alert_queue_alerts_dropped_total{app="thanos-ruler"}[5m]) > 0 + message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts + to alertmanager. + expr: | + sum by (job) (thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0 for: 5m labels: - team: TEAM + severity: critical +- alert: ThanosRulerHighRuleExaluationFailures annotations: - summary: Thanos Rule is dropping alerts - impact: Alerts are not working - action: 'check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace' - dashboard: RULE_DASHBOARD -- alert: ThanosRuleGrpcErrorRate - expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable",app="thanos-ruler"}[5m]) > 0 + message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to evaluate rules. + expr: | + ( + sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-ruler.*"}[5m])) + / + sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-ruler.*"}[5m])) + * 100 > 5 + ) for: 5m labels: - team: TEAM + severity: warning +- alert: ThanosRulerHighRuleExaluationWarnings annotations: - summary: Thanos Rule is returning Internal/Unavailable errors - impact: Recording Rules are not working - action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace - dashboard: RULE_DASHBOARD + message: Thanos Ruler {{$labels.job}} {{$labels.pod}} has high number of evaluation + warnings. + expr: | + sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-ruler.*"}[5m])) > 0 + for: 5m + labels: + severity: warning +- alert: ThanosRulerRuleEvaluationLatencyHigh + annotations: + message: Thanos Ruler {{$labels.job}} has higher evaluation latency than interval + for {{$labels.rule_group}}. + expr: | + ( + sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-receiver.*"}) + > + sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-receiver.*"}) + ) + for: 5m + labels: + severity: warning +- alert: ThanosRulerGrpcErrorRate + annotations: + message: Thanos Ruler {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. + expr: | + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-ruler.*"}[5m])) + / + sum by (job) (rate(grpc_server_started_total{job=~"thanos-ruler.*"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning +- alert: ThanosRulerConfigReloadFailure + annotations: + message: Thanos Ruler {{$labels.job}} has not been able to reload its configuration. + expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-ruler.*"}) by (job) + != 1 + for: 5m + labels: + severity: warning ``` ## Store Gateway diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 23f10b655f..ccd54aaef5 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -277,6 +277,83 @@ groups: for: 10m labels: severity: warning +- name: thanos-ruler.rules + rules: + - alert: ThanosRulerQueueIsDroppingAlerts + annotations: + message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts. + expr: | + sum by (job) (thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0 + for: 5m + labels: + severity: critical + - alert: ThanosRulerSenderIsFailingAlerts + annotations: + message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts + to alertmanager. + expr: | + sum by (job) (thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0 + for: 5m + labels: + severity: critical + - alert: ThanosRulerHighRuleExaluationFailures + annotations: + message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to evaluate + rules. + expr: | + ( + sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-ruler.*"}[5m])) + / + sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-ruler.*"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning + - alert: ThanosRulerHighRuleExaluationWarnings + annotations: + message: Thanos Ruler {{$labels.job}} {{$labels.pod}} has high number of evaluation + warnings. + expr: | + sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-ruler.*"}[5m])) > 0 + for: 5m + labels: + severity: warning + - alert: ThanosRulerRuleEvaluationLatencyHigh + annotations: + message: Thanos Ruler {{$labels.job}} has higher evaluation latency than interval + for {{$labels.rule_group}}. + expr: | + ( + sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-receiver.*"}) + > + sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-receiver.*"}) + ) + for: 5m + labels: + severity: warning + - alert: ThanosRulerGrpcErrorRate + annotations: + message: Thanos Ruler {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. + expr: | + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-ruler.*"}[5m])) + / + sum by (job) (rate(grpc_server_started_total{job=~"thanos-ruler.*"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning + - alert: ThanosRulerConfigReloadFailure + annotations: + message: Thanos Ruler {{$labels.job}} has not been able to reload its configuration. + expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-ruler.*"}) by + (job) != 1 + for: 5m + labels: + severity: warning - name: thanos-component-absent.rules rules: - alert: ThanosCompactorIsDown diff --git a/examples/dashboards/ruler.json b/examples/dashboards/ruler.json index fa2596da4a..6dc5f85a25 100644 --- a/examples/dashboards/ruler.json +++ b/examples/dashboards/ruler.json @@ -347,6 +347,174 @@ "title": "Alert Sent", "titleSize": "h6" }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of queued alerts.", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_alert_queue_alerts_dropped_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Push Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of dropped alerts compared to the total number of queued alerts.", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_alert_queue_alerts_dropped_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_alert_queue_alerts_pushed_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Drop Ratio", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alert Queue", + "titleSize": "h6" + }, { "collapse": false, "height": "250px", @@ -378,7 +546,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests.", "fill": 10, - "id": 5, + "id": 7, "legend": { "avg": false, "current": false, @@ -457,7 +625,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 6, + "id": 8, "legend": { "avg": false, "current": false, @@ -534,7 +702,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles.", "fill": 1, - "id": 7, + "id": 9, "legend": { "avg": false, "current": false, @@ -639,7 +807,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests.", "fill": 10, - "id": 8, + "id": 10, "legend": { "avg": false, "current": false, @@ -716,7 +884,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 9, + "id": 11, "legend": { "avg": false, "current": false, @@ -793,7 +961,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles.", "fill": 1, - "id": 10, + "id": 12, "legend": { "avg": false, "current": false, @@ -917,7 +1085,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Streamed gRPC requests.", "fill": 10, - "id": 11, + "id": 13, "legend": { "avg": false, "current": false, @@ -996,7 +1164,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 12, + "id": 14, "legend": { "avg": false, "current": false, @@ -1073,7 +1241,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles", "fill": 1, - "id": 13, + "id": 15, "legend": { "avg": false, "current": false, @@ -1178,7 +1346,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Streamed gRPC requests.", "fill": 10, - "id": 14, + "id": 16, "legend": { "avg": false, "current": false, @@ -1255,7 +1423,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 15, + "id": 17, "legend": { "avg": false, "current": false, @@ -1332,7 +1500,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles", "fill": 1, - "id": 16, + "id": 18, "legend": { "avg": false, "current": false, @@ -1436,7 +1604,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 17, + "id": 19, "legend": { "avg": false, "current": false, @@ -1552,7 +1720,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 18, + "id": 20, "legend": { "avg": false, "current": false, @@ -1628,7 +1796,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 19, + "id": 21, "legend": { "avg": false, "current": false, diff --git a/mixin/thanos/alerts/alerts.libsonnet b/mixin/thanos/alerts/alerts.libsonnet index 0eb63dc98d..e3fa004090 100644 --- a/mixin/thanos/alerts/alerts.libsonnet +++ b/mixin/thanos/alerts/alerts.libsonnet @@ -3,4 +3,5 @@ (import 'receiver.libsonnet') + (import 'sidecar.libsonnet') + (import 'store.libsonnet') + +(import 'ruler.libsonnet') + (import 'absent.libsonnet') diff --git a/mixin/thanos/alerts/ruler.libsonnet b/mixin/thanos/alerts/ruler.libsonnet new file mode 100644 index 0000000000..949dcbb55d --- /dev/null +++ b/mixin/thanos/alerts/ruler.libsonnet @@ -0,0 +1,121 @@ +{ + local thanos = self, + ruler+:: { + jobPrefix: error 'must provide job prefix for Thanos Ruler alerts', + selector: error 'must provide selector for Thanos Ruler alerts', + }, + prometheusAlerts+:: { + groups+: [ + { + name: 'thanos-ruler.rules', + rules: [ + { + alert: 'ThanosRulerQueueIsDroppingAlerts', + annotations: { + message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.', + }, + expr: ||| + sum by (job) (thanos_alert_queue_alerts_dropped_total{%(selector)s}) > 0 + ||| % thanos.ruler, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'ThanosRulerSenderIsFailingAlerts', + annotations: { + message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.', + }, + expr: ||| + sum by (job) (thanos_alert_sender_alerts_dropped_total{%(selector)s}) > 0 + ||| % thanos.ruler, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'ThanosRulerHighRuleExaluationFailures', + annotations: { + message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.', + }, + expr: ||| + ( + sum by (job) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[5m])) + / + sum by (job) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m])) + * 100 > 5 + ) + ||| % thanos.ruler, + + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosRulerHighRuleExaluationWarnings', + annotations: { + message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.', + }, + expr: ||| + sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{%(selector)s}[5m])) > 0 + ||| % thanos.ruler, + + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosRulerRuleEvaluationLatencyHigh', + annotations: { + message: 'Thanos Ruler {{$labels.job}} has higher evaluation latency than interval for {{$labels.rule_group}}.', + }, + expr: ||| + ( + sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s}) + > + sum by (job, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s}) + ) + ||| % thanos.receiver, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosRulerGrpcErrorRate', + annotations: { + message: 'Thanos Ruler {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + }, + expr: ||| + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) + / + sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m])) + * 100 > 5 + ) + ||| % thanos.ruler, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosRulerConfigReloadFailure', + annotations: { + message: 'Thanos Ruler {{$labels.job}} has not been able to reload its configuration.', + }, + expr: 'avg(thanos_rule_config_last_reload_successful{%(selector)s}) by (job) != 1' % thanos.ruler, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/mixin/thanos/dashboards/ruler.libsonnet b/mixin/thanos/dashboards/ruler.libsonnet index 941c38b417..067e4d1af2 100644 --- a/mixin/thanos/dashboards/ruler.libsonnet +++ b/mixin/thanos/dashboards/ruler.libsonnet @@ -39,6 +39,23 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.latencyPanel('thanos_alert_sender_latency_seconds', 'namespace="$namespace",job=~"$job"'), ) ) + .addRow( + g.row('Alert Queue') + .addPanel( + g.panel('Push Rate', 'Shows rate of queued alerts.') + + g.queryPanel( + 'sum(rate(thanos_alert_queue_alerts_dropped_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, pod)', + '{{job}} {{pod}}' + ) + ) + .addPanel( + g.panel('Drop Ratio', 'Shows ratio of dropped alerts compared to the total number of queued alerts.') + + g.qpsErrTotalPanel( + 'thanos_alert_queue_alerts_dropped_total{namespace="$namespace",job=~"$job"}', + 'thanos_alert_queue_alerts_pushed_total{namespace="$namespace",job=~"$job"}', + ) + ) + ) .addRow( g.row('gRPC (Unary)') .addPanel( From 0f984d7b6c82983dcf4ede80f9e1ec505eb3f775 Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Wed, 8 Jan 2020 16:21:36 +0100 Subject: [PATCH 2/4] Fix wrong job selector Signed-off-by: Kemal Akkoyun --- examples/alerts/alerts.md | 4 ++-- examples/alerts/alerts.yaml | 4 ++-- mixin/thanos/alerts/ruler.libsonnet | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 991642d45a..12ed917c66 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -113,9 +113,9 @@ rules: for {{$labels.rule_group}}. expr: | ( - sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-receiver.*"}) + sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"}) > - sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-receiver.*"}) + sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"}) ) for: 5m labels: diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index ccd54aaef5..f468cca8fa 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -325,9 +325,9 @@ groups: for {{$labels.rule_group}}. expr: | ( - sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-receiver.*"}) + sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"}) > - sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-receiver.*"}) + sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"}) ) for: 5m labels: diff --git a/mixin/thanos/alerts/ruler.libsonnet b/mixin/thanos/alerts/ruler.libsonnet index 949dcbb55d..8bcff296d1 100644 --- a/mixin/thanos/alerts/ruler.libsonnet +++ b/mixin/thanos/alerts/ruler.libsonnet @@ -79,7 +79,7 @@ > sum by (job, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s}) ) - ||| % thanos.receiver, + ||| % thanos.ruler, 'for': '5m', labels: { severity: 'warning', From d6f184a74a661662c2e83ba95eee2026080198e6 Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Wed, 8 Jan 2020 16:25:51 +0100 Subject: [PATCH 3/4] Add pod label as aggregator for evaluation latency query Signed-off-by: Kemal Akkoyun --- examples/alerts/alerts.md | 8 ++++---- examples/alerts/alerts.yaml | 8 ++++---- mixin/thanos/alerts/ruler.libsonnet | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 12ed917c66..2c8d13c30c 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -109,13 +109,13 @@ rules: severity: warning - alert: ThanosRulerRuleEvaluationLatencyHigh annotations: - message: Thanos Ruler {{$labels.job}} has higher evaluation latency than interval - for {{$labels.rule_group}}. + message: Thanos Ruler {{$labels.job}}/{{$labels.pod}} has higher evaluation latency + than interval for {{$labels.rule_group}}. expr: | ( - sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"}) + sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"}) > - sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"}) + sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"}) ) for: 5m labels: diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index f468cca8fa..66985f5173 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -321,13 +321,13 @@ groups: severity: warning - alert: ThanosRulerRuleEvaluationLatencyHigh annotations: - message: Thanos Ruler {{$labels.job}} has higher evaluation latency than interval - for {{$labels.rule_group}}. + message: Thanos Ruler {{$labels.job}}/{{$labels.pod}} has higher evaluation + latency than interval for {{$labels.rule_group}}. expr: | ( - sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"}) + sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"}) > - sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"}) + sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"}) ) for: 5m labels: diff --git a/mixin/thanos/alerts/ruler.libsonnet b/mixin/thanos/alerts/ruler.libsonnet index 8bcff296d1..4a0dacacd6 100644 --- a/mixin/thanos/alerts/ruler.libsonnet +++ b/mixin/thanos/alerts/ruler.libsonnet @@ -71,13 +71,13 @@ { alert: 'ThanosRulerRuleEvaluationLatencyHigh', annotations: { - message: 'Thanos Ruler {{$labels.job}} has higher evaluation latency than interval for {{$labels.rule_group}}.', + message: 'Thanos Ruler {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.', }, expr: ||| ( - sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s}) + sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s}) > - sum by (job, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s}) + sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s}) ) ||| % thanos.ruler, 'for': '5m', From 453d15e51b2a1a773f8131a9e4e60c47acbcfba4 Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Wed, 8 Jan 2020 17:09:25 +0100 Subject: [PATCH 4/4] Fix review issues Signed-off-by: Kemal Akkoyun --- examples/alerts/alerts.md | 6 +++--- examples/alerts/alerts.yaml | 6 +++--- mixin/thanos/alerts/ruler.libsonnet | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 2c8d13c30c..41e135b24c 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -72,7 +72,7 @@ rules: annotations: message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts. expr: | - sum by (job) (thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0 + sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0 for: 5m labels: severity: critical @@ -81,7 +81,7 @@ rules: message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager. expr: | - sum by (job) (thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0 + sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0 for: 5m labels: severity: critical @@ -104,7 +104,7 @@ rules: warnings. expr: | sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-ruler.*"}[5m])) > 0 - for: 5m + for: 15m labels: severity: warning - alert: ThanosRulerRuleEvaluationLatencyHigh diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 66985f5173..0eb4fda1d1 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -283,7 +283,7 @@ groups: annotations: message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts. expr: | - sum by (job) (thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0 + sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0 for: 5m labels: severity: critical @@ -292,7 +292,7 @@ groups: message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager. expr: | - sum by (job) (thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0 + sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0 for: 5m labels: severity: critical @@ -316,7 +316,7 @@ groups: warnings. expr: | sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-ruler.*"}[5m])) > 0 - for: 5m + for: 15m labels: severity: warning - alert: ThanosRulerRuleEvaluationLatencyHigh diff --git a/mixin/thanos/alerts/ruler.libsonnet b/mixin/thanos/alerts/ruler.libsonnet index 4a0dacacd6..08de10a23b 100644 --- a/mixin/thanos/alerts/ruler.libsonnet +++ b/mixin/thanos/alerts/ruler.libsonnet @@ -15,7 +15,7 @@ message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.', }, expr: ||| - sum by (job) (thanos_alert_queue_alerts_dropped_total{%(selector)s}) > 0 + sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{%(selector)s}[5m])) > 0 ||| % thanos.ruler, 'for': '5m', labels: { @@ -28,7 +28,7 @@ message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.', }, expr: ||| - sum by (job) (thanos_alert_sender_alerts_dropped_total{%(selector)s}) > 0 + sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{%(selector)s}[5m])) > 0 ||| % thanos.ruler, 'for': '5m', labels: { @@ -63,7 +63,7 @@ sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{%(selector)s}[5m])) > 0 ||| % thanos.ruler, - 'for': '5m', + 'for': '15m', labels: { severity: 'warning', },