From 8d70de80f475ec26f28105112314f0e5f6435aa9 Mon Sep 17 00:00:00 2001
From: Kemal Akkoyun <kakkoyun@gmail.com>
Date: Wed, 8 Jan 2020 16:06:59 +0100
Subject: [PATCH 1/4] Add Thanos Ruler alerts and Alert Queue dashboard row

Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
---
 examples/alerts/alerts.md               |  91 ++++++++---
 examples/alerts/alerts.yaml             |  77 +++++++++
 examples/dashboards/ruler.json          | 198 ++++++++++++++++++++++--
 mixin/thanos/alerts/alerts.libsonnet    |   1 +
 mixin/thanos/alerts/ruler.libsonnet     | 121 +++++++++++++++
 mixin/thanos/dashboards/ruler.libsonnet |  17 ++
 6 files changed, 467 insertions(+), 38 deletions(-)
 create mode 100644 mixin/thanos/alerts/ruler.libsonnet

diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index 14367defa7..991642d45a 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -64,39 +64,84 @@ rules:
 
 For Thanos ruler we run some alerts in local Prometheus, to make sure that Thanos Rule is working:
 
-[//]: # "TODO(kakkoyun): Generate rule rules using thanos-mixin."
-<!-- [embedmd]:# (../tmp/thanos-ruler.rules.yaml yaml) -->
+[embedmd]:# (../tmp/thanos-ruler.rules.yaml yaml)
 ```yaml
-- alert: ThanosRuleIsDown
-  expr: up{app="thanos-ruler"} == 0 or absent(up{app="thanos-ruler"})
+name: thanos-ruler.rules
+rules:
+- alert: ThanosRulerQueueIsDroppingAlerts
+  annotations:
+    message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.
+  expr: |
+    sum by (job) (thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0
   for: 5m
   labels:
-    team: TEAM
+    severity: critical
+- alert: ThanosRulerSenderIsFailingAlerts
   annotations:
-    summary: Thanos Rule is down
-    impact: Alerts are not working
-    action: 'check {{ $labels.kubernetes_pod_name }} pod in {{ $labels.kubernetes_namespace}} namespace'
-    dashboard: RULE_DASHBOARD
-- alert: ThanosRuleIsDroppingAlerts
-  expr: rate(thanos_alert_queue_alerts_dropped_total{app="thanos-ruler"}[5m]) > 0
+    message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts
+      to alertmanager.
+  expr: |
+    sum by (job) (thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0
   for: 5m
   labels:
-    team: TEAM
+    severity: critical
+- alert: ThanosRulerHighRuleEvaluationFailures
   annotations:
-    summary: Thanos Rule is dropping alerts
-    impact: Alerts are not working
-    action: 'check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace'
-    dashboard: RULE_DASHBOARD
-- alert: ThanosRuleGrpcErrorRate
-  expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable",app="thanos-ruler"}[5m]) > 0
+    message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.
+  expr: |
+    (
+      sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-ruler.*"}[5m]))
+    /
+      sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-ruler.*"}[5m]))
+    * 100 > 5
+    )
   for: 5m
   labels:
-    team: TEAM
+    severity: warning
+- alert: ThanosRulerHighRuleEvaluationWarnings
   annotations:
-    summary: Thanos Rule is returning Internal/Unavailable errors
-    impact: Recording Rules are not working
-    action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace
-    dashboard: RULE_DASHBOARD
+    message: Thanos Ruler {{$labels.job}} {{$labels.pod}} has high number of evaluation
+      warnings.
+  expr: |
+    sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-ruler.*"}[5m])) > 0
+  for: 5m
+  labels:
+    severity: warning
+- alert: ThanosRulerRuleEvaluationLatencyHigh
+  annotations:
+    message: Thanos Ruler {{$labels.job}} has higher evaluation latency than interval
+      for {{$labels.rule_group}}.
+  expr: |
+    (
+      sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-receiver.*"})
+    >
+      sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-receiver.*"})
+    )
+  for: 5m
+  labels:
+    severity: warning
+- alert: ThanosRulerGrpcErrorRate
+  annotations:
+    message: Thanos Ruler {{$labels.job}} is failing to handle {{ $value | humanize
+      }}% of requests.
+  expr: |
+    (
+      sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-ruler.*"}[5m]))
+    /
+      sum by (job) (rate(grpc_server_started_total{job=~"thanos-ruler.*"}[5m]))
+    * 100 > 5
+    )
+  for: 5m
+  labels:
+    severity: warning
+- alert: ThanosRulerConfigReloadFailure
+  annotations:
+    message: Thanos Ruler {{$labels.job}} has not been able to reload its configuration.
+  expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-ruler.*"}) by (job)
+    != 1
+  for: 5m
+  labels:
+    severity: warning
 ```
 
 ## Store Gateway
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index 23f10b655f..ccd54aaef5 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -277,6 +277,83 @@ groups:
     for: 10m
     labels:
       severity: warning
+- name: thanos-ruler.rules
+  rules:
+  - alert: ThanosRulerQueueIsDroppingAlerts
+    annotations:
+      message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.
+    expr: |
+      sum by (job) (thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0
+    for: 5m
+    labels:
+      severity: critical
+  - alert: ThanosRulerSenderIsFailingAlerts
+    annotations:
+      message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts
+        to alertmanager.
+    expr: |
+      sum by (job) (thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0
+    for: 5m
+    labels:
+      severity: critical
+  - alert: ThanosRulerHighRuleEvaluationFailures
+    annotations:
+      message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to evaluate
+        rules.
+    expr: |
+      (
+        sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-ruler.*"}[5m]))
+      /
+        sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-ruler.*"}[5m]))
+      * 100 > 5
+      )
+    for: 5m
+    labels:
+      severity: warning
+  - alert: ThanosRulerHighRuleEvaluationWarnings
+    annotations:
+      message: Thanos Ruler {{$labels.job}} {{$labels.pod}} has high number of evaluation
+        warnings.
+    expr: |
+      sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-ruler.*"}[5m])) > 0
+    for: 5m
+    labels:
+      severity: warning
+  - alert: ThanosRulerRuleEvaluationLatencyHigh
+    annotations:
+      message: Thanos Ruler {{$labels.job}} has higher evaluation latency than interval
+        for {{$labels.rule_group}}.
+    expr: |
+      (
+        sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-receiver.*"})
+      >
+        sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-receiver.*"})
+      )
+    for: 5m
+    labels:
+      severity: warning
+  - alert: ThanosRulerGrpcErrorRate
+    annotations:
+      message: Thanos Ruler {{$labels.job}} is failing to handle {{ $value | humanize
+        }}% of requests.
+    expr: |
+      (
+        sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-ruler.*"}[5m]))
+      /
+        sum by (job) (rate(grpc_server_started_total{job=~"thanos-ruler.*"}[5m]))
+      * 100 > 5
+      )
+    for: 5m
+    labels:
+      severity: warning
+  - alert: ThanosRulerConfigReloadFailure
+    annotations:
+      message: Thanos Ruler {{$labels.job}} has not been able to reload its configuration.
+    expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-ruler.*"}) by
+      (job) != 1
+    for: 5m
+    labels:
+      severity: warning
 - name: thanos-component-absent.rules
   rules:
   - alert: ThanosCompactorIsDown
diff --git a/examples/dashboards/ruler.json b/examples/dashboards/ruler.json
index fa2596da4a..6dc5f85a25 100644
--- a/examples/dashboards/ruler.json
+++ b/examples/dashboards/ruler.json
@@ -347,6 +347,174 @@
          "title": "Alert Sent",
          "titleSize": "h6"
       },
+      {
+         "collapse": false,
+         "height": "250px",
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "description": "Shows rate of queued alerts.",
+               "fill": 1,
+               "id": 5,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "sum(rate(thanos_alert_queue_alerts_pushed_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, pod)",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{job}} {{pod}}",
+                     "legendLink": null,
+                     "step": 10
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Push Rate",
+               "tooltip": {
+                  "shared": false,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
+            },
+            {
+               "aliasColors": {
+                  "error": "#E24D42"
+               },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "description": "Shows ratio of dropped alerts compared to the total number of queued alerts.",
+               "fill": 10,
+               "id": 6,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 0,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "sum(rate(thanos_alert_queue_alerts_dropped_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_alert_queue_alerts_pushed_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "error",
+                     "refId": "A",
+                     "step": 10
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Drop Ratio",
+               "tooltip": {
+                  "shared": false,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Alert Queue",
+         "titleSize": "h6"
+      },
       {
          "collapse": false,
          "height": "250px",
@@ -378,7 +546,7 @@
                "datasource": "$datasource",
                "description": "Shows rate of handled Unary gRPC requests.",
                "fill": 10,
-               "id": 5,
+               "id": 7,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -457,7 +625,7 @@
                "datasource": "$datasource",
                "description": "Shows ratio of errors compared to the total number of handled requests.",
                "fill": 10,
-               "id": 6,
+               "id": 8,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -534,7 +702,7 @@
                "datasource": "$datasource",
                "description": "Shows how long has it taken to handle requests, in quantiles.",
                "fill": 1,
-               "id": 7,
+               "id": 9,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -639,7 +807,7 @@
                "datasource": "$datasource",
                "description": "Shows rate of handled Unary gRPC requests.",
                "fill": 10,
-               "id": 8,
+               "id": 10,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -716,7 +884,7 @@
                "datasource": "$datasource",
                "description": "Shows ratio of errors compared to the total number of handled requests.",
                "fill": 10,
-               "id": 9,
+               "id": 11,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -793,7 +961,7 @@
                "datasource": "$datasource",
                "description": "Shows how long has it taken to handle requests, in quantiles.",
                "fill": 1,
-               "id": 10,
+               "id": 12,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -917,7 +1085,7 @@
                "datasource": "$datasource",
                "description": "Shows rate of handled Streamed gRPC requests.",
                "fill": 10,
-               "id": 11,
+               "id": 13,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -996,7 +1164,7 @@
                "datasource": "$datasource",
                "description": "Shows ratio of errors compared to the total number of handled requests.",
                "fill": 10,
-               "id": 12,
+               "id": 14,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -1073,7 +1241,7 @@
                "datasource": "$datasource",
                "description": "Shows how long has it taken to handle requests, in quantiles",
                "fill": 1,
-               "id": 13,
+               "id": 15,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -1178,7 +1346,7 @@
                "datasource": "$datasource",
                "description": "Shows rate of handled Streamed gRPC requests.",
                "fill": 10,
-               "id": 14,
+               "id": 16,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -1255,7 +1423,7 @@
                "datasource": "$datasource",
                "description": "Shows ratio of errors compared to the total number of handled requests.",
                "fill": 10,
-               "id": 15,
+               "id": 17,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -1332,7 +1500,7 @@
                "datasource": "$datasource",
                "description": "Shows how long has it taken to handle requests, in quantiles",
                "fill": 1,
-               "id": 16,
+               "id": 18,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -1436,7 +1604,7 @@
                "dashes": false,
                "datasource": "$datasource",
                "fill": 1,
-               "id": 17,
+               "id": 19,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -1552,7 +1720,7 @@
                "dashes": false,
                "datasource": "$datasource",
                "fill": 1,
-               "id": 18,
+               "id": 20,
                "legend": {
                   "avg": false,
                   "current": false,
@@ -1628,7 +1796,7 @@
                "dashes": false,
                "datasource": "$datasource",
                "fill": 1,
-               "id": 19,
+               "id": 21,
                "legend": {
                   "avg": false,
                   "current": false,
diff --git a/mixin/thanos/alerts/alerts.libsonnet b/mixin/thanos/alerts/alerts.libsonnet
index 0eb63dc98d..e3fa004090 100644
--- a/mixin/thanos/alerts/alerts.libsonnet
+++ b/mixin/thanos/alerts/alerts.libsonnet
@@ -3,4 +3,5 @@
 (import 'receiver.libsonnet') +
 (import 'sidecar.libsonnet') +
 (import 'store.libsonnet') +
+(import 'ruler.libsonnet') +
 (import 'absent.libsonnet')
diff --git a/mixin/thanos/alerts/ruler.libsonnet b/mixin/thanos/alerts/ruler.libsonnet
new file mode 100644
index 0000000000..949dcbb55d
--- /dev/null
+++ b/mixin/thanos/alerts/ruler.libsonnet
@@ -0,0 +1,121 @@
+{
+  local thanos = self,
+  ruler+:: {
+    jobPrefix: error 'must provide job prefix for Thanos Ruler alerts',
+    selector: error 'must provide selector for Thanos Ruler alerts',
+  },
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'thanos-ruler.rules',
+        rules: [
+          {
+            alert: 'ThanosRulerQueueIsDroppingAlerts',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.',
+            },
+            expr: |||
+              sum by (job) (thanos_alert_queue_alerts_dropped_total{%(selector)s}) > 0
+            ||| % thanos.ruler,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+          },
+          {
+            alert: 'ThanosRulerSenderIsFailingAlerts',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.',
+            },
+            expr: |||
+              sum by (job) (thanos_alert_sender_alerts_dropped_total{%(selector)s}) > 0
+            ||| % thanos.ruler,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+          },
+          {
+            alert: 'ThanosRulerHighRuleEvaluationFailures',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.',
+            },
+            expr: |||
+              (
+                sum by (job) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[5m]))
+              /
+                sum by (job) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m]))
+              * 100 > 5
+              )
+            ||| % thanos.ruler,
+
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+          },
+          {
+            alert: 'ThanosRulerHighRuleEvaluationWarnings',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.',
+            },
+            expr: |||
+              sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{%(selector)s}[5m])) > 0
+            ||| % thanos.ruler,
+
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+          },
+          {
+            alert: 'ThanosRulerRuleEvaluationLatencyHigh',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} has higher evaluation latency than interval for {{$labels.rule_group}}.',
+            },
+            expr: |||
+              (
+                sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s})
+              >
+                sum by (job, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s})
+              )
+            ||| % thanos.receiver,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+          },
+          {
+            alert: 'ThanosRulerGrpcErrorRate',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.',
+            },
+            expr: |||
+              (
+                sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
+              /
+                sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m]))
+              * 100 > 5
+              )
+            ||| % thanos.ruler,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+          },
+          {
+            alert: 'ThanosRulerConfigReloadFailure',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} has not been able to reload its configuration.',
+            },
+            expr: 'avg(thanos_rule_config_last_reload_successful{%(selector)s}) by (job) != 1' % thanos.ruler,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/mixin/thanos/dashboards/ruler.libsonnet b/mixin/thanos/dashboards/ruler.libsonnet
index 941c38b417..067e4d1af2 100644
--- a/mixin/thanos/dashboards/ruler.libsonnet
+++ b/mixin/thanos/dashboards/ruler.libsonnet
@@ -39,6 +39,23 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet';
           g.latencyPanel('thanos_alert_sender_latency_seconds', 'namespace="$namespace",job=~"$job"'),
         )
       )
+      .addRow(
+        g.row('Alert Queue')
+        .addPanel(
+          g.panel('Push Rate', 'Shows rate of queued alerts.') +
+          g.queryPanel(
+            'sum(rate(thanos_alert_queue_alerts_pushed_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, pod)',
+            '{{job}} {{pod}}'
+          )
+        )
+        .addPanel(
+          g.panel('Drop Ratio', 'Shows ratio of dropped alerts compared to the total number of queued alerts.') +
+          g.qpsErrTotalPanel(
+            'thanos_alert_queue_alerts_dropped_total{namespace="$namespace",job=~"$job"}',
+            'thanos_alert_queue_alerts_pushed_total{namespace="$namespace",job=~"$job"}',
+          )
+        )
+      )
       .addRow(
         g.row('gRPC (Unary)')
         .addPanel(

From 0f984d7b6c82983dcf4ede80f9e1ec505eb3f775 Mon Sep 17 00:00:00 2001
From: Kemal Akkoyun <kakkoyun@gmail.com>
Date: Wed, 8 Jan 2020 16:21:36 +0100
Subject: [PATCH 2/4] Fix wrong job selector

Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
---
 examples/alerts/alerts.md           | 4 ++--
 examples/alerts/alerts.yaml         | 4 ++--
 mixin/thanos/alerts/ruler.libsonnet | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index 991642d45a..12ed917c66 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -113,9 +113,9 @@ rules:
       for {{$labels.rule_group}}.
   expr: |
     (
-      sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-receiver.*"})
+      sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"})
     >
-      sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-receiver.*"})
+      sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"})
     )
   for: 5m
   labels:
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index ccd54aaef5..f468cca8fa 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -325,9 +325,9 @@ groups:
         for {{$labels.rule_group}}.
     expr: |
       (
-        sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-receiver.*"})
+        sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"})
       >
-        sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-receiver.*"})
+        sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"})
       )
     for: 5m
     labels:
diff --git a/mixin/thanos/alerts/ruler.libsonnet b/mixin/thanos/alerts/ruler.libsonnet
index 949dcbb55d..8bcff296d1 100644
--- a/mixin/thanos/alerts/ruler.libsonnet
+++ b/mixin/thanos/alerts/ruler.libsonnet
@@ -79,7 +79,7 @@
               >
                 sum by (job, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s})
               )
-            ||| % thanos.receiver,
+            ||| % thanos.ruler,
             'for': '5m',
             labels: {
               severity: 'warning',

From d6f184a74a661662c2e83ba95eee2026080198e6 Mon Sep 17 00:00:00 2001
From: Kemal Akkoyun <kakkoyun@gmail.com>
Date: Wed, 8 Jan 2020 16:25:51 +0100
Subject: [PATCH 3/4] Add pod label as aggregator for evaluation latency query

Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
---
 examples/alerts/alerts.md           | 8 ++++----
 examples/alerts/alerts.yaml         | 8 ++++----
 mixin/thanos/alerts/ruler.libsonnet | 6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index 12ed917c66..2c8d13c30c 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -109,13 +109,13 @@ rules:
     severity: warning
 - alert: ThanosRulerRuleEvaluationLatencyHigh
   annotations:
-    message: Thanos Ruler {{$labels.job}} has higher evaluation latency than interval
-      for {{$labels.rule_group}}.
+    message: Thanos Ruler {{$labels.job}}/{{$labels.pod}} has higher evaluation latency
+      than interval for {{$labels.rule_group}}.
   expr: |
     (
-      sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"})
+      sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"})
     >
-      sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"})
+      sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"})
     )
   for: 5m
   labels:
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index f468cca8fa..66985f5173 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -321,13 +321,13 @@ groups:
       severity: warning
   - alert: ThanosRulerRuleEvaluationLatencyHigh
     annotations:
-      message: Thanos Ruler {{$labels.job}} has higher evaluation latency than interval
-        for {{$labels.rule_group}}.
+      message: Thanos Ruler {{$labels.job}}/{{$labels.pod}} has higher evaluation
+        latency than interval for {{$labels.rule_group}}.
     expr: |
       (
-        sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"})
+        sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"})
       >
-        sum by (job, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"})
+        sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"})
       )
     for: 5m
     labels:
diff --git a/mixin/thanos/alerts/ruler.libsonnet b/mixin/thanos/alerts/ruler.libsonnet
index 8bcff296d1..4a0dacacd6 100644
--- a/mixin/thanos/alerts/ruler.libsonnet
+++ b/mixin/thanos/alerts/ruler.libsonnet
@@ -71,13 +71,13 @@
           {
             alert: 'ThanosRulerRuleEvaluationLatencyHigh',
             annotations: {
-              message: 'Thanos Ruler {{$labels.job}} has higher evaluation latency than interval for {{$labels.rule_group}}.',
+              message: 'Thanos Ruler {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.',
             },
             expr: |||
               (
-                sum by (job, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s})
+                sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s})
               >
-                sum by (job, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s})
+                sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s})
               )
             ||| % thanos.ruler,
             'for': '5m',

From 453d15e51b2a1a773f8131a9e4e60c47acbcfba4 Mon Sep 17 00:00:00 2001
From: Kemal Akkoyun <kakkoyun@gmail.com>
Date: Wed, 8 Jan 2020 17:09:25 +0100
Subject: [PATCH 4/4] Use rate() for dropped-alert counters and extend warnings alert to 15m

Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
---
 examples/alerts/alerts.md           | 6 +++---
 examples/alerts/alerts.yaml         | 6 +++---
 mixin/thanos/alerts/ruler.libsonnet | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index 2c8d13c30c..41e135b24c 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -72,7 +72,7 @@ rules:
   annotations:
     message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.
   expr: |
-    sum by (job) (thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0
+    sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0
   for: 5m
   labels:
     severity: critical
@@ -81,7 +81,7 @@ rules:
     message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts
       to alertmanager.
   expr: |
-    sum by (job) (thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0
+    sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0
   for: 5m
   labels:
     severity: critical
@@ -104,7 +104,7 @@ rules:
       warnings.
   expr: |
     sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-ruler.*"}[5m])) > 0
-  for: 5m
+  for: 15m
   labels:
     severity: warning
 - alert: ThanosRulerRuleEvaluationLatencyHigh
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index 66985f5173..0eb4fda1d1 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -283,7 +283,7 @@ groups:
     annotations:
       message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.
     expr: |
-      sum by (job) (thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0
+      sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0
     for: 5m
     labels:
       severity: critical
@@ -292,7 +292,7 @@ groups:
       message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts
         to alertmanager.
     expr: |
-      sum by (job) (thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}) > 0
+      sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0
     for: 5m
     labels:
       severity: critical
@@ -316,7 +316,7 @@ groups:
         warnings.
     expr: |
       sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-ruler.*"}[5m])) > 0
-    for: 5m
+    for: 15m
     labels:
       severity: warning
   - alert: ThanosRulerRuleEvaluationLatencyHigh
diff --git a/mixin/thanos/alerts/ruler.libsonnet b/mixin/thanos/alerts/ruler.libsonnet
index 4a0dacacd6..08de10a23b 100644
--- a/mixin/thanos/alerts/ruler.libsonnet
+++ b/mixin/thanos/alerts/ruler.libsonnet
@@ -15,7 +15,7 @@
               message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.',
             },
             expr: |||
-              sum by (job) (thanos_alert_queue_alerts_dropped_total{%(selector)s}) > 0
+              sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{%(selector)s}[5m])) > 0
             ||| % thanos.ruler,
             'for': '5m',
             labels: {
@@ -28,7 +28,7 @@
               message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.',
             },
             expr: |||
-              sum by (job) (thanos_alert_sender_alerts_dropped_total{%(selector)s}) > 0
+              sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{%(selector)s}[5m])) > 0
             ||| % thanos.ruler,
             'for': '5m',
             labels: {
@@ -63,7 +63,7 @@
               sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{%(selector)s}[5m])) > 0
             ||| % thanos.ruler,
 
-            'for': '5m',
+            'for': '15m',
             labels: {
               severity: 'warning',
             },