From 5ff75a6f5614b871f83484c46751966dba27376e Mon Sep 17 00:00:00 2001
From: spaparaju <paparaju@gmail.com>
Date: Thu, 29 Apr 2021 10:59:25 +0530
Subject: [PATCH 1/4] Added alert ThanosReceiveTrafficBelowThreshold to flag
 unusually low ingestion rate

Signed-off-by: spaparaju <paparaju@gmail.com>
---
 CHANGELOG.md                   |  4 +++-
 examples/alerts/alerts.md      | 17 +++++++++++++++++
 examples/alerts/alerts.yaml    | 17 +++++++++++++++++
 mixin/alerts/receive.libsonnet | 19 +++++++++++++++++++
 mixin/runbook.md               |  1 +
 5 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e63d0039a1..62ab8f8482 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,9 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re
 ## Unreleased
 
 ### Added
--
+- [#4107](https://github.com/thanos-io/thanos/pull/4107) Store: `LabelNames` and `LabelValues` now support label matchers.
+- [#4117](https://github.com/thanos-io/thanos/pull/4117) Mixin: Add new alert `ThanosReceiveTrafficBelowThreshold`, which flags when the average ingestion rate over the last hour dips below 50% of the average ingestion rate over the last 12 hours.
+
 ### Fixed
 -
 ### Changed
diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index 98bfe8a6c5..5d1e044445 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -562,6 +562,23 @@ rules:
   for: 3h
   labels:
     severity: critical
+- alert: ThanosReceiveTrafficBelowThreshold
+  annotations:
+    description: At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the
+      average 1-hr avg. metrics ingestion rate  is {{$value | humanize}}% of 12-hr
+      avg. ingestion rate.
+    runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold
+    summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative
+      to avg. 12-hr ingestion rate.
+  expr: |
+    (
+      avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[1h]))
+    /
+      avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[12h]))
+    ) * 100 < 50
+  for: 1h
+  labels:
+    severity: warning
 ```
 
 ## Replicate
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index 46c3cb0727..7a2a4d31bc 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -282,6 +282,23 @@ groups:
     for: 3h
     labels:
       severity: critical
+  - alert: ThanosReceiveTrafficBelowThreshold
+    annotations:
+      description: At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the
+        average 1-hr avg. metrics ingestion rate  is {{$value | humanize}}% of 12-hr
+        avg. ingestion rate.
+      runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold
+      summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative
+        to avg. 12-hr ingestion rate.
+    expr: |
+      (
+        avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[1h]))
+      /
+        avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[12h]))
+      ) * 100 < 50
+    for: 1h
+    labels:
+      severity: warning
 - name: thanos-sidecar
   rules:
   - alert: ThanosSidecarPrometheusDown
diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet
index d491be7536..f9e48a24f2 100644
--- a/mixin/alerts/receive.libsonnet
+++ b/mixin/alerts/receive.libsonnet
@@ -3,6 +3,7 @@
   receive+:: {
     selector: error 'must provide selector for Thanos Receive alerts',
     httpErrorThreshold: 5,
+    ingestionThreshold: 50,
     forwardErrorThreshold: 20,
     refreshErrorThreshold: 0,
     p99LatencyThreshold: 10,
@@ -143,6 +144,24 @@
               severity: 'critical',
             },
           },
+          {
+            alert: 'ThanosReceiveTrafficBelowThreshold',
+            annotations: {
+              description: 'At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the average 1-hr avg. metrics ingestion rate  is {{$value | humanize}}% of 12-hr avg. ingestion rate.',
+              summary: 'Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate.',
+            },
+            expr: |||
+              (
+                avg by (%(dimensions)s) (rate(http_requests_total{code=~"2..", %(selector)s, handler="receive"}[1h]))
+              /
+                avg by (%(dimensions)s) (rate(http_requests_total{code=~"2..", %(selector)s, handler="receive"}[12h]))
+              ) * 100 < %(ingestionThreshold)s
+            ||| % thanos.receive,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+          },
         ],
       },
     ],
diff --git a/mixin/runbook.md b/mixin/runbook.md
index d87c7bc2ca..03f92aed71 100755
--- a/mixin/runbook.md
+++ b/mixin/runbook.md
@@ -63,6 +63,7 @@
 |ThanosReceiveHighHashringFileRefreshFailures|Thanos Receive is failing to refresh hasring file.|Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value  humanize}} of attempts failed.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures)|
 |ThanosReceiveConfigReloadFailure|Thanos Receive has not been able to reload configuration.|Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure)|
 |ThanosReceiveNoUpload|Thanos Receive has not uploaded latest data to object storage.|Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload)|
+|ThanosReceiveTrafficBelowThreshold|Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate.|At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the average 1-hr avg. metrics ingestion rate  is {{$value  humanize}}% of 12-hr avg. ingestion rate.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold)|
 
 ## thanos-rule
 

From b94d7a93bbed1c8de9750757fec54eef5e5b2c88 Mon Sep 17 00:00:00 2001
From: spaparaju <paparaju@gmail.com>
Date: Thu, 29 Apr 2021 12:27:29 +0530
Subject: [PATCH 2/4] updated rule tests for Thanos Receive component

Signed-off-by: spaparaju <paparaju@gmail.com>
---
 pkg/rules/rules_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/rules/rules_test.go b/pkg/rules/rules_test.go
index bae891b614..01965a9b31 100644
--- a/pkg/rules/rules_test.go
+++ b/pkg/rules/rules_test.go
@@ -67,7 +67,7 @@ func testRulesAgainstExamples(t *testing.T, dir string, server rulespb.RulesServ
 			Name: "thanos-receive",
 			File: filepath.Join(dir, "alerts.yaml"),
 			Rules: []*rulespb.Rule{
-				someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert,
+				someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert,
 			},
 			Interval:                60,
 			PartialResponseStrategy: storepb.PartialResponseStrategy_ABORT,

From 6c1f563eea8620f615fdf951111a92ed3b130607 Mon Sep 17 00:00:00 2001
From: spaparaju <paparaju@gmail.com>
Date: Thu, 29 Apr 2021 13:27:30 +0530
Subject: [PATCH 3/4] change the evaluation query to averaging over time

Signed-off-by: spaparaju <paparaju@gmail.com>
---
 examples/alerts/alerts.md      | 4 ++--
 examples/alerts/alerts.yaml    | 4 ++--
 mixin/alerts/receive.libsonnet | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index 5d1e044445..b4daa20577 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -572,9 +572,9 @@ rules:
       to avg. 12-hr ingestion rate.
   expr: |
     (
-      avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[1h]))
+      avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m])
     /
-      avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[12h]))
+      avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m])
     ) * 100 < 50
   for: 1h
   labels:
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index 7a2a4d31bc..84a0c8b02d 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -292,9 +292,9 @@ groups:
         to avg. 12-hr ingestion rate.
     expr: |
       (
-        avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[1h]))
+        avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m])
       /
-        avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[12h]))
+        avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m])
       ) * 100 < 50
     for: 1h
     labels:
diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet
index f9e48a24f2..c65fdf6ad2 100644
--- a/mixin/alerts/receive.libsonnet
+++ b/mixin/alerts/receive.libsonnet
@@ -152,9 +152,9 @@
             },
             expr: |||
               (
-                avg by (%(dimensions)s) (rate(http_requests_total{code=~"2..", %(selector)s, handler="receive"}[1h]))
+                avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m])
               /
-                avg by (%(dimensions)s) (rate(http_requests_total{code=~"2..", %(selector)s, handler="receive"}[12h]))
+                avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m])
               ) * 100 < %(ingestionThreshold)s
             ||| % thanos.receive,
             'for': '1h',

From 5ad761f9531089dd2d2240e83c4981ffa9b43117 Mon Sep 17 00:00:00 2001
From: spaparaj <paparaju@gmail.com>
Date: Tue, 11 May 2021 08:13:01 -0400
Subject: [PATCH 4/4] Switched selectors to refer to variables

Signed-off-by: spaparaj <paparaju@gmail.com>
---
 examples/alerts/alerts.md      | 4 ++--
 examples/alerts/alerts.yaml    | 4 ++--
 mixin/alerts/receive.libsonnet | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index b4daa20577..7209920a60 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -572,9 +572,9 @@ rules:
       to avg. 12-hr ingestion rate.
   expr: |
     (
-      avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m])
+      avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m])
     /
-      avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m])
+      avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m])
     ) * 100 < 50
   for: 1h
   labels:
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index 84a0c8b02d..7c9f08bdc7 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -292,9 +292,9 @@ groups:
         to avg. 12-hr ingestion rate.
     expr: |
       (
-        avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m])
+        avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m])
       /
-        avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m])
+        avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m])
       ) * 100 < 50
     for: 1h
     labels:
diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet
index c65fdf6ad2..d313b717cb 100644
--- a/mixin/alerts/receive.libsonnet
+++ b/mixin/alerts/receive.libsonnet
@@ -152,9 +152,9 @@
             },
             expr: |||
               (
-                avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m])
+                avg_over_time(rate(http_requests_total{%(selector)s, code=~"2..", handler="receive"}[5m])[1h:5m])
               /
-                avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m])
+                avg_over_time(rate(http_requests_total{%(selector)s, code=~"2..", handler="receive"}[5m])[12h:5m])
               ) * 100 < %(ingestionThreshold)s
             ||| % thanos.receive,
             'for': '1h',