From 5ff75a6f5614b871f83484c46751966dba27376e Mon Sep 17 00:00:00 2001 From: spaparaju <paparaju@gmail.com> Date: Thu, 29 Apr 2021 10:59:25 +0530 Subject: [PATCH 1/4] Added alert ThanosReceiveTrafficBelowThreshold to flag unusually low ingestion rate Signed-off-by: spaparaju <paparaju@gmail.com> --- CHANGELOG.md | 4 +++- examples/alerts/alerts.md | 17 +++++++++++++++++ examples/alerts/alerts.yaml | 17 +++++++++++++++++ mixin/alerts/receive.libsonnet | 19 +++++++++++++++++++ mixin/runbook.md | 1 + 5 files changed, 57 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e63d0039a1..62ab8f8482 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,9 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re ## Unreleased ### Added -- +- [#4107](https://github.com/thanos-io/thanos/pull/4107) Store: `LabelNames` and `LabelValues` now support label matchers. +- [#4117](https://github.com/thanos-io/thanos/pull/4117) Mixin: new alert ThanosReceiveTrafficBelowThreshold to flag if the ingestion average of the last hour dips below 50% of the ingestion average for the last 12 hours. + ### Fixed - ### Changed diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 98bfe8a6c5..5d1e044445 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -562,6 +562,23 @@ rules: for: 3h labels: severity: critical +- alert: ThanosReceiveTrafficBelowThreshold + annotations: + description: At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the + average 1-hr avg. metrics ingestion rate is {{$value | humanize}}% of 12-hr + avg. ingestion rate. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold + summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative + to avg. 12-hr ingestion rate. + expr: | + ( + avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[1h])) + / + avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[12h])) + ) * 100 < 50 + for: 1h + labels: + severity: warning ``` ## Replicate diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 46c3cb0727..7a2a4d31bc 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -282,6 +282,23 @@ groups: for: 3h labels: severity: critical + - alert: ThanosReceiveTrafficBelowThreshold + annotations: + description: At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the + average 1-hr avg. metrics ingestion rate is {{$value | humanize}}% of 12-hr + avg. ingestion rate. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold + summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative + to avg. 12-hr ingestion rate. + expr: | + ( + avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[1h])) + / + avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[12h])) + ) * 100 < 50 + for: 1h + labels: + severity: warning - name: thanos-sidecar rules: - alert: ThanosSidecarPrometheusDown diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet index d491be7536..f9e48a24f2 100644 --- a/mixin/alerts/receive.libsonnet +++ b/mixin/alerts/receive.libsonnet @@ -3,6 +3,7 @@ receive+:: { selector: error 'must provide selector for Thanos Receive alerts', httpErrorThreshold: 5, + ingestionThreshold: 50, forwardErrorThreshold: 20, refreshErrorThreshold: 0, p99LatencyThreshold: 10, @@ -143,6 +144,24 @@ severity: 'critical', }, }, + { + alert: 'ThanosReceiveTrafficBelowThreshold', + annotations: { + description: 'At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the average 1-hr avg. metrics ingestion rate is {{$value | humanize}}% of 12-hr avg. ingestion rate.', + summary: 'Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate.', + }, + expr: ||| + ( + avg by (%(dimensions)s) (rate(http_requests_total{code=~"2..", %(selector)s, handler="receive"}[1h])) + / + avg by (%(dimensions)s) (rate(http_requests_total{code=~"2..", %(selector)s, handler="receive"}[12h])) + ) * 100 < %(ingestionThreshold)s + ||| % thanos.receive, + 'for': '1h', + labels: { + severity: 'warning', + }, + }, ], }, ], diff --git a/mixin/runbook.md b/mixin/runbook.md index d87c7bc2ca..03f92aed71 100755 --- a/mixin/runbook.md +++ b/mixin/runbook.md @@ -63,6 +63,7 @@ |ThanosReceiveHighHashringFileRefreshFailures|Thanos Receive is failing to refresh hasring file.|Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value humanize}} of attempts failed.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures)| |ThanosReceiveConfigReloadFailure|Thanos Receive has not been able to reload configuration.|Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure)| |ThanosReceiveNoUpload|Thanos Receive has not uploaded latest data to object storage.|Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload)| +|ThanosReceiveTrafficBelowThreshold|Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate.|At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the average 1-hr avg. metrics ingestion rate is {{$value humanize}}% of 12-hr avg. ingestion rate.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold)| ## thanos-rule From b94d7a93bbed1c8de9750757fec54eef5e5b2c88 Mon Sep 17 00:00:00 2001 From: spaparaju <paparaju@gmail.com> Date: Thu, 29 Apr 2021 12:27:29 +0530 Subject: [PATCH 2/4] updated rule tests for Thanos Receive component Signed-off-by: spaparaju <paparaju@gmail.com> --- pkg/rules/rules_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/rules/rules_test.go b/pkg/rules/rules_test.go index bae891b614..01965a9b31 100644 --- a/pkg/rules/rules_test.go +++ b/pkg/rules/rules_test.go @@ -67,7 +67,7 @@ func testRulesAgainstExamples(t *testing.T, dir string, server rulespb.RulesServ Name: "thanos-receive", File: filepath.Join(dir, "alerts.yaml"), Rules: []*rulespb.Rule{ - someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, + someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, }, Interval: 60, PartialResponseStrategy: storepb.PartialResponseStrategy_ABORT, From 6c1f563eea8620f615fdf951111a92ed3b130607 Mon Sep 17 00:00:00 2001 From: spaparaju <paparaju@gmail.com> Date: Thu, 29 Apr 2021 13:27:30 +0530 Subject: [PATCH 3/4] change the evaluation query to averaging over time Signed-off-by: spaparaju <paparaju@gmail.com> --- examples/alerts/alerts.md | 4 ++-- examples/alerts/alerts.yaml | 4 ++-- mixin/alerts/receive.libsonnet | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 5d1e044445..b4daa20577 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -572,9 +572,9 @@ rules: to avg. 12-hr ingestion rate. expr: | ( - avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[1h])) + avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m]) / - avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[12h])) + avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m]) ) * 100 < 50 for: 1h labels: diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 7a2a4d31bc..84a0c8b02d 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -292,9 +292,9 @@ groups: to avg. 12-hr ingestion rate. expr: | ( - avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[1h])) + avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m]) / - avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[12h])) + avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m]) ) * 100 < 50 for: 1h labels: diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet index f9e48a24f2..c65fdf6ad2 100644 --- a/mixin/alerts/receive.libsonnet +++ b/mixin/alerts/receive.libsonnet @@ -152,9 +152,9 @@ }, expr: ||| ( - avg by (%(dimensions)s) (rate(http_requests_total{code=~"2..", %(selector)s, handler="receive"}[1h])) + avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m]) / - avg by (%(dimensions)s) (rate(http_requests_total{code=~"2..", %(selector)s, handler="receive"}[12h])) + avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m]) ) * 100 < %(ingestionThreshold)s ||| % thanos.receive, 'for': '1h', From 5ad761f9531089dd2d2240e83c4981ffa9b43117 Mon Sep 17 00:00:00 2001 From: spaparaj <paparaju@gmail.com> Date: Tue, 11 May 2021 08:13:01 -0400 Subject: [PATCH 4/4] Switched selectors to refer variables Signed-off-by: spaparaj <paparaju@gmail.com> --- examples/alerts/alerts.md | 4 ++-- examples/alerts/alerts.yaml | 4 ++-- mixin/alerts/receive.libsonnet | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index b4daa20577..7209920a60 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -572,9 +572,9 @@ rules: to avg. 12-hr ingestion rate. expr: | ( - avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m]) + avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m]) / - avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m]) + avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m]) ) * 100 < 50 for: 1h labels: diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 84a0c8b02d..7c9f08bdc7 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -292,9 +292,9 @@ groups: to avg. 12-hr ingestion rate. expr: | ( - avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m]) + avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m]) / - avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m]) + avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m]) ) * 100 < 50 for: 1h labels: diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet index c65fdf6ad2..d313b717cb 100644 --- a/mixin/alerts/receive.libsonnet +++ b/mixin/alerts/receive.libsonnet @@ -152,9 +152,9 @@ }, expr: ||| ( - avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[1h:5m]) + avg_over_time(rate(http_requests_total{%(selector)s, code=~"2..", handler="receive"}[5m])[1h:5m]) / - avg_over_time(rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[5m])[12h:5m]) + avg_over_time(rate(http_requests_total{%(selector)s, code=~"2..", handler="receive"}[5m])[12h:5m]) ) * 100 < %(ingestionThreshold)s ||| % thanos.receive, 'for': '1h',