diff --git a/CHANGELOG.md b/CHANGELOG.md index 87e9decfc3..41c8ec7b7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re ## Unreleased ### Added - +- [#4117](https://github.com/thanos-io/thanos/pull/4117) Mixin: Added new alert `ThanosReceiveTrafficBelowThreshold`, which fires when the average ingestion rate over the last hour drops below 50% of the average ingestion rate over the last 12 hours. - [#4107](https://github.com/thanos-io/thanos/pull/4107) Store: `LabelNames` and `LabelValues` now support label matchers. - [#4171](https://github.com/thanos-io/thanos/pull/4171) Docker: Busybox image updated to latest (1.33.1) - [#4175](https://github.com/thanos-io/thanos/pull/4175) Added Tag Configuration Support Lightstep Tracing diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 98bfe8a6c5..7209920a60 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -562,6 +562,23 @@ rules: for: 3h labels: severity: critical +- alert: ThanosReceiveTrafficBelowThreshold + annotations: + description: At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the + average 1-hr avg. metrics ingestion rate is {{$value | humanize}}% of 12-hr + avg. ingestion rate. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold + summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative + to avg. 12-hr ingestion rate. 
+ expr: | + ( + avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m]) + / + avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m]) + ) * 100 < 50 + for: 1h + labels: + severity: warning ``` ## Replicate diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 46c3cb0727..7c9f08bdc7 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -282,6 +282,23 @@ groups: for: 3h labels: severity: critical + - alert: ThanosReceiveTrafficBelowThreshold + annotations: + description: At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the + average 1-hr avg. metrics ingestion rate is {{$value | humanize}}% of 12-hr + avg. ingestion rate. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold + summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative + to avg. 12-hr ingestion rate. 
+ expr: | + ( + avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m]) + / + avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m]) + ) * 100 < 50 + for: 1h + labels: + severity: warning - name: thanos-sidecar rules: - alert: ThanosSidecarPrometheusDown diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet index d491be7536..d313b717cb 100644 --- a/mixin/alerts/receive.libsonnet +++ b/mixin/alerts/receive.libsonnet @@ -3,6 +3,7 @@ receive+:: { selector: error 'must provide selector for Thanos Receive alerts', httpErrorThreshold: 5, + ingestionThreshold: 50, forwardErrorThreshold: 20, refreshErrorThreshold: 0, p99LatencyThreshold: 10, @@ -143,6 +144,24 @@ severity: 'critical', }, }, + { + alert: 'ThanosReceiveTrafficBelowThreshold', + annotations: { + description: 'At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the average 1-hr avg. metrics ingestion rate is {{$value | humanize}}% of 12-hr avg. ingestion rate.', + summary: 'Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 
12-hr ingestion rate.', + }, + expr: ||| + ( + avg_over_time(rate(http_requests_total{%(selector)s, code=~"2..", handler="receive"}[5m])[1h:5m]) + / + avg_over_time(rate(http_requests_total{%(selector)s, code=~"2..", handler="receive"}[5m])[12h:5m]) + ) * 100 < %(ingestionThreshold)s + ||| % thanos.receive, + 'for': '1h', + labels: { + severity: 'warning', + }, + }, ], }, ], diff --git a/mixin/runbook.md b/mixin/runbook.md index d87c7bc2ca..03f92aed71 100755 --- a/mixin/runbook.md +++ b/mixin/runbook.md @@ -63,6 +63,7 @@ |ThanosReceiveHighHashringFileRefreshFailures|Thanos Receive is failing to refresh hasring file.|Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value humanize}} of attempts failed.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures)| |ThanosReceiveConfigReloadFailure|Thanos Receive has not been able to reload configuration.|Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure)| |ThanosReceiveNoUpload|Thanos Receive has not uploaded latest data to object storage.|Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload)| +|ThanosReceiveTrafficBelowThreshold|Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate.|At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the average 1-hr avg. 
metrics ingestion rate is {{$value humanize}}% of 12-hr avg. ingestion rate.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold)| ## thanos-rule diff --git a/pkg/rules/rules_test.go b/pkg/rules/rules_test.go index bae891b614..01965a9b31 100644 --- a/pkg/rules/rules_test.go +++ b/pkg/rules/rules_test.go @@ -67,7 +67,7 @@ func testRulesAgainstExamples(t *testing.T, dir string, server rulespb.RulesServ Name: "thanos-receive", File: filepath.Join(dir, "alerts.yaml"), Rules: []*rulespb.Rule{ - someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, + someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, }, Interval: 60, PartialResponseStrategy: storepb.PartialResponseStrategy_ABORT,