diff --git a/CHANGELOG.md b/CHANGELOG.md index e63d0039a11..524dcfacef2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,8 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re ## Unreleased ### Added -- +- [#4117](https://github.com/thanos-io/thanos/pull/4117) Mixin: new alert ThanosReceiveTrafficBelowThreshold to flag if the ingestion average of the last hour is below 50% of the ingestion average for the last 12 hours. + ### Fixed - ### Changed diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 98bfe8a6c55..5d1e044445a 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -562,6 +562,23 @@ rules: for: 3h labels: severity: critical +- alert: ThanosReceiveTrafficBelowThreshold + annotations: + description: At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the + average 1-hr avg. metrics ingestion rate is {{$value | humanize}}% of 12-hr + avg. ingestion rate. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold + summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative + to avg. 12-hr ingestion rate. + expr: | + ( + avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[1h])) + / + avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[12h])) + ) * 100 < 50 + for: 1h + labels: + severity: warning ``` ## Replicate diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 46c3cb07279..7a2a4d31bc7 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -282,6 +282,23 @@ groups: for: 3h labels: severity: critical + - alert: ThanosReceiveTrafficBelowThreshold + annotations: + description: At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the + average 1-hr avg. metrics ingestion rate is {{$value | humanize}}% of 12-hr + avg. ingestion rate. 
+ runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold + summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative + to avg. 12-hr ingestion rate. + expr: | + ( + avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[1h])) + / + avg by (job) (rate(http_requests_total{code=~"2..", job=~".*thanos-receive.*", handler="receive"}[12h])) + ) * 100 < 50 + for: 1h + labels: + severity: warning - name: thanos-sidecar rules: - alert: ThanosSidecarPrometheusDown diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet index d491be7536b..f9e48a24f27 100644 --- a/mixin/alerts/receive.libsonnet +++ b/mixin/alerts/receive.libsonnet @@ -3,6 +3,7 @@ receive+:: { selector: error 'must provide selector for Thanos Receive alerts', httpErrorThreshold: 5, + ingestionThreshold: 50, forwardErrorThreshold: 20, refreshErrorThreshold: 0, p99LatencyThreshold: 10, @@ -143,6 +144,24 @@ severity: 'critical', }, }, + { + alert: 'ThanosReceiveTrafficBelowThreshold', + annotations: { + description: 'At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the average 1-hr avg. metrics ingestion rate is {{$value | humanize}}% of 12-hr avg. ingestion rate.', + summary: 'Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 
12-hr ingestion rate.', + }, + expr: ||| + ( + avg by (%(dimensions)s) (rate(http_requests_total{code=~"2..", %(selector)s, handler="receive"}[1h])) + / + avg by (%(dimensions)s) (rate(http_requests_total{code=~"2..", %(selector)s, handler="receive"}[12h])) + ) * 100 < %(ingestionThreshold)s + ||| % thanos.receive, + 'for': '1h', + labels: { + severity: 'warning', + }, + }, ], }, ], diff --git a/mixin/runbook.md b/mixin/runbook.md index d87c7bc2ca8..03f92aed716 100755 --- a/mixin/runbook.md +++ b/mixin/runbook.md @@ -63,6 +63,7 @@ |ThanosReceiveHighHashringFileRefreshFailures|Thanos Receive is failing to refresh hasring file.|Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value humanize}} of attempts failed.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures)| |ThanosReceiveConfigReloadFailure|Thanos Receive has not been able to reload configuration.|Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure)| |ThanosReceiveNoUpload|Thanos Receive has not uploaded latest data to object storage.|Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload)| +|ThanosReceiveTrafficBelowThreshold|Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate.|At Thanos Receive {{$labels.job}} in {{$labels.namespace}} , the average 1-hr avg. 
metrics ingestion rate is {{$value humanize}}% of 12-hr avg. ingestion rate.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold)| ## thanos-rule