diff --git a/CHANGELOG.md b/CHANGELOG.md index 4df5347fc2d..020dedcef30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#6352](https://github.com/thanos-io/thanos/pull/6352) Store: Expose store gateway query stats in series response hints. - [#6420](https://github.com/thanos-io/thanos/pull/6420) Index Cache: Cache expanded postings. - [#6441](https://github.com/thanos-io/thanos/pull/6441) Compact: Compactor will set `index_stats` in `meta.json` file with max series and chunk size information. +- [#6466](https://github.com/thanos-io/thanos/pull/6466) Mixin (Receive): add limits alerting for configuration reload and meta-monitoring. ### Fixed - [#6456](https://github.com/thanos-io/thanos/pull/6456) Store: fix crash when computing set matches from regex pattern diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 1e59873cf51..1f564610734 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -530,6 +530,24 @@ rules: for: 3h labels: severity: critical +- alert: ThanosReceiveLimitsConfigReloadFailure + annotations: + description: Thanos Receive {{$labels.job}} has not been able to reload the limits configuration. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitsconfigreloadfailure + summary: Thanos Receive has not been able to reload the limits configuration. + expr: sum by(job) (increase(thanos_receive_limits_config_reload_err_total{job=~".*thanos-receive.*"}[5m])) > 0 + for: 5m + labels: + severity: warning +- alert: ThanosReceiveLimitsHighMetaMonitoringQueriesFailureRate + annotations: + description: Thanos Receive {{$labels.job}} is failing for {{$value | humanize}}% of meta monitoring queries. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitshighmetamonitoringqueriesfailurerate + summary: Thanos Receive has not been able to update the number of head series. + expr: (sum by(job) (increase(thanos_receive_metamonitoring_failed_queries_total{job=~".*thanos-receive.*"}[5m])) / 20) * 100 > 20 + for: 5m + labels: + severity: warning ``` ## Replicate diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 7d99fdba804..55cb1404197 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -274,6 +274,24 @@ groups: for: 3h labels: severity: critical + - alert: ThanosReceiveLimitsConfigReloadFailure + annotations: + description: Thanos Receive {{$labels.job}} has not been able to reload the limits configuration. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitsconfigreloadfailure + summary: Thanos Receive has not been able to reload the limits configuration. + expr: sum by(job) (increase(thanos_receive_limits_config_reload_err_total{job=~".*thanos-receive.*"}[5m])) > 0 + for: 5m + labels: + severity: warning + - alert: ThanosReceiveLimitsHighMetaMonitoringQueriesFailureRate + annotations: + description: Thanos Receive {{$labels.job}} is failing for {{$value | humanize}}% of meta monitoring queries. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitshighmetamonitoringqueriesfailurerate + summary: Thanos Receive has not been able to update the number of head series. + expr: (sum by(job) (increase(thanos_receive_metamonitoring_failed_queries_total{job=~".*thanos-receive.*"}[5m])) / 20) * 100 > 20 + for: 5m + labels: + severity: warning - name: thanos-sidecar rules: - alert: ThanosSidecarBucketOperationsFailed diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet index 00da01211cd..528827a2c2d 100644 --- a/mixin/alerts/receive.libsonnet +++ b/mixin/alerts/receive.libsonnet @@ -5,6 +5,7 @@ httpErrorThreshold: 5, ingestionThreshold: 50, forwardErrorThreshold: 20, + metaMonitoringErrorThreshold: 20, refreshErrorThreshold: 0, p99LatencyThreshold: 10, dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']), @@ -144,6 +145,31 @@ severity: 'critical', }, }, + { + alert: 'ThanosReceiveLimitsConfigReloadFailure', + annotations: { + description: 'Thanos Receive {{$labels.job}}%s has not been able to reload the limits configuration.' % location, + summary: 'Thanos Receive has not been able to reload the limits configuration.', + }, + expr: 'sum by(%(dimensions)s) (increase(thanos_receive_limits_config_reload_err_total{%(selector)s}[5m])) > 0' % thanos.receive, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosReceiveLimitsHighMetaMonitoringQueriesFailureRate', + annotations: { + description: 'Thanos Receive {{$labels.job}}%s is failing for {{$value | humanize}}%% of meta monitoring queries.' % location, + summary: 'Thanos Receive has not been able to update the number of head series.', + }, + // Values are updated every 15s, 20 times over 5 minutes. + expr: '(sum by(%(dimensions)s) (increase(thanos_receive_metamonitoring_failed_queries_total{%(selector)s}[5m])) / 20) * 100 > %(metaMonitoringErrorThreshold)s' % thanos.receive, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, ], }, ], diff --git a/mixin/runbook.md b/mixin/runbook.md index 833cf1d4033..ba2131cd91e 100755 --- a/mixin/runbook.md +++ b/mixin/runbook.md @@ -63,6 +63,8 @@ |ThanosReceiveHighHashringFileRefreshFailures|Thanos Receive is failing to refresh hasring file.|Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value humanize}} of attempts failed.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures)| |ThanosReceiveConfigReloadFailure|Thanos Receive has not been able to reload configuration.|Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure)| |ThanosReceiveNoUpload|Thanos Receive has not uploaded latest data to object storage.|Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload)| +|ThanosReceiveLimitsConfigReloadFailure|Thanos Receive has not been able to reload the limits configuration.|Thanos Receive {{$labels.job}} has not been able to reload the limits configuration.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitsconfigreloadfailure](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitsconfigreloadfailure)| +|ThanosReceiveLimitsHighMetaMonitoringQueriesFailureRate|Thanos Receive has not been able to update the number of head series.|Thanos Receive {{$labels.job}} is failing for {{$value humanize}}% of meta monitoring queries.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitshighmetamonitoringqueriesfailurerate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitshighmetamonitoringqueriesfailurerate)| ## thanos-rule diff --git a/pkg/rules/rules_test.go b/pkg/rules/rules_test.go index 09142586032..b5ee5dacd26 100644 --- a/pkg/rules/rules_test.go +++ b/pkg/rules/rules_test.go @@ -69,7 +69,7 @@ func testRulesAgainstExamples(t *testing.T, dir string, server rulespb.RulesServ Name: "thanos-receive", File: filepath.Join(dir, "alerts.yaml"), Rules: []*rulespb.Rule{ - someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, + someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, }, Interval: 60, PartialResponseStrategy: storepb.PartialResponseStrategy_ABORT,