diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index d603e751b2..dad259a3d0 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -142,6 +142,34 @@ rules: for: 5m labels: severity: warning +- alert: ThanosRulerQueryHighDNSFailures + annotations: + message: Thanos Ruler {{$labels.job}} have {{ $value | humanize }}% of failing + DNS queries for query endpoints. + expr: | + ( + sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-ruler.*"}[5m])) + / + sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{job=~"thanos-ruler.*"}[5m])) + * 100 > 1 + ) + for: 15m + labels: + severity: warning +- alert: ThanosRulerAlertmanagerHighDNSFailures + annotations: + message: Thanos Ruler {{$labels.job}} have {{ $value | humanize }}% of failing + DNS queries for Alertmanager endpoints. + expr: | + ( + sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-ruler.*"}[5m])) + / + sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{job=~"thanos-ruler.*"}[5m])) + * 100 > 1 + ) + for: 15m + labels: + severity: warning ``` ## Store Gateway diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 434d741d58..5011e882b4 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -354,6 +354,34 @@ groups: for: 5m labels: severity: warning + - alert: ThanosRulerQueryHighDNSFailures + annotations: + message: Thanos Ruler {{$labels.job}} have {{ $value | humanize }}% of failing + DNS queries for query endpoints. + expr: | + ( + sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-ruler.*"}[5m])) + / + sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{job=~"thanos-ruler.*"}[5m])) + * 100 > 1 + ) + for: 15m + labels: + severity: warning + - alert: ThanosRulerAlertmanagerHighDNSFailures + annotations: + message: Thanos Ruler {{$labels.job}} have {{ $value | humanize }}% of failing + DNS queries for Alertmanager endpoints. + expr: | + ( + sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-ruler.*"}[5m])) + / + sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{job=~"thanos-ruler.*"}[5m])) + * 100 > 1 + ) + for: 15m + labels: + severity: warning - name: thanos-component-absent.rules rules: - alert: ThanosCompactorIsDown diff --git a/mixin/thanos/alerts/ruler.libsonnet b/mixin/thanos/alerts/ruler.libsonnet index 16d5c4e486..83f4d59807 100644 --- a/mixin/thanos/alerts/ruler.libsonnet +++ b/mixin/thanos/alerts/ruler.libsonnet @@ -114,6 +114,42 @@ severity: 'warning', }, }, + { + alert: 'ThanosRulerQueryHighDNSFailures', + annotations: { + message: 'Thanos Ruler {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for query endpoints.', + }, + expr: ||| + ( + sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{%(selector)s}[5m])) + / + sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{%(selector)s}[5m])) + * 100 > 1 + ) + ||| % thanos.ruler, + 'for': '15m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosRulerAlertmanagerHighDNSFailures', + annotations: { + message: 'Thanos Ruler {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.', + }, + expr: ||| + ( + sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{%(selector)s}[5m])) + / + sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{%(selector)s}[5m])) + * 100 > 1 + ) + ||| % thanos.ruler, + 'for': '15m', + labels: { + severity: 'warning', + }, + }, ], }, ],