mixin: make range interval configurable in alerts (#7591)
* chore: mixin make range interval configurable in alerts

* chore: example conversion of 2 alerts

* chore: remove unused local variable

* chore: switch to using minutes as base, remove unnecessary variable

* chore: address first batch of alerts

* chore: fix typos

* chore: convert more alerts

* chore: only change alerts with range selector under 10m

* chore: address comments from review

* chore: fix pipeline failing

* chore: address comments from review

* chore: rebased and added changelog entry

* chore: make lint happy
jmichalek132 authored Mar 14, 2024
1 parent 297fe2c commit be893e7
Showing 9 changed files with 121 additions and 73 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -9,6 +9,7 @@
* [BUGFIX] Querier, store-gateway: Protect against panics raised during snappy encoding. #7520

### Mixin
* [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591

### Jsonnet

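For context, here is a minimal sketch of how the knob from the changelog entry above could be overridden when consuming the mixin. Only `_config.base_alerts_range_interval_minutes` comes from this commit; the import path, the override value of 3, and the default of 1 are assumptions (the config file defining the default is not among the diffs shown here).

    // Hypothetical consumer-side override; the import path is an assumption.
    (import 'operations/mimir-mixin/mixin.libsonnet') + {
      _config+:: {
        // Scale every converted range selector by 3x, e.g. a 5m window becomes 15m.
        base_alerts_range_interval_minutes: 3,
      },
    }

With an override like this, an alert that previously hardcoded a `[5m]` selector would render with `[15m]`, which can help on clusters with longer scrape intervals.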
24 changes: 12 additions & 12 deletions operations/mimir-mixin/alerts/alertmanager.libsonnet
@@ -6,8 +6,8 @@
{
alert: $.alertName('AlertmanagerSyncConfigsFailing'),
expr: |||
rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0
|||,
rate(cortex_alertmanager_sync_configs_failed_total[%s]) > 0
||| % $.alertRangeInterval(5),
'for': '30m',
labels: {
severity: 'critical',
@@ -21,8 +21,8 @@
{
alert: $.alertName('AlertmanagerRingCheckFailing'),
expr: |||
rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0
|||,
rate(cortex_alertmanager_ring_check_errors_total[%s]) > 0
||| % $.alertRangeInterval(2),
'for': '10m',
labels: {
severity: 'critical',
@@ -36,8 +36,8 @@
{
alert: $.alertName('AlertmanagerPartialStateMergeFailing'),
expr: |||
rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0
|||,
rate(cortex_alertmanager_partial_state_merges_failed_total[%s]) > 0
||| % $.alertRangeInterval(2),
'for': '10m',
labels: {
severity: 'critical',
@@ -51,8 +51,8 @@
{
alert: $.alertName('AlertmanagerReplicationFailing'),
expr: |||
rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0
|||,
rate(cortex_alertmanager_state_replication_failed_total[%s]) > 0
||| % $.alertRangeInterval(2),
'for': '10m',
labels: {
severity: 'critical',
@@ -66,8 +66,8 @@
{
alert: $.alertName('AlertmanagerPersistStateFailing'),
expr: |||
rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0
|||,
rate(cortex_alertmanager_state_persist_failed_total[%s]) > 0
||| % $.alertRangeInterval(15),
'for': '1h',
labels: {
severity: 'critical',
@@ -81,8 +81,8 @@
{
alert: $.alertName('AlertmanagerInitialSyncFailed'),
expr: |||
increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0
|||,
increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[%s]) > 0
||| % $.alertRangeInterval(1),
labels: {
severity: 'critical',
},
2 changes: 2 additions & 0 deletions operations/mimir-mixin/alerts/alerts-utils.libsonnet
@@ -54,4 +54,6 @@
for group in groups
],

alertRangeInterval(multiple)::
($._config.base_alerts_range_interval_minutes * multiple) + 'm',
}
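A small self-contained sketch of what the `alertRangeInterval` helper above evaluates to. The base value of 1 used as the default here is an assumption, since the config default is defined in a file not shown in this view:

    // Standalone sketch of alertRangeInterval; a base default of 1 is an assumption.
    local alertRangeInterval(multiple, base=1) = (base * multiple) + 'm';
    {
      default_5m: alertRangeInterval(5),          // "5m"  -> rate(...[5m]), same window as before
      default_2m: alertRangeInterval(2),          // "2m"
      scaled_5m: alertRangeInterval(5, base=3),   // "15m" -> rate(...[15m]) once the base is raised to 3
    }

Every multiple used in this commit matches the interval that was previously hardcoded (5, 2, 15, 1), so with a default base of 1 the rendered rules keep their original windows.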
104 changes: 68 additions & 36 deletions operations/mimir-mixin/alerts/alerts.libsonnet
@@ -34,14 +34,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
// Note if alert_aggregation_labels is "job", this will repeat the label. But
// prometheus seems to tolerate that.
expr: |||
100 * sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"%(excluded_routes)s"}[1m]))
100 * sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"%(excluded_routes)s"}[%(range_interval)s]))
/
sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[1m]))
sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[%(range_interval)s]))
> 1
||| % {
group_by: $._config.alert_aggregation_labels,
job_label: $._config.per_job_label,
excluded_routes: std.join('|', ['ready'] + $._config.alert_excluded_routes),
range_interval: $.alertRangeInterval(1),
},
'for': '15m',
labels: {
@@ -81,10 +81,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
alert: $.alertName('QueriesIncorrect'),
expr: |||
100 * sum by (%s) (rate(test_exporter_test_case_result_total{result="fail"}[5m]))
100 * sum by (%(group_by)s) (rate(test_exporter_test_case_result_total{result="fail"}[%(range_interval)s]))
/
sum by (%s) (rate(test_exporter_test_case_result_total[5m])) > 1
||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels],
sum by (%(group_by)s) (rate(test_exporter_test_case_result_total[%(range_interval)s])) > 1
||| % {
group_by: $._config.alert_aggregation_labels,
range_interval: $.alertRangeInterval(5),
},
'for': '15m',
labels: {
severity: 'warning',
@@ -130,8 +134,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
alert: $.alertName('FrontendQueriesStuck'),
expr: |||
sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0
||| % $._config,
sum by (%(group_by)s, %(job_label)s) (min_over_time(cortex_query_frontend_queue_length[%(range_interval)s])) > 0
||| % {
group_by: $._config.alert_aggregation_labels,
job_label: $._config.per_job_label,
range_interval: $.alertRangeInterval(1),
},
'for': '5m', // We don't want to block for longer.
labels: {
severity: 'critical',
@@ -145,8 +153,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
alert: $.alertName('SchedulerQueriesStuck'),
expr: |||
sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0
||| % $._config,
sum by (%(group_by)s, %(job_label)s) (min_over_time(cortex_query_scheduler_queue_length[%(range_interval)s])) > 0
||| % {
group_by: $._config.alert_aggregation_labels,
job_label: $._config.per_job_label,
range_interval: $.alertRangeInterval(1),
},
'for': '7m', // We don't want to block for longer.
labels: {
severity: 'critical',
@@ -161,19 +173,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
alert: $.alertName('CacheRequestErrors'),
expr: |||
(
sum by(%s, name, operation) (
rate(thanos_memcached_operation_failures_total[1m])
sum by(%(group_by)s, name, operation) (
rate(thanos_memcached_operation_failures_total[%(range_interval)s])
or
rate(thanos_cache_operation_failures_total[1m])
rate(thanos_cache_operation_failures_total[%(range_interval)s])
)
/
sum by(%s, name, operation) (
rate(thanos_memcached_operations_total[1m])
sum by(%(group_by)s, name, operation) (
rate(thanos_memcached_operations_total[%(range_interval)s])
or
rate(thanos_cache_operations_total[1m])
rate(thanos_cache_operations_total[%(range_interval)s])
)
) * 100 > 5
||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels],
||| % {
group_by: $._config.alert_aggregation_labels,
range_interval: $.alertRangeInterval(1),
},
'for': '5m',
labels: {
severity: 'warning',
@@ -215,13 +230,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
alert: $.alertName('KVStoreFailure'),
expr: |||
(
sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m]))
sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[%(range_interval)s]))
/
sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m]))
sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[%(range_interval)s]))
)
# We want to get alerted only in case there's a constant failure.
== 1
||| % $._config,
||| % $._config {
range_interval: $.alertRangeInterval(1),
},
'for': '5m',
labels: {
severity: 'critical',
@@ -316,9 +333,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
alert: $.alertName('StoreGatewayTooManyFailedOperations'),
'for': '5m',
expr: |||
sum by(%(alert_aggregation_labels)s, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0
sum by(%(alert_aggregation_labels)s, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[%(range_interval)s])) > 0
||| % {
alert_aggregation_labels: $._config.alert_aggregation_labels,
range_interval: $.alertRangeInterval(1),
},
labels: {
severity: 'warning',
@@ -502,7 +520,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
%(kube_statefulset_status_replicas_updated)s
)
) and (
changes(%(kube_statefulset_status_replicas_updated)s[15m:1m])
changes(%(kube_statefulset_status_replicas_updated)s[%(range_interval)s])
==
0
)
@@ -513,6 +531,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
kube_statefulset_status_update_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_update_revision'),
kube_statefulset_replicas: groupStatefulSetByRolloutGroup('kube_statefulset_replicas'),
kube_statefulset_status_replicas_updated: groupStatefulSetByRolloutGroup('kube_statefulset_status_replicas_updated'),
range_interval: '15m:' + $.alertRangeInterval(1),
},
'for': '30m',
labels: {
@@ -533,7 +552,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
!=
%(kube_deployment_status_replicas_updated)s
) and (
changes(%(kube_deployment_status_replicas_updated)s[15m:1m])
changes(%(kube_deployment_status_replicas_updated)s[%(range_interval)s])
==
0
)
@@ -542,6 +561,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
aggregation_labels: $._config.alert_aggregation_labels,
kube_deployment_spec_replicas: groupDeploymentByRolloutGroup('kube_deployment_spec_replicas'),
kube_deployment_status_replicas_updated: groupDeploymentByRolloutGroup('kube_deployment_status_replicas_updated'),
range_interval: '15m:' + $.alertRangeInterval(1),
},
'for': '30m',
labels: {
@@ -619,11 +639,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
alert: $.alertName('RulerTooManyFailedPushes'),
expr: |||
100 * (
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_failed_total[1m]))
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_failed_total[%(range_interval)s]))
/
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_total[1m]))
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_total[%(range_interval)s]))
) > 1
||| % $._config,
||| % $._config {
range_interval: $.alertRangeInterval(1),
},
'for': '5m',
labels: {
severity: 'critical',
@@ -638,11 +660,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
alert: $.alertName('RulerTooManyFailedQueries'),
expr: |||
100 * (
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_failed_total[1m]))
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_failed_total[%(range_interval)s]))
/
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_total[1m]))
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_total[%(range_interval)s]))
) > 1
||| % $._config,
||| % $._config {
range_interval: $.alertRangeInterval(1),
},
'for': '5m',
labels: {
severity: 'critical',
@@ -657,11 +681,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
alert: $.alertName('RulerMissedEvaluations'),
expr: |||
100 * (
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m]))
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[%(range_interval)s]))
/
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m]))
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[%(range_interval)s]))
) > 1
||| % $._config,
||| % $._config {
range_interval: $.alertRangeInterval(1),
},
'for': '5m',
labels: {
severity: 'warning',
@@ -675,9 +701,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
alert: $.alertName('RulerFailedRingCheck'),
expr: |||
sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ruler_ring_check_errors_total[1m]))
sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ruler_ring_check_errors_total[%(range_interval)s]))
> 0
||| % $._config,
||| % $._config {
range_interval: $.alertRangeInterval(1),
},
'for': '5m',
labels: {
severity: 'critical',
@@ -692,11 +720,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
alert: $.alertName('RulerRemoteEvaluationFailing'),
expr: |||
100 * (
sum by (%s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", %s}[5m]))
sum by (%(alert_aggregation_labels)s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", %(job_regex)s}[%(range_interval)s]))
/
sum by (%s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", %s}[5m]))
sum by (%(alert_aggregation_labels)s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", %(job_regex)s}[%(range_interval)s]))
) > 1
||| % [$._config.alert_aggregation_labels, $.jobMatcher($._config.job_names.ruler_query_frontend), $._config.alert_aggregation_labels, $.jobMatcher($._config.job_names.ruler_query_frontend)],
||| % {
alert_aggregation_labels: $._config.alert_aggregation_labels,
job_regex: $.jobMatcher($._config.job_names.ruler_query_frontend),
range_interval: $.alertRangeInterval(5),
},
'for': '5m',
labels: {
severity: 'warning',
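One detail from the rollout alerts above that is easy to miss: for the `changes(...[15m:1m])` subqueries, only the subquery resolution step goes through the helper, while the outer 15m range stays hardcoded. A sketch of the expansion, again assuming a default base of 1:

    // Sketch of the subquery range built as '15m:' + $.alertRangeInterval(1);
    // the base default of 1 is an assumption.
    local alertRangeInterval(multiple, base=1) = (base * multiple) + 'm';
    {
      default_range: '15m:' + alertRangeInterval(1),         // "15m:1m"
      scaled_range: '15m:' + alertRangeInterval(1, base=3),  // "15m:3m" -- outer window unchanged
    }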
3 changes: 2 additions & 1 deletion operations/mimir-mixin/alerts/autoscaling.libsonnet
@@ -40,13 +40,14 @@
expr: |||
(
# Find KEDA scalers reporting errors.
label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)")
label_replace(rate(keda_scaler_errors[%(range_interval)s]), "namespace", "$1", "exported_namespace", "(.*)")
# Match only Mimir namespaces.
* on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info)
)
> 0
||| % {
aggregation_labels: $._config.alert_aggregation_labels,
range_interval: $.alertRangeInterval(5),
},
labels: {
severity: 'critical',