diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c088fd4fe5..771e9ed1aeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,9 +37,7 @@ ### Mixin * [CHANGE] Alerts: Removed obsolete `MimirQueriesIncorrect` alert that used test-exporter metrics. Test-exporter support was however removed in Mimir 2.0 release. #7774 -* [CHANGE] Fine-tuned `terminationGracePeriodSeconds` for the following components: #7364 - * Querier: changed from `30` to `180` - * Query-scheduler: changed from `30` to `180` +* [CHANGE] Alerts: Change threshold for `MimirBucketIndexNotUpdated` alert to fire before queries begin to fail due to bucket index age. #7879 * [FEATURE] Dashboards: added 'Remote ruler reads networking' dashboard. #7751 * [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591 * [ENHANCEMENT] Dashboards: Add panels for monitoring distributor and ingester when using ingest-storage. These panels are disabled by default, but can be enabled using `show_ingest_storage_panels: true` config option. Similarly existing panels used when distributors and ingesters use gRPC for forwarding requests can be disabled by setting `show_grpc_ingestion_panels: false`. #7670 #7699 @@ -61,6 +59,9 @@ ### Jsonnet * [CHANGE] Memcached: Change default read timeout for chunks and index caches to `750ms` from `450ms`. #7778 +* [CHANGE] Fine-tuned `terminationGracePeriodSeconds` for the following components: #7364 + * Querier: changed from `30` to `180` + * Query-scheduler: changed from `30` to `180` * [ENHANCEMENT] Compactor: add `$._config.cortex_compactor_concurrent_rollout_enabled` option (disabled by default) that makes use of rollout-operator to speed up the rollout of compactors. #7783 #7878 * [ENHANCEMENT] Shuffle-sharding: add `$._config.shuffle_sharding.ingest_storage_partitions_enabled` and `$._config.shuffle_sharding.ingester_partitions_shard_size` options, that allow configuring partitions shard size in ingest-storage mode. 
#7804 * [BUGFIX] Guard against missing samples in KEDA queries. #7691 diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 941ecb1ce29..95e65aea3cb 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -801,7 +801,7 @@ spec: }}. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated expr: | - min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 labels: severity: critical - name: mimir_compactor_alerts diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 0efa8546bbb..cd503f25a29 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -775,7 +775,7 @@ groups: }}. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated expr: | - min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 labels: severity: critical - name: mimir_compactor_alerts diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index ef05886356b..46541a667ff 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -789,7 +789,7 @@ groups: }}. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated expr: | - min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 labels: severity: critical - name: mimir_compactor_alerts diff --git a/operations/mimir-mixin/alerts/blocks.libsonnet b/operations/mimir-mixin/alerts/blocks.libsonnet index 28c3073e140..eeec268f026 100644 --- a/operations/mimir-mixin/alerts/blocks.libsonnet +++ b/operations/mimir-mixin/alerts/blocks.libsonnet @@ -220,10 +220,13 @@ }, }, { - // Alert if the bucket index has not been updated for a given user. + // Alert if the bucket index has not been updated for a given user. The default update interval is 900 seconds + // so we alert if we've missed two updates plus a 300-second buffer to avoid false positives. It's important + // that this alert fire before queriers start to return errors because the bucket index is too old (3600 seconds + // by default). 
alert: $.alertName('BucketIndexNotUpdated'), expr: ||| - min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 ||| % $._config, labels: { severity: 'critical',