From e5f0737d0189871051795dc99f84e38662565e11 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Wed, 29 Jul 2020 12:22:45 -0400 Subject: [PATCH] format changelog Signed-off-by: Ben Ye --- CHANGELOG.md | 6 +- examples/alerts/alerts.md | 102 ++++++++++++++++++++++------------ examples/alerts/alerts.yaml | 108 ++++++++++++++++++++++++------------ pkg/replicate/scheme.go | 2 +- 4 files changed, 144 insertions(+), 74 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 478ab6d9461..599202ab219 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,9 +13,6 @@ We use *breaking* word for marking changes that are not backward compatible (rel ### Changed -- [#2906](https://github.com/thanos-io/thanos/pull/2906) Tools: Refactor Bucket replicate execution. Removed all `thanos_replicate_origin_.*` metrics. - - `thanos_replicate_origin_meta_loads_total` can be replaced by `blocks_meta_synced{state="loaded"}`. - - `thanos_replicate_origin_partial_meta_reads_total` can be replaced by `blocks_meta_synced{state="failed"}`. ### Fixed @@ -49,6 +46,9 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#2902](https://github.com/thanos-io/thanos/pull/2902) ui: React: Separate dedupe and partial response checkboxes per panel. - [#2931](https://github.com/thanos-io/thanos/pull/2931) Query: Allow passing a `storeMatcher[]` to select matching stores when debugging the querier. See [documentation](https://thanos.io/components/query.md/#store-filtering) - [#2991](https://github.com/thanos-io/thanos/pull/2991) store: `operation` label value `getrange` changed to `get_range` for `thanos_store_bucket_cache_operation_requests_total` and `thanos_store_bucket_cache_operation_hits_total` to be consistent with bucket operation metrics. +- [#2906](https://github.com/thanos-io/thanos/pull/2906) Tools: Refactor Bucket replicate execution. Removed all `thanos_replicate_origin_.*` metrics. + - `thanos_replicate_origin_meta_loads_total` can be replaced by `blocks_meta_synced{state="loaded"}`. + - `thanos_replicate_origin_partial_meta_reads_total` can be replaced by `blocks_meta_synced{state="failed"}`. ## [v0.14.0](https://github.com/thanos-io/thanos/releases/tag/v0.14.0) - 2020.07.10 diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 1c30529f5bc..cb40e2a9f71 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -10,7 +10,8 @@ name: thanos-compact.rules rules: - alert: ThanosCompactMultipleRunning annotations: - message: No more than one Thanos Compact instance should be running at once. There are {{ $value }} + message: No more than one Thanos Compact instance should be running at once. There + are {{ $value }} expr: sum(up{job=~"thanos-compact.*"}) > 1 for: 5m labels: @@ -24,7 +25,8 @@ rules: severity: warning - alert: ThanosCompactHighCompactionFailures annotations: - message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions. + message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize + }}% of compactions. expr: | ( sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m])) @@ -37,7 +39,8 @@ rules: severity: warning - alert: ThanosCompactBucketHighOperationFailures annotations: - message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations. + message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value + | humanize }}% of operations.
expr: | ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m])) @@ -51,7 +54,8 @@ rules: - alert: ThanosCompactHasNotRun annotations: message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. - expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24 + expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) + / 60 / 60 > 24 labels: severity: warning ``` @@ -74,7 +78,8 @@ rules: severity: critical - alert: ThanosRuleSenderIsFailingAlerts annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager. + message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts + to alertmanager. expr: | sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 for: 5m @@ -95,7 +100,8 @@ rules: severity: critical - alert: ThanosRuleHighRuleEvaluationWarnings annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings. + message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation + warnings. expr: | sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0 for: 15m @@ -103,7 +109,8 @@ rules: severity: info - alert: ThanosRuleRuleEvaluationLatencyHigh annotations: - message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}. + message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency + than interval for {{$labels.rule_group}}. expr: | ( sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"}) @@ -115,7 +122,8 @@ rules: severity: warning - alert: ThanosRuleGrpcErrorRate annotations: - message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m])) @@ -129,13 +137,15 @@ rules: - alert: ThanosRuleConfigReloadFailure annotations: message: Thanos Rule {{$labels.job}} has not been able to reload its configuration. - expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1 + expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) + != 1 for: 5m labels: severity: info - alert: ThanosRuleQueryHighDNSFailures annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints. + message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS + queries for query endpoints. expr: | ( sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m])) @@ -148,7 +158,8 @@ rules: severity: warning - alert: ThanosRuleAlertmanagerHighDNSFailures annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints. + message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS + queries for Alertmanager endpoints. 
expr: | ( sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m])) @@ -161,7 +172,8 @@ rules: severity: warning - alert: ThanosRuleNoEvaluationFor10Intervals annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval. + message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups + that did not evaluate for at least 10x of their expected interval. expr: | time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}) > @@ -171,7 +183,8 @@ rules: severity: info - alert: ThanosNoRuleEvaluations annotations: - message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes. + message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the + past 2 minutes. expr: | sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0 and @@ -189,7 +202,8 @@ name: thanos-store.rules rules: - alert: ThanosStoreGrpcErrorRate annotations: - message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m])) @@ -202,7 +216,8 @@ rules: severity: warning - alert: ThanosStoreSeriesGateLatencyHigh annotations: - message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests. + message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for store series gate requests. expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 @@ -214,7 +229,8 @@ rules: severity: warning - alert: ThanosStoreBucketHighOperationFailures annotations: - message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations. + message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | + humanize }}% of operations. expr: | ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) @@ -227,7 +243,8 @@ rules: severity: warning - alert: ThanosStoreObjstoreOperationLatencyHigh annotations: - message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations. + message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of + {{ $value }} seconds for the bucket operations. expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 @@ -255,7 +272,8 @@ rules: severity: critical - alert: ThanosSidecarUnhealthy annotations: - message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds. + message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value + }} seconds. 
expr: | count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 labels: @@ -270,7 +288,8 @@ name: thanos-query.rules rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests. + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of "query" requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m])) @@ -282,7 +301,8 @@ rules: severity: critical - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests. + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of "query_range" requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m])) @@ -294,7 +314,8 @@ rules: severity: critical - alert: ThanosQueryGrpcServerErrorRate annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m])) @@ -307,7 +328,8 @@ rules: severity: warning - alert: ThanosQueryGrpcClientErrorRate annotations: - message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests. + message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize + }}% of requests. expr: | ( sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m])) @@ -319,7 +341,8 @@ rules: severity: warning - alert: ThanosQueryHighDNSFailures annotations: - message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints. + message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing + DNS queries for store endpoints. expr: | ( sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m])) @@ -331,7 +354,8 @@ rules: severity: warning - alert: ThanosQueryInstantLatencyHigh annotations: - message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries. + message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for instant queries. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40 @@ -343,7 +367,8 @@ rules: severity: critical - alert: ThanosQueryRangeLatencyHigh annotations: - message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries. + message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for range queries. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90 @@ -363,7 +388,8 @@ name: thanos-receive.rules rules: - alert: ThanosReceiveHttpRequestErrorRateHigh annotations: - message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. 
+ message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m])) @@ -375,7 +401,8 @@ rules: severity: critical - alert: ThanosReceiveHttpRequestLatencyHigh annotations: - message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests. + message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for requests. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10 @@ -387,7 +414,8 @@ rules: severity: critical - alert: ThanosReceiveHighReplicationFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests. + message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize + }}% of requests. expr: | thanos_receive_replication_factor > 1 and @@ -409,7 +437,8 @@ rules: severity: warning - alert: ThanosReceiveHighForwardRequestFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests. + message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize + }}% of requests. expr: | ( sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m])) @@ -421,7 +450,8 @@ rules: severity: warning - alert: ThanosReceiveHighHashringFileRefreshFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed. + message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ + $value | humanize }} of attempts failed. expr: | ( sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m])) @@ -435,13 +465,15 @@ rules: - alert: ThanosReceiveConfigReloadFailure annotations: message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations. - expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1 + expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) + by (job) != 1 for: 5m labels: severity: warning - alert: ThanosReceiveNoUpload annotations: - message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage. + message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded + latest data to object storage. expr: | (up{job=~"thanos-receive.*"} - 1) + on (instance) # filters to only alert on current instance last 3h @@ -467,7 +499,8 @@ rules: severity: critical - alert: ThanosBucketReplicateErrorRate annotations: - message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed. + message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts + failed. expr: | ( sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m])) @@ -479,7 +512,8 @@ rules: severity: critical - alert: ThanosBucketReplicateRunLatency annotations: - message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations. + message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for the replicate operations. 
expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 4f8a7b57105..72c3279e490 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -3,7 +3,8 @@ groups: rules: - alert: ThanosCompactMultipleRunning annotations: - message: No more than one Thanos Compact instance should be running at once. There are {{ $value }} + message: No more than one Thanos Compact instance should be running at once. + There are {{ $value }} expr: sum(up{job=~"thanos-compact.*"}) > 1 for: 5m labels: @@ -17,7 +18,8 @@ groups: severity: warning - alert: ThanosCompactHighCompactionFailures annotations: - message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions. + message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize + }}% of compactions. expr: | ( sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m])) @@ -30,7 +32,8 @@ groups: severity: warning - alert: ThanosCompactBucketHighOperationFailures annotations: - message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations. + message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value + | humanize }}% of operations. expr: | ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m])) @@ -44,14 +47,16 @@ groups: - alert: ThanosCompactHasNotRun annotations: message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. - expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24 + expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) + / 60 / 60 > 24 labels: severity: warning - name: thanos-query.rules rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests. + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of "query" requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m])) @@ -63,7 +68,8 @@ groups: severity: critical - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests. + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of "query_range" requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m])) @@ -75,7 +81,8 @@ groups: severity: critical - alert: ThanosQueryGrpcServerErrorRate annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m])) @@ -88,7 +95,8 @@ groups: severity: warning - alert: ThanosQueryGrpcClientErrorRate annotations: - message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests. 
+ message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize + }}% of requests. expr: | ( sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m])) @@ -100,7 +108,8 @@ groups: severity: warning - alert: ThanosQueryHighDNSFailures annotations: - message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints. + message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing + DNS queries for store endpoints. expr: | ( sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m])) @@ -112,7 +121,8 @@ groups: severity: warning - alert: ThanosQueryInstantLatencyHigh annotations: - message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries. + message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for instant queries. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40 @@ -124,7 +134,8 @@ groups: severity: critical - alert: ThanosQueryRangeLatencyHigh annotations: - message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries. + message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for range queries. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90 @@ -138,7 +149,8 @@ groups: rules: - alert: ThanosReceiveHttpRequestErrorRateHigh annotations: - message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m])) @@ -150,7 +162,8 @@ groups: severity: critical - alert: ThanosReceiveHttpRequestLatencyHigh annotations: - message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests. + message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for requests. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10 @@ -162,7 +175,8 @@ groups: severity: critical - alert: ThanosReceiveHighReplicationFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests. + message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | + humanize }}% of requests. expr: | thanos_receive_replication_factor > 1 and @@ -184,7 +198,8 @@ groups: severity: warning - alert: ThanosReceiveHighForwardRequestFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests. + message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize + }}% of requests. expr: | ( sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m])) @@ -196,7 +211,8 @@ groups: severity: warning - alert: ThanosReceiveHighHashringFileRefreshFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed. 
+ message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, + {{ $value | humanize }} of attempts failed. expr: | ( sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m])) @@ -209,14 +225,17 @@ groups: severity: warning - alert: ThanosReceiveConfigReloadFailure annotations: - message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations. - expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1 + message: Thanos Receive {{$labels.job}} has not been able to reload hashring + configurations. + expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) + by (job) != 1 for: 5m labels: severity: warning - alert: ThanosReceiveNoUpload annotations: - message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage. + message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded + latest data to object storage. expr: | (up{job=~"thanos-receive.*"} - 1) + on (instance) # filters to only alert on current instance last 3h @@ -236,7 +255,8 @@ groups: severity: critical - alert: ThanosSidecarUnhealthy annotations: - message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds. + message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ + $value }} seconds. expr: | count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 labels: @@ -245,7 +265,8 @@ groups: rules: - alert: ThanosStoreGrpcErrorRate annotations: - message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m])) @@ -258,7 +279,8 @@ groups: severity: warning - alert: ThanosStoreSeriesGateLatencyHigh annotations: - message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests. + message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for store series gate requests. expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 @@ -270,7 +292,8 @@ groups: severity: warning - alert: ThanosStoreBucketHighOperationFailures annotations: - message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations. + message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value + | humanize }}% of operations. expr: | ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) @@ -283,7 +306,8 @@ groups: severity: warning - alert: ThanosStoreObjstoreOperationLatencyHigh annotations: - message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations. + message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of + {{ $value }} seconds for the bucket operations. 
expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 @@ -305,7 +329,8 @@ groups: severity: critical - alert: ThanosRuleSenderIsFailingAlerts annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager. + message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts + to alertmanager. expr: | sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 for: 5m @@ -313,7 +338,8 @@ groups: severity: critical - alert: ThanosRuleHighRuleEvaluationFailures annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules. + message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate + rules. expr: | ( sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m])) @@ -326,7 +352,8 @@ groups: severity: critical - alert: ThanosRuleHighRuleEvaluationWarnings annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings. + message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation + warnings. expr: | sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0 for: 15m @@ -334,7 +361,8 @@ groups: severity: info - alert: ThanosRuleRuleEvaluationLatencyHigh annotations: - message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}. + message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency + than interval for {{$labels.rule_group}}. expr: | ( sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"}) @@ -346,7 +374,8 @@ groups: severity: warning - alert: ThanosRuleGrpcErrorRate annotations: - message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m])) @@ -360,13 +389,15 @@ groups: - alert: ThanosRuleConfigReloadFailure annotations: message: Thanos Rule {{$labels.job}} has not been able to reload its configuration. - expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1 + expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by + (job) != 1 for: 5m labels: severity: info - alert: ThanosRuleQueryHighDNSFailures annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints. + message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing + DNS queries for query endpoints. expr: | ( sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m])) @@ -379,7 +410,8 @@ groups: severity: warning - alert: ThanosRuleAlertmanagerHighDNSFailures annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints. + message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing + DNS queries for Alertmanager endpoints. 
expr: | ( sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m])) @@ -392,7 +424,8 @@ groups: severity: warning - alert: ThanosRuleNoEvaluationFor10Intervals annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval. + message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups + that did not evaluate for at least 10x of their expected interval. expr: | time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}) > @@ -402,7 +435,8 @@ groups: severity: info - alert: ThanosNoRuleEvaluations annotations: - message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes. + message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in + the past 2 minutes. expr: | sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0 and @@ -472,7 +506,8 @@ groups: severity: critical - alert: ThanosBucketReplicateErrorRate annotations: - message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed. + message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts + failed. expr: | ( sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m])) @@ -484,7 +519,8 @@ groups: severity: critical - alert: ThanosBucketReplicateRunLatency annotations: - message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations. + message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for the replicate operations. expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 diff --git a/pkg/replicate/scheme.go b/pkg/replicate/scheme.go index 034f8d99067..86e65040dcf 100644 --- a/pkg/replicate/scheme.go +++ b/pkg/replicate/scheme.go @@ -59,7 +59,6 @@ func NewBlockFilter( // Filter return true if block is non-compacted and matches selector. func (bf *BlockFilter) Filter(b *metadata.Meta) bool { - // TODO(bwplotka): Allow injecting custom labels as shipper does. if len(b.Thanos.Labels) == 0 { level.Error(bf.logger).Log("msg", "filtering block", "reason", "labels should not be empty") return false @@ -226,6 +225,7 @@ func (rs *replicationScheme) ensureBlockIsReplicated(ctx context.Context, id uli return errors.Wrap(err, "get meta file from target bucket") } + // TODO(bwplotka): Allow injecting custom labels as shipper does. originMetaFileContent, err := ioutil.ReadAll(originMetaFile) if err != nil { return errors.Wrap(err, "read origin meta file")