Skip to content

Commit

Permalink
mixin: Make alert threshold values parametric (#2317)
Browse files Browse the repository at this point in the history
* Make alert threshold values parametric

Signed-off-by: Kemal Akkoyun <[email protected]>

* Rename variable

Signed-off-by: Kemal Akkoyun <[email protected]>

* Adjust default values for latency thresholds

Signed-off-by: Kemal Akkoyun <[email protected]>
  • Loading branch information
kakkoyun authored Mar 26, 2020
1 parent 214ff44 commit 14c9403
Show file tree
Hide file tree
Showing 8 changed files with 51 additions and 29 deletions.
6 changes: 3 additions & 3 deletions examples/alerts/alerts.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ rules:
{{ $value }} seconds for the bucket operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 15
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
and
sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0
)
Expand Down Expand Up @@ -336,7 +336,7 @@ rules:
}} seconds for instant queries.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 90
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
and
sum by (job) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) > 0
)
Expand Down Expand Up @@ -461,7 +461,7 @@ rules:
$value }} seconds for the replicate operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 120
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
and
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0
)
Expand Down
6 changes: 3 additions & 3 deletions examples/alerts/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ groups:
}} seconds for instant queries.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 90
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
and
sum by (job) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) > 0
)
Expand Down Expand Up @@ -277,7 +277,7 @@ groups:
{{ $value }} seconds for the bucket operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 15
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
and
sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0
)
Expand Down Expand Up @@ -468,7 +468,7 @@ groups:
$value }} seconds for the replicate operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 120
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
and
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0
)
Expand Down
6 changes: 4 additions & 2 deletions mixin/thanos/alerts/bucket_replicate.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
bucket_replicate+:: {
jobPrefix: error 'must provide job prefix for Thanos Bucket Replicate dashboard',
selector: error 'must provide selector for Thanos Bucket Replicate dashboard',
errorThreshold: 10,
p99LatencyThreshold: 20,
},
prometheusAlerts+:: {
groups+: [
Expand Down Expand Up @@ -32,7 +34,7 @@
sum(rate(thanos_replicate_replication_runs_total{result="error", %(selector)s}[5m]))
/ on (namespace) group_left
sum(rate(thanos_replicate_replication_runs_total{%(selector)s}[5m]))
) * 100 >= 10
) * 100 >= %(errorThreshold)s
||| % thanos.bucket_replicate,
'for': '5m',
labels: {
Expand All @@ -46,7 +48,7 @@
},
expr: |||
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m]))) > 120
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m]))) > %(p99LatencyThreshold)s
and
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m])) > 0
)
Expand Down
6 changes: 4 additions & 2 deletions mixin/thanos/alerts/compact.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
compact+:: {
jobPrefix: error 'must provide job prefix for Thanos Compact alerts',
selector: error 'must provide selector for Thanos Compact alerts',
compactionErrorThreshold: 5,
bucketOpsErrorThreshold: 5,
},
prometheusAlerts+:: {
groups+: [
Expand Down Expand Up @@ -41,7 +43,7 @@
sum by (job) (rate(thanos_compact_group_compactions_failures_total{%(selector)s}[5m]))
/
sum by (job) (rate(thanos_compact_group_compactions_total{%(selector)s}[5m]))
* 100 > 5
* 100 > %(compactionErrorThreshold)s
)
||| % thanos.compact,
'for': '15m',
Expand All @@ -59,7 +61,7 @@
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m]))
/
sum by (job) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m]))
* 100 > 5
* 100 > %(bucketOpsErrorThreshold)s
)
||| % thanos.compact,
'for': '15m',
Expand Down
19 changes: 12 additions & 7 deletions mixin/thanos/alerts/query.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
query+:: {
jobPrefix: error 'must provide job prefix for Thanos Query alerts',
selector: error 'must provide selector for Thanos Query alerts',
httpErrorThreshold: 5,
grpcErrorThreshold: 5,
dnsErrorThreshold: 1,
p99QueryLatencyThreshold: 40,
p99QueryRangeLatencyThreshold: 90,
},
prometheusAlerts+:: {
groups+: [
Expand All @@ -19,7 +24,7 @@
sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="query"}[5m]))
/
sum(rate(http_requests_total{%(selector)s, handler="query"}[5m]))
) * 100 > 5
) * 100 > %(httpErrorThreshold)s
||| % thanos.query,
'for': '5m',
labels: {
Expand All @@ -36,7 +41,7 @@
sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="query_range"}[5m]))
/
sum(rate(http_requests_total{%(selector)s, handler="query_range"}[5m]))
) * 100 > 5
) * 100 > %(httpErrorThreshold)s
||| % thanos.query,
'for': '5m',
labels: {
Expand All @@ -53,7 +58,7 @@
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
/
sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m]))
* 100 > 5
* 100 > %(grpcErrorThreshold)s
)
||| % thanos.query,
'for': '5m',
Expand All @@ -71,7 +76,7 @@
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", %(selector)s}[5m]))
/
sum by (job) (rate(grpc_client_started_total{%(selector)s}[5m]))
) * 100 > 5
) * 100 > %(grpcErrorThreshold)s
||| % thanos.query,
'for': '5m',
labels: {
Expand All @@ -88,7 +93,7 @@
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{%(selector)s}[5m]))
/
sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{%(selector)s}[5m]))
) * 100 > 1
) * 100 > %(dnsErrorThreshold)s
||| % thanos.query,
'for': '15m',
labels: {
Expand All @@ -102,7 +107,7 @@
},
expr: |||
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m]))) > 90
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m]))) > %(p99QueryLatencyThreshold)s
and
sum by (job) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m])) > 0
)
Expand All @@ -119,7 +124,7 @@
},
expr: |||
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m]))) > 90
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m]))) > %(p99QueryRangeLatencyThreshold)s
and
sum by (job) (rate(http_request_duration_seconds_count{%(selector)s, handler="query_range"}[5m])) > 0
)
Expand Down
12 changes: 8 additions & 4 deletions mixin/thanos/alerts/receive.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
receive+:: {
jobPrefix: error 'must provide job prefix for Thanos Receive alerts',
selector: error 'must provide selector for Thanos Receive alerts',
httpErrorThreshold: 5,
forwardErrorThreshold: 5,
refreshErrorThreshold: 0,
p99LatencyThreshold: 10,
},
prometheusAlerts+:: {
groups+: [
Expand All @@ -19,7 +23,7 @@
sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="receive"}[5m]))
/
sum(rate(http_requests_total{%(selector)s, handler="receive"}[5m]))
) * 100 > 5
) * 100 > %(httpErrorThreshold)s
||| % thanos.receive,
'for': '5m',
labels: {
Expand All @@ -33,7 +37,7 @@
},
expr: |||
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="receive"}[5m]))) > 10
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="receive"}[5m]))) > %(p99LatencyThreshold)s
and
sum by (job) (rate(http_request_duration_seconds_count{%(selector)s, handler="receive"}[5m])) > 0
)
Expand All @@ -53,7 +57,7 @@
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", %(selector)s}[5m]))
/
sum by (job) (rate(thanos_receive_forward_requests_total{%(selector)s}[5m]))
* 100 > 5
* 100 > %(forwardErrorThreshold)s
)
||| % thanos.receive,
'for': '5m',
Expand All @@ -71,7 +75,7 @@
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{%(selector)s}[5m]))
/
sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{%(selector)s}[5m]))
> 0
> %(refreshErrorThreshold)s
)
||| % thanos.receive,
'for': '15m',
Expand Down
12 changes: 8 additions & 4 deletions mixin/thanos/alerts/rule.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
rule+:: {
jobPrefix: error 'must provide job prefix for Thanos Rule alerts',
selector: error 'must provide selector for Thanos Rule alerts',
grpcErrorThreshold: 5,
rulerDnsErrorThreshold: 1,
alertManagerDnsErrorThreshold: 1,
evalErrorThreshold: 5,
},
prometheusAlerts+:: {
groups+: [
Expand Down Expand Up @@ -45,7 +49,7 @@
sum by (job) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[5m]))
/
sum by (job) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m]))
* 100 > 5
* 100 > %(evalErrorThreshold)s
)
||| % thanos.rule,

Expand Down Expand Up @@ -95,7 +99,7 @@
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
/
sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m]))
* 100 > 5
* 100 > %(grpcErrorThreshold)s
)
||| % thanos.rule,
'for': '5m',
Expand Down Expand Up @@ -124,7 +128,7 @@
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{%(selector)s}[5m]))
/
sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{%(selector)s}[5m]))
* 100 > 1
* 100 > %(rulerDnsErrorThreshold)s
)
||| % thanos.rule,
'for': '15m',
Expand All @@ -142,7 +146,7 @@
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{%(selector)s}[5m]))
/
sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{%(selector)s}[5m]))
* 100 > 1
* 100 > %(alertManagerDnsErrorThreshold)s
)
||| % thanos.rule,
'for': '15m',
Expand Down
13 changes: 9 additions & 4 deletions mixin/thanos/alerts/store.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
store+:: {
jobPrefix: error 'must provide job prefix for Thanos Store alerts',
selector: error 'must provide selector for Thanos Store alerts',
grpcErrorThreshold: 5,
compactionErrorThreshold: 5,
seriesGateErrorThreshold: 2,
bucketOpsErrorThreshold: 5,
bucketOpsP99LatencyThreshold: 2,
},
prometheusAlerts+:: {
groups+: [
Expand All @@ -19,7 +24,7 @@
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
/
sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m]))
* 100 > 5
* 100 > %(grpcErrorThreshold)s
)
||| % thanos.store,
'for': '5m',
Expand All @@ -34,7 +39,7 @@
},
expr: |||
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{%(selector)s}[5m]))) > 2
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{%(selector)s}[5m]))) > %(seriesGateErrorThreshold)s
and
sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{%(selector)s}[5m])) > 0
)
Expand All @@ -54,7 +59,7 @@
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m]))
/
sum by (job) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m]))
* 100 > 5
* 100 > %(bucketOpsErrorThreshold)s
)
||| % thanos.store,
'for': '15m',
Expand All @@ -69,7 +74,7 @@
},
expr: |||
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m]))) > 15
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m]))) > %(bucketOpsP99LatencyThreshold)s
and
sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{%(selector)s}[5m])) > 0
)
Expand Down

0 comments on commit 14c9403

Please sign in to comment.