Bug 2013243: jsonnet: ignore Alarm method in etcdGRPCRequestsSlow alerts #691

Closed
15 changes: 15 additions & 0 deletions jsonnet/custom.libsonnet
@@ -4,6 +4,21 @@
{
name: 'openshift-etcd.rules',
rules: [
{
alert: 'etcdGRPCRequestsSlow',
expr: |||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd", grpc_method!~"Defragment|Alarm", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
|||,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd grpc requests are slow',
},
},
{
alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: |||
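For context on the new matcher: Prometheus label regexes are fully anchored, so grpc_method!~"Defragment|Alarm" drops exactly the Defragment and Alarm RPCs from the latency histogram before the 99th percentile is computed. A minimal illustration with hypothetical series (only the matcher behaviour matters here):

# kept by grpc_method!~"Defragment|Alarm"
grpc_server_handling_seconds_bucket{job="etcd", grpc_method="Range", grpc_type="unary", le="0.1"}

# dropped: the anchored regex matches the whole label value
grpc_server_handling_seconds_bucket{job="etcd", grpc_method="Alarm", grpc_type="unary", le="0.1"}
grpc_server_handling_seconds_bucket{job="etcd", grpc_method="Defragment", grpc_type="unary", le="0.1"}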
2 changes: 1 addition & 1 deletion jsonnet/main.jsonnet
@@ -6,7 +6,7 @@ local promRules = if std.objectHasAll(etcdMixin, 'prometheusRules') then etcdMix

// Exclude rules that are either OpenShift specific or do not work for OpenShift.
// List should be ordered!
local excludedAlerts = ['etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers'];
local excludedAlerts = ['etcdGRPCRequestsSlow','etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers'];
local excludeRules = std.map(
function(group) group {
rules: std.filter(
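The hunk above is truncated; a minimal sketch of what the complete filter presumably looks like (the std.filter body and the promRules.groups field are assumed here, not shown in the diff). std.setMember does a binary search over a sorted array, which is why the comment insists the list stays ordered:

local excludeRules = std.map(
  function(group) group {
    rules: std.filter(
      // keep a rule unless it is an alert whose name is in excludedAlerts
      function(rule) !(std.objectHas(rule, 'alert') && std.setMember(rule.alert, excludedAlerts)),
      group.rules,
    ),
  },
  promRules.groups,
);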
70 changes: 23 additions & 47 deletions manifests/0000_90_etcd-operator_03_prometheusrule.yaml
@@ -13,8 +13,7 @@ spec:
rules:
- alert: etcdMembersDown
annotations:
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
}}).'
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdMembersDown.md
summary: etcd cluster members are down.
expr: |
@@ -31,32 +30,17 @@ spec:
severity: critical
- alert: etcdNoLeader
annotations:
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance
}} has no leader.'
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdNoLeader.md
summary: etcd cluster has no leader.
expr: |
etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
is {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdGRPCRequestsSlow.md
summary: etcd grpc requests are slow
expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": member communication with
{{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
}}.'
description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster member communication is slow.
expr: |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
@@ -66,8 +50,7 @@ spec:
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
summary: etcd cluster has high number of proposal failures.
expr: |
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
@@ -76,8 +59,7 @@ spec:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
@@ -87,8 +69,7 @@ spec:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
@@ -98,8 +79,7 @@ spec:
severity: critical
- alert: etcdHighCommitDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
{{ $value }}s on etcd instance {{ $labels.instance }}.'
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile commit durations are too high.
expr: |
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
@@ -109,9 +89,7 @@ spec:
severity: warning
- alert: etcdBackendQuotaLowSpace
annotations:
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the
defined quota on etcd instance {{ $labels.instance }}, please defrag or
increase the quota as the writes to etcd will be disabled when it is full.'
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdBackendQuotaLowSpace.md
expr: |
(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95
@@ -120,16 +98,25 @@ spec:
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes
leading to 50% increase in database size over the past four hours on etcd
instance {{ $labels.instance }}, please check as it might be disruptive.'
description: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
Contributor


The way forward since v1.0.0, however, seems to be non-wrapped lines, so we should probably stick to that.

Yes, I think this formatting change was due to @hexfusion maybe having an outdated gojsontoyaml install during recent changes? Maybe we can add another check for this here https://github.com/openshift/cluster-etcd-operator/blob/master/hack/generate.sh#L5?

Contributor


Yeah, I will keep that in mind, but we should bake it into the script.
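Something like the following could be baked in — a tool check inside hack/generate.sh plus a drift check that CI could run afterwards (a rough sketch; the paths and the install command are assumptions, not taken from this repo):

# inside hack/generate.sh: fail early if the generator is missing,
# so everyone regenerates manifests with the same tool
command -v gojsontoyaml >/dev/null 2>&1 || {
  echo "gojsontoyaml not found; try: go install github.com/brancz/gojsontoyaml@latest" >&2
  exit 1
}

# separate verify step (e.g. in CI): regenerate and fail on formatting drift
./hack/generate.sh
git diff --exit-code -- manifests/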

expr: |
increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50
for: 10m
labels:
severity: warning
- name: openshift-etcd.rules
rules:
- alert: etcdGRPCRequestsSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdGRPCRequestsSlow.md
summary: etcd grpc requests are slow
expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd", grpc_method!~"Defragment|Alarm", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
@@ -157,10 +144,7 @@ spec:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
within the last 15 minutes. Frequent elections may be a sign of insufficient
resources, high network latency, or disruptions by other components and
should be investigated.'
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
summary: etcd cluster has high number of leader changes.
expr: |
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 5
@@ -169,18 +153,10 @@ spec:
severity: warning
- alert: etcdInsufficientMembers
annotations:
description: etcd is reporting fewer instances are available than are needed
({{ $value }}). When etcd does not have a majority of instances available
the Kubernetes and OpenShift APIs will reject read and write requests and
operations that preserve the health of workloads cannot be performed. This
can occur when multiple control plane nodes are powered off or are unable
to connect to each other via the network. Check that all control plane nodes
are powered on and that network connections between each machine are functional.
description: etcd is reporting fewer instances are available than are needed ({{ $value }}). When etcd does not have a majority of instances available the Kubernetes and OpenShift APIs will reject read and write requests and operations that preserve the health of workloads cannot be performed. This can occur when multiple control plane nodes are powered off or are unable to connect to each other via the network. Check that all control plane nodes are powered on and that network connections between each machine are functional.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdInsufficientMembers.md
summary: etcd is reporting that a majority of instances are unavailable.
expr: sum(up{job="etcd"} == bool 1 and etcd_server_has_leader{job="etcd"} ==
bool 1) without (instance,pod) < ((count(up{job="etcd"}) without (instance,pod)
+ 1) / 2)
expr: sum(up{job="etcd"} == bool 1 and etcd_server_has_leader{job="etcd"} == bool 1) without (instance,pod) < ((count(up{job="etcd"}) without (instance,pod) + 1) / 2)
for: 3m
labels:
severity: critical
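As a quick sanity check on the quorum arithmetic above (illustrative numbers only, not part of the change): for a three-member cluster the right-hand side evaluates to (3 + 1) / 2 = 2, so etcdInsufficientMembers fires once fewer than two members are simultaneously up and reporting a leader, i.e. exactly when quorum is lost; for five members the threshold is (5 + 1) / 2 = 3.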