Bug 2013243: jsonnet: ignore Alarm method in etcdGRPCRequestsSlow alerts #691

Closed
15 changes: 15 additions & 0 deletions jsonnet/custom.libsonnet
@@ -4,6 +4,21 @@
{
name: 'openshift-etcd.rules',
rules: [
{
alert: 'etcdGRPCRequestsSlow',
expr: |||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd", grpc_method!~"Defragment|Alarm", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
|||,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd grpc requests are slow',
},
},
{
alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: |||
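For context on the new matcher: Prometheus label regexes are fully anchored, so grpc_method!~"Defragment|Alarm" drops exactly the Defragment and Alarm RPCs from the latency histogram before the 99th percentile is computed. A minimal illustration with hypothetical series (only the matcher behaviour matters here):

# kept by grpc_method!~"Defragment|Alarm"
grpc_server_handling_seconds_bucket{job="etcd", grpc_method="Range", grpc_type="unary", le="0.1"}

# dropped: the anchored regex matches the whole label value
grpc_server_handling_seconds_bucket{job="etcd", grpc_method="Alarm", grpc_type="unary", le="0.1"}
grpc_server_handling_seconds_bucket{job="etcd", grpc_method="Defragment", grpc_type="unary", le="0.1"}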
2 changes: 1 addition & 1 deletion jsonnet/main.jsonnet
@@ -6,7 +6,7 @@ local promRules = if std.objectHasAll(etcdMixin, 'prometheusRules') then etcdMix

// Exclude rules that are either OpenShift specific or do not work for OpenShift.
// List should be ordered!
local excludedAlerts = ['etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers'];
local excludedAlerts = ['etcdGRPCRequestsSlow','etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers'];
local excludeRules = std.map(
function(group) group {
rules: std.filter(
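The hunk above is truncated; a minimal sketch of what the complete filter presumably looks like (the std.filter body and the promRules.groups field are assumed here, not shown in the diff). std.setMember does a binary search over a sorted array, which is why the comment insists the list stays ordered:

local excludeRules = std.map(
  function(group) group {
    rules: std.filter(
      // keep a rule unless it is an alert whose name is in excludedAlerts
      function(rule) !(std.objectHas(rule, 'alert') && std.setMember(rule.alert, excludedAlerts)),
      group.rules,
    ),
  },
  promRules.groups,
);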
70 changes: 23 additions & 47 deletions manifests/0000_90_etcd-operator_03_prometheusrule.yaml
@@ -13,8 +13,7 @@ spec:
rules:
- alert: etcdMembersDown
annotations:
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
}}).'
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdMembersDown.md
summary: etcd cluster members are down.
expr: |
@@ -31,32 +30,17 @@ spec:
severity: critical
- alert: etcdNoLeader
annotations:
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance
}} has no leader.'
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdNoLeader.md
summary: etcd cluster has no leader.
expr: |
etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
is {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdGRPCRequestsSlow.md
summary: etcd grpc requests are slow
expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": member communication with
{{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
}}.'
description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster member communication is slow.
expr: |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
@@ -66,8 +50,7 @@ spec:
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
summary: etcd cluster has high number of proposal failures.
expr: |
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
@@ -76,8 +59,7 @@ spec:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
@@ -87,8 +69,7 @@ spec:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
@@ -98,8 +79,7 @@ spec:
severity: critical
- alert: etcdHighCommitDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
{{ $value }}s on etcd instance {{ $labels.instance }}.'
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile commit durations are too high.
expr: |
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
@@ -109,9 +89,7 @@ spec:
severity: warning
- alert: etcdBackendQuotaLowSpace
annotations:
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the
defined quota on etcd instance {{ $labels.instance }}, please defrag or
increase the quota as the writes to etcd will be disabled when it is full.'
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdBackendQuotaLowSpace.md
expr: |
(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95
@@ -120,16 +98,25 @@ spec:
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes
leading to 50% increase in database size over the past four hours on etcd
instance {{ $labels.instance }}, please check as it might be disruptive.'
description: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
Contributor


The way forward since v1.0.0, however, seems to be non-wrapped lines, so we should probably stick to that.

Yes, I think this formatting change was due to @hexfusion maybe having an outdated gojsontoyaml install during recent changes? Maybe we can add another check for this here https://github.com/openshift/cluster-etcd-operator/blob/master/hack/generate.sh#L5?

Contributor


Yeah, I will keep that in mind, but we should bake it into the script.
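Something like the following could be baked in — a tool check inside hack/generate.sh plus a drift check that CI could run afterwards (a rough sketch; the paths and the install command are assumptions, not taken from this repo):

# inside hack/generate.sh: fail early if the generator is missing,
# so everyone regenerates manifests with the same tool
command -v gojsontoyaml >/dev/null 2>&1 || {
  echo "gojsontoyaml not found; try: go install github.com/brancz/gojsontoyaml@latest" >&2
  exit 1
}

# separate verify step (e.g. in CI): regenerate and fail on formatting drift
./hack/generate.sh
git diff --exit-code -- manifests/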

expr: |
increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50
for: 10m
labels:
severity: warning
- name: openshift-etcd.rules
rules:
- alert: etcdGRPCRequestsSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdGRPCRequestsSlow.md
summary: etcd grpc requests are slow
expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd", grpc_method!~"Defragment|Alarm", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
@@ -157,10 +144,7 @@ spec:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
within the last 15 minutes. Frequent elections may be a sign of insufficient
resources, high network latency, or disruptions by other components and
should be investigated.'
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
summary: etcd cluster has high number of leader changes.
expr: |
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 5
@@ -169,18 +153,10 @@ spec:
severity: warning
- alert: etcdInsufficientMembers
annotations:
description: etcd is reporting fewer instances are available than are needed
({{ $value }}). When etcd does not have a majority of instances available
the Kubernetes and OpenShift APIs will reject read and write requests and
operations that preserve the health of workloads cannot be performed. This
can occur when multiple control plane nodes are powered off or are unable
to connect to each other via the network. Check that all control plane nodes
are powered on and that network connections between each machine are functional.
description: etcd is reporting fewer instances are available than are needed ({{ $value }}). When etcd does not have a majority of instances available the Kubernetes and OpenShift APIs will reject read and write requests and operations that preserve the health of workloads cannot be performed. This can occur when multiple control plane nodes are powered off or are unable to connect to each other via the network. Check that all control plane nodes are powered on and that network connections between each machine are functional.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdInsufficientMembers.md
summary: etcd is reporting that a majority of instances are unavailable.
expr: sum(up{job="etcd"} == bool 1 and etcd_server_has_leader{job="etcd"} ==
bool 1) without (instance,pod) < ((count(up{job="etcd"}) without (instance,pod)
+ 1) / 2)
expr: sum(up{job="etcd"} == bool 1 and etcd_server_has_leader{job="etcd"} == bool 1) without (instance,pod) < ((count(up{job="etcd"}) without (instance,pod) + 1) / 2)
for: 3m
labels:
severity: critical
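As a quick sanity check on the quorum arithmetic above (illustrative numbers only, not part of the change): for a three-member cluster the right-hand side evaluates to (3 + 1) / 2 = 2, so etcdInsufficientMembers fires once fewer than two members are simultaneously up and reporting a leader, i.e. exactly when quorum is lost; for five members the threshold is (5 + 1) / 2 = 3.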