From 50392e181abccd5345f58f539ce7290fea48377f Mon Sep 17 00:00:00 2001 From: Ahmad Wilson Date: Tue, 9 Mar 2021 13:49:29 -0500 Subject: [PATCH] only forward severity:critical alerts to PagerDuty (otherwise forward to Discord) --- .../templates/testnet-alert-receivers.yml.tpl | 5 +++ .../templates/testnet-alert-rules.yml.tpl | 38 +++++++++---------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/automation/terraform/modules/testnet-alerts/templates/testnet-alert-receivers.yml.tpl b/automation/terraform/modules/testnet-alerts/templates/testnet-alert-receivers.yml.tpl index 0d3c95b0823..d9ff5b61eec 100644 --- a/automation/terraform/modules/testnet-alerts/templates/testnet-alert-receivers.yml.tpl +++ b/automation/terraform/modules/testnet-alerts/templates/testnet-alert-receivers.yml.tpl @@ -18,3 +18,8 @@ route: - receiver: pagerduty-testnet-primary match_re: testnet: ^(${pagerduty_alert_filter})$ + severity: critical + - receiver: discord-alert-default + match_re: + testnet: ^(${pagerduty_alert_filter})$ + severity: warning diff --git a/automation/terraform/modules/testnet-alerts/templates/testnet-alert-rules.yml.tpl b/automation/terraform/modules/testnet-alerts/templates/testnet-alert-rules.yml.tpl index d9e178ee819..43e8ec74a9b 100644 --- a/automation/terraform/modules/testnet-alerts/templates/testnet-alert-rules.yml.tpl +++ b/automation/terraform/modules/testnet-alerts/templates/testnet-alert-rules.yml.tpl @@ -13,7 +13,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} cluster nodes have crashed" - description: "Cluster nodes have crashed on network {{ $labels.testnet }}." + description: "{{ $value }} Cluster nodes have crashed on network {{ $labels.testnet }}." - alert: WatchdogNoNewLogs expr: max by (testnet) (Coda_watchdog_pods_with_no_new_logs) > 0 @@ -22,7 +22,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} has pods which have not logged in 10 minutes" - description: "There are no new logs in the last 10 minutes for some pods on network {{ $labels.testnet }}." + description: "There are no new logs in the last 10 minutes for {{ $value }} pods on network {{ $labels.testnet }}." - alert: SeedListDown expr: min by (testnet) (min_over_time(Coda_watchdog_seeds_reachable ${rule_filter} [${alerting_timeframe}])) == 0 @@ -39,8 +39,8 @@ groups: testnet: "{{ $labels.testnet }}" severity: critical annotations: - summary: "{{ $labels.testnet }} has no new blocks posted to the google block storage bucket recently (in the last 30 minutes)" - description: "No new blocks posted to the google storage bucket for {{ $labels.testnet }}." + summary: "{{ $labels.testnet }} has no new blocks posted to the google block storage bucket recently" + description: "{{ $value }} new blocks posted to the google storage bucket for {{ $labels.testnet }}." - alert: ProverErrors expr: max by (testnet) (max_over_time(Coda_watchdog_prover_errors_total ${rule_filter} [${alerting_timeframe}])) >= 0 @@ -49,7 +49,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} has observed a prover error" - description: "Prover error on network {{ $labels.testnet }}." + description: "{{ $value }} Prover errors on network {{ $labels.testnet }}." - alert: NodesNotSynced expr: min by (testnet) (min_over_time(Coda_watchdog_nodes_synced ${rule_filter} [${alerting_timeframe}])) <= .5 @@ -58,7 +58,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} has <= 50% of nodes synced" - description: "<= 50% nodes synced on network {{ $labels.testnet }}." + description: "Nodes sync rate of {{ $value }} is <= 50% on network {{ $labels.testnet }}." - alert: NodesOutOfSync expr: min by (testnet) (min_over_time(Coda_watchdog_nodes_synced_near_best_tip ${rule_filter} [${alerting_timeframe}])) < .9 @@ -67,7 +67,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} has < 90% of nodes that are synced on the same best tip" - description: "< 90% of nodes that are synced are on the same best tip for network {{ $labels.testnet }}." + description: "< 90% of nodes that are synced are on the same best tip for network {{ $labels.testnet }} with rate of {{ $value }}." - alert: LowPeerCount expr: min by (testnet) (Coda_Network_peers ${rule_filter}) < 3 @@ -76,7 +76,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} avg. peer count is critically low" - description: "Critically low peer count on network {{ $labels.testnet }}." + description: "Critically low peer count of {{ $value }} on network {{ $labels.testnet }}." - alert: LowMinWindowDensity expr: min by (testnet) (Coda_Transition_frontier_min_window_density ${rule_filter}) < 0.75 * 0.75 * 77 @@ -85,7 +85,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} min density is low" - description: "Critically low min density on network {{ $labels.testnet }}." + description: "Critically low min density of {{ $value }} on network {{ $labels.testnet }}." - alert: LowFillRate expr: min by (testnet) (Coda_Transition_frontier_slot_fill_rate ${rule_filter}) < 0.75 * 0.75 @@ -94,7 +94,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} avg. peer count is critically low" - description: "Lower fill rate than expected on network {{ $labels.testnet }}." + description: "Lower fill rate of {{ $value }} than expected on network {{ $labels.testnet }}." - alert: NoTransactionsInSeveralBlocks expr: max by (testnet) (Coda_Transition_frontier_empty_blocks_at_best_tip ${rule_filter}) >= 5 @@ -103,7 +103,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} has >= 5 blocks without transactions at the tip" - description: "At least 5 blocks without transactions on tip of network {{ $labels.testnet }}." + description: "{{ $value }} blocks without transactions on tip of network {{ $labels.testnet }}." - alert: NoCoinbaseInBlocks expr: min by (testnet) (min_over_time(Coda_Transition_frontier_best_tip_coinbase ${rule_filter} [10m])) < 1 @@ -112,7 +112,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} has blocks without coinbases" - description: "Blocks without coinbases on tip of network {{ $labels.testnet }}." + description: "{{ $value }} Blocks without coinbases on tip of network {{ $labels.testnet }}." - alert: LongFork expr: max by (testnet) (Coda_Transition_frontier_longest_fork ${rule_filter}) >= 16 @@ -121,7 +121,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }} has a fork of length at least 16" - description: "Fork of length at least 16 on network {{ $labels.testnet }}." + description: "Fork of length {{ $value }} on network {{ $labels.testnet }}." - alert: OldBestTip expr: min by (testnet) ((time() - 1609459200) - Coda_Transition_frontier_best_tip_slot_time_sec ${rule_filter}) >= 15 * 180 @@ -130,7 +130,7 @@ groups: severity: critical annotations: summary: "{{ $labels.testnet }}: all nodes have best tips older than 15 slots" - description: "All nodes have best tips older than 15 slots (45 minutes) on network {{ $labels.testnet }}." + description: "All nodes have best tips older than 15 slots (45 minutes) on network {{ $labels.testnet }}. Best tip: {{ $value }}" - alert: NoNewSnarks expr: min by (testnet) ((time() - 1609459200) - Coda_Snark_work_useful_snark_work_received_time_sec ${rule_filter}) >= 2 * 180 @@ -159,7 +159,7 @@ groups: severity: warning annotations: summary: "{{ $labels.testnet }} block gossip latency is high" - description: "High block gossip latency (ms) within {{ $labels.testnet }} network." + description: "High block gossip latency of {{ $value }}(ms) within {{ $labels.testnet }} network." - alert: SomewhatOldBestTip expr: min by (testnet) ((time() - 1609459200) - Coda_Transition_frontier_best_tip_slot_time_sec ${rule_filter}) >= 8 * 180 @@ -177,7 +177,7 @@ groups: severity: warning annotations: summary: "{{ $labels.testnet }} has a fork of length at least 8" - description: "Fork of length at least 8 on network {{ $labels.testnet }}." + description: "Fork of length {{ $value }} on network {{ $labels.testnet }}." - alert: NoTransactionsInAtLeastOneBlock expr: max by (testnet) (max_over_time(Coda_Transition_frontier_empty_blocks_at_best_tip ${rule_filter} [${alerting_timeframe}])) > 0 @@ -186,7 +186,7 @@ groups: severity: warning annotations: summary: "{{ $labels.testnet }} has at least 1 block without transactions at the tip" - description: "At least 5 blocks without transactions on tip of network {{ $labels.testnet }} within ${alerting_timeframe}." + description: "{{ $value }} Blocks without transactions on tip of network {{ $labels.testnet }}." - alert: SeedListDegraded expr: min by (testnet) (min_over_time(Coda_watchdog_seeds_reachable ${rule_filter} [${alerting_timeframe}])) <= 0.5 @@ -195,7 +195,7 @@ groups: severity: warning annotations: summary: "{{ $labels.testnet }} seed list is degraded (less than 50% reachable)" - description: "Seed list is degraded on network {{ $labels.testnet }}." + description: "Seed list is degraded at {{ $value }} on network {{ $labels.testnet }}." - alert: FewBlocksPerHour expr: min by (testnet) (increase(Coda_Transition_frontier_max_blocklength_observed ${rule_filter} [${alerting_timeframe}])) < 1 @@ -204,4 +204,4 @@ groups: severity: warning annotations: summary: "{{ $labels.testnet }} block production is critically low (there has been less than 1 block in the last hour)" - description: "Zero blocks have been produced on network {{ $labels.testnet }} in the last hour (according to some node)." + description: "{{ $value }} blocks have been produced on network {{ $labels.testnet }} in the last hour (according to some node)."