Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Only forward severity:critical testnet alerts to PagerDuty and add violating condition values to alert descriptions #8182

Merged
merged 2 commits into from
Mar 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@ route:
- receiver: pagerduty-testnet-primary
match_re:
testnet: ^(${pagerduty_alert_filter})$
severity: critical
- receiver: discord-alert-default
match_re:
testnet: ^(${pagerduty_alert_filter})$
severity: warning
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} cluster nodes have crashed"
description: "Cluster nodes have crashed on network {{ $labels.testnet }}."
description: "{{ $value }} Cluster nodes have crashed on network {{ $labels.testnet }}."

- alert: WatchdogNoNewLogs
expr: max by (testnet) (Coda_watchdog_pods_with_no_new_logs) > 0
Expand All @@ -22,7 +22,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} has pods which have not logged in 10 minutes"
description: "There are no new logs in the last 10 minutes for some pods on network {{ $labels.testnet }}."
description: "There are no new logs in the last 10 minutes for {{ $value }} pods on network {{ $labels.testnet }}."

- alert: SeedListDown
expr: min by (testnet) (min_over_time(Coda_watchdog_seeds_reachable ${rule_filter} [${alerting_timeframe}])) == 0
Expand All @@ -39,8 +39,8 @@ groups:
testnet: "{{ $labels.testnet }}"
severity: critical
annotations:
summary: "{{ $labels.testnet }} has no new blocks posted to the google block storage bucket recently (in the last 30 minutes)"
description: "No new blocks posted to the google storage bucket for {{ $labels.testnet }}."
summary: "{{ $labels.testnet }} has no new blocks posted to the google block storage bucket recently"
description: "{{ $value }} new blocks posted to the google storage bucket for {{ $labels.testnet }}."

- alert: ProverErrors
expr: max by (testnet) (max_over_time(Coda_watchdog_prover_errors_total ${rule_filter} [${alerting_timeframe}])) >= 0
Expand All @@ -49,7 +49,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} has observed a prover error"
description: "Prover error on network {{ $labels.testnet }}."
description: "{{ $value }} Prover errors on network {{ $labels.testnet }}."

- alert: NodesNotSynced
expr: min by (testnet) (min_over_time(Coda_watchdog_nodes_synced ${rule_filter} [${alerting_timeframe}])) <= .5
Expand All @@ -58,7 +58,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} has <= 50% of nodes synced"
description: "<= 50% nodes synced on network {{ $labels.testnet }}."
description: "Nodes sync rate of {{ $value }} is <= 50% on network {{ $labels.testnet }}."

- alert: NodesOutOfSync
expr: min by (testnet) (min_over_time(Coda_watchdog_nodes_synced_near_best_tip ${rule_filter} [${alerting_timeframe}])) < .9
Expand All @@ -67,7 +67,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} has < 90% of nodes that are synced on the same best tip"
description: "< 90% of nodes that are synced are on the same best tip for network {{ $labels.testnet }}."
description: "< 90% of nodes that are synced are on the same best tip for network {{ $labels.testnet }} with rate of {{ $value }}."

- alert: LowPeerCount
expr: min by (testnet) (Coda_Network_peers ${rule_filter}) < 3
Expand All @@ -76,7 +76,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} avg. peer count is critically low"
description: "Critically low peer count on network {{ $labels.testnet }}."
description: "Critically low peer count of {{ $value }} on network {{ $labels.testnet }}."

- alert: LowMinWindowDensity
expr: min by (testnet) (Coda_Transition_frontier_min_window_density ${rule_filter}) < 0.75 * 0.75 * 77
Expand All @@ -85,7 +85,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} min density is low"
description: "Critically low min density on network {{ $labels.testnet }}."
description: "Critically low min density of {{ $value }} on network {{ $labels.testnet }}."

- alert: LowFillRate
expr: min by (testnet) (Coda_Transition_frontier_slot_fill_rate ${rule_filter}) < 0.75 * 0.75
Expand All @@ -94,7 +94,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} avg. peer count is critically low"
description: "Lower fill rate than expected on network {{ $labels.testnet }}."
description: "Lower fill rate of {{ $value }} than expected on network {{ $labels.testnet }}."

- alert: NoTransactionsInSeveralBlocks
expr: max by (testnet) (Coda_Transition_frontier_empty_blocks_at_best_tip ${rule_filter}) >= 5
Expand All @@ -103,7 +103,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} has >= 5 blocks without transactions at the tip"
description: "At least 5 blocks without transactions on tip of network {{ $labels.testnet }}."
description: "{{ $value }} blocks without transactions on tip of network {{ $labels.testnet }}."

- alert: NoCoinbaseInBlocks
expr: min by (testnet) (min_over_time(Coda_Transition_frontier_best_tip_coinbase ${rule_filter} [10m])) < 1
Expand All @@ -112,7 +112,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} has blocks without coinbases"
description: "Blocks without coinbases on tip of network {{ $labels.testnet }}."
description: "{{ $value }} Blocks without coinbases on tip of network {{ $labels.testnet }}."

- alert: LongFork
expr: max by (testnet) (Coda_Transition_frontier_longest_fork ${rule_filter}) >= 16
Expand All @@ -121,7 +121,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }} has a fork of length at least 16"
description: "Fork of length at least 16 on network {{ $labels.testnet }}."
description: "Fork of length {{ $value }} on network {{ $labels.testnet }}."

- alert: OldBestTip
expr: min by (testnet) ((time() - 1609459200) - Coda_Transition_frontier_best_tip_slot_time_sec ${rule_filter}) >= 15 * 180
Expand All @@ -130,7 +130,7 @@ groups:
severity: critical
annotations:
summary: "{{ $labels.testnet }}: all nodes have best tips older than 15 slots"
description: "All nodes have best tips older than 15 slots (45 minutes) on network {{ $labels.testnet }}."
description: "All nodes have best tips older than 15 slots (45 minutes) on network {{ $labels.testnet }}. Best tip: {{ $value }}"

- alert: NoNewSnarks
expr: min by (testnet) ((time() - 1609459200) - Coda_Snark_work_useful_snark_work_received_time_sec ${rule_filter}) >= 2 * 180
Expand Down Expand Up @@ -177,7 +177,7 @@ groups:
severity: warning
annotations:
summary: "{{ $labels.testnet }} block gossip latency is high"
description: "High block gossip latency (ms) within {{ $labels.testnet }} network."
description: "High block gossip latency of {{ $value }}(ms) within {{ $labels.testnet }} network."

- alert: SomewhatOldBestTip
expr: count by (testnet) (((time() - 1609459200) - Coda_Transition_frontier_best_tip_slot_time_sec ${rule_filter}) >= 8 * 180) > 1
Expand All @@ -195,7 +195,7 @@ groups:
severity: warning
annotations:
summary: "{{ $labels.testnet }} has a fork of length at least 8"
description: "Fork of length at least 8 on network {{ $labels.testnet }}."
description: "Fork of length {{ $value }} on network {{ $labels.testnet }}."

- alert: NoTransactionsInAtLeastOneBlock
expr: max by (testnet) (max_over_time(Coda_Transition_frontier_empty_blocks_at_best_tip ${rule_filter} [${alerting_timeframe}])) > 0
Expand All @@ -204,7 +204,7 @@ groups:
severity: warning
annotations:
summary: "{{ $labels.testnet }} has at least 1 block without transactions at the tip"
description: "At least 5 blocks without transactions on tip of network {{ $labels.testnet }} within ${alerting_timeframe}."
description: "{{ $value }} Blocks without transactions on tip of network {{ $labels.testnet }}."

- alert: SeedListDegraded
expr: min by (testnet) (min_over_time(Coda_watchdog_seeds_reachable ${rule_filter} [${alerting_timeframe}])) <= 0.5
Expand All @@ -213,7 +213,7 @@ groups:
severity: warning
annotations:
summary: "{{ $labels.testnet }} seed list is degraded (less than 50% reachable)"
description: "Seed list is degraded on network {{ $labels.testnet }}."
description: "Seed list is degraded at {{ $value }} on network {{ $labels.testnet }}."

- alert: FewBlocksPerHour
expr: min by (testnet) (increase(Coda_Transition_frontier_max_blocklength_observed ${rule_filter} [${alerting_timeframe}])) < 1
Expand All @@ -222,7 +222,7 @@ groups:
severity: warning
annotations:
summary: "{{ $labels.testnet }} block production is critically low (there has been less than 1 block in the last hour)"
description: "Zero blocks have been produced on network {{ $labels.testnet }} in the last hour (according to some node)."
description: "{{ $value }} blocks have been produced on network {{ $labels.testnet }} in the last hour (according to some node)."

- alert: LowPostgresBlockHeightGrowth
expr: min by (testnet) (increase(Coda_Archive_max_block_height ${rule_filter} [${alerting_timeframe}])) < 1
Expand Down