-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'origin/main' into dpe-3258-fix-stanza-f…
…or-cluster-with-same-name Signed-off-by: Marcelo Henrique Neppel <[email protected]>
- Loading branch information
Showing
3 changed files
with
291 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# This file is based on https://samber.github.io/awesome-prometheus-alerts/rules#patroni-1 | ||
|
||
groups: | ||
|
||
- name: PatroniExporter | ||
|
||
rules: | ||
|
||
- alert: PatroniPostgresqlDown | ||
expr: "patroni_postgres_running == 0" | ||
for: 0m | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: Patroni Postgresql Down (instance {{ $labels.instance }}) | ||
description: "Patroni Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.4.1 | ||
- alert: PatroniHasNoLeader | ||
expr: '(max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)' | ||
for: 0m | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: Patroni has no Leader (instance {{ $labels.instance }}) | ||
description: "A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# This file is based on https://samber.github.io/awesome-prometheus-alerts/rules#pgbouncer-1 | ||
|
||
groups: | ||
|
||
- name: PgbouncerExporter | ||
|
||
rules: | ||
|
||
# 2.5.1 | ||
- alert: PgbouncerActiveConnections | ||
expr: 'pgbouncer_pools_server_active_connections > 200' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: PGBouncer active connections (instance {{ $labels.instance }}) | ||
description: "PGBouncer pools are filling up\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.5.2 | ||
# 10 -> 3 | ||
- alert: PgbouncerErrors | ||
expr: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 3' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: PGBouncer errors (instance {{ $labels.instance }}) | ||
description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.5.3 | ||
- alert: PgbouncerMaxConnections | ||
expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[30s]) > 0' | ||
for: 0m | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: PGBouncer max connections (instance {{ $labels.instance }}) | ||
description: "The number of PGBouncer client connections has reached max_client_conn.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,228 @@ | ||
# This file is based on https://samber.github.io/awesome-prometheus-alerts/rules#postgresql-1 | ||
|
||
groups: | ||
- name: PostgresqlExporterK8s | ||
|
||
rules: | ||
# Based on https://samber.github.io/awesome-prometheus-alerts/rules#rule-postgresql-1-1 | ||
- alert: PostgresqlDown | ||
expr: pg_up == 0 | ||
for: 0m | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: Postgresql down (instance {{ $labels.instance }}) | ||
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
# Based on https://samber.github.io/awesome-prometheus-alerts/rules#rule-postgresql-1-2 | ||
- alert: PostgresqlRestarted | ||
expr: time() - pg_postmaster_start_time_seconds < 60 | ||
for: 0m | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: Postgresql restarted (instance {{ $labels.instance }}) | ||
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
- name: PostgresqlExporter | ||
|
||
rules: | ||
|
||
# 2.2.1 | ||
- alert: PostgresqlDown | ||
expr: 'pg_up == 0' | ||
for: 0m | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: Postgresql down (instance {{ $labels.instance }}) | ||
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.2 | ||
# critical -> info | ||
- alert: PostgresqlRestarted | ||
expr: 'time() - pg_postmaster_start_time_seconds < 60' | ||
for: 0m | ||
labels: | ||
severity: info | ||
annotations: | ||
summary: Postgresql restarted (instance {{ $labels.instance }}) | ||
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.3 | ||
- alert: PostgresqlExporterError | ||
expr: 'pg_exporter_last_scrape_error > 0' | ||
for: 0m | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: Postgresql exporter error (instance {{ $labels.instance }}) | ||
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.4 | ||
# 10 days -> 7 days | ||
- alert: PostgresqlTableNotAutoVacuumed | ||
expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 7' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }}) | ||
description: "Table {{ $labels.relname }} has not been auto vacuumed for 7 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.5 | ||
# 10 days -> 7 days | ||
- alert: PostgresqlTableNotAutoAnalyzed | ||
expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 7' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }}) | ||
description: "Table {{ $labels.relname }} has not been auto analyzed for 7 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.6 | ||
- alert: PostgresqlTooManyConnections | ||
expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql too many connections (instance {{ $labels.instance }}) | ||
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.7 | ||
# warning -> info | ||
- alert: PostgresqlNotEnoughConnections | ||
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' | ||
for: 2m | ||
labels: | ||
severity: info | ||
annotations: | ||
summary: Postgresql not enough connections (instance {{ $labels.instance }}) | ||
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.8 | ||
- alert: PostgresqlDeadLocks | ||
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql dead locks (instance {{ $labels.instance }}) | ||
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.9 | ||
- alert: PostgresqlHighRollbackRate | ||
expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql high rollback rate (instance {{ $labels.instance }}) | ||
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.10 | ||
# critical -> info | ||
- alert: PostgresqlCommitRateLow | ||
expr: 'rate(pg_stat_database_xact_commit[1m]) < 10' | ||
for: 2m | ||
labels: | ||
severity: info | ||
annotations: | ||
summary: Postgresql commit rate low (instance {{ $labels.instance }}) | ||
description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.11 | ||
# warning -> info | ||
- alert: PostgresqlLowXidConsumption | ||
expr: 'rate(pg_txid_current[1m]) < 5' | ||
for: 2m | ||
labels: | ||
severity: info | ||
annotations: | ||
summary: Postgresql low XID consumption (instance {{ $labels.instance }}) | ||
description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.12 | ||
- alert: PostgresqlHighRateStatementTimeout | ||
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3' | ||
for: 0m | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }}) | ||
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.13 | ||
# critical -> warning | ||
- alert: PostgresqlHighRateDeadlock | ||
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql high rate deadlock (instance {{ $labels.instance }}) | ||
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.14 | ||
# warning -> info | ||
- alert: PostgresqlUnusedReplicationSlot | ||
expr: 'pg_replication_slots_active == 0' | ||
for: 1m | ||
labels: | ||
severity: info | ||
annotations: | ||
summary: Postgresql unused replication slot (instance {{ $labels.instance }}) | ||
description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.15 | ||
- alert: PostgresqlTooManyDeadTuples | ||
expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql too many dead tuples (instance {{ $labels.instance }}) | ||
description: "The number of PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.16 | ||
- alert: PostgresqlConfigurationChanged | ||
expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' | ||
for: 0m | ||
labels: | ||
severity: info | ||
annotations: | ||
summary: Postgresql configuration changed (instance {{ $labels.instance }}) | ||
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.17 | ||
# critical -> warning | ||
- alert: PostgresqlSslCompressionActive | ||
expr: 'sum(pg_stat_ssl_compression) > 0' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql SSL compression active (instance {{ $labels.instance }}) | ||
description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.18 | ||
# critical -> warning | ||
- alert: PostgresqlTooManyLocksAcquired | ||
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql too many locks acquired (instance {{ $labels.instance }}) | ||
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.19 | ||
- alert: PostgresqlBloatIndexHigh(>80%) | ||
expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)' | ||
for: 1h | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }}) | ||
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.20 | ||
- alert: PostgresqlBloatTableHigh(>80%) | ||
expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)' | ||
for: 1h | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }}) | ||
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# 2.2.21 | ||
# warning -> critical | ||
- alert: PostgresqlInvalidIndex | ||
expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' | ||
for: 6h | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: Postgresql invalid index (instance {{ $labels.instance }}) | ||
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" |