From 30cee0c838d80fe91be4a1c55ac4b7943b7c2531 Mon Sep 17 00:00:00 2001 From: Guido Iaquinti Date: Thu, 13 Oct 2022 16:46:24 +0200 Subject: [PATCH] Prometheus: move default alerts to ConfigMap --- charts/posthog/prometheus/alerts-default.yml | 726 +++++++++++++++++ .../prometheus_default_alerting_rules.yaml | 6 + charts/posthog/values.yaml | 752 +----------------- 3 files changed, 756 insertions(+), 728 deletions(-) create mode 100644 charts/posthog/prometheus/alerts-default.yml create mode 100644 charts/posthog/templates/prometheus_default_alerting_rules.yaml diff --git a/charts/posthog/prometheus/alerts-default.yml b/charts/posthog/prometheus/alerts-default.yml new file mode 100644 index 000000000..2fdeb271d --- /dev/null +++ b/charts/posthog/prometheus/alerts-default.yml @@ -0,0 +1,726 @@ + +# -- Alerts configuration. For more information see: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ +# +# -- NOTE: alerting is an important part of any production system. With this Helm chart we aim to provide a good +# -- collection of default rules that can be used to successfully alert an operator if a PostHog installation is not +# -- working as expected. As those rules will likely evolve over time and as we don't want to cut a new major release +# -- every time it happens, please consider those defaults as UNSTABLE. +# -- Please consider to explicitly override this input in your `values.yaml` if you need to keep it stable. +# +# +# The majority of alerts are inspired by the great collection of rules available at: +# https://github.com/samber/awesome-prometheus-alerts +# +groups: + - name: Kubernetes # via kube-state-metrics + rules: + - alert: KubernetesNodeReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 10m + labels: + severity: critical + annotations: + summary: Kubernetes Node ready (instance {{ $labels.instance }}) + description: "Node {{ $labels.node }} has been unready for a long time" + + - alert: KubernetesMemoryPressure + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes memory pressure (instance {{ $labels.instance }}) + description: "{{ $labels.node }} has MemoryPressure condition" + + - alert: KubernetesDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + for: 10m + labels: + severity: critical + annotations: + summary: Kubernetes disk pressure (instance {{ $labels.instance }}) + description: "{{ $labels.node }} has DiskPressure condition" + + - alert: KubernetesOutOfDisk + expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes out of disk (instance {{ $labels.instance }}) + description: "{{ $labels.node }} has OutOfDisk condition" + + - alert: KubernetesOutOfCapacity + expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes out of capacity (instance {{ $labels.instance }}) + description: "{{ $labels.node }} is out of capacity" + + - alert: KubernetesContainerOomKiller + expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) 
min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1 + for: 0m + labels: + severity: critical + annotations: + summary: Kubernetes container oom killer (instance {{ $labels.instance }}) + description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes." + + - alert: KubernetesJobFailed + expr: kube_job_status_failed > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes Job failed (instance {{ $labels.instance }}) + description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete" + + - alert: KubernetesCronjobSuspended + expr: kube_cronjob_spec_suspend != 0 + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes CronJob suspended (instance {{ $labels.instance }}) + description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended" + + - alert: KubernetesPersistentvolumeclaimPending + expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1 + for: 2m + labels: + severity: warning + annotations: + summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }}) + description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending" + + - alert: KubernetesVolumeOutOfDiskSpace + expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }}) + description: "Volume is almost full (< 10% left)" + + - alert: KubernetesVolumeFullInFourDays + expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0 + for: 0m + labels: + severity: critical + annotations: + summary: Kubernetes Volume full in four days (instance {{ $labels.instance }}) + description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available." 
+ + - alert: KubernetesPersistentvolumeError + expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }}) + description: "Persistent volume is in bad state" + + - alert: KubernetesStatefulsetDown + expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1 + for: 1m + labels: + severity: critical + annotations: + summary: Kubernetes StatefulSet down (instance {{ $labels.instance }}) + description: "A StatefulSet went down" + + - alert: KubernetesHpaScalingAbility + expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }}) + description: "Pod is unable to scale" + + - alert: KubernetesHpaMetricAvailability + expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1 + for: 5m + labels: + severity: critical + annotations: + summary: Kubernetes HPA metric availability (instance {{ $labels.instance }}) + description: "HPA is not able to collect metrics" + + - alert: KubernetesHpaScaleCapability + expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes HPA scale capability (instance {{ $labels.instance }}) + description: "The maximum number of desired Pods has been hit" + + - alert: KubernetesPodNotHealthy + expr: sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0 + for: 15m + labels: + severity: warning + annotations: + summary: Kubernetes Pod not healthy (instance {{ $labels.instance }}) + description: "Pod has been in a non-ready state for longer than 15 minutes." + + - alert: KubernetesPodCrashLooping + expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 + for: 2m + labels: + severity: warning + annotations: + summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) + description: "Pod {{ $labels.pod }} is crash looping" + + - alert: KubernetesReplicassetMismatch + expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }}) + description: > + The number of ready pods in the Deployment's replicaset does + not match the desired number. + + - alert: KubernetesDeploymentReplicasMismatch + expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }}) + description: > + The number of ready pods in the Deployment does not match the + desired number. + + - alert: KubernetesStatefulsetReplicasMismatch + expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }}) + description: > + The number of ready pods in the StatefulSet does not match the + desired number. 
+ + - alert: KubernetesDeploymentGenerationMismatch + expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation + for: 10m + labels: + severity: critical + annotations: + summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }}) + description: "A Deployment has failed but has not been rolled back." + + - alert: KubernetesStatefulsetGenerationMismatch + expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation + for: 10m + labels: + severity: critical + annotations: + summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }}) + description: "A StatefulSet has failed but has not been rolled back." + + - alert: KubernetesStatefulsetUpdateNotRolledOut + expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated) + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }}) + description: "StatefulSet update has not been rolled out." + + - alert: KubernetesDaemonsetRolloutStuck + expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }}) + description: "Some Pods of DaemonSet are not scheduled or not ready" + + - alert: KubernetesDaemonsetMisscheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 5m + labels: + severity: critical + annotations: + summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }}) + description: "Some DaemonSet Pods are running where they are not supposed to run" + + - alert: KubernetesCronjobTooLong + expr: time() - kube_cronjob_next_schedule_time > 3600 + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes CronJob too long (instance {{ $labels.instance }}) + description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete." + + - alert: KubernetesJobSlowCompletion + expr: kube_job_spec_completions - kube_job_status_succeeded > 0 + for: 12h + labels: + severity: critical + annotations: + summary: Kubernetes job slow completion (instance {{ $labels.instance }}) + description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time." 
+ + - alert: KubernetesApiServerErrors + expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes API server errors (instance {{ $labels.instance }}) + description: "Kubernetes API server is experiencing high error rate" + + - alert: KubernetesApiClientErrors + expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes API client errors (instance {{ $labels.instance }}) + description: "Kubernetes API client is experiencing high error rate" + + - alert: KubernetesClientCertificateExpiresNextWeek + expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60 + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }}) + description: "A client certificate used to authenticate to the apiserver is expiring next week." + + - alert: KubernetesClientCertificateExpiresSoon + expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60 + for: 0m + labels: + severity: critical + annotations: + summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }}) + description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours." + + - alert: KubernetesApiServerLatency + expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1 + for: 2m + labels: + severity: warning + annotations: + summary: Kubernetes API server latency (instance {{ $labels.instance }}) + description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}." 
+ + + - name: Loki # via embedded exporter + rules: + - alert: LokiProcessTooManyRestarts + expr: changes(process_start_time_seconds{app="loki"}[15m]) > 2 + for: 0m + labels: + severity: warning + annotations: + summary: Loki process too many restarts (instance {{ $labels.instance }}) + description: "A loki process had too many restarts (target {{ $labels.instance }})" + + - alert: LokiRequestErrors + expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 + for: 15m + labels: + severity: warning + annotations: + summary: Loki request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors" + + - alert: LokiRequestPanic + expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: Loki request panic (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics" + + - alert: LokiRequestLatency + expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 3 + for: 10m + labels: + severity: warning + annotations: + summary: Loki request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency" + + + - name: Promtail # via embedded exporter + rules: + - alert: PromtailRequestErrors + expr: 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 + for: 5m + labels: + severity: critical + annotations: + summary: Promtail request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors." + + - alert: PromtailRequestLatency + expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1 + for: 5m + labels: + severity: critical + annotations: + summary: Promtail request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." + + + - name: Prometheus # via embedded exporter + rules: + - alert: PrometheusJobMissing + expr: absent(up{job="prometheus"}) + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared" + + - alert: PrometheusTargetMissing + expr: up == 0 + for: 5m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. An exporter might be crashed." + + - alert: PrometheusAllTargetsMissing + expr: sum by (job) (up) == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus all targets missing (instance {{ $labels.instance }}) + description: "A Prometheus job does not have living target anymore." 
+ + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error" + + - alert: PrometheusTooManyRestarts + expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus too many restarts (instance {{ $labels.instance }}) + description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping." + + - alert: PrometheusAlertmanagerJobMissing + expr: absent(up{job="kubernetes-pods", app="prometheus", component="alertmanager"}) + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared" + + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error" + + - alert: PrometheusAlertmanagerConfigNotSynced + expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) + description: "Configurations of AlertManager cluster instances are out of sync" + + - alert: PrometheusAlertmanagerE2eDeadManSwitch + expr: vector(1) + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) + description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager." + + - alert: PrometheusNotConnectedToAlertmanager + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager" + + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts." + + - alert: PrometheusTemplateTextExpansionFailures + expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures" + + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query." 
+ + - alert: PrometheusNotificationsBacklog + expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: rate(alertmanager_notifications_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications" + + - alert: PrometheusTargetEmpty + expr: prometheus_sd_discovered_targets == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target empty (instance {{ $labels.instance }}) + description: "Prometheus has no target in service discovery" + + - alert: PrometheusTargetScrapingSlow + expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned." + + - alert: PrometheusLargeScrape + expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit" + + - alert: PrometheusTargetScrapeDuplicate + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: "Prometheus has many samples rejected due to duplicate timestamps but different values" + + - alert: PrometheusTsdbCheckpointCreationFailures + expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint creation failures" + + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint deletion failures" + + - alert: PrometheusTsdbCompactionsFailed + expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB compactions failures" + + - alert: PrometheusTsdbHeadTruncationsFailed + expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB head truncation failures" + + - alert: PrometheusTsdbReloadFailures + expr: 
increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB reload failures" + + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions" + + - alert: PrometheusTsdbWalTruncationsFailed + expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures" + + - name: Redis # via prometheus-redis-exporter + rules: + - alert: RedisDown + expr: redis_up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Redis down (instance {{ $labels.instance }}) + description: "Redis instance is down" + + - alert: RedisMissingMaster + expr: (count(redis_instance_info{role="master"}) or vector(0)) < 1 + for: 0m + labels: + severity: critical + annotations: + summary: Redis missing master (instance {{ $labels.instance }}) + description: "Redis cluster has no node marked as master." + + - alert: RedisTooManyMasters + expr: count(redis_instance_info{role="master"}) > 1 + for: 0m + labels: + severity: critical + annotations: + summary: Redis too many masters (instance {{ $labels.instance }}) + description: "Redis cluster has too many nodes marked as master." + + - alert: RedisDisconnectedSlaves + expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1 + for: 0m + labels: + severity: critical + annotations: + summary: Redis disconnected slaves (instance {{ $labels.instance }}) + description: "Redis not replicating for all slaves. Consider reviewing the redis replication status." + + - alert: RedisReplicationBroken + expr: delta(redis_connected_slaves[1m]) < 0 + for: 0m + labels: + severity: critical + annotations: + summary: Redis replication broken (instance {{ $labels.instance }}) + description: "Redis instance lost a slave" + + - alert: RedisClusterFlapping + expr: changes(redis_connected_slaves[1m]) > 1 + for: 2m + labels: + severity: critical + annotations: + summary: Redis cluster flapping (instance {{ $labels.instance }}) + description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping)." + + - alert: RedisMissingBackup + expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24 + for: 0m + labels: + severity: critical + annotations: + summary: Redis missing backup (instance {{ $labels.instance }}) + description: "Redis has not been backuped for 24 hours" + + # The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. 
+ - alert: RedisOutOfSystemMemory + expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 + for: 2m + labels: + severity: warning + annotations: + summary: Redis out of system memory (instance {{ $labels.instance }}) + description: "Redis is running out of system memory (> 90%)" + + - alert: RedisOutOfConfiguredMaxmemory + expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 + for: 2m + labels: + severity: warning + annotations: + summary: Redis out of configured maxmemory (instance {{ $labels.instance }}) + description: "Redis is running out of configured maxmemory (> 90%)" + + - alert: RedisTooManyConnections + expr: redis_connected_clients > 100 + for: 2m + labels: + severity: warning + annotations: + summary: Redis too many connections (instance {{ $labels.instance }}) + description: "Redis instance has too many connections" + + - alert: RedisNotEnoughConnections + expr: redis_connected_clients < 5 + for: 2m + labels: + severity: warning + annotations: + summary: Redis not enough connections (instance {{ $labels.instance }}) + description: "Redis instance should have more connections (> 5)" + + - alert: RedisRejectedConnections + expr: increase(redis_rejected_connections_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Redis rejected connections (instance {{ $labels.instance }}) + description: "Some connections to Redis has been rejected" diff --git a/charts/posthog/templates/prometheus_default_alerting_rules.yaml b/charts/posthog/templates/prometheus_default_alerting_rules.yaml new file mode 100644 index 000000000..d7f72fd97 --- /dev/null +++ b/charts/posthog/templates/prometheus_default_alerting_rules.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: posthog-prometheus-alerts-default +data: +{{ (.Files.Glob "prometheus/alerts-default.yml").AsConfig | indent 2 }} diff --git a/charts/posthog/values.yaml b/charts/posthog/values.yaml index 03fcf9d64..897a1327a 100644 --- a/charts/posthog/values.yaml +++ b/charts/posthog/values.yaml @@ -1749,734 +1749,30 @@ prometheus: enabled: false serverFiles: - # -- Alerts configuration. For more information see: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ - # - # -- NOTE: alerting is an important part of any production system. With this Helm chart we aim to provide a good - # -- collection of default rules that can be used to successfully alert an operator if a PostHog installation is not - # -- working as expected. As those rules will likely evolve over time and as we don't want to cut a new major release - # -- every time it happens, please consider the `prometheus.serverFiles.alerting_rules.yml` defaults as UNSTABLE. - # -- Please consider to explicitly override this input in your `values.yaml` if you need to keep it stable. 
- # - alerting_rules.yml: - # - # The majority of alerts are inspired by the great collection of rules available at: - # https://github.com/samber/awesome-prometheus-alerts - # - groups: - - name: Kubernetes # via kube-state-metrics - rules: - - alert: KubernetesNodeReady - expr: kube_node_status_condition{condition="Ready",status="true"} == 0 - for: 10m - labels: - severity: critical - annotations: - summary: Kubernetes Node ready (instance {{ $labels.instance }}) - description: "Node {{ $labels.node }} has been unready for a long time" - - - alert: KubernetesMemoryPressure - expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes memory pressure (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has MemoryPressure condition" - - - alert: KubernetesDiskPressure - expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 - for: 10m - labels: - severity: critical - annotations: - summary: Kubernetes disk pressure (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has DiskPressure condition" - - - alert: KubernetesOutOfDisk - expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes out of disk (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has OutOfDisk condition" - - - alert: KubernetesOutOfCapacity - expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes out of capacity (instance {{ $labels.instance }}) - description: "{{ $labels.node }} is out of capacity" - - - alert: KubernetesContainerOomKiller - expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1 - for: 0m - labels: - severity: critical - annotations: - summary: Kubernetes container oom killer (instance {{ $labels.instance }}) - description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes." 
- - - alert: KubernetesJobFailed - expr: kube_job_status_failed > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes Job failed (instance {{ $labels.instance }}) - description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete" - - - alert: KubernetesCronjobSuspended - expr: kube_cronjob_spec_suspend != 0 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes CronJob suspended (instance {{ $labels.instance }}) - description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended" - - - alert: KubernetesPersistentvolumeclaimPending - expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1 - for: 2m - labels: - severity: warning - annotations: - summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }}) - description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending" - - - alert: KubernetesVolumeOutOfDiskSpace - expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }}) - description: "Volume is almost full (< 10% left)" - - - alert: KubernetesVolumeFullInFourDays - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0 - for: 0m - labels: - severity: critical - annotations: - summary: Kubernetes Volume full in four days (instance {{ $labels.instance }}) - description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available." - - - alert: KubernetesPersistentvolumeError - expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }}) - description: "Persistent volume is in bad state" - - - alert: KubernetesStatefulsetDown - expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1 - for: 1m - labels: - severity: critical - annotations: - summary: Kubernetes StatefulSet down (instance {{ $labels.instance }}) - description: "A StatefulSet went down" - - - alert: KubernetesHpaScalingAbility - expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }}) - description: "Pod is unable to scale" - - - alert: KubernetesHpaMetricAvailability - expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1 - for: 5m - labels: - severity: critical - annotations: - summary: Kubernetes HPA metric availability (instance {{ $labels.instance }}) - description: "HPA is not able to collect metrics" - - - alert: KubernetesHpaScaleCapability - expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes HPA scale capability (instance {{ $labels.instance }}) - description: "The maximum number of desired Pods has been hit" - - - alert: KubernetesPodNotHealthy - expr: sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0 - for: 15m - labels: - severity: warning - annotations: - summary: Kubernetes Pod not 
healthy (instance {{ $labels.instance }}) - description: "Pod has been in a non-ready state for longer than 15 minutes." - - - alert: KubernetesPodCrashLooping - expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 - for: 2m - labels: - severity: warning - annotations: - summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) - description: "Pod {{ $labels.pod }} is crash looping" - - - alert: KubernetesReplicassetMismatch - expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }}) - description: > - The number of ready pods in the Deployment's replicaset does - not match the desired number. - - - alert: KubernetesDeploymentReplicasMismatch - expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }}) - description: > - The number of ready pods in the Deployment does not match the - desired number. - - - alert: KubernetesStatefulsetReplicasMismatch - expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }}) - description: > - The number of ready pods in the StatefulSet does not match the - desired number. - - - alert: KubernetesDeploymentGenerationMismatch - expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation - for: 10m - labels: - severity: critical - annotations: - summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }}) - description: "A Deployment has failed but has not been rolled back." - - - alert: KubernetesStatefulsetGenerationMismatch - expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation - for: 10m - labels: - severity: critical - annotations: - summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }}) - description: "A StatefulSet has failed but has not been rolled back." - - - alert: KubernetesStatefulsetUpdateNotRolledOut - expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated) - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }}) - description: "StatefulSet update has not been rolled out." 
- - - alert: KubernetesDaemonsetRolloutStuck - expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }}) - description: "Some Pods of DaemonSet are not scheduled or not ready" - - - alert: KubernetesDaemonsetMisscheduled - expr: kube_daemonset_status_number_misscheduled > 0 - for: 5m - labels: - severity: critical - annotations: - summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }}) - description: "Some DaemonSet Pods are running where they are not supposed to run" - - - alert: KubernetesCronjobTooLong - expr: time() - kube_cronjob_next_schedule_time > 3600 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes CronJob too long (instance {{ $labels.instance }}) - description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete." - - - alert: KubernetesJobSlowCompletion - expr: kube_job_spec_completions - kube_job_status_succeeded > 0 - for: 12h - labels: - severity: critical - annotations: - summary: Kubernetes job slow completion (instance {{ $labels.instance }}) - description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time." - - - alert: KubernetesApiServerErrors - expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes API server errors (instance {{ $labels.instance }}) - description: "Kubernetes API server is experiencing high error rate" - - - alert: KubernetesApiClientErrors - expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes API client errors (instance {{ $labels.instance }}) - description: "Kubernetes API client is experiencing high error rate" - - - alert: KubernetesClientCertificateExpiresNextWeek - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }}) - description: "A client certificate used to authenticate to the apiserver is expiring next week." - - - alert: KubernetesClientCertificateExpiresSoon - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60 - for: 0m - labels: - severity: critical - annotations: - summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }}) - description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours." 
- - - alert: KubernetesApiServerLatency - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1 - for: 2m - labels: - severity: warning - annotations: - summary: Kubernetes API server latency (instance {{ $labels.instance }}) - description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}." - - - - name: Loki # via embedded exporter - rules: - - alert: LokiProcessTooManyRestarts - expr: changes(process_start_time_seconds{app="loki"}[15m]) > 2 - for: 0m - labels: - severity: warning - annotations: - summary: Loki process too many restarts (instance {{ $labels.instance }}) - description: "A loki process had too many restarts (target {{ $labels.instance }})" - - - alert: LokiRequestErrors - expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 - for: 15m - labels: - severity: warning - annotations: - summary: Loki request errors (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors" - - - alert: LokiRequestPanic - expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: Loki request panic (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics" - - - alert: LokiRequestLatency - expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 3 - for: 10m - labels: - severity: warning - annotations: - summary: Loki request latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency" - - - - name: Promtail # via embedded exporter - rules: - - alert: PromtailRequestErrors - expr: 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 - for: 5m - labels: - severity: critical - annotations: - summary: Promtail request errors (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors." - - - alert: PromtailRequestLatency - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1 - for: 5m - labels: - severity: critical - annotations: - summary: Promtail request latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." - - - - name: Prometheus # via embedded exporter - rules: - - alert: PrometheusJobMissing - expr: absent(up{job="prometheus"}) - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus job missing (instance {{ $labels.instance }}) - description: "A Prometheus job has disappeared" - - - alert: PrometheusTargetMissing - expr: up == 0 - for: 5m - labels: - severity: critical - annotations: - summary: Prometheus target missing (instance {{ $labels.instance }}) - description: "A Prometheus target has disappeared. An exporter might be crashed." 
- - - alert: PrometheusAllTargetsMissing - expr: sum by (job) (up) == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus all targets missing (instance {{ $labels.instance }}) - description: "A Prometheus job does not have living target anymore." - - - alert: PrometheusConfigurationReloadFailure - expr: prometheus_config_last_reload_successful != 1 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) - description: "Prometheus configuration reload error" - - - alert: PrometheusTooManyRestarts - expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus too many restarts (instance {{ $labels.instance }}) - description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping." - - - alert: PrometheusAlertmanagerJobMissing - expr: absent(up{job="kubernetes-pods", app="prometheus", component="alertmanager"}) - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) - description: "A Prometheus AlertManager job has disappeared" - - - alert: PrometheusAlertmanagerConfigurationReloadFailure - expr: alertmanager_config_last_reload_successful != 1 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) - description: "AlertManager configuration reload error" - - - alert: PrometheusAlertmanagerConfigNotSynced - expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) - description: "Configurations of AlertManager cluster instances are out of sync" - - - alert: PrometheusAlertmanagerE2eDeadManSwitch - expr: vector(1) - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) - description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager." - - - alert: PrometheusNotConnectedToAlertmanager - expr: prometheus_notifications_alertmanagers_discovered < 1 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) - description: "Prometheus cannot connect the alertmanager" - - - alert: PrometheusRuleEvaluationFailures - expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts." 
- - - alert: PrometheusTemplateTextExpansionFailures - expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} template text expansion failures" - - - alert: PrometheusRuleEvaluationSlow - expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) - description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query." - - - alert: PrometheusNotificationsBacklog - expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus notifications backlog (instance {{ $labels.instance }}) - description: "The Prometheus notification queue has not been empty for 10 minutes" - - - alert: PrometheusAlertmanagerNotificationFailing - expr: rate(alertmanager_notifications_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) - description: "Alertmanager is failing sending notifications" - - - alert: PrometheusTargetEmpty - expr: prometheus_sd_discovered_targets == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus target empty (instance {{ $labels.instance }}) - description: "Prometheus has no target in service discovery" - - - alert: PrometheusTargetScrapingSlow - expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus target scraping slow (instance {{ $labels.instance }}) - description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned." 
- - - alert: PrometheusLargeScrape - expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus large scrape (instance {{ $labels.instance }}) - description: "Prometheus has many scrapes that exceed the sample limit" - - - alert: PrometheusTargetScrapeDuplicate - expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) - description: "Prometheus has many samples rejected due to duplicate timestamps but different values" - - - alert: PrometheusTsdbCheckpointCreationFailures - expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} checkpoint creation failures" - - - alert: PrometheusTsdbCheckpointDeletionFailures - expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} checkpoint deletion failures" - - - alert: PrometheusTsdbCompactionsFailed - expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB compactions failures" - - - alert: PrometheusTsdbHeadTruncationsFailed - expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB head truncation failures" - - - alert: PrometheusTsdbReloadFailures - expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB reload failures" - - - alert: PrometheusTsdbWalCorruptions - expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB WAL corruptions" - - - alert: PrometheusTsdbWalTruncationsFailed - expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures" - - - name: Redis # via prometheus-redis-exporter - rules: - - alert: RedisDown - expr: redis_up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Redis down (instance {{ $labels.instance }}) - description: "Redis instance is down" - - - alert: RedisMissingMaster - expr: (count(redis_instance_info{role="master"}) or vector(0)) < 1 - for: 0m - labels: - severity: critical - annotations: - summary: Redis missing master (instance {{ $labels.instance }}) - description: "Redis cluster has no 
node marked as master." - - - alert: RedisTooManyMasters - expr: count(redis_instance_info{role="master"}) > 1 - for: 0m - labels: - severity: critical - annotations: - summary: Redis too many masters (instance {{ $labels.instance }}) - description: "Redis cluster has too many nodes marked as master." - - - alert: RedisDisconnectedSlaves - expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1 - for: 0m - labels: - severity: critical - annotations: - summary: Redis disconnected slaves (instance {{ $labels.instance }}) - description: "Redis not replicating for all slaves. Consider reviewing the redis replication status." - - - alert: RedisReplicationBroken - expr: delta(redis_connected_slaves[1m]) < 0 - for: 0m - labels: - severity: critical - annotations: - summary: Redis replication broken (instance {{ $labels.instance }}) - description: "Redis instance lost a slave" - - - alert: RedisClusterFlapping - expr: changes(redis_connected_slaves[1m]) > 1 - for: 2m - labels: - severity: critical - annotations: - summary: Redis cluster flapping (instance {{ $labels.instance }}) - description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping)." - - - alert: RedisMissingBackup - expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24 - for: 0m - labels: - severity: critical - annotations: - summary: Redis missing backup (instance {{ $labels.instance }}) - description: "Redis has not been backuped for 24 hours" - - # The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. - - alert: RedisOutOfSystemMemory - expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 - for: 2m - labels: - severity: warning - annotations: - summary: Redis out of system memory (instance {{ $labels.instance }}) - description: "Redis is running out of system memory (> 90%)" - - - alert: RedisOutOfConfiguredMaxmemory - expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 - for: 2m - labels: - severity: warning - annotations: - summary: Redis out of configured maxmemory (instance {{ $labels.instance }}) - description: "Redis is running out of configured maxmemory (> 90%)" - - - alert: RedisTooManyConnections - expr: redis_connected_clients > 100 - for: 2m - labels: - severity: warning - annotations: - summary: Redis too many connections (instance {{ $labels.instance }}) - description: "Redis instance has too many connections" - - - alert: RedisNotEnoughConnections - expr: redis_connected_clients < 5 - for: 2m - labels: - severity: warning - annotations: - summary: Redis not enough connections (instance {{ $labels.instance }}) - description: "Redis instance should have more connections (> 5)" - - - alert: RedisRejectedConnections - expr: increase(redis_rejected_connections_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Redis rejected connections (instance {{ $labels.instance }}) - description: "Some connections to Redis has been rejected" - - + prometheus.yml: + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + - /etc/posthog/*.yml + + server: + extraConfigmapMounts: + - name: posthog-prometheus-alerts-default + mountPath: /etc/posthog/ + configMap: posthog-prometheus-alerts-default + readOnly: true + + configmapReload: + 
extraConfigmapMounts: + - name: posthog-prometheus-alerts-default + mountPath: /etc/posthog/ + configMap: posthog-prometheus-alerts-default + readOnly: true + + extraVolumeDirs: + - /etc/posthog/ ### ### ### ---- prometheus-statsd-exporter ----
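A quick local sanity check of this refactor is sketched below. It is not part of the patch itself; it assumes you are at the repository root, that `promtool` and `helm` are installed, and that chart dependencies have already been fetched (e.g. via `helm dependency update charts/posthog`). The mechanism being checked is the one the patch introduces: the default rules now live in a standalone file, shipped through the `posthog-prometheus-alerts-default` ConfigMap, mounted at /etc/posthog/ and picked up by the new `/etc/posthog/*.yml` entry in rule_files, so the file can be linted directly:

    # Lint the extracted default alerting rules with Prometheus' rule checker
    promtool check rules charts/posthog/prometheus/alerts-default.yml

    # Render only the new ConfigMap template to confirm .Files.Glob picks up the file
    helm template posthog charts/posthog --show-only templates/prometheus_default_alerting_rules.yaml

If both commands succeed, the rules parse cleanly and the ConfigMap renders with the file embedded under its data key, which is the behaviour this patch relies on.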