From 30cee0c838d80fe91be4a1c55ac4b7943b7c2531 Mon Sep 17 00:00:00 2001 From: Guido Iaquinti Date: Thu, 13 Oct 2022 16:46:24 +0200 Subject: [PATCH] Prometheus: move default alerts to ConfigMap --- charts/posthog/prometheus/alerts-default.yml | 726 +++++++++++++++++ .../prometheus_default_alerting_rules.yaml | 6 + charts/posthog/values.yaml | 752 +----------------- 3 files changed, 756 insertions(+), 728 deletions(-) create mode 100644 charts/posthog/prometheus/alerts-default.yml create mode 100644 charts/posthog/templates/prometheus_default_alerting_rules.yaml diff --git a/charts/posthog/prometheus/alerts-default.yml b/charts/posthog/prometheus/alerts-default.yml new file mode 100644 index 000000000..2fdeb271d --- /dev/null +++ b/charts/posthog/prometheus/alerts-default.yml @@ -0,0 +1,726 @@ + +# -- Alerts configuration. For more information see: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ +# +# -- NOTE: alerting is an important part of any production system. With this Helm chart we aim to provide a good +# -- collection of default rules that can be used to successfully alert an operator if a PostHog installation is not +# -- working as expected. As those rules will likely evolve over time and as we don't want to cut a new major release +# -- every time it happens, please consider those defaults as UNSTABLE. +# -- Please consider to explicitly override this input in your `values.yaml` if you need to keep it stable. +# +# +# The majority of alerts are inspired by the great collection of rules available at: +# https://github.com/samber/awesome-prometheus-alerts +# +groups: + - name: Kubernetes # via kube-state-metrics + rules: + - alert: KubernetesNodeReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 10m + labels: + severity: critical + annotations: + summary: Kubernetes Node ready (instance {{ $labels.instance }}) + description: "Node {{ $labels.node }} has been unready for a long time" + + - alert: KubernetesMemoryPressure + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes memory pressure (instance {{ $labels.instance }}) + description: "{{ $labels.node }} has MemoryPressure condition" + + - alert: KubernetesDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + for: 10m + labels: + severity: critical + annotations: + summary: Kubernetes disk pressure (instance {{ $labels.instance }}) + description: "{{ $labels.node }} has DiskPressure condition" + + - alert: KubernetesOutOfDisk + expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes out of disk (instance {{ $labels.instance }}) + description: "{{ $labels.node }} has OutOfDisk condition" + + - alert: KubernetesOutOfCapacity + expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes out of capacity (instance {{ $labels.instance }}) + description: "{{ $labels.node }} is out of capacity" + + - alert: KubernetesContainerOomKiller + expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) 
min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1 + for: 0m + labels: + severity: critical + annotations: + summary: Kubernetes container oom killer (instance {{ $labels.instance }}) + description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes." + + - alert: KubernetesJobFailed + expr: kube_job_status_failed > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes Job failed (instance {{ $labels.instance }}) + description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete" + + - alert: KubernetesCronjobSuspended + expr: kube_cronjob_spec_suspend != 0 + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes CronJob suspended (instance {{ $labels.instance }}) + description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended" + + - alert: KubernetesPersistentvolumeclaimPending + expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1 + for: 2m + labels: + severity: warning + annotations: + summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }}) + description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending" + + - alert: KubernetesVolumeOutOfDiskSpace + expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }}) + description: "Volume is almost full (< 10% left)" + + - alert: KubernetesVolumeFullInFourDays + expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0 + for: 0m + labels: + severity: critical + annotations: + summary: Kubernetes Volume full in four days (instance {{ $labels.instance }}) + description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available." 
+ + - alert: KubernetesPersistentvolumeError + expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }}) + description: "Persistent volume is in bad state" + + - alert: KubernetesStatefulsetDown + expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1 + for: 1m + labels: + severity: critical + annotations: + summary: Kubernetes StatefulSet down (instance {{ $labels.instance }}) + description: "A StatefulSet went down" + + - alert: KubernetesHpaScalingAbility + expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }}) + description: "Pod is unable to scale" + + - alert: KubernetesHpaMetricAvailability + expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1 + for: 5m + labels: + severity: critical + annotations: + summary: Kubernetes HPA metric availability (instance {{ $labels.instance }}) + description: "HPA is not able to collect metrics" + + - alert: KubernetesHpaScaleCapability + expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes HPA scale capability (instance {{ $labels.instance }}) + description: "The maximum number of desired Pods has been hit" + + - alert: KubernetesPodNotHealthy + expr: sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0 + for: 15m + labels: + severity: warning + annotations: + summary: Kubernetes Pod not healthy (instance {{ $labels.instance }}) + description: "Pod has been in a non-ready state for longer than 15 minutes." + + - alert: KubernetesPodCrashLooping + expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 + for: 2m + labels: + severity: warning + annotations: + summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) + description: "Pod {{ $labels.pod }} is crash looping" + + - alert: KubernetesReplicassetMismatch + expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }}) + description: > + The number of ready pods in the Deployment's replicaset does + not match the desired number. + + - alert: KubernetesDeploymentReplicasMismatch + expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }}) + description: > + The number of ready pods in the Deployment does not match the + desired number. + + - alert: KubernetesStatefulsetReplicasMismatch + expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }}) + description: > + The number of ready pods in the StatefulSet does not match the + desired number. 
+ + - alert: KubernetesDeploymentGenerationMismatch + expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation + for: 10m + labels: + severity: critical + annotations: + summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }}) + description: "A Deployment has failed but has not been rolled back." + + - alert: KubernetesStatefulsetGenerationMismatch + expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation + for: 10m + labels: + severity: critical + annotations: + summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }}) + description: "A StatefulSet has failed but has not been rolled back." + + - alert: KubernetesStatefulsetUpdateNotRolledOut + expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated) + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }}) + description: "StatefulSet update has not been rolled out." + + - alert: KubernetesDaemonsetRolloutStuck + expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 + for: 10m + labels: + severity: warning + annotations: + summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }}) + description: "Some Pods of DaemonSet are not scheduled or not ready" + + - alert: KubernetesDaemonsetMisscheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 5m + labels: + severity: critical + annotations: + summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }}) + description: "Some DaemonSet Pods are running where they are not supposed to run" + + - alert: KubernetesCronjobTooLong + expr: time() - kube_cronjob_next_schedule_time > 3600 + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes CronJob too long (instance {{ $labels.instance }}) + description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete." + + - alert: KubernetesJobSlowCompletion + expr: kube_job_spec_completions - kube_job_status_succeeded > 0 + for: 12h + labels: + severity: critical + annotations: + summary: Kubernetes job slow completion (instance {{ $labels.instance }}) + description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time." 
+ + - alert: KubernetesApiServerErrors + expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes API server errors (instance {{ $labels.instance }}) + description: "Kubernetes API server is experiencing high error rate" + + - alert: KubernetesApiClientErrors + expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 + for: 2m + labels: + severity: critical + annotations: + summary: Kubernetes API client errors (instance {{ $labels.instance }}) + description: "Kubernetes API client is experiencing high error rate" + + - alert: KubernetesClientCertificateExpiresNextWeek + expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60 + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }}) + description: "A client certificate used to authenticate to the apiserver is expiring next week." + + - alert: KubernetesClientCertificateExpiresSoon + expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60 + for: 0m + labels: + severity: critical + annotations: + summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }}) + description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours." + + - alert: KubernetesApiServerLatency + expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1 + for: 2m + labels: + severity: warning + annotations: + summary: Kubernetes API server latency (instance {{ $labels.instance }}) + description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}." 
+ + + - name: Loki # via embedded exporter + rules: + - alert: LokiProcessTooManyRestarts + expr: changes(process_start_time_seconds{app="loki"}[15m]) > 2 + for: 0m + labels: + severity: warning + annotations: + summary: Loki process too many restarts (instance {{ $labels.instance }}) + description: "A loki process had too many restarts (target {{ $labels.instance }})" + + - alert: LokiRequestErrors + expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 + for: 15m + labels: + severity: warning + annotations: + summary: Loki request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors" + + - alert: LokiRequestPanic + expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: Loki request panic (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics" + + - alert: LokiRequestLatency + expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 3 + for: 10m + labels: + severity: warning + annotations: + summary: Loki request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency" + + + - name: Promtail # via embedded exporter + rules: + - alert: PromtailRequestErrors + expr: 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 + for: 5m + labels: + severity: critical + annotations: + summary: Promtail request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors." + + - alert: PromtailRequestLatency + expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1 + for: 5m + labels: + severity: critical + annotations: + summary: Promtail request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." + + + - name: Prometheus # via embedded exporter + rules: + - alert: PrometheusJobMissing + expr: absent(up{job="prometheus"}) + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared" + + - alert: PrometheusTargetMissing + expr: up == 0 + for: 5m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. An exporter might be crashed." + + - alert: PrometheusAllTargetsMissing + expr: sum by (job) (up) == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus all targets missing (instance {{ $labels.instance }}) + description: "A Prometheus job does not have living target anymore." 
+ + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error" + + - alert: PrometheusTooManyRestarts + expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus too many restarts (instance {{ $labels.instance }}) + description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping." + + - alert: PrometheusAlertmanagerJobMissing + expr: absent(up{job="kubernetes-pods", app="prometheus", component="alertmanager"}) + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared" + + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error" + + - alert: PrometheusAlertmanagerConfigNotSynced + expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) + description: "Configurations of AlertManager cluster instances are out of sync" + + - alert: PrometheusAlertmanagerE2eDeadManSwitch + expr: vector(1) + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) + description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager." + + - alert: PrometheusNotConnectedToAlertmanager + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager" + + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts." + + - alert: PrometheusTemplateTextExpansionFailures + expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures" + + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query." 
+ + - alert: PrometheusNotificationsBacklog + expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: rate(alertmanager_notifications_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications" + + - alert: PrometheusTargetEmpty + expr: prometheus_sd_discovered_targets == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target empty (instance {{ $labels.instance }}) + description: "Prometheus has no target in service discovery" + + - alert: PrometheusTargetScrapingSlow + expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned." + + - alert: PrometheusLargeScrape + expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit" + + - alert: PrometheusTargetScrapeDuplicate + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: "Prometheus has many samples rejected due to duplicate timestamps but different values" + + - alert: PrometheusTsdbCheckpointCreationFailures + expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint creation failures" + + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint deletion failures" + + - alert: PrometheusTsdbCompactionsFailed + expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB compactions failures" + + - alert: PrometheusTsdbHeadTruncationsFailed + expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB head truncation failures" + + - alert: PrometheusTsdbReloadFailures + expr: 
increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB reload failures" + + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions" + + - alert: PrometheusTsdbWalTruncationsFailed + expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures" + + - name: Redis # via prometheus-redis-exporter + rules: + - alert: RedisDown + expr: redis_up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Redis down (instance {{ $labels.instance }}) + description: "Redis instance is down" + + - alert: RedisMissingMaster + expr: (count(redis_instance_info{role="master"}) or vector(0)) < 1 + for: 0m + labels: + severity: critical + annotations: + summary: Redis missing master (instance {{ $labels.instance }}) + description: "Redis cluster has no node marked as master." + + - alert: RedisTooManyMasters + expr: count(redis_instance_info{role="master"}) > 1 + for: 0m + labels: + severity: critical + annotations: + summary: Redis too many masters (instance {{ $labels.instance }}) + description: "Redis cluster has too many nodes marked as master." + + - alert: RedisDisconnectedSlaves + expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1 + for: 0m + labels: + severity: critical + annotations: + summary: Redis disconnected slaves (instance {{ $labels.instance }}) + description: "Redis not replicating for all slaves. Consider reviewing the redis replication status." + + - alert: RedisReplicationBroken + expr: delta(redis_connected_slaves[1m]) < 0 + for: 0m + labels: + severity: critical + annotations: + summary: Redis replication broken (instance {{ $labels.instance }}) + description: "Redis instance lost a slave" + + - alert: RedisClusterFlapping + expr: changes(redis_connected_slaves[1m]) > 1 + for: 2m + labels: + severity: critical + annotations: + summary: Redis cluster flapping (instance {{ $labels.instance }}) + description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping)." + + - alert: RedisMissingBackup + expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24 + for: 0m + labels: + severity: critical + annotations: + summary: Redis missing backup (instance {{ $labels.instance }}) + description: "Redis has not been backuped for 24 hours" + + # The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. 
+ - alert: RedisOutOfSystemMemory + expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 + for: 2m + labels: + severity: warning + annotations: + summary: Redis out of system memory (instance {{ $labels.instance }}) + description: "Redis is running out of system memory (> 90%)" + + - alert: RedisOutOfConfiguredMaxmemory + expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 + for: 2m + labels: + severity: warning + annotations: + summary: Redis out of configured maxmemory (instance {{ $labels.instance }}) + description: "Redis is running out of configured maxmemory (> 90%)" + + - alert: RedisTooManyConnections + expr: redis_connected_clients > 100 + for: 2m + labels: + severity: warning + annotations: + summary: Redis too many connections (instance {{ $labels.instance }}) + description: "Redis instance has too many connections" + + - alert: RedisNotEnoughConnections + expr: redis_connected_clients < 5 + for: 2m + labels: + severity: warning + annotations: + summary: Redis not enough connections (instance {{ $labels.instance }}) + description: "Redis instance should have more connections (> 5)" + + - alert: RedisRejectedConnections + expr: increase(redis_rejected_connections_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Redis rejected connections (instance {{ $labels.instance }}) + description: "Some connections to Redis has been rejected" diff --git a/charts/posthog/templates/prometheus_default_alerting_rules.yaml b/charts/posthog/templates/prometheus_default_alerting_rules.yaml new file mode 100644 index 000000000..d7f72fd97 --- /dev/null +++ b/charts/posthog/templates/prometheus_default_alerting_rules.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: posthog-prometheus-alerts-default +data: +{{ (.Files.Glob "prometheus/alerts-default.yml").AsConfig | indent 2 }} diff --git a/charts/posthog/values.yaml b/charts/posthog/values.yaml index 03fcf9d64..897a1327a 100644 --- a/charts/posthog/values.yaml +++ b/charts/posthog/values.yaml @@ -1749,734 +1749,30 @@ prometheus: enabled: false serverFiles: - # -- Alerts configuration. For more information see: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ - # - # -- NOTE: alerting is an important part of any production system. With this Helm chart we aim to provide a good - # -- collection of default rules that can be used to successfully alert an operator if a PostHog installation is not - # -- working as expected. As those rules will likely evolve over time and as we don't want to cut a new major release - # -- every time it happens, please consider the `prometheus.serverFiles.alerting_rules.yml` defaults as UNSTABLE. - # -- Please consider to explicitly override this input in your `values.yaml` if you need to keep it stable. 
- # - alerting_rules.yml: - # - # The majority of alerts are inspired by the great collection of rules available at: - # https://github.com/samber/awesome-prometheus-alerts - # - groups: - - name: Kubernetes # via kube-state-metrics - rules: - - alert: KubernetesNodeReady - expr: kube_node_status_condition{condition="Ready",status="true"} == 0 - for: 10m - labels: - severity: critical - annotations: - summary: Kubernetes Node ready (instance {{ $labels.instance }}) - description: "Node {{ $labels.node }} has been unready for a long time" - - - alert: KubernetesMemoryPressure - expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes memory pressure (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has MemoryPressure condition" - - - alert: KubernetesDiskPressure - expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 - for: 10m - labels: - severity: critical - annotations: - summary: Kubernetes disk pressure (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has DiskPressure condition" - - - alert: KubernetesOutOfDisk - expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes out of disk (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has OutOfDisk condition" - - - alert: KubernetesOutOfCapacity - expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes out of capacity (instance {{ $labels.instance }}) - description: "{{ $labels.node }} is out of capacity" - - - alert: KubernetesContainerOomKiller - expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1 - for: 0m - labels: - severity: critical - annotations: - summary: Kubernetes container oom killer (instance {{ $labels.instance }}) - description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes." 
- - - alert: KubernetesJobFailed - expr: kube_job_status_failed > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes Job failed (instance {{ $labels.instance }}) - description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete" - - - alert: KubernetesCronjobSuspended - expr: kube_cronjob_spec_suspend != 0 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes CronJob suspended (instance {{ $labels.instance }}) - description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended" - - - alert: KubernetesPersistentvolumeclaimPending - expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1 - for: 2m - labels: - severity: warning - annotations: - summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }}) - description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending" - - - alert: KubernetesVolumeOutOfDiskSpace - expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }}) - description: "Volume is almost full (< 10% left)" - - - alert: KubernetesVolumeFullInFourDays - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0 - for: 0m - labels: - severity: critical - annotations: - summary: Kubernetes Volume full in four days (instance {{ $labels.instance }}) - description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available." - - - alert: KubernetesPersistentvolumeError - expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }}) - description: "Persistent volume is in bad state" - - - alert: KubernetesStatefulsetDown - expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1 - for: 1m - labels: - severity: critical - annotations: - summary: Kubernetes StatefulSet down (instance {{ $labels.instance }}) - description: "A StatefulSet went down" - - - alert: KubernetesHpaScalingAbility - expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }}) - description: "Pod is unable to scale" - - - alert: KubernetesHpaMetricAvailability - expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1 - for: 5m - labels: - severity: critical - annotations: - summary: Kubernetes HPA metric availability (instance {{ $labels.instance }}) - description: "HPA is not able to collect metrics" - - - alert: KubernetesHpaScaleCapability - expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes HPA scale capability (instance {{ $labels.instance }}) - description: "The maximum number of desired Pods has been hit" - - - alert: KubernetesPodNotHealthy - expr: sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0 - for: 15m - labels: - severity: warning - annotations: - summary: Kubernetes Pod not 
healthy (instance {{ $labels.instance }}) - description: "Pod has been in a non-ready state for longer than 15 minutes." - - - alert: KubernetesPodCrashLooping - expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 - for: 2m - labels: - severity: warning - annotations: - summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) - description: "Pod {{ $labels.pod }} is crash looping" - - - alert: KubernetesReplicassetMismatch - expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }}) - description: > - The number of ready pods in the Deployment's replicaset does - not match the desired number. - - - alert: KubernetesDeploymentReplicasMismatch - expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }}) - description: > - The number of ready pods in the Deployment does not match the - desired number. - - - alert: KubernetesStatefulsetReplicasMismatch - expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }}) - description: > - The number of ready pods in the StatefulSet does not match the - desired number. - - - alert: KubernetesDeploymentGenerationMismatch - expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation - for: 10m - labels: - severity: critical - annotations: - summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }}) - description: "A Deployment has failed but has not been rolled back." - - - alert: KubernetesStatefulsetGenerationMismatch - expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation - for: 10m - labels: - severity: critical - annotations: - summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }}) - description: "A StatefulSet has failed but has not been rolled back." - - - alert: KubernetesStatefulsetUpdateNotRolledOut - expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated) - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }}) - description: "StatefulSet update has not been rolled out." 
- - - alert: KubernetesDaemonsetRolloutStuck - expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 - for: 10m - labels: - severity: warning - annotations: - summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }}) - description: "Some Pods of DaemonSet are not scheduled or not ready" - - - alert: KubernetesDaemonsetMisscheduled - expr: kube_daemonset_status_number_misscheduled > 0 - for: 5m - labels: - severity: critical - annotations: - summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }}) - description: "Some DaemonSet Pods are running where they are not supposed to run" - - - alert: KubernetesCronjobTooLong - expr: time() - kube_cronjob_next_schedule_time > 3600 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes CronJob too long (instance {{ $labels.instance }}) - description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete." - - - alert: KubernetesJobSlowCompletion - expr: kube_job_spec_completions - kube_job_status_succeeded > 0 - for: 12h - labels: - severity: critical - annotations: - summary: Kubernetes job slow completion (instance {{ $labels.instance }}) - description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time." - - - alert: KubernetesApiServerErrors - expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes API server errors (instance {{ $labels.instance }}) - description: "Kubernetes API server is experiencing high error rate" - - - alert: KubernetesApiClientErrors - expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes API client errors (instance {{ $labels.instance }}) - description: "Kubernetes API client is experiencing high error rate" - - - alert: KubernetesClientCertificateExpiresNextWeek - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }}) - description: "A client certificate used to authenticate to the apiserver is expiring next week." - - - alert: KubernetesClientCertificateExpiresSoon - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60 - for: 0m - labels: - severity: critical - annotations: - summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }}) - description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours." 
- - - alert: KubernetesApiServerLatency - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1 - for: 2m - labels: - severity: warning - annotations: - summary: Kubernetes API server latency (instance {{ $labels.instance }}) - description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}." - - - - name: Loki # via embedded exporter - rules: - - alert: LokiProcessTooManyRestarts - expr: changes(process_start_time_seconds{app="loki"}[15m]) > 2 - for: 0m - labels: - severity: warning - annotations: - summary: Loki process too many restarts (instance {{ $labels.instance }}) - description: "A loki process had too many restarts (target {{ $labels.instance }})" - - - alert: LokiRequestErrors - expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 - for: 15m - labels: - severity: warning - annotations: - summary: Loki request errors (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors" - - - alert: LokiRequestPanic - expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: Loki request panic (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics" - - - alert: LokiRequestLatency - expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 3 - for: 10m - labels: - severity: warning - annotations: - summary: Loki request latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency" - - - - name: Promtail # via embedded exporter - rules: - - alert: PromtailRequestErrors - expr: 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 - for: 5m - labels: - severity: critical - annotations: - summary: Promtail request errors (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors." - - - alert: PromtailRequestLatency - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1 - for: 5m - labels: - severity: critical - annotations: - summary: Promtail request latency (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency." - - - - name: Prometheus # via embedded exporter - rules: - - alert: PrometheusJobMissing - expr: absent(up{job="prometheus"}) - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus job missing (instance {{ $labels.instance }}) - description: "A Prometheus job has disappeared" - - - alert: PrometheusTargetMissing - expr: up == 0 - for: 5m - labels: - severity: critical - annotations: - summary: Prometheus target missing (instance {{ $labels.instance }}) - description: "A Prometheus target has disappeared. An exporter might be crashed." 
- - - alert: PrometheusAllTargetsMissing - expr: sum by (job) (up) == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus all targets missing (instance {{ $labels.instance }}) - description: "A Prometheus job does not have living target anymore." - - - alert: PrometheusConfigurationReloadFailure - expr: prometheus_config_last_reload_successful != 1 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) - description: "Prometheus configuration reload error" - - - alert: PrometheusTooManyRestarts - expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus too many restarts (instance {{ $labels.instance }}) - description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping." - - - alert: PrometheusAlertmanagerJobMissing - expr: absent(up{job="kubernetes-pods", app="prometheus", component="alertmanager"}) - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) - description: "A Prometheus AlertManager job has disappeared" - - - alert: PrometheusAlertmanagerConfigurationReloadFailure - expr: alertmanager_config_last_reload_successful != 1 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) - description: "AlertManager configuration reload error" - - - alert: PrometheusAlertmanagerConfigNotSynced - expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) - description: "Configurations of AlertManager cluster instances are out of sync" - - - alert: PrometheusAlertmanagerE2eDeadManSwitch - expr: vector(1) - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) - description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager." - - - alert: PrometheusNotConnectedToAlertmanager - expr: prometheus_notifications_alertmanagers_discovered < 1 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) - description: "Prometheus cannot connect the alertmanager" - - - alert: PrometheusRuleEvaluationFailures - expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts." 
- - - alert: PrometheusTemplateTextExpansionFailures - expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} template text expansion failures" - - - alert: PrometheusRuleEvaluationSlow - expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) - description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query." - - - alert: PrometheusNotificationsBacklog - expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus notifications backlog (instance {{ $labels.instance }}) - description: "The Prometheus notification queue has not been empty for 10 minutes" - - - alert: PrometheusAlertmanagerNotificationFailing - expr: rate(alertmanager_notifications_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) - description: "Alertmanager is failing sending notifications" - - - alert: PrometheusTargetEmpty - expr: prometheus_sd_discovered_targets == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus target empty (instance {{ $labels.instance }}) - description: "Prometheus has no target in service discovery" - - - alert: PrometheusTargetScrapingSlow - expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus target scraping slow (instance {{ $labels.instance }}) - description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned." 
- - - alert: PrometheusLargeScrape - expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus large scrape (instance {{ $labels.instance }}) - description: "Prometheus has many scrapes that exceed the sample limit" - - - alert: PrometheusTargetScrapeDuplicate - expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) - description: "Prometheus has many samples rejected due to duplicate timestamps but different values" - - - alert: PrometheusTsdbCheckpointCreationFailures - expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} checkpoint creation failures" - - - alert: PrometheusTsdbCheckpointDeletionFailures - expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} checkpoint deletion failures" - - - alert: PrometheusTsdbCompactionsFailed - expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB compactions failures" - - - alert: PrometheusTsdbHeadTruncationsFailed - expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB head truncation failures" - - - alert: PrometheusTsdbReloadFailures - expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB reload failures" - - - alert: PrometheusTsdbWalCorruptions - expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB WAL corruptions" - - - alert: PrometheusTsdbWalTruncationsFailed - expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures" - - - name: Redis # via prometheus-redis-exporter - rules: - - alert: RedisDown - expr: redis_up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Redis down (instance {{ $labels.instance }}) - description: "Redis instance is down" - - - alert: RedisMissingMaster - expr: (count(redis_instance_info{role="master"}) or vector(0)) < 1 - for: 0m - labels: - severity: critical - annotations: - summary: Redis missing master (instance {{ $labels.instance }}) - description: "Redis cluster has no 
node marked as master." - - - alert: RedisTooManyMasters - expr: count(redis_instance_info{role="master"}) > 1 - for: 0m - labels: - severity: critical - annotations: - summary: Redis too many masters (instance {{ $labels.instance }}) - description: "Redis cluster has too many nodes marked as master." - - - alert: RedisDisconnectedSlaves - expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1 - for: 0m - labels: - severity: critical - annotations: - summary: Redis disconnected slaves (instance {{ $labels.instance }}) - description: "Redis not replicating for all slaves. Consider reviewing the redis replication status." - - - alert: RedisReplicationBroken - expr: delta(redis_connected_slaves[1m]) < 0 - for: 0m - labels: - severity: critical - annotations: - summary: Redis replication broken (instance {{ $labels.instance }}) - description: "Redis instance lost a slave" - - - alert: RedisClusterFlapping - expr: changes(redis_connected_slaves[1m]) > 1 - for: 2m - labels: - severity: critical - annotations: - summary: Redis cluster flapping (instance {{ $labels.instance }}) - description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping)." - - - alert: RedisMissingBackup - expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24 - for: 0m - labels: - severity: critical - annotations: - summary: Redis missing backup (instance {{ $labels.instance }}) - description: "Redis has not been backuped for 24 hours" - - # The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. - - alert: RedisOutOfSystemMemory - expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 - for: 2m - labels: - severity: warning - annotations: - summary: Redis out of system memory (instance {{ $labels.instance }}) - description: "Redis is running out of system memory (> 90%)" - - - alert: RedisOutOfConfiguredMaxmemory - expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 - for: 2m - labels: - severity: warning - annotations: - summary: Redis out of configured maxmemory (instance {{ $labels.instance }}) - description: "Redis is running out of configured maxmemory (> 90%)" - - - alert: RedisTooManyConnections - expr: redis_connected_clients > 100 - for: 2m - labels: - severity: warning - annotations: - summary: Redis too many connections (instance {{ $labels.instance }}) - description: "Redis instance has too many connections" - - - alert: RedisNotEnoughConnections - expr: redis_connected_clients < 5 - for: 2m - labels: - severity: warning - annotations: - summary: Redis not enough connections (instance {{ $labels.instance }}) - description: "Redis instance should have more connections (> 5)" - - - alert: RedisRejectedConnections - expr: increase(redis_rejected_connections_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Redis rejected connections (instance {{ $labels.instance }}) - description: "Some connections to Redis has been rejected" - - + prometheus.yml: + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + - /etc/posthog/*.yml + + server: + extraConfigmapMounts: + - name: posthog-prometheus-alerts-default + mountPath: /etc/posthog/ + configMap: posthog-prometheus-alerts-default + readOnly: true + + configmapReload: + 
extraConfigmapMounts: + - name: posthog-prometheus-alerts-default + mountPath: /etc/posthog/ + configMap: posthog-prometheus-alerts-default + readOnly: true + + extraVolumeDirs: + - /etc/posthog/ ### ### ### ---- prometheus-statsd-exporter ----
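A quick local sanity check of this refactor is sketched below. It is not part of the patch itself; it assumes you are at the repository root, that `promtool` and `helm` are installed, and that chart dependencies have already been fetched (e.g. via `helm dependency update charts/posthog`). The mechanism being checked is the one the patch introduces: the default rules now live in a standalone file, shipped through the `posthog-prometheus-alerts-default` ConfigMap, mounted at /etc/posthog/ and picked up by the new `/etc/posthog/*.yml` entry in rule_files, so the file can be linted directly:

    # Lint the extracted default alerting rules with Prometheus' rule checker
    promtool check rules charts/posthog/prometheus/alerts-default.yml

    # Render only the new ConfigMap template to confirm .Files.Glob picks up the file
    helm template posthog charts/posthog --show-only templates/prometheus_default_alerting_rules.yaml

If both commands succeed, the rules parse cleanly and the ConfigMap renders with the file embedded under its data key, which is the behaviour this patch relies on.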