-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathcluster-capacity-management-alerts.yaml
130 lines (130 loc) · 9.65 KB
/
cluster-capacity-management-alerts.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
{{- /*
PrometheusRule that carries the "cluster-capacity-management-alerts" rule group.
The whole manifest is rendered only when both .Values.defaultRules.create and
.Values.defaultRules.rules.capacityManagementAlerts are truthy.
*/}}
{{- if and .Values.defaultRules.create .Values.defaultRules.rules.capacityManagementAlerts}}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
# Name is the "prometheus-alerts.fullname" helper output joined with a fixed
# suffix, truncated to 63 characters, with any trailing "-" removed.
name: {{ printf "%s-%s" (include "prometheus-alerts.fullname" .) "cluster-capacity-management-alerts" | trunc 63 | trimSuffix "-" }}
labels:
app: {{ template "prometheus-alerts.name" . }}
# Labels from the "prometheus-alerts.labels" helper, indented by 4 spaces.
{{ include "prometheus-alerts.labels" . | indent 4 }}
# Optional extra labels taken from defaultRules.alertLabels.
{{- if .Values.defaultRules.alertLabels }}
{{ toYaml .Values.defaultRules.alertLabels | indent 4 }}
{{- end }}
# The annotations key is emitted only when defaultRules.annotations is set.
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
# Single rule group; every alert below belongs to it.
- name: cluster-capacity-management-alerts
rules:
# Warns when, for 1h, a node-exporter filesystem that is not read-only
#   1. has a free-space percentage below 100 minus capacityManagementAlertsDiskLimit, and
#   2. is still at or below that free-space floor 3 days ahead, according to a
#      linear extrapolation of the last 6h of node_filesystem_avail_bytes.
# Every series uses the same job/fstype matchers so the comparison operands
# are drawn from the same set of filesystems.
# NOTE(review): the summary says "run out of space" while the expression tests
# against the configured disk limit, not zero - confirm the intended wording.
- alert: NodeFilesystemSpaceFillingUp
  annotations:
    description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
    runbook_url: {{ .Values.defaultRules.runbookUrl }}node/nodefilesystemspacefillingup
    summary: Filesystem is predicted to run out of space within the next 3 days.
  expr: |-
    (
      (node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100) < (100-{{ .Values.capacityManagementAlertsDiskLimit }})
    and
      predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 3*24*60*60) <= (node_filesystem_size_bytes{job="node-exporter",fstype!=""}*(1-{{ .Values.capacityManagementAlertsDiskLimit }}/100))
    and
      node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
    )
  for: 1h
  labels:
    severity: warning
{{- if .Values.capacityManagementAlertsPersistentVolumeEnabled }}
# Rendered only when capacityManagementAlertsPersistentVolumeEnabled is set.
# Warns when a 24h linear extrapolation of kubelet_volume_stats_available_bytes,
# projected 3 days ahead, is at or below the free space that remains at
# capacityManagementAlertsPersistentVolumeLimit percent usage of the volume capacity.
# The limit is a percentage, so the message prints it with a "%" sign.
- alert: PersistentVolume{{.Values.capacityManagementAlertsPersistentVolumeLimit}}PercentInThreeDays
  annotations:
    message: The PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} will go over {{.Values.capacityManagementAlertsPersistentVolumeLimit}}% in three days.
  expr: predict_linear(kubelet_volume_stats_available_bytes[24h], 3*24*60*60) <= (kubelet_volume_stats_capacity_bytes*(1-{{ .Values.capacityManagementAlertsPersistentVolumeLimit }}/100))
  for: 5m
  labels:
    severity: warning
{{- end }}
{{- if .Values.capacityManagementAlertsPredictUsage}}
# Rendered only when capacityManagementAlertsPredictUsage is set.
# Warns when a 24h linear extrapolation of node_memory_MemAvailable_bytes,
# projected 3 days ahead, is at or below the memory that remains free at
# capacityManagementAlertsUsageLimit percent usage of node_memory_MemTotal_bytes.
# The folded scalars below join their lines with single spaces.
- alert: Memory{{.Values.capacityManagementAlertsUsageLimit}}PercentInThreeDays
  annotations:
    message: >-
      Memory usage in Cluster {{`{{ $labels.cluster }}`}}
      Instance {{`{{ $labels.instance }}`}}
      is predicted to go over {{.Values.capacityManagementAlertsUsageLimit}}%
      within the next 3 days at current use rate.
  expr: >-
    predict_linear(node_memory_MemAvailable_bytes[24h], 3*24*60*60)
    <=
    (node_memory_MemTotal_bytes*(1-{{ .Values.capacityManagementAlertsUsageLimit }}/100))
  for: 5m
  labels:
    severity: warning
{{- end }}
# Node-group CPU pressure over a 24h window.
# Per instance: the summed non-idle node_cpu_seconds_total rate over 24h divided by
# instance:node_num_cpu:sum, joined with kube_node_labels (node relabelled to
# instance) to attach the node-group and cluster labels. The result is averaged
# per node group and cluster and compared with
# capacityManagementAlertsNodeGroupCpuLimit24h / 100.
# The folded scalars below join their lines with single spaces.
- alert: NodeGroupCPU{{.Values.capacityManagementAlertsNodeGroupCpuLimit24h}}PercentOver24h
  annotations:
    message: >-
      CPU usage has been over {{.Values.capacityManagementAlertsNodeGroupCpuLimit24h}}%
      on average over the span of 24h
      in the Node Group {{`{{ $labels.label_elastisys_io_node_group }}`}}
      in Cluster {{`{{ $labels.cluster }}`}}.
  expr: >-
    avg by (label_elastisys_io_node_group,cluster)
    (sum by (instance) (rate(node_cpu_seconds_total{mode!='idle',cluster=~".*"}[24h]))
    / on (instance) instance:node_num_cpu:sum
    * on (instance) group_left (label_elastisys_io_node_group,cluster)
    label_replace(kube_node_labels{label_elastisys_io_node_group!=""}, "instance", "$1", "node", "(.*)"))
    > {{.Values.capacityManagementAlertsNodeGroupCpuLimit24h}}/100
  for: 5m
  labels:
    severity: warning
# Node-group CPU pressure over a 1h window.
# Same shape as the 24h node-group CPU rule, but the rate is taken over 1h and
# the threshold is capacityManagementAlertsNodeGroupCpuLimit1h / 100.
# The folded scalars below join their lines with single spaces.
- alert: NodeGroupCPU{{.Values.capacityManagementAlertsNodeGroupCpuLimit1h}}PercentOver1h
  annotations:
    message: >-
      CPU usage has been over {{.Values.capacityManagementAlertsNodeGroupCpuLimit1h}}%
      on average over the span of 1h
      in the Node Group {{`{{ $labels.label_elastisys_io_node_group }}`}}
      in Cluster {{`{{ $labels.cluster }}`}}.
  expr: >-
    avg by (label_elastisys_io_node_group,cluster)
    (sum by (instance) (rate(node_cpu_seconds_total{mode!='idle',cluster=~".*"}[1h]))
    / on (instance) instance:node_num_cpu:sum
    * on (instance) group_left (label_elastisys_io_node_group,cluster)
    label_replace(kube_node_labels{label_elastisys_io_node_group!=""}, "instance", "$1", "node", "(.*)"))
    > {{.Values.capacityManagementAlertsNodeGroupCpuLimit1h}}/100
  for: 5m
  labels:
    severity: warning
# Node-group memory pressure over a 24h window.
# Per instance: the 24h avg_over_time of instance:node_memory_utilisation:ratio,
# joined with kube_node_labels (node relabelled to instance) to attach the
# node-group label. The result is averaged per node group and cluster and
# compared with capacityManagementAlertsNodeGroupMemoryLimit24h / 100.
# The folded scalars below join their lines with single spaces.
- alert: NodeGroupMemory{{.Values.capacityManagementAlertsNodeGroupMemoryLimit24h}}PercentOver24h
  annotations:
    message: >-
      Memory usage has been over {{.Values.capacityManagementAlertsNodeGroupMemoryLimit24h}}%
      on average over the span of 24h
      in the Node Group {{`{{ $labels.label_elastisys_io_node_group }}`}}
      in Cluster {{`{{ $labels.cluster }}`}}.
  expr: >-
    avg by (label_elastisys_io_node_group,cluster)
    ((avg_over_time (instance:node_memory_utilisation:ratio{cluster=~".*"}[24h]))
    * on (instance) group_left (label_elastisys_io_node_group)
    label_replace(kube_node_labels{label_elastisys_io_node_group!=""}, "instance", "$1", "node", "(.*)"))
    > {{.Values.capacityManagementAlertsNodeGroupMemoryLimit24h}}/100
  for: 5m
  labels:
    severity: warning
# Node-group memory pressure over a 1h window.
# Same shape as the 24h node-group memory rule, but the utilisation ratio is
# averaged over 1h and the threshold is
# capacityManagementAlertsNodeGroupMemoryLimit1h / 100.
# The folded scalars below join their lines with single spaces.
- alert: NodeGroupMemory{{.Values.capacityManagementAlertsNodeGroupMemoryLimit1h}}PercentOver1h
  annotations:
    message: >-
      Memory usage has been over {{.Values.capacityManagementAlertsNodeGroupMemoryLimit1h}}%
      on average over the span of 1h
      in the Node Group {{`{{ $labels.label_elastisys_io_node_group }}`}}
      in Cluster {{`{{ $labels.cluster }}`}}.
  expr: >-
    avg by (label_elastisys_io_node_group,cluster)
    ((avg_over_time (instance:node_memory_utilisation:ratio{cluster=~".*"}[1h]))
    * on (instance) group_left (label_elastisys_io_node_group)
    label_replace(kube_node_labels{label_elastisys_io_node_group!=""}, "instance", "$1", "node", "(.*)"))
    > {{.Values.capacityManagementAlertsNodeGroupMemoryLimit1h}}/100
  for: 5m
  labels:
    severity: warning
# Per-node CPU pressure over a 1h window.
# The summed non-idle node_cpu_seconds_total rate per instance is divided by
# instance:node_num_cpu:sum and joined with kube_node_labels (node relabelled to
# instance), restricted to nodes with a non-empty node-group label. There is no
# outer aggregation, so each instance is compared on its own with
# capacityManagementAlertsNodeCpuLimit1h / 100.
# The folded scalars below join their lines with single spaces.
- alert: NodeCPU{{.Values.capacityManagementAlertsNodeCpuLimit1h}}PercentOver1h
  annotations:
    message: >-
      CPU usage has been over {{.Values.capacityManagementAlertsNodeCpuLimit1h}}%
      on average over the span of 1h
      for the node {{`{{ $labels.instance }}`}}
      in Cluster {{`{{ $labels.cluster }}`}}.
  expr: >-
    sum by (instance) (rate(node_cpu_seconds_total{mode!='idle',cluster=~".*"}[1h]))
    / on (instance) instance:node_num_cpu:sum
    * on (instance) group_left (label_elastisys_io_node_group,cluster)
    label_replace(kube_node_labels{label_elastisys_io_node_group!=""}, "instance", "$1", "node", "(.*)")
    > {{.Values.capacityManagementAlertsNodeCpuLimit1h}}/100
  for: 5m
  labels:
    severity: warning
# Per-node memory pressure over a 1h window.
# The 1h avg_over_time of instance:node_memory_utilisation:ratio is joined with
# kube_node_labels (node relabelled to instance), restricted to nodes with a
# non-empty node-group label, and each instance is compared with
# capacityManagementAlertsNodeMemoryLimit1h / 100.
# The folded message below joins its lines with single spaces; the expression
# keeps its literal line breaks.
- alert: NodeMemory{{.Values.capacityManagementAlertsNodeMemoryLimit1h}}PercentOver1h
  annotations:
    message: >-
      Memory usage has been over {{.Values.capacityManagementAlertsNodeMemoryLimit1h}}%
      on average over the span of 1h
      for the Node {{`{{ $labels.instance }}`}}
      in Cluster {{`{{ $labels.cluster }}`}}.
  expr: |-
    (
    avg_over_time (instance:node_memory_utilisation:ratio{cluster=~".*"}[1h]) * on (instance) group_left (label_elastisys_io_node_group)
    label_replace(kube_node_labels{label_elastisys_io_node_group!=""}, "instance", "$1", "node", "(.*)")
    ) > {{.Values.capacityManagementAlertsNodeMemoryLimit1h}}/100
  for: 5m
  labels:
    severity: warning
# Node-group CPU request saturation.
# Steps, innermost first:
#   1. Sum the CPU requests of containers whose pod is in phase Running, per node and cluster.
#   2. Divide by the node's allocatable CPU to get a per-node request ratio.
#   3. Join with kube_node_labels on node to attach the node-group label; node groups
#      matching capacityManagementAlertsRequestsExcludePattern are dropped by the join.
#   4. Average the ratios per node group and cluster and compare with
#      capacityManagementAlertsCpuRequestLimit / 100.
# The aggregation must wrap the joined ratio: averaging "by node group" before the
# join runs on series that do not carry that label yet, and the following division
# against per-node series then matches nothing, so the alert could never fire.
- alert: NodeGroupCpuRequest{{ .Values.capacityManagementAlertsCpuRequestLimit }}Percent
  annotations:
    message: Average CPU requests is over {{ .Values.capacityManagementAlertsCpuRequestLimit }}% in the Node Group {{`{{ $labels.label_elastisys_io_node_group }}`}} in Cluster {{`{{ $labels.cluster }}`}}.
  expr: |-
    avg by (label_elastisys_io_node_group,cluster) (
      (
        sum by (node,cluster) (
          kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="cpu"}
          and on(pod, namespace, cluster)
          kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1
        )
        /
        sum by (node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="cpu"})
      )
      * on (node) group_left (label_elastisys_io_node_group)
      label_replace(kube_node_labels{label_elastisys_io_node_group!~'{{ .Values.capacityManagementAlertsRequestsExcludePattern }}'}, "instance", "$1", "node", "(.*)")
    ) >= {{ .Values.capacityManagementAlertsCpuRequestLimit }}/100
  for: 5m
  labels:
    severity: warning
# Node-group memory request saturation.
# Steps, innermost first:
#   1. Sum the memory requests of containers whose pod is in phase Running, per node and cluster.
#   2. Divide by the node's allocatable memory to get a per-node request ratio.
#   3. Join with kube_node_labels on node to attach the node-group label; node groups
#      matching capacityManagementAlertsRequestsExcludePattern are dropped by the join.
#   4. Average the ratios per node group and cluster and compare with
#      capacityManagementAlertsMemoryRequestLimit / 100.
# The aggregation must wrap the joined ratio: averaging "by node group" before the
# join runs on series that do not carry that label yet, and the following division
# against per-node series then matches nothing, so the alert could never fire.
- alert: NodeGroupMemoryRequest{{ .Values.capacityManagementAlertsMemoryRequestLimit }}Percent
  annotations:
    message: Average memory requests is over {{ .Values.capacityManagementAlertsMemoryRequestLimit }}% in the Node Group {{`{{ $labels.label_elastisys_io_node_group }}`}} in Cluster {{`{{ $labels.cluster }}`}}.
  expr: |-
    avg by (label_elastisys_io_node_group,cluster) (
      (
        sum by (node,cluster) (
          kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="memory"}
          and on(pod, namespace, cluster)
          kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1
        )
        /
        sum by (node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="memory"})
      )
      * on (node) group_left (label_elastisys_io_node_group)
      label_replace(kube_node_labels{label_elastisys_io_node_group!~'{{ .Values.capacityManagementAlertsRequestsExcludePattern }}'}, "instance", "$1", "node", "(.*)")
    ) >= {{ .Values.capacityManagementAlertsMemoryRequestLimit }}/100
  for: 5m
  labels:
    severity: warning
{{- /* Closes the guard on defaultRules.create and defaultRules.rules.capacityManagementAlerts that opens this template. */}}
{{- end }}