Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
add vcs and percent in configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
suiguoxin committed Aug 6, 2020
1 parent 13c6871 commit 410bc32
Show file tree
Hide file tree
Showing 6 changed files with 17 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -201,10 +201,14 @@ alert-manager:


# uncomment the following if you want to customize prometheus
# prometheus:
prometheus:
# port: 9091
# # How frequently to scrape targets
# scrape_interval: 30
alert-handler:
virtual_clusters: default|autosys
percent: 0.3
time: 10s


# uncomment following section if you want to customize the threshold of cleaner
Expand Down
2 changes: 2 additions & 0 deletions src/alert-manager/deploy/alert-manager.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ spec:
env:
- name: REST_SERVER_URI
value: {{ cluster_cfg['rest-server']['uri'] }}
imagePullSecrets:
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
volumes:
- name: config-volume
configMap:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,17 @@ groups:
expr: zombie_container_count > 0
for: 1h # fire only when the condition has held for more than 1 hour
annotations:
summary: "zombie container in {{$labels.instance}} detected"
# Escaped braces pass the Prometheus label template through Jinja untouched; keep the space before "detected".
summary: "zombie container in {{ '{{' }}$labels.instance{{ '}}' }} detected"
- alert: PaiJobPending
expr: pai_job_pod_count{pod_bound="true", phase="pending"} > 0
for: 30m
annotations:
summary: "Job {{$labels.job_name}} in pending status detected"
# Escaped braces pass the Prometheus label template through Jinja untouched; keep the space after the job name.
summary: "Job {{ '{{' }}$labels.job_name{{ '}}' }} in pending status detected"
- name: pai-alert-handler
rules:
- alert: LowTaskGpuPercent
expr: avg(task_gpu_percent{virtual_cluster=~"default|autosys"}) by (job_name) < 0.3
for: 10s
expr: avg(task_gpu_percent{virtual_cluster=~"{{ cluster_cfg["prometheus"]["alert-handler"]["virtual_clusters"] }}"}) by (job_name) < {{ cluster_cfg["prometheus"]["alert-handler"]["percent"] }}
for: {{ cluster_cfg["prometheus"]["alert-handler"]["time"] }}
annotations:
summary: "{{$labels.job_name}} has gpu percent lower than 30% for 1h, will be killed"
# The rule aggregates by (job_name), so job_name is the only label left on the alert.
# Threshold and duration are rendered from the same config values used by expr/for, so the text cannot drift.
summary: "{{ '{{' }}$labels.job_name{{ '}}' }} has gpu percent lower than {{ cluster_cfg["prometheus"]["alert-handler"]["percent"] }} for {{ cluster_cfg["prometheus"]["alert-handler"]["time"] }}, will be killed"
description: In certain virtual clusters, jobs with low gpu utilization will be killed.
2 changes: 1 addition & 1 deletion src/prometheus/deploy/prometheus-configmap.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ data:
- "/etc/prometheus-alert/*.rules"
- "/etc/prometheus-record/*.rules"
scrape_configs:
- job_name: 'pai_serivce_exporter'
- job_name: 'pai_service_exporter'
scrape_interval: {{ prom_info["scrape_interval"] }}s
kubernetes_sd_configs:
- role: pod
Expand Down
1 change: 1 addition & 0 deletions src/prometheus/deploy/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ template-list:
- prometheus-deployment.yaml
- start.sh
- delete.yaml
- alerting/jobs.rules

start-script: start.sh
stop-script: stop.sh
Expand Down
6 changes: 3 additions & 3 deletions src/tools/tests/test_alert_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def success_response():
"alertname": "JobExporterHangs",
"alertstate": "pending",
"instance": "10.0.0.1:9102",
"job": "pai_serivce_exporter",
"job": "pai_service_exporter",
"name": "docker_daemon_collector",
"pai_service_name": "job-exporter",
"scraped_from": "job-exporter-p4skn",
Expand All @@ -57,7 +57,7 @@ def success_response():
"alertname": "NodeMemoryUsage",
"alertstate": "firing",
"instance": "10.0.0.2:9100",
"job": "pai_serivce_exporter",
"job": "pai_service_exporter",
"pai_service_name": "node-exporter",
"scraped_from": "node-exporter-blkpp"
},
Expand All @@ -73,7 +73,7 @@ def success_response():
"alertstate": "firing",
"command": "nvidia-smi",
"instance": "10.0.0.3:9102",
"job": "pai_serivce_exporter",
"job": "pai_service_exporter",
"pai_service_name": "job-exporter",
"scraped_from": "job-exporter-t4sv6"
},
Expand Down

0 comments on commit 410bc32

Please sign in to comment.