Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
send regular GPU utilization report with CronJob (#5281)
Browse files Browse the repository at this point in the history
  • Loading branch information
suiguoxin authored Feb 7, 2021
1 parent be386f0 commit edc67c2
Show file tree
Hide file tree
Showing 19 changed files with 406 additions and 3 deletions.
23 changes: 22 additions & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
rm -rf ./src/watchdog/GOPATH/src/github.com/microsoft/watchdog/vendor/
misspell -error .
pylint:
pylint-deployment:
name: pylint of deployment scripts
runs-on: ubuntu-16.04

Expand All @@ -64,6 +64,27 @@ jobs:
- name: Lint
run: |
pylint contrib/kubespray/script --rcfile=contrib/kubespray/script/pylintrc
pylint-alert-manager:
  # Lints the alert-manager cluster-utilization scripts against their own pylintrc.
  name: pylint of alert-manager
  runs-on: ubuntu-16.04

  steps:
    - name: Checkout
      uses: actions/checkout@v1
    - name: Use Python 3.7
      uses: actions/setup-python@v2
      with:
        # Quoted so YAML keeps the version a string rather than a float
        # (an unquoted 3.10 would be read as 3.1).
        python-version: '3.7'
        architecture: x64
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install -r src/alert-manager/src/cluster-utilization/requirements.txt
        python -m pip install pylint
    - name: Lint
      run: |
        pylint src/alert-manager/src/cluster-utilization/ --rcfile=src/alert-manager/src/cluster-utilization/pylintrc
swagger-validate:
name: Validate swagger
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,9 @@ authentication:
# smtp-from: [email protected]
# smtp-auth-username: [email protected]
# smtp-auth-password: password-for-alert-sender
# cluster-utilization: # cluster-utilization is a k8s CronJob which reports the GPU utilization of the cluster
# # for schedule syntax, refer to https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax
# schedule: "0 0 * * *" # daily report at UTC 00:00
# customized-routes:
# routes:
# - receiver: pai-email-admin-user-and-stop-job
Expand Down
3 changes: 3 additions & 0 deletions deployment/quick-start/services-configuration.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ rest-server:
# smtp-from: [email protected]
# smtp-auth-username: [email protected]
# smtp-auth-password: password-for-alert-sender
# cluster-utilization: # cluster-utilization is a k8s CronJob which reports the GPU utilization of the cluster
# # for schedule syntax, refer to https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax
# schedule: "0 0 * * *" # daily report at UTC 00:00
# customized-routes:
# routes:
# - receiver: pai-email-admin-user-and-stop-job
Expand Down
2 changes: 1 addition & 1 deletion docs/manual/cluster-admin/how-to-use-alert-system.md
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ Remember to re-build and push the docker image, and restart the `alert-manager`

```bash
./build/pai_build.py build -c /cluster-configuration/ -s alert-manager
./build/pai_build.py push -c /cluster-configuration/ -i alert-handler
./build/pai_build.py push -c /cluster-configuration/ -i alert-handler cluster-utilization
./paictl.py service stop -n alert-manager
./paictl.py config push -p /cluster-configuration -m service
./paictl.py service start -n alert-manager
Expand Down
3 changes: 3 additions & 0 deletions examples/cluster-configuration/services-configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ rest-server:
# smtp-from: [email protected]
# smtp-auth-username: [email protected]
# smtp-auth-password: password-for-alert-sender
# cluster-utilization: # cluster-utilization is a k8s CronJob which reports the GPU utilization of the cluster
# # for schedule syntax, refer to https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax
# schedule: "0 0 * * *" # daily report at UTC 00:00
# customized-routes:
# routes:
# - receiver: pai-email-admin-user-and-stop-job
Expand Down
24 changes: 24 additions & 0 deletions src/alert-manager/build/cluster-utilization.common.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Image that runs the cluster-utilization reporter (send_alert.py).
FROM python:3.7

# Keep the application files in a dedicated directory instead of the image root;
# the relative COPY destination and the ENTRYPOINT both resolve against it.
WORKDIR /cluster-utilization

COPY ./src/cluster-utilization .

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir -r requirements.txt

ENTRYPOINT ["python3", "send_alert.py"]
4 changes: 3 additions & 1 deletion src/alert-manager/config/alert-manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,7 @@ alert-handler:
log-level: 'info'
port: 9095
configured: False
cluster-utilization:
configured: False
use-pylon: False
repeat-interval: '24h'
repeat-interval: '24h'
4 changes: 4 additions & 0 deletions src/alert-manager/config/alert_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ def run(self):
else:
result["alert-handler"]["configured"] = False

if result.get("cluster-utilization") is not None and \
result["cluster-utilization"].get("schedule") is not None:
result["cluster-utilization"]["configured"] = True

result["host"] = self.get_master_ip()
result["url"] = "http://{0}:{1}".format(self.get_master_ip(), result["port"])

Expand Down
11 changes: 11 additions & 0 deletions src/alert-manager/deploy/alert-manager-configmap.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ data:
- receiver: pai-cordon-nodes
match:
alertname: NvidiaSmiDoubleEccError

- receiver: pai-cluster-usage
match:
report_type: cluster-usage

{% if 'routes' in cluster_cfg["alert-manager"]["customized-routes"] %}
{% for route in cluster_cfg["alert-manager"]["customized-routes"]["routes"] %}
Expand All @@ -62,6 +66,13 @@ data:
- url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin'
send_resolved: true
{% endif %}

- name: pai-cluster-usage
webhook_configs:
{% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %}
- url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin/?template=cluster-usage'
send_resolved: false
{% endif %}

- name: pai-cordon-nodes
webhook_configs:
Expand Down
44 changes: 44 additions & 0 deletions src/alert-manager/deploy/alert-manager-cronjob.yaml.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


# CronJob that runs the cluster-utilization image on the schedule configured
# under alert-manager.cluster-utilization.schedule.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: cluster-utilization
spec:
  schedule: "{{ cluster_cfg["alert-manager"]["cluster-utilization"]["schedule"] }}"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: cluster-utilization
              image: {{ cluster_cfg['cluster']['docker-registry']['prefix'] }}cluster-utilization:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}
              imagePullPolicy: Always
              env:
                # Use the HTTPS pylon URI when pylon has SSL configured, otherwise the plain URI.
                - name: PAI_URI
                {%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %}
                  value: "{{ cluster_cfg['pylon']['uri-https']}}"
                {%- else %}
                  value: "{{ cluster_cfg['pylon']['uri']}}"
                {%- endif %}
                - name: PAI_BEARER_TOKEN
                  # Quoted so the rendered token is always a YAML string, even when it is
                  # empty or looks like a number/boolean (env values must be strings).
                  # NOTE(review): the token is rendered in plain text into the manifest;
                  # consider sourcing it from a Secret.
                  value: "{{ cluster_cfg["alert-manager"]["alert-handler"]["pai-bearer-token"] }}"
          imagePullSecrets:
            - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
          restartPolicy: OnFailure
84 changes: 84 additions & 0 deletions src/alert-manager/deploy/alert-templates/cluster-usage/html.ejs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">

<head>
<meta name="viewport" content="width=device-width" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>
<%= cluster_id %>: Cluster GPU utilization for One Week
</title>
</head>

<body itemscope="" itemtype="http://schema.org/EmailMessage"
style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; -webkit-font-smoothing: antialiased; -webkit-text-size-adjust: none; height: 100%; line-height: 1.6em; width: 100% !important; background-color: #f6f6f6; margin: 0; padding: 0;">
<h3 style="text-align:center">Cluster GPU utilization for One Week</h3>
<table style="font-size: 16px; width: 100%; margin: 0;">
<% alerts.filter( element=> typeof element.labels.cluster_usage !== 'undefined').forEach(function(alert){ %>
<tr>
<th>Cluster GPU utilization</th>
<td>
<%= alert.labels.cluster_usage %>
</td>
</tr>
<% }); %>
</table>
<br />
<br />
<h3 style="text-align:center">User GPU Utilization for One Week</h3>
<table style="font-size: 16px; width: 100%;margin: 0;text-align:center;">
<tr>
<th>User name</th>
<th>GPU utilization</th>
</tr>
<% alerts.filter( element=> typeof element.labels.user_name !== 'undefined' && typeof element.labels.user_usage !==
'undefined').forEach(function(alert){ %>
<tr>
<td>
<%= alert.labels.user_name %>
</td>
<td>
<%= alert.labels.user_usage %>
</td>
</tr>
<% }); %>
</table>
<br />
<br />
<h3 style="text-align:center">Job GPU Utilization for One Week</h3>
<table style="font-size: 16px; width: 100%; margin: 0; text-align:center;">
<tr>
<th>Job name</th>
<th>GPU utilization</th>
<th>Job duration</th>
<th>Job start time</th>
<th>Job status</th>
<th>GPU number</th>
</tr>
<% alerts.filter( element=> typeof element.labels.job_name !== 'undefined' && typeof element.labels.job_usage !==
'undefined').forEach(function(alert){ %>
<tr>
<td>
<%= alert.labels.job_name %>
</td>
<td>
<%= alert.labels.job_usage %>
</td>
<td>
<%= alert.labels.job_duration %>
</td>
<td>
<%= alert.labels.job_start_time %>
</td>
<td>
<%= alert.labels.job_status %>
</td>
<td>
<%= alert.labels.job_gpu_number %>
</td>
</tr>
<% }); %>
</table>
</body>

</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<%= cluster_id %>: Cluster GPU Utilization for One Week
1 change: 1 addition & 0 deletions src/alert-manager/deploy/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ prerequisite:
template-list:
- alert-manager-deployment.yaml
- alert-manager-configmap.yaml
- alert-manager-cronjob.yaml
- start.sh

start-script: start.sh
Expand Down
3 changes: 3 additions & 0 deletions src/alert-manager/deploy/start.sh.template
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ kubectl create configmap alert-templates \
kubectl apply --overwrite=true -f rbac.yaml || exit $?
kubectl apply --overwrite=true -f alert-manager-configmap.yaml || exit $?
kubectl apply --overwrite=true -f alert-manager-deployment.yaml || exit $?
{% if cluster_cfg["alert-manager"]["cluster-utilization"]["configured"] -%}
kubectl apply --overwrite=true -f alert-manager-cronjob.yaml || exit $?
{% endif -%}

sleep 10
# wait until the service is ready.
Expand Down
1 change: 1 addition & 0 deletions src/alert-manager/deploy/stop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
kubectl delete --ignore-not-found --now configmap/alert-templates
kubectl delete --ignore-not-found --now configmap/alertmanager
kubectl delete --ignore-not-found --now deployment/alertmanager
kubectl delete --ignore-not-found --now cronjob/cluster-utilization

if kubectl get clusterrolebinding | grep -q "alert-manager-role-binding"; then
kubectl delete clusterrolebinding alert-manager-role-binding || exit $?
Expand Down
Empty file.
9 changes: 9 additions & 0 deletions src/alert-manager/src/cluster-utilization/pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Pylint settings for the cluster-utilization scripts (passed to pylint via --rcfile).
[SETTINGS]

# Allow lines of up to 140 characters.
max-line-length=140

# Checks that are switched off for this package.
disable =
missing-docstring,
invalid-name,
cell-var-from-loop,
undefined-loop-variable,
1 change: 1 addition & 0 deletions src/alert-manager/src/cluster-utilization/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
requests==2.23.0
Loading

0 comments on commit edc67c2

Please sign in to comment.