Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
suiguoxin committed Mar 18, 2021
1 parent 9be9ea6 commit d7f758a
Show file tree
Hide file tree
Showing 13 changed files with 242 additions and 15 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Image for the one-shot job that runs nvidia-gpu-low-perf-fixer.sh against a GPU.
FROM nvidia/cuda:11.2.2-base-ubuntu16.04

# The fixer script may invoke nvidia-smi through sudo, so sudo has to be present.
# -y keeps the build non-interactive, --no-install-recommends avoids pulling in
# optional packages, and the apt lists are removed in the same layer so they do
# not end up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends sudo && \
    rm -rf /var/lib/apt/lists/*

# Copy the contents of the fixer directory into the image's working directory.
COPY ./src/nvidia-gpu-low-perf-fixer .

# Exec form: bash is started directly (no intermediate `/bin/sh -c`), so it
# receives signals and any runtime arguments are appended to the command.
ENTRYPOINT ["/bin/bash", "nvidia-gpu-low-perf-fixer.sh"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Image for the one-shot job that runs nvidia-gpu-low-perf-state-fixer.sh against a GPU.
FROM nvidia/cuda:10.2-base

# The fixer script may invoke nvidia-smi through sudo, which this Dockerfile
# did not install before. The apt lists are removed in the same layer so they
# do not end up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends sudo && \
    rm -rf /var/lib/apt/lists/*

# Copy the contents of the fixer directory into the image's working directory.
COPY ./src/nvidia-gpu-low-perf-state-fixer .

# Exec form is required here: with the shell form, arguments supplied at run
# time (e.g. the `args` of a Kubernetes container spec) are NOT passed to the
# script, so the GPU selection option would be silently dropped.
ENTRYPOINT ["/bin/bash", "nvidia-gpu-low-perf-state-fixer.sh"]
13 changes: 13 additions & 0 deletions src/alert-manager/deploy/alert-manager-configmap.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ data:
match:
report_type: cluster-usage

- receiver: fix-nvidia-gpu-low-perf
match:
alertname: NodeGpuLowPerfState

{% if 'routes' in cluster_cfg["alert-manager"]["customized-routes"] %}
{% for route in cluster_cfg["alert-manager"]["customized-routes"]["routes"] %}
- receiver: {{ route.receiver}}
Expand Down Expand Up @@ -74,6 +78,15 @@ data:
send_resolved: false
{% endif %}

- name: fix-nvidia-gpu-low-perf
webhook_configs:
{% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %}
- url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin'
send_resolved: true
{% endif %}
- url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
send_resolved: false

- name: pai-cordon-nodes
webhook_configs:
{% if 'cordon-nodes' in cluster_cfg["alert-manager"]["actions-available"] %}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ spec:
value: {{ cluster_cfg["cluster"]["common"]["cluster-id"] }}
- name: REST_SERVER_URI
value: {{ cluster_cfg['rest-server']['uri'] }}
- name: DOCKER_REGISTRY_PREFIX
value: {{ cluster_cfg['cluster']['docker-registry']['prefix'] }}
- name: DOCKER_REGISTRY_TAG
value: {{ cluster_cfg['cluster']['docker-registry']['tag'] }}
- name: WEBPORTAL_URI
{%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %}
value: "{{ cluster_cfg['pylon']['uri-https']}}"
Expand Down
3 changes: 3 additions & 0 deletions src/alert-manager/deploy/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["patch"]
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
Expand Down
15 changes: 1 addition & 14 deletions src/alert-manager/src/alert-handler/controllers/mail.js
Original file line number Diff line number Diff line change
Expand Up @@ -88,19 +88,6 @@ const sendEmailToAdmin = (req, res) => {
});
};

const getUserNameByJobName = async (jobName, token) => {
return axios
.get(`${process.env.REST_SERVER_URI}/api/v2/jobs/${jobName}`, {
headers: {
Authorization: `Bearer ${token}`,
'Content-Type': 'application/json',
},
})
.then((response) => {
return response.data.jobStatus.username;
});
};

const getUserEmail = async (username, token) => {
return axios
.get(`${process.env.REST_SERVER_URI}/api/v2/users/${username}`, {
Expand Down Expand Up @@ -132,7 +119,7 @@ const sendEmailToUser = async (req, res) => {
// group alerts by username
const alertsGrouped = {};
alerts.map((alert, index) => {
let userName = alert.labels.job_name.split('~')[0];
const userName = alert.labels.job_name.split('~')[0];
if (userName in alertsGrouped) {
alertsGrouped[userName].push(alerts[index]);
} else {
Expand Down
101 changes: 100 additions & 1 deletion src/alert-manager/src/alert-handler/controllers/node.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,16 @@
const k8s = require('@kubernetes/client-node');
const kc = new k8s.KubeConfig();
const logger = require('@alert-handler/common/logger');
const crypto = require('crypto');

kc.loadFromDefault();
const k8sApi = kc.makeApiClient(k8s.CoreV1Api);

const cordonNode = async (nodeName) => {
const headers = {
'content-type': 'application/strategic-merge-patch+json',
};
// set the node unschedulable
const k8sApi = kc.makeApiClient(k8s.CoreV1Api);
return k8sApi.patchNode(
nodeName,
{ spec: { unschedulable: true } },
Expand Down Expand Up @@ -72,7 +73,105 @@ const cordonNodes = (req, res) => {
});
};

// Build the batch/v1 Job manifest that runs the nvidia-gpu-low-perf-fixer image
// on one node for one GPU.
//   jobName     - value used for metadata.name of the Job
//   nodeName    - value matched against the `kubernetes.io/hostname` node label
//   minorNumber - GPU identifier, handed to the container (stringified) through
//                 the MINOR_NUMBER environment variable
// The image reference is assembled from the DOCKER_REGISTRY_PREFIX and
// DOCKER_REGISTRY_TAG environment variables of the current process.
// Returns a plain object; nothing is sent to the cluster here.
const getK8sV1Job = (jobName, nodeName, minorNumber) => {
  const {
    DOCKER_REGISTRY_PREFIX: registryPrefix,
    DOCKER_REGISTRY_TAG: registryTag,
  } = process.env;
  const fixerName = 'nvidia-gpu-low-perf-fixer';

  // the single container of the pod; it runs privileged
  const fixerContainer = {
    name: fixerName,
    image: `${registryPrefix}${fixerName}:${registryTag}`,
    imagePullPolicy: 'Always',
    env: [{ name: 'MINOR_NUMBER', value: `${minorNumber}` }],
    securityContext: { privileged: true },
  };

  // pod spec: never restarted, scheduled only onto the requested node
  const podSpec = {
    containers: [fixerContainer],
    restartPolicy: 'Never',
    nodeSelector: { 'kubernetes.io/hostname': nodeName },
  };

  return {
    apiVersion: 'batch/v1',
    kind: 'Job',
    metadata: { name: jobName },
    spec: {
      // TODO: enable this feature when install k8s / delete the job elsewhere
      ttlSecondsAfterFinished: 86400,
      template: {
        metadata: { name: fixerName },
        spec: podSpec,
      },
    },
  };
};

// Start one k8s job per GPU card to fix the NvidiaGPULowPerf issue.
//
// Express handler for the alert-manager webhook. Every alert in
// `req.body.alerts` that is firing and carries both a `node_name` and a
// `minor_number` label is mapped to a job built by `getK8sV1Job` and created
// in the `default` namespace.
//
// Exactly one response is sent, after all creation attempts have settled:
//   200 - every job was created or already existed (also when no alert matched)
//   500 - at least one job could not be created
// Returns the promise that sends the response.
const fixNvidiaGPULowPerf = (req, res) => {
  logger.info(
    'Received `fixNvidiaGPULowPerf` post request from alert-manager.',
  );
  // tolerate a missing / malformed payload instead of throwing
  const alerts =
    req.body && Array.isArray(req.body.alerts) ? req.body.alerts : [];
  // filter alerts which are firing and contain `node_name` & `minor_number` as label
  const jobsInfo = alerts
    .filter(
      (alert) =>
        alert &&
        alert.status === 'firing' &&
        alert.labels !== null &&
        typeof alert.labels === 'object' &&
        'node_name' in alert.labels &&
        'minor_number' in alert.labels,
    )
    // map each alert to a job
    .map((alert) => ({
      // unique job by GPU card; the separator keeps e.g. ("n1", "10") and
      // ("n11", "0") from hashing to the same job name
      jobName: `nvidia-gpu-low-perf-fixer-${crypto
        .createHash('md5')
        .update(`${alert.labels.node_name}/${alert.labels.minor_number}`)
        .digest('hex')}`,
      nodeName: alert.labels.node_name,
      minorNumber: alert.labels.minor_number,
    }));

  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
  // each attempt resolves to true (created / already exists) or false (failed),
  // so every failure is logged and the response is decided only once
  const attempts = jobsInfo.map((jobInfo) =>
    k8sApi
      .createNamespacedJob(
        'default',
        getK8sV1Job(jobInfo.jobName, jobInfo.nodeName, jobInfo.minorNumber),
      )
      .then(() => {
        logger.info(
          `Successfully start job ${jobInfo.jobName} for GPU Low Performance issue in node: ${jobInfo.nodeName}, minor number: ${jobInfo.minorNumber}`,
        );
        return true;
      })
      .catch((error) => {
        // ignore the job creation if already exists
        if (error.response && error.response.statusCode === 409) {
          logger.warn(`Kubernetes job ${jobInfo.jobName} already exists.`);
          return true;
        }
        logger.error(error);
        return false;
      }),
  );

  return Promise.all(attempts).then((results) => {
    if (results.every(Boolean)) {
      res.status(200).json({
        message: `Successfully start jobs to fix NvidiaGPULowPerf`,
      });
    } else {
      res.status(500).json({
        message: `Failed to start job to fix NvidiaGPULowPerf`,
      });
    }
  });
};

// module exports
module.exports = {
cordonNodes,
fixNvidiaGPULowPerf,
};
1 change: 1 addition & 0 deletions src/alert-manager/src/alert-handler/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"express": "^4.16.2",
"express-bearer-token": "^2.4.0",
"joi": "^14.3.1",
"js-yaml": "^4.0.0",
"module-alias": "^2.2.2",
"nodemailer": "^6.4.11",
"winston": "2"
Expand Down
5 changes: 5 additions & 0 deletions src/alert-manager/src/alert-handler/routes/actions.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,9 @@ router
/** POST /alert-handler/cordon-nodes */
.post(nodeController.cordonNodes);

router
.route('/alert-handler/fix-nvidia-gpu-low-perf')
/** POST /alert-handler/fix-nvidia-gpu-low-perf */
.post(nodeController.fixNvidiaGPULowPerf);

module.exports = router;
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Job template for the nvidia-gpu-low-perf-state-fixer image.
# Template variables: jobName, DOCKER_REGISTRY_PREFIX, DOCKER_REGISTRY_TAG,
# minorNumber, nodeName.
apiVersion: batch/v1
kind: Job
metadata:
  name: <%= jobName %>
spec:
  ttlSecondsAfterFinished: 86400 # TODO: enable this feature when install k8s / delete the job elsewhere
  template:
    metadata:
      name: nvidia-gpu-low-perf-state-fixer
    spec:
      containers:
        - name: nvidia-gpu-low-perf-state-fixer
          image: <%= DOCKER_REGISTRY_PREFIX %>nvidia-gpu-low-perf-state-fixer:<%= DOCKER_REGISTRY_TAG %>
          # nvidia-gpu-low-perf-state-fixer.sh parses its GPU selection with
          # `getopts "n:"`, so the minor number is passed as `-n <value>`
          args:
            - '-n'
            - '<%= minorNumber %>'
          securityContext:
            privileged: true
      restartPolicy: Never
      # run only on the node that owns the GPU
      nodeSelector:
        kubernetes.io/hostname: <%= nodeName %>
12 changes: 12 additions & 0 deletions src/alert-manager/src/alert-handler/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,11 @@ argparse@^1.0.7:
dependencies:
sprintf-js "~1.0.2"

argparse@^2.0.1:
version "2.0.1"
resolved "https://registry.yarnpkg.com/argparse/-/argparse-2.0.1.tgz#246f50f3ca78a3240f6c997e8a9bd1eac49e4b38"
integrity sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==

array-differ@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/array-differ/-/array-differ-3.0.0.tgz#3cbb3d0f316810eafcc47624734237d6aee4ae6b"
Expand Down Expand Up @@ -1974,6 +1979,13 @@ js-yaml@^3.13.1:
argparse "^1.0.7"
esprima "^4.0.0"

js-yaml@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-4.0.0.tgz#f426bc0ff4b4051926cd588c71113183409a121f"
integrity sha512-pqon0s+4ScYUvX30wxQi3PogGFAlUyH0awepWvwkj4jD4v+ova3RiYw8bmA6x2rDrEaj8i/oWKoRxpVNW+Re8Q==
dependencies:
argparse "^2.0.1"

jsbn@~0.1.0:
version "0.1.1"
resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-0.1.1.tgz#a5e654c2e5a2deb5f201d96cefbca80c0ef2f513"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Enable persistence mode on one GPU and set its application clocks to the
# highest memory / graphics clocks that GPU reports as supported.
#
# Input: environment variable MINOR_NUMBER, passed to `nvidia-smi -i` to
# select the GPU.
# Exits non-zero if MINOR_NUMBER is empty, if any nvidia-smi call fails, or if
# no supported clock could be determined.
set -euo pipefail

MINOR_NUMBER="${MINOR_NUMBER:-}"
if [[ -z "${MINOR_NUMBER}" ]]; then
  echo "MINOR_NUMBER is not set" >&2
  exit 1
fi

echo "MINOR_NUMBER: ${MINOR_NUMBER}"

# Run a command with root privileges; sudo is only used when not already root.
run_privileged() {
  if [[ "$(id -u)" -eq 0 ]]; then
    "$@"
  else
    sudo "$@"
  fi
}

# Print the largest supported clock of the given kind ("Memory" or "Graphics")
# for the selected GPU only; prints 0 when no numeric value is found.
max_supported_clock() {
  nvidia-smi -q -d SUPPORTED_CLOCKS -i "${MINOR_NUMBER}" |
    awk -v kind="$1" 'index($0, kind) && $3 + 0 > max { max = $3 + 0 } END { print max + 0 }'
}

run_privileged nvidia-smi -pm ENABLED -i "${MINOR_NUMBER}"

MAX_MEMORY_CLOCK=$(max_supported_clock Memory)
MAX_GRAPHICS_CLOCK=$(max_supported_clock Graphics)
echo "MAX_MEMORY_CLOCK: ${MAX_MEMORY_CLOCK}, MAX_GRAPHICS_CLOCK: ${MAX_GRAPHICS_CLOCK}"

# never ask the driver for a 0 MHz clock when the query returned nothing usable
if [[ "${MAX_MEMORY_CLOCK}" == "0" || "${MAX_GRAPHICS_CLOCK}" == "0" ]]; then
  echo "Could not determine supported clocks for GPU ${MINOR_NUMBER}" >&2
  exit 1
fi

run_privileged nvidia-smi -ac "${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK}" -i "${MINOR_NUMBER}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Enable persistence mode on one GPU and set its application clocks to the
# highest memory / graphics clocks that GPU reports as supported.
#
# Usage: nvidia-gpu-low-perf-state-fixer.sh -n <minor-number>
#        nvidia-gpu-low-perf-state-fixer.sh --minor-number=<minor-number>
# If no option is given, the MINOR_NUMBER environment variable is used.
# The value is passed to `nvidia-smi -i` to select the GPU.
# Exits non-zero on an invalid option, an empty minor number, a failing
# nvidia-smi call, or when no supported clock could be determined.
set -euo pipefail

MINOR_NUMBER="${MINOR_NUMBER:-}"

# Parse options; parsing stops at the first non-option argument.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -n | --minor-number)
      if [[ $# -lt 2 ]]; then
        echo "Option $1 requires an argument" >&2
        exit 1
      fi
      MINOR_NUMBER=$2
      shift 2
      ;;
    --minor-number=*)
      MINOR_NUMBER=${1#--minor-number=}
      shift
      ;;
    -n?*)
      MINOR_NUMBER=${1#-n}
      shift
      ;;
    --)
      shift
      break
      ;;
    -*)
      echo "Invalid option: $1" >&2
      exit 1
      ;;
    *)
      break
      ;;
  esac
done

if [[ -z "${MINOR_NUMBER}" ]]; then
  echo "Minor number is not set (use -n <minor-number>)" >&2
  exit 1
fi

echo "MINOR_NUMBER: ${MINOR_NUMBER}"

# Run a command with root privileges; sudo is only used when not already root.
run_privileged() {
  if [[ "$(id -u)" -eq 0 ]]; then
    "$@"
  else
    sudo "$@"
  fi
}

# Print the largest supported clock of the given kind ("Memory" or "Graphics")
# for the selected GPU only; prints 0 when no numeric value is found.
max_supported_clock() {
  nvidia-smi -q -d SUPPORTED_CLOCKS -i "${MINOR_NUMBER}" |
    awk -v kind="$1" 'index($0, kind) && $3 + 0 > max { max = $3 + 0 } END { print max + 0 }'
}

run_privileged nvidia-smi -pm ENABLED -i "${MINOR_NUMBER}"

MAX_MEMORY_CLOCK=$(max_supported_clock Memory)
MAX_GRAPHICS_CLOCK=$(max_supported_clock Graphics)
echo "MAX_MEMORY_CLOCK: ${MAX_MEMORY_CLOCK}, MAX_GRAPHICS_CLOCK: ${MAX_GRAPHICS_CLOCK}"

# never ask the driver for a 0 MHz clock when the query returned nothing usable
if [[ "${MAX_MEMORY_CLOCK}" == "0" || "${MAX_GRAPHICS_CLOCK}" == "0" ]]; then
  echo "Could not determine supported clocks for GPU ${MINOR_NUMBER}" >&2
  exit 1
fi

run_privileged nvidia-smi -ac "${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK}" -i "${MINOR_NUMBER}"

0 comments on commit d7f758a

Please sign in to comment.