From c38d54c06278178d2cd63132f33e193589042f3f Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Wed, 17 Mar 2021 13:21:36 +0800
Subject: [PATCH 1/5] init

---
 ...vidia-gpu-low-perf-fixer.common.dockerfile |  25 +++++
 .../alert-manager-configmap.yaml.template     |  13 +++
 .../alert-manager-deployment.yaml.template    |   4 +
 src/alert-manager/deploy/rbac.yaml            |   3 +
 .../src/alert-handler/controllers/mail.js     |  15 +--
 .../src/alert-handler/controllers/node.js     | 101 +++++++++++++++++-
 .../src/alert-handler/routes/actions.js       |   5 +
 .../nvidia-gpu-low-perf-fixer.sh              |  12 +++
 8 files changed, 163 insertions(+), 15 deletions(-)
 create mode 100644 src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
 create mode 100644 src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh

diff --git a/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
new file mode 100644
index 0000000000..58ab96a2ca
--- /dev/null
+++ b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
@@ -0,0 +1,25 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+FROM nvidia/cuda:11.2.2-base-ubuntu16.04
+
+RUN apt-get -y update && \
+    apt-get install -y sudo
+
+COPY ./src/nvidia-gpu-low-perf-fixer .
+
+ENTRYPOINT /bin/bash nvidia-gpu-low-perf-fixer.sh
diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
index 0169673ba5..7e6cb82fad 100644
--- a/src/alert-manager/deploy/alert-manager-configmap.yaml.template
+++ b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
@@ -41,6 +41,10 @@ data:
         match:
           report_type: cluster-usage

+      - receiver: fix-nvidia-gpu-low-perf
+        match:
+          alertname: NodeGpuLowPerfState
+
 {% if 'routes' in cluster_cfg["alert-manager"]["customized-routes"] %}
 {% for route in cluster_cfg["alert-manager"]["customized-routes"]["routes"] %}
       - receiver: {{ route.receiver}}
@@ -74,6 +78,15 @@ data:
         send_resolved: false
       {% endif %}

+    - name: fix-nvidia-gpu-low-perf
+      webhook_configs:
+      {% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %}
+      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin'
+        send_resolved: true
+      {% endif %}
+      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
+        send_resolved: false
+
     - name: pai-cordon-nodes
       webhook_configs:
      {% if 'cordon-nodes' in cluster_cfg["alert-manager"]["actions-available"] %}
diff --git a/src/alert-manager/deploy/alert-manager-deployment.yaml.template b/src/alert-manager/deploy/alert-manager-deployment.yaml.template
index a9fce2333a..43ffb4d87c 100755
--- a/src/alert-manager/deploy/alert-manager-deployment.yaml.template
+++ b/src/alert-manager/deploy/alert-manager-deployment.yaml.template
@@ -67,6 +67,10 @@ spec:
           value: {{ cluster_cfg["cluster"]["common"]["cluster-id"] }}
         - name: REST_SERVER_URI
           value: {{ cluster_cfg['rest-server']['uri'] }}
+        - name: DOCKER_REGISTRY_PREFIX
+          value: {{ cluster_cfg['cluster']['docker-registry']['prefix'] }}
+        - name: DOCKER_REGISTRY_TAG
+          value: {{ cluster_cfg['cluster']['docker-registry']['tag'] }}
         - name: WEBPORTAL_URI
         {%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %}
           value: "{{ cluster_cfg['pylon']['uri-https']}}"
diff --git a/src/alert-manager/deploy/rbac.yaml b/src/alert-manager/deploy/rbac.yaml
index 1afbaf0109..ae2787bef9 100644
--- a/src/alert-manager/deploy/rbac.yaml
+++ b/src/alert-manager/deploy/rbac.yaml
@@ -15,6 +15,9 @@ rules:
   - apiGroups: [""]
     resources: ["nodes"]
     verbs: ["patch"]
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["create"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
diff --git a/src/alert-manager/src/alert-handler/controllers/mail.js b/src/alert-manager/src/alert-handler/controllers/mail.js
index 52f80a5070..ddc96a1369 100755
--- a/src/alert-manager/src/alert-handler/controllers/mail.js
+++ b/src/alert-manager/src/alert-handler/controllers/mail.js
@@ -88,19 +88,6 @@ const sendEmailToAdmin = (req, res) => {
   });
 };

-const getUserNameByJobName = async (jobName, token) => {
-  return axios
-    .get(`${process.env.REST_SERVER_URI}/api/v2/jobs/${jobName}`, {
-      headers: {
-        Authorization: `Bearer ${token}`,
-        'Content-Type': 'application/json',
-      },
-    })
-    .then((response) => {
-      return response.data.jobStatus.username;
-    });
-};
-
 const getUserEmail = async (username, token) => {
   return axios
     .get(`${process.env.REST_SERVER_URI}/api/v2/users/${username}`, {
@@ -132,7 +119,7 @@ const sendEmailToUser = async (req, res) => {
   // group alerts by username
   const alertsGrouped = {};
   alerts.map((alert, index) => {
-    let userName = alert.labels.job_name.split('~')[0];
+    const userName = alert.labels.job_name.split('~')[0];
     if (userName in alertsGrouped) {
       alertsGrouped[userName].push(alerts[index]);
     } else {
diff --git a/src/alert-manager/src/alert-handler/controllers/node.js b/src/alert-manager/src/alert-handler/controllers/node.js
index 856b47a81f..487b057987 100644
--- a/src/alert-manager/src/alert-handler/controllers/node.js
+++ b/src/alert-manager/src/alert-handler/controllers/node.js
@@ -18,15 +18,16 @@
 const k8s = require('@kubernetes/client-node');
 const kc = new k8s.KubeConfig();
 const logger = require('@alert-handler/common/logger');
+const crypto = require('crypto');

 kc.loadFromDefault();
-const k8sApi = kc.makeApiClient(k8s.CoreV1Api);

 const cordonNode = async (nodeName) => {
   const headers = {
     'content-type': 'application/strategic-merge-patch+json',
   };
   // set the node unschedulable
+  const k8sApi = kc.makeApiClient(k8s.CoreV1Api);
   return k8sApi.patchNode(
     nodeName,
     { spec: { unschedulable: true } },
@@ -72,7 +73,105 @@ const cordonNodes = (req, res) => {
   });
 };

+const getK8sV1Job = (jobName, nodeName, minorNumber) => {
+  const DOCKER_REGISTRY_PREFIX = process.env.DOCKER_REGISTRY_PREFIX;
+  const DOCKER_REGISTRY_TAG = process.env.DOCKER_REGISTRY_TAG;
+  const job = {
+    apiVersion: 'batch/v1',
+    kind: 'Job',
+    metadata: {
+      name: jobName,
+    },
+    spec: {
+      ttlSecondsAfterFinished: 86400, // TODO: enable this feature when installing k8s / delete the job elsewhere
+      template: {
+        metadata: {
+          name: 'nvidia-gpu-low-perf-fixer',
+        },
+        spec: {
+          containers: [
+            {
+              name: 'nvidia-gpu-low-perf-fixer',
+              image: `${DOCKER_REGISTRY_PREFIX}nvidia-gpu-low-perf-fixer:${DOCKER_REGISTRY_TAG}`,
+              imagePullPolicy: 'Always',
+              env: [
+                {
+                  name: 'MINOR_NUMBER',
+                  value: `${minorNumber}`,
+                },
+              ],
+              securityContext: {
+                privileged: true,
+              },
+            },
+          ],
+          restartPolicy: 'Never',
+          nodeSelector: {
+            'kubernetes.io/hostname': nodeName,
+          },
+        },
+      },
+    },
+  };
+  return job;
+};
+
+// start a k8s job for each GPU card to fix the NvidiaGPULowPerf issue
+const fixNvidiaGPULowPerf = (req, res) => {
+  logger.info(
+    'Received `fixNvidiaGPULowPerf` post request from alert-manager.',
+  );
+  // filter alerts which are firing and contain `node_name` & `minor_number` as labels
+  const jobsInfo = req.body.alerts
+    .filter(
+      (alert) =>
+        alert.status === 'firing' &&
+        'node_name' in alert.labels &&
+        'minor_number' in alert.labels,
+    )
+    // map each alert to a job
+    .map((alert) => ({
+      jobName: `nvidia-gpu-low-perf-fixer-${crypto
+        .createHash('md5')
+        .update(alert.labels.node_name + alert.labels.minor_number)
+        .digest('hex')}`, // one unique job per GPU card
+      nodeName: alert.labels.node_name,
+      minorNumber: alert.labels.minor_number,
+      DOCKER_REGISTRY_PREFIX: process.env.DOCKER_REGISTRY_PREFIX,
+      DOCKER_REGISTRY_TAG: process.env.DOCKER_REGISTRY_TAG,
+    }));
+
+  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
+  jobsInfo.forEach(async (jobInfo) => {
+    // get k8s V1Job
+    const job = getK8sV1Job(
+      jobInfo.jobName,
+      jobInfo.nodeName,
+      jobInfo.minorNumber,
+    );
+    k8sApi
+      .createNamespacedJob('default', job)
+      .then((response) => {
+        logger.info(
+          `Successfully started job ${jobInfo.jobName} for the GPU low performance issue on node: ${jobInfo.nodeName}, minor number: ${jobInfo.minorNumber}`,
+        );
+      })
+      .catch((error) => {
+        // ignore the job creation if the job already exists
+        if (error.response && error.response.statusCode === 409) {
+          logger.warn(`Kubernetes job ${jobInfo.jobName} already exists.`);
+        } else {
+          logger.error(error);
+          res.status(500).json({
+            message: `Failed to start job to fix NvidiaGPULowPerf`,
+          });
+        }
+      });
+  });
+};
+
 // module exports
 module.exports = {
   cordonNodes,
+  fixNvidiaGPULowPerf,
 };
diff --git a/src/alert-manager/src/alert-handler/routes/actions.js b/src/alert-manager/src/alert-handler/routes/actions.js
index 6442f2056e..734eedad7f 100644
--- a/src/alert-manager/src/alert-handler/routes/actions.js
+++ b/src/alert-manager/src/alert-handler/routes/actions.js
@@ -50,4 +50,9 @@ router
   /** POST /alert-handler/cordon-nodes */
   .post(nodeController.cordonNodes);

+router
+  .route('/alert-handler/fix-nvidia-gpu-low-perf')
+  /** POST /alert-handler/fix-nvidia-gpu-low-perf */
+  .post(nodeController.fixNvidiaGPULowPerf);
+
 module.exports = router;
diff --git a/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
new file mode 100644
index 0000000000..d6036b1d4f
--- /dev/null
+++ b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+echo "MINOR_NUMBER: ${MINOR_NUMBER}"
+
+sudo nvidia-smi -pm ENABLED -i ${MINOR_NUMBER}
+
+MAX_MEMORY_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Memory | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
+MAX_GRAPHICS_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Graphics | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
+echo "MAX_MEMORY_CLOCK: ${MAX_MEMORY_CLOCK}, MAX_GRAPHICS_CLOCK: ${MAX_GRAPHICS_CLOCK}"
+
+sudo nvidia-smi -ac ${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK} -i ${MINOR_NUMBER}

From 68732d5a7595363a7d8fe1ba334e84eeab2d752f Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Fri, 19 Mar 2021 12:10:53 +0800
Subject: [PATCH 2/5] clean completed jobs after 24h

---
 src/alert-manager/deploy/rbac.yaml            |  2 +-
 .../src/alert-handler/controllers/node.js     | 41 ++++++++++++++++++-
 src/alert-manager/src/alert-handler/index.js  |  7 ++++
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/src/alert-manager/deploy/rbac.yaml b/src/alert-manager/deploy/rbac.yaml
index ae2787bef9..89073ff43b 100644
--- a/src/alert-manager/deploy/rbac.yaml
+++ b/src/alert-manager/deploy/rbac.yaml
@@ -17,7 +17,7 @@ rules:
     verbs: ["patch"]
   - apiGroups: ["batch"]
     resources: ["jobs"]
-    verbs: ["create"]
+    verbs: ["create", "list", "delete"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
diff --git a/src/alert-manager/src/alert-handler/controllers/node.js b/src/alert-manager/src/alert-handler/controllers/node.js
index 487b057987..b125109cd6 100644
--- a/src/alert-manager/src/alert-handler/controllers/node.js
+++ b/src/alert-manager/src/alert-handler/controllers/node.js
@@ -83,7 +83,9 @@ const getK8sV1Job = (jobName, nodeName, minorNumber) => {
       name: jobName,
     },
     spec: {
-      ttlSecondsAfterFinished: 86400, // TODO: enable this feature when installing k8s / delete the job elsewhere
+      // The TTL feature is currently alpha (Kubernetes 1.15)
+      // To avoid using this feature, jobs will be cleaned with the function `cleanCompletedfixNvidiaGPULowPerfJobs` regularly
+      // ttlSecondsAfterFinished: 86400,
       template: {
         metadata: {
           name: 'nvidia-gpu-low-perf-fixer',
@@ -170,8 +172,45 @@ const fixNvidiaGPULowPerf = (req, res) => {
   });
 };

+// clean completed jobs which were used to fix the NvidiaGPULowPerf issue
+// jobs that completed more than 24 hours ago will be deleted
+const cleanCompletedfixNvidiaGPULowPerfJobs = (req, res) => {
+  logger.info(
+    'Cleaning completed jobs which were used to fix the NvidiaGPULowPerf issue...',
+  );
+
+  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
+  k8sApi
+    .listNamespacedJob('default')
+    .then((response) => {
+      logger.info(`Successfully got job list.`);
+      const jobs = response.body.items;
+      jobs.forEach((job) => {
+        const jobName = job.metadata.name;
+        // check the job name & whether the job has completed
+        if (
+          jobName.startsWith('nvidia-gpu-low-perf-fixer-') &&
+          (job.status.succeeded === 1 || job.status.failed === 1) &&
+          new Date() - new Date(job.status.completionTime) > 24 * 60 * 60 * 1000 // completed for more than 24h
+        )
+          k8sApi
+            .deleteNamespacedJob(jobName, 'default')
+            .then((response) => {
+              logger.info(`Successfully deleted job ${jobName}`);
+            })
+            .catch((error) => {
+              logger.error(`Failed to delete job ${jobName}`, error);
+            });
+      });
+    })
+    .catch((error) => {
+      logger.error('Failed to list jobs:', error);
+    });
+};
+
 // module exports
 module.exports = {
   cordonNodes,
   fixNvidiaGPULowPerf,
+  cleanCompletedfixNvidiaGPULowPerfJobs,
 };
diff --git a/src/alert-manager/src/alert-handler/index.js b/src/alert-manager/src/alert-handler/index.js
index d0d78a279b..836a152168 100755
--- a/src/alert-manager/src/alert-handler/index.js
+++ b/src/alert-manager/src/alert-handler/index.js
@@ -23,6 +23,7 @@ require('module-alias/register');
 const express = require('express');
 const bearerToken = require('express-bearer-token');
 const actions = require('@alert-handler/routes/actions');
+const nodeController = require('@alert-handler/controllers/node');
 const logger = require('@alert-handler/common/logger');

 const app = express();
@@ -36,3 +37,9 @@ const port = parseInt(process.env.SERVER_PORT);
 app.listen(port, () => {
   logger.info(`alert-handler listening at http://localhost:${port}`);
 });
+
+// clean completed jobs which were used to fix the NvidiaGPULowPerf issue every hour
+setInterval(
+  nodeController.cleanCompletedfixNvidiaGPULowPerfJobs,
+  60 * 60 * 1000,
+);

From 7980865e548593e192447b41c22da7a2c9acaddc Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Fri, 19 Mar 2021 12:29:18 +0800
Subject: [PATCH 3/5] update config generation rule, doc & examples

---
 .../services-configuration.yaml.template      |   8 ++
 .../services-configuration.yaml.template      |   8 ++
 .../cluster-admin/how-to-use-alert-system.md  |  31 ++--
 .../services-configuration.yaml               | 134 +++++++++---------
 src/alert-manager/config/alert_manager.py     |   7 +-
 .../alert-manager-configmap.yaml.template     |  18 +--
 6 files changed, 109 insertions(+), 97 deletions(-)

diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template
index 27df33f3e4..2e48fb0585 100644
--- a/contrib/kubespray/quick-start/services-configuration.yaml.template
+++ b/contrib/kubespray/quick-start/services-configuration.yaml.template
@@ -232,6 +232,9 @@ authentication:
 #   - receiver: pai-email-admin-user-and-stop-job
 #     match:
 #       alertname: PAIJobGpuPercentLowerThan0_3For1h
+#   - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+#     match:
+#       alertname: NodeGpuLowPerfState
 # customized-receivers: # receivers are combinations of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions:
@@ -244,6 +247,11 @@ authentication:
 #       tag-jobs:
 #         tags:
 #         - 'stopped-by-alert-manager'
+#   - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+#     actions:
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:
+
 # uncomment following if you want to customize prometheus
 # prometheus:

diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template
index 2a30de3fbe..577cb262dc 100644
--- a/deployment/quick-start/services-configuration.yaml.template
+++ b/deployment/quick-start/services-configuration.yaml.template
@@ -92,6 +92,9 @@ rest-server:
 #   - receiver: pai-email-admin-user-and-stop-job
 #     match:
 #       alertname: PAIJobGpuPercentLowerThan0_3For1h
+#   - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+#     match:
+#       alertname: NodeGpuLowPerfState
 # customized-receivers: # receivers are combinations of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions:
@@ -104,6 +107,11 @@ rest-server:
 #       tag-jobs:
 #         tags:
 #         - 'stopped-by-alert-manager'
+#   - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+#     actions:
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:
+
 # uncomment following if you want to customize prometheus
 # prometheus:

diff --git a/docs/manual/cluster-admin/how-to-use-alert-system.md b/docs/manual/cluster-admin/how-to-use-alert-system.md
index 1a986bb05b..19953b5221 100644
--- a/docs/manual/cluster-admin/how-to-use-alert-system.md
+++ b/docs/manual/cluster-admin/how-to-use-alert-system.md
@@ -114,26 +114,29 @@ We have provided so far these following actions:
 - `stop-jobs`: Stop jobs by calling OpenPAI REST API. **Be careful about this action because it stops jobs without notifying related users.**
 - `tag-jobs`: Add a tag to jobs by calling OpenPAI REST API.
 - `cordon-nodes`: Call Kubernetes API to cordon the corresponding nodes.
+- `fix-nvidia-gpu-low-perf`: Start a privileged container to fix the NVIDIA GPU Low Performance State issue.

 But before you use them, you have to add the proper configuration in the `alert-handler` field. For example, `email-admin` needs you to set up an SMTP account to send the email and an admin email address to receive the email. Also, the `tag-jobs` and `stop-jobs` actions call the OpenPAI REST API, so you should set a REST server token for them. To get the token, go to your profile page (in the top-right corner on Webportal, click `View my profile`), and use `Create application token` to create one.

 Generally speaking, there are two parts of the configuration in the `alert-handler` field: one is `email-configs`, the other is `pai-bearer-token`. The requirements for the different actions are shown in the following table:

-|              | email-configs | pai-bearer-token |
-| :-----------:| :-----------: | :--------------: |
-| cordon-nodes | -             | -                |
-| email-admin  | required      | -                |
-| email-user   | required      | required         |
-| stop-jobs    | -             | required         |
-| tag-jobs     | -             | required         |
+|                           | email-configs | pai-bearer-token |
+| :-----------------------: | :-----------: | :--------------: |
+|       cordon-nodes        |       -       |        -         |
+|        email-admin        |   required    |        -         |
+|        email-user         |   required    |     required     |
+|         stop-jobs         |       -       |     required     |
+|         tag-jobs          |       -       |     required     |
+|  fix-nvidia-gpu-low-perf  |       -       |        -         |

 In addition, some actions may depend on certain fields in the `labels` of alert instances. The labels of an alert instance are generated from the expression in the alert rule. For example, the expression of the `PAIJobGpuPercentLowerThan0_3For1h` alert we mentioned in the previous section is `avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3`. This expression returns a list whose elements contain the `job_name` field, so there will also be a `job_name` field in the labels of the alert instance. The `stop-jobs` action depends on the `job_name` field and stops the corresponding job based on it. To inspect the labels of an alert, you can visit `http(s)://<master-ip>/prometheus/alerts`. If the alert is firing, you can see its labels on this page.
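The new `fix-nvidia-gpu-low-perf` action is wired to an alert through a customized route plus receiver, exactly as the commented template examples above show. As a minimal sketch of the relevant `services-configuration.yaml` fragment (the receiver name is just the illustrative one used in those examples):

```yaml
alert-manager:
  customized-routes:
    routes:
      - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
        match:
          alertname: NodeGpuLowPerfState
  # receivers are combinations of several actions
  customized-receivers:
    - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
      actions:
        email-admin:              # notify the admin; requires email-configs
        fix-nvidia-gpu-low-perf:  # start the privileged fixer job
```

Routes match on the alert name, so any firing alert whose rule emits `NodeGpuLowPerfState` with the `node_name` and `minor_number` labels will trigger the fixer job.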
 For the label fields that each pre-defined action depends on, please refer to the following table:

-|              | depended on label field |
-| :-----------:| :------------------: |
-| cordon-nodes | node_name            |
-| email-admin  | -                    |
-| email-user   | -                    |
-| stop-jobs    | job_name             |
-| tag-jobs     | job_name             |
+|                           | depends on label field(s)  |
+| :-----------------------: | :------------------------: |
+|       cordon-nodes        |         node_name          |
+|        email-admin        |             -              |
+|        email-user         |             -              |
+|         stop-jobs         |          job_name          |
+|         tag-jobs          |          job_name          |
+|  fix-nvidia-gpu-low-perf  |  node_name, minor_number   |

 The matching rules between alerts and actions are defined using `receivers` and `routes`.
diff --git a/examples/cluster-configuration/services-configuration.yaml b/examples/cluster-configuration/services-configuration.yaml
index 4933c0d602..6b4c1259ea 100644
--- a/examples/cluster-configuration/services-configuration.yaml
+++ b/examples/cluster-configuration/services-configuration.yaml
@@ -82,7 +82,6 @@ rest-server:
     #github-path: marketplace
   # Job Debugging Reservation Seconds.
   #debugging-reservation-seconds: 604800
-
 # uncomment following section if you want to customize the port of web portal
 # webportal:
 #   server-port: 9286
@@ -125,6 +124,9 @@ rest-server:
 #   - receiver: pai-email-admin-user-and-stop-job
 #     match:
 #       alertname: PAIJobGpuPercentLowerThan0_3For1h
+#   - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+#     match:
+#       alertname: NodeGpuLowPerfState
 # customized-receivers: # receivers are combinations of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions:
@@ -137,6 +139,10 @@ rest-server:
 #       tag-jobs:
 #         tags:
 #         - 'stopped-by-alert-manager'
+#   - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+#     actions:
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:

 # uncomment following if you want to customize prometheus
 # prometheus:
@@ -172,8 +178,6 @@ rest-server:
 #   # key_name: yyyyyy
 #   # key_path: /path/to/yyyyyy
-
-
 # uncomment following section if you want to customize the threshold of cleaner
 # cleaner:
 #   threshold: 90
@@ -185,65 +189,65 @@ rest-server:
 # uncomment following section, if you want to customize the authentication solution.
 #authentication:
- #OIDC: false
- #
- # If OIDC is set as the value true, you will have to configure the following properties.
- #OIDC-type: AAD
- #
- #AAD:
- # # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
- # # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
- # # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
- # # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
- # # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
- # wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
- #
- # # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
- # tenantID: ${tenat_id}
- #
- # # Required, the client ID of your app in AAD
- # clientID: ${your_client_id}
- #
- # # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
- # # If app key contains '\', replace it with '\\'.
- # clientSecret: '${your_client_secret}'
- #
- # # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
- # nonceLifetime: null
- #
- # # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
- # nonceMaxAmount: 5
- #
- # # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
- # clockSkew: null
- #
- #group-manager:
- # # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist.
- # # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration.
- # group-data-source: basic
- #
- # # If you set winbind as your data source, you should configure this configuration.
- # # winbind-server-address: xxxxxxx
- #
- # # Admin group name and its user list
- # admin-group:
- #   groupname: admingroup
- #   description: "admin's group"
- #   externalName: ""
- #
- # # Group for default vc.
- # # For yarn default queue hack.
- # default-group:
- #   groupname: default
- #   description: "group for default vc"
- #   externalName: ""
- #
- # # If the following groups are not in the data store, it will be created by default.
- # grouplist:
- #   - groupname: forexample
- #     # internal name
- #     description: forexample
- #     # description of the group
- #     externalName: ""
- #     # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from
- #     # the result of winbind. If the group-data-source is basic, this field is useless.
+#OIDC: false
+#
+# If OIDC is set as the value true, you will have to configure the following properties.
+#OIDC-type: AAD
+#
+#AAD:
+# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
+# # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
+# # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
+# # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
+# # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
+# wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
+#
+# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
+# tenantID: ${tenant_id}
+#
+# # Required, the client ID of your app in AAD
+# clientID: ${your_client_id}
+#
+# # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
+# # If app key contains '\', replace it with '\\'.
+# clientSecret: '${your_client_secret}'
+#
+# # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
+# nonceLifetime: null
+#
+# # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
+# nonceMaxAmount: 5
+#
+# # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
+# clockSkew: null
+#
+#group-manager:
+# # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist.
+# # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration.
+# group-data-source: basic
+#
+# # If you set winbind as your data source, you should configure this configuration.
+# # winbind-server-address: xxxxxxx
+#
+# # Admin group name and its user list
+# admin-group:
+#   groupname: admingroup
+#   description: "admin's group"
+#   externalName: ""
+#
+# # Group for default vc.
+# # For yarn default queue hack.
+# default-group:
+#   groupname: default
+#   description: "group for default vc"
+#   externalName: ""
+#
+# # If the following groups are not in the data store, they will be created by default.
+# grouplist:
+#   - groupname: forexample
+#     # internal name
+#     description: forexample
+#     # description of the group
+#     externalName: ""
+#     # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from
+#     # the result of winbind. If the group-data-source is basic, this field is useless.
diff --git a/src/alert-manager/config/alert_manager.py b/src/alert-manager/config/alert_manager.py
index 6b33c15435..63ba1468ee 100644
--- a/src/alert-manager/config/alert_manager.py
+++ b/src/alert-manager/config/alert_manager.py
@@ -74,17 +74,14 @@ def run(self):
         else:
             token_configured = False

+        result["alert-handler"]["configured"] = True
+        result["actions-available"] = ["fix-nvidia-gpu-low-perf"]
         if email_configured and token_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].extend(["email-admin", "email-user", "stop-jobs", "tag-jobs"])
         elif email_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].append("email-admin")
         elif token_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].extend(["stop-jobs", "tag-jobs"])
-        else:
-            result["alert-handler"]["configured"] = False

         if result.get("cluster-utilization") is not None and \
            result["cluster-utilization"].get("schedule") is not None and \
diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
index 7e6cb82fad..28ed7d7a5a 100644
--- a/src/alert-manager/deploy/alert-manager-configmap.yaml.template
+++ b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
@@ -41,10 +41,6 @@ data:
         match:
           report_type: cluster-usage

-      - receiver: fix-nvidia-gpu-low-perf
-        match:
-          alertname: NodeGpuLowPerfState
-
 {% if 'routes' in cluster_cfg["alert-manager"]["customized-routes"] %}
 {% for route in cluster_cfg["alert-manager"]["customized-routes"]["routes"] %}
       - receiver: {{ route.receiver}}
@@ -78,15 +74,6 @@ data:
         send_resolved: false
       {% endif %}

-    - name: fix-nvidia-gpu-low-perf
-      webhook_configs:
-      {% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %}
-      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin'
-        send_resolved: true
-      {% endif %}
-      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
-        send_resolved: false
-
     - name: pai-cordon-nodes
       webhook_configs:
       {% if 'cordon-nodes' in cluster_cfg["alert-manager"]["actions-available"] %}
@@ -135,6 +122,11 @@ data:
         - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/cordon-nodes'
           send_resolved: false
         {% endif %}
+
+        {% if (receiver["actions"]["fix-nvidia-gpu-low-perf"] is defined) and ('fix-nvidia-gpu-low-perf' in cluster_cfg["alert-manager"]["actions-available"]) %}
+        - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
+          send_resolved: false
+        {% endif %}

 {% endfor %}

From 0a90fa4f8fb5e3dcadadeae96eb908d085c77735 Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Mon, 29 Mar 2021 08:25:32 +0800
Subject: [PATCH 4/5] add label to fixer job

---
 .../alert-handler/controllers/kubernetes.js   | 64 +++++++++++++++++++
 .../src/alert-handler/controllers/node.js     | 42 ++-----
 src/alert-manager/src/alert-handler/index.js  |  7 +--
 3 files changed, 70 insertions(+), 43 deletions(-)
 create mode 100644 src/alert-manager/src/alert-handler/controllers/kubernetes.js

diff --git a/src/alert-manager/src/alert-handler/controllers/kubernetes.js b/src/alert-manager/src/alert-handler/controllers/kubernetes.js
new file mode 100644
index 0000000000..fa64c96da7
--- /dev/null
+++ b/src/alert-manager/src/alert-handler/controllers/kubernetes.js
@@ -0,0 +1,64 @@
+// Copyright (c) Microsoft Corporation
+// All rights reserved.
+//
+// MIT License
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+const k8s = require('@kubernetes/client-node');
+const kc = new k8s.KubeConfig();
+kc.loadFromDefault(); // load kubeconfig before creating API clients
+const logger = require('@alert-handler/common/logger');
+
+// clean TTL 24 hours jobs
+const cleanTTL24HJobs = () => {
+  logger.info('Cleaning completed TTL 24h jobs...');
+
+  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
+  k8sApi
+    .listNamespacedJob(
+      'default',
+      undefined,
+      undefined,
+      undefined,
+      undefined,
+      'time-to-live=24h', // labelSelector
+    )
+    .then((response) => {
+      logger.info(`Successfully got job list.`);
+      const jobs = response.body.items;
+      jobs.forEach((job) => {
+        const jobName = job.metadata.name;
+        if (
+          (job.status.succeeded === 1 || job.status.failed === 1) && // check whether the job has completed
+          new Date() - new Date(job.status.completionTime) > 24 * 60 * 60 * 1000 // completed for more than 24h
+        )
+          k8sApi
+            .deleteNamespacedJob(jobName, 'default')
+            .then((response) => {
+              logger.info(`Successfully deleted job ${jobName}`);
+            })
+            .catch((error) => {
+              logger.error(`Failed to delete job ${jobName}`, error);
+            });
+      });
+    })
+    .catch((error) => {
+      logger.error('Failed to list jobs:', error);
+    });
+};
+
+// module exports
+module.exports = {
+  cleanTTL24HJobs,
+};
diff --git a/src/alert-manager/src/alert-handler/controllers/node.js b/src/alert-manager/src/alert-handler/controllers/node.js
index b125109cd6..1bcff5501c 100644
--- a/src/alert-manager/src/alert-handler/controllers/node.js
+++ b/src/alert-manager/src/alert-handler/controllers/node.js
@@ -84,11 +84,14 @@ const getK8sV1Job = (jobName, nodeName, minorNumber) => {
     },
     spec: {
       // The TTL feature is currently alpha (Kubernetes 1.15)
-      // To avoid using this feature, jobs will be cleaned with the function `cleanCompletedfixNvidiaGPULowPerfJobs` regularly
+      // To avoid using this feature, jobs with the label `time-to-live=24h` will be cleaned with the function `cleanTTL24HJobs` regularly
       // ttlSecondsAfterFinished: 86400,
       template: {
         metadata: {
           name: 'nvidia-gpu-low-perf-fixer',
+          labels: {
+            'time-to-live': '24h',
+          },
         },
         spec: {
           containers: [
@@ -172,45 +175,8 @@ const fixNvidiaGPULowPerf = (req, res) => {
   });
 };

-// clean completed jobs which were used to fix the NvidiaGPULowPerf issue
-// jobs that completed more than 24 hours ago will be deleted
-const cleanCompletedfixNvidiaGPULowPerfJobs = (req, res) => {
-  logger.info(
-    'Cleaning completed jobs which were used to fix the NvidiaGPULowPerf issue...',
-  );
-
-  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
-  k8sApi
-    .listNamespacedJob('default')
-    .then((response) => {
-      logger.info(`Successfully got job list.`);
-      const jobs = response.body.items;
-      jobs.forEach((job) => {
-        const jobName = job.metadata.name;
-        // check the job name & whether the job has completed
-        if (
-          jobName.startsWith('nvidia-gpu-low-perf-fixer-') &&
-          (job.status.succeeded === 1 || job.status.failed === 1) &&
-          new Date() - new Date(job.status.completionTime) > 24 * 60 * 60 * 1000 // completed for more than 24h
-        )
-          k8sApi
-            .deleteNamespacedJob(jobName, 'default')
-            .then((response) => {
-              logger.info(`Successfully deleted job ${jobName}`);
-            })
-            .catch((error) => {
-              logger.error(`Failed to delete job ${jobName}`, error);
-            });
-      });
-    })
-    .catch((error) => {
-      logger.error('Failed to list jobs:', error);
-    });
-};
-
 // module exports
 module.exports = {
   cordonNodes,
   fixNvidiaGPULowPerf,
-  cleanCompletedfixNvidiaGPULowPerfJobs,
 };
diff --git a/src/alert-manager/src/alert-handler/index.js b/src/alert-manager/src/alert-handler/index.js
index 836a152168..bc0d121c89 100755
--- a/src/alert-manager/src/alert-handler/index.js
+++ b/src/alert-manager/src/alert-handler/index.js
@@ -23,7 +23,7 @@ require('module-alias/register');
 const express = require('express');
 const bearerToken = require('express-bearer-token');
 const actions = require('@alert-handler/routes/actions');
-const nodeController = require('@alert-handler/controllers/node');
+const k8sController = require('@alert-handler/controllers/kubernetes');
 const logger = require('@alert-handler/common/logger');

 const app = express();
@@ -39,7 +39,4 @@ app.listen(port, () => {
 });

 // clean completed jobs which were used to fix the NvidiaGPULowPerf issue every hour
-setInterval(
-  nodeController.cleanCompletedfixNvidiaGPULowPerfJobs,
-  60 * 60 * 1000,
-);
+setInterval(k8sController.cleanTTL24HJobs, 60 * 60 * 1000);

From 9f24d6e4adf92c148527d2adcada8e662ad69025 Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Wed, 31 Mar 2021 10:24:58 +0800
Subject: [PATCH 5/5] refine

---
 .../nvidia-gpu-low-perf-fixer.common.dockerfile |  3 ---
 .../src/alert-handler/controllers/kubernetes.js |  4 ++--
 .../src/alert-handler/controllers/node.js       | 12 +++++-------
 .../nvidia-gpu-low-perf-fixer.sh                |  6 +++---
 4 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
index 58ab96a2ca..dd8050d05b 100644
--- a/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
+++ b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
@@ -17,9 +17,6 @@

 FROM nvidia/cuda:11.2.2-base-ubuntu16.04

-RUN apt-get -y update && \
-    apt-get install -y sudo
-
 COPY ./src/nvidia-gpu-low-perf-fixer .

 ENTRYPOINT /bin/bash nvidia-gpu-low-perf-fixer.sh
diff --git a/src/alert-manager/src/alert-handler/controllers/kubernetes.js b/src/alert-manager/src/alert-handler/controllers/kubernetes.js
index fa64c96da7..ca41879726 100644
--- a/src/alert-manager/src/alert-handler/controllers/kubernetes.js
+++ b/src/alert-manager/src/alert-handler/controllers/kubernetes.js
@@ -20,7 +20,7 @@
 kc.loadFromDefault(); // load kubeconfig before creating API clients
 const logger = require('@alert-handler/common/logger');

-// clean TTL 24 hours jobs
+// clean TTL 24 hours jobs created by alert-handler
 const cleanTTL24HJobs = () => {
   logger.info('Cleaning completed TTL 24h jobs...');

@@ -32,7 +32,7 @@ const cleanTTL24HJobs = () => {
       undefined,
       undefined,
       undefined,
-      'time-to-live=24h', // labelSelector
+      'created-by=alert-handler,time-to-live=24h', // labelSelector
     )
     .then((response) => {
       logger.info(`Successfully got job list.`);
diff --git a/src/alert-manager/src/alert-handler/controllers/node.js b/src/alert-manager/src/alert-handler/controllers/node.js
index 1bcff5501c..39dd01132d 100644
--- a/src/alert-manager/src/alert-handler/controllers/node.js
+++ b/src/alert-manager/src/alert-handler/controllers/node.js
@@ -81,18 +81,16 @@ const getK8sV1Job = (jobName, nodeName, minorNumber) => {
     kind: 'Job',
     metadata: {
       name: jobName,
+      labels: {
+        'created-by': 'alert-handler',
+        'time-to-live': '24h',
+      },
     },
     spec: {
       // The TTL feature is currently alpha (Kubernetes 1.15)
-      // To avoid using this feature, jobs with the label `time-to-live=24h` will be cleaned with the function `cleanTTL24HJobs` regularly
+      // To avoid using this feature, jobs with the labels `time-to-live=24h` & `created-by=alert-handler` will be cleaned with the function `cleanTTL24HJobs` regularly
       // ttlSecondsAfterFinished: 86400,
       template: {
-        metadata: {
-          name: 'nvidia-gpu-low-perf-fixer',
-          labels: {
-            'time-to-live': '24h',
-          },
-        },
         spec: {
           containers: [
             {
diff --git a/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
index d6036b1d4f..8903f09f3f 100644
--- a/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
+++ b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
-set -e
+set -ex

 echo "MINOR_NUMBER: ${MINOR_NUMBER}"

-sudo nvidia-smi -pm ENABLED -i ${MINOR_NUMBER}
+nvidia-smi -pm ENABLED -i ${MINOR_NUMBER}

 MAX_MEMORY_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Memory | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
 MAX_GRAPHICS_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Graphics | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
 echo "MAX_MEMORY_CLOCK: ${MAX_MEMORY_CLOCK}, MAX_GRAPHICS_CLOCK: ${MAX_GRAPHICS_CLOCK}"

-sudo nvidia-smi -ac ${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK} -i ${MINOR_NUMBER}
+nvidia-smi -ac ${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK} -i ${MINOR_NUMBER}
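To make the net effect of the series concrete: after the final patch, `getK8sV1Job` builds a Job object equivalent to roughly the following manifest. This is a sketch — the registry prefix/tag, node name, job-name hash, and GPU minor number are illustrative placeholders that are filled in at runtime from the environment and the alert's labels:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: nvidia-gpu-low-perf-fixer-<md5-of-node-and-minor>  # unique per GPU card
  labels:
    created-by: alert-handler   # matched by cleanTTL24HJobs' labelSelector
    time-to-live: "24h"
spec:
  template:
    spec:
      containers:
        - name: nvidia-gpu-low-perf-fixer
          image: example.azurecr.io/nvidia-gpu-low-perf-fixer:v1.6.0  # ${DOCKER_REGISTRY_PREFIX}...:${DOCKER_REGISTRY_TAG}
          imagePullPolicy: Always
          env:
            - name: MINOR_NUMBER
              value: "0"              # GPU card to fix, from the alert's minor_number label
          securityContext:
            privileged: true          # required so nvidia-smi can change persistence mode and clocks
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/hostname: node-001  # pins the job to the affected node
```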