From c38d54c06278178d2cd63132f33e193589042f3f Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Wed, 17 Mar 2021 13:21:36 +0800
Subject: [PATCH 1/5] init

---
 ...vidia-gpu-low-perf-fixer.common.dockerfile |  25 +++++
 .../alert-manager-configmap.yaml.template     |  13 +++
 .../alert-manager-deployment.yaml.template    |   4 +
 src/alert-manager/deploy/rbac.yaml            |   3 +
 .../src/alert-handler/controllers/mail.js     |  15 +--
 .../src/alert-handler/controllers/node.js     | 101 +++++++++++++++++-
 .../src/alert-handler/routes/actions.js       |   5 +
 .../nvidia-gpu-low-perf-fixer.sh              |  12 +++
 8 files changed, 163 insertions(+), 15 deletions(-)
 create mode 100644 src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
 create mode 100644 src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh

diff --git a/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
new file mode 100644
index 0000000000..58ab96a2ca
--- /dev/null
+++ b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
@@ -0,0 +1,25 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+FROM nvidia/cuda:11.2.2-base-ubuntu16.04
+
+RUN apt-get -y update && \
+    apt-get install -y sudo
+
+COPY ./src/nvidia-gpu-low-perf-fixer .
+
+ENTRYPOINT /bin/bash nvidia-gpu-low-perf-fixer.sh
diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
index 0169673ba5..7e6cb82fad 100644
--- a/src/alert-manager/deploy/alert-manager-configmap.yaml.template
+++ b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
@@ -41,6 +41,10 @@ data:
         match:
           report_type: cluster-usage

+      - receiver: fix-nvidia-gpu-low-perf
+        match:
+          alertname: NodeGpuLowPerfState
+
 {% if 'routes' in cluster_cfg["alert-manager"]["customized-routes"] %}
 {% for route in cluster_cfg["alert-manager"]["customized-routes"]["routes"] %}
       - receiver: {{ route.receiver}}
@@ -74,6 +78,15 @@ data:
         send_resolved: false
       {% endif %}

+    - name: fix-nvidia-gpu-low-perf
+      webhook_configs:
+      {% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %}
+      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin'
+        send_resolved: true
+      {% endif %}
+      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
+        send_resolved: false
+
     - name: pai-cordon-nodes
       webhook_configs:
      {% if 'cordon-nodes' in cluster_cfg["alert-manager"]["actions-available"] %}
diff --git a/src/alert-manager/deploy/alert-manager-deployment.yaml.template b/src/alert-manager/deploy/alert-manager-deployment.yaml.template
index a9fce2333a..43ffb4d87c 100755
--- a/src/alert-manager/deploy/alert-manager-deployment.yaml.template
+++ b/src/alert-manager/deploy/alert-manager-deployment.yaml.template
@@ -67,6 +67,10 @@ spec:
           value: {{ cluster_cfg["cluster"]["common"]["cluster-id"] }}
         - name: REST_SERVER_URI
           value: {{ cluster_cfg['rest-server']['uri'] }}
+        - name: DOCKER_REGISTRY_PREFIX
+          value: {{ cluster_cfg['cluster']['docker-registry']['prefix'] }}
+        - name: DOCKER_REGISTRY_TAG
+          value: {{ cluster_cfg['cluster']['docker-registry']['tag'] }}
         - name: WEBPORTAL_URI
         {%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %}
           value: "{{ cluster_cfg['pylon']['uri-https']}}"
diff --git a/src/alert-manager/deploy/rbac.yaml b/src/alert-manager/deploy/rbac.yaml
index 1afbaf0109..ae2787bef9 100644
--- a/src/alert-manager/deploy/rbac.yaml
+++ b/src/alert-manager/deploy/rbac.yaml
@@ -15,6 +15,9 @@ rules:
   - apiGroups: [""]
     resources: ["nodes"]
     verbs: ["patch"]
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["create"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
diff --git a/src/alert-manager/src/alert-handler/controllers/mail.js b/src/alert-manager/src/alert-handler/controllers/mail.js
index 52f80a5070..ddc96a1369 100755
--- a/src/alert-manager/src/alert-handler/controllers/mail.js
+++ b/src/alert-manager/src/alert-handler/controllers/mail.js
@@ -88,19 +88,6 @@ const sendEmailToAdmin = (req, res) => {
   });
 };

-const getUserNameByJobName = async (jobName, token) => {
-  return axios
-    .get(`${process.env.REST_SERVER_URI}/api/v2/jobs/${jobName}`, {
-      headers: {
-        Authorization: `Bearer ${token}`,
-        'Content-Type': 'application/json',
-      },
-    })
-    .then((response) => {
-      return response.data.jobStatus.username;
-    });
-};
-
 const getUserEmail = async (username, token) => {
   return axios
     .get(`${process.env.REST_SERVER_URI}/api/v2/users/${username}`, {
@@ -132,7 +119,7 @@ const sendEmailToUser = async (req, res) => {
   // group alerts by username
   const alertsGrouped = {};
   alerts.map((alert, index) => {
-    let userName = alert.labels.job_name.split('~')[0];
+    const userName = alert.labels.job_name.split('~')[0];
     if (userName in alertsGrouped) {
       alertsGrouped[userName].push(alerts[index]);
     } else {
diff --git a/src/alert-manager/src/alert-handler/controllers/node.js b/src/alert-manager/src/alert-handler/controllers/node.js
index 856b47a81f..487b057987 100644
--- a/src/alert-manager/src/alert-handler/controllers/node.js
+++ b/src/alert-manager/src/alert-handler/controllers/node.js
@@ -18,15 +18,16 @@
 const k8s = require('@kubernetes/client-node');
 const kc = new k8s.KubeConfig();
 const logger = require('@alert-handler/common/logger');
+const crypto = require('crypto');

 kc.loadFromDefault();
-const k8sApi = kc.makeApiClient(k8s.CoreV1Api);

 const cordonNode = async (nodeName) => {
   const headers = {
     'content-type': 'application/strategic-merge-patch+json',
   };
   // set the node unschedulable
+  const k8sApi = kc.makeApiClient(k8s.CoreV1Api);
   return k8sApi.patchNode(
     nodeName,
     { spec: { unschedulable: true } },
@@ -72,7 +73,105 @@ const cordonNodes = (req, res) => {
   });
 };

+const getK8sV1Job = (jobName, nodeName, minorNumber) => {
+  const DOCKER_REGISTRY_PREFIX = process.env.DOCKER_REGISTRY_PREFIX;
+  const DOCKER_REGISTRY_TAG = process.env.DOCKER_REGISTRY_TAG;
+  const job = {
+    apiVersion: 'batch/v1',
+    kind: 'Job',
+    metadata: {
+      name: jobName,
+    },
+    spec: {
+      ttlSecondsAfterFinished: 86400, // TODO: enable this feature when installing k8s / delete the job elsewhere
+      template: {
+        metadata: {
+          name: 'nvidia-gpu-low-perf-fixer',
+        },
+        spec: {
+          containers: [
+            {
+              name: 'nvidia-gpu-low-perf-fixer',
+              image: `${DOCKER_REGISTRY_PREFIX}nvidia-gpu-low-perf-fixer:${DOCKER_REGISTRY_TAG}`,
+              imagePullPolicy: 'Always',
+              env: [
+                {
+                  name: 'MINOR_NUMBER',
+                  value: `${minorNumber}`,
+                },
+              ],
+              securityContext: {
+                privileged: true,
+              },
+            },
+          ],
+          restartPolicy: 'Never',
+          nodeSelector: {
+            'kubernetes.io/hostname': nodeName,
+          },
+        },
+      },
+    },
+  };
+  return job;
+};
+
+// start a k8s job for each GPU card to fix the NvidiaGPULowPerf issue
+const fixNvidiaGPULowPerf = (req, res) => {
+  logger.info(
+    'Received `fixNvidiaGPULowPerf` post request from alert-manager.',
+  );
+  // filter alerts which are firing and contain `node_name` & `minor_number` as labels
+  const jobsInfo = req.body.alerts
+    .filter(
+      (alert) =>
+        alert.status === 'firing' &&
+        'node_name' in alert.labels &&
+        'minor_number' in alert.labels,
+    )
+    // map each alert to a job
+    .map((alert) => ({
+      jobName: `nvidia-gpu-low-perf-fixer-${crypto
+        .createHash('md5')
+        .update(alert.labels.node_name + alert.labels.minor_number)
+        .digest('hex')}`, // one unique job per GPU card
+      nodeName: alert.labels.node_name,
+      minorNumber: alert.labels.minor_number,
+      DOCKER_REGISTRY_PREFIX: process.env.DOCKER_REGISTRY_PREFIX,
+      DOCKER_REGISTRY_TAG: process.env.DOCKER_REGISTRY_TAG,
+    }));
+
+  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
+  jobsInfo.forEach(async (jobInfo) => {
+    // get k8s V1Job
+    const job = getK8sV1Job(
+      jobInfo.jobName,
+      jobInfo.nodeName,
+      jobInfo.minorNumber,
+    );
+    k8sApi
+      .createNamespacedJob('default', job)
+      .then((response) => {
+        logger.info(
+          `Successfully started job ${jobInfo.jobName} for the GPU low performance issue on node: ${jobInfo.nodeName}, minor number: ${jobInfo.minorNumber}`,
+        );
+      })
+      .catch((error) => {
+        // ignore the job creation if the job already exists
+        if (error.response && error.response.statusCode === 409) {
+          logger.warn(`Kubernetes job ${jobInfo.jobName} already exists.`);
+        } else {
+          logger.error(error);
+          res.status(500).json({
+            message: `Failed to start job to fix NvidiaGPULowPerf`,
+          });
+        }
+      });
+  });
+};
+
 // module exports
 module.exports = {
   cordonNodes,
+  fixNvidiaGPULowPerf,
 };
diff --git a/src/alert-manager/src/alert-handler/routes/actions.js b/src/alert-manager/src/alert-handler/routes/actions.js
index 6442f2056e..734eedad7f 100644
--- a/src/alert-manager/src/alert-handler/routes/actions.js
+++ b/src/alert-manager/src/alert-handler/routes/actions.js
@@ -50,4 +50,9 @@ router
   /** POST /alert-handler/cordon-nodes */
   .post(nodeController.cordonNodes);

+router
+  .route('/alert-handler/fix-nvidia-gpu-low-perf')
+  /** POST /alert-handler/fix-nvidia-gpu-low-perf */
+  .post(nodeController.fixNvidiaGPULowPerf);
+
 module.exports = router;
diff --git a/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
new file mode 100644
index 0000000000..d6036b1d4f
--- /dev/null
+++ b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+echo "MINOR_NUMBER: ${MINOR_NUMBER}"
+
+sudo nvidia-smi -pm ENABLED -i ${MINOR_NUMBER}
+
+MAX_MEMORY_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Memory | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
+MAX_GRAPHICS_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Graphics | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
+echo "MAX_MEMORY_CLOCK: ${MAX_MEMORY_CLOCK}, MAX_GRAPHICS_CLOCK: ${MAX_GRAPHICS_CLOCK}"
+
+sudo nvidia-smi -ac ${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK} -i ${MINOR_NUMBER}

From 68732d5a7595363a7d8fe1ba334e84eeab2d752f Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Fri, 19 Mar 2021 12:10:53 +0800
Subject: [PATCH 2/5] clean completed jobs after 24h

---
 src/alert-manager/deploy/rbac.yaml            |  2 +-
 .../src/alert-handler/controllers/node.js     | 41 ++++++++++++++++++-
 src/alert-manager/src/alert-handler/index.js  |  7 ++++
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/src/alert-manager/deploy/rbac.yaml b/src/alert-manager/deploy/rbac.yaml
index ae2787bef9..89073ff43b 100644
--- a/src/alert-manager/deploy/rbac.yaml
+++ b/src/alert-manager/deploy/rbac.yaml
@@ -17,7 +17,7 @@ rules:
     verbs: ["patch"]
   - apiGroups: ["batch"]
     resources: ["jobs"]
-    verbs: ["create"]
+    verbs: ["create", "list", "delete"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
diff --git a/src/alert-manager/src/alert-handler/controllers/node.js b/src/alert-manager/src/alert-handler/controllers/node.js
index 487b057987..b125109cd6 100644
--- a/src/alert-manager/src/alert-handler/controllers/node.js
+++ b/src/alert-manager/src/alert-handler/controllers/node.js
@@ -83,7 +83,9 @@ const getK8sV1Job = (jobName, nodeName, minorNumber) => {
       name: jobName,
     },
     spec: {
-      ttlSecondsAfterFinished: 86400, // TODO: enable this feature when installing k8s / delete the job elsewhere
+      // The TTL feature is currently alpha (Kubernetes 1.15)
+      // To avoid using this feature, jobs will be cleaned with the function `cleanCompletedfixNvidiaGPULowPerfJobs` regularly
+      // ttlSecondsAfterFinished: 86400,
       template: {
         metadata: {
           name: 'nvidia-gpu-low-perf-fixer',
@@ -170,8 +172,45 @@ const fixNvidiaGPULowPerf = (req, res) => {
   });
 };

+// clean completed jobs which were used to fix the NvidiaGPULowPerf issue
+// jobs that completed more than 24 hours ago will be deleted
+const cleanCompletedfixNvidiaGPULowPerfJobs = (req, res) => {
+  logger.info(
+    'Cleaning completed jobs which were used to fix the NvidiaGPULowPerf issue...',
+  );
+
+  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
+  k8sApi
+    .listNamespacedJob('default')
+    .then((response) => {
+      logger.info(`Successfully got job list.`);
+      const jobs = response.body.items;
+      jobs.forEach((job) => {
+        const jobName = job.metadata.name;
+        // check the job name & whether the job has completed
+        if (
+          jobName.startsWith('nvidia-gpu-low-perf-fixer-') &&
+          (job.status.succeeded === 1 || job.status.failed === 1) &&
+          new Date() - new Date(job.status.completionTime) > 24 * 60 * 60 * 1000 // completed for more than 24h
+        )
+          k8sApi
+            .deleteNamespacedJob(jobName, 'default')
+            .then((response) => {
+              logger.info(`Successfully deleted job ${jobName}`);
+            })
+            .catch((error) => {
+              logger.error(`Failed to delete job ${jobName}`, error);
+            });
+      });
+    })
+    .catch((error) => {
+      logger.error('Failed to list jobs:', error);
+    });
+};
+
 // module exports
 module.exports = {
   cordonNodes,
   fixNvidiaGPULowPerf,
+  cleanCompletedfixNvidiaGPULowPerfJobs,
 };
diff --git a/src/alert-manager/src/alert-handler/index.js b/src/alert-manager/src/alert-handler/index.js
index d0d78a279b..836a152168 100755
--- a/src/alert-manager/src/alert-handler/index.js
+++ b/src/alert-manager/src/alert-handler/index.js
@@ -23,6 +23,7 @@ require('module-alias/register');
 const express = require('express');
 const bearerToken = require('express-bearer-token');
 const actions = require('@alert-handler/routes/actions');
+const nodeController = require('@alert-handler/controllers/node');
 const logger = require('@alert-handler/common/logger');

 const app = express();
@@ -36,3 +37,9 @@ const port = parseInt(process.env.SERVER_PORT);
 app.listen(port, () => {
   logger.info(`alert-handler listening at http://localhost:${port}`);
 });
+
+// clean completed jobs which were used to fix the NvidiaGPULowPerf issue every hour
+setInterval(
+  nodeController.cleanCompletedfixNvidiaGPULowPerfJobs,
+  60 * 60 * 1000,
+);

From 7980865e548593e192447b41c22da7a2c9acaddc Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Fri, 19 Mar 2021 12:29:18 +0800
Subject: [PATCH 3/5] update config generation rule, doc & examples

---
 .../services-configuration.yaml.template      |   8 ++
 .../services-configuration.yaml.template      |   8 ++
 .../cluster-admin/how-to-use-alert-system.md  |  31 ++--
 .../services-configuration.yaml               | 134 +++++++++---------
 src/alert-manager/config/alert_manager.py     |   7 +-
 .../alert-manager-configmap.yaml.template     |  18 +--
 6 files changed, 109 insertions(+), 97 deletions(-)

diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template
index 27df33f3e4..2e48fb0585 100644
--- a/contrib/kubespray/quick-start/services-configuration.yaml.template
+++ b/contrib/kubespray/quick-start/services-configuration.yaml.template
@@ -232,6 +232,9 @@ authentication:
 #   - receiver: pai-email-admin-user-and-stop-job
 #     match:
 #       alertname: PAIJobGpuPercentLowerThan0_3For1h
+#   - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+#     match:
+#       alertname: NodeGpuLowPerfState
 # customized-receivers: # receivers are combinations of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions:
@@ -244,6 +247,11 @@ authentication:
 #       tag-jobs:
 #         tags:
 #         - 'stopped-by-alert-manager'
+#   - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+#     actions:
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:
+
 # uncomment following if you want to customize prometheus
 # prometheus:

diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template
index 2a30de3fbe..577cb262dc 100644
--- a/deployment/quick-start/services-configuration.yaml.template
+++ b/deployment/quick-start/services-configuration.yaml.template
@@ -92,6 +92,9 @@ rest-server:
 #   - receiver: pai-email-admin-user-and-stop-job
 #     match:
 #       alertname: PAIJobGpuPercentLowerThan0_3For1h
+#   - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+#     match:
+#       alertname: NodeGpuLowPerfState
 # customized-receivers: # receivers are combinations of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions:
@@ -104,6 +107,11 @@ rest-server:
 #       tag-jobs:
 #         tags:
 #         - 'stopped-by-alert-manager'
+#   - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+#     actions:
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:
+
 # uncomment following if you want to customize prometheus
 # prometheus:

diff --git a/docs/manual/cluster-admin/how-to-use-alert-system.md b/docs/manual/cluster-admin/how-to-use-alert-system.md
index 1a986bb05b..19953b5221 100644
--- a/docs/manual/cluster-admin/how-to-use-alert-system.md
+++ b/docs/manual/cluster-admin/how-to-use-alert-system.md
@@ -114,26 +114,29 @@ We have provided so far these following actions:
 - `stop-jobs`: Stop jobs by calling OpenPAI REST API. **Be careful about this action because it stops jobs without notifying related users.**
 - `tag-jobs`: Add a tag to jobs by calling OpenPAI REST API.
 - `cordon-nodes`: Call Kubernetes API to cordon the corresponding nodes.
+- `fix-nvidia-gpu-low-perf`: Start a privileged container to fix the NVIDIA GPU Low Performance State issue.

 But before you use them, you have to add the proper configuration in the `alert-handler` field. For example, `email-admin` needs you to set up an SMTP account to send the email and an admin email address to receive the email. Also, the `tag-jobs` and `stop-jobs` actions call the OpenPAI REST API, so you should set a REST server token for them. To get the token, go to your profile page (in the top-right corner on Webportal, click `View my profile`), and use `Create application token` to create one.

 Generally speaking, there are two parts of the configuration in the `alert-handler` field: one is `email-configs`, the other is `pai-bearer-token`. The requirements for the different actions are shown in the following table:

-|              | email-configs | pai-bearer-token |
-| :-----------:| :-----------: | :--------------: |
-| cordon-nodes | -             | -                |
-| email-admin  | required      | -                |
-| email-user   | required      | required         |
-| stop-jobs    | -             | required         |
-| tag-jobs     | -             | required         |
+|                           | email-configs | pai-bearer-token |
+| :-----------------------: | :-----------: | :--------------: |
+|       cordon-nodes        |       -       |        -         |
+|        email-admin        |   required    |        -         |
+|        email-user         |   required    |     required     |
+|         stop-jobs         |       -       |     required     |
+|         tag-jobs          |       -       |     required     |
+|  fix-nvidia-gpu-low-perf  |       -       |        -         |

 In addition, some actions may depend on certain fields in the `labels` of alert instances. The labels of an alert instance are generated from the expression in the alert rule. For example, the expression of the `PAIJobGpuPercentLowerThan0_3For1h` alert we mentioned in the previous section is `avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3`. This expression returns a list whose elements contain the `job_name` field, so there will also be a `job_name` field in the labels of the alert instance. The `stop-jobs` action depends on the `job_name` field and stops the corresponding job based on it. To inspect the labels of an alert, you can visit `http(s)://<master-ip>/prometheus/alerts`. If the alert is firing, you can see its labels on this page.
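The new `fix-nvidia-gpu-low-perf` action is wired to an alert through a customized route plus receiver, exactly as the commented template examples above show. As a minimal sketch of the relevant `services-configuration.yaml` fragment (the receiver name is just the illustrative one used in those examples):

```yaml
alert-manager:
  customized-routes:
    routes:
      - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
        match:
          alertname: NodeGpuLowPerfState
  # receivers are combinations of several actions
  customized-receivers:
    - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
      actions:
        email-admin:              # notify the admin; requires email-configs
        fix-nvidia-gpu-low-perf:  # start the privileged fixer job
```

Routes match on the alert name, so any firing alert whose rule emits `NodeGpuLowPerfState` with the `node_name` and `minor_number` labels will trigger the fixer job.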
 For the label fields that each pre-defined action depends on, please refer to the following table:

-|              | depended on label field |
-| :-----------:| :------------------: |
-| cordon-nodes | node_name            |
-| email-admin  | -                    |
-| email-user   | -                    |
-| stop-jobs    | job_name             |
-| tag-jobs     | job_name             |
+|                           | depends on label field(s)  |
+| :-----------------------: | :------------------------: |
+|       cordon-nodes        |         node_name          |
+|        email-admin        |             -              |
+|        email-user         |             -              |
+|         stop-jobs         |          job_name          |
+|         tag-jobs          |          job_name          |
+|  fix-nvidia-gpu-low-perf  |  node_name, minor_number   |

 The matching rules between alerts and actions are defined using `receivers` and `routes`.
diff --git a/examples/cluster-configuration/services-configuration.yaml b/examples/cluster-configuration/services-configuration.yaml
index 4933c0d602..6b4c1259ea 100644
--- a/examples/cluster-configuration/services-configuration.yaml
+++ b/examples/cluster-configuration/services-configuration.yaml
@@ -82,7 +82,6 @@ rest-server:
     #github-path: marketplace
   # Job Debugging Reservation Seconds.
   #debugging-reservation-seconds: 604800
-
 # uncomment following section if you want to customize the port of web portal
 # webportal:
 #   server-port: 9286
@@ -125,6 +124,9 @@ rest-server:
 #   - receiver: pai-email-admin-user-and-stop-job
 #     match:
 #       alertname: PAIJobGpuPercentLowerThan0_3For1h
+#   - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+#     match:
+#       alertname: NodeGpuLowPerfState
 # customized-receivers: # receivers are combinations of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions:
@@ -137,6 +139,10 @@ rest-server:
 #       tag-jobs:
 #         tags:
 #         - 'stopped-by-alert-manager'
+#   - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+#     actions:
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:

 # uncomment following if you want to customize prometheus
 # prometheus:
@@ -172,8 +178,6 @@ rest-server:
 #   # key_name: yyyyyy
 #   # key_path: /path/to/yyyyyy
-
-
 # uncomment following section if you want to customize the threshold of cleaner
 # cleaner:
 #   threshold: 90
@@ -185,65 +189,65 @@ rest-server:
 # uncomment following section, if you want to customize the authentication solution.
 #authentication:
- #OIDC: false
- #
- # If OIDC is set as the value true, you will have to configure the following properties.
- #OIDC-type: AAD
- #
- #AAD:
- # # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
- # # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
- # # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
- # # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
- # # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
- # wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
- #
- # # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
- # tenantID: ${tenat_id}
- #
- # # Required, the client ID of your app in AAD
- # clientID: ${your_client_id}
- #
- # # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
- # # If app key contains '\', replace it with '\\'.
- # clientSecret: '${your_client_secret}'
- #
- # # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
- # nonceLifetime: null
- #
- # # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
- # nonceMaxAmount: 5
- #
- # # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
- # clockSkew: null
- #
- #group-manager:
- # # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist.
- # # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration.
- # group-data-source: basic
- #
- # # If you set winbind as your data source, you should configure this configuration.
- # # winbind-server-address: xxxxxxx
- #
- # # Admin group name and its user list
- # admin-group:
- #   groupname: admingroup
- #   description: "admin's group"
- #   externalName: ""
- #
- # # Group for default vc.
- # # For yarn default queue hack.
- # default-group:
- #   groupname: default
- #   description: "group for default vc"
- #   externalName: ""
- #
- # # If the following groups are not in the data store, it will be created by default.
- # grouplist:
- #   - groupname: forexample
- #     # internal name
- #     description: forexample
- #     # description of the group
- #     externalName: ""
- #     # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from
- #     # the result of winbind. If the group-data-source is basic, this field is useless.
+#OIDC: false
+#
+# If OIDC is set as the value true, you will have to configure the following properties.
+#OIDC-type: AAD
+#
+#AAD:
+# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
+# # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
+# # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
+# # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
+# # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
+# wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
+#
+# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
+# tenantID: ${tenant_id}
+#
+# # Required, the client ID of your app in AAD
+# clientID: ${your_client_id}
+#
+# # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
+# # If app key contains '\', replace it with '\\'.
+# clientSecret: '${your_client_secret}'
+#
+# # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
+# nonceLifetime: null
+#
+# # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
+# nonceMaxAmount: 5
+#
+# # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
+# clockSkew: null
+#
+#group-manager:
+# # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist.
+# # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration.
+# group-data-source: basic
+#
+# # If you set winbind as your data source, you should configure this configuration.
+# # winbind-server-address: xxxxxxx
+#
+# # Admin group name and its user list
+# admin-group:
+#   groupname: admingroup
+#   description: "admin's group"
+#   externalName: ""
+#
+# # Group for default vc.
+# # For yarn default queue hack.
+# default-group:
+#   groupname: default
+#   description: "group for default vc"
+#   externalName: ""
+#
+# # If the following groups are not in the data store, they will be created by default.
+# grouplist:
+#   - groupname: forexample
+#     # internal name
+#     description: forexample
+#     # description of the group
+#     externalName: ""
+#     # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from
+#     # the result of winbind. If the group-data-source is basic, this field is useless.
diff --git a/src/alert-manager/config/alert_manager.py b/src/alert-manager/config/alert_manager.py
index 6b33c15435..63ba1468ee 100644
--- a/src/alert-manager/config/alert_manager.py
+++ b/src/alert-manager/config/alert_manager.py
@@ -74,17 +74,14 @@ def run(self):
         else:
             token_configured = False

+        result["alert-handler"]["configured"] = True
+        result["actions-available"] = ["fix-nvidia-gpu-low-perf"]
         if email_configured and token_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].extend(["email-admin", "email-user", "stop-jobs", "tag-jobs"])
         elif email_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].append("email-admin")
         elif token_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].extend(["stop-jobs", "tag-jobs"])
-        else:
-            result["alert-handler"]["configured"] = False

         if result.get("cluster-utilization") is not None and \
            result["cluster-utilization"].get("schedule") is not None and \
diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
index 7e6cb82fad..28ed7d7a5a 100644
--- a/src/alert-manager/deploy/alert-manager-configmap.yaml.template
+++ b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
@@ -41,10 +41,6 @@ data:
         match:
           report_type: cluster-usage

-      - receiver: fix-nvidia-gpu-low-perf
-        match:
-          alertname: NodeGpuLowPerfState
-
 {% if 'routes' in cluster_cfg["alert-manager"]["customized-routes"] %}
 {% for route in cluster_cfg["alert-manager"]["customized-routes"]["routes"] %}
       - receiver: {{ route.receiver}}
@@ -78,15 +74,6 @@ data:
         send_resolved: false
       {% endif %}

-    - name: fix-nvidia-gpu-low-perf
-      webhook_configs:
-      {% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %}
-      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin'
-        send_resolved: true
-      {% endif %}
-      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
-        send_resolved: false
-
     - name: pai-cordon-nodes
       webhook_configs:
       {% if 'cordon-nodes' in cluster_cfg["alert-manager"]["actions-available"] %}
@@ -135,6 +122,11 @@ data:
         - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/cordon-nodes'
           send_resolved: false
         {% endif %}
+
+        {% if (receiver["actions"]["fix-nvidia-gpu-low-perf"] is defined) and ('fix-nvidia-gpu-low-perf' in cluster_cfg["alert-manager"]["actions-available"]) %}
+        - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
+          send_resolved: false
+        {% endif %}

 {% endfor %}

From 0a90fa4f8fb5e3dcadadeae96eb908d085c77735 Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Mon, 29 Mar 2021 08:25:32 +0800
Subject: [PATCH 4/5] add label to fixer job

---
 .../alert-handler/controllers/kubernetes.js   | 64 +++++++++++++++++++
 .../src/alert-handler/controllers/node.js     | 42 ++-----
 src/alert-manager/src/alert-handler/index.js  |  7 +--
 3 files changed, 70 insertions(+), 43 deletions(-)
 create mode 100644 src/alert-manager/src/alert-handler/controllers/kubernetes.js

diff --git a/src/alert-manager/src/alert-handler/controllers/kubernetes.js b/src/alert-manager/src/alert-handler/controllers/kubernetes.js
new file mode 100644
index 0000000000..fa64c96da7
--- /dev/null
+++ b/src/alert-manager/src/alert-handler/controllers/kubernetes.js
@@ -0,0 +1,64 @@
+// Copyright (c) Microsoft Corporation
+// All rights reserved.
+//
+// MIT License
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+const k8s = require('@kubernetes/client-node');
+const kc = new k8s.KubeConfig();
+kc.loadFromDefault(); // load kubeconfig before creating API clients
+const logger = require('@alert-handler/common/logger');
+
+// clean TTL 24 hours jobs
+const cleanTTL24HJobs = () => {
+  logger.info('Cleaning completed TTL 24h jobs...');
+
+  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
+  k8sApi
+    .listNamespacedJob(
+      'default',
+      undefined,
+      undefined,
+      undefined,
+      undefined,
+      'time-to-live=24h', // labelSelector
+    )
+    .then((response) => {
+      logger.info(`Successfully got job list.`);
+      const jobs = response.body.items;
+      jobs.forEach((job) => {
+        const jobName = job.metadata.name;
+        if (
+          (job.status.succeeded === 1 || job.status.failed === 1) && // check whether the job has completed
+          new Date() - new Date(job.status.completionTime) > 24 * 60 * 60 * 1000 // completed for more than 24h
+        )
+          k8sApi
+            .deleteNamespacedJob(jobName, 'default')
+            .then((response) => {
+              logger.info(`Successfully deleted job ${jobName}`);
+            })
+            .catch((error) => {
+              logger.error(`Failed to delete job ${jobName}`, error);
+            });
+      });
+    })
+    .catch((error) => {
+      logger.error('Failed to list jobs:', error);
+    });
+};
+
+// module exports
+module.exports = {
+  cleanTTL24HJobs,
+};
diff --git a/src/alert-manager/src/alert-handler/controllers/node.js b/src/alert-manager/src/alert-handler/controllers/node.js
index b125109cd6..1bcff5501c 100644
--- a/src/alert-manager/src/alert-handler/controllers/node.js
+++ b/src/alert-manager/src/alert-handler/controllers/node.js
@@ -84,11 +84,14 @@ const getK8sV1Job = (jobName, nodeName, minorNumber) => {
     },
     spec: {
       // The TTL feature is currently alpha (Kubernetes 1.15)
-      // To avoid using this feature, jobs will be cleaned with the function `cleanCompletedfixNvidiaGPULowPerfJobs` regularly
+      // To avoid using this feature, jobs with the label `time-to-live=24h` will be cleaned with the function `cleanTTL24HJobs` regularly
       // ttlSecondsAfterFinished: 86400,
       template: {
         metadata: {
           name: 'nvidia-gpu-low-perf-fixer',
+          labels: {
+            'time-to-live': '24h',
+          },
         },
         spec: {
           containers: [
@@ -172,45 +175,8 @@ const fixNvidiaGPULowPerf = (req, res) => {
   });
 };

-// clean completed jobs which were used to fix the NvidiaGPULowPerf issue
-// jobs that completed more than 24 hours ago will be deleted
-const cleanCompletedfixNvidiaGPULowPerfJobs = (req, res) => {
-  logger.info(
-    'Cleaning completed jobs which were used to fix the NvidiaGPULowPerf issue...',
-  );
-
-  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
-  k8sApi
-    .listNamespacedJob('default')
-    .then((response) => {
-      logger.info(`Successfully got job list.`);
-      const jobs = response.body.items;
-      jobs.forEach((job) => {
-        const jobName = job.metadata.name;
-        // check the job name & whether the job has completed
-        if (
-          jobName.startsWith('nvidia-gpu-low-perf-fixer-') &&
-          (job.status.succeeded === 1 || job.status.failed === 1) &&
-          new Date() - new Date(job.status.completionTime) > 24 * 60 * 60 * 1000 // completed for more than 24h
-        )
-          k8sApi
-            .deleteNamespacedJob(jobName, 'default')
-            .then((response) => {
-              logger.info(`Successfully deleted job ${jobName}`);
-            })
-            .catch((error) => {
-              logger.error(`Failed to delete job ${jobName}`, error);
-            });
-      });
-    })
-    .catch((error) => {
-      logger.error('Failed to list jobs:', error);
-    });
-};
-
 // module exports
 module.exports = {
   cordonNodes,
   fixNvidiaGPULowPerf,
-  cleanCompletedfixNvidiaGPULowPerfJobs,
 };
diff --git a/src/alert-manager/src/alert-handler/index.js b/src/alert-manager/src/alert-handler/index.js
index 836a152168..bc0d121c89 100755
--- a/src/alert-manager/src/alert-handler/index.js
+++ b/src/alert-manager/src/alert-handler/index.js
@@ -23,7 +23,7 @@ require('module-alias/register');
 const express = require('express');
 const bearerToken = require('express-bearer-token');
 const actions = require('@alert-handler/routes/actions');
-const nodeController = require('@alert-handler/controllers/node');
+const k8sController = require('@alert-handler/controllers/kubernetes');
 const logger = require('@alert-handler/common/logger');

 const app = express();
@@ -39,7 +39,4 @@ app.listen(port, () => {
 });

 // clean completed jobs which were used to fix the NvidiaGPULowPerf issue every hour
-setInterval(
-  nodeController.cleanCompletedfixNvidiaGPULowPerfJobs,
-  60 * 60 * 1000,
-);
+setInterval(k8sController.cleanTTL24HJobs, 60 * 60 * 1000);

From 9f24d6e4adf92c148527d2adcada8e662ad69025 Mon Sep 17 00:00:00 2001
From: suiguoxin
Date: Wed, 31 Mar 2021 10:24:58 +0800
Subject: [PATCH 5/5] refine

---
 .../nvidia-gpu-low-perf-fixer.common.dockerfile |  3 ---
 .../src/alert-handler/controllers/kubernetes.js |  4 ++--
 .../src/alert-handler/controllers/node.js       | 12 +++++-------
 .../nvidia-gpu-low-perf-fixer.sh                |  6 +++---
 4 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
index 58ab96a2ca..dd8050d05b 100644
--- a/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
+++ b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
@@ -17,9 +17,6 @@

 FROM nvidia/cuda:11.2.2-base-ubuntu16.04

-RUN apt-get -y update && \
-    apt-get install -y sudo
-
 COPY ./src/nvidia-gpu-low-perf-fixer .

 ENTRYPOINT /bin/bash nvidia-gpu-low-perf-fixer.sh
diff --git a/src/alert-manager/src/alert-handler/controllers/kubernetes.js b/src/alert-manager/src/alert-handler/controllers/kubernetes.js
index fa64c96da7..ca41879726 100644
--- a/src/alert-manager/src/alert-handler/controllers/kubernetes.js
+++ b/src/alert-manager/src/alert-handler/controllers/kubernetes.js
@@ -20,7 +20,7 @@
 kc.loadFromDefault(); // load kubeconfig before creating API clients
 const logger = require('@alert-handler/common/logger');

-// clean TTL 24 hours jobs
+// clean TTL 24 hours jobs created by alert-handler
 const cleanTTL24HJobs = () => {
   logger.info('Cleaning completed TTL 24h jobs...');

@@ -32,7 +32,7 @@ const cleanTTL24HJobs = () => {
       undefined,
       undefined,
       undefined,
-      'time-to-live=24h', // labelSelector
+      'created-by=alert-handler,time-to-live=24h', // labelSelector
     )
     .then((response) => {
       logger.info(`Successfully got job list.`);
diff --git a/src/alert-manager/src/alert-handler/controllers/node.js b/src/alert-manager/src/alert-handler/controllers/node.js
index 1bcff5501c..39dd01132d 100644
--- a/src/alert-manager/src/alert-handler/controllers/node.js
+++ b/src/alert-manager/src/alert-handler/controllers/node.js
@@ -81,18 +81,16 @@ const getK8sV1Job = (jobName, nodeName, minorNumber) => {
     kind: 'Job',
     metadata: {
       name: jobName,
+      labels: {
+        'created-by': 'alert-handler',
+        'time-to-live': '24h',
+      },
     },
     spec: {
       // The TTL feature is currently alpha (Kubernetes 1.15)
-      // To avoid using this feature, jobs with the label `time-to-live=24h` will be cleaned with the function `cleanTTL24HJobs` regularly
+      // To avoid using this feature, jobs with the labels `time-to-live=24h` & `created-by=alert-handler` will be cleaned with the function `cleanTTL24HJobs` regularly
       // ttlSecondsAfterFinished: 86400,
       template: {
-        metadata: {
-          name: 'nvidia-gpu-low-perf-fixer',
-          labels: {
-            'time-to-live': '24h',
-          },
-        },
         spec: {
           containers: [
             {
diff --git a/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
index d6036b1d4f..8903f09f3f 100644
--- a/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
+++ b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
-set -e
+set -ex

 echo "MINOR_NUMBER: ${MINOR_NUMBER}"

-sudo nvidia-smi -pm ENABLED -i ${MINOR_NUMBER}
+nvidia-smi -pm ENABLED -i ${MINOR_NUMBER}

 MAX_MEMORY_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Memory | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
 MAX_GRAPHICS_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Graphics | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
 echo "MAX_MEMORY_CLOCK: ${MAX_MEMORY_CLOCK}, MAX_GRAPHICS_CLOCK: ${MAX_GRAPHICS_CLOCK}"

-sudo nvidia-smi -ac ${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK} -i ${MINOR_NUMBER}
+nvidia-smi -ac ${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK} -i ${MINOR_NUMBER}
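To make the net effect of the series concrete: after the final patch, `getK8sV1Job` builds a Job object equivalent to roughly the following manifest. This is a sketch — the registry prefix/tag, node name, job-name hash, and GPU minor number are illustrative placeholders that are filled in at runtime from the environment and the alert's labels:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: nvidia-gpu-low-perf-fixer-<md5-of-node-and-minor>  # unique per GPU card
  labels:
    created-by: alert-handler   # matched by cleanTTL24HJobs' labelSelector
    time-to-live: "24h"
spec:
  template:
    spec:
      containers:
        - name: nvidia-gpu-low-perf-fixer
          image: example.azurecr.io/nvidia-gpu-low-perf-fixer:v1.6.0  # ${DOCKER_REGISTRY_PREFIX}...:${DOCKER_REGISTRY_TAG}
          imagePullPolicy: Always
          env:
            - name: MINOR_NUMBER
              value: "0"              # GPU card to fix, from the alert's minor_number label
          securityContext:
            privileged: true          # required so nvidia-smi can change persistence mode and clocks
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/hostname: node-001  # pins the job to the affected node
```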