init

microsoft · Mar 18, 2021 · d7f758a · d7f758a
1 parent 9be9ea6
commit d7f758a
Show file tree

Hide file tree

Showing 13 changed files with 242 additions and 15 deletions.
diff --git a/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
@@ -0,0 +1,25 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+FROM nvidia/cuda:11.2.2-base-ubuntu16.04
+
+# nvidia-gpu-low-perf-fixer.sh invokes nvidia-smi through sudo, so sudo is
+# installed here. `-y` keeps the install non-interactive (a confirmation prompt
+# would abort the build), and removing the apt lists keeps the layer small.
+RUN apt-get -y update && \
+    apt-get install -y --no-install-recommends sudo && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY ./src/nvidia-gpu-low-perf-fixer .
+
+# Exec form: bash is started directly instead of being wrapped in `/bin/sh -c`,
+# so container signals and any runtime arguments reach the script.
+ENTRYPOINT ["/bin/bash", "nvidia-gpu-low-perf-fixer.sh"]
diff --git a/src/alert-manager/build/nvidia-gpu-low-perf-state-fixer.common.dockerfile b/src/alert-manager/build/nvidia-gpu-low-perf-state-fixer.common.dockerfile
@@ -0,0 +1,22 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+FROM nvidia/cuda:10.2-base
+
+# nvidia-gpu-low-perf-state-fixer.sh invokes nvidia-smi through sudo, so sudo
+# has to be installed (same step as nvidia-gpu-low-perf-fixer.common.dockerfile).
+RUN apt-get -y update && \
+    apt-get install -y --no-install-recommends sudo && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY ./src/nvidia-gpu-low-perf-state-fixer .
+
+# Exec form is required: the job template hands the GPU minor number to the
+# container as `args`, and a shell-form ENTRYPOINT (`/bin/sh -c ...`) would
+# never forward those arguments to the script.
+ENTRYPOINT ["/bin/bash", "nvidia-gpu-low-perf-state-fixer.sh"]
diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
@@ -41,6 +41,10 @@ data:
         match:
           report_type: cluster-usage
 
+      - receiver: fix-nvidia-gpu-low-perf
+        match:
+          alertname: NodeGpuLowPerfState
+
       {% if 'routes' in cluster_cfg["alert-manager"]["customized-routes"] %}
       {% for route in cluster_cfg["alert-manager"]["customized-routes"]["routes"] %}
       - receiver: {{ route.receiver}}
@@ -74,6 +78,15 @@ data:
         send_resolved: false
       {% endif %}
 
+    - name: fix-nvidia-gpu-low-perf
+      webhook_configs:
+      {% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %}
+      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin'
+        send_resolved: true
+      {% endif %}
+      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
+        send_resolved: false
+
     - name: pai-cordon-nodes
       webhook_configs:
       {% if 'cordon-nodes' in cluster_cfg["alert-manager"]["actions-available"] %}

diff --git a/src/alert-manager/deploy/alert-manager-deployment.yaml.template b/src/alert-manager/deploy/alert-manager-deployment.yaml.template
@@ -67,6 +67,10 @@ spec:
           value: {{ cluster_cfg["cluster"]["common"]["cluster-id"] }}
         - name: REST_SERVER_URI
           value: {{ cluster_cfg['rest-server']['uri'] }}
+        - name: DOCKER_REGISTRY_PREFIX
+          value: {{ cluster_cfg['cluster']['docker-registry']['prefix'] }}
+        - name: DOCKER_REGISTRY_TAG
+          value: {{ cluster_cfg['cluster']['docker-registry']['tag'] }}
         - name: WEBPORTAL_URI
 {%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %}
           value: "{{ cluster_cfg['pylon']['uri-https']}}"

diff --git a/src/alert-manager/deploy/rbac.yaml b/src/alert-manager/deploy/rbac.yaml
@@ -15,6 +15,9 @@ rules:
   - apiGroups: [""]
     resources: ["nodes"]
     verbs: ["patch"]
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["create"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding

diff --git a/src/alert-manager/src/alert-handler/controllers/mail.js b/src/alert-manager/src/alert-handler/controllers/mail.js
@@ -88,19 +88,6 @@ const sendEmailToAdmin = (req, res) => {
     });
 };
 
-const getUserNameByJobName = async (jobName, token) => {
-  return axios
-    .get(`${process.env.REST_SERVER_URI}/api/v2/jobs/${jobName}`, {
-      headers: {
-        Authorization: `Bearer ${token}`,
-        'Content-Type': 'application/json',
-      },
-    })
-    .then((response) => {
-      return response.data.jobStatus.username;
-    });
-};
-
 const getUserEmail = async (username, token) => {
   return axios
     .get(`${process.env.REST_SERVER_URI}/api/v2/users/${username}`, {
@@ -132,7 +119,7 @@ const sendEmailToUser = async (req, res) => {
   // group alerts by username
   const alertsGrouped = {};
   alerts.map((alert, index) => {
-    let userName = alert.labels.job_name.split('~')[0];
+    const userName = alert.labels.job_name.split('~')[0];
     if (userName in alertsGrouped) {
       alertsGrouped[userName].push(alerts[index]);
     } else {

diff --git a/src/alert-manager/src/alert-handler/controllers/node.js b/src/alert-manager/src/alert-handler/controllers/node.js
@@ -18,15 +18,16 @@
 const k8s = require('@kubernetes/client-node');
 const kc = new k8s.KubeConfig();
 const logger = require('@alert-handler/common/logger');
+const crypto = require('crypto');
 
 kc.loadFromDefault();
-const k8sApi = kc.makeApiClient(k8s.CoreV1Api);
 
 const cordonNode = async (nodeName) => {
   const headers = {
     'content-type': 'application/strategic-merge-patch+json',
   };
   // set the node unschedulable
+  const k8sApi = kc.makeApiClient(k8s.CoreV1Api);
   return k8sApi.patchNode(
     nodeName,
     { spec: { unschedulable: true } },
@@ -72,7 +73,105 @@ const cordonNodes = (req, res) => {
     });
 };
 
+const getK8sV1Job = (jobName, nodeName, minorNumber) => {
+  const DOCKER_REGISTRY_PREFIX = process.env.DOCKER_REGISTRY_PREFIX;
+  const DOCKER_REGISTRY_TAG = process.env.DOCKER_REGISTRY_TAG;
+  const job = {
+    apiVersion: 'batch/v1',
+    kind: 'Job',
+    metadata: {
+      name: jobName,
+    },
+    spec: {
+      ttlSecondsAfterFinished: 86400, // TODO: enable this feature when install k8s / delete the job elsewhere
+      template: {
+        metadata: {
+          name: 'nvidia-gpu-low-perf-fixer',
+        },
+        spec: {
+          containers: [
+            {
+              name: 'nvidia-gpu-low-perf-fixer',
+              image: `${DOCKER_REGISTRY_PREFIX}nvidia-gpu-low-perf-fixer:${DOCKER_REGISTRY_TAG}`,
+              imagePullPolicy: 'Always',
+              env: [
+                {
+                  name: 'MINOR_NUMBER',
+                  value: `${minorNumber}`,
+                },
+              ],
+              securityContext: {
+                privileged: true,
+              },
+            },
+          ],
+          restartPolicy: 'Never',
+          nodeSelector: {
+            'kubernetes.io/hostname': nodeName,
+          },
+        },
+      },
+    },
+  };
+  return job;
+};
+
+// start a k8s job for each GPU card to fix NvidiaGPULowPerf issue
+const fixNvidiaGPULowPerf = (req, res) => {
+  logger.info(
+    'Received `fixNvidiaGPULowPerf` post request from alert-manager.',
+  );
+  // filter alerts which are firing and contain `node_name` & `minor_number` as label
+  const jobsInfo = req.body.alerts
+    .filter(
+      (alert) =>
+        alert.status === 'firing' &&
+        'node_name' in alert.labels &&
+        'minor_number' in alert.labels,
+    )
+    // map each alert to a job
+    .map((alert) => ({
+      jobName: `nvidia-gpu-low-perf-fixer-${crypto
+        .createHash('md5')
+        .update(alert.labels.node_name + alert.labels.minor_number)
+        .digest('hex')}`, // unique job by GPU card
+      nodeName: alert.labels.node_name,
+      minorNumber: alert.labels.minor_number,
+      DOCKER_REGISTRY_PREFIX: process.env.DOCKER_REGISTRY_PREFIX,
+      DOCKER_REGISTRY_TAG: process.env.DOCKER_REGISTRY_TAG,
+    }));
+
+  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
+  jobsInfo.forEach(async (jobInfo) => {
+    // get k8s V1Job
+    const job = getK8sV1Job(
+      jobInfo.jobName,
+      jobInfo.nodeName,
+      jobInfo.minorNumber,
+    );
+    k8sApi
+      .createNamespacedJob('default', job)
+      .then((response) => {
+        logger.info(
+          `Successfully start job ${jobInfo.jobName} for GPU Low Performance issue in node: ${jobInfo.nodeName}, minor number: ${jobInfo.minorNumber}`,
+        );
+      })
+      .catch((error) => {
+        // ignore the job creation if already exists
+        if (error.response && error.response.statusCode === 409) {
+          logger.warn(`Kubernetes job ${jobInfo.jobName} already exists.`);
+        } else {
+          logger.error(error);
+          res.status(500).json({
+            message: `Failed to start job to fix NvidiaGPULowPerf`,
+          });
+        }
+      });
+  });
+};
+
 // module exports
 module.exports = {
   cordonNodes,
+  fixNvidiaGPULowPerf,
 };
diff --git a/src/alert-manager/src/alert-handler/package.json b/src/alert-manager/src/alert-handler/package.json
@@ -40,6 +40,7 @@
     "express": "^4.16.2",
     "express-bearer-token": "^2.4.0",
     "joi": "^14.3.1",
+    "js-yaml": "^4.0.0",
     "module-alias": "^2.2.2",
     "nodemailer": "^6.4.11",
     "winston": "2"

diff --git a/src/alert-manager/src/alert-handler/routes/actions.js b/src/alert-manager/src/alert-handler/routes/actions.js
@@ -50,4 +50,9 @@ router
   /** POST /alert-handler/cordon-nodes */
   .post(nodeController.cordonNodes);
 
+router
+  .route('/alert-handler/fix-nvidia-gpu-low-perf')
+  /** POST /alert-handler/fix-nvidia-gpu-low-perf */
+  .post(nodeController.fixNvidiaGPULowPerf);
+
 module.exports = router;
diff --git a/src/alert-manager/src/alert-handler/templates/nvidia-gpu-low-perf-state-fixer-job.yaml.template b/src/alert-manager/src/alert-handler/templates/nvidia-gpu-low-perf-state-fixer-job.yaml.template
@@ -0,0 +1,20 @@
+# Job that runs the nvidia-gpu-low-perf-state-fixer image for one GPU on one node.
+# The `<%= ... %>` placeholders (jobName, DOCKER_REGISTRY_PREFIX,
+# DOCKER_REGISTRY_TAG, minorNumber, nodeName) are filled in when the template is rendered.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: <%= jobName %>
+spec:
+  ttlSecondsAfterFinished: 86400 # TODO: enable this feature when installing k8s, or delete the job elsewhere
+  template:
+    metadata:
+      name: nvidia-gpu-low-perf-state-fixer
+    spec:
+      containers:
+      - name: nvidia-gpu-low-perf-state-fixer
+        image: <%= DOCKER_REGISTRY_PREFIX %>nvidia-gpu-low-perf-state-fixer:<%= DOCKER_REGISTRY_TAG %>
+        # nvidia-gpu-low-perf-state-fixer.sh parses its options with `getopts "n:"`,
+        # so the minor number must be passed as `-n <value>`.
+        args:
+          - '-n'
+          - '<%= minorNumber %>'
+        securityContext:
+          privileged: true
+      restartPolicy: Never
+      # run only on the node whose hostname label matches
+      nodeSelector:
+        kubernetes.io/hostname: <%= nodeName %>
diff --git a/src/alert-manager/src/alert-handler/yarn.lock b/src/alert-manager/src/alert-handler/yarn.lock
@@ -349,6 +349,11 @@ argparse@^1.0.7:
   dependencies:
     sprintf-js "~1.0.2"
 
+argparse@^2.0.1:
+  version "2.0.1"
+  resolved "https://registry.yarnpkg.com/argparse/-/argparse-2.0.1.tgz#246f50f3ca78a3240f6c997e8a9bd1eac49e4b38"
+  integrity sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==
+
 array-differ@^3.0.0:
   version "3.0.0"
   resolved "https://registry.yarnpkg.com/array-differ/-/array-differ-3.0.0.tgz#3cbb3d0f316810eafcc47624734237d6aee4ae6b"
@@ -1974,6 +1979,13 @@ js-yaml@^3.13.1:
     argparse "^1.0.7"
     esprima "^4.0.0"
 
+js-yaml@^4.0.0:
+  version "4.0.0"
+  resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-4.0.0.tgz#f426bc0ff4b4051926cd588c71113183409a121f"
+  integrity sha512-pqon0s+4ScYUvX30wxQi3PogGFAlUyH0awepWvwkj4jD4v+ova3RiYw8bmA6x2rDrEaj8i/oWKoRxpVNW+Re8Q==
+  dependencies:
+    argparse "^2.0.1"
+
 jsbn@~0.1.0:
   version "0.1.1"
   resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-0.1.1.tgz#a5e654c2e5a2deb5f201d96cefbca80c0ef2f513"

diff --git a/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+echo "MINOR_NUMBER: ${MINOR_NUMBER}"
+
+sudo nvidia-smi -pm ENABLED -i ${MINOR_NUMBER}
+
+MAX_MEMORY_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Memory | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
+MAX_GRAPHICS_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Graphics | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
+echo "MAX_MEMORY_CLOCK: ${MAX_MEMORY_CLOCK}, MAX_GRAPHICS_CLOCK: ${MAX_GRAPHICS_CLOCK}"
+
+sudo nvidia-smi -ac ${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK} -i ${MINOR_NUMBER}
diff --git a/src/alert-manager/src/nvidia-gpu-low-perf-state-fixer/nvidia-gpu-low-perf-state-fixer.sh b/src/alert-manager/src/nvidia-gpu-low-perf-state-fixer/nvidia-gpu-low-perf-state-fixer.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -e
+
+while getopts "n:" opt; do
+  case $opt in
+    n)
+      MINOR_NUMBER=$OPTARG
+      ;;
+    \?)
+      echo "Invalid option: -$OPTARG"
+      exit 1
+      ;;
+  esac
+done
+
+echo "MINOR_NUMBER: ${MINOR_NUMBER}"
+
+sudo nvidia-smi -pm ENABLED -i ${MINOR_NUMBER}
+
+MAX_MEMORY_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Memory | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
+MAX_GRAPHICS_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Graphics | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
+echo "MAX_MEMORY_CLOCK: ${MAX_MEMORY_CLOCK}, MAX_GRAPHICS_CLOCK: ${MAX_GRAPHICS_CLOCK}"
+
+sudo nvidia-smi -ac ${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK} -i ${MINOR_NUMBER}