update config generation rule, doc & examples

microsoft · Mar 19, 2021 · d2991bb · d2991bb
1 parent 68732d5
commit d2991bb
Show file tree

Hide file tree

Showing 6 changed files with 115 additions and 97 deletions.
diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template
@@ -232,6 +232,9 @@ authentication:
 #     - receiver: pai-email-admin-user-and-stop-job
 #       match:
 #         alertname: PAIJobGpuPercentLowerThan0_3For1h
+#     - receiver: fix-nvidia-gpu-low-perf
+#       match:
+#         alertname: NodeGpuLowPerfState
 #   customized-receivers: # receivers are combination of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions:
@@ -244,6 +247,13 @@ authentication:
 #       tag-jobs:
 #         tags:
 #         - 'stopped-by-alert-manager'
+#   - name: fix-nvidia-gpu-low-perf
+#     actions:
+#       # the email template for `email-admin` and `email-user` can be chosen from ['general-template', 'kill-low-efficiency-job-alert']
+#       # if no template specified, 'general-template' will be used.
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:
+
 
 # uncomment following if you want to customize prometheus
 # prometheus:

diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template
@@ -92,6 +92,9 @@ rest-server:
 #     - receiver: pai-email-admin-user-and-stop-job
 #       match:
 #         alertname: PAIJobGpuPercentLowerThan0_3For1h
+#     - receiver: fix-nvidia-gpu-low-perf
+#       match:
+#         alertname: NodeGpuLowPerfState
 #   customized-receivers: # receivers are combination of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions:
@@ -104,6 +107,13 @@ rest-server:
 #       tag-jobs:
 #         tags:
 #         - 'stopped-by-alert-manager'
+#   - name: fix-nvidia-gpu-low-perf
+#     actions:
+#       # the email template for `email-admin` and `email-user` can be chosen from ['general-template', 'kill-low-efficiency-job-alert']
+#       # if no template specified, 'general-template' will be used.
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:
+
 
 # uncomment following if you want to customize prometheus
 # prometheus:

diff --git a/docs/manual/cluster-admin/how-to-use-alert-system.md b/docs/manual/cluster-admin/how-to-use-alert-system.md
@@ -114,26 +114,29 @@ We have provided so far these following actions:
   - `stop-jobs`: Stop jobs by calling OpenPAI REST API. **Be careful about this action because it stops jobs without notifying related users.**
   - `tag-jobs`: Add a tag to jobs by calling OpenPAI REST API.
   - `cordon-nodes`: Call Kubernetes API to cordon the corresponding nodes.
+  - `fix-nvidia-gpu-low-perf`: Start a privileged container to fix NVIDIA GPU Low Performance State issue.
 
 But before you use them, you have to add proper configuration in the `alert-handler` field. For example, `email-admin` needs you to set up an SMTP account to send the email and an admin email address to receive the email. Also, the `tag-jobs` and `stop-jobs` actions call OpenPAI REST API, so you should set a rest server token for them. To get the token, you should go to your profile page (in the top-right corner on Webportal, click `View my profile`), and use `Create application token` to create one. Generally speaking, there are two parts of the configuration in the `alert-handler` field. One is `email-configs`. The other is `pai-bearer-token`. The requirements for different actions are shown in the following table:
 
-|              | email-configs | pai-bearer-token |
-| :-----------:| :-----------: | :--------------: |
-| cordon-nodes | -             | -                |
-| email-admin  | required      | -                |
-| email-user   | required      | required         |
-| stop-jobs    | -             | required         |
-| tag-jobs     | -             | required         |
+|                             | email-configs | pai-bearer-token |
+| :-------------------------: | :-----------: | :--------------: |
+| cordon-nodes                | -             | -                |
+| email-admin                 | required      | -                |
+| email-user                  | required      | required         |
+| stop-jobs                   | -             | required         |
+| tag-jobs                    | -             | required         |
+| fix-nvidia-gpu-low-perf     | -             | -                |
 
 In addition, some actions may depend on certain fields in the `labels` of alert instances. The labels of the `alert instance` are generated based on the expression in the alert rule. For example, the expression of the `PAIJobGpuPercentLowerThan0_3For1h` alert we mentioned in the previous section is `avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3`. This expression returns a list, each element of which contains the `job_name` field. So there will also be a `job_name` field in the labels of the alert instance. The `stop-jobs` action depends on the `job_name` field, and it will stop the corresponding job based on it. To inspect the labels of an alert, you can visit `http(s)://<your master IP>/prometheus/alerts`. If the alert is firing, you can see its labels on this page. For the label fields that each pre-defined action depends on, please refer to the following table:
 
-|              | depended on label field |
-| :-----------:| :------------------: |
-| cordon-nodes | node_name            |
-| email-admin  | -                    | 
-| email-user   | -                    |
-| stop-jobs    | job_name             |
-| tag-jobs     | job_name             |
+|                             | depended on label field |
+| :-------------------------: | :---------------------: |
+| cordon-nodes                | node_name               |
+| email-admin                 | -                       | 
+| email-user                  | -                       |
+| stop-jobs                   | job_name                |
+| tag-jobs                    | job_name                |
+| fix-nvidia-gpu-low-perf     | node_name, minor_number |
 
 
 The matching rules between alerts and actions are defined using `receivers` and `routes`.

diff --git a/examples/cluster-configuration/services-configuration.yaml b/examples/cluster-configuration/services-configuration.yaml
@@ -82,7 +82,6 @@ rest-server:
   #github-path: marketplace
   # Job Debugging Reservation Seconds.
   #debugging-reservation-seconds: 604800
-
 # uncomment following section if you want to customize the port of web portal
 # webportal:
 #   server-port: 9286
@@ -125,6 +124,9 @@ rest-server:
 #     - receiver: pai-email-admin-user-and-stop-job
 #       match:
 #         alertname: PAIJobGpuPercentLowerThan0_3For1h
+#     - receiver: fix-nvidia-gpu-low-perf
+#       match:
+#         alertname: NodeGpuLowPerfState
 #   customized-receivers: # receivers are combination of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions: 
@@ -137,6 +139,12 @@ rest-server:
 #       tag-jobs:
 #         tags: 
 #         - 'stopped-by-alert-manager'
+#   - name: fix-nvidia-gpu-low-perf
+#     actions:
+#       # the email template for `email-admin` and `email-user` can be chosen from ['general-template', 'kill-low-efficiency-job-alert']
+#       # if no template specified, 'general-template' will be used.
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:
 
 # uncomment following if you want to customize prometheus
 # prometheus:
@@ -172,8 +180,6 @@ rest-server:
 #  # key_name: yyyyyy
 #  # key_path: /path/to/yyyyyy
 
-
-
 # uncomment following section if you want to customize the threshold of cleaner
 # cleaner:
 #  threshold: 90
@@ -185,65 +191,65 @@ rest-server:
 
 # uncomment following section, if you want to customize the authentication solution.
 #authentication:
-  #OIDC: false
-
-  # If OIDC is set as the value true, you will have to configure the following properties.
-  #OIDC-type: AAD
-  #
-  #AAD:
-  #  # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
-  #  # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
-  #  # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
-  #  # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
-  #  # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
-  #  wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
-  #
-  #  # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
-  #  tenantID: ${tenat_id}
-  #
-  #  # Required, the client ID of your app in AAD
-  #  clientID: ${your_client_id}
-  #
-  #  # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
-  #  # If app key contains '\', replace it with '\\'.
-  #  clientSecret: '${your_client_secret}'
-  #
-  #  # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
-  #  nonceLifetime: null
-  #
-  #  # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
-  #  nonceMaxAmount: 5
-  #
-  #  # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
-  #  clockSkew: null
-  #
-  #group-manager:
-  #  # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist.
-  #  # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration.
-  #  group-data-source: basic
-  #
-  #  # If you set winbind as your data source, you should configure this configuration.
-  #  # winbind-server-address: xxxxxxx
-  #
-  #  # Admin group name and its user list
-  #  admin-group:
-  #    groupname: admingroup
-  #    description: "admin's group"
-  #    externalName: ""
-  #
-  #  # Group for default vc.
-  #  # For yarn default queue hack.
-  #  default-group:
-  #    groupname: default
-  #    description: "group for default vc"
-  #    externalName: ""
-  #
-  #  # If the following groups are not in the data store, it will be created by default.
-  #  grouplist:
-  #    - groupname: forexample
-  #      # internal name
-  #      description: forexample
-  #      # description of the group
-  #      externalName: ""
-  #      # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from
-  #      # the result of winbind. If the group-data-source is basic, this field is useless.
+#OIDC: false
+
+# If OIDC is set as the value true, you will have to configure the following properties.
+#OIDC-type: AAD
+#
+#AAD:
+#  # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
+#  # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
+#  # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
+#  # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
+#  # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
+#  wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
+#
+#  # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
+#  tenantID: ${tenat_id}
+#
+#  # Required, the client ID of your app in AAD
+#  clientID: ${your_client_id}
+#
+#  # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
+#  # If app key contains '\', replace it with '\\'.
+#  clientSecret: '${your_client_secret}'
+#
+#  # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
+#  nonceLifetime: null
+#
+#  # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
+#  nonceMaxAmount: 5
+#
+#  # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
+#  clockSkew: null
+#
+#group-manager:
+#  # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist.
+#  # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration.
+#  group-data-source: basic
+#
+#  # If you set winbind as your data source, you should configure this configuration.
+#  # winbind-server-address: xxxxxxx
+#
+#  # Admin group name and its user list
+#  admin-group:
+#    groupname: admingroup
+#    description: "admin's group"
+#    externalName: ""
+#
+#  # Group for default vc.
+#  # For yarn default queue hack.
+#  default-group:
+#    groupname: default
+#    description: "group for default vc"
+#    externalName: ""
+#
+#  # If the following groups are not in the data store, it will be created by default.
+#  grouplist:
+#    - groupname: forexample
+#      # internal name
+#      description: forexample
+#      # description of the group
+#      externalName: ""
+#      # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from
+#      # the result of winbind. If the group-data-source is basic, this field is useless.
diff --git a/src/alert-manager/config/alert_manager.py b/src/alert-manager/config/alert_manager.py
@@ -74,17 +74,14 @@ def run(self):
         else:
             token_configured = False
 
+        result["alert-handler"]["configured"] = True
+        result["actions-available"] = ["fix-nvidia-gpu-low-perf"]
         if email_configured and token_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].extend(["email-admin", "email-user", "stop-jobs", "tag-jobs"])
         elif email_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].append("email-admin")
         elif token_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].extend(["stop-jobs", "tag-jobs"])
-        else:
-            result["alert-handler"]["configured"] = False
 
         if result.get("cluster-utilization") is not None and \
             result["cluster-utilization"].get("schedule") is not None and \

diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
@@ -41,10 +41,6 @@ data:
         match:
           report_type: cluster-usage
 
-      - receiver: fix-nvidia-gpu-low-perf
-        match:
-          alertname: NodeGpuLowPerfState
-
       {% if 'routes' in cluster_cfg["alert-manager"]["customized-routes"] %}
       {% for route in cluster_cfg["alert-manager"]["customized-routes"]["routes"] %}
       - receiver: {{ route.receiver}}
@@ -78,15 +74,6 @@ data:
         send_resolved: false
       {% endif %}
 
-    - name: fix-nvidia-gpu-low-perf
-      webhook_configs:
-      {% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %}
-      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin'
-        send_resolved: true
-      {% endif %}
-      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
-        send_resolved: false
-
     - name: pai-cordon-nodes
       webhook_configs:
       {% if 'cordon-nodes' in cluster_cfg["alert-manager"]["actions-available"] %}
@@ -135,6 +122,11 @@ data:
       - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/cordon-nodes'
         send_resolved: false
       {% endif %}
+
+      {% if (receiver["actions"]["fix-nvidia-gpu-low-perf"] is defined) and ('fix-nvidia-gpu-low-perf' in cluster_cfg["alert-manager"]["actions-available"]) %}
+      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
+        send_resolved: false
+      {% endif %}
 
     {% endfor %}