diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template index 27df33f3e4b..06755b0fc1c 100644 --- a/contrib/kubespray/quick-start/services-configuration.yaml.template +++ b/contrib/kubespray/quick-start/services-configuration.yaml.template @@ -232,6 +232,9 @@ authentication: # - receiver: pai-email-admin-user-and-stop-job # match: # alertname: PAIJobGpuPercentLowerThan0_3For1h +# - receiver: fix-nvidia-gpu-low-perf +# match: +# alertname: NodeGpuLowPerfState # customized-receivers: # receivers are combination of several actions # - name: "pai-email-admin-user-and-stop-job" # actions: @@ -244,6 +247,13 @@ authentication: # tag-jobs: # tags: # - 'stopped-by-alert-manager' +# - name: fix-nvidia-gpu-low-perf +# actions: +# # the email template for `email-admin` and `email-user` can be chosen from ['general-template', 'kill-low-efficiency-job-alert'] +# # if no template specified, 'general-template' will be used. 
+# email-admin: +# fix-nvidia-gpu-low-perf: + # uncomment following if you want to customize prometheus # prometheus: diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template index 2a30de3fbea..65b66e884db 100644 --- a/deployment/quick-start/services-configuration.yaml.template +++ b/deployment/quick-start/services-configuration.yaml.template @@ -92,6 +92,9 @@ rest-server: # - receiver: pai-email-admin-user-and-stop-job # match: # alertname: PAIJobGpuPercentLowerThan0_3For1h +# - receiver: fix-nvidia-gpu-low-perf +# match: +# alertname: NodeGpuLowPerfState # customized-receivers: # receivers are combination of several actions # - name: "pai-email-admin-user-and-stop-job" # actions: @@ -104,6 +107,13 @@ rest-server: # tag-jobs: # tags: # - 'stopped-by-alert-manager' +# - name: fix-nvidia-gpu-low-perf +# actions: +# # the email template for `email-admin` and `email-user` can be chosen from ['general-template', 'kill-low-efficiency-job-alert'] +# # if no template specified, 'general-template' will be used. +# email-admin: +# fix-nvidia-gpu-low-perf: + # uncomment following if you want to customize prometheus # prometheus: diff --git a/docs/manual/cluster-admin/how-to-use-alert-system.md b/docs/manual/cluster-admin/how-to-use-alert-system.md index 1a986bb05b0..19953b52211 100644 --- a/docs/manual/cluster-admin/how-to-use-alert-system.md +++ b/docs/manual/cluster-admin/how-to-use-alert-system.md @@ -114,26 +114,29 @@ We have provided so far these following actions: - `stop-jobs`: Stop jobs by calling OpenPAI REST API. **Be careful about this action because it stops jobs without notifying related users.** - `tag-jobs`: Add a tag to jobs by calling OpenPAI REST API. - `cordon-nodes`: Call Kubernetes API to cordon the corresponding nodes. + - `fix-nvidia-gpu-low-perf`: Start a privileged container to fix NVIDIA GPU Low Performance State issue. 
But before you use them, you have to add proper configuration in the `alert-handler` field. For example, `email-admin` needs you to set up an SMTP account to send the email and an admin email address to receive the email. Also, the `tag-jobs` and `stop-jobs` action calls OpenPAI REST API, so you should set a rest server token for them. To get the token, you should go to your profile page (in the top-right corner on Webportal, click `View my profile`), and use `Create application token` to create one. Generally speaking, there are two parts of the configuration in the `alert-handler` field. One is `email-configs`. The other is `pai-bearer-token`. The requirements for different actions are shown in the following table: -| | email-configs | pai-bearer-token | -| :-----------:| :-----------: | :--------------: | -| cordon-nodes | - | - | -| email-admin | required | - | -| email-user | required | required | -| stop-jobs | - | required | -| tag-jobs | - | required | +| | email-configs | pai-bearer-token | +| :-------------------------: | :-----------: | :--------------: | +| cordon-nodes | - | - | +| email-admin | required | - | +| email-user | required | required | +| stop-jobs | - | required | +| tag-jobs | - | required | +| fix-nvidia-gpu-low-perf | - | - | In addition, some actions may depend on certain fields in the `labels` of alert instances. The labels of the `alert instance` are generated based on the expression in the alert rule. For example, the expression of the `PAIJobGpuPercentLowerThan0_3For1h` alert we mentioned in previous section is `avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3`. This expression returns a list, the element in which contains the `job_name` field. So there will be also a `job_name` field in the labels of the alert instance. `stop-jobs` action depends on the `job_name` field, and it will stop the corresponding job based on it. To inspect the labels of an alert, you can visit `http(s)://<your master IP>/prometheus/alerts`. 
If the alert is firing, you can see its labels on this page. For the depended fields of each pre-defined action, please refer to the following table: -| | depended on label field | -| :-----------:| :------------------: | -| cordon-nodes | node_name | -| email-admin | - | -| email-user | - | -| stop-jobs | job_name | -| tag-jobs | job_name | +| | depended on label field | +| :-------------------------: | :---------------------: | +| cordon-nodes | node_name | +| email-admin | - | +| email-user | - | +| stop-jobs | job_name | +| tag-jobs | job_name | +| fix-nvidia-gpu-low-perf | node_name, minor_number | The matching rules between alerts and actions are defined using `receivers` and `routes`. diff --git a/examples/cluster-configuration/services-configuration.yaml b/examples/cluster-configuration/services-configuration.yaml index 4933c0d6021..a42e057eb2f 100644 --- a/examples/cluster-configuration/services-configuration.yaml +++ b/examples/cluster-configuration/services-configuration.yaml @@ -82,7 +82,6 @@ rest-server: #github-path: marketplace # Job Debugging Reservation Seconds. #debugging-reservation-seconds: 604800 - # uncomment following section if you want to customize the port of web portal # webportal: # server-port: 9286 @@ -125,6 +124,9 @@ rest-server: # - receiver: pai-email-admin-user-and-stop-job # match: # alertname: PAIJobGpuPercentLowerThan0_3For1h +# - receiver: fix-nvidia-gpu-low-perf +# match: +# alertname: NodeGpuLowPerfState # customized-receivers: # receivers are combination of several actions # - name: "pai-email-admin-user-and-stop-job" # actions: @@ -137,6 +139,12 @@ rest-server: # tag-jobs: # tags: # - 'stopped-by-alert-manager' +# - name: fix-nvidia-gpu-low-perf +# actions: +# # the email template for `email-admin` and `email-user` can be chosen from ['general-template', 'kill-low-efficiency-job-alert'] +# # if no template specified, 'general-template' will be used. 
+# email-admin: +# fix-nvidia-gpu-low-perf: # uncomment following if you want to customize prometheus # prometheus: @@ -172,8 +180,6 @@ rest-server: # # key_name: yyyyyy # # key_path: /path/to/yyyyyy - - # uncomment following section if you want to customize the threshold of cleaner # cleaner: # threshold: 90 @@ -185,65 +191,65 @@ rest-server: # uncomment following section, if you want to customize the authentication solution. #authentication: - #OIDC: false - - # If OIDC is set as the value true, you will have to configure the following properties. - #OIDC-type: AAD - # - #AAD: - # # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory. - # # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud - # # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration - # # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration - # # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration - # wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration - # - # # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory. - # tenantID: ${tenat_id} - # - # # Required, the client ID of your app in AAD - # clientID: ${your_client_id} - # - # # Required if `responseType` is 'code', 'id_token code' or 'code id_token'. - # # If app key contains '\', replace it with '\\'. - # clientSecret: '${your_client_secret}' - # - # # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds). - # nonceLifetime: null - # - # # Optional. The max amount of nonce saved in session or cookie, the default value is 10. - # nonceMaxAmount: 5 - # - # # Optional. The clock skew allowed in token validation, the default value is 300 seconds. 
- # clockSkew: null - # - #group-manager: - # # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist. - # # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration. - # group-data-source: basic - # - # # If you set winbind as your data source, you should configure this configuration. - # # winbind-server-address: xxxxxxx - # - # # Admin group name and its user list - # admin-group: - # groupname: admingroup - # description: "admin's group" - # externalName: "" - # - # # Group for default vc. - # # For yarn default queue hack. - # default-group: - # groupname: default - # description: "group for default vc" - # externalName: "" - # - # # If the following groups are not in the data store, it will be created by default. - # grouplist: - # - groupname: forexample - # # internal name - # description: forexample - # # description of the group - # externalName: "" - # # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from - # # the result of winbind. If the group-data-source is basic, this field is useless. +#OIDC: false + +# If OIDC is set as the value true, you will have to configure the following properties. +#OIDC-type: AAD +# +#AAD: +# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory. 
+# # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud +# # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration +# # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration +# # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration +# wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration +# +# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory. +# tenantID: ${tenat_id} +# +# # Required, the client ID of your app in AAD +# clientID: ${your_client_id} +# +# # Required if `responseType` is 'code', 'id_token code' or 'code id_token'. +# # If app key contains '\', replace it with '\\'. +# clientSecret: '${your_client_secret}' +# +# # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds). +# nonceLifetime: null +# +# # Optional. The max amount of nonce saved in session or cookie, the default value is 10. +# nonceMaxAmount: 5 +# +# # Optional. The clock skew allowed in token validation, the default value is 300 seconds. +# clockSkew: null +# +#group-manager: +# # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist. +# # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration. +# group-data-source: basic +# +# # If you set winbind as your data source, you should configure this configuration. +# # winbind-server-address: xxxxxxx +# +# # Admin group name and its user list +# admin-group: +# groupname: admingroup +# description: "admin's group" +# externalName: "" +# +# # Group for default vc. +# # For yarn default queue hack. 
+# default-group: +# groupname: default +# description: "group for default vc" +# externalName: "" +# +# # If the following groups are not in the data store, it will be created by default. +# grouplist: +# - groupname: forexample +# # internal name +# description: forexample +# # description of the group +# externalName: "" +# # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from +# # the result of winbind. If the group-data-source is basic, this field is useless. diff --git a/src/alert-manager/config/alert_manager.py b/src/alert-manager/config/alert_manager.py index 6b33c154357..63ba1468eed 100644 --- a/src/alert-manager/config/alert_manager.py +++ b/src/alert-manager/config/alert_manager.py @@ -74,17 +74,14 @@ def run(self): else: token_configured = False + result["alert-handler"]["configured"] = True + result["actions-available"] = ["fix-nvidia-gpu-low-perf"] if email_configured and token_configured: - result["alert-handler"]["configured"] = True result["actions-available"].extend(["email-admin", "email-user", "stop-jobs", "tag-jobs"]) elif email_configured: - result["alert-handler"]["configured"] = True result["actions-available"].append("email-admin") elif token_configured: - result["alert-handler"]["configured"] = True result["actions-available"].extend(["stop-jobs", "tag-jobs"]) - else: - result["alert-handler"]["configured"] = False if result.get("cluster-utilization") is not None and \ result["cluster-utilization"].get("schedule") is not None and \ diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template index 7e6cb82fade..28ed7d7a5ac 100644 --- a/src/alert-manager/deploy/alert-manager-configmap.yaml.template +++ b/src/alert-manager/deploy/alert-manager-configmap.yaml.template @@ -41,10 +41,6 @@ data: match: report_type: cluster-usage - - receiver: fix-nvidia-gpu-low-perf - match: - alertname: 
NodeGpuLowPerfState - {% if 'routes' in cluster_cfg["alert-manager"]["customized-routes"] %} {% for route in cluster_cfg["alert-manager"]["customized-routes"]["routes"] %} - receiver: {{ route.receiver}} @@ -78,15 +74,6 @@ data: send_resolved: false {% endif %} - - name: fix-nvidia-gpu-low-perf - webhook_configs: - {% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %} - - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin' - send_resolved: true - {% endif %} - - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf' - send_resolved: false - - name: pai-cordon-nodes webhook_configs: {% if 'cordon-nodes' in cluster_cfg["alert-manager"]["actions-available"] %} @@ -135,6 +122,11 @@ data: - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/cordon-nodes' send_resolved: false {% endif %} + + {% if (receiver["actions"]["fix-nvidia-gpu-low-perf"] is defined) and ('fix-nvidia-gpu-low-perf' in cluster_cfg["alert-manager"]["actions-available"]) %} + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf' + send_resolved: false + {% endif %} {% endfor %}