refine doc and configuration

microsoft · Aug 7, 2020 · c7de70c · c7de70c
1 parent 410bc32
commit c7de70c
Show file tree

Hide file tree

Showing 7 changed files with 51 additions and 28 deletions.
diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template
@@ -172,16 +172,16 @@ authentication:
 
 # If you want alert manager to send alert emails, uncomment the following lines and fill in
 # the right values.
-alert-manager:
-  receiver: [email protected]
-  smtp_url: smtp.office365.com:587
-  smtp_from: [email protected]
-  smtp_auth_username: [email protected]
-  smtp_auth_password: password_for_alert_sender
-  port: 9093 # this is optional, you should not write this if you do not want to change the port alert-manager is listening on
-  alert-handler:
-    port: 9095 # this is optional, you should not write this if you do not want to change the port alert-handler is listening on
-    bearer_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VybmFtZSI6ImFkbWluIiwiYXBwbGljYXRpb24iOnRydWUsImlhdCI6MTU5NjE3Mzc3N30.bs97pLaVKMEX5u7fQoyWU_J5b-6qDLXp2sj59Y346yg'
+# alert-manager:
+#   receiver: [email protected]
+#   smtp_url: smtp.office365.com:587
+#   smtp_from: [email protected]
+#   smtp_auth_username: [email protected]
+#   smtp_auth_password: password_for_alert_sender
+#   port: 9093 # this is optional, you should not write this if you do not want to change the port alert-manager is listening on
+#   alert-handler:
+#     port: 9095 # this is optional, you should not write this if you do not want to change the port alert-handler is listening on
+#     bearer_token: 'your_application_token_for_rest_server'
 
 # uncomment the following if you want to customize grafana
 # grafana:
@@ -201,14 +201,14 @@ alert-manager:
 
 
 # uncomment the following if you want to customize prometheus
-prometheus:
+# prometheus:
 #   port: 9091
 #   # How frequently to scrape targets
 #   scrape_interval: 30
-  alert-handler:
-    virtual_clusters: default|autosys
-    percent: 0.3
-    time: 10s
+#   low_gpu_utilization_job: 
+#     virtual_clusters: default
+#     gpu_percent: 0.3
+#     last_time: 10m
 
 
 # uncomment following section if you want to customize the threshold of cleaner

diff --git a/.../clusterObjectModel/test/data/configuration-template-generate/services-configuration.yaml b/.../clusterObjectModel/test/data/configuration-template-generate/services-configuration.yaml
@@ -97,6 +97,11 @@ prometheus:
   port: 9091
   # How frequently to scrape targets
   scrape_interval: 30
+  # threshold of low GPU utilization jobs
+  low_gpu_utilization_job: 
+    virtual_clusters: default
+    gpu_percent: 0.3
+    last_time: 10m
 
 
 pylon:

diff --git a/src/alert-manager/config/alert-manager.md b/src/alert-manager/config/alert-manager.md
@@ -10,9 +10,9 @@
 
 #### How to configure cluster section in service-configuration.yaml <a name="HT_Config"></a>
 
-Port configurations in this section is optional which default to 9093. All other config is mandatory. If not receiver is configured, the alert manager will not start.
+The port configuration in this section is optional and defaults to 9093. All other configuration is mandatory. If `receiver` is not configured, the alert manager will not start.
 
-To configure alert-manager to send out alert email, you should configure alert manager with receiver in your service-configuration like following:
+To configure alert-manager to send out alert emails and kill low-gpu-utilization jobs, you should configure alert manager with `receiver` and `alert-handler` in your service-configuration like the following:
 ```yaml
 alert-manager:
     receiver: [email protected]
@@ -23,7 +23,7 @@ alert-manager:
     port: 9093 # this is optional, you should not write this if you do not want to change the port alert-manager is listening on
     alert-handler:
         port: 9095 # this is optional, you should not write this if you do not want to change the port alert-handler is listening on
-        bearer_token: 'application_token_for_rest_server'
+        bearer_token: 'your_application_token_for_rest_server'
 ```
 
 In addition, if you deployed pai behind a firewall, you should configure alert-manager with `use-pylon: True` to make the URLs in alert emails publicly available.
@@ -43,7 +43,7 @@ alert-manager:
     port: 9093
     alert-handler:
         port: 9095
-        bearer_token: 'application_token_for_rest_server'
+        bearer_token: 'your_application_token_for_rest_server'
     configured: True
     host: master_ip
     url: "http://master_ip:9093"

diff --git a/src/alert-manager/deploy/alert-configmap.yaml.template b/src/alert-manager/deploy/alert-configmap.yaml.template
@@ -25,7 +25,7 @@ data:
       routes:
       - receiver: 'pai-alert-handler'
         match: 
-          alertname: LowTaskGpuPercent
+          alertname: PaiJobLowGpuPercent
 
     receivers:
     - name: "pai-alert"

diff --git a/src/prometheus/config/prometheus.md b/src/prometheus/config/prometheus.md
@@ -9,15 +9,21 @@
 
 [prometheus default configuration](prometheus.yaml)
 
+When jobs in certain virtual clusters have a gpu utilization percent lower than a threshold for some time, the alert "PaiJobLowGpuPercent" will be triggered. The virtual clusters, gpu utilization threshold, and time can be customized in the `low_gpu_utilization_job` field.
+
 #### How to configure cluster section in service-configuration.yaml <a name="HT_Config"></a>
 
-All configurations in this section is optional. If you want to customized these value, you can configure it in service-configuration.yaml.
+All configurations in this section are optional. If you want to customize these values, you can configure them in service-configuration.yaml.
 
-For example, if you want to use different port than the default 9091, add following to your service-configuration.yaml as following:
+For example, if you want to use a different port than the default 9091, add the following to your service-configuration.yaml:
 ```yaml
 prometheus:
     port: new-value
     scrape_interval: 30
+    low_gpu_utilization_job: 
+        virtual_clusters: default # format: vc_name1|vc_name2|vc_name3
+        gpu_percent: 0.3
+        last_time: 10m
 ```
 
 #### Generated Configuration <a name="G_Config"></a>
@@ -28,6 +34,10 @@ prometheus:
     port: 9091
     scrape_interval: 30
     url: "http://master_ip:9091"
+    low_gpu_utilization_job: 
+        virtual_clusters: default
+        gpu_percent: 0.3
+        last_time: 10m
 ```
 
 
@@ -50,12 +60,18 @@ prometheus:
     <td>prometheus.scrape_interval</td>
     <td>com["prometheus"]["scrape_interval"]</td>
     <td>cluster_cfg["prometheus"]["scrape_interval"]</td>
-    <td>URL</td>
+    <td>Int</td>
 </tr>
 <tr>
     <td>prometheus.url</td>
     <td>com["prometheus"]["url"]</td>
     <td>cluster_cfg["prometheus"]["url"]</td>
     <td>URL</td>
 </tr>
+<tr>
+    <td>prometheus.low_gpu_utilization_job</td>
+    <td>com["prometheus"]["low_gpu_utilization_job"]</td>
+    <td>cluster_cfg["prometheus"]["low_gpu_utilization_job"]</td>
+    <td>YAML</td>
+</tr>
 </table>
diff --git a/src/prometheus/config/prometheus.yaml b/src/prometheus/config/prometheus.yaml
@@ -19,3 +19,7 @@ service_type: "common"
 
 port: 9091
 scrape_interval: 30
+low_gpu_utilization_job: 
+    virtual_clusters: default
+    gpu_percent: 0.3
+    last_time: 10m
diff --git a/src/prometheus/deploy/alerting/jobs.rules.template b/src/prometheus/deploy/alerting/jobs.rules.template
@@ -30,11 +30,9 @@ groups:
     for: 30m
     annotations:
       summary: "Job {{ '{{' }}$labels.job_name{{ '}}' }}in pending status detected"
-- name: pai-alert-handler
-  rules:
-  - alert: LowTaskGpuPercent
-    expr: avg(task_gpu_percent{virtual_cluster=~"{{ cluster_cfg["prometheus"]["alert-handler"]["virtual_clusters"] }}"}) by (job_name) < {{ cluster_cfg["prometheus"]["alert-handler"]["percent"] }}
-    for: {{ cluster_cfg["prometheus"]["alert-handler"]["time"] }}
+  # Fires when the average task_gpu_percent of a job (grouped by job_name) in the
+  # configured virtual clusters stays below low_gpu_utilization_job.gpu_percent
+  # for low_gpu_utilization_job.last_time.
+  - alert: PaiJobLowGpuPercent
+    expr: avg(task_gpu_percent{virtual_cluster=~"{{ cluster_cfg["prometheus"]["low_gpu_utilization_job"]["virtual_clusters"] }}"}) by (job_name) < {{ cluster_cfg["prometheus"]["low_gpu_utilization_job"]["gpu_percent"] }}
+    for: {{ cluster_cfg["prometheus"]["low_gpu_utilization_job"]["last_time"] }}
     annotations:
       # The expression aggregates by job_name, so job_name is the only label left
       # on the alert; the threshold and duration are rendered from configuration
       # so the message always matches the rule that actually fired.
       summary: "Job {{ '{{' }}$labels.job_name{{ '}}' }} has gpu percent lower than {{ cluster_cfg["prometheus"]["low_gpu_utilization_job"]["gpu_percent"] }} for {{ cluster_cfg["prometheus"]["low_gpu_utilization_job"]["last_time"] }}, will be killed"
       description: In certain virtual clusters, jobs with low gpu utilization will be killed.