diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template index 68ba0614557..a2cd51ed958 100644 --- a/contrib/kubespray/quick-start/services-configuration.yaml.template +++ b/contrib/kubespray/quick-start/services-configuration.yaml.template @@ -172,16 +172,16 @@ authentication: # if you want to enable alert manager to send alert email, uncomment following lines and fill # the right values. -alert-manager: - receiver: your_addr@example.com - smtp_url: smtp.office365.com:587 - smtp_from: alert_sender@example.com - smtp_auth_username: alert_sender@example.com - smtp_auth_password: password_for_alert_sender - port: 9093 # this is optional, you should not write this if you do not want to change the port alert-manager is listening on - alert-handler: - port: 9095 # this is optional, you should not write this if you do not want to change the port alert-handler is listening on - bearer_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VybmFtZSI6ImFkbWluIiwiYXBwbGljYXRpb24iOnRydWUsImlhdCI6MTU5NjE3Mzc3N30.bs97pLaVKMEX5u7fQoyWU_J5b-6qDLXp2sj59Y346yg' +# alert-manager: +# receiver: your_addr@example.com +# smtp_url: smtp.office365.com:587 +# smtp_from: alert_sender@example.com +# smtp_auth_username: alert_sender@example.com +# smtp_auth_password: password_for_alert_sender +# port: 9093 # this is optional, you should not write this if you do not want to change the port alert-manager is listening on +# alert-handler: +# port: 9095 # this is optional, you should not write this if you do not want to change the port alert-handler is listening on +# bearer_token: 'your_application_token_for_rest_server' # uncomment following if you want to change customeize grafana # grafana: @@ -201,14 +201,14 @@ alert-manager: # uncomment following if you want to change customeize prometheus -prometheus: +# prometheus: # port: 9091 # # How frequently to scrape targets # scrape_interval: 30 - alert-handler: 
- virtual_clusters: default|autosys - percent: 0.3 - time: 10s +# low_gpu_utilization_job: +# virtual_clusters: default +# gpu_percent: 0.3 +# last_time: 10m # uncomment following section if you want to customize the threshold of cleaner diff --git a/deployment/clusterObjectModel/test/data/configuration-template-generate/services-configuration.yaml b/deployment/clusterObjectModel/test/data/configuration-template-generate/services-configuration.yaml index eae5e7ea043..57866eb0372 100644 --- a/deployment/clusterObjectModel/test/data/configuration-template-generate/services-configuration.yaml +++ b/deployment/clusterObjectModel/test/data/configuration-template-generate/services-configuration.yaml @@ -97,6 +97,11 @@ prometheus: port: 9091 # How frequently to scrape targets scrape_interval: 30 + # threshold of low GPU utilization jobs + low_gpu_utilization_job: + virtual_clusters: default + gpu_percent: 0.3 + last_time: 10m pylon: diff --git a/src/alert-manager/config/alert-manager.md b/src/alert-manager/config/alert-manager.md index f89b91fc603..6ca26d166f2 100644 --- a/src/alert-manager/config/alert-manager.md +++ b/src/alert-manager/config/alert-manager.md @@ -10,9 +10,9 @@ #### How to configure cluster section in service-configuration.yaml -Port configurations in this section is optional which default to 9093. All other config is mandatory. If not receiver is configured, the alert manager will not start. +The port configuration in this section is optional and defaults to 9093. All other settings are mandatory. If `receiver` is not configured, the alert manager will not start. 
-To configure alert-manager to send out alert email, you should configure alert manager with receiver in your service-configuration like following: +To configure alert-manager to send out alert emails and kill low-gpu-utilization jobs, you should configure alert manager with receiver and alert-handler in your service-configuration like following: ```yaml alert-manager: receiver: your_addr@example.com @@ -23,7 +23,7 @@ alert-manager: port: 9093 # this is optional, you should not write this if you do not want to change the port alert-manager is listening on alert-handler: port: 9095 # this is optional, you should not write this if you do not want to change the port alert-handler is listening on - bearer_token: 'application_token_for_rest_server' + bearer_token: 'your_application_token_for_rest_server' ``` In addition, if you deployed pai behind firewall, you should configure alert-manager with `use-pylon: True`, to make url from alert email public available. @@ -43,7 +43,7 @@ alert-manager: port: 9093 alert-handler: port: 9095 - bearer_token: 'application_token_for_rest_server' + bearer_token: 'your_application_token_for_rest_server' configured: True host: master_ip url: "http://master_ip:9093" diff --git a/src/alert-manager/deploy/alert-configmap.yaml.template b/src/alert-manager/deploy/alert-configmap.yaml.template index fc25f960bd7..fca3c906199 100644 --- a/src/alert-manager/deploy/alert-configmap.yaml.template +++ b/src/alert-manager/deploy/alert-configmap.yaml.template @@ -25,7 +25,7 @@ data: routes: - receiver: 'pai-alert-handler' match: - alertname: LowTaskGpuPercent + alertname: PaiJobLowGpuPercent receivers: - name: "pai-alert" diff --git a/src/prometheus/config/prometheus.md b/src/prometheus/config/prometheus.md index 45361fa9f30..d3ce63a1084 100644 --- a/src/prometheus/config/prometheus.md +++ b/src/prometheus/config/prometheus.md @@ -9,15 +9,21 @@ [prometheus default configuration](prometheus.yaml) +When jobs in certain virtual clusters have gpu 
utilization percent lower than a threshold for some time, the alert "PaiJobLowGpuPercent" will be triggered. The virtual clusters, gpu utilization threshold, and time can be customized in the `low_gpu_utilization_job` field. + #### How to configure cluster section in service-configuration.yaml -All configurations in this section is optional. If you want to customized these value, you can configure it in service-configuration.yaml. +All configurations in this section are optional. If you want to customize these values, you can configure them in service-configuration.yaml. -For example, if you want to use different port than the default 9091, add following to your service-configuration.yaml as following: +For example, if you want to use a different port than the default 9091, add the following to your service-configuration.yaml: ```yaml prometheus: port: new-value scrape_interval: 30 + low_gpu_utilization_job: + virtual_clusters: default # format: vc_name1|vc_name2|vc_name3 + gpu_percent: 0.3 + last_time: 10m ``` #### Generated Configuration @@ -28,6 +34,10 @@ prometheus: port: 9091 scrape_interval: 30 url: "http://master_ip:9091" + low_gpu_utilization_job: + virtual_clusters: default + gpu_percent: 0.3 + last_time: 10m ``` @@ -50,7 +60,7 @@ prometheus: