Skip to content

Commit

Permalink
Merge pull request #3673 from tpdownes/enable_a3u_dcgm
Browse files: browse the repository at this point in the history
Enable NVIDIA DCGM in A3 Ultra Slurm blueprint
  • Loading branch information
tpdownes authored Feb 14, 2025
2 parents 483552e + 858ae21 commit b3dd4c3
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,51 @@ vars:
state: started
enabled: true
# Runner that ships an inline Ansible playbook (enable_dcgm.yml). The playbook
# appends a DCGM receiver and pipeline to the Ops Agent configuration file,
# restarts the agent when that file changes, and then sets the Ops Agent and
# NVIDIA DCGM services to the state selected by the two play variables
# (started and enabled when true, stopped and disabled when false).
- type: ansible-local
  destination: enable_dcgm.yml
  content: |
    ---
    - name: Enable NVIDIA DCGM on GPU nodes
      hosts: all
      become: true
      vars:
        enable_ops_agent: true
        enable_nvidia_dcgm: true
      tasks:
        - name: Update Ops Agent configuration
          ansible.builtin.blockinfile:
            path: /etc/google-cloud-ops-agent/config.yaml
            insertafter: EOF
            block: |
              metrics:
                receivers:
                  dcgm:
                    type: dcgm
                service:
                  pipelines:
                    dcgm:
                      receivers:
                        - dcgm
          notify:
            - Restart Google Cloud Ops Agent
      handlers:
        - name: Restart Google Cloud Ops Agent
          ansible.builtin.service:
            name: google-cloud-ops-agent.service
            state: "{{ 'restarted' if enable_ops_agent else 'stopped' }}"
            enabled: "{{ enable_ops_agent }}"
      post_tasks:
        - name: Enable Google Cloud Ops Agent
          ansible.builtin.service:
            name: google-cloud-ops-agent.service
            state: "{{ 'started' if enable_ops_agent else 'stopped' }}"
            enabled: "{{ enable_ops_agent }}"
        - name: Enable NVIDIA DCGM
          ansible.builtin.service:
            name: nvidia-dcgm.service
            state: "{{ 'started' if enable_nvidia_dcgm else 'stopped' }}"
            enabled: "{{ enable_nvidia_dcgm }}"
# Configure Cloud Storage FUSE
- type: ansible-local
destination: gcsfuse.yml
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,50 @@ deployment_groups:
name: nccl-plugin@$(vars.nccl_plugin_version).service
state: started
enabled: true
# Runner that ships an inline Ansible playbook (enable_dcgm.yml). The playbook
# appends a DCGM receiver and pipeline to the Ops Agent configuration file,
# restarts the agent when that file changes, and then sets the Ops Agent and
# NVIDIA DCGM services to the state selected by the two play variables
# (started and enabled when true, stopped and disabled when false).
- type: ansible-local
  destination: enable_dcgm.yml
  content: |
    ---
    - name: Enable NVIDIA DCGM on GPU nodes
      hosts: all
      become: true
      vars:
        enable_ops_agent: true
        enable_nvidia_dcgm: true
      tasks:
        - name: Update Ops Agent configuration
          ansible.builtin.blockinfile:
            path: /etc/google-cloud-ops-agent/config.yaml
            insertafter: EOF
            block: |
              metrics:
                receivers:
                  dcgm:
                    type: dcgm
                service:
                  pipelines:
                    dcgm:
                      receivers:
                        - dcgm
          notify:
            - Restart Google Cloud Ops Agent
      handlers:
        - name: Restart Google Cloud Ops Agent
          ansible.builtin.service:
            name: google-cloud-ops-agent.service
            state: "{{ 'restarted' if enable_ops_agent else 'stopped' }}"
            enabled: "{{ enable_ops_agent }}"
      post_tasks:
        - name: Enable Google Cloud Ops Agent
          ansible.builtin.service:
            name: google-cloud-ops-agent.service
            state: "{{ 'started' if enable_ops_agent else 'stopped' }}"
            enabled: "{{ enable_ops_agent }}"
        - name: Enable NVIDIA DCGM
          ansible.builtin.service:
            name: nvidia-dcgm.service
            state: "{{ 'started' if enable_nvidia_dcgm else 'stopped' }}"
            enabled: "{{ enable_nvidia_dcgm }}"
- id: a3_ultra_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
Expand Down Expand Up @@ -358,8 +402,8 @@ deployment_groups:
is_default: true
partition_conf:
OverSubscribe: EXCLUSIVE
ResumeTimeout: 900
SuspendTimeout: 600
ResumeTimeout: 1200
SuspendTimeout: 1200

- id: slurm_login
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
Expand Down

0 comments on commit b3dd4c3

Please sign in to comment.