From 6ec3a7317a4a3ec7a644b4aa91ca384546762e87 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:52:44 +0100 Subject: [PATCH] Add RL9 cuda build variant (#428) * determine cuda distro automatically * fix typo in CUDA samples * make facts available for cuda * add RL9 cuda build variant * fix typo in build definitions * set packer build volume sizes depending on build variant * fix volume size definition * fix cuda version to work around issue with 12-6-0-1 * don't fail all builds if one fails * bump CUDA builder disk size (build ran out of space) * download cuda image to /mnt on gh runner * download cuda image to /mnt on gh runner * fix fatimage.yml mnt permissions * Update main.yml * switch to open nvidia drivers * bump CI images * make packer build volume-backed optional again --------- Co-authored-by: bertiethorpe Co-authored-by: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> --- .github/workflows/fatimage.yml | 16 +++++++++----- ansible/extras.yml | 2 +- ansible/roles/cuda/defaults/main.yml | 7 +++--- ansible/roles/cuda/tasks/main.yml | 13 ++--------- environments/.stackhpc/ARCUS.pkrvars.hcl | 3 --- environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 3 --- environments/.stackhpc/terraform/main.tf | 4 ++-- packer/openstack.pkr.hcl | 23 +++++++++++++++----- 8 files changed, 37 insertions(+), 34 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 31fcc789a..7e2fc35b1 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -10,16 +10,20 @@ jobs: name: openstack-imagebuild runs-on: ubuntu-22.04 strategy: - matrix: + fail-fast: false # allow other matrix jobs to continue even if one fails + matrix: # build RL8, RL9+OFED, RL9+CUDA versions os_version: - RL8 - RL9 build: - openstack.openhpc - openstack.openhpc-ofed + - openstack.openhpc-cuda exclude: - os_version: RL8 build: openstack.openhpc-ofed + - os_version: RL8 + build: 
openstack.openhpc-cuda - os_version: RL9 build: openstack.openhpc env: @@ -81,7 +85,9 @@ jobs: - name: Download image run: | . venv/bin/activate - openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }} + sudo mkdir /mnt/images + sudo chmod 777 /mnt/images + openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }} - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -95,13 +101,13 @@ jobs: run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' - name: mount qcow2 file - run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' + run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' - name: Run Trivy vulnerability scanner uses: aquasecurity/trivy-action@0.17.0 with: scan-type: fs - scan-ref: "./${{ steps.manifest.outputs.image-name }}" + scan-ref: "${{ steps.manifest.outputs.image-name }}" scanners: "vuln" format: sarif output: "${{ steps.manifest.outputs.image-name }}.sarif" @@ -117,7 +123,7 @@ jobs: uses: aquasecurity/trivy-action@0.16.1 with: scan-type: fs - scan-ref: "./${{ steps.manifest.outputs.image-name }}" + scan-ref: "${{ steps.manifest.outputs.image-name }}" scanners: "vuln" format: table exit-code: '1' diff --git a/ansible/extras.yml b/ansible/extras.yml index 445a0cc16..c32f51c32 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -21,7 +21,7 @@ - name: Setup CUDA hosts: cuda become: yes - gather_facts: no + gather_facts: yes tags: cuda tasks: - import_role: diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 6b377a10b..33a25d9b4 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,11 +1,12 @@ -cuda_distro: rhel8 +cuda_distro: "rhel{{ 
ansible_distribution_major_version }}" cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo" cuda_driver_stream: default +cuda_package_version: 'latest' cuda_packages: - - cuda + - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" - nvidia-gds # _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0') -cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}" +cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}" cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz" cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples" cuda_samples_programs: diff --git a/ansible/roles/cuda/tasks/main.yml b/ansible/roles/cuda/tasks/main.yml index b323cfc04..22f8e9e8e 100644 --- a/ansible/roles/cuda/tasks/main.yml +++ b/ansible/roles/cuda/tasks/main.yml @@ -24,22 +24,13 @@ failed_when: false register: _cuda_driver_module_enabled -- name: List nvidia driver dnf module stream versions - shell: - cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V - # Output of interest from command is something like (some whitespace removed): - # "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch " - changed_when: false - register: _cuda_driver_module_streams - when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" - - name: Enable nvidia driver module - ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}" + ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms" register: _cuda_driver_module_enable when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout" -- name: Install nvidia drivers # TODO: make removal possible? 
+- name: Install nvidia drivers ansible.builtin.command: dnf module install -y nvidia-driver register: _cuda_driver_install when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" diff --git a/environments/.stackhpc/ARCUS.pkrvars.hcl b/environments/.stackhpc/ARCUS.pkrvars.hcl index c07717156..6fd80e7a6 100644 --- a/environments/.stackhpc/ARCUS.pkrvars.hcl +++ b/environments/.stackhpc/ARCUS.pkrvars.hcl @@ -1,7 +1,4 @@ flavor = "vm.ska.cpu.general.small" -use_blockstorage_volume = true -volume_size = 15 # GB -image_disk_format = "qcow2" networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index da2d96d38..5adf4199c 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -1,8 +1,5 @@ flavor = "ec1.large" -use_blockstorage_volume = true -volume_size = 15 # GB volume_type = "unencrypted" -image_disk_format = "qcow2" networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index ac588930c..96e04538b 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -30,8 +30,8 @@ variable "cluster_image" { type = map(string) default = { # https://github.com/stackhpc/ansible-slurm-appliance/pull/413 - RL8: "openhpc-RL8-240813-1317-1b370a36" - RL9: "openhpc-ofed-RL9-240813-1317-1b370a36" + RL8: "openhpc-RL8-240904-1509-1687368f" + RL9: "openhpc-ofed-RL9-240904-1509-1687368f" } } diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index bb6af1a38..5f66c0320 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -120,7 +120,7 @@ variable 
"manifest_output_path" { variable "use_blockstorage_volume" { type = bool - default = false + default = true } variable "volume_type" { @@ -129,13 +129,18 @@ variable "volume_type" { } variable "volume_size" { - type = number - default = null # When not specified use the size of the builder instance root disk + type = map(number) + default = { + # fat image builds, GB: + openhpc = 15 + openhpc-ofed = 15 + openhpc-cuda = 30 + } } variable "image_disk_format" { type = string - default = null # When not specified use the image default + default = "qcow2" } variable "metadata" { @@ -150,6 +155,7 @@ variable "groups" { # fat image builds: openhpc = ["control", "compute", "login"] openhpc-ofed = ["control", "compute", "login", "ofed"] + openhpc-cuda = ["control", "compute", "login", "ofed", "cuda"] } } @@ -158,11 +164,11 @@ source "openstack" "openhpc" { flavor = var.flavor use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type + volume_size = var.volume_size[source.name] metadata = var.metadata networks = var.networks floating_ip_network = var.floating_ip_network security_groups = var.security_groups - volume_size = var.volume_size # Input image: source_image = "${var.source_image[var.os_version]}" @@ -178,7 +184,7 @@ source "openstack" "openhpc" { ssh_bastion_private_key_file = var.ssh_bastion_private_key_file # Output image: - image_disk_format = var.image_disk_format + image_disk_format = "qcow2" image_visibility = var.image_visibility image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" } @@ -195,6 +201,11 @@ build { name = "openhpc-ofed" } + # CUDA fat image: + source "source.openstack.openhpc" { + name = "openhpc-cuda" + } + # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { name = "openhpc-extra"