From 11f5b88a2f32be2733dd3d429105391c3ee47ca8 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Wed, 9 Aug 2023 13:44:14 -0700 Subject: [PATCH] Upgrading NVIDIA driver,fabric manager and Cuda to v535 and v12.2 respectively (#2388) Co-authored-by: Himani Deshpande --- CHANGELOG.md | 4 ++- .../attributes/platform.rb | 2 +- .../recipes/install/cuda.rb | 6 ++-- .../fabric_manager_ubuntu20+.rb | 2 +- .../nvidia_driver/nvidia_driver_amazon2.rb | 4 +++ .../partial/_nvidia_driver_common.rb | 6 +++- .../spec/unit/recipes/cuda_spec.rb | 6 ++-- .../unit/resources/fabric_manager_spec.rb | 4 +-- .../spec/unit/resources/nvidia_driver_spec.rb | 34 ++++++++++++------- 9 files changed, 44 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e44327f7d..25f2c3874 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste **CHANGES** - Assign Slurm dynamic nodes a priority (weight) of 1000 by default. This allows Slurm to prioritize idle static nodes over idle dynamic ones. - Create a Slurm partition-nodelist mapping JSON file to be used by the node package daemons to recognize PC-managed Slurm partitions and nodelists. -- Upgrade NVIDIA driver to version 470.199.02. +- Upgrade NVIDIA driver to version 535.54.03. +- Upgrade CUDA library to version 12.2.0. +- Upgrade NVIDIA Fabric manager to `nvidia-fabricmanager-535` - Increase EFS-utils watchdog poll interval to 10 seconds. Note: This change is meaningful only if [EncryptionInTransit](https://docs.aws.amazon.com/parallelcluster/latest/ug/SharedStorage-v3.html#yaml-SharedStorage-EfsSettings-EncryptionInTransit) is set to `true`, because watchdog does not run otherwise. - Upgrade EFA installer to `1.25.0` - Efa-driver: `efa-2.5.0-1` diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index a740a36fe..bb4e6066a 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -11,7 +11,7 @@ # NVidia default['cluster']['nvidia']['enabled'] = 'no' -default['cluster']['nvidia']['driver_version'] = '470.199.02' +default['cluster']['nvidia']['driver_version'] = '535.54.03' # DCV default['cluster']['dcv']['authenticator']['user'] = "dcvextauth" diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb index c8a25c0fd..a1d8ffa5a 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb @@ -19,13 +19,13 @@ # Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive # Cuda installer naming: cuda_11.8.0_520.61.05_linux -cuda_version = '11.8' +cuda_version = '12.2' cuda_patch = '0' cuda_complete_version = "#{cuda_version}.#{cuda_patch}" -cuda_version_suffix = '520.61.05' +cuda_version_suffix = '535.54.03' cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux' cuda_url = "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" -cuda_samples_version = '11.8' +cuda_samples_version = '12.2' cuda_samples_url = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz" tmp_cuda_run = '/tmp/cuda.run' tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz' diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb index e92db45cd..c01265485 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb @@ -20,7 +20,7 @@ use 'partial/_fabric_manager_install_debian.rb' def fabric_manager_package - 'nvidia-fabricmanager-470' + 'nvidia-fabricmanager-535' end def fabric_manager_version diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_amazon2.rb index f42dff44e..69067a8b0 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_amazon2.rb @@ -20,3 +20,7 @@ def set_compiler? # Amazon linux 2 with Kernel 5 need to set CC to /usr/bin/gcc10-gcc using dkms override node['kernel']['release'].split('.')[0].to_i == 5 end + +def compiler_version + 'CC=/usr/bin/gcc10-gcc' +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb index 63e278890..84b7308be 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb @@ -68,7 +68,7 @@ cwd '/tmp' code <<-NVIDIA set -e - ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check + #{compiler_version} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check rm -f /tmp/nvidia.run NVIDIA creates '/usr/bin/nvidia-smi' @@ -103,3 +103,7 @@ def rebuild_initramfs? def set_compiler? false end + +def compiler_version + "" +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb index 71bba02fd..7505e655d 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb @@ -1,10 +1,10 @@ require 'spec_helper' describe 'aws-parallelcluster-platform::cuda' do - cached(:cuda_version) { '11.8' } + cached(:cuda_version) { '12.2' } cached(:cuda_patch) { '0' } cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" } - cached(:cuda_version_suffix) { '520.61.05' } + cached(:cuda_version_suffix) { '535.54.03' } context 'when nvidia not enabled' do cached(:chef_run) do @@ -20,7 +20,7 @@ context 'when on arm' do cached(:cuda_arch) { 'linux_sbsa' } cached(:cuda_url) { "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" } - cached(:cuda_samples_version) { '11.8' } + cached(:cuda_samples_version) { '12.2' } cached(:cuda_samples_url) { "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz" } cached(:chef_run) do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb index dc225f3b5..eb186fc76 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb @@ -167,7 +167,7 @@ def self.configure(chef_run) for_all_oses do |platform, version| context "on #{platform}#{version}" do - cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-470' : 'nvidia-fabric-manager' } + cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' } cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version } context 'when fabric manager is to install' do @@ -218,7 +218,7 @@ def self.configure(chef_run) for_all_oses do |platform, version| context "on #{platform}#{version}" do - cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-470' : 'nvidia-fabric-manager' } + cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' } cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version } context('when nvswithes are > 1') do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb index 3e23631c9..84d0eba0a 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb @@ -203,22 +203,32 @@ def self.setup(chef_run, nvidia_driver_version: nil) mode: '0644' ) end + it 'installs nvidia driver' do + is_expected.to run_bash('nvidia.run advanced') + .with( + user: 'root', + group: 'root', + cwd: '/tmp', + creates: '/usr/bin/nvidia-smi' + ) + .with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check}) + .with_code(%r{rm -f /tmp/nvidia.run}) + end else it "doesn't install gcc10" do is_expected.not_to install_package('gcc10') end - end - - it 'installs nvidia driver' do - is_expected.to run_bash('nvidia.run advanced') - .with( - user: 'root', - group: 'root', - cwd: '/tmp', - creates: '/usr/bin/nvidia-smi' - ) - .with_code(%r{./nvidia.run --silent --dkms --disable-nouveau}) - .with_code(%r{rm -f /tmp/nvidia.run}) + it 'installs nvidia driver' do + is_expected.to run_bash('nvidia.run advanced') + .with( + user: 'root', + group: 'root', + cwd: '/tmp', + creates: '/usr/bin/nvidia-smi' + ) + .with_code(%r{./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check}) + .with_code(%r{rm -f /tmp/nvidia.run}) + end end if platform == 'ubuntu'