Skip to content

Commit

Permalink
Upgrading NVIDIA driver,fabric manager and Cuda to v535 and v12.2 res…
Browse files Browse the repository at this point in the history
…pectively (#2388)

Co-authored-by: Himani Deshpande <[email protected]>
  • Loading branch information
himani2411 and Himani Deshpande authored Aug 9, 2023
1 parent 3b23fb2 commit 11f5b88
Show file tree
Hide file tree
Showing 9 changed files with 44 additions and 24 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
**CHANGES**
- Assign Slurm dynamic nodes a priority (weight) of 1000 by default. This allows Slurm to prioritize idle static nodes over idle dynamic ones.
- Create a Slurm partition-nodelist mapping JSON file to be used by the node package daemons to recognize PC-managed Slurm partitions and nodelists.
- Upgrade NVIDIA driver to version 470.199.02.
- Upgrade NVIDIA driver to version 535.54.03.
- Upgrade CUDA library to version 12.2.0.
- Upgrade NVIDIA Fabric Manager to `nvidia-fabricmanager-535`.
- Increase EFS-utils watchdog poll interval to 10 seconds. Note: This change is meaningful only if [EncryptionInTransit](https://docs.aws.amazon.com/parallelcluster/latest/ug/SharedStorage-v3.html#yaml-SharedStorage-EfsSettings-EncryptionInTransit) is set to `true`, because watchdog does not run otherwise.
- Upgrade EFA installer to `1.25.0`
- Efa-driver: `efa-2.5.0-1`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

# NVidia
# NVIDIA support is off by default; note the flag is the string 'no', not a boolean.
default['cluster']['nvidia']['enabled'] = 'no'
# NVIDIA driver release to install (single source of truth; no earlier value is kept).
default['cluster']['nvidia']['driver_version'] = '535.54.03'

# DCV
default['cluster']['dcv']['authenticator']['user'] = "dcvextauth"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@

# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
# Cuda installer naming: cuda_12.2.0_535.54.03_linux
# CUDA toolkit release, split as <major.minor> and <patch>; joined below into
# the complete version used in the download URL.
cuda_version = '12.2'
cuda_patch = '0'
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
# Third component of the installer file name (see the naming example above).
cuda_version_suffix = '535.54.03'
# ARM instances download the 'linux_sbsa' installer; everything else uses 'linux'.
cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux'
cuda_url = "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run"
# CUDA samples are fetched separately from GitHub by release tag.
cuda_samples_version = '12.2'
cuda_samples_url = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz"
# Local download locations for the installer and the samples archive.
tmp_cuda_run = '/tmp/cuda.run'
tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
use 'partial/_fabric_manager_install_debian.rb'

# Name of the NVIDIA Fabric Manager package for this (Debian-partial based) resource.
#
# The numeric suffix is the driver branch; it currently matches the 535.x
# driver configured for the cluster.
# NOTE(review): presumably this must be bumped together with the driver
# version attribute — confirm when upgrading.
#
# @return [String] the package name
def fabric_manager_package
  'nvidia-fabricmanager-535'
end

def fabric_manager_version
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,7 @@ def set_compiler?
# Amazon linux 2 with Kernel 5 need to set CC to /usr/bin/gcc10-gcc using dkms override
node['kernel']['release'].split('.')[0].to_i == 5
end

# Shell environment assignment that selects the gcc10 compiler.
#
# Despite the name, this returns a `CC=<path>` assignment rather than a
# version number; the driver install script interpolates it directly in front
# of the `./nvidia.run` invocation.
# NOTE(review): this is returned unconditionally, while set_compiler? is true
# only for kernel 5 — confirm gcc10 is available on other kernels.
#
# @return [String] the `CC=...` prefix for the installer command line
def compiler_version
  gcc10_binary = '/usr/bin/gcc10-gcc'
  "CC=#{gcc10_binary}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
cwd '/tmp'
code <<-NVIDIA
set -e
./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check
#{compiler_version} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check
rm -f /tmp/nvidia.run
NVIDIA
creates '/usr/bin/nvidia-smi'
Expand Down Expand Up @@ -103,3 +103,7 @@ def rebuild_initramfs?
# Whether the compiler (CC) has to be overridden for the driver build.
#
# This variant always answers no.
# NOTE(review): presumably platform-specific resources redefine this when an
# override is required — confirm against those resources.
#
# @return [Boolean] always false
def set_compiler?
  false
end

# Prefix interpolated in front of the `./nvidia.run` command line.
#
# This variant returns an empty string, so the installer runs with no extra
# environment assignment prepended.
#
# @return [String] always the empty string
def compiler_version
  ''
end
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
require 'spec_helper'

describe 'aws-parallelcluster-platform::cuda' do
cached(:cuda_version) { '11.8' }
cached(:cuda_version) { '12.2' }
cached(:cuda_patch) { '0' }
cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" }
cached(:cuda_version_suffix) { '520.61.05' }
cached(:cuda_version_suffix) { '535.54.03' }

context 'when nvidia not enabled' do
cached(:chef_run) do
Expand All @@ -20,7 +20,7 @@
context 'when on arm' do
cached(:cuda_arch) { 'linux_sbsa' }
cached(:cuda_url) { "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" }
cached(:cuda_samples_version) { '11.8' }
cached(:cuda_samples_version) { '12.2' }
cached(:cuda_samples_url) { "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz" }

cached(:chef_run) do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def self.configure(chef_run)

for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-470' : 'nvidia-fabric-manager' }
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version }

context 'when fabric manager is to install' do
Expand Down Expand Up @@ -218,7 +218,7 @@ def self.configure(chef_run)

for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-470' : 'nvidia-fabric-manager' }
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version }

context('when nvswithes are > 1') do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,22 +203,32 @@ def self.setup(chef_run, nvidia_driver_version: nil)
mode: '0644'
)
end
it 'installs nvidia driver' do
is_expected.to run_bash('nvidia.run advanced')
.with(
user: 'root',
group: 'root',
cwd: '/tmp',
creates: '/usr/bin/nvidia-smi'
)
.with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check})
.with_code(%r{rm -f /tmp/nvidia.run})
end
else
it "doesn't install gcc10" do
is_expected.not_to install_package('gcc10')
end
end

it 'installs nvidia driver' do
is_expected.to run_bash('nvidia.run advanced')
.with(
user: 'root',
group: 'root',
cwd: '/tmp',
creates: '/usr/bin/nvidia-smi'
)
.with_code(%r{./nvidia.run --silent --dkms --disable-nouveau})
.with_code(%r{rm -f /tmp/nvidia.run})
it 'installs nvidia driver' do
is_expected.to run_bash('nvidia.run advanced')
.with(
user: 'root',
group: 'root',
cwd: '/tmp',
creates: '/usr/bin/nvidia-smi'
)
.with_code(%r{./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check})
.with_code(%r{rm -f /tmp/nvidia.run})
end
end

if platform == 'ubuntu'
Expand Down

0 comments on commit 11f5b88

Please sign in to comment.