From fcbf97fb94344f215f5572cd2c4750a40099a3b7 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Fri, 9 Aug 2024 13:32:45 +0200 Subject: [PATCH] Add slc9-gpu-builder --- slc8-gpu-builder/packer.json | 4 +-- slc8-gpu-builder/provision.sh | 3 +- slc9-gpu-builder/amdgpu.repo | 5 +++ slc9-gpu-builder/cuda.repo | 6 ++++ slc9-gpu-builder/packer.json | 58 +++++++++++++++++++++++++++++++++++ slc9-gpu-builder/provision.sh | 50 ++++++++++++++++++++++++++++++ slc9-gpu-builder/rocm.repo | 5 +++ 7 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 slc9-gpu-builder/amdgpu.repo create mode 100644 slc9-gpu-builder/cuda.repo create mode 100644 slc9-gpu-builder/packer.json create mode 100755 slc9-gpu-builder/provision.sh create mode 100644 slc9-gpu-builder/rocm.repo diff --git a/slc8-gpu-builder/packer.json b/slc8-gpu-builder/packer.json index 0397fb3..fdcdc69 100644 --- a/slc8-gpu-builder/packer.json +++ b/slc8-gpu-builder/packer.json @@ -1,9 +1,9 @@ { - "_comment": "Alma 8.7 GPU builder X-enabled CUDA12.2-enabled AMD ROCm 5.5.3-enabled", + "_comment": "Alma 8.7 GPU builder X-enabled CUDA12.6-enabled AMD ROCm 5.5.3-enabled", "variables": { "REPO": "registry.cern.ch/alisw/slc8-gpu-builder", "TAG": "latest", - "CUDA_PKG_VERSION": "12-2-12.2.*", + "CUDA_PKG_VERSION": "12-6-12.6.*", "NVIDIA_GPGKEY_SUM": "d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87" }, "builders": [ diff --git a/slc8-gpu-builder/provision.sh b/slc8-gpu-builder/provision.sh index 1cbc931..eb0bef3 100755 --- a/slc8-gpu-builder/provision.sh +++ b/slc8-gpu-builder/provision.sh @@ -17,6 +17,7 @@ curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64 sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA echo "${NVIDIA_GPGKEY_SUM} /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict - +rpm --import https://repo.almalinux.org/almalinux/RPM-GPG-KEY-AlmaLinux # Install requirements for GPU event display, NVIDIA CUDA and AMD ROCm stacks yum install -y 
freeglut-devel lsof "cuda-cudart-$CUDA_PKG_VERSION" 'cuda-compat-12-0-*' \ "cuda-libraries-$CUDA_PKG_VERSION" "cuda-nvtx-$CUDA_PKG_VERSION" \ @@ -30,7 +31,7 @@ yum clean all rm -rf /var/cache/yum # Set up NVIDIA CUDA stack -ln -s cuda-12.2 /usr/local/cuda +ln -s cuda-12.6 /usr/local/cuda echo /usr/local/nvidia/lib >> /etc/ld.so.conf.d/nvidia.conf echo /usr/local/nvidia/lib64 >> /etc/ld.so.conf.d/nvidia.conf export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} diff --git a/slc9-gpu-builder/amdgpu.repo b/slc9-gpu-builder/amdgpu.repo new file mode 100644 index 0000000..079e61b --- /dev/null +++ b/slc9-gpu-builder/amdgpu.repo @@ -0,0 +1,5 @@ +[amdgpu] +name=amdgpu +baseurl=https://repo.radeon.com/amdgpu/6.2/rhel/9.4/main/x86_64/ +enabled=1 +gpgcheck=0 diff --git a/slc9-gpu-builder/cuda.repo b/slc9-gpu-builder/cuda.repo new file mode 100644 index 0000000..59b8284 --- /dev/null +++ b/slc9-gpu-builder/cuda.repo @@ -0,0 +1,6 @@ +[cuda] +name=cuda +baseurl=https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64 +enabled=1 +gpgcheck=1 +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA diff --git a/slc9-gpu-builder/packer.json b/slc9-gpu-builder/packer.json new file mode 100644 index 0000000..8822719 --- /dev/null +++ b/slc9-gpu-builder/packer.json @@ -0,0 +1,58 @@ +{ + "_comment": "Alma 9.4 GPU builder X-enabled CUDA12.6-enabled AMD ROCm 6.2.0-enabled", + "variables": { + "REPO": "registry.cern.ch/alisw/slc9-gpu-builder", + "TAG": "latest", + "CUDA_PKG_VERSION": "12-6-12.6.*", + "NVIDIA_GPGKEY_SUM": "d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87" + }, + "builders": [ + { + "type": "docker", + "image": "registry.cern.ch/alisw/slc9-builder:latest", + "commit": true, + "changes": [ + "ENV CMAKE_PREFIX_PATH=/opt/rocm/lib/cmake:/opt/clang/lib/cmake", + "ENV AMDAPPSDKROOT=/opt/amd-app/", + "ENV PATH=\"${PATH}:/usr/local/cuda/bin\"", + "ENV ALIBUILD_O2_FORCE_GPU=1" + ] + } + ], + "provisioners": [ + { + "type": "file", + "source": 
"cuda.repo", + "destination": "/etc/yum.repos.d/cuda.repo" + }, + { + "type": "file", + "source": "rocm.repo", + "destination": "/etc/yum.repos.d/rocm.repo" + }, + { + "type": "file", + "source": "amdgpu.repo", + "destination": "/etc/yum.repos.d/amdgpu.repo" + }, + { + "type": "shell", + "environment_vars": [ + "CUDA_PKG_VERSION={{user `CUDA_PKG_VERSION`}}", + "NVIDIA_GPGKEY_SUM={{user `NVIDIA_GPGKEY_SUM`}}", + "GIT_VERSION={{user `GIT_VERSION`}}" + ], + "script": "provision.sh" + } + ], + "post-processors": [ + [ + { + "type": "docker-tag", + "repository": "{{user `REPO`}}", + "tag": "{{user `TAG`}}" + }, + "docker-push" + ] + ] +} diff --git a/slc9-gpu-builder/provision.sh b/slc9-gpu-builder/provision.sh new file mode 100755 index 0000000..d6df49c --- /dev/null +++ b/slc9-gpu-builder/provision.sh @@ -0,0 +1,50 @@ +#!/bin/sh -ex + +wipednf () { + rpmdb --rebuilddb + dnf clean all + rm -rf /var/cache/dnf /var/cache/yum +} + +# Install AMD APP Stack +# Old version no longer available from AMD but the newer versions will not work +curl -fsSL https://s3.cern.ch/swift/v1/alibuild-repo/slc8-gpu-builder-reqs/amdappsdk.tar.bz2 | tar -xjv +./AMD-APP-SDK-v3.0.130.136-GA-linux64.sh --noexec --target /opt/amd-app +rm -v AMD-APP-SDK-v3.0.130.136-GA-linux64.sh +# Avoid file collisions between AMD APP and AMD ROCm stack +mkdir -p /etc/OpenCL/vendors +echo /opt/amd-app/lib/x86_64/sdk/libamdocl64-app.so > /etc/OpenCL/vendors/amdocl64-app.icd +mv -v /opt/amd-app/lib/x86_64/sdk/libamdocl64.so \ + /opt/amd-app/lib/x86_64/sdk/libamdocl64-app.so +echo /opt/amd-app/lib/x86_64/ > /etc/ld.so.conf.d/amd-app-sdk.conf + +# Install NVIDIA GPG key +curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub | + sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA +echo "${NVIDIA_GPGKEY_SUM} /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict - + +# rpm --import https://repo.almalinux.org/almalinux/RPM-GPG-KEY-AlmaLinux +dnf update -y +# Install requirements for 
GPU event display, NVIDIA CUDA and AMD ROCm stacks +dnf install -y freeglut-devel lsof "cuda-cudart-$CUDA_PKG_VERSION" 'cuda-compat-12-0-*' \ + "cuda-libraries-$CUDA_PKG_VERSION" "cuda-nvtx-$CUDA_PKG_VERSION" \ + "cuda-libraries-devel-$CUDA_PKG_VERSION" "cuda-nvml-devel-$CUDA_PKG_VERSION" \ + "cuda-minimal-build-$CUDA_PKG_VERSION" "cuda-command-line-tools-$CUDA_PKG_VERSION" \ + hip-rocclr ocl-icd ocl-icd-devel hipcub rocthrust rocm-dev hipify-clang +# ROCm: Notice we do not need the version for ROCM because we target a specific distribution in rocm.repo + +# Set up NVIDIA CUDA stack +ln -s cuda-12.6 /usr/local/cuda +echo /usr/local/nvidia/lib >> /etc/ld.so.conf.d/nvidia.conf +echo /usr/local/nvidia/lib64 >> /etc/ld.so.conf.d/nvidia.conf +export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} +export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 +LIBRARY_PATH=/usr/local/cuda/lib64/stubs ldconfig + +# Fix some errors in current ROCm +sed -i "s/amdgpu-function-calls=false/amdgpu-function-calls=true/g" /opt/rocm/bin/hipcc* /opt/rocm/lib/cmake/hip/*.cmake + +# Remove clang-ocl binary, since it is currently broken, to avoid automatic pick-up +rm -fv /opt/rocm/bin/clang-ocl /usr/bin/clang-ocl + +wipednf diff --git a/slc9-gpu-builder/rocm.repo b/slc9-gpu-builder/rocm.repo new file mode 100644 index 0000000..58dc483 --- /dev/null +++ b/slc9-gpu-builder/rocm.repo @@ -0,0 +1,5 @@ +[ROCm] +name=ROCm +baseurl=https://repo.radeon.com/rocm/rhel9/6.2/main/ +enabled=1 +gpgcheck=0