ARM docker build with 24.12 pytorch fw image (#581)

### Description

Updates the ARM Dockerfile to work with the 24.12 pytorch FW image.

### Type of changes

- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [X] Refactor
- [ ] Documentation update
- [ ] Other (please describe):

### CI Pipeline Configuration

The ARM build is not covered by pre-merge CI; it is covered nightly. Having said that, this change might not work with CI at all because the blossom runners have a kernel/CUDA mismatch on their Grace systems (more info TBD).

### Usage

```
docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -it build-image-name:tag /bin/bash
```

### Pre-submit Checklist

- [X] I have tested these changes locally
- [N/A] I have updated the documentation accordingly
- [N/A] I have added/updated tests as needed
- [X] All existing tests pass successfully

---------

Signed-off-by: Timur Rvachov <[email protected]>
NVIDIA · Jan 10, 2025 · d36c18e · d36c18e
1 parent 43d2ca3
commit d36c18e
Show file tree

Hide file tree

Showing 4 changed files with 146 additions and 34 deletions.
diff --git a/3rdparty/NeMo b/3rdparty/NeMo
diff --git a/Dockerfile.arm b/Dockerfile.arm
@@ -1,5 +1,5 @@
 # Base image with apex and transformer engine, but without NeMo or Megatron-LM.
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.12-py3
 
 FROM rust:1.82.0 as rust-env
 
@@ -55,24 +55,27 @@ RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip --disable-pip-version-check --no-cache-di
   git+https://github.com/Dao-AILab/[email protected]
 
 # Build LLVM and triton
+# It's important to select a specific version of LLVM as per triton's README instructions, and
+# also important to constrain the build targets to the systems we care about, or else there will
+# be many strange unlinked-symbol issues. Here we assume this Dockerfile is built on an aarch64
+# target (host), and builds for NVIDIA GPUs (NVPTX). It is unclear why, but we also need to build
+# for AMDGPU to get triton to build properly, or else there are linker issues.
 RUN git clone https://github.com/llvm/llvm-project.git && \
     pip install ninja && \
     cd llvm-project && \
-    git fetch origin 5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372 && \
-    git checkout 5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372 && \
+    git fetch origin 10dc3a8e916d73291269e5e2b82dd22681489aa1 && \
+    git checkout 10dc3a8e916d73291269e5e2b82dd22681489aa1 && \
     mkdir build && cd build && \
-    cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON  ../llvm -DLLVM_ENABLE_PROJECTS="mlir;llvm" && \
+    cmake -G Ninja  -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_PROJECTS="mlir;llvm" -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" ../llvm && \
     ninja && \
     export LLVM_BUILD_DIR=${WORKDIR}/llvm-project/build && \
-
     cd ${WORKDIR} && \
     git clone https://github.com/triton-lang/triton.git && \
     pip install cmake wheel pybind11 && \
     cd triton && \
-    git fetch origin 79c6c9b209a5692b9a895398f4f3a033f8f80415 && \
-    git checkout 79c6c9b209a5692b9a895398f4f3a033f8f80415 && \
-    LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib LLVM_SYSPATH=$LLVM_BUILD_DIR pip install python/ && \
-
+    git fetch origin release/3.1.x && \
+    git checkout release/3.1.x && \
+    LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib LLVM_SYSPATH=$LLVM_BUILD_DIR pip install --verbose python/ && \
     cd ${WORKDIR} && \
     rm -rf llvm-project && \
     rm -rf triton
@@ -93,25 +96,20 @@ RUN rm -rf /build
 
 # Addressing Security Scan Vulnerabilities
 RUN rm -rf /opt/pytorch/pytorch/third_party/onnx
-RUN apt-get update  && \
-  apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \
-  rm -rf /var/lib/apt/lists/*
-RUN apt purge -y libslurm37 libpmi2-0 && \
+RUN apt purge -y libpmi2-0 && \
   apt autoremove -y
-RUN source /usr/local/nvm/nvm.sh && \
-  NODE_VER=$(nvm current) && \
-  nvm deactivate && \
-  nvm uninstall $NODE_VER && \
-  sed -i "/NVM/d" /root/.bashrc && \
-  sed -i "/nvm.sh/d" /etc/bash.bashrc
 
 # Use UV to install python packages from the workspace. This just installs packages into the system's python
-# environment, and does not use the current uv.lock file.
+# environment, and does not use the current uv.lock file. Note that with python 3.12, we now need to set
+# UV_BREAK_SYSTEM_PACKAGES, since the pytorch base image has made the decision not to use a virtual environment and UV
+# does not respect the PIP_BREAK_SYSTEM_PACKAGES environment variable set in the base dockerfile.
 COPY --from=ghcr.io/astral-sh/uv:0.4.25 /uv /usr/local/bin/uv
 ENV UV_LINK_MODE=copy \
   UV_COMPILE_BYTECODE=1 \
   UV_PYTHON_DOWNLOADS=never \
-  UV_SYSTEM_PYTHON=true
+  UV_SYSTEM_PYTHON=true \
+  UV_NO_CACHE=1 \
+  UV_BREAK_SYSTEM_PACKAGES=1
 
 # Install the bionemo-geometric requirements ahead of copying over the rest of the repo, so that we can cache their
 # installation. These involve building some torch extensions, so they can take a while to install.
@@ -133,12 +131,35 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup
 ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
 ENV RUSTUP_HOME="/usr/local/rustup"
 
-# Build decord
+# Build decord
+# This needs a specific version of ffmpeg:
+# root@e1fc53d00844:/workspace/bionemo2# ffmpeg -version
+# ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
+# built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
+# configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/aarch64-linux-gnu --incdir=/usr/include/aarch64-linux-gnu --arch=arm64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared
+# libavutil      56. 70.100 / 56. 70.100
+# libavcodec     58.134.100 / 58.134.100
+# libavformat    58. 76.100 / 58. 76.100
+# libavdevice    58. 13.100 / 58. 13.100
+# libavfilter     7.110.100 /  7.110.100
+# libswscale      5.  9.100 /  5.  9.100
+# libswresample   3.  9.100 /  3.  9.100
+# libpostproc    55.  9.100 / 55.  9.100
+#
+# Issue link: https://github.com/dmlc/decord/issues/257
+# Diff to make it all work https://github.com/dmlc/decord/issues/186#issuecomment-1171882325
+
+# Consider this:
+# sudo apt install libnvidia-decode-550
+# cp /usr/lib/aarch64-linux-gnu/libnvcuvid* /usr/local/cuda/
+# cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release
+
 RUN apt-get update && \
     apt-get install -y build-essential python3-dev python3-setuptools make cmake && \
-    apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev && \
+    apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev
+RUN --mount=type=bind,source=./arm_build/decord_ffmpeg6_fix.patch,target=/decord_ffmpeg6_fix.patch \
     git clone --recursive https://github.com/dmlc/decord && \
-    cd decord && \
+    cd decord && git apply /decord_ffmpeg6_fix.patch && \
     mkdir build && cd build && \
     cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release && \
     make && \
@@ -173,20 +194,28 @@ RUN git clone --single-branch --branch 1.15.0rc4 https://github.com/single-cell-
 
 WORKDIR /workspace/bionemo2
 # Note, we need to mount the .git folder here so that setuptools-scm is able to fetch git tag for version.
+# For some reason, we do not need to do the tensorstore version package hack on arm64, while we do need it for the x86 build.
 RUN --mount=type=bind,source=./.git,target=./.git \
   --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
   --mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
-  <<EOF
-set -eo pipefail
-uv pip install maturin --no-build-isolation && uv pip install --no-build-isolation \
+  uv pip install maturin --no-build-isolation --break-system-packages
+RUN --mount=type=bind,source=./.git,target=./.git \
+  --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
+  --mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
+  pip install --use-deprecated=legacy-resolver  --no-build-isolation \
+  tensorstore==0.1.45
+
+RUN --mount=type=bind,source=./.git,target=./.git \
+  --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
+  --mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
+# Remove the mamba-ssm requirement from NeMo's NLP requirements file, as installing it causes issues.
+  sed -i "/mamba-ssm/d" ./3rdparty/NeMo/requirements/requirements_nlp.txt && \
+  uv pip install --no-build-isolation \
   ./3rdparty/* \
   ./sub-packages/bionemo-* \
   -r /requirements-cve.txt \
-  -r /requirements-test.txt
-rm -rf ./3rdparty
-rm -rf /tmp/*
-rm -rf ./sub-packages/bionemo-noodles/target
-EOF
+  -r /requirements-test.txt && rm -rf ./3rdparty && rm -rf /tmp/* && rm -rf ./sub-packages/bionemo-noodles/target \
+  && rm -rf /root/.cache/*
 
 # In the devcontainer image, we just copy over the finished `dist-packages` folder from the build image back into the
 # base pytorch container. We can then set up a non-root user and uninstall the bionemo and 3rd-party packages, so that
@@ -286,6 +315,12 @@ for sub in ./3rdparty/* ./sub-packages/bionemo-*; do
     uv pip install --no-deps --no-build-isolation --editable $sub
 done
 EOF
+# This is needed because faiss is not compatible with ARM at all.
+# Bionemo doesn't use faiss, but megatron core does.
+# We do not use this codepath at all, therefore we just make is_sve_supported return False
+# to circumvent python import issues
+RUN sed -i '42i\        # Bionemo hack to fix ARM issues with faiss\n        return False' /usr/local/lib/python3.12/dist-packages/faiss/loader.py
+
 # Since the entire repo is owned by root, switching username for development breaks things.
 ARG USERNAME=bionemo
 RUN chown $USERNAME:$USERNAME -R /workspace/bionemo2/
@@ -312,7 +347,6 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup
 
 
 # RUN rm -rf /usr/local/cargo /usr/local/rustup
-RUN rm -rf /root/.cache/bazel
 RUN chmod 777 -R /workspace/bionemo2/
 
 # Transformer engine attention defaults

diff --git a/arm_build/decord_ffmpeg6_fix.patch b/arm_build/decord_ffmpeg6_fix.patch
@@ -0,0 +1,73 @@
+# This is a patch file for decord https://github.com/dmlc/decord
+# needed to build decord against ffmpeg6, taken from
+# https://github.com/dmlc/decord/issues/186#issuecomment-1171882325
+# This needs to be removed once decord natively supports latest ffmpeg versions.
+diff --git a/src/video/ffmpeg/ffmpeg_common.h b/src/video/ffmpeg/ffmpeg_common.h
+index b0b973f..f0f7316 100644
+--- a/src/video/ffmpeg/ffmpeg_common.h
++++ b/src/video/ffmpeg/ffmpeg_common.h
+@@ -21,6 +21,7 @@
+ extern "C" {
+ #endif
+ #include <libavcodec/avcodec.h>
++#include <libavcodec/bsf.h>
+ #include <libavformat/avformat.h>
+ #include <libavformat/avio.h>
+ #include <libavfilter/avfilter.h>
+diff --git a/src/video/nvcodec/cuda_threaded_decoder.cc b/src/video/nvcodec/cuda_threaded_decoder.cc
+index 62bc7ee..957a90d 100644
+--- a/src/video/nvcodec/cuda_threaded_decoder.cc
++++ b/src/video/nvcodec/cuda_threaded_decoder.cc
+@@ -17,7 +17,7 @@ namespace decord {
+ namespace cuda {
+ using namespace runtime;
+
+-CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat)
++CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat)
+     : device_id_(device_id), stream_({device_id, false}), device_{}, ctx_{}, parser_{}, decoder_{},
+     pkt_queue_{}, frame_queue_{},
+     run_(false), frame_count_(0), draining_(false),
+@@ -70,7 +70,7 @@ CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar,
+     }
+ }
+
+-void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat) {
++void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat) {
+     const char* bsf_name = nullptr;
+     if (AV_CODEC_ID_H264 == codecpar->codec_id) {
+         // H.264
+diff --git a/src/video/nvcodec/cuda_threaded_decoder.h b/src/video/nvcodec/cuda_threaded_decoder.h
+index d7e6fcd..61958a1 100644
+--- a/src/video/nvcodec/cuda_threaded_decoder.h
++++ b/src/video/nvcodec/cuda_threaded_decoder.h
+@@ -46,7 +46,7 @@ class CUThreadedDecoder final : public ThreadedDecoderInterface {
+     using FrameOrderQueuePtr = std::unique_ptr<FrameOrderQueue>;
+
+     public:
+-        CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat);
++        CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat);
+         void SetCodecContext(AVCodecContext *dec_ctx, int width = -1, int height = -1, int rotation = 0);
+         bool Initialized() const;
+         void Start();
+@@ -70,7 +70,7 @@ class CUThreadedDecoder final : public ThreadedDecoderInterface {
+         void LaunchThreadImpl();
+         void RecordInternalError(std::string message);
+         void CheckErrorStatus();
+-        void InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat);
++        void InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat);
+
+         int device_id_;
+         CUStream stream_;
+diff --git a/src/video/video_reader.cc b/src/video/video_reader.cc
+index af4858d..99c9635 100644
+--- a/src/video/video_reader.cc
++++ b/src/video/video_reader.cc
+@@ -145,7 +145,7 @@ VideoReader::~VideoReader(){
+
+ void VideoReader::SetVideoStream(int stream_nb) {
+     if (!fmt_ctx_) return;
+-    AVCodec *dec;
++    const AVCodec *dec;
+     int st_nb = av_find_best_stream(fmt_ctx_.get(), AVMEDIA_TYPE_VIDEO, stream_nb, -1, &dec, 0);
+     // LOG(INFO) << "find best stream: " << st_nb;
+     CHECK_GE(st_nb, 0) << "ERROR cannot find video stream with wanted index: " << stream_nb;
diff --git a/ci/scripts/run_pytest.sh b/ci/scripts/run_pytest.sh
@@ -68,6 +68,11 @@ source "$SCRIPT_DIR/utils.sh" || { echo "Failed to source utils.sh" >&2; exit 1;
 # Set up BioNeMo home directory
 set_bionemo_home || exit 1
 
+# Echo some useful information
+lscpu
+nvidia-smi
+uname -a
+
 # Set up pytest options
 PYTEST_OPTIONS=(
     -v
+3 −2		.github/workflows/cicd-main.yml
+1 −1		Dockerfile.ci
+1 −2		docs/source/nlp/information_retrieval.rst
+0 −1		nemo/collections/diffusion/scripts/train.sh
+3 −0		nemo/collections/llm/gpt/model/gemma.py
+0 −2		nemo/collections/llm/recipes/gemma_2b.py
+0 −4		nemo/collections/llm/recipes/gemma_7b.py
+5 −0		nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+3 −0		nemo/collections/nlp/models/language_modeling/megatron_retro_model.py
+2 −2		nemo/collections/vlm/mllama/model/language.py
+1 −1		nemo/lightning/pytorch/callbacks/peft.py
+1 −1		requirements/requirements_multimodal.txt
+2 −1		scripts/checkpoint_converters/convert_bert_hf_to_nemo.py
+2 −0		tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py
+2 −2		tests/collections/llm/bitexact/mixtral/run.sh
+5 −0		tests/collections/llm/gpt/model/test_model_import.py
+0 −1		tests/collections/llm/hf/peft_nemorun.py
+0 −1		tests/collections/llm/hf/sft_nemorun.py
+2 −0		tests/collections/llm/megatron_mixtral_pretraining.py
+14 −0		tests/conftest.py
+2 −2		tests/core/test_exp_manager.py
+5 −5		tests/lightning/test_nemo_resume_from_ckpt.py
+0 −2		tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb
+0 −2		tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb
+2 −7		tutorials/llm/mamba/mamba.rst