-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ARM docker build with 24.12 pytorch fw image
- Loading branch information
Showing
3 changed files
with
158 additions
and
44 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
# Base image with apex and transformer engine, but without NeMo or Megatron-LM. | ||
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3 | ||
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.12-py3 | ||
|
||
FROM rust:1.82.0 as rust-env | ||
|
||
|
@@ -55,23 +55,27 @@ RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip --disable-pip-version-check --no-cache-di | |
git+https://github.com/Dao-AILab/[email protected] | ||
|
||
# Build LLVM and triton | ||
# It's important to select a specific version of LLVM as per triton's README instructions, and | ||
# also important to constrain the build targets to the systems we care about or else there will | ||
# be many strange unlinked symbol issues. Here we assume this dockerfile is built on an aarch64 | ||
# target (host), and build for NVIDIA GPUs (NVPTX). | ||
RUN git clone https://github.com/llvm/llvm-project.git && \ | ||
pip install ninja && \ | ||
cd llvm-project && \ | ||
git fetch origin 5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372 && \ | ||
git checkout 5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372 && \ | ||
git fetch origin 10dc3a8e916d73291269e5e2b82dd22681489aa1 && \ | ||
git checkout 10dc3a8e916d73291269e5e2b82dd22681489aa1 && \ | ||
mkdir build && cd build && \ | ||
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON ../llvm -DLLVM_ENABLE_PROJECTS="mlir;llvm" && \ | ||
ninja && \ | ||
export LLVM_BUILD_DIR=${WORKDIR}/llvm-project/build && \ | ||
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_PROJECTS="mlir;llvm" -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" ../llvm && \ | ||
ninja | ||
ENV LLVM_BUILD_DIR=${WORKDIR}/llvm-project/build | ||
|
||
cd ${WORKDIR} && \ | ||
RUN cd ${WORKDIR} && \ | ||
git clone https://github.com/triton-lang/triton.git && \ | ||
pip install cmake wheel pybind11 && \ | ||
cd triton && \ | ||
git fetch origin 79c6c9b209a5692b9a895398f4f3a033f8f80415 && \ | ||
git checkout 79c6c9b209a5692b9a895398f4f3a033f8f80415 && \ | ||
LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib LLVM_SYSPATH=$LLVM_BUILD_DIR pip install python/ && \ | ||
git fetch origin release/3.1.x && \ | ||
git checkout release/3.1.x && \ | ||
LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib LLVM_SYSPATH=$LLVM_BUILD_DIR pip install --verbose python/ && \ | ||
|
||
cd ${WORKDIR} && \ | ||
rm -rf llvm-project && \ | ||
|
@@ -93,17 +97,8 @@ RUN rm -rf /build | |
|
||
# Addressing Security Scan Vulnerabilities | ||
RUN rm -rf /opt/pytorch/pytorch/third_party/onnx | ||
RUN apt-get update && \ | ||
apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \ | ||
rm -rf /var/lib/apt/lists/* | ||
RUN apt purge -y libslurm37 libpmi2-0 && \ | ||
RUN apt purge -y libpmi2-0 && \ | ||
apt autoremove -y | ||
RUN source /usr/local/nvm/nvm.sh && \ | ||
NODE_VER=$(nvm current) && \ | ||
nvm deactivate && \ | ||
nvm uninstall $NODE_VER && \ | ||
sed -i "/NVM/d" /root/.bashrc && \ | ||
sed -i "/nvm.sh/d" /etc/bash.bashrc | ||
|
||
# Use UV to install python packages from the workspace. This just installs packages into the system's python | ||
# environment, and does not use the current uv.lock file. | ||
|
@@ -117,7 +112,7 @@ ENV UV_LINK_MODE=copy \ | |
# installation. These involve building some torch extensions, so they can take a while to install. | ||
RUN --mount=type=bind,source=./sub-packages/bionemo-geometric/requirements.txt,target=/requirements-pyg.txt \ | ||
--mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked \ | ||
uv pip install --no-build-isolation -r /requirements-pyg.txt | ||
uv pip install --no-build-isolation --break-system-packages -r /requirements-pyg.txt | ||
|
||
ENV WORKDIR=/workspace/bionemo2 | ||
WORKDIR ${WORKDIR} | ||
|
@@ -133,19 +128,43 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup | |
ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}" | ||
ENV RUSTUP_HOME="/usr/local/rustup" | ||
|
||
# Build decord | ||
# # Build decord | ||
# This needs a specific version of ffmpeg: | ||
# root@e1fc53d00844:/workspace/bionemo2# ffmpeg -version | ||
# ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers | ||
# built with gcc 11 (Ubuntu 11.2.0-19ubuntu1) | ||
# configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/aarch64-linux-gnu --incdir=/usr/include/aarch64-linux-gnu --arch=arm64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared | ||
# libavutil 56. 70.100 / 56. 70.100 | ||
# libavcodec 58.134.100 / 58.134.100 | ||
# libavformat 58. 76.100 / 58. 76.100 | ||
# libavdevice 58. 13.100 / 58. 13.100 | ||
# libavfilter 7.110.100 / 7.110.100 | ||
# libswscale 5. 9.100 / 5. 9.100 | ||
# libswresample 3. 9.100 / 3. 9.100 | ||
# libpostproc 55. 9.100 / 55. 9.100 | ||
# | ||
# Issue link: https://github.com/dmlc/decord/issues/257 | ||
# Diff to make it all work https://github.com/dmlc/decord/issues/186#issuecomment-1171882325 | ||
|
||
# Consider this: | ||
# sudo apt install libnvidia-decode-550 | ||
# cp /usr/lib/aarch64-linux-gnu/libnvcuvid* /usr/local/cuda/ | ||
# cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release | ||
|
||
RUN apt-get update && \ | ||
apt-get install -y build-essential python3-dev python3-setuptools make cmake && \ | ||
apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev && \ | ||
git clone --recursive https://github.com/dmlc/decord && \ | ||
cd decord && \ | ||
mkdir build && cd build && \ | ||
cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release && \ | ||
make && \ | ||
cd ../python && \ | ||
pip install . && \ | ||
cd ${WORKDIR} && \ | ||
rm -rf decord | ||
apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev | ||
# && cp /usr/lib/aarch64-linux-gnu/libnvcuvid* /usr/local/cuda/ | ||
RUN --mount=type=bind,source=./arm_build/decord_ffmpeg6_fix.patch,target=/decord_ffmpeg6_fix.patch \ | ||
git clone --recursive https://github.com/dmlc/decord && \ | ||
cd decord && git apply /decord_ffmpeg6_fix.patch && \ | ||
mkdir build && cd build && \ | ||
cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release && \ | ||
make && \ | ||
cd ../python && \ | ||
pip install . && \ | ||
cd ${WORKDIR} && \ | ||
rm -rf decord | ||
|
||
RUN pip install --upgrade pip setuptools | ||
RUN pip install setuptools_scm py-cpuinfo | ||
|
@@ -176,17 +195,27 @@ WORKDIR /workspace/bionemo2 | |
RUN --mount=type=bind,source=./.git,target=./.git \ | ||
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \ | ||
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \ | ||
<<EOF | ||
set -eo pipefail | ||
uv pip install maturin --no-build-isolation && uv pip install --no-build-isolation \ | ||
uv pip install maturin --no-build-isolation --break-system-packages | ||
RUN --mount=type=bind,source=./.git,target=./.git \ | ||
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \ | ||
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \ | ||
pip install --use-deprecated=legacy-resolver --no-build-isolation --break-system-packages \ | ||
tensorstore==0.1.45 | ||
|
||
# For some reason, we do not need to do the tensorstore version package hack on arm64 | ||
# RUN sed -i 's/^Version: 0\.0\.0$/Version: 0.1.45/' /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info/METADATA && mv /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info /usr/local/lib/python3.12/dist-packages/tensorstore-0.1.45.dist-info | ||
|
||
RUN --mount=type=bind,source=./.git,target=./.git \ | ||
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \ | ||
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \ | ||
# Comment out mamba install in NeMo as this causes issues. | ||
sed -i "/mamba-ssm/d" ./3rdparty/NeMo/requirements/requirements_nlp.txt && \ | ||
uv pip install --no-build-isolation --break-system-packages \ | ||
./3rdparty/* \ | ||
./sub-packages/bionemo-* \ | ||
-r /requirements-cve.txt \ | ||
-r /requirements-test.txt | ||
rm -rf ./3rdparty | ||
rm -rf /tmp/* | ||
rm -rf ./sub-packages/bionemo-noodles/target | ||
EOF | ||
-r /requirements-test.txt && rm -rf ./3rdparty && rm -rf /tmp/* && rm -rf ./sub-packages/bionemo-noodles/target \ | ||
&& rm -rf /root/.cache/* | ||
|
||
# In the devcontainer image, we just copy over the finished `dist-packages` folder from the build image back into the | ||
# base pytorch container. We can then set up a non-root user and uninstall the bionemo and 3rd-party packages, so that | ||
|
@@ -244,7 +273,7 @@ ENV RUSTUP_HOME="/usr/local/rustup" | |
RUN --mount=type=bind,source=./requirements-dev.txt,target=/workspace/bionemo2/requirements-dev.txt \ | ||
--mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked <<EOF | ||
set -eo pipefail | ||
uv pip install -r /workspace/bionemo2/requirements-dev.txt | ||
uv pip install --break-system-packages -r /workspace/bionemo2/requirements-dev.txt | ||
rm -rf /tmp/* | ||
EOF | ||
|
||
|
@@ -276,16 +305,22 @@ ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}" | |
ENV RUSTUP_HOME="/usr/local/rustup" | ||
|
||
RUN uv pip uninstall maturin | ||
RUN uv pip install maturin --no-build-isolation | ||
RUN uv pip install maturin --break-system-packages --no-build-isolation | ||
|
||
RUN <<EOF | ||
set -eo pipefail | ||
find . -name __pycache__ -type d -print | xargs rm -rf | ||
uv pip install --no-build-isolation --editable ./internal/infra-bionemo | ||
for sub in ./3rdparty/* ./sub-packages/bionemo-*; do | ||
uv pip install --no-deps --no-build-isolation --editable $sub | ||
uv pip install --no-deps --no-build-isolation --break-system-packages --editable $sub | ||
done | ||
EOF | ||
# This is needed because faiss is not compatible with ARM at all. | ||
# Bionemo doesn't use faiss, but megatron core does. | ||
# We do not use this codepath at all, therefore we just make is_sve_supported return False | ||
# to circumvent python import issues | ||
RUN sed -i '42i\ # Bionemo hack to fix ARM issues with faiss\n return False' /usr/local/lib/python3.12/dist-packages/faiss/loader.py | ||
|
||
# Since the entire repo is owned by root, switching username for development breaks things. | ||
ARG USERNAME=bionemo | ||
RUN chown $USERNAME:$USERNAME -R /workspace/bionemo2/ | ||
|
@@ -312,9 +347,10 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup | |
|
||
|
||
# RUN rm -rf /usr/local/cargo /usr/local/rustup | ||
RUN rm -rf /root/.cache/bazel | ||
RUN chmod 777 -R /workspace/bionemo2/ | ||
|
||
# TODO fix /usr/local/lib/python3.12/dist-packages/faiss/loader.py | ||
|
||
# Transformer engine attention defaults | ||
# We have to declare this again because the devcontainer splits from the release image's base. | ||
# FIXME the following results in unstable training curves even if faster. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
# This is a patch file for decord https://github.com/dmlc/decord | ||
# needed to build decord against ffmpeg6, taken from | ||
# https://github.com/dmlc/decord/issues/186#issuecomment-1171882325 | ||
# This needs to be removed once decord natively supports latest ffmpeg versions. | ||
diff --git a/src/video/ffmpeg/ffmpeg_common.h b/src/video/ffmpeg/ffmpeg_common.h | ||
index b0b973f..f0f7316 100644 | ||
--- a/src/video/ffmpeg/ffmpeg_common.h | ||
+++ b/src/video/ffmpeg/ffmpeg_common.h | ||
@@ -21,6 +21,7 @@ | ||
extern "C" { | ||
#endif | ||
#include <libavcodec/avcodec.h> | ||
+#include <libavcodec/bsf.h> | ||
#include <libavformat/avformat.h> | ||
#include <libavformat/avio.h> | ||
#include <libavfilter/avfilter.h> | ||
diff --git a/src/video/nvcodec/cuda_threaded_decoder.cc b/src/video/nvcodec/cuda_threaded_decoder.cc | ||
index 62bc7ee..957a90d 100644 | ||
--- a/src/video/nvcodec/cuda_threaded_decoder.cc | ||
+++ b/src/video/nvcodec/cuda_threaded_decoder.cc | ||
@@ -17,7 +17,7 @@ namespace decord { | ||
namespace cuda { | ||
using namespace runtime; | ||
|
||
-CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat) | ||
+CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat) | ||
: device_id_(device_id), stream_({device_id, false}), device_{}, ctx_{}, parser_{}, decoder_{}, | ||
pkt_queue_{}, frame_queue_{}, | ||
run_(false), frame_count_(0), draining_(false), | ||
@@ -70,7 +70,7 @@ CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, | ||
} | ||
} | ||
|
||
-void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat) { | ||
+void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat) { | ||
const char* bsf_name = nullptr; | ||
if (AV_CODEC_ID_H264 == codecpar->codec_id) { | ||
// H.264 | ||
diff --git a/src/video/nvcodec/cuda_threaded_decoder.h b/src/video/nvcodec/cuda_threaded_decoder.h | ||
index d7e6fcd..61958a1 100644 | ||
--- a/src/video/nvcodec/cuda_threaded_decoder.h | ||
+++ b/src/video/nvcodec/cuda_threaded_decoder.h | ||
@@ -46,7 +46,7 @@ class CUThreadedDecoder final : public ThreadedDecoderInterface { | ||
using FrameOrderQueuePtr = std::unique_ptr<FrameOrderQueue>; | ||
|
||
public: | ||
- CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat); | ||
+ CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat); | ||
void SetCodecContext(AVCodecContext *dec_ctx, int width = -1, int height = -1, int rotation = 0); | ||
bool Initialized() const; | ||
void Start(); | ||
@@ -70,7 +70,7 @@ class CUThreadedDecoder final : public ThreadedDecoderInterface { | ||
void LaunchThreadImpl(); | ||
void RecordInternalError(std::string message); | ||
void CheckErrorStatus(); | ||
- void InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat); | ||
+ void InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat); | ||
|
||
int device_id_; | ||
CUStream stream_; | ||
diff --git a/src/video/video_reader.cc b/src/video/video_reader.cc | ||
index af4858d..99c9635 100644 | ||
--- a/src/video/video_reader.cc | ||
+++ b/src/video/video_reader.cc | ||
@@ -145,7 +145,7 @@ VideoReader::~VideoReader(){ | ||
|
||
void VideoReader::SetVideoStream(int stream_nb) { | ||
if (!fmt_ctx_) return; | ||
- AVCodec *dec; | ||
+ const AVCodec *dec; | ||
int st_nb = av_find_best_stream(fmt_ctx_.get(), AVMEDIA_TYPE_VIDEO, stream_nb, -1, &dec, 0); | ||
// LOG(INFO) << "find best stream: " << st_nb; | ||
CHECK_GE(st_nb, 0) << "ERROR cannot find video stream with wanted index: " << stream_nb; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters