Skip to content

Commit

Permalink
Merge pull request #1 from drikster80/main
Browse files Browse the repository at this point in the history
[CI/Build] Dockerfile build for ARM64 / GH200 vllm-project#10499 by cenzhiyao
  • Loading branch information
cennn authored Dec 14, 2024
2 parents ea7bd68 + ccef455 commit 7c2f32e
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 9 deletions.
78 changes: 72 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ ARG CUDA_VERSION=12.4.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& apt-get install -y ccache software-properties-common git curl sudo kmod \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
Expand Down Expand Up @@ -46,9 +47,14 @@ WORKDIR /workspace
# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt

RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install -r requirements-cuda-arm64.txt; \
fi

# cuda arch list used by torch
# can be useful for both `dev` and `test`
Expand All @@ -63,13 +69,19 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}

#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM

# install build dependencies
COPY requirements-build.txt requirements-build.txt

RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt

RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install -r requirements-cuda-arm64.txt; \
fi

COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
Expand Down Expand Up @@ -113,6 +125,47 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
apt-get update && apt-get install zlib1g-dev && \
python3 -m pip install packaging pybind11 && \
git clone https://github.com/openai/triton && \
cd triton/python && \
git submodule update --init --recursive && \
pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \
fi

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=FALSE pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir git+https://github.com/Dao-AILab/causal-conv1d.git ; \
fi

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
MAMBA_FORCE_BUILD=TRUE pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir git+https://github.com/state-spaces/mamba.git ; \
fi

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
apt-get update && apt-get install -y cuda-toolkit-12-4 && \
git clone -b v0.1.6 https://github.com/flashinfer-ai/flashinfer.git --recursive && \
cd flashinfer/python && \
pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \
fi

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
git clone -b 0.44.1 https://github.com/bitsandbytes-foundation/bitsandbytes.git && \
cd bitsandbytes && \
pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \
fi


# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# Default max size of the wheel is 250MB
Expand All @@ -124,6 +177,7 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
else \
echo "Skipping wheel size check."; \
fi

#################### EXTENSION Build IMAGE ####################

#################### DEV IMAGE ####################
Expand All @@ -143,6 +197,9 @@ ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt

RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
Expand All @@ -151,7 +208,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip libnccl2\
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
Expand All @@ -168,14 +225,23 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# install vllm wheel first, so that torch etc will be installed
# Install vllm wheel first, so that torch etc will be installed.
# On Arm64 platforms, all newly compiled wheels will also be installed (flashinfer, triton, mamba, etc.)
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose

RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
pip uninstall -y torch && \
python3 -m pip install -r requirements-cuda-arm64.txt; \
fi

RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
fi
COPY examples examples
#################### vLLM installation IMAGE ####################

Expand Down Expand Up @@ -218,7 +284,7 @@ FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' 'timm==0.9.10'

ENV VLLM_USAGE_SOURCE production-docker-image

Expand Down
26 changes: 26 additions & 0 deletions docs/source/serving/deploying_with_docker.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,32 @@ You can build and run vLLM from source via the provided `Dockerfile <https://git
current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""``
for vLLM to find the current GPU type and build for that.

Building for Arm64/aarch64
--------------------------

A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use
of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.

.. note::

Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits.
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).

.. code-block:: console
# Example of building on Nvidia GH200 server. (Memory usage: ~180GB, Build time: ~2387s / ~40 min)
$ DOCKER_BUILDKIT=1 sudo docker build . \
--target vllm-openai \
-platform "linux/arm64" \
-t drikster80/vllm-gh200-openai:v0.6.4.post1 \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
To run vLLM:

Expand Down
2 changes: 1 addition & 1 deletion requirements-build.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ ninja
packaging
setuptools>=61
setuptools-scm>=8
torch==2.5.1
torch==2.5.1; platform_machine != 'aarch64'
wheel
jinja2
3 changes: 3 additions & 0 deletions requirements-cuda-arm64.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
--index-url https://download.pytorch.org/whl/nightly/cu124
torchvision; platform_machine == 'aarch64'
torch; platform_machine == 'aarch64'
4 changes: 2 additions & 2 deletions requirements-cuda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
nvidia-ml-py >= 12.560.30 # for pynvml package
torch == 2.5.1
torch == 2.5.1; platform_machine != 'aarch64'
# These must be updated alongside torch
torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1

0 comments on commit 7c2f32e

Please sign in to comment.