Skip to content

Commit

Permalink
Pad-8: Upgrade to ROCM 5.6 and add multi-node support. (#230)
Browse files Browse the repository at this point in the history
* Add docker_scripts; updated Makefile and Dockerfile with support for multinode execution.

* Add docker_scripts.

* Large change in scrape_libs.sh to fix issues when multiple libfabric.so libs are present; also fixed missing Python libs.

* Created clean branch with all the changes needed for ROCM 5.6 multi-node execution

* Bumped version; updated CircleCI config targets.

* Removed a few extraneous comments.
  • Loading branch information
will-HPE authored Jan 4, 2024
1 parent 622d512 commit 7287f56
Show file tree
Hide file tree
Showing 8 changed files with 487 additions and 64 deletions.
14 changes: 10 additions & 4 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -195,10 +195,13 @@ workflows:
- tf28-gpu
- pt-gpu
- pt2-gpu
- pytorch10-tf27-rocm50
- pytorch13-tf210-rocm56
- pytorch20-tf210-rocm56
exclude:
- with-mpi: 1
image-type: pytorch10-tf27-rocm50
image-type:
- pytorch13-tf210-rocm56
- pytorch20-tf210-rocm56
- build-and-publish-docker:
name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>
context: determined-production
Expand Down Expand Up @@ -257,11 +260,14 @@ workflows:
- tf28-gpu
- pt-gpu
- pt2-gpu
- pytorch10-tf27-rocm50
- pytorch13-tf210-rocm56
- pytorch20-tf210-rocm56
exclude:
- dev-mode: true
with-mpi: 1
image-type: pytorch10-tf27-rocm50
image-type:
- pytorch13-tf210-rocm56
- pytorch20-tf210-rocm56

- build-and-publish-docker:
name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-dev
Expand Down
187 changes: 154 additions & 33 deletions Dockerfile-default-rocm
Original file line number Diff line number Diff line change
@@ -1,40 +1,117 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

# Remove the packaged openmpi and ucx (presumably so the /container builds
# configured later in this file are the ones used - confirm).
RUN apt remove -y openmpi ucx
# Delete any existing /opt/ompi, then create /opt/ompi as a symlink to /container/ompi.
RUN rm -rf /opt/ompi
RUN ln -s /container/ompi /opt

# Create /var/run/sshd, drop the ROCm apt source list, then install the
# build/SSH/RDMA packages, run unattended-upgrade, clear the apt lists, and
# delete the SSH host key files.
# NOTE(review): two package lists appear back to back below (one with g++-4.8,
# one with g++ plus the verbs/rdmacm/numa/pmi2 packages). This looks like the
# removed and added sides of a rendered diff - confirm which list is live.
RUN mkdir -p /var/run/sshd
RUN rm /etc/apt/sources.list.d/rocm.list
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
autoconf \
automake \
autotools-dev \
build-essential \
ca-certificates \
curl \
daemontools \
libkrb5-dev \
libssl-dev \
libtool \
git \
krb5-user \
cmake \
g++-4.8 \
make \
openssh-client \
openssh-server \
pkg-config \
wget \
nfs-common \
unattended-upgrades \
&& unattended-upgrade \
&& rm -rf /var/lib/apt/lists/* \
&& rm /etc/ssh/ssh_host_ecdsa_key \
&& rm /etc/ssh/ssh_host_ed25519_key \
&& rm /etc/ssh/ssh_host_rsa_key
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
autoconf \
automake \
autotools-dev \
build-essential \
ca-certificates \
curl \
daemontools \
glibc-source \
ibverbs-providers \
libibverbs1 \
libkrb5-dev \
librdmacm1 \
libssl-dev \
libtool \
git \
krb5-user \
cmake \
g++ \
make \
openssh-client \
openssh-server \
pkg-config \
wget \
nfs-common \
libnuma1 \
libnuma-dev \
libpmi2-0-dev \
unattended-upgrades \
&& unattended-upgrade \
&& rm -rf /var/lib/apt/lists/* \
&& rm /etc/ssh/ssh_host_ecdsa_key \
&& rm /etc/ssh/ssh_host_ed25519_key \
&& rm /etc/ssh/ssh_host_rsa_key
# Upgrade pip itself. The earlier form repeated "pip install", which made pip
# treat "install" as an additional package name to fetch from the index.
RUN pip install --upgrade pip

# Stage the helper scripts that the RUN steps below invoke from /tmp.
COPY dockerfile_scripts /tmp/det_dockerfile_scripts

# Put the py_3.8 conda environment's bin directory first on PATH.
ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}"

# Snapshot PATH at this point; CONDA is read again by the "USING UCX" PATH
# fallback further down.
ARG CONDA="${PATH}"

ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0

# Install the fixed version of the FFI package for Ubuntu 20.04.
# This runs after the steps above so that the intended version is the one
# left installed.
RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh

# -y answers apt's confirmation prompt, which cannot be answered interactively
# during a docker build.
# NOTE(review): the ROCm apt source list and /var/lib/apt/lists were removed
# earlier in this file, so this presumably only succeeds if rocm-libs is already
# present in the base image or install_package_fixes.sh restores the lists - confirm.
RUN apt install -y rocm-libs

#USING OFI
# Build-time switches and install prefixes for the MPI / fabric stack.
# WITH_MPICH has no default, so it is empty unless passed with --build-arg.
ARG WITH_MPI=1
ARG WITH_OFI=1
ARG WITH_MPICH
ARG UCX_INSTALL_DIR=/container/ucx
ARG OMPI_INSTALL_DIR=/container/ompi
ARG MPICH_INSTALL_DIR=/container/mpich
ARG OFI_INSTALL_DIR=/container/ofi
ARG OMPI_WITH_CUDA=0
ARG OMPI_WITH_ROCM=1
# Run ompi_rocm.sh only when WITH_MPI is exactly "1".
# NOTE(review): UBUNTU_VERSION is not declared as an ARG in the visible part of
# this file; unless the base image sets it, the script's first argument is
# empty - confirm.
RUN if [ "$WITH_MPI" = "1" ]; then /tmp/det_dockerfile_scripts/ompi_rocm.sh "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; fi

# Make sure OMPI/UCX show up in the right paths
# Library and bin directories derived from the *_INSTALL_DIR build args above.
ARG VERBS_LIB_DIR=/usr/lib/libibverbs
ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64
ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin
ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64
ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin
ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib
ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin
ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib
ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin

# Set up UCX_LIBS and OFI_LIBS
# Both values end with ":" so they can be prepended directly to another path list.
ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:"
ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:"

# If WITH_OFI is true, then set EXTRA_LIBS to OFI libs, else set to empty string
# NOTE(review): ":+" tests for a non-empty value rather than for "1", so
# WITH_OFI=0 would presumably still select the OFI libs - confirm.
ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}"

# If EXTRA_LIBS is empty, set to UCX libs, else leave as OFI libs
ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}"

# But, only add them if WITH_MPI
ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH

#USING OFI
# With WITH_OFI set, PATH gains the OFI and MPICH bin dirs (only if WITH_MPI is
# also set). With WITH_OFI empty, this expands to an empty PATH on purpose; the
# next ENV line then rebuilds it.
ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}}

#USING UCX
# Only takes effect when the line above left PATH empty: PATH is restored from
# the CONDA snapshot, plus the UCX and OMPI bin dirs when WITH_MPI is set.
ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}

# Enable running OMPI as root
# Both variables are set to 1 when WITH_MPI is non-empty, and to empty otherwise.
ENV OMPI_ALLOW_RUN_AS_ROOT ${WITH_MPI:+1}
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM ${WITH_MPI:+1}



RUN pip install cloudpickle
# Install determined and then uninstall it again. pip uninstall leaves the
# dependencies in place (presumably the point of this step - confirm).
RUN pip install determined && pip uninstall -y determined
RUN pip install google-auth-oauthlib



# Install the notebook requirements from the staged scripts directory and point
# Jupyter at its config directory under /run/determined.
RUN pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt
ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config
Expand All @@ -48,7 +125,6 @@ RUN /tmp/det_dockerfile_scripts/install_google_cloud_sdk.sh
# google-api-python-client -> google-api-core -> googleapis-common-protos -> protobuf
# Horovod cannot build with protobuf > 3.20.x
# latest google-api-python-client requires protobuf >= 3.20.1
RUN pip install protobuf==3.20.1

# Optional TensorFlow install: skipped when TENSORFLOW_PIP is empty. The value
# is expanded unquoted, so it may carry several space-separated pip arguments.
ARG TENSORFLOW_PIP
RUN if [ "$TENSORFLOW_PIP" ]; then pip install $TENSORFLOW_PIP; fi
Expand All @@ -69,13 +145,58 @@ ARG HOROVOD_WITH_TENSORFLOW=1
ARG HOROVOD_WITH_PYTORCH=1
ARG HOROVOD_WITHOUT_MXNET=1
ARG HOROVOD_GPU_OPERATIONS=NCCL
ARG HOROVOD_WITHOUT_MPI=1
ARG HOROVOD_WITHOUT_MPI=0
ARG HOROVOD_WITH_MPI=1
ARG HOROVOD_GPU=ROCM
ARG HOROVOD_WITHOUT_MPI=1
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/hip/lib
# Append the ROCm library directories to LD_LIBRARY_PATH. Entries are separated
# by ":"; the earlier ";" fused the last existing entry and /opt/rocm/lib into a
# single bogus path. The ":+" form avoids a leading ":" (which would mean the
# current directory) when LD_LIBRARY_PATH is empty.
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}/opt/rocm/lib:/opt/rocm/hip/lib

# Re-export the Horovod build args as environment variables so they are
# available to the RUN steps below.
# NOTE(review): HOROVOD_PIP and HOROVOD_NCCL_HOME are not declared in the
# visible part of this file; presumably they are ARGs in the collapsed lines
# above - confirm.
ENV HOROVOD_PIP $HOROVOD_PIP
ENV HOROVOD_WITH_TENSORFLOW $HOROVOD_WITH_TENSORFLOW
ENV HOROVOD_WITH_PYTORCH $HOROVOD_WITH_PYTORCH
ENV HOROVOD_WITHOUT_MXNET $HOROVOD_WITHOUT_MXNET
ENV HOROVOD_GPU_OPERATIONS $HOROVOD_GPU_OPERATIONS
ENV HOROVOD_WITHOUT_MPI $HOROVOD_WITHOUT_MPI
ENV HOROVOD_WITH_MPI $HOROVOD_WITH_MPI
ENV HOROVOD_GPU $HOROVOD_GPU
ENV HOROVOD_NCCL_HOME $HOROVOD_NCCL_HOME
ENV NCCL_LIB_DIR=${HOROVOD_NCCL_HOME}/lib
# When WITH_OFI is non-empty: HOROVOD_NCCL_LINK is SHARED and the NCCL lib dir
# is prepended to LD_LIBRARY_PATH.
ENV HOROVOD_NCCL_LINK=${WITH_OFI:+SHARED}
ENV LD_LIBRARY_PATH=${WITH_OFI:+$NCCL_LIB_DIR:}$LD_LIBRARY_PATH

# Install Horovod unless HOROVOD_PIP is the literal "0".
# NOTE(review): an empty HOROVOD_PIP also passes this test and would run
# pip install "" - confirm that callers always pass a value.
RUN if [ "$HOROVOD_PIP" != "0" ]; then pip install "${HOROVOD_PIP}" ; fi

# NOTE(review): the rm below would delete /tmp/det_dockerfile_scripts before the
# requirements file under it is read two lines later; it is presumably a
# removed-side line of the rendered diff - confirm.
RUN rm -r /tmp/*
# Remove tb-nightly and tensorboardX, then install the ROCm-specific additional requirements.
RUN pip uninstall -y tb-nightly tensorboardX
RUN pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt


ENV HSA_FORCE_FINE_GRAIN_PCIE=1

# Build args for the AWS plugin build. WITH_AWS_TRACE, INTERNAL_AWS_DS and
# INTERNAL_AWS_PATH have no defaults, so they are empty unless passed in.
ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
ARG WITH_AWS_TRACE
ARG INTERNAL_AWS_DS
ARG INTERNAL_AWS_PATH
ARG ROCM_DIR=/opt/rocm
ENV ROCM_DIR $ROCM_DIR
# Run build_aws_rocm.sh only when WITH_OFI is exactly "1".
RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi
# Prepend the plugin install prefix itself (not a lib subdirectory) when WITH_OFI is non-empty.
ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH
RUN ldconfig

# NOTE(review): this prepends OFI_INSTALL_DIR (the install prefix) rather than
# OFI_PATH_DIR (its bin directory) - confirm that this is intended.
ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH
# Reset entrypoint.
ENTRYPOINT []

# Set an entrypoint that can scrape up the host libfabric.so and then
# run the user command. This is intended to enable performant execution
# on non-IB systems that have a proprietary libfabric.

# Copy scrape_libs.sh out of the staging directory so it survives the /tmp cleanup below.
RUN mkdir -p /container/bin && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin

# Export the RCCL and NFS-workaround switches into the image environment.
ARG WITH_RCCL=1
ENV WITH_RCCL=$WITH_RCCL
ARG WITH_NFS_WORKAROUND=1
ENV WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND

# scrape_libs.sh is the entrypoint; /bin/bash is the default command.
ENTRYPOINT ["/container/bin/scrape_libs.sh"]
CMD ["/bin/bash"]
USER root

# Clean out /tmp, including the staged helper scripts.
RUN rm -r /tmp/*
54 changes: 38 additions & 16 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ CUDA_111_PREFIX := $(REGISTRY_REPO):cuda-11.1-
CUDA_112_PREFIX := $(REGISTRY_REPO):cuda-11.2-
CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3-
CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8-
ROCM_50_PREFIX := $(REGISTRY_REPO):rocm-5.0-
ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6-

CPU_SUFFIX := -cpu
GPU_SUFFIX := -gpu
Expand Down Expand Up @@ -178,20 +178,38 @@ build-gpu-cuda-118-base:
-t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_118_BASE_NAME)-$(VERSION) \
.

export ROCM50_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_50_PREFIX)pytorch-1.10-tf-2.7-rocm
export TF_PROFILER_PIP := tensorboard-plugin-profile
export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1
# Image-name suffix for the ROCm 5.6 / PyTorch 1.13 environment. The MPI flavour
# in the name (mpich vs. ompi) follows WITH_MPICH.
# NOTE(review): the tag reads "pytorch-1.3" while the base image below is
# PyTorch 1.13.1. It is left unchanged here because the published tag name may
# be referenced elsewhere - confirm whether it should be "pytorch-1.13".
ifeq ($(WITH_MPICH),1)
ROCM56_TORCH13_MPI := pytorch-1.3-tf-2.10-rocm-mpich
else
ROCM56_TORCH13_MPI := pytorch-1.3-tf-2.10-rocm-ompi
endif
export ROCM56_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH13_MPI)

# Build the image from Dockerfile-default-rocm and tag it with both the short
# git hash and the version. Every continued line keeps a space before the
# trailing backslash so that adjacent arguments cannot be joined by the shell.
.PHONY: build-pytorch13-tf210-rocm56
build-pytorch13-tf210-rocm56:
	docker build -f Dockerfile-default-rocm \
		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_1.13.1" \
		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
		--build-arg HOROVOD_PIP="horovod==0.28.1" \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
		.

.PHONY: build-pytorch10-tf27-rocm50
build-pytorch10-tf27-rocm50:
# Image-name suffix for the ROCm 5.6 / PyTorch 2.0 environment. The MPI flavour
# in the name (mpich vs. ompi) follows WITH_MPICH.
ifeq ($(WITH_MPICH),1)
ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
else
ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
endif
export ROCM56_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH_MPI)
# NOTE(review): the recipe below carries two sets of arguments (a rocm5.0 /
# ROCM50_* set followed by a rocm5.6 / ROCM56_* set). This looks like the
# removed and added sides of a rendered diff - confirm which set is live.
.PHONY: build-pytorch20-tf210-rocm56
build-pytorch20-tf210-rocm56:
docker build -f Dockerfile-default-rocm \
--build-arg BASE_IMAGE="amdih/pytorch:rocm5.0_ubuntu18.04_py3.7_pytorch_1.10.0" \
--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.7.1" \
--build-arg TF_PROFILER_PIP="$(TF_PROFILER_PIP)" \
--build-arg HOROVOD_PIP="horovod==0.25.0" \
-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1" \
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
--build-arg HOROVOD_PIP="horovod==0.28.1" \
--build-arg WITH_MPICH=$(WITH_MPICH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
.

DEEPSPEED_VERSION := 0.8.3
Expand Down Expand Up @@ -520,9 +538,13 @@ ifneq ($(NGC_PUBLISH),)
scripts/publish-docker.sh tf28-gpu-$(WITH_MPI) $(NGC_REGISTRY)/$(GPU_TF28_ENVIRONMENT_NAME) $(SHORT_GIT_HASH) $(VERSION)
endif

.PHONY: publish-pytorch10-tf27-rocm50
publish-pytorch10-tf27-rocm50:
scripts/publish-docker.sh pytorch10-tf27-rocm50-$(WITH_MPI) $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME) $(SHORT_GIT_HASH) $(VERSION) $(ARTIFACTS_DIR)
# Publish the ROCm 5.6 / PyTorch 1.13 image through scripts/publish-docker.sh.
# The label passed as the first argument is this target's own name with the
# "publish-" prefix removed, followed by -$(WITH_MPI).
.PHONY: publish-pytorch13-tf210-rocm56
publish-pytorch13-tf210-rocm56:
	scripts/publish-docker.sh $(patsubst publish-%,%,$@)-$(WITH_MPI) \
		$(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME) \
		$(SHORT_GIT_HASH) $(VERSION) $(ARTIFACTS_DIR)

# Publish the ROCm 5.6 / PyTorch 2.0 image through scripts/publish-docker.sh.
# The label passed as the first argument is this target's own name with the
# "publish-" prefix removed, followed by -$(WITH_MPI).
.PHONY: publish-pytorch20-tf210-rocm56
publish-pytorch20-tf210-rocm56:
	scripts/publish-docker.sh $(patsubst publish-%,%,$@)-$(WITH_MPI) \
		$(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME) \
		$(SHORT_GIT_HASH) $(VERSION) $(ARTIFACTS_DIR)

.PHONY: publish-cloud-images
publish-cloud-images:
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.26.4
0.26.5
16 changes: 16 additions & 0 deletions dockerfile_scripts/additional-requirements-rocm.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
attrdict3
pandas
matplotlib
tensorflow-datasets==1.3.2
Keras-Preprocessing[image]
# TODO(DET-4259) Remove this when we fix the circular dependency with the main repo.
petname
azure-storage-blob
Pillow>=8.3.2,<=9.5.0
analytics-python
nvidia-ml-py
protobuf<=3.20.3
tensorboard==2.10.1
pynvml
tokenizers==0.13.0
huggingface-hub==0.16.4
Loading

0 comments on commit 7287f56

Please sign in to comment.