Pad-8: Upgrade to ROCM 5.6 and add multi-node support. (#230)

* Add docker_scripts; updated Makefile and Dockerfile with support for multi-node execution. * Add docker_scripts. * Large change in scrape_libs.sh to fix issues when multiple libfabric.so libs are present; also fixed missing Python libs. * Created clean branch with all the changes needed for ROCm 5.6 multi-node execution. * Bumped version; updated CircleCI config targets. * Removed a few extraneous comments.
determined-ai · Jan 4, 2024 · 7287f56 · 7287f56
1 parent 622d512
commit 7287f56
Show file tree

Hide file tree

Showing 8 changed files with 487 additions and 64 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -195,10 +195,13 @@ workflows:
                 - tf28-gpu
                 - pt-gpu
                 - pt2-gpu
-                - pytorch10-tf27-rocm50
+                - pytorch13-tf210-rocm56
+                - pytorch20-tf210-rocm56
             exclude:
               - with-mpi: 1
-                image-type: pytorch10-tf27-rocm50
+                # An exclude entry matches one scalar value per matrix parameter,
+                # so each ROCm image type gets its own entry.
+                image-type: pytorch13-tf210-rocm56
+              - with-mpi: 1
+                image-type: pytorch20-tf210-rocm56
       - build-and-publish-docker:
           name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>
           context: determined-production
@@ -257,11 +260,14 @@ workflows:
                 - tf28-gpu
                 - pt-gpu
                 - pt2-gpu
-                - pytorch10-tf27-rocm50
+                - pytorch13-tf210-rocm56
+                - pytorch20-tf210-rocm56
             exclude:
               - dev-mode: true
                 with-mpi: 1
-                image-type: pytorch10-tf27-rocm50
+                # An exclude entry matches one scalar value per matrix parameter,
+                # so each ROCm image type gets its own entry.
+                image-type: pytorch13-tf210-rocm56
+              - dev-mode: true
+                with-mpi: 1
+                image-type: pytorch20-tf210-rocm56
 
       - build-and-publish-docker:
           name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-dev

diff --git a/Dockerfile-default-rocm b/Dockerfile-default-rocm
@@ -1,40 +1,117 @@
 ARG BASE_IMAGE
 FROM ${BASE_IMAGE}
 
+RUN apt remove -y openmpi ucx
+# Remove the existing /opt/ompi and replace it with a symlink to our build in /container/ompi.
+RUN rm -rf /opt/ompi 
+RUN ln -s /container/ompi /opt 
+
 RUN mkdir -p /var/run/sshd
 RUN rm /etc/apt/sources.list.d/rocm.list
 RUN apt-get update \
-	&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-		autoconf \
-		automake \
-		autotools-dev \
-		build-essential \
-		ca-certificates \
-		curl \
-		daemontools \
-		libkrb5-dev \
-		libssl-dev \
-		libtool \
-		git \
-		krb5-user \
-		cmake \
-		g++-4.8 \
-		make \
-		openssh-client \
-		openssh-server \
-		pkg-config \
-		wget \
-		nfs-common \
-		unattended-upgrades \
-	&& unattended-upgrade \
-	&& rm -rf /var/lib/apt/lists/* \
-	&& rm /etc/ssh/ssh_host_ecdsa_key \
-	&& rm /etc/ssh/ssh_host_ed25519_key \
-	&& rm /etc/ssh/ssh_host_rsa_key
+        && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+                autoconf \
+                automake \
+                autotools-dev \
+                build-essential \
+                ca-certificates \
+                curl \
+                daemontools \
+                glibc-source \
+                ibverbs-providers \
+                libibverbs1 \
+                libkrb5-dev \
+                librdmacm1 \
+                libssl-dev \
+                libtool \
+                git \
+                krb5-user \
+                cmake \
+                g++ \
+                make \
+                openssh-client \
+                openssh-server \
+                pkg-config \
+                wget \
+                nfs-common \
+                libnuma1 \
+                libnuma-dev \
+                libpmi2-0-dev \
+                unattended-upgrades \
+        && unattended-upgrade \
+        && rm -rf /var/lib/apt/lists/* \
+        && rm /etc/ssh/ssh_host_ecdsa_key \
+        && rm /etc/ssh/ssh_host_ed25519_key \
+        && rm /etc/ssh/ssh_host_rsa_key
+# Upgrade pip itself; the duplicated "pip install" made pip look for a package named "install".
+RUN pip install --upgrade pip
 
 COPY dockerfile_scripts /tmp/det_dockerfile_scripts
 
+ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}"
+
+ARG CONDA="${PATH}"
+
+ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0
+
+# Install fixed version of FFI package for Ubuntu 20.04.
+# This is done after above stuff to make sure we get right version.
+RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh
+
+# -y keeps the install non-interactive: a docker build cannot answer apt's confirmation prompt.
+RUN apt-get install -y rocm-libs
+
+#USING OFI
+ARG WITH_MPI=1
+ARG WITH_OFI=1
+ARG WITH_MPICH
+ARG UCX_INSTALL_DIR=/container/ucx
+ARG OMPI_INSTALL_DIR=/container/ompi
+ARG MPICH_INSTALL_DIR=/container/mpich
+ARG OFI_INSTALL_DIR=/container/ofi
+ARG OMPI_WITH_CUDA=0
+ARG OMPI_WITH_ROCM=1
+RUN if [ "$WITH_MPI" = "1" ]; then /tmp/det_dockerfile_scripts/ompi_rocm.sh "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; fi
+
+# Make sure OMPI/UCX show up in the right paths
+ARG VERBS_LIB_DIR=/usr/lib/libibverbs
+ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64
+ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin
+ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64
+ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin
+ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib
+ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin
+ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib
+ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin
+
+# Set up UCX_LIBS and OFI_LIBS
+ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:"
+ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:"
+
+# If WITH_OFI is true, then set EXTRA_LIBS to OFI libs, else set to empty string
+ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}"
+
+# If EXTRA_LIBS is empty, set to UCX libs, else leave as OFI libs
+ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}"
+
+# But, only add them if WITH_MPI
+ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH
+
+#USING OFI
+ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}}
+
+#USING UCX
+ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}
+
+# Enable running OMPI as root
+ENV OMPI_ALLOW_RUN_AS_ROOT ${WITH_MPI:+1}
+ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM ${WITH_MPI:+1}
+
+
+
+RUN pip install cloudpickle
 RUN pip install determined && pip uninstall -y determined
+RUN pip install google-auth-oauthlib
+
+
 
 RUN pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt
 ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config
@@ -48,7 +125,6 @@ RUN /tmp/det_dockerfile_scripts/install_google_cloud_sdk.sh
 # google-api-python-client -> google-api-core -> googleapis-common-protos -> protobuf
 # Horovod cannot build with protobuf > 3.20.x
 # latest google-api-python-client requires protobuf >= 3.20.1
-RUN pip install protobuf==3.20.1
 
 ARG TENSORFLOW_PIP
 RUN if [ "$TENSORFLOW_PIP" ]; then pip install $TENSORFLOW_PIP; fi
@@ -69,13 +145,58 @@ ARG HOROVOD_WITH_TENSORFLOW=1
 ARG HOROVOD_WITH_PYTORCH=1
 ARG HOROVOD_WITHOUT_MXNET=1
 ARG HOROVOD_GPU_OPERATIONS=NCCL
-ARG HOROVOD_WITHOUT_MPI=1
+ARG HOROVOD_WITHOUT_MPI=0
+ARG HOROVOD_WITH_MPI=1
 ARG HOROVOD_GPU=ROCM
-ARG HOROVOD_WITHOUT_MPI=1
-ENV LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/hip/lib
+# Append the ROCm runtime libraries. Entries are ':'-separated; the ':+' form avoids a leading
+# empty entry when LD_LIBRARY_PATH is unset.
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}/opt/rocm/lib:/opt/rocm/hip/lib
+
+ENV HOROVOD_PIP $HOROVOD_PIP
+ENV HOROVOD_WITH_TENSORFLOW $HOROVOD_WITH_TENSORFLOW
+ENV HOROVOD_WITH_PYTORCH $HOROVOD_WITH_PYTORCH
+ENV HOROVOD_WITHOUT_MXNET $HOROVOD_WITHOUT_MXNET
+ENV HOROVOD_GPU_OPERATIONS $HOROVOD_GPU_OPERATIONS
+ENV HOROVOD_WITHOUT_MPI $HOROVOD_WITHOUT_MPI
+ENV HOROVOD_WITH_MPI $HOROVOD_WITH_MPI
+ENV HOROVOD_GPU $HOROVOD_GPU
+ENV HOROVOD_NCCL_HOME $HOROVOD_NCCL_HOME
+ENV NCCL_LIB_DIR=${HOROVOD_NCCL_HOME}/lib
+ENV HOROVOD_NCCL_LINK=${WITH_OFI:+SHARED}
+ENV LD_LIBRARY_PATH=${WITH_OFI:+$NCCL_LIB_DIR:}$LD_LIBRARY_PATH
+
 RUN if [ "$HOROVOD_PIP" != "0" ]; then pip install "${HOROVOD_PIP}" ; fi
 
-RUN rm -r /tmp/*
+RUN pip uninstall -y tb-nightly tensorboardX
+RUN pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt
+
+
+ENV HSA_FORCE_FINE_GRAIN_PCIE=1
+
+ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
+ARG WITH_AWS_TRACE
+ARG INTERNAL_AWS_DS
+ARG INTERNAL_AWS_PATH
+ARG ROCM_DIR=/opt/rocm
+ENV ROCM_DIR $ROCM_DIR
+RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi
+ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH
+RUN ldconfig
+
+ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH
 # Reset entrypoint.
-ENTRYPOINT []
+
+# Set an entrypoint that can scrape up the host libfabric.so and then 
+# run the user command. This is intended to enable performant execution
+# on non-IB systems that have a proprietary libfabric.
+
+RUN mkdir -p /container/bin && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin
+
+ARG WITH_RCCL=1
+ENV WITH_RCCL=$WITH_RCCL
+ARG WITH_NFS_WORKAROUND=1
+ENV WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND
+
+ENTRYPOINT ["/container/bin/scrape_libs.sh"]
+CMD ["/bin/bash"]
 USER root
+
+RUN rm -r /tmp/*
diff --git a/Makefile b/Makefile
@@ -15,7 +15,7 @@ CUDA_111_PREFIX := $(REGISTRY_REPO):cuda-11.1-
 CUDA_112_PREFIX := $(REGISTRY_REPO):cuda-11.2-
 CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3-
 CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8-
-ROCM_50_PREFIX := $(REGISTRY_REPO):rocm-5.0-
+ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6-
 
 CPU_SUFFIX := -cpu
 GPU_SUFFIX := -gpu
@@ -178,20 +178,38 @@ build-gpu-cuda-118-base:
 		-t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_118_BASE_NAME)-$(VERSION) \
 		.
 
-export ROCM50_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_50_PREFIX)pytorch-1.10-tf-2.7-rocm
-export TF_PROFILER_PIP := tensorboard-plugin-profile
-export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1
+# Tag suffix for the ROCm 5.6 image built on the PyTorch 1.13.1 base:
+# WITH_MPICH=1 selects the "-mpich" suffix, anything else selects "-ompi".
+# NOTE(review): the tag says "pytorch-1.3" while the base image is PyTorch 1.13.1;
+# confirm before renaming, since published image names depend on this string.
+ifeq ($(WITH_MPICH),1)
+ROCM56_TORCH13_MPI := pytorch-1.3-tf-2.10-rocm-mpich
+else
+ROCM56_TORCH13_MPI := pytorch-1.3-tf-2.10-rocm-ompi
+endif
+export ROCM56_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH13_MPI)
+.PHONY: build-pytorch13-tf210-rocm56
+build-pytorch13-tf210-rocm56:
+	docker build -f Dockerfile-default-rocm \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_1.13.1" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="horovod==0.28.1" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
+		.
 
-.PHONY: build-pytorch10-tf27-rocm50
-build-pytorch10-tf27-rocm50:
+# Tag suffix for the ROCm 5.6 image built on the PyTorch 2.0.1 base:
+# WITH_MPICH=1 selects the "-mpich" suffix, anything else selects "-ompi".
+ifeq ($(WITH_MPICH),1)
+ROCM56_TORCH_MPI := pytorch-2.0-tf-2.10-rocm-mpich
+else
+ROCM56_TORCH_MPI := pytorch-2.0-tf-2.10-rocm-ompi
+endif
+export ROCM56_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH_MPI)
+.PHONY: build-pytorch20-tf210-rocm56
+build-pytorch20-tf210-rocm56:
 	docker build -f Dockerfile-default-rocm \
-		--build-arg BASE_IMAGE="amdih/pytorch:rocm5.0_ubuntu18.04_py3.7_pytorch_1.10.0" \
-		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
-		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.7.1" \
-		--build-arg TF_PROFILER_PIP="$(TF_PROFILER_PIP)" \
-		--build-arg HOROVOD_PIP="horovod==0.25.0" \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="horovod==0.28.1" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
 		.
 
 DEEPSPEED_VERSION := 0.8.3
@@ -520,9 +538,13 @@ ifneq ($(NGC_PUBLISH),)
 	scripts/publish-docker.sh tf28-gpu-$(WITH_MPI) $(NGC_REGISTRY)/$(GPU_TF28_ENVIRONMENT_NAME) $(SHORT_GIT_HASH) $(VERSION)
 endif
 
-.PHONY: publish-pytorch10-tf27-rocm50
-publish-pytorch10-tf27-rocm50:
-	scripts/publish-docker.sh pytorch10-tf27-rocm50-$(WITH_MPI) $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME) $(SHORT_GIT_HASH) $(VERSION) $(ARTIFACTS_DIR)
+.PHONY: publish-pytorch13-tf210-rocm56
+publish-pytorch13-tf210-rocm56:
+	scripts/publish-docker.sh pytorch13-tf210-rocm56-$(WITH_MPI) $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME) $(SHORT_GIT_HASH) $(VERSION) $(ARTIFACTS_DIR)
+
+.PHONY: publish-pytorch20-tf210-rocm56
+publish-pytorch20-tf210-rocm56:
+	scripts/publish-docker.sh pytorch20-tf210-rocm56-$(WITH_MPI) $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME) $(SHORT_GIT_HASH) $(VERSION) $(ARTIFACTS_DIR)
 
 .PHONY: publish-cloud-images
 publish-cloud-images:

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.26.4
+0.26.5
diff --git a/dockerfile_scripts/additional-requirements-rocm.txt b/dockerfile_scripts/additional-requirements-rocm.txt
@@ -0,0 +1,16 @@
+attrdict3
+pandas
+matplotlib
+tensorflow-datasets==1.3.2
+Keras-Preprocessing[image]
+# TODO(DET-4259) Remove this when we fix the circular dependency with the main repo.
+petname
+azure-storage-blob
+Pillow>=8.3.2,<=9.5.0
+analytics-python
+nvidia-ml-py
+protobuf<=3.20.3
+tensorboard==2.10.1
+pynvml 
+tokenizers==0.13.0
+huggingface-hub==0.16.4