-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathDockerfile.neuronx
248 lines (214 loc) · 8.6 KB
/
Dockerfile.neuronx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
FROM public.ecr.aws/docker/library/ubuntu:20.04
LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"
# Neuron SDK components version numbers
ARG NEURONX_DISTRIBUTED_VERSION=0.9.0
ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.0.1
ARG NEURONX_CC_VERSION=2.15.143.0
ARG NEURONX_FRAMEWORK_VERSION=2.1.2.2.3.2
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.22.33.0-d2128d1aa
ARG NEURONX_RUNTIME_LIB_VERSION=2.22.19.0-5856c0b42
ARG NEURONX_TOOLS_VERSION=2.19.0.0
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3
ARG OMPI_VERSION=4.1.5
# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20
ARG DEBIAN_FRONTEND=noninteractive
# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
ENV PATH /opt/aws/neuron/bin/:$PATH
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
ENV DGLBACKEND=pytorch
RUN apt-get update \
&& apt-get upgrade -y \
&& apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
curl \
emacs \
git \
jq \
libopencv-dev \
openjdk-8-jdk-headless \
openjdk-8-jdk \
openjdk-8-jre \
libglib2.0-0 \
libgl1-mesa-glx \
libsm6 \
libxext6 \
libxrender-dev \
openjdk-11-jdk \
software-properties-common \
wget \
unzip \
vim \
zlib1g-dev \
openssl \
libssl-dev \
libsqlite3-dev \
libgdbm-dev \
libc6-dev \
libbz2-dev \
libncurses-dev \
tk-dev \
libffi-dev \
libcap-dev \
gnupg2 \
gpg-agent \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
RUN apt-get update \
&& apt-get install -y \
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean
# Install Open MPI
RUN mkdir -p /tmp/openmpi \
&& cd /tmp/openmpi \
&& wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
&& tar zxf openmpi-${OMPI_VERSION}.tar.gz \
&& cd openmpi-${OMPI_VERSION} \
&& ./configure --enable-orterun-prefix-by-default \
&& make -j $(nproc) all \
&& make install \
&& ldconfig \
&& rm -rf /tmp/openmpi
# install Python
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
&& tar -xzf Python-$PYTHON_VERSION.tgz \
&& cd Python-$PYTHON_VERSION \
&& ./configure --enable-shared --prefix=/usr/local \
&& make -j $(nproc) && make install \
&& cd .. && rm -rf ../Python-$PYTHON_VERSION* \
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
pip \
setuptools
WORKDIR /
# The ENV variables declared below are changed in the previous section
# Grouping these ENV variables in the first section causes
# ompi_info to fail. This is only observed in CPU containers
ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
RUN ${PIP} install --no-cache-dir -U \
"bokeh>=2.3,<3" \
"awscli<2" \
scipy \
click \
"cryptography" \
"sagemaker>=2,<2.184" \
"sagemaker-pytorch-training" \
psutil==5.6.7 \
dataset \
transformers==4.36.2 \
Pillow
RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
## Installation for Neuronx Distributed Training framework
# Install Cython & wheel
RUN ${PIP} install --no-cache-dir Cython \
&& ${PIP} install --no-cache-dir wheel
# Copy the apex_setup.py file
COPY apex_setup.py /root/apex_setup.py
# Clone and build Apex
RUN git clone https://github.com/NVIDIA/apex.git /root/apex \
&& cd /root/apex \
&& git checkout 23.05 \
&& cp /root/apex_setup.py setup.py \
&& python3 setup.py bdist_wheel
#Install dependencies from requirements and extras for SageMaker usecase
RUN wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/master/requirements.txt \
&& ${PIP} install --no-cache-dir -r requirements.txt /root/apex/dist/apex-0.1-py3-none-any.whl \
&& ${PIP} install --force-reinstall "multiprocess==0.70.16" \
"dill==0.3.8" \
"torch==2.1.2"
RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
# awscli 1.25.47 has requirement docutils<0.17,>=0.10
# etcd for kubernetes installation
# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
RUN ${PIP} install --no-cache-dir -U \
"attrs<24,>=23.1.0" \
"protobuf>=3.18.3,<=3.20.3" \
"docutils>=0.10,<0.17" \
"rsa<4.8,>=3.1.2" \
"python-etcd" \
"urllib3>=1.26.0,<1.27"
# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
RUN pip install --no-cache-dir -U \
"bokeh>=3.0.1,<4" \
"imageio>=2.22,<3" \
"opencv-python>=4.8.1.78" \
"plotly>=5.11,<6" \
"seaborn>=0.12,<1" \
"shap>=0.41,<1"
# EFA Installer does apt get. Make sure to run apt update before that
RUN apt-get update
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
&& cat aws-efa-installer.key | gpg --fingerprint \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
&& tar -xf aws-efa-installer-latest.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& cd $HOME
# Clean up after apt update
RUN rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean
# Install some common packages used by training scripts
# torchvision needed for MLP. since it depends on torch and torch neuron/torch
# is already installed install it with nodeps
RUN pip3 install --no-cache-dir --no-deps -U \
torchvision==0.16.*
# Needed for running bert training scripts
RUN pip3 install --no-cache-dir -U \
graphviz \
tensorboard==2.6 \
accelerate \
sentencepiece!=0.1.92 \
h5py \
requests
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
&& chmod +x /usr/local/bin/deep_learning_container.py
RUN HOME_DIR=/root \
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
&& chmod +x /usr/local/bin/testOSSCompliance \
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
&& rm -rf ${HOME_DIR}/oss_compliance* \
&& rm -rf /tmp/tmp*
RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.1/license.txt
# Starts framework
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
CMD ["/bin/bash"]