Skip to content

Commit

Permalink
Merge branch 'main' into tylertitsworth/infra-contrib-updates
Browse files Browse the repository at this point in the history
  • Loading branch information
Tyler Titsworth authored Jun 28, 2024
2 parents e4913d3 + 6006b33 commit c723e99
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 12 deletions.
59 changes: 48 additions & 11 deletions pytorch/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,7 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin
gcc \
libgl1-mesa-glx \
libglib2.0-0 \
virtualenv && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
virtualenv

ENV SIGOPT_PROJECT=.

Expand All @@ -91,24 +89,63 @@ COPY multinode-requirements.txt .

RUN python -m pip install --no-cache-dir -r multinode-requirements.txt

ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"

# Install the OpenSSH client and server.
# The package index is refreshed in this same layer so the install does not
# depend on lists left behind (or already deleted) by an earlier layer.
# Host keys present after the install are deleted within the same RUN, so no
# private host key is stored in the image; generate_ssh_keys.sh creates any
# missing host key later.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
        openssh-client \
        openssh-server && \
    rm /etc/ssh/ssh_host_*_key \
        /etc/ssh/ssh_host_*_key.pub && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Allow OpenSSH to talk to containers without asking for confirmation
# hadolint global ignore=SC2002
# Creates /var/run/sshd, then rebuilds /etc/ssh/ssh_config: every existing
# StrictHostKeyChecking line is filtered out, one "StrictHostKeyChecking no"
# entry is appended, and the new file replaces the original.
RUN mkdir -p /var/run/sshd && \
    grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

# Build argument (no default declared here) used to locate the Python
# dist-packages directory in the instructions that follow.
ARG PYTHON_VERSION
# Append a line to root's ~/.bashrc that sources the oneCCL bindings
# environment script.
RUN printf '%s\n' "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.bashrc

# Stage the host-key helper at an absolute path so the RUN below finds it
# regardless of the WORKDIR in effect at this point of the stage.
COPY generate_ssh_keys.sh /generate_ssh_keys.sh

# Build ~/.startup, the file sourced by the entrypoint wrapper script:
#   1. a line that sources the oneCCL bindings environment (setvars.sh)
#   2. the contents of generate_ssh_keys.sh, which creates missing SSH host keys
# The staged copy of the helper is removed once its contents have been appended.
RUN echo "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \
    cat /generate_ssh_keys.sh >> ~/.startup && \
    rm -f /generate_ssh_keys.sh

# Point the MPI / oneCCL related variables at the oneccl_bindings_for_pytorch
# package inside dist-packages.
# "${VAR:+${VAR}:}" expands to "<old value>:" only when VAR is already set and
# non-empty; otherwise it expands to nothing. This keeps any inherited value in
# front of the new entry but avoids a leading ":" (an empty path element) when
# the variable was not set by the base image.
# NOTE(review): I_MPI_ROOT and CCL_ROOT look like single-directory settings
# rather than path lists - confirm that appending to an inherited value is intended.
ENV I_MPI_ROOT="${I_MPI_ROOT:+${I_MPI_ROOT}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch"
ENV CCL_ROOT="${CCL_ROOT:+${CCL_ROOT}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch"
ENV FI_PROVIDER_PATH="${FI_PROVIDER_PATH:+${FI_PROVIDER_PATH}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov"
ENV LIBRARY_PATH="${LIBRARY_PATH:+${LIBRARY_PATH}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
ENV PATH="${PATH:+${PATH}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/bin"
ENV CPATH="${CPATH:+${CPATH}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/include"
# hadolint global ignore=SC3037
# Generate the entrypoint wrapper script and make it executable.
# printf is used instead of `echo -e` because `echo -e` is not portable: a
# POSIX sh such as dash prints the "-e" literally, which would corrupt the
# shebang line. printf writes each argument on its own line in any shell.
# The generated script:
#   - sources ~/.startup between `set -a` / `set +a`, so variables assigned
#     while sourcing are exported
#   - runs the container arguments through `eval`; eval joins the arguments
#     and re-parses them, so quoting of multi-word arguments is not preserved
#   - ends with `tail -f /dev/null`, which blocks after the command returns
RUN printf '%s\n' \
        '#!/bin/bash' \
        'set -e' \
        'set -a' \
        'source ~/.startup' \
        'set +a' \
        'eval "$@"' \
        'tail -f /dev/null' \
        >> /usr/local/bin/dockerd-entrypoint.sh && \
    chmod +x /usr/local/bin/dockerd-entrypoint.sh

# Write the sshd configuration to /var/run/sshd_config.
# All lines are emitted by one printf through a single redirection. Writing
# each line with its own `>` truncates the file every time and leaves only the
# last line (the sftp Subsystem entry) in the result.
RUN printf '%s\n' \
        'HostKey /etc/ssh/ssh_host_dsa_key' \
        'HostKey /etc/ssh/ssh_host_rsa_key' \
        'HostKey /etc/ssh/ssh_host_ecdsa_key' \
        'HostKey /etc/ssh/ssh_host_ed25519_key' \
        'AuthorizedKeysFile /etc/ssh/authorized_keys' \
        '## Enable DEBUG log. You can ignore this but this may help you debug any issue while enabling SSHD for the first time' \
        'LogLevel DEBUG3' \
        'UsePAM yes' \
        'Subsystem sftp /usr/lib/openssh/sftp-server' \
        > /var/run/sshd_config

# Create /licensing and download the third-party notices and license text into it.
# NOTE(review): --no-check-certificate turns off TLS certificate verification
# for these downloads, and two of the three URLs follow the moving `master`
# branch while the first is pinned to a commit - confirm both are intended.
RUN mkdir -p /licensing && \
    wget -q --no-check-certificate -O /licensing/oneccl_third_party_programs.txt https://raw.githubusercontent.com/oneapi-src/oneCCL/b7d66de16e17f88caffd7c6df4cd5e12b266af84/third-party-programs.txt && \
    wget -q --no-check-certificate -O /licensing/third-party-programs-pytorch.txt https://raw.githubusercontent.com/intel/neural-compressor/master/docker/third-party-programs-pytorch.txt && \
    wget -q --no-check-certificate -O /licensing/LICENSE https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE

# Every container start goes through the wrapper script; CMD supplies "bash"
# as the default argument and is replaced by any command given to `docker run`.
ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
CMD ["bash"]

FROM ${PYTHON_BASE} AS ipex-xpu-base

RUN apt-get update && \
Expand Down
95 changes: 94 additions & 1 deletion pytorch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ docker run -it --rm \
--net=host \
-v $PWD/workspace:/workspace \
-w /workspace \
intel/intel-extension-for-tensorflow:xpu-jupyter
intel/intel-extension-for-pytorch:xpu-jupyter
```

After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server.
Expand All @@ -113,6 +113,99 @@ The images below additionally include [Intel® oneAPI Collective Communications
| `2.1.0-pip-mulitnode` | [v2.1.0] | [v2.1.0+cpu] | [v2.1.0][ccl-v2.1.0] | [v2.3.1] | [v0.2.3] |
| `2.0.0-pip-multinode` | [v2.0.0] | [v2.0.0+cpu] | [v2.0.0][ccl-v2.0.0] | [v2.1.1] | [v0.1.0] |

> **Note:** Passwordless SSH connection is also enabled in the image.
> The container does not contain the SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/root/.ssh/id_rsa.pub`.
> Users also need to append the contents of `id_rsa.pub` to `/etc/ssh/authorized_keys` in the SSH server container.
> Since the SSH key is not owned by default user account in docker, please also do "chmod 644 id_rsa.pub; chmod 644 id_rsa" to grant read access for default user account.
> Users could also use "/usr/bin/ssh-keygen -t rsa -b 4096 -N '' -f ~/mnt/ssh_key/id_rsa" to generate a new SSH Key inside the container.
> Users need to mount a config file to list all hostnames at location `/root/.ssh/config` on the SSH client container.
> Once all files are added
#### Setup and Run IPEX Multi-Node Container

Some additional assembly is required to utilize this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, containers are assigned the roles of launcher and worker respectively:

SSH Server (Worker)

1. *Authorized Keys* : `/etc/ssh/authorized_keys`

SSH Client (Launcher)

1. *Config File with Host IPs* : `/root/.ssh/config`
2. *Private User Key* : `/root/.ssh/id_rsa`

To add these files correctly please follow the steps described below.

1. Setup ID Keys

You can use the commands provided below to [generate the Identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH.

```bash
ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa
touch authorized_keys
cat id_rsa.pub >> authorized_keys
```

2. Add hosts to config

The launcher container needs to have a config file with all hostnames and ports specified. An example of a hostfile is provided below.

```bash
touch config
```

```txt
Host host1
HostName <Hostname of host1>
IdentitiesOnly yes
Port <SSH Port>
Host host2
HostName <Hostname of host2>
IdentitiesOnly yes
Port <SSH Port>
...
```

3. Configure the permissions and ownership for all of the files you have created so far.

```bash
chmod 600 id_rsa.pub id_rsa config authorized_keys
chown root:root id_rsa.pub id_rsa config authorized_keys
```

4. Now start the workers and execute DDP on the launcher.

1. Worker run command:

```bash
export SSH_PORT=<SSH Port>
docker run -it --rm \
--net=host \
-v $PWD/authorized_keys:/root/.ssh/authorized_keys \
-v $PWD/tests:/workspace/tests \
-w /workspace \
-e SSH_PORT=${SSH_PORT} \
intel/intel-extension-for-pytorch:2.3.0-pip-multinode \
bash -c '/usr/sbin/sshd -D -p ${SSH_PORT} -f /var/run/sshd_config'
```

2. Launcher run command:

```bash
docker run -it --rm \
--net=host \
-v $PWD/id_rsa:/root/.ssh/id_rsa \
-v $PWD/config:/root/.ssh/config \
-v $PWD/tests:/workspace/tests \
-w /workspace \
-e SSH_PORT=${SSH_PORT} \
intel/intel-extension-for-pytorch:2.3.0-pip-multinode \
bash -c 'ipexrun cpu /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl'
```

> [!NOTE]
> [Intel MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network.

---

The images below are [TorchServe*] with CPU Optimizations:
Expand Down
28 changes: 28 additions & 0 deletions pytorch/generate_ssh_keys.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

# Create the SSH host key for one algorithm unless the key file already exists.
# Arguments:
#   $1 - key algorithm, passed to `ssh-keygen -t` (e.g. rsa, ed25519)
# The key is written to /etc/ssh/ssh_host_<algorithm>_key with an empty
# passphrase (-N "").
function gen_single_key() {
  # `local` keeps these names out of the calling shell: when this file is
  # sourced, a plain assignment would remain set (and, under `set -a`,
  # exported) after the function returns.
  local alg_name=$1
  local key_path="/etc/ssh/ssh_host_${alg_name}_key"
  if [[ ! -f "${key_path}" ]]; then
    ssh-keygen -q -N "" -t "${alg_name}" -f "${key_path}"
  fi
}

# Ensure a host key exists for each algorithm below; gen_single_key does
# nothing when the corresponding key file is already present.
# NOTE(review): DSA has been deprecated in recent OpenSSH releases - confirm
# that the ssh-keygen shipped in the image still accepts `-t dsa`.
gen_single_key dsa
gen_single_key rsa
gen_single_key ecdsa
gen_single_key ed25519

0 comments on commit c723e99

Please sign in to comment.