From e4da724e4a244c98b6bb13f77b16a8d8a80a6029 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Thu, 30 Jan 2025 23:17:06 +0000 Subject: [PATCH] update NeMo framework examples to 24.12 --- .../a3-highgpu-8g/nemo-framework/Dockerfile | 2 +- .../machine-learning/a3-highgpu-8g/nemo-framework/README.md | 4 ++-- .../a3-highgpu-8g/nemo-framework/setup_nemo.sh | 2 +- .../a3-megagpu-8g/nemo-framework/Dockerfile | 2 +- .../machine-learning/a3-megagpu-8g/nemo-framework/README.md | 6 +++--- .../a3-megagpu-8g/nemo-framework/setup_nemo.sh | 2 +- .../a3-ultragpu-8g/nemo-framework/Dockerfile | 2 +- .../a3-ultragpu-8g/nemo-framework/README.md | 6 +++--- .../a3-ultragpu-8g/nemo-framework/setup_nemo.sh | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile b/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile index a4264709bc..3bfa23198b 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG NEMOFW_VERSION=24.07 +ARG NEMOFW_VERSION=24.12 FROM nvcr.io/nvidia/nemo:${NEMOFW_VERSION} ENV USE_TCPX=yes diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md b/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md index 9eb9252106..6850852623 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md @@ -3,7 +3,7 @@ README 1. Set up NeMo Framework Container - This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) + This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:24.12](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) container, and submits a Slurm job to copy the framework launcher scripts and a few other auxiliary files into your working directory. @@ -45,7 +45,7 @@ README launcher_scripts_path=${PWD} \ stages=[training] \ env_vars.TRANSFORMERS_OFFLINE=0 \ - container=../nemofw+tcpx-24.07.sqsh \ + container=../nemofw+tcpx-24.12.sqsh \ container_mounts='['${HOME}/.cache',"/var/lib/tcpx/lib64","/run/tcpx-\${SLURM_JOB_ID}:/run/tcpx"]' \ cluster.srun_args=["--container-writable"] \ training.model.data.data_impl=mock \ diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh b/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh index 5692b0342b..19c6212c6a 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh @@ -18,7 +18,7 @@ #SBATCH --partition=a3 #SBATCH --exclusive -: "${NEMOFW_VERSION:=24.07}" +: "${NEMOFW_VERSION:=24.12}" srun docker build --build-arg="NEMOFW_VERSION=${NEMOFW_VERSION}" -t nemofw:tcpx-"${NEMOFW_VERSION}" . srun rm -f nemofw+tcpx-"${NEMOFW_VERSION}".sqsh diff --git a/examples/machine-learning/a3-megagpu-8g/nemo-framework/Dockerfile b/examples/machine-learning/a3-megagpu-8g/nemo-framework/Dockerfile index 381591b9e0..4c0d449889 100644 --- a/examples/machine-learning/a3-megagpu-8g/nemo-framework/Dockerfile +++ b/examples/machine-learning/a3-megagpu-8g/nemo-framework/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG NEMOFW_VERSION=24.07 +ARG NEMOFW_VERSION=24.12 FROM nvcr.io/nvidia/nemo:$NEMOFW_VERSION ENV NCCL_FASTRAK_CTRL_DEV=enp0s12 diff --git a/examples/machine-learning/a3-megagpu-8g/nemo-framework/README.md b/examples/machine-learning/a3-megagpu-8g/nemo-framework/README.md index e1f16377bb..6e66a68adf 100644 --- a/examples/machine-learning/a3-megagpu-8g/nemo-framework/README.md +++ b/examples/machine-learning/a3-megagpu-8g/nemo-framework/README.md @@ -3,7 +3,7 @@ README 1. Set up NeMo Framework Container - This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) + This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:24.12](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) container, and submits a Slurm job to copy the framework launcher scripts and a few other auxiliary files into your working directory. @@ -21,7 +21,7 @@ README python3 -m venv env source env/bin/activate pip install -r requirements.txt # Copied from the NeMo Framework Container earlier - # This is needed to use 24.07 and python3.11, which is what is present on + # This is needed to use 24.12 and python3.11, which is what is present on # Debian 12 pip install -U hydra-core ``` @@ -53,7 +53,7 @@ README stages=[training] \ training=gpt3/5b \ env_vars.TRANSFORMERS_OFFLINE=0 \ - container=../nemofw+tcpxo-24.07.sqsh \ + container=../nemofw+tcpxo-24.12.sqsh \ container_mounts=[${HOME}/.cache,/var/lib/tcpxo/lib64] \ cluster.srun_args=["--container-writable"] \ training.model.data.data_impl=mock \ diff --git a/examples/machine-learning/a3-megagpu-8g/nemo-framework/setup_nemo.sh b/examples/machine-learning/a3-megagpu-8g/nemo-framework/setup_nemo.sh index e20e58d6aa..eac2fbfb23 100644 --- a/examples/machine-learning/a3-megagpu-8g/nemo-framework/setup_nemo.sh +++ b/examples/machine-learning/a3-megagpu-8g/nemo-framework/setup_nemo.sh @@ -18,7 +18,7 @@ #SBATCH --partition=a3mega #SBATCH --exclusive -: "${NEMOFW_VERSION:=24.07}" +: "${NEMOFW_VERSION:=24.12}" srun docker build --build-arg="NEMOFW_VERSION=${NEMOFW_VERSION}" -t nemofw:tcpxo-"${NEMOFW_VERSION}" . srun rm -f nemofw+tcpxo-"${NEMOFW_VERSION}".sqsh diff --git a/examples/machine-learning/a3-ultragpu-8g/nemo-framework/Dockerfile b/examples/machine-learning/a3-ultragpu-8g/nemo-framework/Dockerfile index 6b36f9a52d..ce1c9fecda 100644 --- a/examples/machine-learning/a3-ultragpu-8g/nemo-framework/Dockerfile +++ b/examples/machine-learning/a3-ultragpu-8g/nemo-framework/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG NEMOFW_VERSION=24.07 +ARG NEMOFW_VERSION=24.12 FROM nvcr.io/nvidia/nemo:$NEMOFW_VERSION ENV NCCL_DEBUG=INFO,WARN diff --git a/examples/machine-learning/a3-ultragpu-8g/nemo-framework/README.md b/examples/machine-learning/a3-ultragpu-8g/nemo-framework/README.md index e895f43765..9c4c9b395d 100644 --- a/examples/machine-learning/a3-ultragpu-8g/nemo-framework/README.md +++ b/examples/machine-learning/a3-ultragpu-8g/nemo-framework/README.md @@ -3,7 +3,7 @@ README 1. Set up NeMo Framework Container - This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) + This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:24.12](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) container, and submits a Slurm job to copy the framework launcher scripts and a few other auxiliary files into your working directory. @@ -21,7 +21,7 @@ README python3 -m venv env source env/bin/activate pip install -r requirements.txt # Copied from the NeMo Framework Container earlier - # This is needed to use 24.07 and python3.11, which is what is present on + # This is needed to use 24.12 and python3.11, which is what is present on # Debian 12 pip install -U hydra-core ``` @@ -53,7 +53,7 @@ README stages=[training] \ training=gpt3/5b \ env_vars.TRANSFORMERS_OFFLINE=0 \ - container=../nemo-24.07.sqsh \ + container=../nemo-24.12.sqsh \ container_mounts=[${HOME}/.cache,/usr/local/gib] \ cluster.srun_args=["--container-writable"] \ training.model.data.data_impl=mock \ diff --git a/examples/machine-learning/a3-ultragpu-8g/nemo-framework/setup_nemo.sh b/examples/machine-learning/a3-ultragpu-8g/nemo-framework/setup_nemo.sh index 9fafa83e91..53ac15486b 100644 --- a/examples/machine-learning/a3-ultragpu-8g/nemo-framework/setup_nemo.sh +++ b/examples/machine-learning/a3-ultragpu-8g/nemo-framework/setup_nemo.sh @@ -18,7 +18,7 @@ #SBATCH --partition=a3ultra #SBATCH --exclusive -: "${NEMOFW_VERSION:=24.07}" +: "${NEMOFW_VERSION:=24.12}" srun docker build --build-arg="NEMOFW_VERSION=${NEMOFW_VERSION}" -t nemo-"${NEMOFW_VERSION}" . srun rm -f nemo-"${NEMOFW_VERSION}".sqsh