From e389e879fcc5dcec7b01d0a7c911f4252d380894 Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Wed, 15 Nov 2023 16:18:35 -0800 Subject: [PATCH 01/10] commit change --- docker/generate_build_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index ee2221187d..6a58e05d6d 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -124,7 +124,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): python_versions = ['3.10'] - pytorch_versions = ['2.1.0', '2.0.1', '1.13.1'] + pytorch_versions = ['2.1.1', '2.0.1', '1.13.1'] cuda_options = [True, False] stages = ['pytorch_stage'] interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS From 83143454716e63a53311242daed3c599a24321d4 Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Wed, 15 Nov 2023 16:19:48 -0800 Subject: [PATCH 02/10] commit change --- docker/README.md | 6 +++--- docker/build_matrix.yaml | 22 +++++++++++----------- docker/generate_build_matrix.py | 6 +++--- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docker/README.md b/docker/README.md index aa37f27047..62eefca42c 100644 --- a/docker/README.md +++ b/docker/README.md @@ -32,9 +32,9 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| | Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.2.0_cu121-nightly20231024-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.0 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.1.0 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.1 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.1_cu121-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.1 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.1_cu121-python3.10-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.1.1 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.1_cpu-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 28408b8d89..a97735ad57 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -2,42 +2,42 @@ - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-1-0-cu121 + IMAGE_NAME: torch-2-1-1-cu121 MOFED_VERSION: 5.5-1.0.3.2 PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.0 + PYTORCH_VERSION: 2.1.1 TAGS: - - mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.1.1_cu121-python3.10-ubuntu20.04 - mosaicml/pytorch:latest TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.0 - AWS_OFI_NCCL_VERSION: v1.7.3-aws BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-1-0-cu121-aws + IMAGE_NAME: torch-2-1-1-cu121-aws MOFED_VERSION: '' PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.0 + PYTORCH_VERSION: 2.1.1 TAGS: - - mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws + - mosaicml/pytorch:2.1.1_cu121-python3.10-ubuntu20.04-aws - mosaicml/pytorch:latest-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-1-0-cpu + IMAGE_NAME: torch-2-1-1-cpu MOFED_VERSION: '' PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.0 + PYTORCH_VERSION: 2.1.1 TAGS: - - mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.1.1_cpu-python3.10-ubuntu20.04 - mosaicml/pytorch:latest_cpu TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.0 @@ -141,7 +141,7 @@ PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.0 + PYTORCH_VERSION: 2.1.1 TAGS: - mosaicml/composer:0.17.0 - mosaicml/composer:latest @@ -156,7 +156,7 @@ PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.0 + PYTORCH_VERSION: 2.1.1 TAGS: - mosaicml/composer:0.17.0_cpu - mosaicml/composer:latest_cpu diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 6a58e05d6d..cb24089159 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -19,11 +19,11 @@ import yaml LATEST_PYTHON_VERSION = '3.10' -PRODUCTION_PYTORCH_VERSION = '2.1.0' +PRODUCTION_PYTORCH_VERSION = '2.1.1' def _get_torchvision_version(pytorch_version: str): - if pytorch_version == '2.1.0': + if pytorch_version == '2.1.1': return '0.16.0' if pytorch_version == '2.0.1': return '0.15.2' @@ -41,7 +41,7 @@ def _get_base_image(cuda_version: str): def _get_cuda_version(pytorch_version: str, use_cuda: bool): if not use_cuda: return '' - if pytorch_version == '2.1.0': + if pytorch_version == '2.1.1': return '12.1.0' if pytorch_version == '2.0.1': return '11.8.0' From d92a8695e46c1fc2e897aa516ab08cf8851a2e1c Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Wed, 15 Nov 2023 16:50:21 -0800 Subject: [PATCH 03/10] commit change --- docker/build_matrix.yaml | 10 +++++----- docker/generate_build_matrix.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index a97735ad57..344b337ee8 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -12,7 +12,7 @@ - mosaicml/pytorch:2.1.1_cu121-python3.10-ubuntu20.04 - mosaicml/pytorch:latest TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.0 + TORCHVISION_VERSION: 0.16.1 - AWS_OFI_NCCL_VERSION: v1.7.3-aws BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 @@ -26,7 +26,7 @@ - mosaicml/pytorch:2.1.1_cu121-python3.10-ubuntu20.04-aws - mosaicml/pytorch:latest-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.0 + TORCHVISION_VERSION: 0.16.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' @@ -40,7 +40,7 @@ - mosaicml/pytorch:2.1.1_cpu-python3.10-ubuntu20.04 - mosaicml/pytorch:latest_cpu TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.0 + TORCHVISION_VERSION: 0.16.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 11.8.0 @@ -146,7 +146,7 @@ - mosaicml/composer:0.17.0 - mosaicml/composer:latest TARGET: composer_stage - TORCHVISION_VERSION: 0.16.0 + TORCHVISION_VERSION: 0.16.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.17.0 @@ -161,4 +161,4 @@ - mosaicml/composer:0.17.0_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage - TORCHVISION_VERSION: 0.16.0 + TORCHVISION_VERSION: 0.16.1 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index cb24089159..fb504f9e58 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -24,7 +24,7 @@ def _get_torchvision_version(pytorch_version: str): if pytorch_version == '2.1.1': - return '0.16.0' + return '0.16.1' if pytorch_version == '2.0.1': return '0.15.2' if pytorch_version == '1.13.1': From 26e9867488e458d00dec06ff95b1009dcedbc4cb Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Wed, 15 Nov 2023 16:54:41 -0800 Subject: [PATCH 04/10] commit change --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 065104c75b..9c6317b3d5 100644 --- a/setup.py +++ b/setup.py @@ -78,7 +78,7 @@ def package_files(prefix: str, directory: str, extension: str): 'torchmetrics>=0.10.0,<1.1', 'torch_optimizer>=0.3.0,<0.4', 'torchvision>=0.13.1,<0.17', - 'torch>=1.13.1,<2.1.1', + 'torch>=1.13.1,<2.1.2', 'requests>=2.26.0,<3', 'numpy>=1.21.5,<1.27.0', 'psutil>=5.8.0,<6', From 24ec12462b2a707d5403d8466431e0dd0db34aad Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Wed, 15 Nov 2023 17:08:29 -0800 Subject: [PATCH 05/10] commit change --- composer/trainer/mosaic_fsdp.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/composer/trainer/mosaic_fsdp.py b/composer/trainer/mosaic_fsdp.py index fd786efe6f..4bbd878c44 100644 --- a/composer/trainer/mosaic_fsdp.py +++ b/composer/trainer/mosaic_fsdp.py @@ -49,6 +49,3 @@ def patch_pytorch(): # Monkey patch partial state dict handling _state_dict_utils._sharded_pre_load_state_dict_hook = (_sharded_pre_load_state_dict_hook) - - elif version.parse(torch.__version__) >= version.parse('2.1.1'): - raise NotImplementedError(f'FullyShardedDataParallel is not supported for torch >= 2.2.0') From 9a3437a10179fe019cb38724d78ad6f88605528c Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Wed, 15 Nov 2023 17:09:50 -0800 Subject: [PATCH 06/10] commit change --- composer/trainer/mosaic_fsdp.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/composer/trainer/mosaic_fsdp.py b/composer/trainer/mosaic_fsdp.py index 4bbd878c44..05a420c6f2 100644 --- a/composer/trainer/mosaic_fsdp.py +++ b/composer/trainer/mosaic_fsdp.py @@ -49,3 +49,7 @@ def patch_pytorch(): # Monkey patch partial state dict handling _state_dict_utils._sharded_pre_load_state_dict_hook = (_sharded_pre_load_state_dict_hook) + + else: + # Nothing to patch yet for torch >= 2.1.1 + pass \ No newline at end of file From e47f8167d80c4e21452f3bde2d62cc881978483c Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Wed, 15 Nov 2023 17:10:20 -0800 Subject: [PATCH 07/10] commit change --- composer/trainer/mosaic_fsdp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/trainer/mosaic_fsdp.py b/composer/trainer/mosaic_fsdp.py index 05a420c6f2..058f8804f1 100644 --- a/composer/trainer/mosaic_fsdp.py +++ b/composer/trainer/mosaic_fsdp.py @@ -49,7 +49,7 @@ def patch_pytorch(): # Monkey patch partial state dict handling _state_dict_utils._sharded_pre_load_state_dict_hook = (_sharded_pre_load_state_dict_hook) - + else: # Nothing to patch yet for torch >= 2.1.1 - pass \ No newline at end of file + pass From 760ecb0a3da144ae7ff331a00a257ae15051ae93 Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Wed, 29 Nov 2023 17:11:51 -0800 Subject: [PATCH 08/10] Update composer/trainer/mosaic_fsdp.py Co-authored-by: Mihir Patel --- composer/trainer/mosaic_fsdp.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/composer/trainer/mosaic_fsdp.py b/composer/trainer/mosaic_fsdp.py index 058f8804f1..6b94144ec0 100644 --- a/composer/trainer/mosaic_fsdp.py +++ b/composer/trainer/mosaic_fsdp.py @@ -50,6 +50,3 @@ def patch_pytorch(): # Monkey patch partial state dict handling _state_dict_utils._sharded_pre_load_state_dict_hook = (_sharded_pre_load_state_dict_hook) - else: - # Nothing to patch yet for torch >= 2.1.1 - pass From 3722800cc7048dc99e1090457d8797ed96f280ec Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Wed, 29 Nov 2023 17:21:06 -0800 Subject: [PATCH 09/10] format --- a.diff | 17 +++++++++++++++++ a.yaml | 0 composer/trainer/mosaic_fsdp.py | 1 - 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 a.diff create mode 100644 a.yaml diff --git a/a.diff b/a.diff new file mode 100644 index 0000000000..ebc9a2d457 --- /dev/null +++ b/a.diff @@ -0,0 +1,17 @@ +diff --git a/tests/utils/eval_client/test_local_eval_client.py b/tests/utils/eval_client/test_local_eval_client.py +index 90b80e1a..d8a902b0 100644 +--- a/tests/utils/eval_client/test_local_eval_client.py ++++ b/tests/utils/eval_client/test_local_eval_client.py +@@ -28,9 +28,11 @@ from tests.common.markers import world_size + ], + ) + @world_size(1, 2) +-def test_local_invoke(code: str, result: str, language: str, world_size: int): ++def test_local_invoke(code: str, result: str, language: str, world_size: int, tmp_path: str): + """Test invocation function for LocalEvalClient with code that succeeds, fails compilation, times out, and is incorrect in C, C++, Python, JS. + """ ++ import os ++ os.makedirs(os.path.dirname(tmp_path)) + eval_client = LocalEvalClient() + input = '(1,)' if language == 'python' else '1' + assert eval_client.invoke([[[{ diff --git a/a.yaml b/a.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/composer/trainer/mosaic_fsdp.py b/composer/trainer/mosaic_fsdp.py index 6b94144ec0..4bbd878c44 100644 --- a/composer/trainer/mosaic_fsdp.py +++ b/composer/trainer/mosaic_fsdp.py @@ -49,4 +49,3 @@ def patch_pytorch(): # Monkey patch partial state dict handling _state_dict_utils._sharded_pre_load_state_dict_hook = (_sharded_pre_load_state_dict_hook) - From 7e2baac573af4bfeb6c2b3129f6be36afa9c9d87 Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Wed, 29 Nov 2023 17:22:06 -0800 Subject: [PATCH 10/10] commit change --- a.diff | 17 ----------------- a.yaml | 0 2 files changed, 17 deletions(-) delete mode 100644 a.diff delete mode 100644 a.yaml diff --git a/a.diff b/a.diff deleted file mode 100644 index ebc9a2d457..0000000000 --- a/a.diff +++ /dev/null @@ -1,17 +0,0 @@ -diff --git a/tests/utils/eval_client/test_local_eval_client.py b/tests/utils/eval_client/test_local_eval_client.py -index 90b80e1a..d8a902b0 100644 ---- a/tests/utils/eval_client/test_local_eval_client.py -+++ b/tests/utils/eval_client/test_local_eval_client.py -@@ -28,9 +28,11 @@ from tests.common.markers import world_size - ], - ) - @world_size(1, 2) --def test_local_invoke(code: str, result: str, language: str, world_size: int): -+def test_local_invoke(code: str, result: str, language: str, world_size: int, tmp_path: str): - """Test invocation function for LocalEvalClient with code that succeeds, fails compilation, times out, and is incorrect in C, C++, Python, JS. - """ -+ import os -+ os.makedirs(os.path.dirname(tmp_path)) - eval_client = LocalEvalClient() - input = '(1,)' if language == 'python' else '1' - assert eval_client.invoke([[[{ diff --git a/a.yaml b/a.yaml deleted file mode 100644 index e69de29bb2..0000000000