From b878b96bf2ce022de2cf14a0eff470edde48c7d7 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Tue, 25 Feb 2025 09:06:24 +0000 Subject: [PATCH 1/5] enabled fsdp distributed test --- test/xpu/run_distributed.py | 32 ++++++++++--- test/xpu/skip_list_dist.py | 90 +++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 test/xpu/skip_list_dist.py diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index bed342d74..13b38a5d8 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -2,22 +2,40 @@ import subprocess import sys +from skip_list_dist import skip_dict +from xpu_test_utils import launch_test +res = 0 +res2 = 0 +fail_test = [] + +# run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) print(result.stdout) print(result.stderr) if "FAILED" in result.stdout or "FAILED" in result.stderr: - return 0 - else: - return 1 - + fail_test.append(" ".join(test_command)) + return result.returncode -res = 0 test_command = ["python", "distributed/test_c10d_ops_xccl.py"] res += run(test_command) test_command = ["python", "distributed/test_c10d_xccl.py"] res += run(test_command) -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res2 += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res2) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py new file mode 100644 index 000000000..7eba27035 --- /dev/null +++ b/test/xpu/skip_list_dist.py @@ -0,0 +1,90 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", 
"test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", + "test_checkpoint_submodule_use_reentrant_False_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( + "test_ddp_parity_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_common.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_false_no_shard_xpu", + "test_delayed_optim_step_offload_false_none_xpu", + "test_delayed_optim_step_offload_false_shard_grad_op_xpu", + "test_delayed_optim_step_offload_true_none_xpu", + "test_delayed_optim_step_offload_true_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_false_no_shard_xpu", + "test_delayed_reduce_scatter_offload_false_none_xpu", + "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_true_none_xpu", + "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_offload_false_no_shard_xpu", + "test_mixture_of_experts_offload_false_none_xpu", + "test_mixture_of_experts_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_offload_true_none_xpu", + "test_mixture_of_experts_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_false_no_shard_xpu", + "test_nested_always_wrap_model_offload_false_none_xpu", + "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", + 
"test_nested_always_wrap_model_offload_true_none_xpu", + "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_false_no_shard_xpu", + "test_nested_wrapped_model_offload_false_none_xpu", + "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_true_none_xpu", + "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", + "test_transformer_offload_false_no_shard_xpu", + "test_transformer_offload_false_none_xpu", + "test_transformer_offload_false_shard_grad_op_xpu", + "test_transformer_offload_true_none_xpu", + "test_transformer_offload_true_shard_grad_op_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( + "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", + "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", + "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", + "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", + "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", + "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", + "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", + "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", + "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu", + "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu", + "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", + "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu", + 
"test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu", + "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu", + "test_raises_warning_or_errors_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( + "test_parity_with_non_frozen_fsdp_xpu", + "test_parity_with_ddp_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": ( + "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu", + "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu", + "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu", + "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu", + "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu", + "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu", + "test_hsdp_init_with_device_mesh_xpu", + "test_root_module_is_not_FSDP_xpu", + ), + "../../../../test/distributed/fsdp/test_utils.py": None, +} From b346bcaca2d6edbc66ef7cad3324262a9f6cb250 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Wed, 26 Feb 2025 13:44:53 +0000 Subject: [PATCH 2/5] update pytorch branch for testing only --- .github/workflows/_linux_build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index bd25a53b0..1c29164e9 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml 
@@ -70,6 +70,7 @@ jobs: cd ../ && rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch + git chekcout cherry/enable-xccl-fsdp-tests if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch From fc0ed5ecc2f10fd3ae781629d1860630363161bd Mon Sep 17 00:00:00 2001 From: daisyden Date: Thu, 27 Feb 2025 02:41:53 +0000 Subject: [PATCH 3/5] fix lint issues --- test/xpu/run_distributed.py | 4 +++- test/xpu/skip_list_dist.py | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index 13b38a5d8..0fa5bb337 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -9,6 +9,7 @@ res2 = 0 fail_test = [] + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -16,7 +17,8 @@ def run(test_command): print(result.stderr) if "FAILED" in result.stdout or "FAILED" in result.stderr: fail_test.append(" ".join(test_command)) - return result.returncode + return result.returncode + test_command = ["python", "distributed/test_c10d_ops_xccl.py"] res += run(test_command) diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py index 7eba27035..992a80e8f 100644 --- a/test/xpu/skip_list_dist.py +++ b/test/xpu/skip_list_dist.py @@ -63,12 +63,12 @@ "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu", "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu", "test_raises_warning_or_errors_xpu", - ), + ), "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( - "test_parity_with_non_frozen_fsdp_xpu", - "test_parity_with_ddp_xpu", - ), + "test_parity_with_non_frozen_fsdp_xpu", + "test_parity_with_ddp_xpu", + ), "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, 
"../../../../test/distributed/fsdp/test_fsdp_input.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, @@ -85,6 +85,6 @@ "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu", "test_hsdp_init_with_device_mesh_xpu", "test_root_module_is_not_FSDP_xpu", - ), + ), "../../../../test/distributed/fsdp/test_utils.py": None, } From d5c5a6e42a3c5e790e1a4f3c7c33063cb9b008dd Mon Sep 17 00:00:00 2001 From: daisyden Date: Thu, 27 Feb 2025 08:06:39 +0000 Subject: [PATCH 4/5] fix pytorch branch --- .github/workflows/_linux_build.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 1c29164e9..c01a9544d 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -69,10 +69,9 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch - git chekcout cherry/enable-xccl-fsdp-tests + git clone https://github.com/zhangxiaoli73/pytorch.git pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + cd pytorch && git checkout cherry/enable-xccl-fsdp-tests # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py git status && git show -s From 3623ae64401b12cf22b75cc3e16f7d00d75a1cab Mon Sep 17 00:00:00 2001 From: daisyden Date: Thu, 27 Feb 2025 08:16:24 +0000 Subject: [PATCH 5/5] fix pytorch branch for testing only --- .github/workflows/_linux_ut.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 22d2834ad..40c40c463 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -316,9 +316,9 @@ jobs: source activate xpu_op_${ZE_AFFINITY_MASK} cd ../ && rm -rf pytorch pip install requests - git clone 
https://github.com/pytorch/pytorch pytorch + git clone https://github.com/zhangxiaoli73/pytorch.git pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + cd pytorch && git checkout cherry/enable-xccl-fsdp-tests # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py git status && git show -s