intel · daisyden · Feb 25, 2025 · Feb 26, 2025 · Feb 27, 2025 · Feb 27, 2025
diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml
@@ -69,9 +69,9 @@ jobs:
           source activate xpu_build
           cd ../ && rm -rf pytorch
           pip install requests
-          git clone https://github.com/pytorch/pytorch pytorch
+          git clone https://github.com/zhangxiaoli73/pytorch.git pytorch
           if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
-            cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
+            cd pytorch && git checkout cherry/enable-xccl-fsdp-tests
             # apply PRs for stock pytorch
             python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
             git status && git show -s

diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
@@ -316,9 +316,9 @@ jobs:
           source activate xpu_op_${ZE_AFFINITY_MASK}
           cd ../ && rm -rf pytorch
           pip install requests
-          git clone https://github.com/pytorch/pytorch pytorch
+          git clone https://github.com/zhangxiaoli73/pytorch.git pytorch
           if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
-            cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
+            cd pytorch && git checkout cherry/enable-xccl-fsdp-tests
             # apply PRs for stock pytorch
             python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
             git status && git show -s

diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py
@@ -2,22 +2,42 @@
 import subprocess
 import sys
 
+from skip_list_dist import skip_dict
+from xpu_test_utils import launch_test
 
+res = 0  # running sum of the return codes from run()
+res2 = 0  # running sum of the values returned by launch_test()
+fail_test = []  # failing commands / test files, reported at the end
+
+
+# Run one test command, echo its output, and return the process return code.
 def run(test_command):
     result = subprocess.run(test_command, capture_output=True, text=True)
     print(result.stdout)
     print(result.stderr)
     if "FAILED" in result.stdout or "FAILED" in result.stderr:
-        return 0
-    else:
-        return 1
+        fail_test.append(" ".join(test_command))  # remember the failing command
+    return result.returncode
 
 
-res = 0
 test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
 res += run(test_command)
 test_command = ["python", "distributed/test_c10d_xccl.py"]
 res += run(test_command)
 
-exit_code = os.WEXITSTATUS(res)
-sys.exit(exit_code)
+# Run each test file in skip_dict via launch_test, passing its skip list.
+for key in skip_dict:
+    skip_list = skip_dict[key]
+    fail = launch_test(key, skip_list)
+    res2 += fail
+    if fail:
+        fail_test.append(key)
+
+if fail_test:
+    print(",".join(fail_test) + " have failures")
+
+exit_code = os.WEXITSTATUS(res2)  # NOTE(review): presumes wait-status values - confirm
+if exit_code == 0:
+    sys.exit(res)  # fall back to the summed return codes from run()
+else:
+    sys.exit(exit_code)
diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py
@@ -0,0 +1,90 @@
+skip_dict = {  # test file -> tuple of test names passed to launch_test as skip_list, or None
+    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": (
+        "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False",
+        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False",
+        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False",
+        "test_checkpoint_submodule_use_reentrant_False_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
+        "test_ddp_parity_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_common.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_core.py": (
+        "test_delayed_optim_step_offload_false_no_shard_xpu",
+        "test_delayed_optim_step_offload_false_none_xpu",
+        "test_delayed_optim_step_offload_false_shard_grad_op_xpu",
+        "test_delayed_optim_step_offload_true_none_xpu",
+        "test_delayed_optim_step_offload_true_shard_grad_op_xpu",
+        "test_delayed_reduce_scatter_offload_false_no_shard_xpu",
+        "test_delayed_reduce_scatter_offload_false_none_xpu",
+        "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu",
+        "test_delayed_reduce_scatter_offload_true_none_xpu",
+        "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu",
+        "test_mixture_of_experts_offload_false_no_shard_xpu",
+        "test_mixture_of_experts_offload_false_none_xpu",
+        "test_mixture_of_experts_offload_false_shard_grad_op_xpu",
+        "test_mixture_of_experts_offload_true_none_xpu",
+        "test_mixture_of_experts_offload_true_shard_grad_op_xpu",
+        "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu",
+        "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu",
+        "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu",
+        "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu",
+        "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu",
+        "test_nested_always_wrap_model_offload_false_no_shard_xpu",
+        "test_nested_always_wrap_model_offload_false_none_xpu",
+        "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu",
+        "test_nested_always_wrap_model_offload_true_none_xpu",
+        "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu",
+        "test_nested_wrapped_model_offload_false_no_shard_xpu",
+        "test_nested_wrapped_model_offload_false_none_xpu",
+        "test_nested_wrapped_model_offload_false_shard_grad_op_xpu",
+        "test_nested_wrapped_model_offload_true_none_xpu",
+        "test_nested_wrapped_model_offload_true_shard_grad_op_xpu",
+        "test_transformer_offload_false_no_shard_xpu",
+        "test_transformer_offload_false_none_xpu",
+        "test_transformer_offload_false_shard_grad_op_xpu",
+        "test_transformer_offload_true_none_xpu",
+        "test_transformer_offload_true_shard_grad_op_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": (
+        "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
+        "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu",
+        "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu",
+        "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu",
+        "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
+        "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu",
+        "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu",
+        "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu",
+        "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu",
+        "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu",
+        "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu",
+        "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu",
+        "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu",
+        "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu",
+        "test_raises_warning_or_errors_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": (
+        "test_parity_with_non_frozen_fsdp_xpu",
+        "test_parity_with_ddp_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_fsdp_fx.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None,
+    "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": (
+        "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu",
+        "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu",
+        "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu",
+        "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu",
+        "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu",
+        "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu",
+        "test_hsdp_init_with_device_mesh_xpu",
+        "test_root_module_is_not_FSDP_xpu",
+    ),
+    "../../../../test/distributed/fsdp/test_utils.py": None,
+}