From 47b1ffd757edee6348ca407a5c9019dec1c0e528 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 22 Aug 2025 23:19:41 +0800 Subject: [PATCH] Add ported distributed cases --- .github/actions/linux-uttest/action.yml | 6 +- .github/scripts/ut_result_check.sh | 3 +- .github/workflows/_linux_ut.yml | 3 +- test/xpu/run_distributed.py | 48 +++++++++-- test/xpu/skip_list_dist.py | 101 ++++-------------------- 5 files changed, 65 insertions(+), 96 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index fb7918200..6e2cf2cee 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -155,7 +155,7 @@ runs: tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_distributed - shell: timeout 3600 bash -xeu -o pipefail {0} + shell: timeout 36000 bash -xeu -o pipefail {0} if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m @@ -166,9 +166,13 @@ runs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi + export CCL_ROOT=$(dirname $(which python))/../ + export PATH="${CCL_ROOT}/bin/libfabric:${PATH}" + export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}" python run_distributed.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \; # Summary - name: UT Test Results Summary diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 0e6b95ec4..1aac53249 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -198,7 +198,8 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' fi if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 1751d9d97..6da0abadd 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -99,11 +99,12 @@ jobs: test-in-baremetal: needs: runner + timeout-minutes: 600 if: ${{ contains(inputs.ut, 'distributed') }} runs-on: ${{ needs.runner.outputs.runner_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1 + PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index ddde5f8c8..cd0471209 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -9,9 +9,41 @@ res2 = 0 fail_test = [] -# libfabric WA to avoid hang issue -os.environ["FI_PROVIDER"] = "tcp" -# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3" +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) # run python test @@ -26,6 +58,10 @@ def run(test_command): test_command = ["python", "distributed/test_c10d_ops_xccl.py"] res += run(test_command) +test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"] +res += run(test_command) +test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"] +res += run(test_command) # run pytest with skiplist for key in skip_dict: @@ -38,8 +74,4 @@ def run(test_command): if fail_test: print(",".join(fail_test) + " have failures") -exit_code = os.WEXITSTATUS(res2) -if exit_code == 0: - sys.exit(res) -else: - sys.exit(exit_code) +sys.exit(res) diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py index 1210896ec..cf5ed5cd7 100644 --- a/test/xpu/skip_list_dist.py +++ b/test/xpu/skip_list_dist.py @@ -1,105 +1,36 @@ skip_dict = { - "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( - "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", - "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", - "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", - "test_checkpoint_submodule_use_reentrant_False_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( "test_ddp_parity_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, - "../../../../test/distributed/fsdp/test_fsdp_core.py": ( - "test_delayed_optim_step_offload_false_no_shard_xpu", - "test_delayed_optim_step_offload_false_none_xpu", - "test_delayed_optim_step_offload_false_shard_grad_op_xpu", - "test_delayed_optim_step_offload_true_none_xpu", - "test_delayed_optim_step_offload_true_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_false_no_shard_xpu", - "test_delayed_reduce_scatter_offload_false_none_xpu", - "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_true_none_xpu", - "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_offload_false_no_shard_xpu", - "test_mixture_of_experts_offload_false_none_xpu", - "test_mixture_of_experts_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_offload_true_no_shard_xpu", - "test_mixture_of_experts_offload_true_none_xpu", - "test_mixture_of_experts_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_false_no_shard_xpu", - "test_nested_always_wrap_model_offload_false_none_xpu", - "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_true_none_xpu", - "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_false_no_shard_xpu", - "test_nested_wrapped_model_offload_false_none_xpu", - "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_true_none_xpu", - "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", - "test_transformer_offload_false_no_shard_xpu", - "test_transformer_offload_false_none_xpu", - "test_transformer_offload_false_shard_grad_op_xpu", - "test_transformer_offload_true_none_xpu", - "test_transformer_offload_true_shard_grad_op_xpu", - # https://github.com/intel/torch-xpu-ops/issues/1475 - "test_transformer_no_grad_mixed_precision_True_xpu", - "test_transformer_no_grad_mixed_precision_False_xpu", - ), - # Will add them back after debugging - # "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu", - # "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu", - # "test_raises_warning_or_errors_xpu", - # ), - "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": ( - "test_invalid_first_iter_order_sharding_strategy1_xpu", - "test_train_eval_sharding_strategy1_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( "test_parity_with_non_frozen_fsdp_xpu", "test_parity_with_ddp_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, - # will bring back after oneccl upgrade to 2021.16.1 - # "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": ( "test_transformer_no_grad_mixed_precision_True_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, - # Will add them back after debugging - # "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": ( - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu", - # "test_hsdp_init_with_device_mesh_xpu", - # "test_root_module_is_not_FSDP_xpu", - # ), "../../../../test/distributed/fsdp/test_utils.py": None, "distributed/test_c10d_xccl.py": ( - # will bring back after oneccl upgrade to 2021.16.1 - "test_xccl_barrier", + # https://github.com/intel/torch-xpu-ops/issues/2046 + "test_unwaited", + ), + "distributed/test_c10d_ops_xccl.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # depends on https://github.com/pytorch/pytorch/pull/159473 + "test_tracing_with_fakepg_xpu", ), + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, }