6 changes: 5 additions & 1 deletion .github/actions/linux-uttest/action.yml
@@ -155,7 +155,7 @@ runs:
tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log

- name: xpu_distributed
shell: timeout 3600 bash -xeu -o pipefail {0}
shell: timeout 36000 bash -xeu -o pipefail {0}
if: ${{ inputs.ut_name == 'xpu_distributed' }}
run: |
xpu-smi topology -m
@@ -166,9 +166,13 @@ runs:
echo -e "[ERROR] XCCL is not enabled"
exit 1
fi
export CCL_ROOT=$(dirname $(which python))/../
export PATH="${CCL_ROOT}/bin/libfabric:${PATH}"
export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}"
python run_distributed.py \
2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \;

# Summary
- name: UT Test Results Summary
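Note on the new exports above: CCL_ROOT is derived from the Python interpreter's location, so the step assumes oneCCL and its bundled libfabric are installed into the same prefix as Python. A minimal Python sketch of the same resolution, useful for checking the paths locally before a run (illustrative only, not part of the diff):

```python
# Illustrative sketch: mirror the CCL_ROOT resolution used in the workflow step
# and sanity-check the libfabric/lib directories. Assumes oneCCL is installed
# into the same prefix as the Python interpreter, as `dirname $(which python)/../`
# implies; adjust if your environment differs.
import os
import sys

ccl_root = os.path.abspath(os.path.join(os.path.dirname(sys.executable), ".."))
libfabric_bin = os.path.join(ccl_root, "bin", "libfabric")
ccl_lib = os.path.join(ccl_root, "lib")

for path in (libfabric_bin, ccl_lib):
    print(f"{path}: {'found' if os.path.isdir(path) else 'missing'}")

# Equivalent of the three `export` lines in the YAML step, done from Python:
os.environ["CCL_ROOT"] = ccl_root
os.environ["PATH"] = libfabric_bin + os.pathsep + os.environ.get("PATH", "")
os.environ["LD_LIBRARY_PATH"] = ccl_lib + os.pathsep + os.environ.get("LD_LIBRARY_PATH", "")
```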
3 changes: 2 additions & 1 deletion .github/scripts/ut_result_check.sh
@@ -198,7 +198,8 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1'
fi

if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
echo -e "========================================================================="
echo -e "Show Failed cases in ${ut_suite} xpu distributed"
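The awk field change above (from $2 to $3) plus the new sed filter suggests that FAILED lines in the distributed log now carry an extra token before the test id, and that genuine test ids start with a relative ../ path, so extracted fields that do not begin with a dot are dropped. A small Python sketch of the same extraction on assumed sample lines (the log format shown here is an assumption, not copied from a real run):

```python
# Illustrative sketch of what the updated grep | awk '{print $3}' | sed pipeline
# keeps. The sample lines below are assumed; real pytest output may differ.
sample_log = """\
FAILED [2.1034s] ../../../../test/distributed/fsdp/test_fsdp_core.py::TestFSDP::test_x
FAILED [0.0010s] summary_line_without_a_test_path
PASSED ../../../../test/distributed/fsdp/test_fsdp_misc.py::TestMisc::test_y
"""

failed = []
for line in sample_log.splitlines():
    if not line.startswith("FAILED"):
        continue
    fields = line.split()
    # awk '{print $3}' takes the third field; sed '/^[^.]\+/d' then drops
    # anything that does not start with a dot (i.e. not a ../ test path).
    if len(fields) >= 3 and fields[2].startswith("."):
        failed.append(fields[2])

print(failed)
# ['../../../../test/distributed/fsdp/test_fsdp_core.py::TestFSDP::test_x']
```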
3 changes: 2 additions & 1 deletion .github/workflows/_linux_ut.yml
@@ -99,11 +99,12 @@ jobs:

test-in-baremetal:
needs: runner
timeout-minutes: 600
if: ${{ contains(inputs.ut, 'distributed') }}
runs-on: ${{ needs.runner.outputs.runner_id }}
env:
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1
PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
48 changes: 40 additions & 8 deletions test/xpu/run_distributed.py
@@ -9,9 +9,41 @@
res2 = 0
fail_test = []

# libfabric WA to avoid hang issue
os.environ["FI_PROVIDER"] = "tcp"
# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3"
# Get the xelink group card affinity
ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
if ret == 0:
gpu_dict = {}
with open("topology.log") as file:
lines = file.readlines()
for line in lines:
if "CPU Affinity" in line:
continue
line = line.strip()
if line.startswith("GPU "):
items = line.split(" ")
items = [x for x in items if x]
gpu_id = items[1]
i = gpu_id.split("/")[0]
affinity = ""
for j, item in enumerate(items):
if "SYS" not in item and ("XL" in item or "S" in item):
if len(affinity) == 0:
affinity = str(j - 2)
else:
affinity = affinity + "," + str(j - 2)
gpu_dict[i] = affinity

max_affinity = ""
for key, value in gpu_dict.items():
if len(value) > len(max_affinity):
max_affinity = value

os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))

else:
print("xpu-smi topology failed")
sys.exit(255)


# run python test
@@ -26,6 +58,10 @@ def run(test_command):

test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
res += run(test_command)
test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"]
res += run(test_command)
test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"]
res += run(test_command)

# run pytest with skiplist
for key in skip_dict:
@@ -38,8 +74,4 @@ def run(test_command):
if fail_test:
print(",".join(fail_test) + " have failures")

exit_code = os.WEXITSTATUS(res2)
if exit_code == 0:
sys.exit(res)
else:
sys.exit(exit_code)
sys.exit(res)
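The new topology block in run_distributed.py picks the largest group of XeLink-connected cards and pins the run to it via ZE_AFFINITY_MASK. A compact sketch of the same column scan over an assumed `xpu-smi topology -m` matrix (the sample matrix is made up for illustration; real output formatting may differ):

```python
# Sketch of the affinity extraction added above, run on an assumed topology
# matrix. For each "GPU i/j" row, a column counts toward the group when the
# link is "S" (self) or "XL*" (XeLink); "SYS" (socket/PCIe) links are ignored.
sample = """\
          GPU 0/0  GPU 1/1  GPU 2/2  GPU 3/3  CPU Affinity
GPU 0/0   S        XL8      SYS      SYS      0-55
GPU 1/1   XL8      S        SYS      SYS      0-55
GPU 2/2   SYS      SYS      S        XL8      56-111
GPU 3/3   SYS      SYS      XL8      S        56-111
"""

gpu_dict = {}
for line in sample.splitlines():
    if "CPU Affinity" in line:                 # skip the header row
        continue
    line = line.strip()
    if not line.startswith("GPU "):
        continue
    items = [x for x in line.split(" ") if x]
    card = items[1].split("/")[0]              # "0/0" -> "0"
    linked = [str(j - 2) for j, item in enumerate(items)
              if "SYS" not in item and ("XL" in item or "S" in item)]
    gpu_dict[card] = ",".join(linked)

# As in the PR: use the largest XeLink group as the affinity mask.
mask = max(gpu_dict.values(), key=len, default="")
print("ZE_AFFINITY_MASK=" + mask)              # "0,1" for the sample above
```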
101 changes: 16 additions & 85 deletions test/xpu/skip_list_dist.py
@@ -1,105 +1,36 @@
skip_dict = {
"../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": (
"test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False",
"test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False",
"test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False",
"test_checkpoint_submodule_use_reentrant_False_xpu",
),
"../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None,
"../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
"../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
"test_ddp_parity_xpu",
),
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
"../../../../test/distributed/fsdp/test_fsdp_core.py": (
"test_delayed_optim_step_offload_false_no_shard_xpu",
"test_delayed_optim_step_offload_false_none_xpu",
"test_delayed_optim_step_offload_false_shard_grad_op_xpu",
"test_delayed_optim_step_offload_true_none_xpu",
"test_delayed_optim_step_offload_true_shard_grad_op_xpu",
"test_delayed_reduce_scatter_offload_false_no_shard_xpu",
"test_delayed_reduce_scatter_offload_false_none_xpu",
"test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu",
"test_delayed_reduce_scatter_offload_true_none_xpu",
"test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu",
"test_mixture_of_experts_offload_false_no_shard_xpu",
"test_mixture_of_experts_offload_false_none_xpu",
"test_mixture_of_experts_offload_false_shard_grad_op_xpu",
"test_mixture_of_experts_offload_true_no_shard_xpu",
"test_mixture_of_experts_offload_true_none_xpu",
"test_mixture_of_experts_offload_true_shard_grad_op_xpu",
"test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu",
"test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu",
"test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu",
"test_mixture_of_experts_with_delay_before_free_offload_true_no_shard_xpu",
"test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu",
"test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu",
"test_nested_always_wrap_model_offload_false_no_shard_xpu",
"test_nested_always_wrap_model_offload_false_none_xpu",
"test_nested_always_wrap_model_offload_false_shard_grad_op_xpu",
"test_nested_always_wrap_model_offload_true_none_xpu",
"test_nested_always_wrap_model_offload_true_shard_grad_op_xpu",
"test_nested_wrapped_model_offload_false_no_shard_xpu",
"test_nested_wrapped_model_offload_false_none_xpu",
"test_nested_wrapped_model_offload_false_shard_grad_op_xpu",
"test_nested_wrapped_model_offload_true_none_xpu",
"test_nested_wrapped_model_offload_true_shard_grad_op_xpu",
"test_transformer_offload_false_no_shard_xpu",
"test_transformer_offload_false_none_xpu",
"test_transformer_offload_false_shard_grad_op_xpu",
"test_transformer_offload_true_none_xpu",
"test_transformer_offload_true_shard_grad_op_xpu",
# https://github.com/intel/torch-xpu-ops/issues/1475
"test_transformer_no_grad_mixed_precision_True_xpu",
"test_transformer_no_grad_mixed_precision_False_xpu",
),
# Will add them back after debugging
# "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": (
# "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
# "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu",
# "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu",
# "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu",
# "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
# "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu",
# "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu",
# "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu",
# "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu",
# "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu",
# "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu",
# "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu",
# "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu",
# "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu",
# "test_raises_warning_or_errors_xpu",
# ),
"../../../../test/distributed/fsdp/test_fsdp_exec_order.py": (
"test_invalid_first_iter_order_sharding_strategy1_xpu",
"test_train_eval_sharding_strategy1_xpu",
),
"../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
"../../../../test/distributed/fsdp/test_fsdp_core.py": None,
"../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
"../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": (
"test_parity_with_non_frozen_fsdp_xpu",
"test_parity_with_ddp_xpu",
),
"../../../../test/distributed/fsdp/test_fsdp_fx.py": None,
# will bring back after oneccl upgrade to 2021.16.1
# "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
"../../../../test/distributed/fsdp/test_fsdp_input.py": None,
"../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None,
"../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": (
"test_transformer_no_grad_mixed_precision_True_xpu",
),
"../../../../test/distributed/fsdp/test_fsdp_uneven.py": None,
# Will add them back after debugging
# "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": (
# "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu",
# "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu",
# "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu",
# "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu",
# "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu",
# "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu",
# "test_hsdp_init_with_device_mesh_xpu",
# "test_root_module_is_not_FSDP_xpu",
# ),
"../../../../test/distributed/fsdp/test_utils.py": None,
"distributed/test_c10d_xccl.py": (
# will bring back after oneccl upgrade to 2021.16.1
"test_xccl_barrier",
# https://github.com/intel/torch-xpu-ops/issues/2046
"test_unwaited",
),
"distributed/test_c10d_ops_xccl.py": None,
"../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
"../../../../test/distributed/test_functional_api.py": (
# depends on https://github.com/pytorch/pytorch/pull/159473
"test_tracing_with_fakepg_xpu",
),
"../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
"../../../../test/distributed/_tools/test_mem_tracker.py": None,
"../../../../test/distributed/_tools/test_memory_tracker.py": None,
}
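For context, each entry in skip_list_dist.py maps a test file to either None (run the whole file) or a tuple of case names to skip. The loop in run_distributed.py that consumes it is elided in this diff; one plausible, purely illustrative way such an entry could be turned into a pytest invocation is to exclude the skipped cases with -k (an assumption, not the repository's actual runner logic):

```python
# Purely illustrative, assumed consumption of skip_dict; the real loop body in
# run_distributed.py is not shown in this diff. Each skipped case name is
# turned into a pytest -k "not ..." expression.
import subprocess

from skip_list_dist import skip_dict

for test_file, skipped in skip_dict.items():
    cmd = ["python", "-m", "pytest", "-v", test_file]
    if skipped:  # tuple of case names to exclude; None means run everything
        cmd += ["-k", " and ".join(f"not {name}" for name in skipped)]
    subprocess.run(cmd, check=False)
```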