From 6559d96796da12709389488a36ff706ee1db43c6 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Wed, 24 Dec 2025 01:19:07 -0600 Subject: [PATCH] [ROCm][CI] Set TORCH_NCCL_BLOCKING_WAIT Distributed Tests On ROCm (#31259) Signed-off-by: Micah Williamson --- .buildkite/test-amd.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index b7a2f8edaa080..f28785e1ad205 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -222,6 +222,9 @@ steps: - tests/v1/engine/test_engine_core_client.py - tests/distributed/test_symm_mem_allreduce.py commands: + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 # test with torchrun tp=2 and external_dp=2 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with torchrun tp=2 and pp=2 @@ -270,9 +273,10 @@ steps: - vllm/v1/executor/uniproc_executor.py - vllm/v1/worker/gpu_worker.py commands: - # https://github.com/NVIDIA/nccl/issues/1838 - #- export NCCL_CUMEM_HOST_ENABLE=0 # test with torchrun tp=2 and dp=4 with ep + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - label: EPLB Algorithm Test # 5min @@ -1294,6 +1298,9 @@ steps: - tests/v1/shutdown - tests/v1/worker/test_worker_memory_snapshot.py commands: + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py