[CI/Build] Test torchrun with 8 cards (#27548)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
parent a9fe0793f2
commit f7a6682872
.buildkite/test-pipeline.yaml
@@ -205,6 +205,24 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd

+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
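The new step launches one worker per GPU via torchrun; the NCCL_CUMEM_HOST_ENABLE=0 export works around the NCCL bug linked in the step's comment. The parallel sizes must multiply out to the launched world size (tp=2 × pp=1 × dp=4 = 8 ranks for --nproc-per-node=8). Below is a minimal sketch of that invariant, assuming torchrun's standard WORLD_SIZE environment variable; the check_world_size helper is hypothetical, not part of vLLM:

```python
import os


def check_world_size(tp: int, pp: int, dp: int) -> None:
    # torchrun exports WORLD_SIZE (= nnodes * nproc-per-node) to every rank.
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    expected = tp * pp * dp
    if expected != world_size:
        raise ValueError(
            f"tp*pp*dp={expected} does not match WORLD_SIZE={world_size}"
        )


# Mirrors the CI command: --nproc-per-node=8 with --tp-size=2 --pp-size=1 --dp-size=4
check_world_size(tp=2, pp=1, dp=4)
```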
@@ -401,7 +419,7 @@ steps:
      --ignore=lora/test_deepseekv2_tp.py \
      --ignore=lora/test_gptoss.py \
      --ignore=lora/test_qwen3moe_tp.py


  parallelism: 4

- label: PyTorch Compilation Unit Tests # 15min
@@ -1126,7 +1144,7 @@ steps:
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt


- label: NixlConnector PD accuracy tests (Distributed) # 30min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
examples/offline_inference/torchrun_dp_example.py
@@ -9,10 +9,76 @@ To run this example:
 ```bash
 $ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
 ```
+
+With custom parallelism settings:
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
 """

+import argparse
+
 from vllm import LLM, SamplingParams


+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
+
+
 # Create prompts, the same across all ranks
 prompts = [
     "Hello, my name is",
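The new flags all carry defaults that reproduce the old hard-coded configuration, so the original two-rank invocation keeps working unchanged. As a quick sanity check of the parser (an illustrative snippet that assumes the parse_args defined above is in scope; the argv values mirror the CI invocation):

```python
import sys

# Simulate the CI command line, then parse it with the parse_args added above.
sys.argv = [
    "torchrun_dp_example.py",
    "--tp-size=2",
    "--pp-size=1",
    "--dp-size=4",
    "--enable-ep",
]
args = parse_args()
assert (args.tp_size, args.pp_size, args.dp_size) == (2, 1, 4)
assert args.enable_ep is True
assert args.model == "microsoft/Phi-mini-MoE-instruct"  # default preserved
```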
@@ -30,15 +96,15 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=1,
-    data_parallel_size=2,
-    pipeline_parallel_size=1,
-    enable_expert_parallel=False,
+    model=args.model,
+    tensor_parallel_size=args.tp_size,
+    data_parallel_size=args.dp_size,
+    pipeline_parallel_size=args.pp_size,
+    enable_expert_parallel=args.enable_ep,
     distributed_executor_backend="external_launcher",
-    max_model_len=4096,
-    gpu_memory_utilization=0.6,
-    seed=1,
+    max_model_len=args.max_model_len,
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    seed=args.seed,
 )

 dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
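With distributed_executor_backend="external_launcher", every torchrun rank constructs the same LLM, and the dp_rank read at the end of the hunk is what lets each rank pick its own slice of the work. A minimal sketch of sharding the shared prompt list per data-parallel rank (the round-robin slicing is illustrative, not necessarily what the example file does):

```python
# Round-robin shard: DP rank i takes prompts i, i + dp_size, i + 2 * dp_size, ...
dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size
prompts_for_rank = prompts[dp_rank::dp_size]

outputs = llm.generate(prompts_for_rank, sampling_params)
for output in outputs:
    print(f"DP rank {dp_rank}: {output.prompt!r} -> {output.outputs[0].text!r}")
```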