mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 06:35:00 +08:00)
[CI/Build] Test torchrun with 8 cards (#27548)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
This commit is contained in:
parent a9fe0793f2
commit f7a6682872
@@ -205,6 +205,24 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd

+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
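The new step runs the example on all 8 GPUs of an H100 node; torchrun's `--nproc-per-node` has to equal the product of the parallel sizes (tp 2 × pp 1 × dp 4 = 8 ranks), since the `external_launcher` backend treats each launched process as one rank. A minimal sketch of that consistency check (illustrative only, not part of this commit; `expected_world_size` is a hypothetical helper):

```python
import os


def expected_world_size(tp_size: int, pp_size: int, dp_size: int) -> int:
    # With distributed_executor_backend="external_launcher", every rank is a
    # separate process, so torchrun must start exactly tp * pp * dp of them.
    return tp_size * pp_size * dp_size


if __name__ == "__main__":
    # torchrun exports WORLD_SIZE to every process it launches.
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    assert world_size == expected_world_size(2, 1, 4), (
        f"torchrun started {world_size} ranks, expected 8 (tp=2 * pp=1 * dp=4)"
    )
```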
@@ -9,10 +9,76 @@ To run this example:
 ```bash
 $ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
 ```
+
+With custom parallelism settings:
+
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
 """
+
+import argparse
+
 from vllm import LLM, SamplingParams
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
+
+
 # Create prompts, the same across all ranks
 prompts = [
     "Hello, my name is",
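Every torchrun-launched process executes this script end to end, so each rank parses the identical command line; rank identity comes from the environment variables torchrun sets (`RANK`, `LOCAL_RANK`, `WORLD_SIZE`), which the `external_launcher` backend reads. A minimal standalone sketch of that launch model (illustrative, not code from the example):

```python
import os
import sys

if __name__ == "__main__":
    # torchrun sets these for each process it starts; the defaults cover a
    # plain single-process invocation for local testing.
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    # All ranks receive exactly the same flags, e.g. --tp-size / --dp-size.
    print(f"rank {rank}/{world_size} argv: {sys.argv[1:]}")
```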
@@ -30,15 +96,15 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=1,
-    data_parallel_size=2,
-    pipeline_parallel_size=1,
-    enable_expert_parallel=False,
+    model=args.model,
+    tensor_parallel_size=args.tp_size,
+    data_parallel_size=args.dp_size,
+    pipeline_parallel_size=args.pp_size,
+    enable_expert_parallel=args.enable_ep,
     distributed_executor_backend="external_launcher",
-    max_model_len=4096,
-    gpu_memory_utilization=0.6,
-    seed=1,
+    max_model_len=args.max_model_len,
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    seed=args.seed,
 )

 dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
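Beyond this hunk, the example uses `dp_rank` so that each data-parallel rank works on its own share of the prompts. A minimal sketch of that pattern, continuing from the `llm`, `prompts`, `sampling_params`, and `dp_rank` objects above (the slicing shown is illustrative rather than the example's exact code):

```python
# dp_size comes from the same parallel_config that provides dp_rank.
dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size

# Round-robin shard: rank r handles prompts r, r + dp_size, r + 2 * dp_size, ...
rank_prompts = prompts[dp_rank::dp_size]

outputs = llm.generate(rank_prompts, sampling_params)
for output in outputs:
    print(f"DP rank {dp_rank}: {output.prompt!r} -> {output.outputs[0].text!r}")
```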