From ae0770fa6bcc58892692f0d1b2a3f0a300dbe82a Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 20 Dec 2025 16:48:49 -0500 Subject: [PATCH] [CI] Fix H200 Distributed test (#31054) Signed-off-by: Lucas Wilkinson --- .buildkite/test-amd.yaml | 6 +- .buildkite/test-pipeline.yaml | 6 +- .buildkite/test_areas/distributed.yaml | 4 +- examples/offline_inference/data_parallel.py | 164 +++++--------------- 4 files changed, 51 insertions(+), 129 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 9a770869b1d17..bd00c47df8cb0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1254,13 +1254,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1508,7 +1508,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f644504a5b937..3c823fc872b05 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1109,13 +1109,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1334,7 +1334,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 52d57c99fcfb5..1a3739cc2417a 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -145,7 +145,7 @@ steps: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - label: Distributed Tests (2 GPUs)(B200) @@ -171,7 +171,7 @@ steps: - tests/distributed/ - tests/examples/offline_inference/data_parallel.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" - label: Distributed NixlConnector PD accuracy (4 GPUs) timeout_in_minutes: 30 diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index be0b846995a92..bcf1ba307eff1 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -5,25 +5,25 @@ Usage: Single node: python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 + -dp=2 \ + -tp=2 Multi-node: Node 0 (assume the node has ip of 10.99.48.128): python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 \ - --node-size=2 \ + -dp=2 \ + -tp=2 \ + --nnodes=2 \ --node-rank=0 \ --master-addr=10.99.48.128 \ --master-port=13345 Node 1: python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 \ - --node-size=2 \ + -dp=2 \ + -tp=2 \ + --nnodes=2 \ --node-rank=1 \ --master-addr=10.99.48.128 \ --master-port=13345 @@ -32,103 +32,40 @@ Multi-node: import os from time import sleep -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, SamplingParams from vllm.platforms import current_platform +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.network_utils import get_open_port -def parse_args(): - import argparse +def create_parser(): + parser = FlexibleArgumentParser(description="Data Parallel Inference") - parser = argparse.ArgumentParser(description="Data Parallel Inference") - parser.add_argument( - "--model", - type=str, - default="ibm-research/PowerMoE-3b", - help="Model name or path", - ) - parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size") - parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size") - parser.add_argument( - "--node-size", type=int, default=1, help="Total number of nodes" - ) - parser.add_argument( - "--node-rank", type=int, default=0, help="Rank of the current node" - ) - parser.add_argument( - "--master-addr", type=str, default="", help="Master node IP address" - ) - parser.add_argument("--master-port", type=int, default=0, help="Master node port") - parser.add_argument( - "--enforce-eager", action="store_true", help="Enforce eager mode execution." - ) - parser.add_argument( - "--trust-remote-code", action="store_true", help="Trust remote code." - ) - parser.add_argument( - "--max-num-seqs", - type=int, - default=64, - help=("Maximum number of sequences to be processed in a single iteration."), - ) - parser.add_argument( - "--max-model-len", - type=int, - help=("Maximum number of tokens to be processed in a single iteration."), + # Add all engine args + EngineArgs.add_cli_args(parser) + parser.set_defaults( + model="ibm-research/PowerMoE-3b", + enable_expert_parallel=True, ) + + # Add timeout (not in EngineArgs) parser.add_argument( "--timeout", type=int, default=300, - help=("Number of seconds before unresponsive process is killed."), + help="Number of seconds before unresponsive process is killed.", ) - parser.add_argument( - "--gpu-memory-utilization", - type=float, - default=0.8, - help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."), - ) - parser.add_argument( - "--enable-dbo", - action="store_true", - help=("Enable microbatched execution"), - ) - parser.add_argument( - "--compilation-config", - type=int, - help=("Compilation optimization (O) mode 0-3."), - ) - parser.add_argument( - "--quantization", - type=str, - ) - parser.add_argument( - "--disable-expert-parallel", - dest="enable_expert_parallel", - action="store_false", - help="Disable expert parallel (default: enabled).", - ) - parser.set_defaults(enable_expert_parallel=True) - return parser.parse_args() + + return parser def main( - model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, dp_master_port, - GPUs_per_dp_rank, - enforce_eager, - enable_expert_parallel, - trust_remote_code, - max_num_seqs, - max_model_len, - compilation_config, - gpu_memory_utilization, - enable_dbo, - quantization, + engine_args, ): os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) @@ -173,19 +110,7 @@ def main( ) # Create an LLM. - llm = LLM( - model=model, - tensor_parallel_size=GPUs_per_dp_rank, - enforce_eager=enforce_eager, - enable_expert_parallel=enable_expert_parallel, - trust_remote_code=trust_remote_code, - max_num_seqs=max_num_seqs, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enable_dbo=enable_dbo, - quantization=quantization, - compilation_config=compilation_config, - ) + llm = LLM(**engine_args) outputs = llm.generate(prompts, sampling_params) # Print the outputs. for i, output in enumerate(outputs): @@ -204,22 +129,29 @@ def main( if __name__ == "__main__": - args = parse_args() + parser = create_parser() + args = vars(parser.parse_args()) - dp_size = args.dp_size - tp_size = args.tp_size - node_size = args.node_size - node_rank = args.node_rank + # Extract DP-specific args + dp_size = args.pop("data_parallel_size") + nnodes = args.get("nnodes", 1) + node_rank = args.get("node_rank", 0) + master_addr = args.get("master_addr", "") + master_port = args.get("master_port", 0) + timeout = args.pop("timeout") - if node_size == 1: + # Remaining args are engine args + engine_args = args + + if nnodes == 1: dp_master_ip = "127.0.0.1" dp_master_port = get_open_port() else: - dp_master_ip = args.master_addr - dp_master_port = args.master_port + dp_master_ip = master_addr + dp_master_port = master_port - assert dp_size % node_size == 0, "dp_size should be divisible by node_size" - dp_per_node = dp_size // node_size + assert dp_size % nnodes == 0, "dp_size should be divisible by nnodes" + dp_per_node = dp_size // nnodes from multiprocessing import Process @@ -235,29 +167,19 @@ if __name__ == "__main__": proc = Process( target=main, args=( - args.model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, dp_master_port, - tp_size, - args.enforce_eager, - args.enable_expert_parallel, - args.trust_remote_code, - args.max_num_seqs, - args.max_model_len, - args.compilation_config, - args.gpu_memory_utilization, - args.enable_dbo, - args.quantization, + engine_args, ), ) proc.start() procs.append(proc) exit_code = 0 for proc in procs: - proc.join(timeout=args.timeout) + proc.join(timeout=timeout) if proc.exitcode is None: print(f"Killing process {proc.pid} that didn't stop within 5 minutes.") proc.kill()