From 4bc400f47e33ef27fb69608b9ad7fe992cb8ba76 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Tue, 4 Nov 2025 02:00:46 +0900
Subject: [PATCH] [CI/Testing] Add basic single node dual batch overlap test (#27235)

Signed-off-by: Lucas Wilkinson
---
 .buildkite/test-pipeline.yaml    |  2 +
 tests/v1/distributed/test_dbo.py | 89 ++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 tests/v1/distributed/test_dbo.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a020b0d276be0..07e2bf09d8aa0 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1223,6 +1223,7 @@ steps:
   - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
@@ -1233,6 +1234,7 @@
   commands:
   - pytest -v -s tests/distributed/test_context_parallel.py
   - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+  - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min

diff --git a/tests/v1/distributed/test_dbo.py b/tests/v1/distributed/test_dbo.py
new file mode 100644
index 0000000000000..866ae742bf3c0
--- /dev/null
+++ b/tests/v1/distributed/test_dbo.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test Dual Batch Overlap (DBO) with Data Parallelism + Expert Parallelism.
+
+DBO is specifically designed for DP+EP scenarios to hide communication latency
+by overlapping computation of two batches. This test validates that DBO works
+correctly with the DeepSeek-V2-Lite model using GSM8K evaluation.
+"""
+
+import pytest
+
+from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
+from tests.utils import RemoteOpenAIServer
+
+MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+DP_SIZE = 2
+
+# GSM8K eval configuration
+NUM_QUESTIONS = 256  # Fast eval for CI, but large enough to hit the DBO thresholds
+NUM_SHOTS = 5  # Few-shot examples
+MIN_ACCURACY = 0.62  # Expected 0.64 with a 2% buffer (based on vLLM test data)
+
+# Increase max_num_seqs to trigger DBO for decode batches.
+# With 64 seqs, decode batches comfortably exceed the decode token threshold set below
+MAX_NUM_SEQS = 64  # Increased from 16 to trigger decode DBO
+
+# DeepEP backends to test
+DEEPEP_BACKENDS = [
+    "deepep_low_latency",
+    "deepep_high_throughput",
+]
+
+
+@pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS)
+def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
+    """
+    Test DBO with DP+EP using GSM8K evaluation.
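+
+    For reference, the server configuration exercised here corresponds roughly
+    to launching vLLM by hand with the same flags (a sketch only; it mirrors
+    ``server_args`` below and assumes a 2-GPU node with DeepEP installed):
+
+        vllm serve deepseek-ai/DeepSeek-V2-Lite-Chat \
+            --data-parallel-size 2 --enable-expert-parallel --enable-dbo \
+            --dbo-decode-token-threshold 16 --dbo-prefill-token-threshold 256 \
+            --all2all-backend deepep_low_latency --max-num-seqs 64 \
+            --max-model-len 4096 --trust-remote-code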
+    """
+    required_gpus = DP_SIZE
+
+    if num_gpus_available < required_gpus:
+        pytest.skip(f"Need at least {required_gpus} GPUs (DP={DP_SIZE})")
+
+    # Server arguments for DBO + DP + EP
+    server_args = [
+        "--max-model-len",
+        "4096",
+        "--max-num-seqs",
+        str(MAX_NUM_SEQS),  # Use larger batch to trigger decode DBO
+        "--trust-remote-code",
+        # Note: Not using --enforce-eager, so DBO's alternate CUDA graph dispatching is exercised
+        "--data-parallel-size",
+        str(DP_SIZE),
+        "--enable-expert-parallel",
+        "--enable-dbo",
+        # Pin the thresholds so we know DBO is triggered
+        "--dbo-decode-token-threshold",
+        "16",
+        "--dbo-prefill-token-threshold",
+        "256",
+        "--all2all-backend",
+        all2all_backend,
+    ]
+
+    with RemoteOpenAIServer(
+        MODEL_NAME,
+        server_args,
+        max_wait_seconds=600,  # Allow time for model loading with DP+EP
+    ) as remote_server:
+        # Use host and port directly from RemoteOpenAIServer
+        host = f"http://{remote_server.host}"
+        port = remote_server.port
+
+        # Run GSM8K evaluation
+        results = evaluate_gsm8k(
+            num_questions=NUM_QUESTIONS,
+            num_shots=NUM_SHOTS,
+            host=host,
+            port=port,
+        )
+
+        # Validate that accuracy is reasonable
+        accuracy = results["accuracy"]
+        assert accuracy >= MIN_ACCURACY, (
+            f"DBO+DP+EP accuracy too low ({all2all_backend}): "
+            f"{accuracy:.3f} < {MIN_ACCURACY:.3f} "
+            f"(correct: {results['num_correct']}/{results['num_questions']})"
+        )