mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-19 06:37:00 +08:00
[CI/Testing] Add basic single node dual batch overlap test (#27235)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
parent
cac4c10ef0
commit
4bc400f47e
@ -1223,6 +1223,7 @@ steps:
|
|||||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||||
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### B200 test #####
|
##### B200 test #####
|
||||||
- label: Distributed Tests (B200) # optional
|
- label: Distributed Tests (B200) # optional
|
||||||
@ -1233,6 +1234,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||||
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### RL Integration Tests #####
|
##### RL Integration Tests #####
|
||||||
- label: Prime-RL Integration Test # 15min
|
- label: Prime-RL Integration Test # 15min
|
||||||
|
|||||||
89
tests/v1/distributed/test_dbo.py
Normal file
89
tests/v1/distributed/test_dbo.py
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""
|
||||||
|
Test Dual Batch Overlap (DBO) with Data Parallelism + Expert Parallelism.
|
||||||
|
|
||||||
|
DBO is specifically designed for DP+EP scenarios to hide communication latency
|
||||||
|
by overlapping computation of two batches. This test validates that DBO works
|
||||||
|
correctly with the DeepSeek-V2-Lite model using GSM8K evaluation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
|
||||||
|
from tests.utils import RemoteOpenAIServer
|
||||||
|
|
||||||
|
MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||||
|
DP_SIZE = 2
|
||||||
|
|
||||||
|
# GSM8K eval configuration
|
||||||
|
NUM_QUESTIONS = 256 # Fast eval for CI; but must be large enough to hit dbo thresholds
|
||||||
|
NUM_SHOTS = 5 # Few-shot examples
|
||||||
|
MIN_ACCURACY = 0.62 # Expected 0.64 with 2% buffer (based on vLLM test data)
|
||||||
|
|
||||||
|
# Increase max_num_seqs to trigger DBO for decode batches
|
||||||
|
# With 64 seqs, decode batches should exceed the decode token threshold of 16 set via --dbo-decode-token-threshold in the server args
|
||||||
|
MAX_NUM_SEQS = 64 # Increased from 16 to trigger decode DBO
|
||||||
|
|
||||||
|
# DeepEP backends to test
|
||||||
|
DEEPEP_BACKENDS = [
|
||||||
|
"deepep_low_latency",
|
||||||
|
"deepep_high_throughput",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS)
def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
    """
    Run a GSM8K accuracy check against a server started with DBO + DP + EP.

    The test is parametrized over DEEPEP_BACKENDS, so one server is launched
    per all2all backend. It is skipped when ``num_gpus_available`` is below
    DP_SIZE and fails when the reported accuracy is below MIN_ACCURACY.
    """
    required_gpus = DP_SIZE
    if num_gpus_available < required_gpus:
        pytest.skip(f"Need at least {required_gpus} GPUs (DP={DP_SIZE})")

    # --enforce-eager is deliberately omitted so that DBO's alternate CUDA
    # graph dispatching is exercised.
    engine_flags = [
        "--max-model-len",
        "4096",
        "--max-num-seqs",
        str(MAX_NUM_SEQS),  # larger batch so decode batches can trigger DBO
        "--trust-remote-code",
    ]
    parallel_flags = [
        "--data-parallel-size",
        str(DP_SIZE),
        "--enable-expert-parallel",
    ]
    # Thresholds are pinned to fixed values so the test knows DBO is triggered.
    dbo_flags = [
        "--enable-dbo",
        "--dbo-decode-token-threshold",
        "16",
        "--dbo-prefill-token-threshold",
        "256",
        "--all2all-backend",
        all2all_backend,
    ]
    # Concatenation keeps the flags in the same order as a single flat list.
    server_args = engine_flags + parallel_flags + dbo_flags

    # Generous startup timeout: model loading with DP+EP takes a while.
    with RemoteOpenAIServer(
        MODEL_NAME,
        server_args,
        max_wait_seconds=600,
    ) as remote_server:
        # Point the GSM8K evaluation at the host/port the server reports.
        results = evaluate_gsm8k(
            num_questions=NUM_QUESTIONS,
            num_shots=NUM_SHOTS,
            host=f"http://{remote_server.host}",
            port=remote_server.port,
        )

        # Accuracy must not fall below the configured floor.
        accuracy = results["accuracy"]
        assert accuracy >= MIN_ACCURACY, (
            f"DBO+DP+EP accuracy too low ({all2all_backend}): "
            f"{accuracy:.3f} < {MIN_ACCURACY:.3f} "
            f"(correct: {results['num_correct']}/{results['num_questions']})"
        )
|
||||||
Loading…
x
Reference in New Issue
Block a user