From 41b6f9200fef27cd43b3299408a6f0e50654931f Mon Sep 17 00:00:00 2001
From: Elizabeth Thomas
Date: Thu, 18 Dec 2025 13:46:28 -0600
Subject: [PATCH] Remove all2all backend envvar (#30363)

Signed-off-by: Elizabeth Thomas
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../qwen30b_a3b_fp8_block_ep_eplb.sh          |  2 +-
 .../qwen3_next_mtp_async_eplb.sh              |  2 +-
 .buildkite/test-amd.yaml                      |  2 +-
 .buildkite/test-pipeline.yaml                 |  2 +-
 .buildkite/test_areas/distributed.yaml        |  2 +-
 docs/design/moe_kernel_features.md            |  2 +-
 .../elastic_ep/serve_deepseek_v2.sh           |  2 +-
 tests/v1/cudagraph/test_cudagraph_dispatch.py |  5 +-
 vllm/config/compilation.py                    |  8 ++-
 vllm/config/parallel.py                       | 49 +++++++++----------
 vllm/engine/arg_utils.py                      |  2 +-
 vllm/envs.py                                  |  5 +-
 12 files changed, 40 insertions(+), 43 deletions(-)

diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
index 6a1bef275d047..d0921c5699d5d 100644
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -44,10 +44,10 @@ trap cleanup EXIT
 
 for BACK in "${BACKENDS[@]}"; do
   VLLM_DEEP_GEMM_WARMUP=skip \
-  VLLM_ALL2ALL_BACKEND=$BACK \
   vllm serve "$MODEL" \
     --enforce-eager \
     --enable-eplb \
+    --all2all-backend $BACK \
     --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
     --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
     --data-parallel-size ${DATA_PARALLEL_SIZE} \
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
index 937a43d1a3221..b3b65128e6062 100644
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -43,12 +43,12 @@ trap cleanup EXIT
 
 for BACK in "${BACKENDS[@]}"; do
   VLLM_DEEP_GEMM_WARMUP=skip \
-  VLLM_ALL2ALL_BACKEND=$BACK \
   vllm serve "$MODEL" \
     --enforce-eager \
     --tensor-parallel-size 4 \
     --enable-expert-parallel \
     --enable-eplb \
+    --all2all-backend $BACK \
     --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
     --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
     --trust-remote-code \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index f294261ec8c3a..6e20ff3bf38d9 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1497,7 +1497,7 @@ steps:
   - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
-  - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
   - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8e3bcfe4a36bc..faf34d95735f4 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1331,7 +1331,7 @@ steps:
   - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
-  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
   - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 2cc90698d916a..52d57c99fcfb5 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -145,7 +145,7 @@ steps:
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
-  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
   - pytest -v -s tests/v1/distributed/test_dbo.py
 
 - label: Distributed Tests (2 GPUs)(B200)
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 48341d199cb80..6c02dcb76bec2 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -16,7 +16,7 @@ Async backends support the use of DBO (Dual Batch Overlap) and shared expert ove
 
 Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass. For non-modular kernels, it is up to the experts function to deal with this flag.
 
-Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP.
+Unless otherwise specified, backends are controlled via the `--all2all-backend` command-line argument (or the `all2all_backend` parameter in `ParallelConfig`). All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP.
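
For downstream launch scripts, the migration this patch implies is a one-line swap of an environment variable for a CLI flag. A minimal sketch, reusing the model and backend values that appear in the CI commands above purely as illustrative choices:

    # Before (removed in this patch): backend selected via environment variable
    #   VLLM_ALL2ALL_BACKEND=deepep_high_throughput vllm serve Qwen/Qwen1.5-MoE-A2.7B ...
    #
    # After: backend selected via the engine CLI flag; deepep_high_throughput
    # still requires expert parallelism to be enabled.
    vllm serve Qwen/Qwen1.5-MoE-A2.7B \
        --enable-expert-parallel \
        --tensor-parallel-size 1 \
        --data-parallel-size 2 \
        --all2all-backend deepep_high_throughput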