From f242cfcdd5f1db4e005503a02a1317369d2a8e3d Mon Sep 17 00:00:00 2001
From: zhrrr <43847754+izhuhaoran@users.noreply.github.com>
Date: Tue, 25 Nov 2025 15:31:07 +0800
Subject: [PATCH] [Perf] use cpu all reduce to avoid sync when async_scheduling & dp > 1 (#29311)

Signed-off-by: zhuhaoran
---
 vllm/engine/arg_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 3cb76fc63f69c..8338e54d4fd85 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1570,6 +1570,12 @@ class EngineArgs:
             model_config.skip_tokenizer_init = True
             logger.info("Skipping tokenizer initialization for tokens-only mode.")
 
+        if self.async_scheduling and not self.disable_nccl_for_dp_synchronization:
+            logger.info(
+                "Disabling NCCL for DP synchronization when using async scheduling."
+            )
+            self.disable_nccl_for_dp_synchronization = True
+
         # Forward the deprecated CLI args to the EPLB config.
         if self.num_redundant_experts is not None:
             self.eplb_config.num_redundant_experts = self.num_redundant_experts