From f242cfcdd5f1db4e005503a02a1317369d2a8e3d Mon Sep 17 00:00:00 2001
From: zhrrr <43847754+izhuhaoran@users.noreply.github.com>
Date: Tue, 25 Nov 2025 15:31:07 +0800
Subject: [PATCH] [Perf] use cpu all reduce to avoid sync when async_scheduling
 & dp > 1 (#29311)

Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
---
 vllm/engine/arg_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 3cb76fc63f69c..8338e54d4fd85 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1570,6 +1570,12 @@ class EngineArgs:
             model_config.skip_tokenizer_init = True
             logger.info("Skipping tokenizer initialization for tokens-only mode.")
 
+        if self.async_scheduling and not self.disable_nccl_for_dp_synchronization:
+            logger.info(
+                "Disabling NCCL for DP synchronization when using async scheduling."
+            )
+            self.disable_nccl_for_dp_synchronization = True
+
         # Forward the deprecated CLI args to the EPLB config.
         if self.num_redundant_experts is not None:
             self.eplb_config.num_redundant_experts = self.num_redundant_experts