[Perf] Use CPU all-reduce to avoid sync when async_scheduling & dp > 1 (#29311)

Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
2026-07-21 08:47:10 +08:00 · 2025-11-25 15:31:07 +08:00 · 2025-11-25 15:31:07 +08:00 · f242cfcdd5
commit f242cfcdd5
parent 888152bf87
1 changed file with 6 additions and 0 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1570,6 +1570,12 @@ class EngineArgs:
            model_config.skip_tokenizer_init = True
            logger.info("Skipping tokenizer initialization for tokens-only mode.")

+        if self.async_scheduling and not self.disable_nccl_for_dp_synchronization:
+            logger.info(
+                "Disabling NCCL for DP synchronization when using async scheduling."
+            )
+            self.disable_nccl_for_dp_synchronization = True
+
        # Forward the deprecated CLI args to the EPLB config.
        if self.num_redundant_experts is not None:
            self.eplb_config.num_redundant_experts = self.num_redundant_experts