From 9d6235ca9a36e76911045999ed72e3c8aad66b8a Mon Sep 17 00:00:00 2001
From: Ming Yang
Date: Mon, 8 Dec 2025 16:29:36 -0800
Subject: [PATCH] [moe] Allow disabling DP chunking (#29936)

Signed-off-by: Ming Yang
---
 vllm/envs.py                                  | 5 +++++
 vllm/model_executor/layers/fused_moe/layer.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 37711dece9abc..91d1b01076b11 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -144,6 +144,7 @@ if TYPE_CHECKING:
     VLLM_DP_MASTER_IP: str = ""
     VLLM_DP_MASTER_PORT: int = 0
     VLLM_MOE_DP_CHUNK_SIZE: int = 256
+    VLLM_ENABLE_MOE_DP_CHUNK: bool = True
     VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
     VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict"
     VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
@@ -1101,6 +1102,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # rank. All DP ranks process the activations in VLLM_MOE_DP_CHUNK_SIZE
     # units.
     "VLLM_MOE_DP_CHUNK_SIZE": lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")),
+    # Whether to enable MoE DP chunking. Set to 0 to disable it (default: 1).
+    "VLLM_ENABLE_MOE_DP_CHUNK": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_MOE_DP_CHUNK", "1"))
+    ),
     # Randomize inputs during dummy runs when using Data Parallel
     "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: os.environ.get(
         "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 9b4d77a060c29..5df3486093cd9 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -753,7 +753,7 @@ class FusedMoE(CustomOp):
             self.moe_parallel_config.use_pplx_kernels
             or self.moe_parallel_config.use_deepep_ll_kernels
             or (self.dp_size > 1 and self.use_flashinfer_cutlass_kernels)
-        )
+        ) and envs.VLLM_ENABLE_MOE_DP_CHUNK

     @property
     def is_internal_router(self) -> bool: