[moe] Allow disabling DP chunking (#29936)

Signed-off-by: Ming Yang <minos.future@gmail.com>
2026-06-01 19:51:19 +08:00 · 2025-12-08 16:29:36 -08:00 · 2025-12-08 16:29:36 -08:00 · 9d6235ca9a
commit 9d6235ca9a
parent f1599ca55d
2 changed files with 5 additions and 1 deletion
--- a/vllm/envs.py
+++ b/vllm/envs.py
@ -144,6 +144,7 @@ if TYPE_CHECKING:
    VLLM_DP_MASTER_IP: str = ""
    VLLM_DP_MASTER_PORT: int = 0
    VLLM_MOE_DP_CHUNK_SIZE: int = 256
    VLLM_ENABLE_MOE_DP_CHUNK: bool = True
    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
    VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict"
    VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
@ -1101,6 +1102,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # rank. All DP ranks process the activations in VLLM_MOE_DP_CHUNK_SIZE
    # units. Set VLLM_ENABLE_MOE_DP_CHUNK=0 to disable this chunking.
    "VLLM_MOE_DP_CHUNK_SIZE": lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")),
    "VLLM_ENABLE_MOE_DP_CHUNK": lambda: bool(
        int(os.getenv("VLLM_ENABLE_MOE_DP_CHUNK", "1"))
    ),
    # Randomize inputs during dummy runs when using Data Parallel
    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: os.environ.get(
        "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@ -753,7 +753,7 @@ class FusedMoE(CustomOp):
            self.moe_parallel_config.use_pplx_kernels
            or self.moe_parallel_config.use_deepep_ll_kernels
            or (self.dp_size > 1 and self.use_flashinfer_cutlass_kernels)
-        )
+        ) and envs.VLLM_ENABLE_MOE_DP_CHUNK
    @property
    def is_internal_router(self) -> bool: