[BugFix] Partial revert of #29558 (DeepEP HT + PIECEWISE CG support) (#30910)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> (cherry picked from commit 30bb19a760d6d5e8c69b3a4c78c9cb7430872a61)
2026-07-08 17:47:09 +08:00 · 2025-12-18 02:50:15 -05:00 · 2025-12-18 02:50:15 -05:00 · 2c0ee0fde8
commit 2c0ee0fde8
parent 55f1fc1b1b
2 changed files with 14 additions and 74 deletions
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -233,24 +233,6 @@ def test_splitting_ops_dynamic():
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
 def test_moe_splitting_ops_deepep_ht_piecewise():
    # Non-inductor, non-attn-fusion case: DeepEP HT with dp>1
    # should add MoE ops to splitting_ops on top of attention ops.
    config = VllmConfig(
        parallel_config=ParallelConfig(
            all2all_backend="deepep_high_throughput",
            data_parallel_size=8,
        ),
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
        ),
    )
    splitting_ops = config.compilation_config.splitting_ops
    assert splitting_ops is not None
    assert "vllm::moe_forward" in splitting_ops
    assert "vllm::moe_forward_shared" in splitting_ops
 def test_moe_splitting_ops_deepep_ht_inductor_partition():
    # Inductor partition case: user-provided splitting_ops should be
    # preserved and MoE ops should be appended for DeepEP HT with dp>1.
@@ -277,26 +259,6 @@ def test_moe_splitting_ops_deepep_ht_inductor_partition():
    ]
 def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
    # Pure attn-fusion case without inductor partition: even with
    # DeepEP HT and dp>1, we should not re-enable piecewise compilation
    # or add MoE ops into splitting_ops.
    config = VllmConfig(
        parallel_config=ParallelConfig(
            all2all_backend="deepep_high_throughput",
            data_parallel_size=8,
        ),
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            pass_config={"fuse_attn_quant": True, "eliminate_noops": True},
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
        ),
    )
    assert config.compilation_config.splitting_ops == []
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL
 def test_should_split():
    import torch
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -915,8 +915,6 @@ class CompilationConfig:
            "mode is CompilationMode.VLLM_COMPILE"
        )
        added_default_splitting_ops = False
        if self.pass_config.fuse_attn_quant and not self.use_inductor_graph_partition:
            self.set_splitting_ops_for_attn_fusion()
        else:
@@ -930,7 +928,6 @@ class CompilationConfig:
                # for details. Make a copy to avoid mutating the class-level
                # list via reference.
                self.splitting_ops = list(self._attention_ops)
                added_default_splitting_ops = True
            elif len(self.splitting_ops) == 0:
                if (
                    self.cudagraph_mode == CUDAGraphMode.PIECEWISE
@@ -958,44 +955,25 @@ class CompilationConfig:
                    self.cudagraph_mode = CUDAGraphMode.FULL
                self.splitting_ops = []
-        # split MoE ops for cudagraph
+        # Disable CUDA graphs for DeepEP high-throughput since it's not CG compatible
        moe_ops = [
            "vllm::moe_forward",
            "vllm::moe_forward_shared",
        ]
        backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND
        dp_size = data_parallel_size if data_parallel_size is not None else 1
-        need_moe_splitting = (
+        if (
            backend == "deepep_high_throughput"
            and dp_size > 1
-            # pure attn-fusion without inductor partition deliberately disables
+            and self.cudagraph_mode != CUDAGraphMode.NONE
-            # piecewise graphs and MoE splitting.
+        ):
-            and not (
+            # TODO: Piecewise CUDA graphs might be enabled
-                self.pass_config.fuse_attn_quant
+            # if the torch.compile cache key issue is fixed
-                and not self.use_inductor_graph_partition
+            # See https://github.com/vllm-project/vllm/pull/25093
            logger.info(
                "DeepEP: Disabling CUDA Graphs since DeepEP high-throughput kernels "
                "are optimized for prefill and are incompatible with CUDA Graphs. "
                "In order to use CUDA Graphs for decode-optimized workloads, "
                "use --all2all-backend with another option, such as "
                "deepep_low_latency, pplx, or allgather_reducescatter."
            )
-        )
+            self.cudagraph_mode = CUDAGraphMode.NONE
        if need_moe_splitting and self.cudagraph_mode != CUDAGraphMode.NONE:
            # if we just initialized default splitting_ops for this config,
            # automatically append the MoE ops
            if added_default_splitting_ops:
                for op in moe_ops:
                    if op not in self.splitting_ops:
                        self.splitting_ops.append(op)
            # make sure MoE ops are split out
            if not any(op in self.splitting_ops for op in moe_ops):
                self.cudagraph_mode = CUDAGraphMode.NONE
                logger.warning_once(
                    "DeepEP high throughput backend with data_parallel_size > 1 "
                    "requires splitting MoE ops from cudagraphs. Please ensure "
                    "'vllm::moe_forward' or 'vllm::moe_forward_shared' are "
                    "present in CompilationConfig.splitting_ops."
                )
            elif self.cudagraph_mode.has_full_cudagraphs():
                # fall back to piecewise when MoE splitting is required.
                self.cudagraph_mode = CUDAGraphMode.PIECEWISE
    def set_splitting_ops_for_attn_fusion(self):
        assert self.pass_config.fuse_attn_quant