[Feature] Full CUDA Graph Support for Cutlass MLA and 6% E2E Throughput Improvement (#22763)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Wentao Ye 2025-08-15 02:27:30 -04:00 committed by GitHub
parent b4cef5e6c7
commit 5c3fbfe46b
2 changed files with 88 additions and 2 deletions
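
For context, enabling the feature end to end looks roughly like the test added below. This is a minimal sketch, assuming a Blackwell (SM100) GPU; the model name and generation settings are taken from the test fixture in this commit, and CompilationConfig is imported from vllm.config as elsewhere in vLLM.

    import os

    # Cutlass MLA is selected via environment variables, as in the test
    # fixture added by this commit.
    os.environ["VLLM_USE_V1"] = "1"
    os.environ["VLLM_ATTENTION_BACKEND"] = "CUTLASS_MLA"

    from vllm import LLM, SamplingParams
    from vllm.config import CompilationConfig

    # full_cuda_graph=True asks the engine to capture one CUDA graph for
    # the whole forward pass (decode-only for this backend) rather than
    # piecewise graphs that break around attention.
    llm = LLM(
        model="deepseek-ai/DeepSeek-V2-Lite",
        trust_remote_code=True,
        max_model_len=1024,
        compilation_config=CompilationConfig(full_cuda_graph=True),
    )

    out = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=8))
    print(out[0].outputs[0].text)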


@@ -66,6 +66,80 @@ def llm_pair(request):
     )
 
 
+@pytest.fixture(scope="class")
+def cutlass_mla_llm_pair(request):
+    model = request.param
+
+    # force V1 engine and Cutlass MLA backend
+    with temporary_environ({
+            "VLLM_USE_V1": "1",
+            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
+            "FORCE_NUM_KV_SPLITS":
+            "1",  # TODO: remove this when hang issue is fixed
+    }):
+        full = LLM(
+            model=model,
+            gpu_memory_utilization=0.45,
+            trust_remote_code=True,
+            max_model_len=1024,
+            compilation_config=CompilationConfig(
+                full_cuda_graph=True,
+                cudagraph_capture_sizes=[16, 32, 64, 128, 256, 512],
+            ),
+        )
+        piecewise = LLM(
+            model=model,
+            gpu_memory_utilization=0.45,
+            trust_remote_code=True,
+            max_model_len=1024,
+            compilation_config=CompilationConfig(),
+        )
+
+    yield weakref.proxy(full), weakref.proxy(piecewise)
+    del full
+    del piecewise
+
+    wait_for_gpu_memory_to_clear(
+        devices=[0],
+        threshold_ratio=0.1,
+    )
+
+
+@pytest.mark.parametrize(
+    "cutlass_mla_llm_pair",
+    [
+        # use an MLA model
+        "deepseek-ai/DeepSeek-V2-Lite",
+    ],
+    indirect=True)
+@pytest.mark.skipif(current_platform.get_device_capability() != (10, 0),
+                    reason="Only Blackwell GPUs support Cutlass MLA")
+class TestFullCUDAGraphCutlassMLA:
+    """
+    Validate full CUDA Graph with Cutlass MLA (decode-only capture).
+    """
+
+    @pytest.mark.parametrize(("batch_size", "max_tokens"), [
+        (8, 8),
+    ])
+    def test_full_cudagraph_sm100_cutlass_mla(
+            self, batch_size, max_tokens,
+            cutlass_mla_llm_pair: tuple[LLM, LLM]):
+        full_cudagraph_llm, piecewise_llm = cutlass_mla_llm_pair
+
+        prompts = ["Hello, my name is"] * batch_size
+        sampling_params = SamplingParams(temperature=0.0,
+                                         max_tokens=max_tokens,
+                                         top_p=0.95)
+
+        piecewise_responses = piecewise_llm.generate(prompts, sampling_params)
+        full_responses = full_cudagraph_llm.generate(prompts, sampling_params)
+
+        for piecewise_res, full_res in zip(piecewise_responses,
+                                           full_responses):
+            assert piecewise_res.outputs[0].text == full_res.outputs[0].text
+
+
 @pytest.mark.parametrize(
     "llm_pair",
     [

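The fixture above relies on a temporary_environ context manager defined in the test module. One plausible implementation, shown here only to make the fixture self-explanatory (the actual helper may differ), is:

    import contextlib
    import os

    @contextlib.contextmanager
    def temporary_environ(env_vars):
        """Temporarily set environment variables, restoring the previous
        values (or unsetting added keys) on exit."""
        saved = {k: os.environ.get(k) for k in env_vars}
        os.environ.update(env_vars)
        try:
            yield
        finally:
            for k, v in saved.items():
                if v is None:
                    os.environ.pop(k, None)
                else:
                    os.environ[k] = v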

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Optional
+from typing import ClassVar, Optional
 
 import torch
@@ -12,11 +12,19 @@ from vllm.attention.backends.abstract import (AttentionType,
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
                                                    MLACommonImpl,
-                                                   MLACommonMetadata)
+                                                   MLACommonMetadata,
+                                                   MLACommonMetadataBuilder)
+from vllm.v1.attention.backends.utils import AttentionCGSupport
 
 logger = init_logger(__name__)
 
 
+class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
+    # enable full CUDA Graph support for decode-only capture
+    attn_cudagraph_support: ClassVar[
+        AttentionCGSupport] = AttentionCGSupport.PURE_DECODE_ONLY
+
+
 class CutlassMLABackend(MLACommonBackend):
 
     @staticmethod
@@ -27,6 +35,10 @@ class CutlassMLABackend(MLACommonBackend):
     def get_impl_cls() -> type["CutlassMLAImpl"]:
         return CutlassMLAImpl
 
+    @staticmethod
+    def get_builder_cls() -> type["CutlassMLAMetadataBuilder"]:
+        return CutlassMLAMetadataBuilder
+
 
 class SM100Workspace:
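
PURE_DECODE_ONLY means the builder advertises full-graph capture only for batches that contain no prefill tokens; mixed or prefill batches still run piecewise. A toy illustration of the kind of gate this declaration enables follows; the enum members other than PURE_DECODE_ONLY and the check itself are assumptions for illustration, not vLLM's actual dispatch code.

    from enum import Enum

    class AttentionCGSupport(Enum):
        # Assumed member names for the sketch; only PURE_DECODE_ONLY is
        # confirmed by the diff above.
        NEVER = 0
        PURE_DECODE_ONLY = 1
        ALWAYS = 2

    def can_capture_full_graph(builder_support: AttentionCGSupport,
                               is_pure_decode_batch: bool) -> bool:
        """Return whether a full CUDA graph may be captured for this batch."""
        if builder_support is AttentionCGSupport.ALWAYS:
            return True
        if builder_support is AttentionCGSupport.PURE_DECODE_ONLY:
            # Cutlass MLA only supports full-graph capture for
            # decode-only batches.
            return is_pure_decode_batch
        return False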