[Perf] Add decode full-graph support to FlashInfer-MLA backend (#26313)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
parent f231e5bc21
commit f77df94647
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
+from typing import ClassVar, Optional, Union
 
 import torch
 from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla
@@ -12,13 +12,20 @@ from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonImpl,
     MLACommonMetadata,
     MLACommonMetadataBuilder,
 )
+from vllm.v1.attention.backends.utils import AttentionCGSupport
 
 logger = init_logger(__name__)
 
 FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024
 
 
+class FlashInferMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
+    # enable full CUDA Graph support for decode-only capture
+    cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH
+
+
 class FlashInferMLABackend(MLACommonBackend):
     @staticmethod
     def get_name() -> str:
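The heart of the change is the hunk above: the new FlashInferMLAMetadataBuilder advertises decode-only full CUDA graph capture through the class-level cudagraph_support flag. As a rough sketch of how a caller might act on that flag (the gating function, its arguments, and the module path are assumptions, not code from this commit; only AttentionCGSupport, UNIFORM_BATCH, and the builder class come from the diff):

# Hedged sketch: hypothetical runner-side gating, not vLLM's actual code.
from vllm.v1.attention.backends.utils import AttentionCGSupport
# Assumed module path for the file this diff touches:
from vllm.v1.attention.backends.mla.flashinfer_mla import (
    FlashInferMLAMetadataBuilder,
)


def may_capture_full_graph(builder_cls, uniform_decode_batch: bool) -> bool:
    # A builder declaring UNIFORM_BATCH support can be captured and replayed
    # as one full CUDA graph, but only for batches where every request is at
    # the same (decode) query length.
    if builder_cls.cudagraph_support == AttentionCGSupport.UNIFORM_BATCH:
        return uniform_decode_batch
    return False


assert may_capture_full_graph(FlashInferMLAMetadataBuilder, True)
assert not may_capture_full_graph(FlashInferMLAMetadataBuilder, False)

The second hunk below wires this builder into the backend so generic engine code can discover it.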
@@ -28,6 +35,10 @@ class FlashInferMLABackend(MLACommonBackend):
     def get_impl_cls() -> type["FlashInferMLAImpl"]:
         return FlashInferMLAImpl
 
+    @staticmethod
+    def get_builder_cls() -> type["FlashInferMLAMetadataBuilder"]:
+        return FlashInferMLAMetadataBuilder
+
 
 g_fi_workspace = torch.zeros(
     FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE,
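A quick usage sketch of the new accessor (class names are from the diff; the module path is the same assumption as above):

# Sketch of the new wiring exposed by get_builder_cls().
from vllm.v1.attention.backends.mla.flashinfer_mla import FlashInferMLABackend

builder_cls = FlashInferMLABackend.get_builder_cls()
print(builder_cls.__name__)           # FlashInferMLAMetadataBuilder
print(builder_cls.cudagraph_support)  # AttentionCGSupport.UNIFORM_BATCH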