Sage Moore's fixes for full CUDA graph support for DeepEP+DeepGEMM LL

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2026-07-11 18:07:15 +08:00 · 2025-06-24 11:21:52 -04:00 · 2025-06-24 11:21:52 -04:00 · e53382cc2e
commit e53382cc2e
parent 26d34eb67e
1 changed file with 2 additions and 1 deletion
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -75,7 +75,8 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
            1, # MQA for the decode path
        )

-        if self.runner.full_cuda_graph:
+        n = num_splits.size(0)
+        if self.runner.full_cuda_graph and (n-1) <= self.runner.cudagraph_batch_sizes[-1]:
            # First time around (CUDAGraph capture), allocate the static buffer
            if self.cg_buf_tile_scheduler_metadata is None:
                self.cg_buf_tile_scheduler_metadata = tile_scheduler_metadata