From 143b09e6bed30d6f3f4ce44f0b31242a1b95db05 Mon Sep 17 00:00:00 2001
From: Sage Moore
Date: Wed, 13 Aug 2025 15:00:40 -0400
Subject: [PATCH] fix full cudagraphs for cutlass mla

Signed-off-by: Sage Moore
---
 vllm/v1/attention/backends/mla/cutlass_mla.py | 9 +++------
 vllm/v1/worker/gpu_model_runner.py            | 4 ----
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index 3353aaf760c69..b076613c8645a 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -14,18 +14,15 @@ from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
                                                    MLACommonImpl,
                                                    MLACommonMetadata,
                                                    MLACommonMetadataBuilder)
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backends.utils import AttentionCGSupport
 
 logger = init_logger(__name__)
 
 
 class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
     # enable full CUDA Graph support for decode-only capture
-    full_cudagraph_supported: ClassVar[bool] = True  # Decode-only
-
-    def can_run_in_cudagraph(
-            self, common_attn_metadata: CommonAttentionMetadata) -> bool:
-        return common_attn_metadata.max_query_len == 1
+    attn_cudagraph_support: ClassVar[
+        AttentionCGSupport] = AttentionCGSupport.PURE_DECODE_ONLY
 
 
 class CutlassMLABackend(MLACommonBackend):
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index f0f4942677e76..dcf8cf158e307 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2031,7 +2031,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
-<<<<<<< HEAD
         # when DBO is enabled, `num_tokens_after_padding`
         # represents the per-ubatch DP token count.
         dp_tokens_for_forward = num_tokens_after_padding
@@ -2045,9 +2044,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 num_tokens_across_dp=dp_tokens_for_forward,
                 skip_cuda_graphs=skip_cuda_graphs):
             self.maybe_setup_kv_connector(scheduler_output)
-=======
-            self.maybe_setup_kv_connector(scheduler_output)
->>>>>>> db77e4a92 (revert kv connector fix)
             model_output = self._run_model(
                 attn_metadata=attn_metadata,
                 num_scheduled_tokens=num_input_tokens,