From b1361c7273f60ca244e5425bdb7a9120057327fe Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 13 Aug 2025 00:22:05 -0400 Subject: [PATCH] [Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738) Signed-off-by: mgoin --- vllm/platforms/cuda.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 70959131573f9..63f6b373c322f 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -152,6 +152,9 @@ class CudaPlatformBase(Platform): if cls.is_device_capability(100): # Blackwell => Force CutlassMLA. use_cutlass_mla = True + # TODO: This does not work, because the + # global_force_attn_backend_context_manager is not set. + # See vllm/attention/selector.py:_cached_get_attn_backend envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA" else: # Not Blackwell @@ -217,7 +220,9 @@ class CudaPlatformBase(Platform): if use_mla: # TODO(lucas): refactor to be more concise # we should probably consider factoring out V1 here - if selected_backend == _Backend.CUTLASS_MLA: + if selected_backend == _Backend.CUTLASS_MLA or ( + cls.is_device_capability(100) and selected_backend is None + and block_size == 128): if use_v1: logger.info_once("Using Cutlass MLA backend on V1 engine.") return ("vllm.v1.attention.backends.mla."