[Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738)

Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin 2025-08-13 00:22:05 -04:00 committed by GitHub
parent 4f0f844b16
commit b1361c7273
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -152,6 +152,9 @@ class CudaPlatformBase(Platform):
if cls.is_device_capability(100):
# Blackwell => Force CutlassMLA.
use_cutlass_mla = True
+# TODO: This does not work, because the
+# global_force_attn_backend_context_manager is not set.
+# See vllm/attention/selector.py:_cached_get_attn_backend
envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA"
else:
# Not Blackwell
@@ -217,7 +220,9 @@ class CudaPlatformBase(Platform):
if use_mla:
# TODO(lucas): refactor to be more concise
# we should probably consider factoring out V1 here
-if selected_backend == _Backend.CUTLASS_MLA:
+if selected_backend == _Backend.CUTLASS_MLA or (
+cls.is_device_capability(100) and selected_backend is None
+and block_size == 128):
if use_v1:
logger.info_once("Using Cutlass MLA backend on V1 engine.")
return ("vllm.v1.attention.backends.mla."