diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 39101c43142f7..915392a4125f9 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -182,8 +182,8 @@ class CudaPlatformBase(Platform): if vllm_config.attention_config.backend is None: # Default case - if cls.is_device_capability(100): - # Blackwell => Force CutlassMLA. + if cls.is_device_capability(100) and not use_sparse: + # Blackwell => Force CutlassMLA (unless sparse, i.e. DSv3.2). use_cutlass_mla = True # Set the backend in AttentionConfig so it's used during # backend selection