[Bug] Raise error explicitly if using incompatible backend (#27424)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Author: Wentao Ye
Date: 2025-10-29 13:29:20 -04:00 (committed by GitHub)
parent f7a6682872
commit 5b0448104f

@@ -261,6 +261,21 @@ class CudaPlatformBase(Platform):
         from vllm.attention.backends.registry import _Backend
         if use_mla:
+            # explicitly reject non-MLA backends when MLA is enabled to avoid
+            # silently selecting an incompatible backend (e.g., FLASHINFER).
+            if selected_backend in {
+                _Backend.FLASHINFER,
+                _Backend.FLASH_ATTN,
+                _Backend.TRITON_ATTN,
+                _Backend.TREE_ATTN,
+                _Backend.XFORMERS,
+            }:
+                raise ValueError(
+                    f"Attention backend {selected_backend} incompatible with MLA. "
+                    "Please use one of the MLA backends: FLASHINFER_MLA, CUTLASS_MLA, "
+                    "FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set "
+                    "VLLM_MLA_DISABLE=1 to disable MLA for this model."
+                )
             if not use_v1:
                 raise RuntimeError(
                     "MLA attention backends require the V1 engine. "