From 5b0448104fa16ca28711e715f4866d9b2805f171 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 29 Oct 2025 13:29:20 -0400
Subject: [PATCH] [Bug] Raise error explicitly if using incompatible backend
 (#27424)

Signed-off-by: yewentao256
---
 vllm/platforms/cuda.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 66cffde9503d..cc06f034fba3 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -261,6 +261,21 @@ class CudaPlatformBase(Platform):
         from vllm.attention.backends.registry import _Backend
 
         if use_mla:
+            # explicitly reject non-MLA backends when MLA is enabled to avoid
+            # silently selecting an incompatible backend (e.g., FLASHINFER).
+            if selected_backend in {
+                _Backend.FLASHINFER,
+                _Backend.FLASH_ATTN,
+                _Backend.TRITON_ATTN,
+                _Backend.TREE_ATTN,
+                _Backend.XFORMERS,
+            }:
+                raise ValueError(
+                    f"Attention backend {selected_backend} incompatible with MLA. "
+                    "Please use one of the MLA backends: FLASHINFER_MLA, CUTLASS_MLA, "
+                    "FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set "
+                    "VLLM_MLA_DISABLE=1 to disable MLA for this model."
+                )
             if not use_v1:
                 raise RuntimeError(
                     "MLA attention backends require the V1 engine. "
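
For illustration only, here is a minimal standalone sketch of the fail-fast check this patch introduces. The Backend enum and validate_mla_backend helper below are hypothetical stand-ins that merely mirror the backend names in the diff; they are not vLLM APIs and are not part of the patch.

    from enum import Enum, auto


    class Backend(Enum):
        # Illustrative subset of the backend names referenced in the diff.
        FLASHINFER = auto()
        FLASH_ATTN = auto()
        TRITON_ATTN = auto()
        TREE_ATTN = auto()
        XFORMERS = auto()
        FLASHINFER_MLA = auto()
        CUTLASS_MLA = auto()
        FLASHMLA = auto()
        FLASH_ATTN_MLA = auto()
        TRITON_MLA = auto()


    # Non-MLA backends that must be rejected when MLA is enabled.
    _NON_MLA_BACKENDS = {
        Backend.FLASHINFER,
        Backend.FLASH_ATTN,
        Backend.TRITON_ATTN,
        Backend.TREE_ATTN,
        Backend.XFORMERS,
    }


    def validate_mla_backend(selected_backend: Backend, use_mla: bool) -> None:
        """Raise early instead of silently using an incompatible backend."""
        if use_mla and selected_backend in _NON_MLA_BACKENDS:
            raise ValueError(
                f"Attention backend {selected_backend.name} incompatible with "
                "MLA. Use an MLA backend (e.g. FLASHINFER_MLA) or set "
                "VLLM_MLA_DISABLE=1."
            )


    # Example: explicitly selecting FLASHINFER for an MLA model now fails fast
    # with a clear error instead of proceeding with a backend that cannot work.
    try:
        validate_mla_backend(Backend.FLASHINFER, use_mla=True)
    except ValueError as e:
        print(e)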