[Bug] Raise error explicitly if using incompatible backend (#27424)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Author: Wentao Ye
Date: 2025-10-29 13:29:20 -04:00 (committed by GitHub)
parent f7a6682872
commit 5b0448104f

@@ -261,6 +261,21 @@ class CudaPlatformBase(Platform):
         from vllm.attention.backends.registry import _Backend
         if use_mla:
+            # explicitly reject non-MLA backends when MLA is enabled to avoid
+            # silently selecting an incompatible backend (e.g., FLASHINFER).
+            if selected_backend in {
+                _Backend.FLASHINFER,
+                _Backend.FLASH_ATTN,
+                _Backend.TRITON_ATTN,
+                _Backend.TREE_ATTN,
+                _Backend.XFORMERS,
+            }:
+                raise ValueError(
+                    f"Attention backend {selected_backend} incompatible with MLA. "
+                    "Please use one of the MLA backends: FLASHINFER_MLA, CUTLASS_MLA, "
+                    "FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set "
+                    "VLLM_MLA_DISABLE=1 to disable MLA for this model."
+                )
             if not use_v1:
                 raise RuntimeError(
                     "MLA attention backends require the V1 engine. "