From 5b0448104fa16ca28711e715f4866d9b2805f171 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 29 Oct 2025 13:29:20 -0400
Subject: [PATCH] [Bug] Raise error explicitly if using incompatible backend
 (#27424)

Signed-off-by: yewentao256
---
 vllm/platforms/cuda.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 66cffde9503d..cc06f034fba3 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -261,6 +261,21 @@ class CudaPlatformBase(Platform):
         from vllm.attention.backends.registry import _Backend
 
         if use_mla:
+            # explicitly reject non-MLA backends when MLA is enabled to avoid
+            # silently selecting an incompatible backend (e.g., FLASHINFER).
+            if selected_backend in {
+                _Backend.FLASHINFER,
+                _Backend.FLASH_ATTN,
+                _Backend.TRITON_ATTN,
+                _Backend.TREE_ATTN,
+                _Backend.XFORMERS,
+            }:
+                raise ValueError(
+                    f"Attention backend {selected_backend} incompatible with MLA. "
+                    "Please use one of the MLA backends: FLASHINFER_MLA, CUTLASS_MLA, "
+                    "FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set "
+                    "VLLM_MLA_DISABLE=1 to disable MLA for this model."
+                )
             if not use_v1:
                 raise RuntimeError(
                     "MLA attention backends require the V1 engine. "
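
For illustration only, here is a minimal standalone sketch of the fail-fast check this patch introduces. The Backend enum and validate_mla_backend helper below are hypothetical stand-ins that merely mirror the backend names in the diff; they are not vLLM APIs and are not part of the patch.

    from enum import Enum, auto


    class Backend(Enum):
        # Illustrative subset of the backend names referenced in the diff.
        FLASHINFER = auto()
        FLASH_ATTN = auto()
        TRITON_ATTN = auto()
        TREE_ATTN = auto()
        XFORMERS = auto()
        FLASHINFER_MLA = auto()
        CUTLASS_MLA = auto()
        FLASHMLA = auto()
        FLASH_ATTN_MLA = auto()
        TRITON_MLA = auto()


    # Non-MLA backends that must be rejected when MLA is enabled.
    _NON_MLA_BACKENDS = {
        Backend.FLASHINFER,
        Backend.FLASH_ATTN,
        Backend.TRITON_ATTN,
        Backend.TREE_ATTN,
        Backend.XFORMERS,
    }


    def validate_mla_backend(selected_backend: Backend, use_mla: bool) -> None:
        """Raise early instead of silently using an incompatible backend."""
        if use_mla and selected_backend in _NON_MLA_BACKENDS:
            raise ValueError(
                f"Attention backend {selected_backend.name} incompatible with "
                "MLA. Use an MLA backend (e.g. FLASHINFER_MLA) or set "
                "VLLM_MLA_DISABLE=1."
            )


    # Example: explicitly selecting FLASHINFER for an MLA model now fails fast
    # with a clear error instead of proceeding with a backend that cannot work.
    try:
        validate_mla_backend(Backend.FLASHINFER, use_mla=True)
    except ValueError as e:
        print(e)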