[Bug] Raise error explicitly if using incompatible backend (#27424)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
commit 5b0448104f
parent f7a6682872
@@ -261,6 +261,21 @@ class CudaPlatformBase(Platform):
         from vllm.attention.backends.registry import _Backend
 
         if use_mla:
+            # explicitly reject non-MLA backends when MLA is enabled to avoid
+            # silently selecting an incompatible backend (e.g., FLASHINFER).
+            if selected_backend in {
+                _Backend.FLASHINFER,
+                _Backend.FLASH_ATTN,
+                _Backend.TRITON_ATTN,
+                _Backend.TREE_ATTN,
+                _Backend.XFORMERS,
+            }:
+                raise ValueError(
+                    f"Attention backend {selected_backend} incompatible with MLA. "
+                    "Please use one of the MLA backends: FLASHINFER_MLA, CUTLASS_MLA, "
+                    "FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set "
+                    "VLLM_MLA_DISABLE=1 to disable MLA for this model."
+                )
             if not use_v1:
                 raise RuntimeError(
                     "MLA attention backends require the V1 engine. "
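For context, the guard added here is simple enough to exercise on its own. Below is a minimal standalone sketch of the same pattern, not vLLM's actual code: the Backend enum and validate_backend_for_mla helper are illustrative stand-ins (the real check lives in CudaPlatformBase and uses vllm.attention.backends.registry._Backend). It shows how the check turns a silent mis-selection into an immediate ValueError.

    from enum import Enum, auto


    class Backend(Enum):
        # illustrative stand-in for vllm.attention.backends.registry._Backend
        FLASHINFER = auto()
        FLASH_ATTN = auto()
        TRITON_ATTN = auto()
        TREE_ATTN = auto()
        XFORMERS = auto()
        FLASHINFER_MLA = auto()
        CUTLASS_MLA = auto()
        FLASHMLA = auto()
        FLASH_ATTN_MLA = auto()
        TRITON_MLA = auto()


    # non-MLA backends rejected when MLA is enabled, mirroring the set in the diff
    _NON_MLA_BACKENDS = {
        Backend.FLASHINFER,
        Backend.FLASH_ATTN,
        Backend.TRITON_ATTN,
        Backend.TREE_ATTN,
        Backend.XFORMERS,
    }


    def validate_backend_for_mla(selected_backend: Backend, use_mla: bool) -> None:
        """Hypothetical helper: fail fast instead of silently pairing MLA with a non-MLA backend."""
        if use_mla and selected_backend in _NON_MLA_BACKENDS:
            raise ValueError(
                f"Attention backend {selected_backend} incompatible with MLA."
            )


    validate_backend_for_mla(Backend.FLASHINFER_MLA, use_mla=True)   # ok: MLA backend
    validate_backend_for_mla(Backend.FLASHINFER, use_mla=False)      # ok: MLA disabled
    validate_backend_for_mla(Backend.FLASHINFER, use_mla=True)       # raises ValueError

As the comment in the diff notes, the intent is to reject the incompatible selection up front rather than silently proceeding with it.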