From 38658ec6f3b3a09a6cd205bab23a550b3d3f8c0e Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Fri, 28 Nov 2025 03:17:37 +0800
Subject: [PATCH] [Bugfix][MM encoder] Fix ViT attention backend resolving for
 Turing GPU (#29614)

Signed-off-by: Isotr0py
---
 vllm/platforms/cuda.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index d5c3a177d9c2b..4bf9401b6b051 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -264,14 +264,15 @@ class CudaPlatformBase(Platform):
         cls, head_size: int, dtype: torch.dtype
     ) -> "AttentionBackendEnum":
         # Try FlashAttention first
-        try:
-            backend_class = AttentionBackendEnum.FLASH_ATTN.get_class()
-            if backend_class.supports_head_size(
-                head_size
-            ) and backend_class.supports_dtype(dtype):
-                return AttentionBackendEnum.FLASH_ATTN
-        except ImportError:
-            pass
+        if (cc := cls.get_device_capability()) and cc.major >= 8:
+            try:
+                backend_class = AttentionBackendEnum.FLASH_ATTN.get_class()
+                if backend_class.supports_head_size(
+                    head_size
+                ) and backend_class.supports_dtype(dtype):
+                    return AttentionBackendEnum.FLASH_ATTN
+            except ImportError:
+                pass
         return AttentionBackendEnum.TORCH_SDPA
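
For context, below is a minimal standalone sketch of the resolution rule this patch introduces: FlashAttention is only considered when the reported compute capability has major version >= 8, so Turing (SM 7.5) and devices whose capability cannot be determined fall back to torch SDPA. The names in the sketch (AttentionBackend, DeviceCapability, resolve_vit_backend) are hypothetical stand-ins, not the actual vLLM API.

# Illustrative sketch only; not the vLLM implementation.
from dataclasses import dataclass
from enum import Enum, auto


class AttentionBackend(Enum):
    FLASH_ATTN = auto()
    TORCH_SDPA = auto()


@dataclass
class DeviceCapability:
    major: int
    minor: int


def resolve_vit_backend(
    cc: DeviceCapability | None,
    flash_attn_importable: bool,
    supports_head_size: bool,
    supports_dtype: bool,
) -> AttentionBackend:
    # FlashAttention is only attempted on compute capability 8.x (Ampere) or
    # newer; Turing (7.5) and unknown capabilities fall through to torch SDPA.
    if cc is not None and cc.major >= 8:
        if flash_attn_importable and supports_head_size and supports_dtype:
            return AttentionBackend.FLASH_ATTN
    return AttentionBackend.TORCH_SDPA


# Turing (SM 7.5) resolves to TORCH_SDPA even when flash-attn is installed.
assert resolve_vit_backend(DeviceCapability(7, 5), True, True, True) is AttentionBackend.TORCH_SDPA
# Ampere (SM 8.0) still prefers FLASH_ATTN when head size and dtype are supported.
assert resolve_vit_backend(DeviceCapability(8, 0), True, True, True) is AttentionBackend.FLASH_ATTN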