diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 63f6b373c322f..483d5e1531a92 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -316,7 +316,7 @@ class CudaPlatformBase(Platform):
         # FlashAttention is the default for SM 8.0+ GPUs
         if cls.has_device_capability(80):
-            if has_sink:
+            if has_sink and not cls.is_device_capability(90):
                 logger.info_once("Using Triton backend on V1 engine.")
                 return TRITON_ATTN_VLLM_V1
             if is_default_backend_supported := is_attn_backend_supported(