From 39cd09dc86cadd80f3588e8d67ee9288012bcb9e Mon Sep 17 00:00:00 2001
From: Yongye Zhu
Date: Thu, 14 Aug 2025 19:37:22 -0400
Subject: [PATCH] [Bugfix] use flash attn on sm90 (#22933)

Signed-off-by: Yongye Zhu
Co-authored-by: Michael Goin
---
 vllm/platforms/cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 63f6b373c322f..483d5e1531a92 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -316,7 +316,7 @@ class CudaPlatformBase(Platform):
 
         # FlashAttention is the default for SM 8.0+ GPUs
         if cls.has_device_capability(80):
-            if has_sink:
+            if has_sink and not cls.is_device_capability(90):
                 logger.info_once("Using Triton backend on V1 engine.")
                 return TRITON_ATTN_VLLM_V1
             if is_default_backend_supported := is_attn_backend_supported(
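
For context, here is a minimal, self-contained sketch of the selection logic this one-line change affects. It is not vLLM's real Platform API: select_attn_backend, the integer capability argument, and the string backend names are illustrative stand-ins, and the real code goes on to probe whether the default backend is supported rather than returning it directly.

    # Sketch only: illustrates the patched condition, assuming simplified
    # stand-ins for vLLM's Platform class and backend identifiers.
    def select_attn_backend(device_capability: int, has_sink: bool) -> str:
        """Pick an attention backend for an SM 8.0+ GPU.

        Before the patch, any model with attention sinks fell back to the
        Triton backend. After the patch, SM 9.0 is excluded from that
        fallback, so it keeps the FlashAttention default even with sinks.
        """
        if device_capability >= 80:
            # Patched condition: only non-SM90 GPUs with sinks use Triton.
            if has_sink and device_capability != 90:
                return "TRITON_ATTN_VLLM_V1"
            return "FLASH_ATTN_VLLM_V1"
        return "OTHER_BACKEND"

    # SM 9.0 (Hopper) with sinks now stays on FlashAttention.
    assert select_attn_backend(90, has_sink=True) == "FLASH_ATTN_VLLM_V1"
    # SM 8.0 (Ampere) with sinks still routes to Triton.
    assert select_attn_backend(80, has_sink=True) == "TRITON_ATTN_VLLM_V1"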