[Bugfix] use flash attn on sm90 (#22933)

Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
commit 39cd09dc86 (parent 919234fe17)
Author: Yongye Zhu
Date:   2025-08-14 19:37:22 -04:00

@@ -316,7 +316,7 @@ class CudaPlatformBase(Platform):
         # FlashAttention is the default for SM 8.0+ GPUs
         if cls.has_device_capability(80):
-            if has_sink:
+            if has_sink and not cls.is_device_capability(90):
                 logger.info_once("Using Triton backend on V1 engine.")
                 return TRITON_ATTN_VLLM_V1
             if is_default_backend_supported := is_attn_backend_supported(
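
The fix is a one-line gate change: previously, every SM 8.0+ GPU was routed to the Triton backend whenever attention sinks were in use; per the commit title, SM 9.0 (Hopper) can serve sinks with FlashAttention, so it is now exempted from that fallback. Below is a minimal, self-contained sketch of the resulting dispatch under stated assumptions: the capability arithmetic and helper semantics mirror vLLM's Platform interface (`has_device_capability` is a ">= capability" check, `is_device_capability` an exact match), but the backend constant names are abbreviated and the pre-SM80 "XFORMERS" fallback is a hypothetical stand-in for the rest of the selection chain, which this hunk does not show.

def choose_backend(major: int, minor: int, has_sink: bool) -> str:
    """Sketch of the V1 attention-backend choice after this commit."""
    capability = major * 10 + minor        # (9, 0) -> 90, as in vLLM
    if capability >= 80:                   # cls.has_device_capability(80)
        # Old: `if has_sink:` sent every sink model to Triton.
        # New: SM 9.0 is excluded, so Hopper keeps FlashAttention.
        if has_sink and capability != 90:  # not cls.is_device_capability(90)
            return "TRITON_ATTN_VLLM_V1"
        return "FLASH_ATTN_VLLM_V1"        # abbreviated backend name
    return "XFORMERS"  # hypothetical stand-in for the pre-SM80 fallback chain

# With sinks: H100 (SM 9.0) now gets FlashAttention; A100 (SM 8.0) still Triton.
assert choose_backend(9, 0, has_sink=True) == "FLASH_ATTN_VLLM_V1"
assert choose_backend(8, 0, has_sink=True) == "TRITON_ATTN_VLLM_V1"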