From 9ad0a4588ba4e9c979cda0d178dec4fcdb89fd0c Mon Sep 17 00:00:00 2001
From: Pavani Majety
Date: Mon, 14 Jul 2025 20:27:50 -0700
Subject: [PATCH] [Bugfix] Switch bailout logic for kv-cache-dtype with SM100
 Flashinfer (#20934)

Signed-off-by: Pavani Majety
---
 vllm/engine/arg_utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index f47499309d8f6..e2c861587583c 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1418,14 +1418,15 @@ class EngineArgs:
                 and not envs.is_set("VLLM_ATTENTION_BACKEND")
             ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
             supported = False
-            if current_platform.is_rocm():
+            if current_platform.is_rocm() or (
+                    current_platform.is_cuda()
+                    and current_platform.is_device_capability(100)):
                 supported = True
             elif fp8_attention and will_use_fa:
                 from vllm.attention.utils.fa_utils import (
                     flash_attn_supports_fp8)
                 supported = flash_attn_supports_fp8()
-            elif envs.VLLM_USE_TRTLLM_DECODE_ATTENTION:
-                supported = True
+
             if not supported:
                 _raise_or_fallback(feature_name="--kv-cache-dtype",
                                    recommend_to_remove=False)
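
For readers skimming the diff, below is a minimal, standalone sketch of the bailout check as it reads after this patch. It mirrors the if/elif chain in the hunk but replaces the real calls (current_platform.is_rocm(), current_platform.is_device_capability(100), envs, flash_attn_supports_fp8()) with stand-in boolean parameters; the function and parameter names are illustrative only, not vLLM's actual API.

# Minimal sketch of the post-patch check (illustrative only; the real logic
# lives in EngineArgs and queries current_platform, envs, and
# flash_attn_supports_fp8()).
def kv_cache_dtype_supported(is_rocm: bool,
                             is_cuda: bool,
                             is_sm100: bool,
                             fp8_attention: bool,
                             will_use_fa: bool,
                             fa_supports_fp8: bool) -> bool:
    # ROCm, and CUDA devices with compute capability 10.0 (SM100), are
    # accepted unconditionally; the SM100 branch replaces the old
    # VLLM_USE_TRTLLM_DECODE_ATTENTION escape hatch removed by this patch.
    if is_rocm or (is_cuda and is_sm100):
        return True
    # Otherwise an fp8 KV cache is only allowed when FlashAttention will be
    # used and that FlashAttention build supports fp8.
    if fp8_attention and will_use_fa:
        return fa_supports_fp8
    return False


if __name__ == "__main__":
    # SM100 CUDA device: supported without any env flag.
    assert kv_cache_dtype_supported(False, True, True, True, False, False)
    # Pre-SM100 CUDA with fp8-capable FlashAttention: supported.
    assert kv_cache_dtype_supported(False, True, False, True, True, True)
    # Pre-SM100 CUDA without fp8-capable FlashAttention: bails out upstream.
    assert not kv_cache_dtype_supported(False, True, False, True, True, False)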