From 18093084be935fe8aad11a45366bea060b33d60f Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Thu, 5 Jun 2025 16:08:26 +0800
Subject: [PATCH] [Misc] Remove unnecessary fallback to prefill-decode
 attention (#19138)

Signed-off-by: vllmellm
---
 vllm/v1/attention/backends/triton_attn.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 968f13701118..5db592b15010 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -171,10 +171,7 @@ class TritonAttentionImpl(AttentionImpl):
         # Whenever making a change in this method, please benchmark the
         # performance to make sure it does not introduce any overhead.
 
-        num_queries_per_kv = query.shape[1] // key.shape[1]
-        num_q_is_pow2 = (num_queries_per_kv & (num_queries_per_kv - 1)) == 0
-        use_prefill_decode_attn = (self.force_prefill_decode_attn
-                                   or not num_q_is_pow2)
+        use_prefill_decode_attn = self.force_prefill_decode_attn
 
         num_actual_tokens = attn_metadata.num_actual_tokens
 
         if use_prefill_decode_attn:
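
Note (not part of the patch): below is a minimal, hypothetical Python sketch of
the dispatch logic this hunk simplifies. Only `force_prefill_decode_attn` and
`num_queries_per_kv` come from the diff itself; the helper and function names
are illustrative, not vLLM's actual API. The removed fallback used the classic
bit trick `n & (n - 1) == 0` to test whether the GQA group size (query heads
per KV head) was a power of two.

    # Sketch only: contrasts the pre- and post-patch backend selection.
    # Assumed/hypothetical names: is_power_of_two, choose_backend_before,
    # choose_backend_after.

    def is_power_of_two(n: int) -> bool:
        # A power of two has exactly one bit set, so n & (n - 1) == 0.
        return n > 0 and (n & (n - 1)) == 0

    def choose_backend_before(num_query_heads: int, num_kv_heads: int,
                              force_prefill_decode_attn: bool) -> str:
        # Old behavior: fall back whenever the group size is not a power of two.
        num_queries_per_kv = num_query_heads // num_kv_heads
        if force_prefill_decode_attn or not is_power_of_two(num_queries_per_kv):
            return "prefill-decode"
        return "unified-triton"

    def choose_backend_after(force_prefill_decode_attn: bool) -> str:
        # New behavior: only the explicit override selects the fallback.
        return "prefill-decode" if force_prefill_decode_attn else "unified-triton"

    if __name__ == "__main__":
        # e.g. 40 query heads / 8 KV heads -> group size 5 (not a power of two):
        # the old logic fell back, the new logic does not.
        print(choose_backend_before(40, 8, False))  # prefill-decode
        print(choose_backend_after(False))          # unified-triton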