From 18093084be935fe8aad11a45366bea060b33d60f Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Thu, 5 Jun 2025 16:08:26 +0800
Subject: [PATCH] [Misc] Remove unnecessary fallback to prefill-decode
 attention (#19138)

Signed-off-by: vllmellm
---
 vllm/v1/attention/backends/triton_attn.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 968f13701118..5db592b15010 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -171,10 +171,7 @@ class TritonAttentionImpl(AttentionImpl):
         # Whenever making a change in this method, please benchmark the
         # performance to make sure it does not introduce any overhead.
 
-        num_queries_per_kv = query.shape[1] // key.shape[1]
-        num_q_is_pow2 = (num_queries_per_kv & (num_queries_per_kv - 1)) == 0
-        use_prefill_decode_attn = (self.force_prefill_decode_attn
-                                   or not num_q_is_pow2)
+        use_prefill_decode_attn = self.force_prefill_decode_attn
 
         num_actual_tokens = attn_metadata.num_actual_tokens
 
         if use_prefill_decode_attn:
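
Note (not part of the patch): below is a minimal, hypothetical Python sketch of
the dispatch logic this hunk simplifies. Only `force_prefill_decode_attn` and
`num_queries_per_kv` come from the diff itself; the helper and function names
are illustrative, not vLLM's actual API. The removed fallback used the classic
bit trick `n & (n - 1) == 0` to test whether the GQA group size (query heads
per KV head) was a power of two.

    # Sketch only: contrasts the pre- and post-patch backend selection.
    # Assumed/hypothetical names: is_power_of_two, choose_backend_before,
    # choose_backend_after.

    def is_power_of_two(n: int) -> bool:
        # A power of two has exactly one bit set, so n & (n - 1) == 0.
        return n > 0 and (n & (n - 1)) == 0

    def choose_backend_before(num_query_heads: int, num_kv_heads: int,
                              force_prefill_decode_attn: bool) -> str:
        # Old behavior: fall back whenever the group size is not a power of two.
        num_queries_per_kv = num_query_heads // num_kv_heads
        if force_prefill_decode_attn or not is_power_of_two(num_queries_per_kv):
            return "prefill-decode"
        return "unified-triton"

    def choose_backend_after(force_prefill_decode_attn: bool) -> str:
        # New behavior: only the explicit override selects the fallback.
        return "prefill-decode" if force_prefill_decode_attn else "unified-triton"

    if __name__ == "__main__":
        # e.g. 40 query heads / 8 KV heads -> group size 5 (not a power of two):
        # the old logic fell back, the new logic does not.
        print(choose_backend_before(40, 8, False))  # prefill-decode
        print(choose_backend_after(False))          # unified-triton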