Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-12 19:56:21 +08:00)
[Misc] Remove unnecessary fallback to prefill-decode attention (#19138)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Parent: da40380214
Commit: 18093084be
@@ -171,10 +171,7 @@ class TritonAttentionImpl(AttentionImpl):
         # Whenever making a change in this method, please benchmark the
         # performance to make sure it does not introduce any overhead.
 
-        num_queries_per_kv = query.shape[1] // key.shape[1]
-        num_q_is_pow2 = (num_queries_per_kv & (num_queries_per_kv - 1)) == 0
-        use_prefill_decode_attn = (self.force_prefill_decode_attn
-                                   or not num_q_is_pow2)
+        use_prefill_decode_attn = self.force_prefill_decode_attn
         num_actual_tokens = attn_metadata.num_actual_tokens
 
         if use_prefill_decode_attn:
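
The removed lines implemented an automatic fallback: when the grouped-query-attention group size (query heads per KV head, query.shape[1] // key.shape[1]) was not a power of two, the backend switched to the split prefill-decode attention path. After this change, that path is taken only when force_prefill_decode_attn is set explicitly. The sketch below is my own illustration, not part of the commit; it shows the power-of-two test the old fallback relied on:

    def is_pow2(n: int) -> bool:
        # A power of two has exactly one set bit, so n & (n - 1) clears it to 0.
        return n > 0 and (n & (n - 1)) == 0

    # Example: 32 query heads sharing 8 KV heads gives a group size of 4,
    # a power of two, so even the old code kept the unified Triton kernel;
    # a group size of 6 would have triggered the prefill-decode fallback.
    assert is_pow2(32 // 8)
    assert not is_pow2(6)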