mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-03 00:37:07 +08:00
flash attn changes
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
parent
0c7e6c1e36
commit
7eba374599
@@ -162,7 +162,7 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
value,
|
||||
key_cache,
|
||||
value_cache,
|
||||
attn_metadata.slot_mapping,
|
||||
attn_metadata.slot_mapping,#[:num_actual_tokens],
|
||||
self.kv_cache_dtype,
|
||||
k_scale,
|
||||
v_scale,
|
||||
@@ -174,9 +174,9 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
k=key_cache,
|
||||
v=value_cache,
|
||||
out=output[:num_actual_tokens],
|
||||
cu_seqlens_q=attn_metadata.query_start_loc,
|
||||
cu_seqlens_q=attn_metadata.query_start_loc[:num_actual_tokens],
|
||||
max_seqlen_q=attn_metadata.max_query_len,
|
||||
cu_seqlens_k=attn_metadata.seq_start_loc,
|
||||
cu_seqlens_k=attn_metadata.seq_start_loc[:num_actual_tokens],
|
||||
max_seqlen_k=attn_metadata.max_seq_len,
|
||||
softmax_scale=self.scale,
|
||||
causal=True,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user