From 7eba37459911dba1d17c37d88f0ddda40e47a3c2 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Thu, 2 Jan 2025 13:26:21 -0500
Subject: [PATCH] flash attn changes

Signed-off-by: Tyler Michael Smith
---
 vllm/v1/attention/backends/flash_attn.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 026a0292cc339..00cf744b67d8c 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -162,7 +162,7 @@ class FlashAttentionImpl(AttentionImpl):
             value,
             key_cache,
             value_cache,
-            attn_metadata.slot_mapping,
+            attn_metadata.slot_mapping,#[:num_actual_tokens],
             self.kv_cache_dtype,
             k_scale,
             v_scale,
@@ -174,9 +174,9 @@ class FlashAttentionImpl(AttentionImpl):
             k=key_cache,
             v=value_cache,
             out=output[:num_actual_tokens],
-            cu_seqlens_q=attn_metadata.query_start_loc,
+            cu_seqlens_q=attn_metadata.query_start_loc[:num_actual_tokens],
             max_seqlen_q=attn_metadata.max_query_len,
-            cu_seqlens_k=attn_metadata.seq_start_loc,
+            cu_seqlens_k=attn_metadata.seq_start_loc[:num_actual_tokens],
             max_seqlen_k=attn_metadata.max_seq_len,
             softmax_scale=self.scale,
             causal=True,