diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 026a0292cc339..00cf744b67d8c 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -162,7 +162,7 @@ class FlashAttentionImpl(AttentionImpl):
             value,
             key_cache,
             value_cache,
-            attn_metadata.slot_mapping,
+            attn_metadata.slot_mapping,  # [:num_actual_tokens],
             self.kv_cache_dtype,
             k_scale,
             v_scale,
@@ -174,9 +174,9 @@ class FlashAttentionImpl(AttentionImpl):
             k=key_cache,
             v=value_cache,
             out=output[:num_actual_tokens],
-            cu_seqlens_q=attn_metadata.query_start_loc,
+            cu_seqlens_q=attn_metadata.query_start_loc[:num_actual_tokens],
             max_seqlen_q=attn_metadata.max_query_len,
-            cu_seqlens_k=attn_metadata.seq_start_loc,
+            cu_seqlens_k=attn_metadata.seq_start_loc[:num_actual_tokens],
             max_seqlen_k=attn_metadata.max_seq_len,
             softmax_scale=self.scale,
             causal=True,