mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 09:35:53 +08:00
[V1] Support sliding window attention (#9679)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
a6f3721861
commit
9645b9f646
@ -82,8 +82,10 @@ class FlashAttentionImpl(AttentionImpl):
|
|||||||
if alibi_slopes is not None:
|
if alibi_slopes is not None:
|
||||||
alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
|
alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
|
||||||
self.alibi_slopes = alibi_slopes
|
self.alibi_slopes = alibi_slopes
|
||||||
self.sliding_window = ((sliding_window, sliding_window)
|
if sliding_window is None:
|
||||||
if sliding_window is not None else (-1, -1))
|
self.sliding_window = (-1, -1)
|
||||||
|
else:
|
||||||
|
self.sliding_window = (sliding_window - 1, 0)
|
||||||
self.kv_cache_dtype = kv_cache_dtype
|
self.kv_cache_dtype = kv_cache_dtype
|
||||||
if logits_soft_cap is None:
|
if logits_soft_cap is None:
|
||||||
# In flash-attn, setting logits_soft_cap as 0 means no soft cap.
|
# In flash-attn, setting logits_soft_cap as 0 means no soft cap.
|
||||||
@ -93,12 +95,6 @@ class FlashAttentionImpl(AttentionImpl):
|
|||||||
assert self.num_heads % self.num_kv_heads == 0
|
assert self.num_heads % self.num_kv_heads == 0
|
||||||
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
||||||
|
|
||||||
if sliding_window is not None:
|
|
||||||
# NOTE(woosuk): flash-attn's sliding window does not work with
|
|
||||||
# paged KV cache.
|
|
||||||
raise ValueError(
|
|
||||||
"Sliding window is not supported in FlashAttention.")
|
|
||||||
|
|
||||||
support_head_sizes = FlashAttentionBackend.get_supported_head_sizes()
|
support_head_sizes = FlashAttentionBackend.get_supported_head_sizes()
|
||||||
if head_size not in support_head_sizes:
|
if head_size not in support_head_sizes:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user