mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 13:55:01 +08:00
hotfix attn alibi wo head mapping (#496)
Co-authored-by: oliveryuan <oliveryuan@basemind.com>
This commit is contained in:
parent
453bafb96f
commit
bda41c70dd
@ -199,6 +199,7 @@ def run_single_query_cached_kv_attention(
|
|||||||
]
|
]
|
||||||
block_tables.append(block_table)
|
block_tables.append(block_table)
|
||||||
block_tables = torch.tensor(block_tables, dtype=torch.int, device='cuda')
|
block_tables = torch.tensor(block_tables, dtype=torch.int, device='cuda')
|
||||||
|
head_mapping = torch.arange(num_heads, dtype=torch.int32, device="cuda")
|
||||||
|
|
||||||
scale = float(1.0 / (head_size**0.5))
|
scale = float(1.0 / (head_size**0.5))
|
||||||
output = torch.empty(num_tokens,
|
output = torch.empty(num_tokens,
|
||||||
@ -211,6 +212,7 @@ def run_single_query_cached_kv_attention(
|
|||||||
query,
|
query,
|
||||||
key_cache,
|
key_cache,
|
||||||
value_cache,
|
value_cache,
|
||||||
|
head_mapping,
|
||||||
scale,
|
scale,
|
||||||
block_tables,
|
block_tables,
|
||||||
context_lens,
|
context_lens,
|
||||||
|
|||||||
@ -408,6 +408,7 @@ class PagedAttentionWithALiBi(PagedAttention):
|
|||||||
query,
|
query,
|
||||||
key_cache,
|
key_cache,
|
||||||
value_cache,
|
value_cache,
|
||||||
|
self.head_mapping,
|
||||||
self.scale,
|
self.scale,
|
||||||
input_metadata.block_tables,
|
input_metadata.block_tables,
|
||||||
input_metadata.context_lens,
|
input_metadata.context_lens,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user