Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-09 17:45:19 +08:00)
commit bda41c70dd (parent 453bafb96f)

hotfix attn alibi wo head mapping (#496)

Co-authored-by: oliveryuan <oliveryuan@basemind.com>
@@ -199,6 +199,7 @@ def run_single_query_cached_kv_attention(
         ]
         block_tables.append(block_table)
     block_tables = torch.tensor(block_tables, dtype=torch.int, device='cuda')
+    head_mapping = torch.arange(num_heads, dtype=torch.int32, device="cuda")

     scale = float(1.0 / (head_size**0.5))
     output = torch.empty(num_tokens,
@@ -211,6 +212,7 @@ def run_single_query_cached_kv_attention(
         query,
         key_cache,
         value_cache,
+        head_mapping,
         scale,
         block_tables,
         context_lens,

@@ -408,6 +408,7 @@ class PagedAttentionWithALiBi(PagedAttention):
             query,
             key_cache,
             value_cache,
+            self.head_mapping,
             self.scale,
             input_metadata.block_tables,
             input_metadata.context_lens,
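For context, a minimal sketch of what the head_mapping tensor added by this commit carries, assuming the paged attention kernel reads head_mapping[i] as the index of the key/value head that query head i attends with: the identity mapping for standard multi-head attention (the only case the diff above constructs), and repeated indices when key/value heads are shared across query heads, as in multi-query or grouped-query attention. The num_heads and num_kv_heads values below are illustrative, not taken from the diff.

    import torch

    # Identity mapping, matching the line added in the test above:
    # query head i uses key/value head i.
    num_heads = 12
    head_mapping = torch.arange(num_heads, dtype=torch.int32, device="cuda")

    # If key/value heads were shared (multi-query / grouped-query attention),
    # each KV head index would repeat across its group of query heads.
    # num_kv_heads is hypothetical; the diff only exercises the identity case.
    num_kv_heads = 4
    head_mapping_gqa = torch.repeat_interleave(
        torch.arange(num_kv_heads, dtype=torch.int32, device="cuda"),
        num_heads // num_kv_heads,
    )
    # head_mapping_gqa -> tensor([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

Passing the mapping explicitly would let a single kernel serve both layouts rather than assuming one query head per key/value head.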