mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-28 22:43:38 +08:00
[Bugfix][Hardware][Gaudi] Bump vllm_hpu_extension version (#11028)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
This commit is contained in:
parent
a811dd6608
commit
cbcbdb1ceb
@@ -8,4 +8,4 @@ pandas
|
||||
tabulate
|
||||
setuptools>=61
|
||||
setuptools-scm>=8
|
||||
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6
|
||||
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@e096d6f
|
||||
|
||||
@@ -111,8 +111,16 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
|
||||
self.matmul_qk = Matmul()
|
||||
self.softmax = Softmax()
|
||||
self.matmul_av = Matmul()
|
||||
self.batch2block_matmul = Matmul()
|
||||
self.block2batch_matmul = Matmul()
|
||||
# NOTE(kzawora): Contiguous PA is off until model runner supports it
|
||||
self.k_cache = VLLMKVCache()
|
||||
self.k_cache.use_contiguous_pa = False
|
||||
self.v_cache = VLLMKVCache()
|
||||
self.v_cache.use_contiguous_pa = False
|
||||
# NOTE(kzawora): Pipelined PA is off until model runner supports it
|
||||
ops.pa_impl = ops.pa
|
||||
|
||||
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
|
||||
self.sliding_window = sliding_window
|
||||
self.alibi_slopes = alibi_slopes
|
||||
@@ -228,9 +236,12 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
|
||||
block_mapping=attn_metadata.block_mapping,
|
||||
block_bias=attn_metadata.attn_bias,
|
||||
block_scales=attn_metadata.block_scales,
|
||||
block_groups=None,
|
||||
scale=self.scale,
|
||||
matmul_qk_op=self.matmul_qk,
|
||||
matmul_av_op=self.matmul_av,
|
||||
batch2block_matmul_op=self.batch2block_matmul,
|
||||
block2batch_matmul_op=self.block2batch_matmul,
|
||||
keys_fetch_func=self.k_cache.fetch_from_cache,
|
||||
values_fetch_func=self.v_cache.fetch_from_cache)
|
||||
# Reshape the output tensor.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user