Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-09 07:14:58 +08:00)
Quick fix for IMA with the Prefix Prefill kernel during graph capture (#25983)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in: parent aa5053e3fe, commit 5f2cacdb1e
@@ -83,6 +83,14 @@ class RocmAttentionMetadataBuilder(
         # max_model_len will cause graph capture to be extremely
         # slow, so here we set it to 1.
         attn_metadata.seq_lens.fill_(1)
+
+        if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
+            # Here we set the query start locs to 0. This is to
+            # cover up an invalid memory access in the prefix_prefill kernel
+            # that we run into during graph capture (#25985)
+            common_attn_metadata.query_start_loc.zero_()
+            common_attn_metadata.query_start_loc_cpu.zero_()
+
         return attn_metadata
 
     def build(self,
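For illustration, here is a minimal standalone sketch of the workaround this diff applies, not the vLLM implementation itself: the tensor names, shapes, and the use_prefill_decode_attention flag below are assumptions standing in for the metadata objects and the VLLM_V1_USE_PREFILL_DECODE_ATTENTION env switch. It shows the two capture-time mutations: seq_lens is filled with 1 so graph capture stays fast, and query_start_loc is zeroed so the prefix_prefill kernel sees an empty query range instead of offsets that can index past the allocated buffers.

import torch

# Hypothetical stand-ins for the attention metadata tensors touched by
# this commit; in vLLM these live on the attention metadata objects.
num_seqs = 4
seq_lens = torch.full((num_seqs,), 8192, dtype=torch.int32)      # e.g. max_model_len
query_start_loc = torch.arange(num_seqs + 1, dtype=torch.int32)  # cumulative query offsets
query_start_loc_cpu = query_start_loc.clone()

# Assumed stand-in for envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION.
use_prefill_decode_attention = True

# Capture-time workaround: tiny sequence lengths keep capture fast, and
# zeroed start locations give the prefix_prefill kernel an empty range,
# avoiding the invalid memory access hit during graph capture.
seq_lens.fill_(1)
if use_prefill_decode_attention:
    query_start_loc.zero_()
    query_start_loc_cpu.zero_()

print(seq_lens)         # all ones
print(query_start_loc)  # all zeros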