From 5f2cacdb1e62e468c707eab168c6d6961254bc93 Mon Sep 17 00:00:00 2001
From: Sage Moore
Date: Fri, 3 Oct 2025 04:28:22 -0700
Subject: [PATCH] Quick fix for IMA with the Prefix Prefill kernel during graph
 capture (#25983)

Signed-off-by: Sage Moore
---
 vllm/v1/attention/backends/rocm_attn.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 37fc85bf351a..1748a48168d4 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -83,6 +83,14 @@ class RocmAttentionMetadataBuilder(
         # max_model_len will cause graph capture to be extremely
         # slow, so here we set it to 1.
         attn_metadata.seq_lens.fill_(1)
+
+        if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
+            # Here we set the query start locs to 0. This is to
+            # work around an invalid memory access in the prefix_prefill kernel
+            # that we run into during graph capture (#25985)
+            common_attn_metadata.query_start_loc.zero_()
+            common_attn_metadata.query_start_loc_cpu.zero_()
+
         return attn_metadata

     def build(self,
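
For context, the zeroing works because `query_start_loc` holds cumulative per-request query offsets, and an all-zero tensor describes a batch in which every request covers zero query tokens, so the prefix-prefill kernel has nothing to read from the dummy capture buffers. Below is a minimal sketch of that effect; the tensor shape and values are illustrative placeholders, not taken from vLLM:

```python
import torch

# Illustrative only: cumulative query start offsets for a dummy batch of
# 4 requests (entries i and i+1 bound the query-token range of request i).
query_start_loc = torch.tensor([0, 2, 5, 9, 12], dtype=torch.int32)

# During graph capture the batch contents are placeholder data, so the fix
# zeroes the offsets: every request then spans [0, 0), i.e. no query tokens,
# and the kernel never indexes past the dummy buffers.
query_start_loc.zero_()
print(query_start_loc)  # tensor([0, 0, 0, 0, 0], dtype=torch.int32)
```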