From 770e5dcdb8c2e8bc31171e614efc471d4a0c2442 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Mon, 9 Jun 2025 06:32:56 -0700
Subject: [PATCH] [full_graph] Fix query_start_loc padding (#19321)

Signed-off-by: Yinghai Lu
---
 vllm/v1/worker/gpu_model_runner.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index e3535ef143ad..c39aea3d7ec7 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -655,7 +655,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 
         # Fill unused with -1. Needed for reshape_and_cache
         self.seq_lens[num_reqs:].fill_(0)
-        self.query_start_loc[num_reqs + 1:].fill_(-1)
+        # Note: pad query_start_loc to be non-decreasing, as kernels
+        # like FlashAttention requires that
+        self.query_start_loc[num_reqs + 1:].fill_(
+            self.query_start_loc_cpu[num_reqs].item())
 
         query_start_loc = self.query_start_loc[:num_reqs + 1]
         seq_lens = self.seq_lens[:num_reqs]
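
For context, a minimal standalone sketch of the behavior this patch changes (not vLLM code; the sizes and variable names here are assumptions chosen for illustration): query_start_loc is the cumulative-offset array (cu_seqlens_q) that varlen attention kernels walk, so its padded tail must repeat the last valid offset rather than hold -1, otherwise the padded slots would imply negative segment lengths.

```python
import torch

max_num_reqs = 8          # padded batch size (assumed, e.g. for CUDA-graph capture)
num_reqs = 3              # live requests in this step
query_lens = torch.tensor([4, 2, 5], dtype=torch.int32)

# Cumulative start offsets: entries i and i+1 bracket request i's tokens.
query_start_loc = torch.zeros(max_num_reqs + 1, dtype=torch.int32)
query_start_loc[1:num_reqs + 1] = torch.cumsum(query_lens, dim=0)

# Filling the unused tail with -1 would make the sequence decrease
# (..., 11, -1, -1); repeating the last valid offset keeps it
# non-decreasing, so every padded segment has length 0.
query_start_loc[num_reqs + 1:] = query_start_loc[num_reqs]

print(query_start_loc)
# tensor([ 0,  4,  6, 11, 11, 11, 11, 11, 11], dtype=torch.int32)
```

The patch applies the same idea on the GPU-side buffer by filling the tail with `self.query_start_loc_cpu[num_reqs].item()`, the last valid offset, instead of -1.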