From af7fc84fd2e9b0fb70bc6349730b3ee73f0e1f8d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Wed, 4 Jun 2025 22:41:25 -0700
Subject: [PATCH] [BugFix][Minor] Fix full cuda graph bug when max_num_seqs < 512 (#19171)

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4a67e37781bf..f6ccf0fa1d36 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1737,7 +1737,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # has num_tokens in total.
         assert num_tokens <= self.scheduler_config.max_num_batched_tokens
         max_num_reqs = self.scheduler_config.max_num_seqs
-        num_reqs = max_num_reqs if num_tokens >= max_num_reqs else num_tokens
+        num_reqs = min(num_tokens, max_num_reqs)
         min_tokens_per_req = num_tokens // num_reqs
         num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
         num_scheduled_tokens_list[-1] += num_tokens % num_reqs
@@ -1765,7 +1765,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 self.kv_cache_config.kv_cache_groups):
             attn_metadata_i = (
                 self.attn_metadata_builders[kv_cache_group_id].build(
-                    num_reqs=num_tokens,
+                    num_reqs=num_reqs,
                     num_actual_tokens=num_tokens,
                     max_query_len=num_tokens,
                     common_prefix_len=0,
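
Note (not part of the patch): below is a minimal, self-contained sketch of the
dummy-batch sizing logic the patch touches, using standalone names
(split_dummy_batch, num_tokens, max_num_reqs) that stand in for the runner's
locals and are assumptions for illustration. It shows why num_reqs, not
num_tokens, belongs in the attention metadata builder call: during full CUDA
graph capture, num_tokens can exceed max_num_seqs (the "< 512" in the subject
presumably refers to the largest default capture size), while the dummy batch
can hold at most max_num_reqs requests.

    def split_dummy_batch(num_tokens: int, max_num_reqs: int) -> list[int]:
        """Distribute num_tokens across at most max_num_reqs dummy requests."""
        # Cap the request count at the scheduler limit; this is the fixed
        # min() expression from the first hunk.
        num_reqs = min(num_tokens, max_num_reqs)
        # Spread tokens evenly; the last request absorbs the remainder.
        min_tokens_per_req = num_tokens // num_reqs
        num_scheduled_tokens = [min_tokens_per_req] * num_reqs
        num_scheduled_tokens[-1] += num_tokens % num_reqs
        assert sum(num_scheduled_tokens) == num_tokens
        return num_scheduled_tokens

    # With max_num_seqs=64 and a 512-token capture batch, the old code passed
    # num_reqs=512 to the metadata builder even though only 64 dummy requests
    # exist; the second hunk corrects this to num_reqs=64.
    print(len(split_dummy_batch(512, 64)))  # 64 requests, 8 tokens each
    print(len(split_dummy_batch(10, 64)))   # 10 requests, 1 token each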