From 56d0073f2a80876c9af68ee366f1d90f7cab8051 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Fri, 3 Oct 2025 00:00:20 -0400 Subject: [PATCH] [Bug]: Limit num_reqs in dummy_run when max_num_seqs is small (#26144) Signed-off-by: Benjamin Chislett Signed-off-by: yewentao256 --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index efb4a8c0054f7..8b92cb052efd6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3060,7 +3060,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert not uniform_decode # Create mixed batch: # first half decode tokens, second half one prefill - num_decode_tokens = num_tokens // 2 + num_decode_tokens = min(max_num_reqs - 1, num_tokens // 2) num_prefill_tokens = num_tokens - num_decode_tokens num_reqs = num_decode_tokens + 1 @@ -3072,7 +3072,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): max_query_len = num_prefill_tokens elif uniform_decode: assert not create_mixed_batch - num_reqs = cdiv(num_tokens, max_query_len) + num_reqs = min(max_num_reqs, cdiv(num_tokens, max_query_len)) num_scheduled_tokens_list = [max_query_len] * num_reqs if num_tokens % max_query_len != 0: num_scheduled_tokens_list[-1] = num_tokens % max_query_len