mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-24 13:06:01 +08:00
[Bug]: Limit num_reqs in dummy_run when max_num_seqs is small (#26144)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com> Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
parent
a06bb9bf36
commit
56d0073f2a
@@ -3060,7 +3060,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
assert not uniform_decode
|
||||
# Create mixed batch:
|
||||
# first half decode tokens, second half one prefill
|
||||
- num_decode_tokens = num_tokens // 2
|
||||
+ num_decode_tokens = min(max_num_reqs - 1, num_tokens // 2)
|
||||
num_prefill_tokens = num_tokens - num_decode_tokens
|
||||
num_reqs = num_decode_tokens + 1
|
||||
|
||||
@@ -3072,7 +3072,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
max_query_len = num_prefill_tokens
|
||||
elif uniform_decode:
|
||||
assert not create_mixed_batch
|
||||
- num_reqs = cdiv(num_tokens, max_query_len)
|
||||
+ num_reqs = min(max_num_reqs, cdiv(num_tokens, max_query_len))
|
||||
num_scheduled_tokens_list = [max_query_len] * num_reqs
|
||||
if num_tokens % max_query_len != 0:
|
||||
num_scheduled_tokens_list[-1] = num_tokens % max_query_len
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user