mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-14 19:05:35 +08:00
[TPU] [V1] fix cases when max_num_reqs is set smaller than MIN_NUM_SEQS (#15583)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
This commit is contained in:
parent
ecff8309a3
commit
619d3de8bd
@ -14,10 +14,7 @@ answers = [
|
|||||||
]
|
]
|
||||||
N = 1
|
N = 1
|
||||||
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
|
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
|
||||||
sampling_params = SamplingParams(temperature=0.7,
|
sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
|
||||||
top_p=1.0,
|
|
||||||
n=N,
|
|
||||||
max_tokens=16)
|
|
||||||
|
|
||||||
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
|
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
|
||||||
# In real workloads, `enforce_eager` should be `False`.
|
# In real workloads, `enforce_eager` should be `False`.
|
||||||
|
|||||||
@ -88,7 +88,7 @@ class TPUModelRunner:
|
|||||||
self.max_model_len = model_config.max_model_len
|
self.max_model_len = model_config.max_model_len
|
||||||
self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size)
|
self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size)
|
||||||
self.max_num_tokens = scheduler_config.max_num_batched_tokens
|
self.max_num_tokens = scheduler_config.max_num_batched_tokens
|
||||||
self.max_num_reqs = scheduler_config.max_num_seqs
|
self.max_num_reqs = max(scheduler_config.max_num_seqs, MIN_NUM_SEQS)
|
||||||
|
|
||||||
# Model-related.
|
# Model-related.
|
||||||
self.num_attn_layers = model_config.get_num_layers_by_block_type(
|
self.num_attn_layers = model_config.get_num_layers_by_block_type(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user