Mirror of https://git.datalinker.icu/vllm-project/vllm.git
Update default max_num_batched_tokens for chunked prefill to 2048 (#10544)
parent cfea9c04ef
commit 02a43f82a9
@@ -1133,9 +1133,9 @@ class SchedulerConfig:
                     # max_num_batched_tokens.
                     self.max_num_batched_tokens = max(self.max_model_len, 2048)
                 else:
-                    # It is the values that have the best balance between ITL
-                    # and TTFT on A100. Note it is not optimized for throughput.
-                    self.max_num_batched_tokens = 512
+                    # This value is chosen to have a balance between ITL
+                    # and TTFT. Note it is not optimized for throughput.
+                    self.max_num_batched_tokens = 2048
                 else:
                     # If max_model_len is too short, use 2048 as the default value
                     # for higher throughput.
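For context, a minimal sketch of the default-selection logic this hunk touches, factored into a standalone function. The helper name default_max_num_batched_tokens is hypothetical, and the assignment in the final branch falls outside the hunk, so it is assumed here to mirror the multi-step case based on the surrounding comments:

# Hypothetical sketch of the post-#10544 default logic; not the actual
# vLLM API surface. Branch structure follows the diff above.
def default_max_num_batched_tokens(enable_chunked_prefill: bool,
                                   num_scheduler_steps: int,
                                   max_model_len: int) -> int:
    if enable_chunked_prefill:
        if num_scheduler_steps > 1:
            # Multi-step scheduling: keep the budget at least max_model_len
            # so short max_num_batched_tokens values don't reject prompts.
            return max(max_model_len, 2048)
        # Single-step chunked prefill: 2048 (previously 512) balances ITL
        # and TTFT; it is not optimized for throughput.
        return 2048
    # Chunked prefill disabled (assignment not shown in the hunk): assumed
    # to use 2048 as a floor for higher throughput when max_model_len is
    # too short.
    return max(max_model_len, 2048)

With chunked prefill enabled and a single scheduler step, default_max_num_batched_tokens(True, 1, 4096) now returns 2048 rather than 512, quadrupling the per-step token budget this commit changes.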