Mirror of https://git.datalinker.icu/vllm-project/vllm.git, synced 2025-12-20 00:55:47 +08:00
[V1] Increase default batch size for H100/H200 (#12369)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent 55ef66edf4
commit 0e74d797ce
@@ -1279,11 +1279,22 @@ class EngineArgs:
         self.enable_chunked_prefill = True
         # When no user override, set the default values based on the usage
         # context.
-        # TODO(woosuk): Tune the default values for different hardware.
-        default_max_num_batched_tokens = {
-            UsageContext.LLM_CLASS: 8192,
-            UsageContext.OPENAI_API_SERVER: 2048,
-        }
+        # Use different default values for different hardware.
+        from vllm.platforms import current_platform
+        device_name = current_platform.get_device_name().lower()
+        if "h100" in device_name or "h200" in device_name:
+            # For H100 and H200, we use larger default values.
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 16384,
+                UsageContext.OPENAI_API_SERVER: 8192,
+            }
+        else:
+            # TODO(woosuk): Tune the default values for other hardware.
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 8192,
+                UsageContext.OPENAI_API_SERVER: 2048,
+            }
+
         if (self.max_num_batched_tokens is None
                 and usage_context in default_max_num_batched_tokens):
             self.max_num_batched_tokens = default_max_num_batched_tokens[
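
For readers skimming the hunk above, here is a minimal, self-contained sketch of the selection logic this commit introduces. The UsageContext enum is stubbed locally and pick_default_max_num_batched_tokens is a hypothetical helper name, not vLLM API; in the real code the device string comes from current_platform.get_device_name(), and these defaults only apply when the user has not set max_num_batched_tokens themselves.

# A sketch, not vLLM code: UsageContext is stubbed and the helper name is
# hypothetical. The substring check on the lowercased device name mirrors
# the diff above.
from enum import Enum

class UsageContext(Enum):
    LLM_CLASS = "LLM_CLASS"
    OPENAI_API_SERVER = "OPENAI_API_SERVER"

def pick_default_max_num_batched_tokens(device_name: str,
                                        usage_context: UsageContext) -> int:
    device_name = device_name.lower()
    if "h100" in device_name or "h200" in device_name:
        # Larger defaults for H100/H200, per this commit.
        defaults = {
            UsageContext.LLM_CLASS: 16384,
            UsageContext.OPENAI_API_SERVER: 8192,
        }
    else:
        # The pre-existing defaults are kept for all other hardware.
        defaults = {
            UsageContext.LLM_CLASS: 8192,
            UsageContext.OPENAI_API_SERVER: 2048,
        }
    return defaults[usage_context]

# On CUDA, the device name is a string like "NVIDIA H100 80GB HBM3".
assert pick_default_max_num_batched_tokens(
    "NVIDIA H100 80GB HBM3", UsageContext.LLM_CLASS) == 16384
assert pick_default_max_num_batched_tokens(
    "NVIDIA A100-SXM4-80GB", UsageContext.OPENAI_API_SERVER) == 2048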