mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-23 05:57:03 +08:00
[V1][Bugfix] Always set enable_chunked_prefill = True for V1 (#11061)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
75f89dc44c
commit
134810b3d9
@ -122,7 +122,7 @@ class EngineArgs:
|
||||
cpu_offload_gb: float = 0 # GiB
|
||||
gpu_memory_utilization: float = 0.90
|
||||
max_num_batched_tokens: Optional[int] = None
|
||||
max_num_seqs: int = 256
|
||||
max_num_seqs: Optional[int] = None
|
||||
max_logprobs: int = 20 # Default value for OpenAI Chat Completions API
|
||||
disable_log_stats: bool = False
|
||||
revision: Optional[str] = None
|
||||
@ -205,6 +205,9 @@ class EngineArgs:
|
||||
# by user.
|
||||
if self.enable_prefix_caching is None:
|
||||
self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
|
||||
# Override max_num_seqs if it's not set by user.
|
||||
if self.max_num_seqs is None:
|
||||
self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024
|
||||
|
||||
# support `EngineArgs(compilation_config={...})`
|
||||
# without having to manually construct a
|
||||
@ -1225,19 +1228,19 @@ class EngineArgs:
|
||||
"""
|
||||
assert envs.VLLM_USE_V1, "V1 is not enabled"
|
||||
|
||||
# V1 always uses chunked prefills.
|
||||
self.enable_chunked_prefill = True
|
||||
# When no user override, set the default values based on the usage
|
||||
# context.
|
||||
# TODO(woosuk): Tune the default values for different hardware.
|
||||
if self.max_num_batched_tokens is None:
|
||||
# When no user override, set the default values based on the
|
||||
# usage context.
|
||||
if usage_context == UsageContext.LLM_CLASS:
|
||||
logger.warning("Setting max_num_batched_tokens to 8192 "
|
||||
"for LLM_CLASS usage context.")
|
||||
self.max_num_seqs = 1024
|
||||
self.max_num_batched_tokens = 8192
|
||||
elif usage_context == UsageContext.OPENAI_API_SERVER:
|
||||
logger.warning("Setting max_num_batched_tokens to 2048 "
|
||||
"for OPENAI_API_SERVER usage context.")
|
||||
self.max_num_seqs = 1024
|
||||
self.max_num_batched_tokens = 2048
|
||||
logger.warning(
|
||||
"Setting max_num_batched_tokens to %d for %s usage context.",
|
||||
self.max_num_batched_tokens, usage_context.value)
|
||||
|
||||
def _override_v1_engine_config(self, engine_config: VllmConfig) -> None:
|
||||
"""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user