[V1][Bugfix] Always set enable_chunked_prefill = True for V1 (#11061)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Woosuk Kwon 2024-12-10 14:41:23 -08:00 committed by GitHub
parent 75f89dc44c
commit 134810b3d9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -122,7 +122,7 @@ class EngineArgs:
cpu_offload_gb: float = 0 # GiB
gpu_memory_utilization: float = 0.90
max_num_batched_tokens: Optional[int] = None
max_num_seqs: int = 256
max_num_seqs: Optional[int] = None
max_logprobs: int = 20 # Default value for OpenAI Chat Completions API
disable_log_stats: bool = False
revision: Optional[str] = None
@ -205,6 +205,9 @@ class EngineArgs:
# by user.
if self.enable_prefix_caching is None:
self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
# Override max_num_seqs if it's not set by user.
if self.max_num_seqs is None:
self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024
# support `EngineArgs(compilation_config={...})`
# without having to manually construct a
@ -1225,19 +1228,19 @@ class EngineArgs:
"""
assert envs.VLLM_USE_V1, "V1 is not enabled"
# V1 always uses chunked prefills.
self.enable_chunked_prefill = True
# When no user override, set the default values based on the usage
# context.
# TODO(woosuk): Tune the default values for different hardware.
if self.max_num_batched_tokens is None:
# When no user override, set the default values based on the
# usage context.
if usage_context == UsageContext.LLM_CLASS:
logger.warning("Setting max_num_batched_tokens to 8192 "
"for LLM_CLASS usage context.")
self.max_num_seqs = 1024
self.max_num_batched_tokens = 8192
elif usage_context == UsageContext.OPENAI_API_SERVER:
logger.warning("Setting max_num_batched_tokens to 2048 "
"for OPENAI_API_SERVER usage context.")
self.max_num_seqs = 1024
self.max_num_batched_tokens = 2048
logger.warning(
"Setting max_num_batched_tokens to %d for %s usage context.",
self.max_num_batched_tokens, usage_context.value)
def _override_v1_engine_config(self, engine_config: VllmConfig) -> None:
"""