Mirror of https://git.datalinker.icu/vllm-project/vllm.git, synced 2025-12-21 22:15:49 +08:00
[V0 deprecation] Remove _set_default_args_v0 function (#25409)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent a0b5617263
commit 1b30043f0d
@@ -1147,20 +1147,15 @@ class EngineArgs:
         else:
             envs.set_vllm_use_v1(use_v1)
 
-        # Set default arguments for V0 or V1 Engine.
-        if use_v1:
-            self._set_default_args_v1(usage_context, model_config)
-            # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
-            if current_platform.is_cpu(
-            ) and current_platform.get_cpu_architecture() in (
-                    CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
-                logger.info(
-                    "Chunked prefill is not supported for ARM and POWER "
-                    "and S390X CPUs; "
-                    "disabling it for V1 backend.")
-                self.enable_chunked_prefill = False
-        else:
-            self._set_default_args_v0(model_config)
+        # Set default arguments for V1 Engine.
+        self._set_default_args(usage_context, model_config)
+        # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
+        if current_platform.is_cpu() and current_platform.get_cpu_architecture(
+        ) in (CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
+            logger.info("Chunked prefill is not supported for ARM and POWER "
+                        "and S390X CPUs; "
+                        "disabling it for V1 backend.")
+            self.enable_chunked_prefill = False
         assert self.enable_chunked_prefill is not None
 
         sliding_window: Optional[int] = None
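The surviving guard above keys the chunked-prefill default off the host CPU architecture. A minimal standalone sketch of the same decision, using only the Python standard library: `platform.machine()` stands in for vLLM's `current_platform.get_cpu_architecture()`, the function name `chunked_prefill_supported_on_cpu` is hypothetical, and the machine-string prefixes are assumptions based on common `uname -m` values rather than vLLM's `CpuArchEnum` members.

import platform

# Architectures on which the V1 backend disables chunked prefill, per the
# diff above: POWER (ppc64le), s390x, and ARM. These prefix strings are
# illustrative stand-ins for vLLM's CpuArchEnum values.
_UNSUPPORTED_PREFIXES = ("ppc64", "s390x", "arm", "aarch64")


def chunked_prefill_supported_on_cpu() -> bool:
    """Approximates the guard in the new code path for a CPU-only host."""
    machine = platform.machine().lower()
    return not machine.startswith(_UNSUPPORTED_PREFIXES)


if __name__ == "__main__":
    state = "supported" if chunked_prefill_supported_on_cpu() else "disabled"
    print(f"{platform.machine()}: chunked prefill {state}")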
@@ -1528,63 +1523,7 @@ class EngineArgs:
 
         return True
 
-    def _set_default_args_v0(self, model_config: ModelConfig) -> None:
-        """Set Default Arguments for V0 Engine."""
-        max_model_len = model_config.max_model_len
-        use_long_context = max_model_len > 32768
-        if self.enable_chunked_prefill is None:
-            # Chunked prefill not supported for Multimodal or MLA in V0.
-            if model_config.is_multimodal_model or model_config.use_mla:
-                self.enable_chunked_prefill = False
-
-            # Enable chunked prefill by default for long context (> 32K)
-            # models to avoid OOM errors in initial memory profiling phase.
-            elif use_long_context:
-                is_gpu = current_platform.is_cuda()
-                use_sliding_window = (model_config.get_sliding_window()
-                                      is not None)
-                use_spec_decode = self.speculative_config is not None
-
-                if (is_gpu and not use_sliding_window and not use_spec_decode
-                        and not self.enable_lora):
-                    self.enable_chunked_prefill = True
-                    logger.warning(
-                        "Chunked prefill is enabled by default for models "
-                        "with max_model_len > 32K. Chunked prefill might "
-                        "not work with some features or models. If you "
-                        "encounter any issues, please disable by launching "
-                        "with --enable-chunked-prefill=False.")
-
-        if self.enable_chunked_prefill is None:
-            self.enable_chunked_prefill = False
-
-        if not self.enable_chunked_prefill and use_long_context:
-            logger.warning(
-                "The model has a long context length (%s). This may cause"
-                "OOM during the initial memory profiling phase, or result "
-                "in low performance due to small KV cache size. Consider "
-                "setting --max-model-len to a smaller value.", max_model_len)
-
-        # Disable prefix caching for multimodal models for VLLM_V0.
-        if self.enable_prefix_caching and model_config.is_multimodal_model:
-            logger.warning(
-                "--enable-prefix-caching is not supported for multimodal "
-                "models in V0 and has been disabled.")
-            self.enable_prefix_caching = False
-
-        if self.enable_prompt_embeds:
-            logger.warning(
-                "--enable-prompt-embeds and --enable-prefix-caching "
-                "are not supported together in V0. Prefix caching has "
-                "been disabled.")
-            self.enable_prefix_caching = False
-
-        # Set max_num_seqs to 256 for VLLM_V0.
-        if self.max_num_seqs is None:
-            self.max_num_seqs = 256
-
-    def _set_default_args_v1(self, usage_context: UsageContext,
-                             model_config: ModelConfig) -> None:
+    def _set_default_args(self, usage_context: UsageContext,
+                          model_config: ModelConfig) -> None:
         """Set Default Arguments for V1 Engine."""
 
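The deleted `_set_default_args_v0` encoded a small decision table for the V0 chunked-prefill default. A self-contained sketch of that heuristic, re-derived from the removed lines above: `ModelInfo` and `v0_default_chunked_prefill` are hypothetical stand-ins for the `ModelConfig` and `EngineArgs` fields the V0 logic read, not vLLM API.

from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelInfo:
    # Hypothetical stand-in for the ModelConfig fields the V0 logic read.
    max_model_len: int
    is_multimodal_model: bool = False
    use_mla: bool = False
    sliding_window: Optional[int] = None


def v0_default_chunked_prefill(model: ModelInfo,
                               is_cuda: bool,
                               has_spec_decode: bool = False,
                               has_lora: bool = False) -> bool:
    """Re-derivation of the deleted V0 default, per the removed code."""
    # Chunked prefill was never supported for multimodal or MLA models in V0.
    if model.is_multimodal_model or model.use_mla:
        return False
    # For long-context (> 32K) models on CUDA without sliding window,
    # speculative decoding, or LoRA, V0 enabled chunked prefill by default
    # to avoid OOM during the initial memory-profiling phase.
    if (model.max_model_len > 32768 and is_cuda
            and model.sliding_window is None and not has_spec_decode
            and not has_lora):
        return True
    # Everything else defaulted to disabled.
    return False


# Example: a 128K-context dense model on a CUDA GPU got chunked prefill.
assert v0_default_chunked_prefill(ModelInfo(max_model_len=131072), is_cuda=True)

After this commit none of that branching survives: only the V1 defaults remain, and the `use_v1` branch in the caller collapses to a single `_set_default_args` call.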