mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-01 15:27:12 +08:00
[Misc] Clean Up EngineArgs.create_engine_config (#13734)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
This commit is contained in:
parent
db986c19ea
commit
1f0ae3ed0a
@ -1124,6 +1124,10 @@ class CacheConfig:
|
|||||||
return {key: str(value) for key, value in self.__dict__.items()}
|
return {key: str(value) for key, value in self.__dict__.items()}
|
||||||
|
|
||||||
def _verify_args(self) -> None:
|
def _verify_args(self) -> None:
|
||||||
|
if self.cpu_offload_gb < 0:
|
||||||
|
raise ValueError("CPU offload space must be non-negative"
|
||||||
|
f", but got {self.cpu_offload_gb}")
|
||||||
|
|
||||||
if self.gpu_memory_utilization > 1.0:
|
if self.gpu_memory_utilization > 1.0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"GPU memory utilization must be less than 1.0. Got "
|
"GPU memory utilization must be less than 1.0. Got "
|
||||||
|
|||||||
@ -1062,6 +1062,17 @@ class EngineArgs:
|
|||||||
return engine_args
|
return engine_args
|
||||||
|
|
||||||
def create_model_config(self) -> ModelConfig:
|
def create_model_config(self) -> ModelConfig:
|
||||||
|
# gguf file needs a specific model loader and doesn't use hf_repo
|
||||||
|
if check_gguf_file(self.model):
|
||||||
|
self.quantization = self.load_format = "gguf"
|
||||||
|
|
||||||
|
# NOTE: This is to allow model loading from S3 in CI
|
||||||
|
if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
|
||||||
|
and self.model in MODELS_ON_S3
|
||||||
|
and self.load_format == LoadFormat.AUTO): # noqa: E501
|
||||||
|
self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
|
||||||
|
self.load_format = LoadFormat.RUNAI_STREAMER
|
||||||
|
|
||||||
return ModelConfig(
|
return ModelConfig(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
task=self.task,
|
task=self.task,
|
||||||
@ -1101,26 +1112,6 @@ class EngineArgs:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def create_load_config(self) -> LoadConfig:
|
def create_load_config(self) -> LoadConfig:
|
||||||
return LoadConfig(
|
|
||||||
load_format=self.load_format,
|
|
||||||
download_dir=self.download_dir,
|
|
||||||
model_loader_extra_config=self.model_loader_extra_config,
|
|
||||||
ignore_patterns=self.ignore_patterns,
|
|
||||||
)
|
|
||||||
|
|
||||||
def create_engine_config(self,
|
|
||||||
usage_context: Optional[UsageContext] = None
|
|
||||||
) -> VllmConfig:
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
current_platform.pre_register_and_update()
|
|
||||||
|
|
||||||
if envs.VLLM_USE_V1:
|
|
||||||
self._override_v1_engine_args(usage_context)
|
|
||||||
|
|
||||||
# gguf file needs a specific model loader and doesn't use hf_repo
|
|
||||||
if check_gguf_file(self.model):
|
|
||||||
self.quantization = self.load_format = "gguf"
|
|
||||||
|
|
||||||
# bitsandbytes quantization needs a specific model loader
|
# bitsandbytes quantization needs a specific model loader
|
||||||
# so we make sure the quant method and the load format are consistent
|
# so we make sure the quant method and the load format are consistent
|
||||||
if (self.quantization == "bitsandbytes" or
|
if (self.quantization == "bitsandbytes" or
|
||||||
@ -1137,19 +1128,23 @@ class EngineArgs:
|
|||||||
"BitsAndBytes load format and QLoRA adapter only support "
|
"BitsAndBytes load format and QLoRA adapter only support "
|
||||||
f"'bitsandbytes' quantization, but got {self.quantization}")
|
f"'bitsandbytes' quantization, but got {self.quantization}")
|
||||||
|
|
||||||
assert self.cpu_offload_gb >= 0, (
|
return LoadConfig(
|
||||||
"CPU offload space must be non-negative"
|
load_format=self.load_format,
|
||||||
f", but got {self.cpu_offload_gb}")
|
download_dir=self.download_dir,
|
||||||
|
model_loader_extra_config=self.model_loader_extra_config,
|
||||||
|
ignore_patterns=self.ignore_patterns,
|
||||||
|
)
|
||||||
|
|
||||||
|
def create_engine_config(self,
|
||||||
|
usage_context: Optional[UsageContext] = None
|
||||||
|
) -> VllmConfig:
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
current_platform.pre_register_and_update()
|
||||||
|
|
||||||
|
if envs.VLLM_USE_V1:
|
||||||
|
self._override_v1_engine_args(usage_context)
|
||||||
|
|
||||||
device_config = DeviceConfig(device=self.device)
|
device_config = DeviceConfig(device=self.device)
|
||||||
|
|
||||||
# NOTE: This is to allow model loading from S3 in CI
|
|
||||||
if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
|
|
||||||
and self.model in MODELS_ON_S3
|
|
||||||
and self.load_format == LoadFormat.AUTO): # noqa: E501
|
|
||||||
self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
|
|
||||||
self.load_format = LoadFormat.RUNAI_STREAMER
|
|
||||||
|
|
||||||
model_config = self.create_model_config()
|
model_config = self.create_model_config()
|
||||||
|
|
||||||
if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
|
if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
|
||||||
@ -1281,16 +1276,6 @@ class EngineArgs:
|
|||||||
if speculative_config is None \
|
if speculative_config is None \
|
||||||
else speculative_config.num_lookahead_slots
|
else speculative_config.num_lookahead_slots
|
||||||
|
|
||||||
if not self.use_v2_block_manager:
|
|
||||||
logger.warning(
|
|
||||||
"[DEPRECATED] Block manager v1 has been removed, "
|
|
||||||
"and setting --use-v2-block-manager to True or False has "
|
|
||||||
"no effect on vLLM behavior. Please remove "
|
|
||||||
"--use-v2-block-manager in your engine argument. "
|
|
||||||
"If your use case is not supported by "
|
|
||||||
"SelfAttnBlockSpaceManager (i.e. block manager v2),"
|
|
||||||
" please file an issue with detailed information.")
|
|
||||||
|
|
||||||
scheduler_config = SchedulerConfig(
|
scheduler_config = SchedulerConfig(
|
||||||
runner_type=model_config.runner_type,
|
runner_type=model_config.runner_type,
|
||||||
max_num_batched_tokens=self.max_num_batched_tokens,
|
max_num_batched_tokens=self.max_num_batched_tokens,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user