[Misc] Clean Up EngineArgs.create_engine_config (#13734)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2026-06-01 15:27:12 +08:00 · 2025-02-24 13:52:21 -05:00 · 2025-02-24 13:52:21 -05:00 · 1f0ae3ed0a
commit 1f0ae3ed0a
parent db986c19ea
2 changed files with 30 additions and 41 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@ -1124,6 +1124,10 @@ class CacheConfig:
        return {key: str(value) for key, value in self.__dict__.items()}
    def _verify_args(self) -> None:
        if self.cpu_offload_gb < 0:
            raise ValueError("CPU offload space must be non-negative"
                             f", but got {self.cpu_offload_gb}")
        if self.gpu_memory_utilization > 1.0:
            raise ValueError(
                "GPU memory utilization must be less than 1.0. Got "
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -1062,6 +1062,17 @@ class EngineArgs:
        return engine_args
    def create_model_config(self) -> ModelConfig:
        # gguf file needs a specific model loader and doesn't use hf_repo
        if check_gguf_file(self.model):
            self.quantization = self.load_format = "gguf"
        # NOTE: This is to allow model loading from S3 in CI
        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
                and self.model in MODELS_ON_S3
                and self.load_format == LoadFormat.AUTO):  # noqa: E501
            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
            self.load_format = LoadFormat.RUNAI_STREAMER
        return ModelConfig(
            model=self.model,
            task=self.task,
@ -1101,26 +1112,6 @@ class EngineArgs:
        )
    def create_load_config(self) -> LoadConfig:
        """Build a ``LoadConfig`` from this instance's loader-related fields.

        Returns:
            A ``LoadConfig`` constructed from ``load_format``,
            ``download_dir``, ``model_loader_extra_config`` and
            ``ignore_patterns``, each passed through unchanged.
        """
        # Gather the keyword arguments first so the constructor call
        # itself stays a single short line.
        loader_kwargs = {
            "load_format": self.load_format,
            "download_dir": self.download_dir,
            "model_loader_extra_config": self.model_loader_extra_config,
            "ignore_patterns": self.ignore_patterns,
        }
        return LoadConfig(**loader_kwargs)
    def create_engine_config(self,
                             usage_context: Optional[UsageContext] = None
                             ) -> VllmConfig:
        from vllm.platforms import current_platform
        current_platform.pre_register_and_update()
        if envs.VLLM_USE_V1:
            self._override_v1_engine_args(usage_context)
        # gguf file needs a specific model loader and doesn't use hf_repo
        if check_gguf_file(self.model):
            self.quantization = self.load_format = "gguf"
        # bitsandbytes quantization needs a specific model loader
        # so we make sure the quant method and the load format are consistent
        if (self.quantization == "bitsandbytes" or
@ -1137,19 +1128,23 @@ class EngineArgs:
                "BitsAndBytes load format and QLoRA adapter only support "
                f"'bitsandbytes' quantization, but got {self.quantization}")
-        assert self.cpu_offload_gb >= 0, (
+        return LoadConfig(
-            "CPU offload space must be non-negative"
+            load_format=self.load_format,
-            f", but got {self.cpu_offload_gb}")
+            download_dir=self.download_dir,
            model_loader_extra_config=self.model_loader_extra_config,
            ignore_patterns=self.ignore_patterns,
        )
    def create_engine_config(self,
                             usage_context: Optional[UsageContext] = None
                             ) -> VllmConfig:
        from vllm.platforms import current_platform
        current_platform.pre_register_and_update()
        if envs.VLLM_USE_V1:
            self._override_v1_engine_args(usage_context)
        device_config = DeviceConfig(device=self.device)
        # NOTE: This is to allow model loading from S3 in CI
        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
                and self.model in MODELS_ON_S3
                and self.load_format == LoadFormat.AUTO):  # noqa: E501
            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
            self.load_format = LoadFormat.RUNAI_STREAMER
        model_config = self.create_model_config()
        if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
@ -1281,16 +1276,6 @@ class EngineArgs:
            if speculative_config is None \
            else speculative_config.num_lookahead_slots
        if not self.use_v2_block_manager:
            logger.warning(
                "[DEPRECATED] Block manager v1 has been removed, "
                "and setting --use-v2-block-manager to True or False has "
                "no effect on vLLM behavior. Please remove "
                "--use-v2-block-manager in your engine argument. "
                "If your use case is not supported by "
                "SelfAttnBlockSpaceManager (i.e. block manager v2),"
                " please file an issue with detailed information.")
        scheduler_config = SchedulerConfig(
            runner_type=model_config.runner_type,
            max_num_batched_tokens=self.max_num_batched_tokens,