[Frontend] speed up import time of vllm.config (#18036)

Signed-off-by: David Xia <david@davidxia.com>
Author: David Xia <david@davidxia.com> (committed by GitHub)
Date:   2025-06-25 00:41:11 -04:00
Parent: 3443aaf8dd
Commit: 7108934142
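This change defers several heavy imports (the model registry, the quantization backends, tracing) so that `import vllm.config` itself gets cheaper. A rough way to check the effect of a change like this, assuming vLLM is installed, is to time a cold import in a fresh interpreter; the snippet below is illustrative and not part of the commit:

# Illustrative only: time a cold import of vllm.config in a subprocess so
# that modules already cached in this interpreter don't skew the result.
import subprocess
import sys

code = ("import time; t0 = time.perf_counter(); "
        "import vllm.config; "
        "print(f'import took {time.perf_counter() - t0:.3f}s')")
subprocess.run([sys.executable, "-c", code], check=True)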

vllm/config.py

@@ -27,19 +27,13 @@ from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
 from pydantic.dataclasses import dataclass
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 from torch.distributed import ProcessGroup, ReduceOp
-from transformers import PretrainedConfig
 from typing_extensions import Self, deprecated, runtime_checkable
 
 import vllm.envs as envs
 from vllm import version
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
-                                                     QuantizationMethods,
-                                                     get_quantization_config)
-from vllm.model_executor.models import ModelRegistry
 from vllm.platforms import current_platform
-from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
@@ -48,32 +42,49 @@ from vllm.transformers_utils.config import (
     try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
                         MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                         POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
-                        LayerBlockType, common_broadcastable_dtype,
+                        LayerBlockType, LazyLoader, common_broadcastable_dtype,
                         cuda_device_count_stateless, get_cpu_memory,
                         get_open_port, is_torch_equal_or_newer, random_uuid,
                         resolve_obj_by_qualname)
 # yapf: enable
 
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
     from ray.util.placement_group import PlacementGroup
+    from transformers.configuration_utils import PretrainedConfig
 
+    import vllm.model_executor.layers.quantization as me_quant
+    import vllm.model_executor.models as me_models
     from vllm.executor.executor_base import ExecutorBase
+    from vllm.model_executor.layers.quantization import QuantizationMethods
     from vllm.model_executor.layers.quantization.base_config import (
         QuantizationConfig)
     from vllm.model_executor.model_loader import BaseModelLoader
     from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 
     ConfigType = type[DataclassInstance]
+    HfOverrides = Union[dict, Callable[[type], type]]
 else:
     PlacementGroup = Any
+    PretrainedConfig = Any
     ExecutorBase = Any
     QuantizationConfig = Any
+    QuantizationMethods = Any
     BaseModelLoader = Any
     TensorizerConfig = Any
     ConfigType = type
+
+    HfOverrides = Union[dict[str, Any], Callable[[type], type]]
+
+    me_quant = LazyLoader("model_executor", globals(),
+                          "vllm.model_executor.layers.quantization")
+    me_models = LazyLoader("model_executor", globals(),
+                           "vllm.model_executor.models")
 
 logger = init_logger(__name__)
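`LazyLoader` (newly imported from `vllm.utils` above) stands in for `vllm.model_executor.layers.quantization` and `vllm.model_executor.models` until something actually touches them. The following is a minimal, self-contained sketch of that deferred-import idea — an illustration of the pattern, not vLLM's exact implementation:

import importlib
import types


class LazyModule(types.ModuleType):
    """Sketch of a lazy loader: defers importing `name` until an
    attribute is first accessed, then swaps the real module in."""

    def __init__(self, local_name: str, parent_globals: dict, name: str):
        super().__init__(name)
        self._local_name = local_name
        self._parent_globals = parent_globals

    def _load(self) -> types.ModuleType:
        module = importlib.import_module(self.__name__)
        # Replace this proxy in the parent namespace so later lookups
        # hit the real module directly.
        self._parent_globals[self._local_name] = module
        self.__dict__.update(module.__dict__)
        return module

    def __getattr__(self, item):
        return getattr(self._load(), item)


# Nothing is imported yet; the real import happens on first attribute access.
json_mod = LazyModule("json_mod", globals(), "json")
print(json_mod.dumps({"lazy": True}))  # triggers the actual `import json`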
@@ -100,9 +111,6 @@ _TASK_RUNNER: dict[_ResolvedTask, RunnerType] = {
     for task in tasks
 }
 
-HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
-                                             PretrainedConfig]]
-
 
 @runtime_checkable
 class SupportsHash(Protocol):
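`HfOverrides` moves out of module scope (removed here) into the `if TYPE_CHECKING:` / `else:` split shown earlier: type checkers see the precise alias, while the runtime branch uses cheap stand-ins so `transformers` is not imported just for an annotation. A minimal, self-contained sketch of the pattern:

from typing import TYPE_CHECKING, Any, Callable, Union

if TYPE_CHECKING:
    # Only type checkers evaluate this branch, so the heavy import is free.
    from transformers.configuration_utils import PretrainedConfig

    HfOverrides = Union[dict[str, Any],
                        Callable[[PretrainedConfig], PretrainedConfig]]
else:
    # At runtime the annotation degrades gracefully to loose types.
    PretrainedConfig = Any
    HfOverrides = Union[dict[str, Any], Callable[[type], type]]


def apply_overrides(overrides: HfOverrides) -> None:
    """Hypothetical consumer: accepts a dict of overrides or a transform."""
    print(type(overrides).__name__)


apply_overrides({"num_hidden_layers": 2})  # prints "dict"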
@@ -648,7 +656,7 @@ class ModelConfig:
 
     @property
     def registry(self):
-        return ModelRegistry
+        return me_models.ModelRegistry
 
     @property
     def architectures(self) -> list[str]:
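Because `registry` is a property, its body runs on each access, so the attribute lookup on the `me_models` proxy is what finally imports the registry; callers see no API change and simply pay the import cost on first use rather than at `import vllm.config`. An illustrative way to observe this, assuming vLLM is installed (other vLLM imports may still pull the module in indirectly):

# Illustrative check: after this commit, the registry module should not be
# loaded by the config import itself.
import sys

import vllm.config  # noqa: F401

print("vllm.model_executor.models" in sys.modules)

# Accessing the property triggers the lazy import on demand, e.g.:
#     some_model_config.registry  ->  loads vllm.model_executor.models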
@@ -859,14 +867,15 @@ class ModelConfig:
         return quant_cfg
 
     def _verify_quantization(self) -> None:
-        supported_quantization = QUANTIZATION_METHODS
+        supported_quantization = me_quant.QUANTIZATION_METHODS
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
             "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
             "quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
-            self.quantization = cast(QuantizationMethods, self.quantization)
+            self.quantization = cast(me_quant.QuantizationMethods,
+                                     self.quantization)
 
         # Parse quantization method from the HF model config, if available.
         quant_cfg = self._parse_quant_hf_config()
@@ -900,14 +909,14 @@ class ModelConfig:
 
             # Detect which checkpoint is it
             for name in quantization_methods:
-                method = get_quantization_config(name)
+                method = me_quant.get_quantization_config(name)
                 quantization_override = method.override_quantization_method(
                     quant_cfg, self.quantization)
                 if quantization_override is not None:
                     # Raise error if the override is not custom (custom would
                     # be in QUANTIZATION_METHODS but not QuantizationMethods)
                     # and hasn't been added to the overrides list.
-                    if (name in get_args(QuantizationMethods)
+                    if (name in get_args(me_quant.QuantizationMethods)
                             and name not in overrides):
                         raise ValueError(
                             f"Quantization method {name} is an override but "
@@ -1417,7 +1426,7 @@ class ModelConfig:
     @property
     def is_v1_compatible(self) -> bool:
         architectures = getattr(self.hf_config, "architectures", [])
-        return ModelRegistry.is_v1_compatible(architectures)
+        return me_models.ModelRegistry.is_v1_compatible(architectures)
 
     @property
     def is_matryoshka(self) -> bool:
@@ -2376,7 +2385,7 @@ class SpeculativeConfig:
     according to the log probability settings in SamplingParams."""
 
     # Draft model configuration
-    quantization: Optional[QuantizationMethods] = None
+    quantization: Optional[me_quant.QuantizationMethods] = None
     """Quantization method that was used to quantize the draft model weights.
     If `None`, we assume the model weights are not quantized. Note that it only
     takes effect when using the draft model-based speculative method."""
@@ -3624,6 +3633,7 @@ class ObservabilityConfig:
                 and "," in self.collect_detailed_traces[0]):
             self._parse_collect_detailed_traces()
 
+        from vllm.tracing import is_otel_available, otel_import_error_traceback
         if not is_otel_available() and self.otlp_traces_endpoint is not None:
             raise ValueError(
                 "OpenTelemetry is not available. Unable to configure "