Generate _ModelInfo properties file when loading to improve loading speed (#23558)

Signed-off-by: Manoel Marques <manoel.marques@ibm.com> Signed-off-by: Manoel Marques <manoelmrqs@gmail.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-05-27 23:27:20 +08:00 · 2025-09-20 07:51:13 -04:00 · 2025-09-20 07:51:13 -04:00 · af4dedf6d3
commit af4dedf6d3
parent dad5f4d16d
4 changed files with 167 additions and 3 deletions
--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
@ -2,7 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.logging_utils.formatter import NewLineFormatter
 from vllm.logging_utils.log_time import logtime
 __all__ = [
    "NewLineFormatter",
    "logtime",
 ]
--- a/vllm/logging_utils/log_time.py
+++ b/vllm/logging_utils/log_time.py
@ -0,0 +1,32 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Provides a timeslice logging decorator
 """
 import functools
 import time
 def logtime(logger, msg=None):
    """
    Build a decorator that reports how long each call to a function takes.

    After every call that returns normally, the elapsed time measured with
    ``time.perf_counter`` is emitted through ``logger.debug``. A call that
    raises is not reported.

    Args:
        logger: Object whose ``debug`` method receives the log record.
        msg: Optional label for the log line. When ``None`` the label is
            built from the wrapped function's module and qualified name.

    Always place it beneath other decorators.
    """
    def _decorate(func):
        @functools.wraps(func)
        def _timed(*call_args, **call_kwargs):
            began = time.perf_counter()
            outcome = func(*call_args, **call_kwargs)
            duration = time.perf_counter() - began
            # The label is resolved per call, after the timed section.
            if msg is None:
                label = f"Function '{func.__module__}.{func.__qualname__}'"
            else:
                label = msg
            logger.debug("%s: Elapsed time %.7f secs", label, duration)
            return outcome
        return _timed
    return _decorate
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@ -11,6 +11,7 @@ import tempfile
 import time
 from collections import defaultdict
 from collections.abc import Generator
 from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Callable, Optional, Union
@ -98,6 +99,49 @@ def get_lock(model_name_or_path: Union[str, Path],
    return lock
@contextmanager
def atomic_writer(filepath: Union[str, Path],
                  mode: str = 'w',
                  encoding: Optional[str] = None):
    """
    Context manager that provides an atomic file writing routine.

    The context manager writes to a temporary file and, if successful,
    atomically replaces the original file.

    Args:
        filepath (str or Path): The path to the file to write.
        mode (str): The file mode for the temporary file (e.g., 'w', 'wb').
        encoding (str): The encoding for text mode.

    Yields:
        file object: A handle to the temporary file.

    Raises:
        Exception: Any error raised while opening, writing or replacing is
            logged and re-raised; the target file is left untouched.
    """
    # Create a temporary file in the same directory as the target file
    # to ensure it's on the same filesystem for an atomic replace.
    temp_dir = os.path.dirname(filepath)
    temp_fd, temp_path = tempfile.mkstemp(dir=temp_dir)
    try:
        try:
            temp_file = os.fdopen(temp_fd, mode=mode, encoding=encoding)
        except BaseException:
            # os.fdopen can fail before it owns the descriptor (for
            # example on an invalid mode/encoding combination), which
            # would leak it. It may also have closed it already, so a
            # failing close is ignored.
            try:
                os.close(temp_fd)
            except OSError:
                pass
            raise
        with temp_file:
            yield temp_file
        # The temporary file is closed at this point; publish it.
        os.replace(temp_path, filepath)
    except Exception:
        logger.exception(
            "Error during atomic write. Original file '%s' not modified",
            filepath)
        raise
    finally:
        # Clean up the temporary file if it still exists. After a
        # successful replace it is already gone, hence the EAFP form.
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            pass
 def maybe_download_from_modelscope(
        model: str,
        revision: Optional[str] = None,
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@ -4,7 +4,9 @@
 Whenever you add an architecture to this page, please also update
 `tests/models/registry.py` with example HuggingFace models for it.
 """
 import hashlib
 import importlib
 import json
 import os
 import pickle
 import subprocess
@ -12,16 +14,19 @@ import sys
 import tempfile
 from abc import ABC, abstractmethod
 from collections.abc import Set
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from functools import lru_cache
 from pathlib import Path
 from typing import Callable, Optional, TypeVar, Union
 import torch.nn as nn
 import transformers
 from vllm import envs
 from vllm.config import (ModelConfig, iter_architecture_defaults,
                         try_match_architecture_defaults)
 from vllm.logger import init_logger
 from vllm.logging_utils import logtime
 from vllm.transformers_utils.dynamic_module import (
    try_get_class_from_dynamic_module)
@ -421,10 +426,91 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
    module_name: str
    class_name: str
-    # Performed in another process to avoid initializing CUDA
+    @staticmethod
    def _get_cache_dir() -> Path:
        """Return the ``modelinfos`` directory under ``envs.VLLM_CACHE_ROOT``.

        The path is only computed here; the directory is not created.
        """
        cache_root = Path(envs.VLLM_CACHE_ROOT)
        return cache_root / "modelinfos"
    def _get_cache_filename(self) -> str:
        """Return the JSON cache file name for this module/class pair.

        The module name and class name are joined with ``-`` and every
        ``.`` in the result is replaced by ``-`` before ``.json`` is added.
        """
        stem = f"{self.module_name}-{self.class_name}"
        return stem.replace(".", "-") + ".json"
    def _load_modelinfo_from_cache(self,
                                   module_hash: str) -> _ModelInfo | None:
        """Try to rebuild a ``_ModelInfo`` from its JSON cache file.

        Args:
            module_hash: Hash the cached entry must carry to be accepted.

        Returns:
            The cached ``_ModelInfo`` when the file exists and its stored
            ``"hash"`` equals ``module_hash``; ``None`` when the file is
            missing, the hash differs, or any other error occurs (that
            last case is logged with a traceback).
        """
        ident = (self.module_name, self.class_name)
        try:
            try:
                cache_file = self._get_cache_dir() / self._get_cache_filename()
                with open(cache_file, encoding="utf-8") as fh:
                    cached = json.load(fh)
            except FileNotFoundError:
                logger.debug(("Cached model info file "
                              "for class %s.%s not found"), *ident)
                return None

            if cached["hash"] == module_hash:
                # file not changed, use cached _ModelInfo properties
                return _ModelInfo(**cached["modelinfo"])

            logger.debug(("Cached model info file "
                          "for class %s.%s is stale"), *ident)
            return None
        except Exception:
            logger.exception(("Cached model info "
                              "for class %s.%s error. "), *ident)
            return None
    def _save_modelinfo_to_cache(self, mi: _ModelInfo,
                                 module_hash: str) -> None:
        """Write ``mi`` and ``module_hash`` to this class's JSON cache file.

        The payload has two keys, ``"hash"`` and ``"modelinfo"`` (the
        dataclass converted with ``asdict``). The cache directory is
        created when missing and the file is written through
        ``atomic_writer``. Failures are logged and never propagated.
        """
        from vllm.model_executor.model_loader.weight_utils import atomic_writer
        try:
            payload = {"hash": module_hash, "modelinfo": asdict(mi)}
            target_dir = self._get_cache_dir()
            target_dir.mkdir(parents=True, exist_ok=True)
            target = target_dir / self._get_cache_filename()
            with atomic_writer(target, encoding='utf-8') as out:
                json.dump(payload, out, indent=2)
        except Exception:
            logger.exception("Error saving model info cache.")
    @logtime(logger=logger, msg="Registry inspect model class")
    def inspect_model_cls(self) -> _ModelInfo:
        """Return the ``_ModelInfo`` for this lazily registered model.

        The model's source file is looked up next to this file, using the
        last component of ``module_name``. When it is found, its MD5 digest
        keys an on-disk cache: a cached entry with the same digest is
        returned directly. Otherwise the model class is inspected in a
        subprocess and the result is written to the cache.

        When the source file cannot be read (for example a model whose
        module does not live next to this file), the cache is skipped
        and the model is inspected in the subprocess without failing.

        Returns:
            The cached or freshly computed ``_ModelInfo``.
        """
        model_path = Path(
            __file__).parent / f"{self.module_name.split('.')[-1]}.py"

        module_hash: Optional[str] = None
        try:
            with open(model_path, "rb") as f:
                # The digest only detects source changes; it is not a
                # security hash, so allow it on FIPS-restricted builds.
                module_hash = hashlib.md5(
                    f.read(), usedforsecurity=False).hexdigest()
        except OSError:
            logger.debug(("Model source for class %s.%s not readable "
                          "at %s. Skipping model info cache."),
                         self.module_name, self.class_name, model_path)

        if module_hash is not None:
            mi = self._load_modelinfo_from_cache(module_hash)
            if mi is not None:
                logger.debug(("Loaded model info "
                              "for class %s.%s from cache"), self.module_name,
                             self.class_name)
                return mi
            logger.debug(("Cache model info "
                          "for class %s.%s miss. "
                          "Loading model instead."), self.module_name,
                         self.class_name)

        # Performed in another process to avoid initializing CUDA
        mi = _run_in_subprocess(
            lambda: _ModelInfo.from_model_cls(self.load_model_cls()))
        logger.debug("Loaded model info for class %s.%s", self.module_name,
                     self.class_name)

        # save cache file (only possible when the source could be hashed)
        if module_hash is not None:
            self._save_modelinfo_to_cache(mi, module_hash)

        return mi
    def load_model_cls(self) -> type[nn.Module]:
        mod = importlib.import_module(self.module_name)