From af4dedf6d3cf8fa518826aa7d430395fc2414fa3 Mon Sep 17 00:00:00 2001
From: Manoel Marques
Date: Sat, 20 Sep 2025 07:51:13 -0400
Subject: [PATCH] Generate _ModelInfo properties file when loading to improve
 loading speed (#23558)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Manoel Marques
Signed-off-by: Manoel Marques
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič
Signed-off-by: yewentao256
---
 vllm/logging_utils/__init__.py         |  2 +
 vllm/logging_utils/log_time.py         | 32 +++++++
 .../model_loader/weight_utils.py       | 44 +++++++++
 vllm/model_executor/models/registry.py | 92 ++++++++++++++++++-
 4 files changed, 167 insertions(+), 3 deletions(-)
 create mode 100644 vllm/logging_utils/log_time.py

diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py
index cf690a89ae9bc..7202259ca21aa 100644
--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
@@ -2,7 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.logging_utils.formatter import NewLineFormatter
+from vllm.logging_utils.log_time import logtime
 
 __all__ = [
     "NewLineFormatter",
+    "logtime",
 ]
diff --git a/vllm/logging_utils/log_time.py b/vllm/logging_utils/log_time.py
new file mode 100644
index 0000000000000..013dd144beaf8
--- /dev/null
+++ b/vllm/logging_utils/log_time.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Provides a decorator that logs function execution time.
+"""
+
+import functools
+import time
+
+
+def logtime(logger, msg=None):
+    """
+    Logs the execution time of the decorated function.
+    Always place it beneath other decorators.
+    """
+
+    def _inner(func):
+
+        @functools.wraps(func)
+        def _wrapper(*args, **kwargs):
+            start = time.perf_counter()
+            result = func(*args, **kwargs)
+            elapsed = time.perf_counter() - start
+
+            prefix = f"Function '{func.__module__}.{func.__qualname__}'" \
+                if msg is None else msg
+            logger.debug("%s: Elapsed time %.7f secs", prefix, elapsed)
+            return result
+
+        return _wrapper
+
+    return _inner
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index f2c66763d0816..a72086da18c4d 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -11,6 +11,7 @@ import tempfile
 import time
 from collections import defaultdict
 from collections.abc import Generator
+from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Callable, Optional, Union
 
@@ -98,6 +99,49 @@ def get_lock(model_name_or_path: Union[str, Path],
     return lock
 
 
+@contextmanager
+def atomic_writer(filepath: Union[str, Path],
+                  mode: str = 'w',
+                  encoding: Optional[str] = None):
+    """
+    Context manager that provides an atomic file writing routine.
+
+    The context manager writes to a temporary file and, if successful,
+    atomically replaces the original file.
+
+    Args:
+        filepath (str or Path): The path to the file to write.
+        mode (str): The file mode for the temporary file (e.g., 'w', 'wb').
+        encoding (str): The encoding for text mode.
+
+    Yields:
+        file object: A handle to the temporary file.
+    """
+    # Create a temporary file in the same directory as the target file
+    # to ensure it's on the same filesystem for an atomic replace.
+    temp_dir = os.path.dirname(filepath)
+    temp_fd, temp_path = tempfile.mkstemp(dir=temp_dir)
+
+    try:
+        # Open the temporary file for writing
+        with os.fdopen(temp_fd, mode=mode, encoding=encoding) as temp_file:
+            yield temp_file
+
+        # If the 'with' block completes successfully,
+        # perform the atomic replace.
+        os.replace(temp_path, filepath)
+
+    except Exception:
+        logger.exception(
+            "Error during atomic write. Original file '%s' not modified",
+            filepath)
+        raise
+    finally:
+        # Clean up the temporary file if it still exists.
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
+
+
 def maybe_download_from_modelscope(
     model: str,
     revision: Optional[str] = None,
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 76f2bd087624c..5dc5d545bb9c5 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -4,7 +4,9 @@
 Whenever you add an architecture to this page, please also update
 `tests/models/registry.py` with example HuggingFace models for it.
 """
+import hashlib
 import importlib
+import json
 import os
 import pickle
 import subprocess
@@ -12,16 +14,19 @@ import sys
 import tempfile
 from abc import ABC, abstractmethod
 from collections.abc import Set
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from functools import lru_cache
+from pathlib import Path
 from typing import Callable, Optional, TypeVar, Union
 
 import torch.nn as nn
 import transformers
 
+from vllm import envs
 from vllm.config import (ModelConfig, iter_architecture_defaults,
                          try_match_architecture_defaults)
 from vllm.logger import init_logger
+from vllm.logging_utils import logtime
 from vllm.transformers_utils.dynamic_module import (
     try_get_class_from_dynamic_module)
 
@@ -421,10 +426,91 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
     module_name: str
     class_name: str
 
-    # Performed in another process to avoid initializing CUDA
+    @staticmethod
+    def _get_cache_dir() -> Path:
+        return Path(envs.VLLM_CACHE_ROOT) / "modelinfos"
+
+    def _get_cache_filename(self) -> str:
+        cls_name = f"{self.module_name}-{self.class_name}".replace(".", "-")
+        return f"{cls_name}.json"
+
+    def _load_modelinfo_from_cache(self,
+                                   module_hash: str) -> Optional[_ModelInfo]:
+        try:
+            try:
+                modelinfo_path = self._get_cache_dir(
+                ) / self._get_cache_filename()
+                with open(modelinfo_path, encoding="utf-8") as file:
+                    mi_dict = json.load(file)
+            except FileNotFoundError:
+                logger.debug(("Cached model info file "
+                              "for class %s.%s not found"), self.module_name,
+                             self.class_name)
+                return None
+
+            if mi_dict["hash"] != module_hash:
+                logger.debug(("Cached model info file "
+                              "for class %s.%s is stale"), self.module_name,
+                             self.class_name)
+                return None
+
+            # file not changed, use cached _ModelInfo properties
+            return _ModelInfo(**mi_dict["modelinfo"])
+        except Exception:
+            logger.exception(("Error reading cached model info "
+                              "for class %s.%s"), self.module_name,
"), self.module_name, + self.class_name) + return None + + def _save_modelinfo_to_cache(self, mi: _ModelInfo, + module_hash: str) -> None: + """save dictionary json file to cache""" + from vllm.model_executor.model_loader.weight_utils import atomic_writer + try: + modelinfo_dict = { + "hash": module_hash, + "modelinfo": asdict(mi), + } + cache_dir = self._get_cache_dir() + cache_dir.mkdir(parents=True, exist_ok=True) + modelinfo_path = cache_dir / self._get_cache_filename() + with atomic_writer(modelinfo_path, encoding='utf-8') as f: + json.dump(modelinfo_dict, f, indent=2) + except Exception: + logger.exception("Error saving model info cache.") + + @logtime(logger=logger, msg="Registry inspect model class") def inspect_model_cls(self) -> _ModelInfo: - return _run_in_subprocess( + model_path = Path( + __file__).parent / f"{self.module_name.split('.')[-1]}.py" + + assert model_path.exists(), \ + f"Model {self.module_name} expected to be on path {model_path}" + with open(model_path, "rb") as f: + module_hash = hashlib.md5(f.read()).hexdigest() + + mi = self._load_modelinfo_from_cache(module_hash) + if mi is not None: + logger.debug(("Loaded model info " + "for class %s.%s from cache"), self.module_name, + self.class_name) + return mi + else: + logger.debug(("Cache model info " + "for class %s.%s miss. " + "Loading model instead."), self.module_name, + self.class_name) + + # Performed in another process to avoid initializing CUDA + mi = _run_in_subprocess( lambda: _ModelInfo.from_model_cls(self.load_model_cls())) + logger.debug("Loaded model info for class %s.%s", self.module_name, + self.class_name) + + # save cache file + self._save_modelinfo_to_cache(mi, module_hash) + + return mi def load_model_cls(self) -> type[nn.Module]: mod = importlib.import_module(self.module_name)