Generate _ModelInfo properties file when loading to improve loading speed (#23558)

Signed-off-by: Manoel Marques <manoel.marques@ibm.com>
Signed-off-by: Manoel Marques <manoelmrqs@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
Manoel Marques 2025-09-20 07:51:13 -04:00 committed by yewentao256
parent dad5f4d16d
commit af4dedf6d3
4 changed files with 167 additions and 3 deletions

View File

@ -2,7 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.logging_utils.formatter import NewLineFormatter from vllm.logging_utils.formatter import NewLineFormatter
from vllm.logging_utils.log_time import logtime
__all__ = [ __all__ = [
"NewLineFormatter", "NewLineFormatter",
"logtime",
] ]

View File

@ -0,0 +1,32 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Provides a decorator that logs the execution time of a function.
"""
import functools
import time
def logtime(logger, msg=None):
    """
    Build a decorator that reports how long each call of a function takes.

    The duration is measured with ``time.perf_counter`` and emitted through
    ``logger.debug`` after the wrapped call has returned; a call that raises
    is not reported. Always place it beneath other decorators.

    Args:
        logger: Object whose ``debug`` method receives the timing record.
        msg: Label used in the record. When ``None``, the label is built
            from the decorated function's module and qualified name.

    Returns:
        A decorator that wraps a function with the timing logic.
    """

    def _decorate(func):

        @functools.wraps(func)
        def _timed(*args, **kwargs):
            began = time.perf_counter()
            outcome = func(*args, **kwargs)
            duration = time.perf_counter() - began
            if msg is None:
                label = f"Function '{func.__module__}.{func.__qualname__}'"
            else:
                label = msg
            logger.debug("%s: Elapsed time %.7f secs", label, duration)
            return outcome

        return _timed

    return _decorate

View File

@ -11,6 +11,7 @@ import tempfile
import time import time
from collections import defaultdict from collections import defaultdict
from collections.abc import Generator from collections.abc import Generator
from contextlib import contextmanager
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Optional, Union from typing import Any, Callable, Optional, Union
@ -98,6 +99,49 @@ def get_lock(model_name_or_path: Union[str, Path],
return lock return lock
@contextmanager
def atomic_writer(filepath: Union[str, Path],
                  mode: str = 'w',
                  encoding: Optional[str] = None):
    """
    Context manager that provides an atomic file writing routine.

    The context manager writes to a temporary file and, if successful,
    atomically replaces the original file.

    Args:
        filepath (str or Path): The path to the file to write.
        mode (str): The file mode for the temporary file (e.g., 'w', 'wb').
        encoding (str): The encoding for text mode.

    Yields:
        file object: A handle to the temporary file.

    Raises:
        Exception: Any error raised while opening, writing or replacing the
            file is logged and re-raised; the target file is left untouched.
    """
    # Create a temporary file in the same directory as the target file
    # to ensure it's on the same filesystem for an atomic replace.
    temp_dir = os.path.dirname(filepath)
    temp_fd, temp_path = tempfile.mkstemp(dir=temp_dir)

    try:
        try:
            temp_file = os.fdopen(temp_fd, mode=mode, encoding=encoding)
        except BaseException:
            # os.fdopen can reject its arguments (for example a binary mode
            # combined with an encoding) before it takes ownership of the
            # descriptor, which would otherwise stay open.
            try:
                os.close(temp_fd)
            except OSError:
                # The failed wrapper already closed the descriptor.
                pass
            raise

        with temp_file:
            yield temp_file

        # The temporary file is closed at this point, so its content is
        # flushed; perform the atomic replace.
        os.replace(temp_path, filepath)

    except Exception:
        logger.exception(
            "Error during atomic write. Original file '%s' not modified",
            filepath)
        raise
    finally:
        # Clean up the temporary file if it still exists. After a successful
        # replace it is already gone, which is not an error.
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            pass
def maybe_download_from_modelscope( def maybe_download_from_modelscope(
model: str, model: str,
revision: Optional[str] = None, revision: Optional[str] = None,

View File

@ -4,7 +4,9 @@
Whenever you add an architecture to this page, please also update Whenever you add an architecture to this page, please also update
`tests/models/registry.py` with example HuggingFace models for it. `tests/models/registry.py` with example HuggingFace models for it.
""" """
import hashlib
import importlib import importlib
import json
import os import os
import pickle import pickle
import subprocess import subprocess
@ -12,16 +14,19 @@ import sys
import tempfile import tempfile
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections.abc import Set from collections.abc import Set
from dataclasses import dataclass, field from dataclasses import asdict, dataclass, field
from functools import lru_cache from functools import lru_cache
from pathlib import Path
from typing import Callable, Optional, TypeVar, Union from typing import Callable, Optional, TypeVar, Union
import torch.nn as nn import torch.nn as nn
import transformers import transformers
from vllm import envs
from vllm.config import (ModelConfig, iter_architecture_defaults, from vllm.config import (ModelConfig, iter_architecture_defaults,
try_match_architecture_defaults) try_match_architecture_defaults)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.logging_utils import logtime
from vllm.transformers_utils.dynamic_module import ( from vllm.transformers_utils.dynamic_module import (
try_get_class_from_dynamic_module) try_get_class_from_dynamic_module)
@ -421,10 +426,91 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
module_name: str module_name: str
class_name: str class_name: str
# Performed in another process to avoid initializing CUDA @staticmethod
def _get_cache_dir() -> Path:
    """Return the ``modelinfos`` directory under ``envs.VLLM_CACHE_ROOT``."""
    cache_root = Path(envs.VLLM_CACHE_ROOT)
    return cache_root.joinpath("modelinfos")
def _get_cache_filename(self) -> str:
cls_name = f"{self.module_name}-{self.class_name}".replace(".", "-")
return f"{cls_name}.json"
def _load_modelinfo_from_cache(self,
                               module_hash: str) -> Optional[_ModelInfo]:
    """
    Try to rebuild the ``_ModelInfo`` of this model from its cache file.

    Args:
        module_hash: Hash of the model's source file; it must match the
            ``"hash"`` entry stored in the cache file.

    Returns:
        The cached ``_ModelInfo``, or ``None`` when the cache file does not
        exist, was written for a different hash, or cannot be read or
        parsed. A failed lookup never raises: unexpected errors are logged.
    """
    try:
        try:
            modelinfo_path = self._get_cache_dir(
            ) / self._get_cache_filename()
            with open(modelinfo_path, encoding="utf-8") as file:
                mi_dict = json.load(file)
        except FileNotFoundError:
            # A missing file is the normal first-run case, not an error.
            logger.debug(("Cached model info file "
                          "for class %s.%s not found"), self.module_name,
                         self.class_name)
            return None

        if mi_dict["hash"] != module_hash:
            logger.debug(("Cached model info file "
                          "for class %s.%s is stale"), self.module_name,
                         self.class_name)
            return None

        # file not changed, use cached _ModelInfo properties
        return _ModelInfo(**mi_dict["modelinfo"])
    except Exception:
        # Corrupt JSON, missing keys or fields that no longer match
        # _ModelInfo all end up here; treat them as a cache miss.
        logger.exception(("Cached model info "
                          "for class %s.%s error. "), self.module_name,
                         self.class_name)
        return None
def _save_modelinfo_to_cache(self, mi: _ModelInfo,
                             module_hash: str) -> None:
    """
    Write ``mi`` and ``module_hash`` to this model's JSON cache file.

    The cache directory is created when missing and the file is written
    through ``atomic_writer``. Saving is best-effort: any failure is logged
    and swallowed.
    """
    from vllm.model_executor.model_loader.weight_utils import atomic_writer

    try:
        payload = {"hash": module_hash, "modelinfo": asdict(mi)}
        target_dir = self._get_cache_dir()
        target_dir.mkdir(parents=True, exist_ok=True)
        target_file = target_dir / self._get_cache_filename()
        with atomic_writer(target_file, encoding='utf-8') as out:
            json.dump(payload, out, indent=2)
    except Exception:
        logger.exception("Error saving model info cache.")
@logtime(logger=logger, msg="Registry inspect model class")
def inspect_model_cls(self) -> _ModelInfo:
    """
    Return the ``_ModelInfo`` of this lazily registered model class.

    The model's source file is looked up next to this file, named after
    the last component of ``module_name``. Its hash is used to validate a
    JSON cache entry: on a hit the cached info is returned, otherwise the
    model class is inspected in a subprocess and the result is cached.

    When no such source file exists, the cache is skipped and the model is
    always inspected in a subprocess.

    Returns:
        The model info, either loaded from cache or freshly inspected.
    """
    # NOTE(review): the lookup uses only the last module component, so a
    # module outside this directory that shares a file name with a local
    # one would be hashed against the local file -- confirm acceptable.
    model_path = Path(
        __file__).parent / f"{self.module_name.split('.')[-1]}.py"

    module_hash: Optional[str] = None
    if model_path.exists():
        with open(model_path, "rb") as f:
            # The digest is only a change detector for the source file;
            # usedforsecurity=False keeps md5 usable on FIPS-restricted
            # builds and yields the same digest value.
            module_hash = hashlib.md5(f.read(),
                                      usedforsecurity=False).hexdigest()

        mi = self._load_modelinfo_from_cache(module_hash)
        if mi is not None:
            logger.debug(("Loaded model info "
                          "for class %s.%s from cache"), self.module_name,
                         self.class_name)
            return mi

        logger.debug(("Cache model info "
                      "for class %s.%s miss. "
                      "Loading model instead."), self.module_name,
                     self.class_name)
    else:
        logger.debug(
            "Model %s source not found on path %s. "
            "Skipping model info cache.", self.module_name, model_path)

    # Performed in another process to avoid initializing CUDA
    mi = _run_in_subprocess(
        lambda: _ModelInfo.from_model_cls(self.load_model_cls()))
    logger.debug("Loaded model info for class %s.%s", self.module_name,
                 self.class_name)

    # save cache file (only possible when a source hash was computed)
    if module_hash is not None:
        self._save_modelinfo_to_cache(mi, module_hash)

    return mi
def load_model_cls(self) -> type[nn.Module]: def load_model_cls(self) -> type[nn.Module]:
mod = importlib.import_module(self.module_name) mod = importlib.import_module(self.module_name)