[V1] Consolidate MM cache size to vllm.envs (#13239)
commit dd5ede4440 (parent 8c32b08a86)
vllm/envs.py (11 changed lines)
@@ -55,6 +55,7 @@ if TYPE_CHECKING:
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_VIDEO_FETCH_TIMEOUT: int = 30
     VLLM_AUDIO_FETCH_TIMEOUT: int = 10
+    VLLM_MM_INPUT_CACHE_SIZE: int = 256
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
@@ -401,15 +402,21 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),

     # Timeout for fetching videos when serving multimodal models
-    # Default is 15 seconds
+    # Default is 30 seconds
     "VLLM_VIDEO_FETCH_TIMEOUT":
-    lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "15")),
+    lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")),

     # Timeout for fetching audio when serving multimodal models
     # Default is 10 seconds
     "VLLM_AUDIO_FETCH_TIMEOUT":
     lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),

+    # Cache size for multimodal feature/input cache for multimodal models
+    # in unit of number of multimodal data items (e.g. image, video, audio).
+    # Default is 256 multimodal data items.
+    "VLLM_MM_INPUT_CACHE_SIZE":
+    lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_SIZE", "256")),
+
     # Path to the XLA persistent cache directory.
     # Only used for XLA devices such as TPUs.
     "VLLM_XLA_CACHE_PATH":
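The new variable follows the same pattern as the fetch timeouts: vllm.envs registers a lambda in environment_variables and resolves it from os.environ, falling back to 256 when unset. A minimal sketch of overriding the default from operator code (the value 512 and this standalone read are illustrative, not part of the commit):

import os

# Must be set before the value is read via vllm.envs (illustrative value).
os.environ["VLLM_MM_INPUT_CACHE_SIZE"] = "512"

import vllm.envs as envs

# Resolved through the lambda registered in environment_variables above;
# without the override this would print the default of 256.
print(envs.VLLM_MM_INPUT_CACHE_SIZE)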
@@ -8,6 +8,7 @@ from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional,

 import torch.nn as nn

+from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE
 from vllm.inputs import InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -28,9 +29,6 @@ if TYPE_CHECKING:

 logger = init_logger(__name__)

-# TODO: Tune the MM cache size
-MM_CACHE_SIZE = 256
-
 N = TypeVar("N", bound=Type[nn.Module])
 _I = TypeVar("_I", bound=BaseProcessingInfo)
 _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
@@ -121,7 +119,7 @@ class MultiModalRegistry:

         self._limits_by_model = _MultiModalLimits()

-        self._processing_cache = ProcessingCache(MM_CACHE_SIZE)
+        self._processing_cache = ProcessingCache(VLLM_MM_INPUT_CACHE_SIZE)

     def register_plugin(self, plugin: MultiModalPlugin) -> None:
         """
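ProcessingCache (defined elsewhere in vllm.multimodal) is what actually consumes this capacity; the diff only changes where the number comes from. As a rough mental model, a cache bounded by a count of multimodal items rather than by bytes behaves like the illustrative LRU below (the class name and key type are made up for the example):

from collections import OrderedDict
from typing import Generic, Hashable, Optional, TypeVar

V = TypeVar("V")


class ItemCountLRU(Generic[V]):
    """Illustrative LRU whose capacity counts items (e.g. images), not bytes."""

    def __init__(self, capacity: int) -> None:
        self.capacity = capacity
        self._entries: "OrderedDict[Hashable, V]" = OrderedDict()

    def get(self, key: Hashable) -> Optional[V]:
        if key not in self._entries:
            return None
        self._entries.move_to_end(key)          # mark as most recently used
        return self._entries[key]

    def put(self, key: Hashable, value: V) -> None:
        self._entries[key] = value
        self._entries.move_to_end(key)
        while len(self._entries) > self.capacity:
            self._entries.popitem(last=False)   # drop the least recently used item

With the default of 256, at most 256 processed items (images, videos, audio clips) are retained at once, regardless of how large each item's tensors are.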
@@ -3,6 +3,7 @@
 from typing import Any, Dict, List, Optional

 from vllm.config import ModelConfig
+from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE
 from vllm.logger import init_logger
 from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
                              MultiModalKwargs, MultiModalRegistry)
@@ -28,9 +29,8 @@ logger = init_logger(__name__)
 # client (=P0) and server (=P1) processes.

 # Both Client and Server must use the same cache size
-# (to perform mirrored caching)
-# TODO: Tune the MM cache size
-MM_CACHE_SIZE = 256
+# (to perform mirrored caching). This cache size is set by the environment
+# variable VLLM_MM_INPUT_CACHE_SIZE.


 # TODO(ywang96): Deprecate this class once all multimodal models migrate to use
@@ -50,7 +50,8 @@ class MMInputCacheClient:

         # Init cache
         self.use_cache = not model_config.disable_mm_preprocessor_cache
-        self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)
+        self.mm_cache = LRUCache[str,
+                                 MultiModalKwargs](VLLM_MM_INPUT_CACHE_SIZE)

         # DEBUG: Set to None to disable
         self.mm_debug_cache_hit_ratio_steps = None
@@ -127,7 +128,8 @@ class MMInputCacheServer:

     def __init__(self, model_config):
         self.use_cache = not model_config.disable_mm_preprocessor_cache
-        self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)
+        self.mm_cache = LRUCache[str,
+                                 MultiModalKwargs](VLLM_MM_INPUT_CACHE_SIZE)

     def get_and_update(
         self,
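The mirrored-caching comment is the reason both classes must read the same size: the client (P0) sends only the hash of a multimodal item it believes the server (P1) still holds, so the two LRU caches have to evict in lockstep. A self-contained toy sketch of that idea (function names and the OrderedDict-based cache are illustrative, not the vLLM implementation):

from collections import OrderedDict
from typing import Any, Optional, Tuple

CAPACITY = 4  # stands in for VLLM_MM_INPUT_CACHE_SIZE in this toy example


def _lru_put(cache: "OrderedDict[str, Any]", key: str, value: Any) -> None:
    cache[key] = value
    cache.move_to_end(key)
    if len(cache) > CAPACITY:
        cache.popitem(last=False)  # evict the least recently used entry


def client_send(cache: "OrderedDict[str, Any]", mm_hash: str,
                mm_data: Any) -> Tuple[str, Optional[Any]]:
    """P0 side: send the full payload only when the hash is not cached."""
    if mm_hash in cache:
        cache.move_to_end(mm_hash)
        return mm_hash, None  # P1 is assumed to still hold the payload
    _lru_put(cache, mm_hash, True)
    return mm_hash, mm_data


def server_receive(cache: "OrderedDict[str, Any]", mm_hash: str,
                   mm_data: Optional[Any]) -> Any:
    """P1 side: fall back to the mirrored cache when only a hash arrives."""
    if mm_data is None:
        cache.move_to_end(mm_hash)
        return cache[mm_hash]  # safe only if both caches share one capacity
    _lru_put(cache, mm_hash, mm_data)
    return mm_data

If the client's capacity were larger than the server's, the client could send a bare hash for an item the server has already evicted; sourcing both capacities from VLLM_MM_INPUT_CACHE_SIZE rules that out.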