[FEATURE]: Use pydantic validation in multimodal.py config (#26629)

Signed-off-by: Anand Roy <86306690+andycandy@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Anand Roy 2025-10-13 20:26:59 +05:30 committed by GitHub
parent 4a61950f4d
commit 10214b6935
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -3,10 +3,9 @@
import hashlib import hashlib
from collections.abc import Mapping from collections.abc import Mapping
from dataclasses import field
from typing import Any, Literal, TypeAlias from typing import Any, Literal, TypeAlias
from pydantic import ConfigDict, Field, field_validator from pydantic import ConfigDict, Field, field_validator, model_validator
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
from vllm.config.utils import config from vllm.config.utils import config
@ -55,7 +54,7 @@ DummyOptions: TypeAlias = (
class MultiModalConfig: class MultiModalConfig:
"""Controls the behavior of multimodal models.""" """Controls the behavior of multimodal models."""
limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict) limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
"""The maximum number of input items and options allowed per """The maximum number of input items and options allowed per
prompt for each modality. prompt for each modality.
Defaults to 999 for each modality. Defaults to 999 for each modality.
@ -71,7 +70,7 @@ class MultiModalConfig:
{"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
"height": 512}} "height": 512}}
""" """
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities. """Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`""" `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
@ -84,7 +83,7 @@ class MultiModalConfig:
For example, for Phi-3-Vision: For example, for Phi-3-Vision:
`{"num_crops": 4}`.""" `{"num_crops": 4}`."""
mm_processor_cache_gb: float = 4 mm_processor_cache_gb: float = Field(default=4, ge=0)
"""The size (in GiB) of the multi-modal processor cache, which is used to """The size (in GiB) of the multi-modal processor cache, which is used to
avoid re-processing past multi-modal inputs. avoid re-processing past multi-modal inputs.
@ -96,7 +95,7 @@ class MultiModalConfig:
mm_processor_cache_type: MMCacheType = "lru" mm_processor_cache_type: MMCacheType = "lru"
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
mm_shm_cache_max_object_size_mb: int = 128 mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0)
"""Size limit (in MiB) for each object stored in the multi-modal processor """Size limit (in MiB) for each object stored in the multi-modal processor
shared memory cache. Only effective when `mm_processor_cache_type` is shared memory cache. Only effective when `mm_processor_cache_type` is
`"shm"`.""" `"shm"`."""
@ -123,7 +122,7 @@ class MultiModalConfig:
This reduces engine startup time but shifts the responsibility to users for This reduces engine startup time but shifts the responsibility to users for
estimating the peak memory usage of the activation of multimodal encoder and estimating the peak memory usage of the activation of multimodal encoder and
embedding cache.""" embedding cache."""
video_pruning_rate: float | None = None video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0)
"""Sets pruning rate for video pruning via Efficient Video Sampling. """Sets pruning rate for video pruning via Efficient Video Sampling.
Value sits in range [0;1) and determines fraction of media tokens Value sits in range [0;1) and determines fraction of media tokens
from each video to be pruned. from each video to be pruned.
@ -149,6 +148,18 @@ class MultiModalConfig:
value[k] = BaseDummyOptions(**v) value[k] = BaseDummyOptions(**v)
return value return value
@model_validator(mode="after")
def _validate_multimodal_config(self):
    """Reject a custom shm object-size limit when the cache type is not `shm`.

    Registered as an "after" model validator, so it receives the already
    constructed config instance and hands the same instance back.

    The check compares this instance's `mm_shm_cache_max_object_size_mb`
    with the attribute of the same name looked up on `MultiModalConfig`
    itself.
    NOTE(review): presumably that class-level attribute is the declared
    default; confirm how a `Field(...)` default is exposed on the class.

    Returns:
        This config instance, unchanged.

    Raises:
        ValueError: If `mm_processor_cache_type` is not `"shm"` and
            `mm_shm_cache_max_object_size_mb` differs from the class-level
            value.
    """
    # The size limit is only meaningful for the shm cache, so there is
    # nothing to cross-check in that case.
    if self.mm_processor_cache_type == "shm":
        return self

    configured_size = self.mm_shm_cache_max_object_size_mb
    class_level_size = MultiModalConfig.mm_shm_cache_max_object_size_mb
    if configured_size != class_level_size:
        raise ValueError(
            "'mm_shm_cache_max_object_size_mb' should only be set when "
            "'mm_processor_cache_type' is 'shm'."
        )
    return self
def compute_hash(self) -> str: def compute_hash(self) -> str:
""" """
WARNING: Whenever a new field is added to this config, WARNING: Whenever a new field is added to this config,