[FEATURE]: Use pydantic validation in multimodal.py config (#26629)

Signed-off-by: Anand Roy <86306690+andycandy@users.noreply.github.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-05-25 12:31:19 +08:00 · 2025-10-13 20:26:59 +05:30 · 2025-10-13 20:26:59 +05:30 · 10214b6935
commit 10214b6935
parent 4a61950f4d
1 changed files with 18 additions and 7 deletions
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@ -3,10 +3,9 @@
 import hashlib
 from collections.abc import Mapping
 from dataclasses import field
 from typing import Any, Literal, TypeAlias
-from pydantic import ConfigDict, Field, field_validator
+from pydantic import ConfigDict, Field, field_validator, model_validator
 from pydantic.dataclasses import dataclass
 from vllm.config.utils import config
@ -55,7 +54,7 @@ DummyOptions: TypeAlias = (
 class MultiModalConfig:
    """Controls the behavior of multimodal models."""
-    limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict)
+    limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
    """The maximum number of input items and options allowed per 
        prompt for each modality.
    Defaults to 999 for each modality.
@ -71,7 +70,7 @@ class MultiModalConfig:
        {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, 
        "height": 512}}
    """
-    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
+    media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
    """Additional args passed to process media inputs, keyed by modalities.
    For example, to set num_frames for video, set
    `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
@ -84,7 +83,7 @@ class MultiModalConfig:
    For example, for Phi-3-Vision:
    `{"num_crops": 4}`."""
-    mm_processor_cache_gb: float = 4
+    mm_processor_cache_gb: float = Field(default=4, ge=0)
    """The size (in GiB) of the multi-modal processor cache, which is used to
    avoid re-processing past multi-modal inputs.
@ -96,7 +95,7 @@ class MultiModalConfig:
    mm_processor_cache_type: MMCacheType = "lru"
    """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
    use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
-    mm_shm_cache_max_object_size_mb: int = 128
+    mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0)
    """Size limit (in MiB) for each object stored in the multi-modal processor
    shared memory cache. Only effective when `mm_processor_cache_type` is
    `"shm"`."""
@ -123,7 +122,7 @@ class MultiModalConfig:
    This reduces engine startup time but shifts the responsibility to users for
    estimating the peak memory usage of the activation of multimodal encoder and
    embedding cache."""
-    video_pruning_rate: float | None = None
+    video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0)
    """Sets pruning rate for video pruning via Efficient Video Sampling.
    Value sits in range [0;1) and determines fraction of media tokens
    from each video to be pruned.
@ -149,6 +148,18 @@ class MultiModalConfig:
                value[k] = BaseDummyOptions(**v)
        return value
    @model_validator(mode="after")
    def _validate_multimodal_config(self):
        """Check that the shm object-size limit is only customised for `shm`.

        Registered as an "after" model validator, so it receives the
        constructed config instance.

        Returns:
            The same instance, unchanged, when the check passes.

        Raises:
            ValueError: If `mm_processor_cache_type` is not `"shm"` while
                `mm_shm_cache_max_object_size_mb` differs from the value
                read from the `MultiModalConfig` class attribute of the
                same name.
        """
        # With the shared-memory cache selected, any size limit is acceptable.
        if self.mm_processor_cache_type == "shm":
            return self

        # Reference value is looked up on the class, not on the instance
        # (expected to be the declared default -- verify if that changes).
        reference_size = MultiModalConfig.mm_shm_cache_max_object_size_mb
        if self.mm_shm_cache_max_object_size_mb != reference_size:
            raise ValueError(
                "'mm_shm_cache_max_object_size_mb' should only be set when "
                "'mm_processor_cache_type' is 'shm'."
            )

        return self
    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,