Mirror of https://git.datalinker.icu/vllm-project/vllm.git
(synced 2026-05-25 13:24:32 +08:00)
[FEATURE]: Use pydantic validation in multimodal.py config (#26629)
Signed-off-by: Anand Roy <86306690+andycandy@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
4a61950f4d
commit
10214b6935
@ -3,10 +3,9 @@
|
|||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
from collections.abc import Mapping
|
from collections.abc import Mapping
|
||||||
from dataclasses import field
|
|
||||||
from typing import Any, Literal, TypeAlias
|
from typing import Any, Literal, TypeAlias
|
||||||
|
|
||||||
from pydantic import ConfigDict, Field, field_validator
|
from pydantic import ConfigDict, Field, field_validator, model_validator
|
||||||
from pydantic.dataclasses import dataclass
|
from pydantic.dataclasses import dataclass
|
||||||
|
|
||||||
from vllm.config.utils import config
|
from vllm.config.utils import config
|
||||||
@ -55,7 +54,7 @@ DummyOptions: TypeAlias = (
|
|||||||
class MultiModalConfig:
|
class MultiModalConfig:
|
||||||
"""Controls the behavior of multimodal models."""
|
"""Controls the behavior of multimodal models."""
|
||||||
|
|
||||||
limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict)
|
limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
|
||||||
"""The maximum number of input items and options allowed per
|
"""The maximum number of input items and options allowed per
|
||||||
prompt for each modality.
|
prompt for each modality.
|
||||||
Defaults to 999 for each modality.
|
Defaults to 999 for each modality.
|
||||||
@ -71,7 +70,7 @@ class MultiModalConfig:
|
|||||||
{"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
|
{"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
|
||||||
"height": 512}}
|
"height": 512}}
|
||||||
"""
|
"""
|
||||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
|
||||||
"""Additional args passed to process media inputs, keyed by modalities.
|
"""Additional args passed to process media inputs, keyed by modalities.
|
||||||
For example, to set num_frames for video, set
|
For example, to set num_frames for video, set
|
||||||
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
|
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
|
||||||
@ -84,7 +83,7 @@ class MultiModalConfig:
|
|||||||
|
|
||||||
For example, for Phi-3-Vision:
|
For example, for Phi-3-Vision:
|
||||||
`{"num_crops": 4}`."""
|
`{"num_crops": 4}`."""
|
||||||
mm_processor_cache_gb: float = 4
|
mm_processor_cache_gb: float = Field(default=4, ge=0)
|
||||||
"""The size (in GiB) of the multi-modal processor cache, which is used to
|
"""The size (in GiB) of the multi-modal processor cache, which is used to
|
||||||
avoid re-processing past multi-modal inputs.
|
avoid re-processing past multi-modal inputs.
|
||||||
|
|
||||||
@ -96,7 +95,7 @@ class MultiModalConfig:
|
|||||||
mm_processor_cache_type: MMCacheType = "lru"
|
mm_processor_cache_type: MMCacheType = "lru"
|
||||||
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
|
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
|
||||||
use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
|
use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
|
||||||
mm_shm_cache_max_object_size_mb: int = 128
|
mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0)
|
||||||
"""Size limit (in MiB) for each object stored in the multi-modal processor
|
"""Size limit (in MiB) for each object stored in the multi-modal processor
|
||||||
shared memory cache. Only effective when `mm_processor_cache_type` is
|
shared memory cache. Only effective when `mm_processor_cache_type` is
|
||||||
`"shm"`."""
|
`"shm"`."""
|
||||||
@ -123,7 +122,7 @@ class MultiModalConfig:
|
|||||||
This reduces engine startup time but shifts the responsibility to users for
|
This reduces engine startup time but shifts the responsibility to users for
|
||||||
estimating the peak memory usage of the activation of multimodal encoder and
|
estimating the peak memory usage of the activation of multimodal encoder and
|
||||||
embedding cache."""
|
embedding cache."""
|
||||||
video_pruning_rate: float | None = None
|
video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0)
|
||||||
"""Sets pruning rate for video pruning via Efficient Video Sampling.
|
"""Sets pruning rate for video pruning via Efficient Video Sampling.
|
||||||
Value sits in range [0;1) and determines fraction of media tokens
|
Value sits in range [0;1) and determines fraction of media tokens
|
||||||
from each video to be pruned.
|
from each video to be pruned.
|
||||||
@ -149,6 +148,18 @@ class MultiModalConfig:
|
|||||||
value[k] = BaseDummyOptions(**v)
|
value[k] = BaseDummyOptions(**v)
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
@model_validator(mode="after")
|
||||||
|
def _validate_multimodal_config(self):
|
||||||
|
if self.mm_processor_cache_type != "shm" and (
|
||||||
|
self.mm_shm_cache_max_object_size_mb
|
||||||
|
!= MultiModalConfig.mm_shm_cache_max_object_size_mb
|
||||||
|
):
|
||||||
|
raise ValueError(
|
||||||
|
"'mm_shm_cache_max_object_size_mb' should only be set when "
|
||||||
|
"'mm_processor_cache_type' is 'shm'."
|
||||||
|
)
|
||||||
|
return self
|
||||||
|
|
||||||
def compute_hash(self) -> str:
|
def compute_hash(self) -> str:
|
||||||
"""
|
"""
|
||||||
WARNING: Whenever a new field is added to this config,
|
WARNING: Whenever a new field is added to this config,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user