diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index 818efd825640c..2bf29ecf087f3 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -9,7 +9,7 @@ from unittest.mock import MagicMock
 
 import pytest
 
-from vllm.config import MultiModalConfig
+from vllm.config.multimodal import MultiModalConfig
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index d219a1f311f15..502704c9bbdff 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -12,7 +12,7 @@ from unittest.mock import MagicMock
 import pytest
 import pytest_asyncio
 
-from vllm.config import MultiModalConfig
+from vllm.config.multimodal import MultiModalConfig
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py
index 3c61ee26e092e..3c737acfbfe28 100644
--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
@@ -7,6 +7,7 @@ import pytest
 import torch
 
 from vllm.config import ModelConfig, ParallelConfig, VllmConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import (MultiModalCache,
                                    MultiModalProcessorCacheItem,
                                    MultiModalProcessorCacheItemMetadata,
@@ -17,7 +18,6 @@ from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
                                     MultiModalKwargsItems,
                                     MultiModalSharedField)
 from vllm.multimodal.processing import PromptInsertion
-from vllm.multimodal.registry import MultiModalRegistry
 
 
 def _dummy_elem(
@@ -96,7 +96,9 @@ def _create_vllm_config(
     enable_ipc: bool,
 ):
     return VllmConfig(
-        model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb),
+        model_config=ModelConfig(
+            model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+            mm_processor_cache_gb=mm_processor_cache_gb),
         parallel_config=ParallelConfig(
             data_parallel_size=1 if enable_ipc else 2),
     )
@@ -113,15 +115,16 @@ def _compare_caches(
     n_iter: int = 100,
     seed: int = 0,
 ):
-    mm_registry = MultiModalRegistry()
-    cache_0_p0 = processor_cache_from_config(config_0, mm_registry)
-    cache_0_p1 = engine_receiver_cache_from_config(config_0, mm_registry)
-    cache_1_p0 = processor_cache_from_config(config_1, mm_registry)
-    cache_1_p1 = engine_receiver_cache_from_config(config_1, mm_registry)
+    cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY)
+    cache_0_p1 = engine_receiver_cache_from_config(config_0,
+                                                   MULTIMODAL_REGISTRY)
+    cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY)
+    cache_1_p1 = engine_receiver_cache_from_config(config_1,
+                                                   MULTIMODAL_REGISTRY)
 
     cache_size_gb = max(
-        config_0.model_config.mm_processor_cache_gb,
-        config_1.model_config.mm_processor_cache_gb,
+        config_0.model_config.multimodal_config.mm_processor_cache_gb,
+        config_1.model_config.multimodal_config.mm_processor_cache_gb,
     )
     item_size_gb = int(cache_size_gb / item_capacity)
 
diff --git a/tests/test_config.py b/tests/test_config.py
index 373fbd267539a..8db04de5469ad 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -6,9 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
 import pytest
 
 from vllm.compilation.backends import VllmBackend
-from vllm.config import (ModelConfig, PoolerConfig, VllmConfig, get_field,
-                         update_config)
+from vllm.config import ModelConfig, PoolerConfig, VllmConfig, update_config
 from vllm.config.load import LoadConfig
+from vllm.config.utils import get_field
 from vllm.model_executor.layers.pooler import PoolingType
 from vllm.platforms import current_platform
 
diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py
index 955c74d262a09..bdd41eece2317 100644
--- a/tests/v1/engine/test_processor_multi_modal_uuids.py
+++ b/tests/v1/engine/test_processor_multi_modal_uuids.py
@@ -31,7 +31,7 @@ def _mk_processor(monkeypatch,
                         raising=True)
     monkeypatch.setattr(ModelConfig,
                         "__post_init__",
-                        lambda self: None,
+                        lambda self, *args: None,
                         raising=True)
     monkeypatch.setattr(UnspecifiedPlatform,
                         "is_async_output_supported",
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 85e58c290b792..599eeb63bf804 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -11,13 +11,12 @@ import json
 import os
 import textwrap
 import warnings
-from collections.abc import Mapping
 from contextlib import contextmanager
-from dataclasses import MISSING, Field, field, fields, is_dataclass, replace
+from dataclasses import InitVar, field, fields, is_dataclass, replace
 from functools import cached_property, lru_cache
 from importlib.util import find_spec
-from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional,
-                    Protocol, TypeVar, Union, cast, get_args)
+from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Protocol,
+                    TypeVar, Union, cast, get_args)
 
 import regex as re
 import torch
@@ -37,6 +36,8 @@ from vllm.config.kv_events import KVEventsConfig
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
+from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
+                                    MultiModalConfig)
 from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                   ParallelConfig)
 from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
@@ -238,31 +239,12 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]:
     return out
 
 
-def get_field(cls: ConfigType, name: str) -> Field:
-    """Get the default factory field of a dataclass by name. Used for getting
-    default factory fields in `EngineArgs`."""
-    if not is_dataclass(cls):
-        raise TypeError("The given class is not a dataclass.")
-    cls_fields = {f.name: f for f in fields(cls)}
-    if name not in cls_fields:
-        raise ValueError(f"Field '{name}' not found in {cls.__name__}.")
-    named_field: Field = cls_fields[name]
-    if (default_factory := named_field.default_factory) is not MISSING:
-        return field(default_factory=default_factory)
-    if (default := named_field.default) is not MISSING:
-        return field(default=default)
-    raise ValueError(
-        f"{cls.__name__}.{name} must have a default value or default factory.")
-
-
 def is_init_field(cls: ConfigType, name: str) -> bool:
     return next(f for f in fields(cls) if f.name == name).init
 
 
 TokenizerMode = Literal["auto", "slow", "mistral", "custom"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
-MMEncoderTPMode = Literal["weights", "data"]
-MMCacheType = Literal["shm", "lru"]
 
 
 class LogprobsMode(enum.Enum):
@@ -407,20 +389,6 @@ class ModelConfig:
     that this name(s) will also be used in `model_name` tag content of
     prometheus metrics, if multiple names provided, metrics tag will take the
     first one."""
-    limit_mm_per_prompt: dict[str, int] = field(default_factory=dict)
-    """Maximum number of data items per modality per prompt. Only applicable
-    for multimodal models."""
-    interleave_mm_strings: bool = False
-    """Enable fully interleaved support for multimodal prompts, while using
-    --chat-template-content-format=string. Defaults to False."""
-    skip_mm_profiling: bool = False
-    """When enabled, skips multimodal memory profiling and only profiles with
-    language backbone model during engine initialization.
-    """
-    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-    """Additional args passed to process media inputs, keyed by modalities.
-    For example, to set num_frames for video, set
-    `--media-io-kwargs '{"video": {"num_frames": 40} }'` """
     use_async_output_proc: bool = True
     """Whether to use async output processor."""
     config_format: Union[str, ConfigFormat] = "auto"
@@ -436,41 +404,6 @@ class ModelConfig:
     hf_overrides: HfOverrides = field(default_factory=dict)
     """If a dictionary, contains arguments to be forwarded to the Hugging Face
     config. If a callable, it is called to update the HuggingFace config."""
-    mm_processor_kwargs: Optional[dict[str, Any]] = None
-    """Arguments to be forwarded to the model's processor for multi-modal data,
-    e.g., image processor. Overrides for the multi-modal processor obtained
-    from `AutoProcessor.from_pretrained`. The available overrides depend on the
-    model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`.
-    """
-    mm_processor_cache_gb: float = 4
-    """The size (in GiB) of the multi-modal processor cache, which is used to
-    avoid re-processing past multi-modal inputs.
-
-    This cache is duplicated for each API process and engine core process,
-    resulting in a total memory usage of
-    `mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
-
-    Set to `0` to disable this cache completely (not recommended)."""
-    mm_processor_cache_type: MMCacheType = "lru"
-    """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
-    use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
-    mm_shm_cache_max_object_size_mb: int = 128
-    """Size limit (in MiB) for each object stored in the multi-modal processor
-    shared memory cache. Only effective when `mm_processor_cache_type` is
-    `"shm"`."""
-    mm_encoder_tp_mode: MMEncoderTPMode = "weights"
-    """Indicates how to optimize multi-modal encoder inference using
-    tensor parallelism (TP).
-
-    - `"weights"`: Within the same vLLM engine, split the weights of
-        each layer across TP ranks. (default TP behavior)
-    - `"data"`: Within the same vLLM engine, split the batched input data
-        across TP ranks to process the data in parallel, while hosting
-        the full weights on each TP rank.
-        This batch-level DP is not to be confused with API request-level
-        DP (which is controlled by `--data-parallel-size`).
-        This is only supported on a per-model basis and falls back to
-        `"weights"` if the encoder does not support DP."""
     pooler_config: Optional["PoolerConfig"] = field(init=False)
     """Pooler config which controls the behaviour of output pooling in pooling
     models."""
@@ -513,6 +446,18 @@ class ModelConfig:
     io_processor_plugin: Optional[str] = None
     """IOProcessor plugin name to load at model startup"""
 
+    # Multimodal config and init vars
+    multimodal_config: Optional[MultiModalConfig] = None
+    limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None
+    media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
+    mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
+    mm_processor_cache_gb: InitVar[Optional[float]] = None
+    mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None
+    mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None
+    mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None
+    interleave_mm_strings: InitVar[Optional[bool]] = None
+    skip_mm_profiling: InitVar[Optional[bool]] = None
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -546,7 +491,18 @@ class ModelConfig:
         assert_hashable(str_factors)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
-    def __post_init__(self) -> None:
+    def __post_init__(
+            self,
+            # Multimodal config init vars
+            limit_mm_per_prompt: Optional[dict[str, int]],
+            media_io_kwargs: Optional[dict[str, dict[str, Any]]],
+            mm_processor_kwargs: Optional[dict[str, Any]],
+            mm_processor_cache_gb: Optional[float],
+            mm_processor_cache_type: Optional[MMCacheType],
+            mm_shm_cache_max_object_size_mb: Optional[int],
+            mm_encoder_tp_mode: Optional[MMEncoderTPMode],
+            interleave_mm_strings: Optional[bool],
+            skip_mm_profiling: Optional[bool]) -> None:
         # Set the default seed to 0 in V1.
         # NOTE(woosuk): In V0, we set the default seed to None because the
         # driver worker shares the same process as the user process, and thus
@@ -777,7 +733,33 @@ class ModelConfig:
         self.original_max_model_len = self.max_model_len
         self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
 
-        self.multimodal_config = self._init_multimodal_config()
+        # Init multimodal config if needed
+        if self._model_info.supports_multimodal:
+            if (mm_encoder_tp_mode == "data" and
+                    not self._model_info.supports_multimodal_encoder_tp_data):
+                logger.warning_once(
+                    "This model does not support `--mm-encoder-tp-mode data`. "
" + "Falling back to `--mm-encoder-tp-mode weights`.") + mm_encoder_tp_mode = "weights" + + mm_config_kwargs = dict( + limit_per_prompt=limit_mm_per_prompt, + media_io_kwargs=media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, + mm_processor_cache_gb=mm_processor_cache_gb, + mm_processor_cache_type=mm_processor_cache_type, + mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb, + mm_encoder_tp_mode=mm_encoder_tp_mode, + interleave_mm_strings=interleave_mm_strings, + skip_mm_profiling=skip_mm_profiling, + ) + + mm_config_kwargs = { + k: v + for k, v in mm_config_kwargs.items() if v is not None + } + + self.multimodal_config = MultiModalConfig(**mm_config_kwargs) if self.disable_sliding_window: # Set after get_and_verify_max_len to ensure that max_model_len @@ -875,30 +857,6 @@ class ModelConfig: ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"]) self.tokenizer = object_storage_tokenizer.dir - def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: - if self._model_info.supports_multimodal: - if (self.mm_encoder_tp_mode == "data" and - not self._model_info.supports_multimodal_encoder_tp_data): - logger.warning_once( - "This model does not support `--mm-encoder-tp-mode data`. " - "Falling back to `--mm-encoder-tp-mode weights`.") - self.mm_encoder_tp_mode = "weights" - - return MultiModalConfig( - limit_per_prompt=self.limit_mm_per_prompt, - media_io_kwargs=self.media_io_kwargs, - mm_processor_kwargs=self.mm_processor_kwargs, - mm_processor_cache_gb=self.mm_processor_cache_gb, - mm_processor_cache_type=self.mm_processor_cache_type, - mm_shm_cache_max_object_size_mb=self. - mm_shm_cache_max_object_size_mb, - mm_encoder_tp_mode=self.mm_encoder_tp_mode, - interleave_mm_strings=self.interleave_mm_strings, - skip_mm_profiling=self.skip_mm_profiling, - ) - - return None - def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) @@ -2417,129 +2375,6 @@ class SpeculativeConfig: return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" -@config -@dataclass -class MultiModalConfig: - """Controls the behavior of multimodal models.""" - - limit_per_prompt: dict[str, int] = \ - cast(dict[str, int], get_field(ModelConfig, "limit_mm_per_prompt")) - """ - The maximum number of input items allowed per prompt for each modality. - Defaults to 1 (V0) or 999 (V1) for each modality. - - For example, to allow up to 16 images and 2 videos per prompt: - `{"image": 16, "video": 2}` - """ - - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set - `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ - - mm_processor_kwargs: Optional[dict[str, object]] = None - """ - Overrides for the multi-modal processor obtained from - `transformers.AutoProcessor.from_pretrained`. - - The available overrides depend on the model that is being run. - - For example, for Phi-3-Vision: - `{"num_crops": 4}`. - """ - - mm_processor_cache_gb: float = 4 - """ - The size (in GiB) of the multi-modal processor cache, which is used to - - This cache is duplicated for each API process and engine core process, - resulting in a total memory usage of - `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. - - Set to `0` to disable this cache completely (not recommended). - """ - - mm_processor_cache_type: MMCacheType = "lru" - """Type of cache to use for the multi-modal preprocessor/mapper. 
-    use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
-
-    mm_shm_cache_max_object_size_mb: int = 128
-    """Size limit (in MiB) for each object stored in the multi-modal processor
-    shared memory cache. Only effective when `mm_processor_cache_type` is
-    `"shm"`."""
-
-    mm_encoder_tp_mode: MMEncoderTPMode = "weights"
-    """
-    Indicates how to optimize multi-modal encoder inference using
-    tensor parallelism (TP).
-
-    - `"weights"`: Within the same vLLM engine, split the weights of
-        each layer across TP ranks. (default TP behavior)
-    - `"data"`: Within the same vLLM engine, split the batched input data
-        across TP ranks to process the data in parallel, while hosting
-        the full weights on each TP rank.
-        This batch-level DP is not to be confused with API request-level
-        DP (which is controlled by `--data-parallel-size`).
-        This is only supported on a per-model basis and falls back to
-        `"weights"` if the encoder does not support DP.
-    """
-
-    interleave_mm_strings: bool = False
-    """
-    Enable fully interleaved support for multimodal prompts.
-    """
-
-    skip_mm_profiling: bool = False
-    """
-    When enabled, skips multimodal memory profiling and only profiles with
-    language backbone model during engine initialization.
-
-    This reduces engine startup time but shifts the responsibility to users for
-    estimating the peak memory usage of the activation of multimodal encoder and
-    embedding cache.
-    """
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        # no factors to consider.
-        # this config will not affect the computation graph.
-        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def get_limit_per_prompt(self, modality: str) -> int:
-        """
-        Get the maximum number of input items allowed per prompt
-        for the given modality.
-        """
-        return self.limit_per_prompt.get(
-            modality,
-            999 if envs.VLLM_USE_V1 else 1,
-        )
-
-    def merge_mm_processor_kwargs(
-        self,
-        inference_kwargs: Mapping[str, object],
-    ) -> dict[str, object]:
-        """
-        Get the keyword arguments to pass to the multi-modal processor
-        according to the extra arguments passed during inference.
- """ - kwargs = self.mm_processor_kwargs or {} - return kwargs | dict(inference_kwargs) - - @config @dataclass class PoolerConfig: diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py new file mode 100644 index 0000000000000..1b93b520f33f9 --- /dev/null +++ b/vllm/config/multimodal.py @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from collections.abc import Mapping +from dataclasses import field +from typing import Any, Literal, Optional + +from pydantic.dataclasses import dataclass + +import vllm.envs as envs +from vllm.config.utils import config + +MMEncoderTPMode = Literal["weights", "data"] +MMCacheType = Literal["shm", "lru"] + + +@config +@dataclass +class MultiModalConfig: + """Controls the behavior of multimodal models.""" + + limit_per_prompt: dict[str, int] = field(default_factory=dict) + """The maximum number of input items allowed per prompt for each modality. + Defaults to 1 (V0) or 999 (V1) for each modality. + + For example, to allow up to 16 images and 2 videos per prompt: + `{"image": 16, "video": 2}`""" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + """Additional args passed to process media inputs, keyed by modalities. + For example, to set num_frames for video, set + `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" + mm_processor_kwargs: Optional[dict[str, object]] = None + """Arguments to be forwarded to the model's processor for multi-modal data, + e.g., image processor. Overrides for the multi-modal processor obtained + from `transformers.AutoProcessor.from_pretrained`. + + The available overrides depend on the model that is being run. + + For example, for Phi-3-Vision: + `{"num_crops": 4}`.""" + mm_processor_cache_gb: float = 4 + """The size (in GiB) of the multi-modal processor cache, which is used to + avoid re-processing past multi-modal inputs. + + This cache is duplicated for each API process and engine core process, + resulting in a total memory usage of + `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. + + Set to `0` to disable this cache completely (not recommended).""" + mm_processor_cache_type: MMCacheType = "lru" + """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, + use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" + mm_shm_cache_max_object_size_mb: int = 128 + """Size limit (in MiB) for each object stored in the multi-modal processor + shared memory cache. Only effective when `mm_processor_cache_type` is + `"shm"`.""" + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """Indicates how to optimize multi-modal encoder inference using tensor + parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior)\n + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). 
+        This is only supported on a per-model basis and falls back to
+        `"weights"` if the encoder does not support DP."""
+    interleave_mm_strings: bool = False
+    """Enable fully interleaved support for multimodal prompts, while using
+    --chat-template-content-format=string."""
+    skip_mm_profiling: bool = False
+    """When enabled, skips multimodal memory profiling and only profiles with
+    language backbone model during engine initialization.
+
+    This reduces engine startup time but shifts the responsibility to users for
+    estimating the peak memory usage of the activation of multimodal encoder and
+    embedding cache."""
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: list[Any] = []
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def get_limit_per_prompt(self, modality: str) -> int:
+        """
+        Get the maximum number of input items allowed per prompt
+        for the given modality.
+        """
+        return self.limit_per_prompt.get(
+            modality,
+            999 if envs.VLLM_USE_V1 else 1,
+        )
+
+    def merge_mm_processor_kwargs(
+        self,
+        inference_kwargs: Mapping[str, object],
+    ) -> dict[str, object]:
+        """
+        Get the keyword arguments to pass to the multi-modal processor
+        according to the extra arguments passed during inference.
+        """
+        kwargs = self.mm_processor_kwargs or {}
+        return kwargs | dict(inference_kwargs)
diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 98fbeb1fa86aa..db8c05ef8be4a 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from dataclasses import MISSING, Field, field, fields, is_dataclass
 from typing import TYPE_CHECKING, TypeVar
 
 if TYPE_CHECKING:
@@ -27,3 +28,20 @@ def config(cls: ConfigT) -> ConfigT:
     script, which is invoked during the pre-commit checks.
     """
     return cls
+
+
+def get_field(cls: ConfigType, name: str) -> Field:
+    """Get the default factory field of a dataclass by name. Used for getting
+    default factory fields in `EngineArgs`."""
+    if not is_dataclass(cls):
+        raise TypeError("The given class is not a dataclass.")
+    cls_fields = {f.name: f for f in fields(cls)}
+    if name not in cls_fields:
+        raise ValueError(f"Field '{name}' not found in {cls.__name__}.")
+    named_field: Field = cls_fields[name]
+    if (default_factory := named_field.default_factory) is not MISSING:
+        return field(default_factory=default_factory)
+    if (default := named_field.default) is not MISSING:
+        return field(default=default)
+    raise ValueError(
+        f"{cls.__name__}.{name} must have a default value or default factory.")
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ab43c0edc98d7..595d318fbaafe 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -27,12 +27,14 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
                          DistributedExecutorBackend, EPLBConfig,
                          GuidedDecodingBackend, HfOverrides, KVEventsConfig,
                          KVTransferConfig, LoadConfig, LogprobsMode,
-                         LoRAConfig, MambaDType, MMCacheType, MMEncoderTPMode,
-                         ModelConfig, ModelDType, ModelImpl, MultiModalConfig,
-                         ObservabilityConfig, ParallelConfig, PoolerConfig,
-                         PrefixCachingHashAlgo, RunnerOption, SchedulerConfig,
-                         SchedulerPolicy, SpeculativeConfig, TaskOption,
-                         TokenizerMode, VllmConfig, get_attr_docs, get_field)
+                         LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,
+                         ModelDType, ModelImpl, ObservabilityConfig,
+                         ParallelConfig, PoolerConfig, PrefixCachingHashAlgo,
+                         RunnerOption, SchedulerConfig, SchedulerPolicy,
+                         SpeculativeConfig, TaskOption, TokenizerMode,
+                         VllmConfig, get_attr_docs)
+from vllm.config.multimodal import MMCacheType, MultiModalConfig
+from vllm.config.utils import get_field
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index aa231de93c0c3..00ef39f134653 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -800,9 +800,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         super().__init__()
 
         self._tracker = tracker
-
+        multimodal_config = self._tracker.model_config.multimodal_config
+        media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
         self._connector = MediaConnector(
-            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
+            media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
         )
 
@@ -883,8 +884,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         super().__init__()
 
         self._tracker = tracker
+        multimodal_config = self._tracker.model_config.multimodal_config
+        media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
         self._connector = MediaConnector(
-            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
+            media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
         )
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index a386f47e1929f..4f51441e28efa 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -229,7 +229,8 @@ class MultiModalProcessingInfo(BaseProcessingInfo):
     def get_max_image_tokens(self) -> int:
         width, height = self.get_max_image_size()
         processor = self.get_hf_processor()
-        mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {}
+        multimodal_config = self.ctx.model_config.multimodal_config
+        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
         mm_tokens = processor._get_num_multimodal_tokens(
             image_sizes=([height, width], ), **mm_processor_kwargs)
         image_tokens = mm_tokens["num_image_tokens"][0]
@@ -380,8 +381,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
         # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1
         mm_positions = torch.where(mm_token_type_ids == 1)[1]
         images = mm_items.get_items("image", ImageProcessorItems)
-        mm_processor_kwargs = (self.info.ctx.model_config.mm_processor_kwargs
-                               or {})
+        multimodal_config = self.info.ctx.model_config.multimodal_config
+        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
         image_sizes = []
         for item_idx in range(len(images)):
             image_size = images.get_image_size(item_idx)
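Below is a minimal usage sketch of the relocated config, assuming the refactor above is applied as shown. The model name is simply the one the updated cache test uses, not a requirement of the API, and constructing `ModelConfig` still needs the model's Hugging Face config to be reachable (cached locally or downloadable).

```python
# Sketch: the multimodal knobs on ModelConfig are now InitVars; __post_init__
# consumes them and builds a MultiModalConfig (vllm.config.multimodal) when the
# model supports multimodal inputs, instead of storing them on ModelConfig.
from vllm.config import ModelConfig
from vllm.config.multimodal import MultiModalConfig

model_config = ModelConfig(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",  # multimodal checkpoint
    limit_mm_per_prompt={"image": 16, "video": 2},
    mm_processor_cache_gb=4,
)

mm_config = model_config.multimodal_config
assert isinstance(mm_config, MultiModalConfig)

# Per-modality limits and processor kwargs are now read from MultiModalConfig,
# as the cache helpers and chat_utils changes above do.
print(mm_config.get_limit_per_prompt("image"))                # 16
print(mm_config.merge_mm_processor_kwargs({"num_crops": 4}))  # {'num_crops': 4}
```

Callers that previously read `model_config.mm_processor_kwargs` or `model_config.media_io_kwargs` now go through `model_config.multimodal_config`, which may be `None` for text-only models; that is why `chat_utils.py` above switches to `getattr(multimodal_config, "media_io_kwargs", None)`.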