diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index 818efd825640c..2bf29ecf087f3 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -9,7 +9,7 @@ from unittest.mock import MagicMock
 
 import pytest
 
-from vllm.config import MultiModalConfig
+from vllm.config.multimodal import MultiModalConfig
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index d219a1f311f15..502704c9bbdff 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -12,7 +12,7 @@ from unittest.mock import MagicMock
 import pytest
 import pytest_asyncio
 
-from vllm.config import MultiModalConfig
+from vllm.config.multimodal import MultiModalConfig
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py
index 3c61ee26e092e..3c737acfbfe28 100644
--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
@@ -7,6 +7,7 @@ import pytest
 import torch
 
 from vllm.config import ModelConfig, ParallelConfig, VllmConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import (MultiModalCache,
                                    MultiModalProcessorCacheItem,
                                    MultiModalProcessorCacheItemMetadata,
@@ -17,7 +18,6 @@ from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
                                     MultiModalKwargsItems,
                                     MultiModalSharedField)
 from vllm.multimodal.processing import PromptInsertion
-from vllm.multimodal.registry import MultiModalRegistry
 
 
 def _dummy_elem(
@@ -96,7 +96,9 @@ def _create_vllm_config(
     enable_ipc: bool,
 ):
     return VllmConfig(
-        model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb),
+        model_config=ModelConfig(
+            model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+            mm_processor_cache_gb=mm_processor_cache_gb),
         parallel_config=ParallelConfig(
             data_parallel_size=1 if enable_ipc else 2),
     )
@@ -113,15 +115,16 @@ def _compare_caches(
     n_iter: int = 100,
     seed: int = 0,
 ):
-    mm_registry = MultiModalRegistry()
-    cache_0_p0 = processor_cache_from_config(config_0, mm_registry)
-    cache_0_p1 = engine_receiver_cache_from_config(config_0, mm_registry)
-    cache_1_p0 = processor_cache_from_config(config_1, mm_registry)
-    cache_1_p1 = engine_receiver_cache_from_config(config_1, mm_registry)
+    cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY)
+    cache_0_p1 = engine_receiver_cache_from_config(config_0,
+                                                   MULTIMODAL_REGISTRY)
+    cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY)
+    cache_1_p1 = engine_receiver_cache_from_config(config_1,
+                                                   MULTIMODAL_REGISTRY)
 
     cache_size_gb = max(
-        config_0.model_config.mm_processor_cache_gb,
-        config_1.model_config.mm_processor_cache_gb,
+        config_0.model_config.multimodal_config.mm_processor_cache_gb,
+        config_1.model_config.multimodal_config.mm_processor_cache_gb,
     )
     item_size_gb = int(cache_size_gb / item_capacity)
 
diff --git a/tests/test_config.py b/tests/test_config.py
index 373fbd267539a..8db04de5469ad 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -6,9 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
 import pytest
 
 from vllm.compilation.backends import VllmBackend
-from vllm.config import (ModelConfig, PoolerConfig, VllmConfig, get_field,
-                         update_config)
+from vllm.config import ModelConfig, PoolerConfig, VllmConfig, update_config
 from vllm.config.load import LoadConfig
+from vllm.config.utils import get_field
 from vllm.model_executor.layers.pooler import PoolingType
 from vllm.platforms import current_platform
 
diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py
index 955c74d262a09..bdd41eece2317 100644
--- a/tests/v1/engine/test_processor_multi_modal_uuids.py
+++ b/tests/v1/engine/test_processor_multi_modal_uuids.py
@@ -31,7 +31,7 @@ def _mk_processor(monkeypatch,
                         raising=True)
     monkeypatch.setattr(ModelConfig,
                         "__post_init__",
-                        lambda self: None,
+                        lambda self, *args: None,
                         raising=True)
     monkeypatch.setattr(UnspecifiedPlatform,
                         "is_async_output_supported",
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 85e58c290b792..599eeb63bf804 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -11,13 +11,12 @@ import json
 import os
 import textwrap
 import warnings
-from collections.abc import Mapping
 from contextlib import contextmanager
-from dataclasses import MISSING, Field, field, fields, is_dataclass, replace
+from dataclasses import InitVar, field, fields, is_dataclass, replace
 from functools import cached_property, lru_cache
 from importlib.util import find_spec
-from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional,
-                    Protocol, TypeVar, Union, cast, get_args)
+from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Protocol,
+                    TypeVar, Union, cast, get_args)
 
 import regex as re
 import torch
@@ -37,6 +36,8 @@ from vllm.config.kv_events import KVEventsConfig
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
+from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
+                                    MultiModalConfig)
 from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                   ParallelConfig)
 from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
@@ -238,31 +239,12 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]:
     return out
 
 
-def get_field(cls: ConfigType, name: str) -> Field:
-    """Get the default factory field of a dataclass by name. Used for getting
-    default factory fields in `EngineArgs`."""
-    if not is_dataclass(cls):
-        raise TypeError("The given class is not a dataclass.")
-    cls_fields = {f.name: f for f in fields(cls)}
-    if name not in cls_fields:
-        raise ValueError(f"Field '{name}' not found in {cls.__name__}.")
-    named_field: Field = cls_fields[name]
-    if (default_factory := named_field.default_factory) is not MISSING:
-        return field(default_factory=default_factory)
-    if (default := named_field.default) is not MISSING:
-        return field(default=default)
-    raise ValueError(
-        f"{cls.__name__}.{name} must have a default value or default factory.")
-
-
 def is_init_field(cls: ConfigType, name: str) -> bool:
     return next(f for f in fields(cls) if f.name == name).init
 
 
 TokenizerMode = Literal["auto", "slow", "mistral", "custom"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
-MMEncoderTPMode = Literal["weights", "data"]
-MMCacheType = Literal["shm", "lru"]
 
 
 class LogprobsMode(enum.Enum):
@@ -407,20 +389,6 @@ class ModelConfig:
     that this name(s) will also be used in `model_name` tag content of
     prometheus metrics, if multiple names provided, metrics tag will take the
     first one."""
-    limit_mm_per_prompt: dict[str, int] = field(default_factory=dict)
-    """Maximum number of data items per modality per prompt. Only applicable
-    for multimodal models."""
-    interleave_mm_strings: bool = False
-    """Enable fully interleaved support for multimodal prompts, while using
-    --chat-template-content-format=string. Defaults to False."""
-    skip_mm_profiling: bool = False
-    """When enabled, skips multimodal memory profiling and only profiles with
-    language backbone model during engine initialization.
-    """
-    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-    """Additional args passed to process media inputs, keyed by modalities.
-    For example, to set num_frames for video, set
-    `--media-io-kwargs '{"video": {"num_frames": 40} }'` """
     use_async_output_proc: bool = True
     """Whether to use async output processor."""
     config_format: Union[str, ConfigFormat] = "auto"
@@ -436,41 +404,6 @@ class ModelConfig:
     hf_overrides: HfOverrides = field(default_factory=dict)
     """If a dictionary, contains arguments to be forwarded to the Hugging Face
     config. If a callable, it is called to update the HuggingFace config."""
-    mm_processor_kwargs: Optional[dict[str, Any]] = None
-    """Arguments to be forwarded to the model's processor for multi-modal data,
-    e.g., image processor. Overrides for the multi-modal processor obtained
-    from `AutoProcessor.from_pretrained`. The available overrides depend on the
-    model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`.
-    """
-    mm_processor_cache_gb: float = 4
-    """The size (in GiB) of the multi-modal processor cache, which is used to
-    avoid re-processing past multi-modal inputs.
-
-    This cache is duplicated for each API process and engine core process,
-    resulting in a total memory usage of
-    `mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
-
-    Set to `0` to disable this cache completely (not recommended)."""
-    mm_processor_cache_type: MMCacheType = "lru"
-    """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
-    use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
-    mm_shm_cache_max_object_size_mb: int = 128
-    """Size limit (in MiB) for each object stored in the multi-modal processor
-    shared memory cache. Only effective when `mm_processor_cache_type` is
-    `"shm"`."""
-    mm_encoder_tp_mode: MMEncoderTPMode = "weights"
-    """Indicates how to optimize multi-modal encoder inference using
-    tensor parallelism (TP).
-
-    - `"weights"`: Within the same vLLM engine, split the weights of
-        each layer across TP ranks. (default TP behavior)
-    - `"data"`: Within the same vLLM engine, split the batched input data
-        across TP ranks to process the data in parallel, while hosting
-        the full weights on each TP rank.
-        This batch-level DP is not to be confused with API request-level
-        DP (which is controlled by `--data-parallel-size`).
-        This is only supported on a per-model basis and falls back to
-        `"weights"` if the encoder does not support DP."""
     pooler_config: Optional["PoolerConfig"] = field(init=False)
     """Pooler config which controls the behaviour of output pooling in pooling
     models."""
@@ -513,6 +446,18 @@ class ModelConfig:
     io_processor_plugin: Optional[str] = None
     """IOProcessor plugin name to load at model startup"""
 
+    # Multimodal config and init vars
+    multimodal_config: Optional[MultiModalConfig] = None
+    limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None
+    media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
+    mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
+    mm_processor_cache_gb: InitVar[Optional[float]] = None
+    mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None
+    mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None
+    mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None
+    interleave_mm_strings: InitVar[Optional[bool]] = None
+    skip_mm_profiling: InitVar[Optional[bool]] = None
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -546,7 +491,18 @@ class ModelConfig:
         assert_hashable(str_factors)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
-    def __post_init__(self) -> None:
+    def __post_init__(
+            self,
+            # Multimodal config init vars
+            limit_mm_per_prompt: Optional[dict[str, int]],
+            media_io_kwargs: Optional[dict[str, dict[str, Any]]],
+            mm_processor_kwargs: Optional[dict[str, Any]],
+            mm_processor_cache_gb: Optional[float],
+            mm_processor_cache_type: Optional[MMCacheType],
+            mm_shm_cache_max_object_size_mb: Optional[int],
+            mm_encoder_tp_mode: Optional[MMEncoderTPMode],
+            interleave_mm_strings: Optional[bool],
+            skip_mm_profiling: Optional[bool]) -> None:
         # Set the default seed to 0 in V1.
         # NOTE(woosuk): In V0, we set the default seed to None because the
         # driver worker shares the same process as the user process, and thus
@@ -777,7 +733,33 @@ class ModelConfig:
         self.original_max_model_len = self.max_model_len
         self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
 
-        self.multimodal_config = self._init_multimodal_config()
+        # Init multimodal config if needed
+        if self._model_info.supports_multimodal:
+            if (mm_encoder_tp_mode == "data" and
+                    not self._model_info.supports_multimodal_encoder_tp_data):
+                logger.warning_once(
+                    "This model does not support `--mm-encoder-tp-mode data`. "
" + "Falling back to `--mm-encoder-tp-mode weights`.") + mm_encoder_tp_mode = "weights" + + mm_config_kwargs = dict( + limit_per_prompt=limit_mm_per_prompt, + media_io_kwargs=media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, + mm_processor_cache_gb=mm_processor_cache_gb, + mm_processor_cache_type=mm_processor_cache_type, + mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb, + mm_encoder_tp_mode=mm_encoder_tp_mode, + interleave_mm_strings=interleave_mm_strings, + skip_mm_profiling=skip_mm_profiling, + ) + + mm_config_kwargs = { + k: v + for k, v in mm_config_kwargs.items() if v is not None + } + + self.multimodal_config = MultiModalConfig(**mm_config_kwargs) if self.disable_sliding_window: # Set after get_and_verify_max_len to ensure that max_model_len @@ -875,30 +857,6 @@ class ModelConfig: ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"]) self.tokenizer = object_storage_tokenizer.dir - def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: - if self._model_info.supports_multimodal: - if (self.mm_encoder_tp_mode == "data" and - not self._model_info.supports_multimodal_encoder_tp_data): - logger.warning_once( - "This model does not support `--mm-encoder-tp-mode data`. " - "Falling back to `--mm-encoder-tp-mode weights`.") - self.mm_encoder_tp_mode = "weights" - - return MultiModalConfig( - limit_per_prompt=self.limit_mm_per_prompt, - media_io_kwargs=self.media_io_kwargs, - mm_processor_kwargs=self.mm_processor_kwargs, - mm_processor_cache_gb=self.mm_processor_cache_gb, - mm_processor_cache_type=self.mm_processor_cache_type, - mm_shm_cache_max_object_size_mb=self. - mm_shm_cache_max_object_size_mb, - mm_encoder_tp_mode=self.mm_encoder_tp_mode, - interleave_mm_strings=self.interleave_mm_strings, - skip_mm_profiling=self.skip_mm_profiling, - ) - - return None - def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) @@ -2417,129 +2375,6 @@ class SpeculativeConfig: return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" -@config -@dataclass -class MultiModalConfig: - """Controls the behavior of multimodal models.""" - - limit_per_prompt: dict[str, int] = \ - cast(dict[str, int], get_field(ModelConfig, "limit_mm_per_prompt")) - """ - The maximum number of input items allowed per prompt for each modality. - Defaults to 1 (V0) or 999 (V1) for each modality. - - For example, to allow up to 16 images and 2 videos per prompt: - `{"image": 16, "video": 2}` - """ - - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set - `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ - - mm_processor_kwargs: Optional[dict[str, object]] = None - """ - Overrides for the multi-modal processor obtained from - `transformers.AutoProcessor.from_pretrained`. - - The available overrides depend on the model that is being run. - - For example, for Phi-3-Vision: - `{"num_crops": 4}`. - """ - - mm_processor_cache_gb: float = 4 - """ - The size (in GiB) of the multi-modal processor cache, which is used to - - This cache is duplicated for each API process and engine core process, - resulting in a total memory usage of - `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. - - Set to `0` to disable this cache completely (not recommended). - """ - - mm_processor_cache_type: MMCacheType = "lru" - """Type of cache to use for the multi-modal preprocessor/mapper. 
-    use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
-
-    mm_shm_cache_max_object_size_mb: int = 128
-    """Size limit (in MiB) for each object stored in the multi-modal processor
-    shared memory cache. Only effective when `mm_processor_cache_type` is
-    `"shm"`."""
-
-    mm_encoder_tp_mode: MMEncoderTPMode = "weights"
-    """
-    Indicates how to optimize multi-modal encoder inference using
-    tensor parallelism (TP).
-
-    - `"weights"`: Within the same vLLM engine, split the weights of
-        each layer across TP ranks. (default TP behavior)
-    - `"data"`: Within the same vLLM engine, split the batched input data
-        across TP ranks to process the data in parallel, while hosting
-        the full weights on each TP rank.
-        This batch-level DP is not to be confused with API request-level
-        DP (which is controlled by `--data-parallel-size`).
-        This is only supported on a per-model basis and falls back to
-        `"weights"` if the encoder does not support DP.
-    """
-
-    interleave_mm_strings: bool = False
-    """
-    Enable fully interleaved support for multimodal prompts.
-    """
-
-    skip_mm_profiling: bool = False
-    """
-    When enabled, skips multimodal memory profiling and only profiles with
-    language backbone model during engine initialization.
-
-    This reduces engine startup time but shifts the responsibility to users for
-    estimating the peak memory usage of the activation of multimodal encoder and
-    embedding cache.
-    """
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        # no factors to consider.
-        # this config will not affect the computation graph.
-        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def get_limit_per_prompt(self, modality: str) -> int:
-        """
-        Get the maximum number of input items allowed per prompt
-        for the given modality.
-        """
-        return self.limit_per_prompt.get(
-            modality,
-            999 if envs.VLLM_USE_V1 else 1,
-        )
-
-    def merge_mm_processor_kwargs(
-        self,
-        inference_kwargs: Mapping[str, object],
-    ) -> dict[str, object]:
-        """
-        Get the keyword arguments to pass to the multi-modal processor
-        according to the extra arguments passed during inference.
- """ - kwargs = self.mm_processor_kwargs or {} - return kwargs | dict(inference_kwargs) - - @config @dataclass class PoolerConfig: diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py new file mode 100644 index 0000000000000..1b93b520f33f9 --- /dev/null +++ b/vllm/config/multimodal.py @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from collections.abc import Mapping +from dataclasses import field +from typing import Any, Literal, Optional + +from pydantic.dataclasses import dataclass + +import vllm.envs as envs +from vllm.config.utils import config + +MMEncoderTPMode = Literal["weights", "data"] +MMCacheType = Literal["shm", "lru"] + + +@config +@dataclass +class MultiModalConfig: + """Controls the behavior of multimodal models.""" + + limit_per_prompt: dict[str, int] = field(default_factory=dict) + """The maximum number of input items allowed per prompt for each modality. + Defaults to 1 (V0) or 999 (V1) for each modality. + + For example, to allow up to 16 images and 2 videos per prompt: + `{"image": 16, "video": 2}`""" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + """Additional args passed to process media inputs, keyed by modalities. + For example, to set num_frames for video, set + `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" + mm_processor_kwargs: Optional[dict[str, object]] = None + """Arguments to be forwarded to the model's processor for multi-modal data, + e.g., image processor. Overrides for the multi-modal processor obtained + from `transformers.AutoProcessor.from_pretrained`. + + The available overrides depend on the model that is being run. + + For example, for Phi-3-Vision: + `{"num_crops": 4}`.""" + mm_processor_cache_gb: float = 4 + """The size (in GiB) of the multi-modal processor cache, which is used to + avoid re-processing past multi-modal inputs. + + This cache is duplicated for each API process and engine core process, + resulting in a total memory usage of + `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. + + Set to `0` to disable this cache completely (not recommended).""" + mm_processor_cache_type: MMCacheType = "lru" + """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, + use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" + mm_shm_cache_max_object_size_mb: int = 128 + """Size limit (in MiB) for each object stored in the multi-modal processor + shared memory cache. Only effective when `mm_processor_cache_type` is + `"shm"`.""" + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """Indicates how to optimize multi-modal encoder inference using tensor + parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior)\n + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). 
+        This is only supported on a per-model basis and falls back to
+        `"weights"` if the encoder does not support DP."""
+    interleave_mm_strings: bool = False
+    """Enable fully interleaved support for multimodal prompts, while using
+    --chat-template-content-format=string."""
+    skip_mm_profiling: bool = False
+    """When enabled, skips multimodal memory profiling and only profiles with
+    language backbone model during engine initialization.
+
+    This reduces engine startup time but shifts the responsibility to users for
+    estimating the peak memory usage of the activation of multimodal encoder and
+    embedding cache."""
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: list[Any] = []
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def get_limit_per_prompt(self, modality: str) -> int:
+        """
+        Get the maximum number of input items allowed per prompt
+        for the given modality.
+        """
+        return self.limit_per_prompt.get(
+            modality,
+            999 if envs.VLLM_USE_V1 else 1,
+        )
+
+    def merge_mm_processor_kwargs(
+        self,
+        inference_kwargs: Mapping[str, object],
+    ) -> dict[str, object]:
+        """
+        Get the keyword arguments to pass to the multi-modal processor
+        according to the extra arguments passed during inference.
+        """
+        kwargs = self.mm_processor_kwargs or {}
+        return kwargs | dict(inference_kwargs)
diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 98fbeb1fa86aa..db8c05ef8be4a 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from dataclasses import MISSING, Field, field, fields, is_dataclass
 from typing import TYPE_CHECKING, TypeVar
 
 if TYPE_CHECKING:
@@ -27,3 +28,20 @@ def config(cls: ConfigT) -> ConfigT:
     script, which is invoked during the pre-commit checks.
     """
     return cls
+
+
+def get_field(cls: ConfigType, name: str) -> Field:
+    """Get the default factory field of a dataclass by name. Used for getting
+    default factory fields in `EngineArgs`."""
+    if not is_dataclass(cls):
+        raise TypeError("The given class is not a dataclass.")
+    cls_fields = {f.name: f for f in fields(cls)}
+    if name not in cls_fields:
+        raise ValueError(f"Field '{name}' not found in {cls.__name__}.")
+    named_field: Field = cls_fields[name]
+    if (default_factory := named_field.default_factory) is not MISSING:
+        return field(default_factory=default_factory)
+    if (default := named_field.default) is not MISSING:
+        return field(default=default)
+    raise ValueError(
+        f"{cls.__name__}.{name} must have a default value or default factory.")
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ab43c0edc98d7..595d318fbaafe 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -27,12 +27,14 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
                          DistributedExecutorBackend, EPLBConfig,
                          GuidedDecodingBackend, HfOverrides, KVEventsConfig,
                          KVTransferConfig, LoadConfig, LogprobsMode,
-                         LoRAConfig, MambaDType, MMCacheType, MMEncoderTPMode,
-                         ModelConfig, ModelDType, ModelImpl, MultiModalConfig,
-                         ObservabilityConfig, ParallelConfig, PoolerConfig,
-                         PrefixCachingHashAlgo, RunnerOption, SchedulerConfig,
-                         SchedulerPolicy, SpeculativeConfig, TaskOption,
-                         TokenizerMode, VllmConfig, get_attr_docs, get_field)
+                         LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,
+                         ModelDType, ModelImpl, ObservabilityConfig,
+                         ParallelConfig, PoolerConfig, PrefixCachingHashAlgo,
+                         RunnerOption, SchedulerConfig, SchedulerPolicy,
+                         SpeculativeConfig, TaskOption, TokenizerMode,
+                         VllmConfig, get_attr_docs)
+from vllm.config.multimodal import MMCacheType, MultiModalConfig
+from vllm.config.utils import get_field
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index aa231de93c0c3..00ef39f134653 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -800,9 +800,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         super().__init__()
 
         self._tracker = tracker
-
+        multimodal_config = self._tracker.model_config.multimodal_config
+        media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
         self._connector = MediaConnector(
-            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
+            media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
         )
 
@@ -883,8 +884,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         super().__init__()
 
         self._tracker = tracker
+        multimodal_config = self._tracker.model_config.multimodal_config
+        media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
         self._connector = MediaConnector(
-            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
+            media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
         )
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index a386f47e1929f..4f51441e28efa 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -229,7 +229,8 @@ class MultiModalProcessingInfo(BaseProcessingInfo):
     def get_max_image_tokens(self) -> int:
         width, height = self.get_max_image_size()
         processor = self.get_hf_processor()
-        mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {}
+        multimodal_config = self.ctx.model_config.multimodal_config
+        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
         mm_tokens = processor._get_num_multimodal_tokens(
             image_sizes=([height, width], ), **mm_processor_kwargs)
         image_tokens = mm_tokens["num_image_tokens"][0]
@@ -380,8 +381,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
         # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1
         mm_positions = torch.where(mm_token_type_ids == 1)[1]
         images = mm_items.get_items("image", ImageProcessorItems)
-        mm_processor_kwargs = (self.info.ctx.model_config.mm_processor_kwargs
-                               or {})
+        multimodal_config = self.info.ctx.model_config.multimodal_config
+        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
         image_sizes = []
         for item_idx in range(len(images)):
             image_size = images.get_image_size(item_idx)
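Below is a minimal usage sketch of the relocated config, assuming the refactor above is applied as shown. The model name is simply the one the updated cache test uses, not a requirement of the API, and constructing `ModelConfig` still needs the model's Hugging Face config to be reachable (cached locally or downloadable).

```python
# Sketch: the multimodal knobs on ModelConfig are now InitVars; __post_init__
# consumes them and builds a MultiModalConfig (vllm.config.multimodal) when the
# model supports multimodal inputs, instead of storing them on ModelConfig.
from vllm.config import ModelConfig
from vllm.config.multimodal import MultiModalConfig

model_config = ModelConfig(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",  # multimodal checkpoint
    limit_mm_per_prompt={"image": 16, "video": 2},
    mm_processor_cache_gb=4,
)

mm_config = model_config.multimodal_config
assert isinstance(mm_config, MultiModalConfig)

# Per-modality limits and processor kwargs are now read from MultiModalConfig,
# as the cache helpers and chat_utils changes above do.
print(mm_config.get_limit_per_prompt("image"))                # 16
print(mm_config.merge_mm_processor_kwargs({"num_crops": 4}))  # {'num_crops': 4}
```

Callers that previously read `model_config.mm_processor_kwargs` or `model_config.media_io_kwargs` now go through `model_config.multimodal_config`, which may be `None` for text-only models; that is why `chat_utils.py` above switches to `getattr(multimodal_config, "media_io_kwargs", None)`.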