[Doc] Expand Multimodal API Reference (#11852)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2025-01-09 01:14:14 +08:00 committed by GitHub
parent ca47e176af
commit 5984499e47
9 changed files with 141 additions and 73 deletions

View File

@@ -2,10 +2,6 @@
# Multi-Modality
```{eval-rst}
.. currentmodule:: vllm.multimodal
```
vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
@@ -13,61 +9,20 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
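For context, the `multi_modal_data` field mentioned above is used roughly as follows (a minimal sketch added for illustration, not part of this diff; the model name, file name, and prompt format are assumptions):

```python
from PIL import Image

from vllm import LLM

# Model choice is illustrative; any supported multi-modal model
# works analogously, though the prompt format varies per model.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

image = Image.open("example.jpg")  # assumed local file
outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is in this picture? ASSISTANT:",
    # The multi-modal input rides alongside the text prompt:
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```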
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal
```
### Registry
```{eval-rst}
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
```
```{eval-rst}
.. autoclass:: vllm.multimodal.MultiModalRegistry
:members:
:show-inheritance:
```
### Base Classes
```{eval-rst}
.. automodule:: vllm.multimodal.base
:members:
:show-inheritance:
```
### Input Classes
```{eval-rst}
.. automodule:: vllm.multimodal.inputs
:members:
:show-inheritance:
```
### Audio Classes
```{eval-rst}
.. automodule:: vllm.multimodal.audio
:members:
:show-inheritance:
```
### Image Classes
```{eval-rst}
.. automodule:: vllm.multimodal.image
:members:
:show-inheritance:
```
### Video Classes
```{eval-rst}
.. automodule:: vllm.multimodal.video
:members:
:show-inheritance:
```
## Submodules
```{toctree}
:maxdepth: 1
inputs
parse
processing
profiling
registry
```

View File

@@ -0,0 +1,49 @@
# Input Definitions
## User-facing inputs
```{eval-rst}
.. autodata:: vllm.multimodal.MultiModalDataDict
```
## Internal data structures
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
:members:
:show-inheritance:
```
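As a concrete reading of `PlaceholderRange` (a sketch; the numbers are illustrative, and only the `offset`/`length` fields documented above are assumed):

```python
from vllm.multimodal.inputs import PlaceholderRange

# Token prompt: [bos, <img> * 576, ...question tokens...]
# The 576 image placeholder tokens start right after <bos>,
# i.e. at token index 1 (numbers are illustrative).
image_placeholders = PlaceholderRange(offset=1, length=576)
```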
```{eval-rst}
.. autodata:: vllm.multimodal.inputs.NestedTensors
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
:members:
:show-inheritance:
```
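A rough sketch of the shapes `MultiModalDataDict` accepts (illustrative only; the autodata entry above is the authoritative definition):

```python
import numpy as np
from PIL import Image

# A single item per modality:
mm_data = {"image": Image.open("example.jpg")}

# Multiple items for one modality are passed as a list:
mm_data_multi = {"image": [Image.open("a.jpg"), Image.open("b.jpg")]}

# Audio can be passed as a (waveform, sampling_rate) tuple
# (values here are illustrative):
mm_data_audio = {"audio": (np.zeros(16_000, dtype=np.float32), 16_000)}
```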

View File

@@ -0,0 +1,9 @@
# Data Parsing
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.parse
:members:
:member-order: bysource
```

View File

@@ -0,0 +1,9 @@
# Data Processing
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.processing
:members:
:member-order: bysource
```

View File

@@ -0,0 +1,9 @@
# Memory Profiling
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.profiling
:members:
:member-order: bysource
```

View File

@@ -0,0 +1,9 @@
# Registry
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.registry
:members:
:member-order: bysource
```

View File

@@ -13,14 +13,16 @@ from vllm.utils import is_list_of
from .audio import resample_audio
from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
ImageItem, ModalityData, MultiModalDataDict,
NestedTensors, VideoItem)
ImageItem, ModalityData, MultiModalDataDict, VideoItem)
_T = TypeVar("_T")
_I = TypeVar("_I")
class ModalityDataItems(ABC, Generic[_T, _I]):
"""
Represents data items for a modality in :class:`MultiModalDataItems`.
"""
def __init__(self, data: _T, modality: str) -> None:
super().__init__()
@@ -69,6 +71,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
"""Base class for data items that are arranged in a list."""
def get_count(self) -> int:
return len(self.data)
@@ -83,7 +86,12 @@ class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
return {}
class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
torch.Tensor]):
"""
Base class for data items that are expressed as a batched embedding tensor,
or a list of embedding tensors (one per item).
"""
def get_count(self) -> int:
return len(self.data)
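The `Union[torch.Tensor, list[torch.Tensor]]` annotation introduced in this hunk admits two forms; a quick sketch (shapes are illustrative):

```python
import torch

# Batched form: one 3-D tensor of shape
# (num_items, feature_sequence_length, hidden_size).
batched = torch.randn(2, 576, 4096)

# Per-item form: a list of 2-D tensors, one per item, which
# allows items with different feature sequence lengths.
per_item = [torch.randn(576, 4096), torch.randn(600, 4096)]

# In both cases len(data) is the item count, matching get_count()
# above (len of a tensor is the size of its first dimension):
assert len(batched) == len(per_item) == 2
```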
@@ -109,7 +117,7 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
class AudioEmbeddingItems(EmbeddingItems):
def __init__(self, data: NestedTensors) -> None:
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
super().__init__(data, "audio")
@@ -137,7 +145,7 @@ class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
class ImageEmbeddingItems(EmbeddingItems):
def __init__(self, data: NestedTensors) -> None:
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
super().__init__(data, "image")
@@ -163,7 +171,7 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
class VideoEmbeddingItems(EmbeddingItems):
def __init__(self, data: NestedTensors) -> None:
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
super().__init__(data, "video")
@@ -172,8 +180,8 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
"""
As :class:`MultiModalDataDict`, but normalized such that each entry
corresponds to a list.
As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
such that each entry corresponds to a list.
"""
def get_count(self, modality: str, *, strict: bool = True) -> int:
@@ -226,7 +234,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
class MultiModalDataParser:
"""
Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
:class:`MultiModalDataItems`.
Args:
target_sr (float, optional): Enables automatic resampling of audio
@@ -238,7 +247,9 @@ class MultiModalDataParser:
self.target_sr = target_sr
def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
def _is_embeddings(
self, data: object
) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
if isinstance(data, torch.Tensor):
return data.ndim == 3
if is_list_of(data, torch.Tensor):
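A usage sketch for this parser, assuming `parse_mm_data` is its public entry point (an assumption for illustration; shapes and values are likewise illustrative):

```python
import torch

from vllm.multimodal.parse import MultiModalDataParser

# target_sr enables automatic resampling of audio inputs (see the
# docstring above); the value is illustrative.
parser = MultiModalDataParser(target_sr=16_000)

# A 3-D tensor passes the _is_embeddings check shown above
# (ndim == 3), so it is treated as two pre-computed image embeddings
# rather than raw images.
items = parser.parse_mm_data({"image": torch.randn(2, 576, 4096)})
assert items.get_count("image") == 2
```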

View File

@@ -33,20 +33,24 @@ _PromptSeq = Union[str, list[int]]
@dataclass
class PromptReplacement:
"""
Defines how to replace portions of an input prompt with placeholder tokens.
"""
modality: str
"""The modality for which the replacement is made."""
target: _PromptSeq
"""The text or token sequence to find and replace."""
"""The token sequence (or text) to find and replace."""
replacement: Union[Callable[[int], _PromptSeq],
_PromptSeq] = field(repr=False)
"""
Given the index of the processed item within :attr:`modality`, output the
replacement text or token sequence.
Given the index of the processed item within :attr:`modality`,
output the replacement token sequence (or text).
For convenience, you can pass in the replacement instead of a function
if it does not depend on the input.
For convenience, you can directly pass in the replacement token sequence
(or text) instead of a function if it does not depend on the input.
"""
def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
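For illustration, a `PromptReplacement` might be built like this (a sketch; the placeholder string and token count are assumptions, not taken from a real model):

```python
from vllm.multimodal.processing import PromptReplacement

# Constant form: every "<image>" in the prompt expands to a fixed
# number of placeholder tokens (576 is illustrative).
repl = PromptReplacement(
    modality="image",
    target="<image>",
    replacement="<image>" * 576,
)

# Callable form: the replacement may depend on the processed item,
# identified by its index within the modality.
repl_dynamic = PromptReplacement(
    modality="image",
    target="<image>",
    replacement=lambda item_idx: "<image>" * (item_idx + 1),
)
```

Calling `repl.bind(tokenizer)` then yields the `BoundPromptReplacement` described in the next hunk, which converts between text and token-sequence forms on demand.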
@@ -132,6 +136,11 @@ class _BoundPromptSequence:
@dataclass
class BoundPromptReplacement:
"""
A :class:`PromptReplacement` bound to a tokenizer to automatically
convert :attr:`target` and the result of :meth:`get_replacement` between
token sequence and text representations.
"""
tokenizer: AnyTokenizer = field(repr=False)
modality: str
@@ -144,6 +153,7 @@ class BoundPromptReplacement:
@property
def target(self) -> _BoundPromptSequence:
"""The token sequence (or text) to find and replace."""
target = self._target
return _BoundPromptSequence(
@@ -153,6 +163,10 @@
)
def get_replacement(self, item_idx: int) -> _BoundPromptSequence:
"""
Given the index of the processed item within :attr:`modality`,
output the replacement token sequence (or text).
"""
replacement = self._replacement
if callable(replacement):
cache_key = item_idx
@@ -528,7 +542,7 @@ class ProcessingCache:
class BaseProcessingInfo:
"""Base class containing information to perform processing."""
"""Base class to provide the information necessary for data processing."""
def __init__(self, ctx: InputProcessingContext) -> None:
super().__init__()

View File

@@ -19,7 +19,10 @@ logger = init_logger(__name__)
@dataclass
class ProcessorInputs:
"""Keyword arguments to :meth:`BaseMultiModalProcessor`."""
"""
Represents the keyword arguments to
:meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
"""
prompt_text: str
mm_data: MultiModalDataDict
hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
@@ -47,7 +50,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
) -> ProcessorInputs:
"""
Build the input which, after processing, results in
`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
:code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
"""
raise NotImplementedError
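To make that contract concrete, a subclass could look roughly like this (a sketch: the method signature is assumed from the abstract method above, and the image size, placeholder string, and modality handling are illustrative assumptions):

```python
from collections.abc import Mapping

from PIL import Image

from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs


class MyDummyInputsBuilder(BaseDummyInputsBuilder):
    # Normally parameterized by the model's processing-info type;
    # omitted here for brevity.
    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        num_images = mm_counts.get("image", 0)
        # One dummy image per requested item; 336x336 is illustrative
        # and should match the model's expected resolution.
        dummy_image = Image.new("RGB", (336, 336))
        return ProcessorInputs(
            prompt_text="<image>" * num_images,
            mm_data={"image": [dummy_image] * num_images},
        )
```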