[Doc] Expand Multimodal API Reference (#11852)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2025-01-09 01:14:14 +08:00 committed by GitHub
parent ca47e176af
commit 5984499e47
9 changed files with 141 additions and 73 deletions

View File

@@ -2,10 +2,6 @@
# Multi-Modality
```{eval-rst}
.. currentmodule:: vllm.multimodal
```
vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
@@ -13,61 +9,20 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
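For context, the `multi_modal_data` field mentioned above is used roughly as follows (a minimal sketch added for illustration, not part of this diff; the model name, file name, and prompt format are assumptions):

```python
from PIL import Image

from vllm import LLM

# Model choice is illustrative; any supported multi-modal model
# works analogously, though the prompt format varies per model.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

image = Image.open("example.jpg")  # assumed local file
outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is in this picture? ASSISTANT:",
    # The multi-modal input rides alongside the text prompt:
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```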
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal
```
### Registry
```{eval-rst}
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
```
```{eval-rst}
.. autoclass:: vllm.multimodal.MultiModalRegistry
:members:
:show-inheritance:
```
### Base Classes
```{eval-rst}
.. automodule:: vllm.multimodal.base
:members:
:show-inheritance:
```
### Input Classes
```{eval-rst}
.. automodule:: vllm.multimodal.inputs
:members:
:show-inheritance:
```
### Audio Classes
```{eval-rst}
.. automodule:: vllm.multimodal.audio
:members:
:show-inheritance:
```
### Image Classes
```{eval-rst}
.. automodule:: vllm.multimodal.image
:members:
:show-inheritance:
```
### Video Classes
```{eval-rst}
.. automodule:: vllm.multimodal.video
:members:
:show-inheritance:
```
## Submodules
```{toctree}
:maxdepth: 1
inputs
parse
processing
profiling
registry
```

View File

@@ -0,0 +1,49 @@
# Input Definitions
## User-facing inputs
```{eval-rst}
.. autodata:: vllm.multimodal.MultiModalDataDict
```
## Internal data structures
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
:members:
:show-inheritance:
```
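As a concrete reading of `PlaceholderRange` (a sketch; the numbers are illustrative, and only the `offset`/`length` fields documented above are assumed):

```python
from vllm.multimodal.inputs import PlaceholderRange

# Token prompt: [bos, <img> * 576, ...question tokens...]
# The 576 image placeholder tokens start right after <bos>,
# i.e. at token index 1 (numbers are illustrative).
image_placeholders = PlaceholderRange(offset=1, length=576)
```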
```{eval-rst}
.. autodata:: vllm.multimodal.inputs.NestedTensors
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
:members:
:show-inheritance:
```
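A rough sketch of the shapes `MultiModalDataDict` accepts (illustrative only; the autodata entry above is the authoritative definition):

```python
import numpy as np
from PIL import Image

# A single item per modality:
mm_data = {"image": Image.open("example.jpg")}

# Multiple items for one modality are passed as a list:
mm_data_multi = {"image": [Image.open("a.jpg"), Image.open("b.jpg")]}

# Audio can be passed as a (waveform, sampling_rate) tuple
# (values here are illustrative):
mm_data_audio = {"audio": (np.zeros(16_000, dtype=np.float32), 16_000)}
```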

View File

@@ -0,0 +1,9 @@
# Data Parsing
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.parse
:members:
:member-order: bysource
```

View File

@@ -0,0 +1,9 @@
# Data Processing
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.processing
:members:
:member-order: bysource
```

View File

@@ -0,0 +1,9 @@
# Memory Profiling
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.profiling
:members:
:member-order: bysource
```

View File

@@ -0,0 +1,9 @@
# Registry
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.registry
:members:
:member-order: bysource
```

View File

@@ -13,14 +13,16 @@ from vllm.utils import is_list_of
from .audio import resample_audio
from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
ImageItem, ModalityData, MultiModalDataDict,
NestedTensors, VideoItem)
ImageItem, ModalityData, MultiModalDataDict, VideoItem)
_T = TypeVar("_T")
_I = TypeVar("_I")
class ModalityDataItems(ABC, Generic[_T, _I]):
"""
Represents data items for a modality in :class:`MultiModalDataItems`.
"""
def __init__(self, data: _T, modality: str) -> None:
super().__init__()
@@ -69,6 +71,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
"""Base class for data items that are arranged in a list."""
def get_count(self) -> int:
return len(self.data)
@@ -83,7 +86,12 @@ class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
return {}
class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
torch.Tensor]):
"""
Base class for data items that are expressed as a batched embedding tensor,
or a list of embedding tensors (one per item).
"""
def get_count(self) -> int:
return len(self.data)
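The `Union[torch.Tensor, list[torch.Tensor]]` annotation introduced in this hunk admits two forms; a quick sketch (shapes are illustrative):

```python
import torch

# Batched form: one 3-D tensor of shape
# (num_items, feature_sequence_length, hidden_size).
batched = torch.randn(2, 576, 4096)

# Per-item form: a list of 2-D tensors, one per item, which
# allows items with different feature sequence lengths.
per_item = [torch.randn(576, 4096), torch.randn(600, 4096)]

# In both cases len(data) is the item count, matching get_count()
# above (len of a tensor is the size of its first dimension):
assert len(batched) == len(per_item) == 2
```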
@@ -109,7 +117,7 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
class AudioEmbeddingItems(EmbeddingItems):
def __init__(self, data: NestedTensors) -> None:
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
super().__init__(data, "audio")
@@ -137,7 +145,7 @@ class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
class ImageEmbeddingItems(EmbeddingItems):
def __init__(self, data: NestedTensors) -> None:
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
super().__init__(data, "image")
@@ -163,7 +171,7 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
class VideoEmbeddingItems(EmbeddingItems):
def __init__(self, data: NestedTensors) -> None:
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
super().__init__(data, "video")
@@ -172,8 +180,8 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
"""
As :class:`MultiModalDataDict`, but normalized such that each entry
corresponds to a list.
As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
such that each entry corresponds to a list.
"""
def get_count(self, modality: str, *, strict: bool = True) -> int:
@@ -226,7 +234,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
class MultiModalDataParser:
"""
Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
:class:`MultiModalDataItems`.
Args:
target_sr (float, optional): Enables automatic resampling of audio
@@ -238,7 +247,9 @@ class MultiModalDataParser:
self.target_sr = target_sr
def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
def _is_embeddings(
self, data: object
) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
if isinstance(data, torch.Tensor):
return data.ndim == 3
if is_list_of(data, torch.Tensor):
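A usage sketch for this parser, assuming `parse_mm_data` is its public entry point (an assumption for illustration; shapes and values are likewise illustrative):

```python
import torch

from vllm.multimodal.parse import MultiModalDataParser

# target_sr enables automatic resampling of audio inputs (see the
# docstring above); the value is illustrative.
parser = MultiModalDataParser(target_sr=16_000)

# A 3-D tensor passes the _is_embeddings check shown above
# (ndim == 3), so it is treated as two pre-computed image embeddings
# rather than raw images.
items = parser.parse_mm_data({"image": torch.randn(2, 576, 4096)})
assert items.get_count("image") == 2
```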

View File

@@ -33,20 +33,24 @@ _PromptSeq = Union[str, list[int]]
@dataclass
class PromptReplacement:
"""
Defines how to replace portions of an input prompt with placeholder tokens.
"""
modality: str
"""The modality for which the replacement is made."""
target: _PromptSeq
"""The text or token sequence to find and replace."""
"""The token sequence (or text) to find and replace."""
replacement: Union[Callable[[int], _PromptSeq],
_PromptSeq] = field(repr=False)
"""
Given the index of the processed item within :attr:`modality`, output the
replacement text or token sequence.
Given the index of the processed item within :attr:`modality`,
output the replacement token sequence (or text).
For convenience, you can pass in the replacement instead of a function
if it does not depend on the input.
For convenience, you can directly pass in the replacement token sequence
(or text) instead of a function if it does not depend on the input.
"""
def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
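For illustration, a `PromptReplacement` might be built like this (a sketch; the placeholder string and token count are assumptions, not taken from a real model):

```python
from vllm.multimodal.processing import PromptReplacement

# Constant form: every "<image>" in the prompt expands to a fixed
# number of placeholder tokens (576 is illustrative).
repl = PromptReplacement(
    modality="image",
    target="<image>",
    replacement="<image>" * 576,
)

# Callable form: the replacement may depend on the processed item,
# identified by its index within the modality.
repl_dynamic = PromptReplacement(
    modality="image",
    target="<image>",
    replacement=lambda item_idx: "<image>" * (item_idx + 1),
)
```

Calling `repl.bind(tokenizer)` then yields the `BoundPromptReplacement` described in the next hunk, which converts between text and token-sequence forms on demand.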
@@ -132,6 +136,11 @@ class _BoundPromptSequence:
@dataclass
class BoundPromptReplacement:
"""
A :class:`PromptReplacement` bound to a tokenizer to automatically
convert :attr:`target` and the result of :meth:`get_replacement` between
token sequence and text representations.
"""
tokenizer: AnyTokenizer = field(repr=False)
modality: str
@@ -144,6 +153,7 @@ class BoundPromptReplacement:
@property
def target(self) -> _BoundPromptSequence:
"""The token sequence (or text) to find and replace."""
target = self._target
return _BoundPromptSequence(
@@ -153,6 +163,10 @@
)
def get_replacement(self, item_idx: int) -> _BoundPromptSequence:
"""
Given the index of the processed item within :attr:`modality`,
output the replacement token sequence (or text).
"""
replacement = self._replacement
if callable(replacement):
cache_key = item_idx
@@ -528,7 +542,7 @@ class ProcessingCache:
class BaseProcessingInfo:
"""Base class containing information to perform processing."""
"""Base class to provide the information necessary for data processing."""
def __init__(self, ctx: InputProcessingContext) -> None:
super().__init__()

View File

@@ -19,7 +19,10 @@ logger = init_logger(__name__)
@dataclass
class ProcessorInputs:
"""Keyword arguments to :meth:`BaseMultiModalProcessor`."""
"""
Represents the keyword arguments to
:meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
"""
prompt_text: str
mm_data: MultiModalDataDict
hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
@@ -47,7 +50,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
) -> ProcessorInputs:
"""
Build the input which, after processing, results in
`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
:code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
"""
raise NotImplementedError
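To make that contract concrete, a subclass could look roughly like this (a sketch: the method signature is assumed from the abstract method above, and the image size, placeholder string, and modality handling are illustrative assumptions):

```python
from collections.abc import Mapping

from PIL import Image

from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs


class MyDummyInputsBuilder(BaseDummyInputsBuilder):
    # Normally parameterized by the model's processing-info type;
    # omitted here for brevity.
    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        num_images = mm_counts.get("image", 0)
        # One dummy image per requested item; 336x336 is illustrative
        # and should match the model's expected resolution.
        dummy_image = Image.new("RGB", (336, 336))
        return ProcessorInputs(
            prompt_text="<image>" * num_images,
            mm_data={"image": [dummy_image] * num_images},
        )
```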