mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-01-17 22:04:36 +08:00
[Doc] Expand Multimodal API Reference (#11852)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
ca47e176af
commit
5984499e47
@ -2,10 +2,6 @@
|
||||
|
||||
# Multi-Modality
|
||||
|
||||
```{eval-rst}
|
||||
.. currentmodule:: vllm.multimodal
|
||||
```
|
||||
|
||||
vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
|
||||
|
||||
Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
|
||||
@ -13,61 +9,20 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
|
||||
|
||||
Looking to add your own multi-modal model? Please follow the [instructions for enabling multi-modal inputs](#enabling-multimodal-inputs).
|
||||
|
||||
|
||||
## Module Contents
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal
|
||||
```
|
||||
|
||||
### Registry
|
||||
|
||||
```{eval-rst}
|
||||
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: vllm.multimodal.MultiModalRegistry
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
### Base Classes
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal.base
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
### Input Classes
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal.inputs
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
### Audio Classes
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal.audio
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
### Image Classes
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal.image
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
### Video Classes
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal.video
|
||||
:members:
|
||||
:show-inheritance:
|
||||
## Submodules
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
inputs
|
||||
parse
|
||||
processing
|
||||
profiling
|
||||
registry
|
||||
```
|
||||
|
||||
49
docs/source/api/multimodal/inputs.md
Normal file
49
docs/source/api/multimodal/inputs.md
Normal file
@ -0,0 +1,49 @@
|
||||
# Input Definitions
|
||||
|
||||
## User-facing inputs
|
||||
|
||||
```{eval-rst}
|
||||
.. autodata:: vllm.multimodal.MultiModalDataDict
|
||||
```
|
||||
|
||||
## Internal data structures
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autodata:: vllm.multimodal.inputs.NestedTensors
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
9
docs/source/api/multimodal/parse.md
Normal file
9
docs/source/api/multimodal/parse.md
Normal file
@ -0,0 +1,9 @@
|
||||
# Data Parsing
|
||||
|
||||
## Module Contents
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal.parse
|
||||
:members:
|
||||
:member-order: bysource
|
||||
```
|
||||
9
docs/source/api/multimodal/processing.md
Normal file
9
docs/source/api/multimodal/processing.md
Normal file
@ -0,0 +1,9 @@
|
||||
# Data Processing
|
||||
|
||||
## Module Contents
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal.processing
|
||||
:members:
|
||||
:member-order: bysource
|
||||
```
|
||||
9
docs/source/api/multimodal/profiling.md
Normal file
9
docs/source/api/multimodal/profiling.md
Normal file
@ -0,0 +1,9 @@
|
||||
# Memory Profiling
|
||||
|
||||
## Module Contents
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal.profiling
|
||||
:members:
|
||||
:member-order: bysource
|
||||
```
|
||||
9
docs/source/api/multimodal/registry.md
Normal file
9
docs/source/api/multimodal/registry.md
Normal file
@ -0,0 +1,9 @@
|
||||
# Registry
|
||||
|
||||
## Module Contents
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal.registry
|
||||
:members:
|
||||
:member-order: bysource
|
||||
```
|
||||
@ -13,14 +13,16 @@ from vllm.utils import is_list_of
|
||||
|
||||
from .audio import resample_audio
|
||||
from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
|
||||
ImageItem, ModalityData, MultiModalDataDict,
|
||||
NestedTensors, VideoItem)
|
||||
ImageItem, ModalityData, MultiModalDataDict, VideoItem)
|
||||
|
||||
_T = TypeVar("_T")
|
||||
_I = TypeVar("_I")
|
||||
|
||||
|
||||
class ModalityDataItems(ABC, Generic[_T, _I]):
|
||||
"""
|
||||
Represents data items for a modality in :class:`MultiModalDataItems`.
|
||||
"""
|
||||
|
||||
def __init__(self, data: _T, modality: str) -> None:
|
||||
super().__init__()
|
||||
@ -69,6 +71,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
|
||||
|
||||
|
||||
class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
|
||||
"""Base class for data items that are arranged in a list."""
|
||||
|
||||
def get_count(self) -> int:
|
||||
return len(self.data)
|
||||
@ -83,7 +86,12 @@ class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
|
||||
return {}
|
||||
|
||||
|
||||
class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
|
||||
class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
|
||||
torch.Tensor]):
|
||||
"""
|
||||
Base class for data items that are expressed as a batched embedding tensor,
|
||||
or a list of embedding tensors (one per item).
|
||||
"""
|
||||
|
||||
def get_count(self) -> int:
|
||||
return len(self.data)
|
||||
@ -109,7 +117,7 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
|
||||
|
||||
class AudioEmbeddingItems(EmbeddingItems):
|
||||
|
||||
def __init__(self, data: NestedTensors) -> None:
|
||||
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
|
||||
super().__init__(data, "audio")
|
||||
|
||||
|
||||
@ -137,7 +145,7 @@ class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
|
||||
|
||||
class ImageEmbeddingItems(EmbeddingItems):
|
||||
|
||||
def __init__(self, data: NestedTensors) -> None:
|
||||
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
|
||||
super().__init__(data, "image")
|
||||
|
||||
|
||||
@ -163,7 +171,7 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
|
||||
|
||||
class VideoEmbeddingItems(EmbeddingItems):
|
||||
|
||||
def __init__(self, data: NestedTensors) -> None:
|
||||
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
|
||||
super().__init__(data, "video")
|
||||
|
||||
|
||||
@ -172,8 +180,8 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
|
||||
|
||||
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
|
||||
"""
|
||||
As :class:`MultiModalDataDict`, but normalized such that each entry
|
||||
corresponds to a list.
|
||||
As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
|
||||
such that each entry corresponds to a list.
|
||||
"""
|
||||
|
||||
def get_count(self, modality: str, *, strict: bool = True) -> int:
|
||||
@ -226,7 +234,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
|
||||
|
||||
class MultiModalDataParser:
|
||||
"""
|
||||
Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
|
||||
Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
|
||||
:class:`MultiModalDataItems`.
|
||||
|
||||
Args:
|
||||
target_sr (float, optional): Enables automatic resampling of audio
|
||||
@ -238,7 +247,9 @@ class MultiModalDataParser:
|
||||
|
||||
self.target_sr = target_sr
|
||||
|
||||
def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
|
||||
def _is_embeddings(
|
||||
self, data: object
|
||||
) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
|
||||
if isinstance(data, torch.Tensor):
|
||||
return data.ndim == 3
|
||||
if is_list_of(data, torch.Tensor):
|
||||
|
||||
@ -33,20 +33,24 @@ _PromptSeq = Union[str, list[int]]
|
||||
|
||||
@dataclass
|
||||
class PromptReplacement:
|
||||
"""
|
||||
Defines how to replace portions of an input prompt with placeholder tokens.
|
||||
"""
|
||||
|
||||
modality: str
|
||||
"""The modality for which the replacement is made."""
|
||||
|
||||
target: _PromptSeq
|
||||
"""The text or token sequence to find and replace."""
|
||||
"""The token sequence (or text) to find and replace."""
|
||||
|
||||
replacement: Union[Callable[[int], _PromptSeq],
|
||||
_PromptSeq] = field(repr=False)
|
||||
"""
|
||||
Given the index of the processed item within :attr:`modality`, output the
|
||||
replacement text or token sequence.
|
||||
Given the index of the processed item within :attr:`modality`,
|
||||
output the replacement token sequence (or text).
|
||||
|
||||
For convenience, you can pass in the replacement instead of a function
|
||||
if it does not depend on the input.
|
||||
For convenience, you can directly pass in the replacement token sequence
|
||||
(or text) instead of a function if it does not depend on the input.
|
||||
"""
|
||||
|
||||
def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
|
||||
@ -132,6 +136,11 @@ class _BoundPromptSequence:
|
||||
|
||||
@dataclass
|
||||
class BoundPromptReplacement:
|
||||
"""
|
||||
A :class:`PromptReplacement` bound to a tokenizer to automatically
|
||||
convert :attr:`target` and the result of :meth:`get_replacement` between
|
||||
token sequence and text representations.
|
||||
"""
|
||||
tokenizer: AnyTokenizer = field(repr=False)
|
||||
modality: str
|
||||
|
||||
@ -144,6 +153,7 @@ class BoundPromptReplacement:
|
||||
|
||||
@property
|
||||
def target(self) -> _BoundPromptSequence:
|
||||
"""The token sequence (or text) to find and replace."""
|
||||
target = self._target
|
||||
|
||||
return _BoundPromptSequence(
|
||||
@ -153,6 +163,10 @@ class BoundPromptReplacement:
|
||||
)
|
||||
|
||||
def get_replacement(self, item_idx: int) -> _BoundPromptSequence:
|
||||
"""
|
||||
Given the index of the processed item within :attr:`modality`,
|
||||
output the replacement token sequence (or text).
|
||||
"""
|
||||
replacement = self._replacement
|
||||
if callable(replacement):
|
||||
cache_key = item_idx
|
||||
@ -528,7 +542,7 @@ class ProcessingCache:
|
||||
|
||||
|
||||
class BaseProcessingInfo:
|
||||
"""Base class containing information to perform processing."""
|
||||
"""Base class to provide the information necessary for data processing."""
|
||||
|
||||
def __init__(self, ctx: InputProcessingContext) -> None:
|
||||
super().__init__()
|
||||
|
||||
@ -19,7 +19,10 @@ logger = init_logger(__name__)
|
||||
|
||||
@dataclass
|
||||
class ProcessorInputs:
|
||||
"""Keyword arguments to :meth:`BaseMultiModalProcessor`."""
|
||||
"""
|
||||
Represents the keyword arguments to
|
||||
:meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
|
||||
"""
|
||||
prompt_text: str
|
||||
mm_data: MultiModalDataDict
|
||||
hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
|
||||
@ -47,7 +50,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
|
||||
) -> ProcessorInputs:
|
||||
"""
|
||||
Build the input which, after processing, results in
|
||||
`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
|
||||
:code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user