diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 4a619fd303ca..53eb4c591ef9 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -21,6 +21,8 @@ class MediaWithBytes(Generic[_T]):
     The wrapper delegates attribute access to the underlying media object,
     making it behave transparently like the wrapped type (e.g., PIL.Image).
+
+    NOTE: Currently, this wrapper is used only for the image modality.
     """
 
     media: _T
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index f4e38b1f3325..397684fa2f83 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
     from PIL.Image import Image
     from transformers.feature_extraction_utils import BatchFeature
 
+    from .base import MediaWithBytes
     from .processing import MultiModalHashes
 else:
@@ -59,7 +60,7 @@
 Represents a single audio item, which can be passed to a HuggingFace
 `AudioProcessor`.
 """
 
-ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"]
+ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", "MediaWithBytes[HfImageItem]"]
 """
 A `transformers.image_utils.ImageInput` representing a single image item,
 which can be passed to a HuggingFace `ImageProcessor`.
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 650368dcb8fc..c3c7cc2c3da0 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -484,7 +484,7 @@ class MultiModalDataParser:
             return ImageEmbeddingItems(data)
 
         if (
-            isinstance(data, PILImage.Image)
+            isinstance(data, (PILImage.Image, MediaWithBytes))
             or isinstance(data, (np.ndarray, torch.Tensor))
             and data.ndim == 3
         ):
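
For context, a minimal sketch of how the widened `ImageItem` type could be exercised. The diff only shows the `media: _T` field of `MediaWithBytes`, so the `media=` keyword argument below is an assumption (the real class may take additional arguments, e.g. the raw encoded bytes); the attribute delegation relied on here is described in the class docstring.

```python
# Sketch only (not part of the diff): wrapping a PIL image in MediaWithBytes.
# Assumption: `media=` is the only required constructor argument.
from PIL import Image

from vllm.multimodal.base import MediaWithBytes

image = Image.open("demo.jpg").convert("RGB")  # any PIL image
wrapped = MediaWithBytes(media=image)

# Attribute access is delegated to the wrapped object, so code that expects a
# plain PIL.Image keeps working on the wrapper.
assert wrapped.size == image.size

# After this change, MultiModalDataParser treats the wrapper the same as a
# bare PIL image when parsing {"image": ...} multimodal inputs.
```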