From 787b84a9fc9d1744f82addf40912e9fb84c0b4c5 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 3 Dec 2025 02:42:49 -0800 Subject: [PATCH] [Bugfix] Follow-up fix on MediaWithBytes (#29951) Signed-off-by: Roger Wang --- vllm/multimodal/base.py | 2 ++ vllm/multimodal/inputs.py | 3 ++- vllm/multimodal/parse.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 4a619fd303ca..53eb4c591ef9 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -21,6 +21,8 @@ class MediaWithBytes(Generic[_T]): The wrapper delegates attribute access to the underlying media object, making it behave transparently like the wrapped type (e.g., PIL.Image). + + NOTE: Currently, this wrapper is used only for the image modality. """ media: _T diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index f4e38b1f3325..397684fa2f83 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: from PIL.Image import Image from transformers.feature_extraction_utils import BatchFeature + from .base import MediaWithBytes from .processing import MultiModalHashes else: @@ -59,7 +60,7 @@ Represents a single audio item, which can be passed to a HuggingFace `AudioProcessor`. """ -ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"] +ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", "MediaWithBytes[HfImageItem]"] """ A `transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace `ImageProcessor`. diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 650368dcb8fc..c3c7cc2c3da0 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -484,7 +484,7 @@ class MultiModalDataParser: return ImageEmbeddingItems(data) if ( - isinstance(data, PILImage.Image) + isinstance(data, (PILImage.Image, MediaWithBytes)) or isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 3 ):