[Bugfix] Follow-up fix on MediaWithBytes (#29951)

Signed-off-by: Roger Wang <hey@rogerw.io>
2026-01-26 05:04:28 +08:00 · 2025-12-03 02:42:49 -08:00 · 2025-12-03 02:42:49 -08:00 · 787b84a9fc
commit 787b84a9fc
parent 42c1949643
3 changed files with 5 additions and 2 deletions
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -21,6 +21,8 @@ class MediaWithBytes(Generic[_T]):

    The wrapper delegates attribute access to the underlying media object,
    making it behave transparently like the wrapped type (e.g., PIL.Image).
+
+    NOTE: Currently, this wrapper is used only for the image modality.
    """

    media: _T
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
    from PIL.Image import Image
    from transformers.feature_extraction_utils import BatchFeature

+    from .base import MediaWithBytes
    from .processing import MultiModalHashes

 else:
@@ -59,7 +60,7 @@ Represents a single audio
 item, which can be passed to a HuggingFace `AudioProcessor`.
 """

-ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"]
+ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", "MediaWithBytes[HfImageItem]"]
 """
 A `transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -484,7 +484,7 @@ class MultiModalDataParser:
            return ImageEmbeddingItems(data)

        if (
-            isinstance(data, PILImage.Image)
+            isinstance(data, (PILImage.Image, MediaWithBytes))
            or isinstance(data, (np.ndarray, torch.Tensor))
            and data.ndim == 3
        ):