[Bugfix] Follow-up fix on MediaWithBytes (#29951)

Signed-off-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
Roger Wang 2025-12-03 02:42:49 -08:00 committed by GitHub
parent 42c1949643
commit 787b84a9fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 5 additions and 2 deletions

View File

@ -21,6 +21,8 @@ class MediaWithBytes(Generic[_T]):
The wrapper delegates attribute access to the underlying media object,
making it behave transparently like the wrapped type (e.g., PIL.Image).
NOTE: Currently, this wrapper is used only for the image modality.
"""
media: _T

View File

@ -32,6 +32,7 @@ if TYPE_CHECKING:
from PIL.Image import Image
from transformers.feature_extraction_utils import BatchFeature
from .base import MediaWithBytes
from .processing import MultiModalHashes
else:
@ -59,7 +60,7 @@ Represents a single audio
item, which can be passed to a HuggingFace `AudioProcessor`.
"""
ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"]
ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", "MediaWithBytes[HfImageItem]"]
"""
A `transformers.image_utils.ImageInput` representing a single image
item, which can be passed to a HuggingFace `ImageProcessor`.

View File

@ -484,7 +484,7 @@ class MultiModalDataParser:
return ImageEmbeddingItems(data)
if (
isinstance(data, PILImage.Image)
isinstance(data, (PILImage.Image, MediaWithBytes))
or isinstance(data, (np.ndarray, torch.Tensor))
and data.ndim == 3
):