diff --git a/tests/multimodal/assets/image1.png b/tests/multimodal/assets/image1.png new file mode 100644 index 0000000000000..17c7d4cdffe91 Binary files /dev/null and b/tests/multimodal/assets/image1.png differ diff --git a/tests/multimodal/assets/image2.png b/tests/multimodal/assets/image2.png new file mode 100644 index 0000000000000..0f13ce5d983d1 Binary files /dev/null and b/tests/multimodal/assets/image2.png differ diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py new file mode 100644 index 0000000000000..17b36b36888d5 --- /dev/null +++ b/tests/multimodal/test_hasher.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +from pathlib import Path + +import numpy as np +import pytest +import torch +from PIL import Image, ImageDraw + +from vllm.multimodal.hasher import MultiModalHasher + +ASSETS_DIR = Path(__file__).parent / "assets" +assert ASSETS_DIR.exists() + + +# NOTE: Images that are the same visually are allowed to have the same hash +@pytest.mark.parametrize("mode_pair", [("1", "L"), ("RGBA", "CMYK")]) +def test_hash_collision_image_mode(mode_pair): + mode1, mode2 = mode_pair + image1 = Image.new(mode1, size=(10, 10), color=1) + image2 = Image.new(mode2, size=(10, 10), color=1) + + hasher = MultiModalHasher + assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2) + + +def test_hash_collision_image_palette(): + # These images differ only in Image.palette._palette + image1 = Image.open(ASSETS_DIR / "image1.png") + image2 = Image.open(ASSETS_DIR / "image2.png") + + hasher = MultiModalHasher + assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2) + + +def test_hash_collision_image_transpose(): + image1 = Image.new("1", size=(10, 20)) + ImageDraw.Draw(image1).line([(0, 0), (10, 0)]) + + image2 = Image.new("1", size=(20, 10)) + ImageDraw.Draw(image2).line([(0, 0), (0, 10)]) + + hasher = MultiModalHasher + assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2) + + +def test_hash_collision_tensor_shape(): + # The hash should be different though the data is the same when flattened + arr1 = torch.zeros((5, 10, 20, 3)) + arr2 = torch.zeros((10, 20, 5, 3)) + + hasher = MultiModalHasher + assert hasher.hash_kwargs(data=arr1) != hasher.hash_kwargs(data=arr2) + + +def test_hash_collision_array_shape(): + # The hash should be different though the data is the same when flattened + arr1 = np.zeros((5, 10, 20, 3)) + arr2 = np.zeros((10, 20, 5, 3)) + + hasher = MultiModalHasher + assert hasher.hash_kwargs(data=arr1) != hasher.hash_kwargs(data=arr2) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 11665ef667538..53e289370a9f4 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -31,16 +31,20 @@ class MultiModalHasher: return obj.encode("utf-8") if isinstance(obj, bytes): return obj - if isinstance(obj, Image.Image): - return obj.tobytes() - - # Convertible to NumPy arrays - if isinstance(obj, torch.Tensor): - obj = obj.numpy() if isinstance(obj, (int, float)): - obj = np.array(obj) + return np.array(obj).tobytes() + + if isinstance(obj, Image.Image): + return cls.item_to_bytes("image", np.array(obj.convert("RGBA"))) + if isinstance(obj, torch.Tensor): + return cls.item_to_bytes("tensor", obj.numpy()) if isinstance(obj, np.ndarray): - return obj.tobytes() + return cls.item_to_bytes( + "ndarray", { + "dtype": obj.dtype.str, + "shape": obj.shape, + "data": obj.data.tobytes(), + }) logger.warning( "No serialization method found for %s. " @@ -53,14 +57,22 @@ class MultiModalHasher: cls, key: str, obj: object, + ) -> bytes: + return b''.join(kb + vb for kb, vb in cls.iter_item_to_bytes(key, obj)) + + @classmethod + def iter_item_to_bytes( + cls, + key: str, + obj: object, ) -> Iterable[tuple[bytes, bytes]]: # Recursive cases if isinstance(obj, (list, tuple)): for i, elem in enumerate(obj): - yield from cls.item_to_bytes(f"{key}.{i}", elem) + yield from cls.iter_item_to_bytes(f"{key}.{i}", elem) elif isinstance(obj, dict): for k, v in obj.items(): - yield from cls.item_to_bytes(f"{key}.{k}", v) + yield from cls.iter_item_to_bytes(f"{key}.{k}", v) else: key_bytes = cls.serialize_item(key) value_bytes = cls.serialize_item(obj) @@ -71,7 +83,7 @@ class MultiModalHasher: hasher = blake3() for k, v in kwargs.items(): - for k_bytes, v_bytes in cls.item_to_bytes(k, v): + for k_bytes, v_bytes in cls.iter_item_to_bytes(k, v): hasher.update(k_bytes) hasher.update(v_bytes)