diff --git a/tests/multimodal/assets/image1.png b/tests/multimodal/assets/image1.png
new file mode 100644
index 0000000000000..17c7d4cdffe91
Binary files /dev/null and b/tests/multimodal/assets/image1.png differ
diff --git a/tests/multimodal/assets/image2.png b/tests/multimodal/assets/image2.png
new file mode 100644
index 0000000000000..0f13ce5d983d1
Binary files /dev/null and b/tests/multimodal/assets/image2.png differ
diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py
new file mode 100644
index 0000000000000..17b36b36888d5
--- /dev/null
+++ b/tests/multimodal/test_hasher.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+
+import numpy as np
+import pytest
+import torch
+from PIL import Image, ImageDraw
+
+from vllm.multimodal.hasher import MultiModalHasher
+
+ASSETS_DIR = Path(__file__).parent / "assets"  # PNG fixtures next to this file
+assert ASSETS_DIR.exists()  # fail at import time if the fixtures are missing
+
+
+# NOTE: Images that are the same visually are allowed to have the same hash
+@pytest.mark.parametrize("mode_pair", [("1", "L"), ("RGBA", "CMYK")])
+def test_hash_collision_image_mode(mode_pair):
+    """Same-size, same-color images in different modes must hash apart."""
+    first, second = (Image.new(mode, size=(10, 10), color=1)
+                     for mode in mode_pair)
+
+    digest_of = MultiModalHasher.hash_kwargs
+    assert digest_of(image=first) != digest_of(image=second)
+
+
+def test_hash_collision_image_palette():
+    # These images differ only in Image.palette._palette (files auto-closed)
+    with Image.open(ASSETS_DIR / "image1.png") as image1, \
+            Image.open(ASSETS_DIR / "image2.png") as image2:
+        hash1 = MultiModalHasher.hash_kwargs(image=image1)
+        hash2 = MultiModalHasher.hash_kwargs(image=image2)
+    assert hash1 != hash2
+
+
+def test_hash_collision_image_transpose():
+    """Images that are transposes of each other must hash differently."""
+    tall = Image.new("1", size=(10, 20))
+    wide = Image.new("1", size=(20, 10))
+    ImageDraw.Draw(tall).line([(0, 0), (10, 0)])
+    ImageDraw.Draw(wide).line([(0, 0), (0, 10)])
+
+    digest_of = MultiModalHasher.hash_kwargs
+    assert digest_of(image=tall) != digest_of(image=wide)
+
+
+def test_hash_collision_tensor_shape():
+    """Equal flattened data with different shapes must hash differently."""
+    shapes = [(5, 10, 20, 3), (10, 20, 5, 3)]
+    first, second = (torch.zeros(shape) for shape in shapes)
+
+    digest_of = MultiModalHasher.hash_kwargs
+    assert digest_of(data=first) != digest_of(data=second)
+
+
+def test_hash_collision_array_shape():
+    """Equal flattened data with different shapes must hash differently."""
+    first = np.zeros(shape=(5, 10, 20, 3))
+    second = np.zeros(shape=(10, 20, 5, 3))
+
+    digests = [MultiModalHasher.hash_kwargs(data=a) for a in (first, second)]
+    assert digests[0] != digests[1]
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index 11665ef667538..53e289370a9f4 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -31,16 +31,20 @@ class MultiModalHasher:
             return obj.encode("utf-8")
         if isinstance(obj, bytes):
             return obj
-        if isinstance(obj, Image.Image):
-            return obj.tobytes()
-
-        # Convertible to NumPy arrays
-        if isinstance(obj, torch.Tensor):
-            obj = obj.numpy()
         if isinstance(obj, (int, float)):
-            obj = np.array(obj)
+            return np.array(obj).tobytes()
+
+        if isinstance(obj, Image.Image):
+            return cls.item_to_bytes("image", np.array(obj.convert("RGBA")))
+        if isinstance(obj, torch.Tensor):
+            return cls.item_to_bytes("tensor", obj.numpy())
         if isinstance(obj, np.ndarray):
-            return obj.tobytes()
+            return cls.item_to_bytes(
+                "ndarray", {
+                    "dtype": obj.dtype.str,
+                    "shape": obj.shape,
+                    "data": obj.data.tobytes(),
+                })
 
         logger.warning(
             "No serialization method found for %s. "
@@ -53,14 +57,22 @@ class MultiModalHasher:
         cls,
         key: str,
         obj: object,
+    ) -> bytes:
+        return b''.join(kb + vb for kb, vb in cls.iter_item_to_bytes(key, obj))
+
+    @classmethod
+    def iter_item_to_bytes(
+        cls,
+        key: str,
+        obj: object,
     ) -> Iterable[tuple[bytes, bytes]]:
         # Recursive cases
         if isinstance(obj, (list, tuple)):
             for i, elem in enumerate(obj):
-                yield from cls.item_to_bytes(f"{key}.{i}", elem)
+                yield from cls.iter_item_to_bytes(f"{key}.{i}", elem)
         elif isinstance(obj, dict):
             for k, v in obj.items():
-                yield from cls.item_to_bytes(f"{key}.{k}", v)
+                yield from cls.iter_item_to_bytes(f"{key}.{k}", v)
         else:
             key_bytes = cls.serialize_item(key)
             value_bytes = cls.serialize_item(obj)
@@ -71,7 +83,7 @@ class MultiModalHasher:
         hasher = blake3()
 
         for k, v in kwargs.items():
-            for k_bytes, v_bytes in cls.item_to_bytes(k, v):
+            for k_bytes, v_bytes in cls.iter_item_to_bytes(k, v):
                 hasher.update(k_bytes)
                 hasher.update(v_bytes)