diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index a5a4dcd0b6e1..b4cd6a90834c 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -36,8 +36,8 @@ class MultiModalHasher: return np.array(obj).tobytes() if isinstance(obj, Image.Image): - return cls.item_to_bytes("image", - np.array(convert_image_mode(obj, "RGBA"))) + return cls.item_to_bytes( + "image", np.asarray(convert_image_mode(obj, "RGBA"))) if isinstance(obj, torch.Tensor): return cls.item_to_bytes("tensor", obj.numpy()) if isinstance(obj, np.ndarray): diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 3685fd4c3458..261d56abad9c 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -164,7 +164,7 @@ class VideoMediaIO(MediaIO[npt.NDArray]): ) return np.stack([ - np.array(load_frame(frame_data)) + np.asarray(load_frame(frame_data)) for frame_data in data.split(",") ])