From 0a298ea4181a9b324afdb24876560b7e98e69bba Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 1 Apr 2025 18:17:11 +0800 Subject: [PATCH] [Bugfix] Fix no video/image profiling edge case for `MultiModalDataParser` (#15828) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/multimodal/parse.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 772b1609a9fbb..fc5a294564e3c 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -295,7 +295,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], - ModalityDataItems[Any, Any]] + Optional[ModalityDataItems[Any, Any]]] class MultiModalDataParser: @@ -319,7 +319,15 @@ class MultiModalDataParser: if isinstance(data, torch.Tensor): return data.ndim == 3 if is_list_of(data, torch.Tensor): - return len(data) == 0 or data[0].ndim == 2 + return data[0].ndim == 2 + + return False + + def _is_empty(self, data: object) -> TypeGuard[None]: + if isinstance(data, list): + return len(data) == 0 + if isinstance(data, (np.ndarray, torch.Tensor)): + return data.size == 0 return False @@ -341,7 +349,12 @@ class MultiModalDataParser: def _parse_audio_data( self, data: ModalityData[AudioItem], - ) -> ModalityDataItems[Any, Any]: + ) -> Optional[ModalityDataItems[Any, Any]]: + # also check single audio item with sampling rate + if self._is_empty(data) or (isinstance(data, tuple) + and self._is_empty(data[0])): + return None + if self._is_embeddings(data): return AudioEmbeddingItems(data) @@ -378,7 +391,10 @@ class MultiModalDataParser: def _parse_image_data( self, data: ModalityData[ImageItem], - ) -> ModalityDataItems[Any, Any]: + ) -> Optional[ModalityDataItems[Any, Any]]: + if self._is_empty(data): + return None + if self._is_embeddings(data): return ImageEmbeddingItems(data) @@ -396,7 +412,10 @@ class MultiModalDataParser: def _parse_video_data( self, data: ModalityData[VideoItem], - ) -> ModalityDataItems[Any, Any]: + ) -> Optional[ModalityDataItems[Any, Any]]: + if self._is_empty(data): + return None + if self._is_embeddings(data): return VideoEmbeddingItems(data) @@ -427,6 +446,8 @@ class MultiModalDataParser: if k not in subparsers: raise ValueError(f"Unsupported modality: {k}") - mm_items[k] = subparsers[k](v) + # ignore empty embedding data + if (parsed_data := subparsers[k](v)) is not None: + mm_items[k] = parsed_data return mm_items