mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-01 08:37:04 +08:00
[Bugfix] Fix no video/image profiling edge case for MultiModalDataParser (#15828)
Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
parent
d330558bab
commit
0a298ea418
@ -295,7 +295,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
|
|||||||
|
|
||||||
|
|
||||||
ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
|
ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
|
||||||
ModalityDataItems[Any, Any]]
|
Optional[ModalityDataItems[Any, Any]]]
|
||||||
|
|
||||||
|
|
||||||
class MultiModalDataParser:
|
class MultiModalDataParser:
|
||||||
@ -319,7 +319,15 @@ class MultiModalDataParser:
|
|||||||
if isinstance(data, torch.Tensor):
|
if isinstance(data, torch.Tensor):
|
||||||
return data.ndim == 3
|
return data.ndim == 3
|
||||||
if is_list_of(data, torch.Tensor):
|
if is_list_of(data, torch.Tensor):
|
||||||
return len(data) == 0 or data[0].ndim == 2
|
return data[0].ndim == 2
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _is_empty(self, data: object) -> TypeGuard[None]:
|
||||||
|
if isinstance(data, list):
|
||||||
|
return len(data) == 0
|
||||||
|
if isinstance(data, (np.ndarray, torch.Tensor)):
|
||||||
|
return data.size == 0
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -341,7 +349,12 @@ class MultiModalDataParser:
|
|||||||
def _parse_audio_data(
|
def _parse_audio_data(
|
||||||
self,
|
self,
|
||||||
data: ModalityData[AudioItem],
|
data: ModalityData[AudioItem],
|
||||||
) -> ModalityDataItems[Any, Any]:
|
) -> Optional[ModalityDataItems[Any, Any]]:
|
||||||
|
# also check single audio item with sampling rate
|
||||||
|
if self._is_empty(data) or (isinstance(data, tuple)
|
||||||
|
and self._is_empty(data[0])):
|
||||||
|
return None
|
||||||
|
|
||||||
if self._is_embeddings(data):
|
if self._is_embeddings(data):
|
||||||
return AudioEmbeddingItems(data)
|
return AudioEmbeddingItems(data)
|
||||||
|
|
||||||
@ -378,7 +391,10 @@ class MultiModalDataParser:
|
|||||||
def _parse_image_data(
|
def _parse_image_data(
|
||||||
self,
|
self,
|
||||||
data: ModalityData[ImageItem],
|
data: ModalityData[ImageItem],
|
||||||
) -> ModalityDataItems[Any, Any]:
|
) -> Optional[ModalityDataItems[Any, Any]]:
|
||||||
|
if self._is_empty(data):
|
||||||
|
return None
|
||||||
|
|
||||||
if self._is_embeddings(data):
|
if self._is_embeddings(data):
|
||||||
return ImageEmbeddingItems(data)
|
return ImageEmbeddingItems(data)
|
||||||
|
|
||||||
@ -396,7 +412,10 @@ class MultiModalDataParser:
|
|||||||
def _parse_video_data(
|
def _parse_video_data(
|
||||||
self,
|
self,
|
||||||
data: ModalityData[VideoItem],
|
data: ModalityData[VideoItem],
|
||||||
) -> ModalityDataItems[Any, Any]:
|
) -> Optional[ModalityDataItems[Any, Any]]:
|
||||||
|
if self._is_empty(data):
|
||||||
|
return None
|
||||||
|
|
||||||
if self._is_embeddings(data):
|
if self._is_embeddings(data):
|
||||||
return VideoEmbeddingItems(data)
|
return VideoEmbeddingItems(data)
|
||||||
|
|
||||||
@ -427,6 +446,8 @@ class MultiModalDataParser:
|
|||||||
if k not in subparsers:
|
if k not in subparsers:
|
||||||
raise ValueError(f"Unsupported modality: {k}")
|
raise ValueError(f"Unsupported modality: {k}")
|
||||||
|
|
||||||
mm_items[k] = subparsers[k](v)
|
# ignore empty embedding data
|
||||||
|
if (parsed_data := subparsers[k](v)) is not None:
|
||||||
|
mm_items[k] = parsed_data
|
||||||
|
|
||||||
return mm_items
|
return mm_items
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user