[Bugfix] Fixed the issue of not being able to input video and image simultaneously (#15387)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
This commit is contained in:
Chauncey 2025-03-25 11:48:08 +08:00 committed by GitHub
parent b5269db959
commit 10b34e36b9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -556,11 +556,11 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
raise ValueError(\ raise ValueError(\
"Only one message can have {'type': 'image_embeds'}") "Only one message can have {'type': 'image_embeds'}")
mm_inputs["image"] = image_embeds_lst[0] mm_inputs["image"] = image_embeds_lst[0]
elif "image" in items_by_modality: if "image" in items_by_modality:
mm_inputs["image"] = items_by_modality["image"] # A list of images mm_inputs["image"] = items_by_modality["image"] # A list of images
elif "audio" in items_by_modality: if "audio" in items_by_modality:
mm_inputs["audio"] = items_by_modality["audio"] # A list of audios mm_inputs["audio"] = items_by_modality["audio"] # A list of audios
elif "video" in items_by_modality: if "video" in items_by_modality:
mm_inputs["video"] = items_by_modality["video"] # A list of videos mm_inputs["video"] = items_by_modality["video"] # A list of videos
return mm_inputs return mm_inputs
@@ -589,11 +589,11 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
raise ValueError( raise ValueError(
"Only one message can have {'type': 'image_embeds'}") "Only one message can have {'type': 'image_embeds'}")
mm_inputs["image"] = image_embeds_lst[0] mm_inputs["image"] = image_embeds_lst[0]
elif "image" in items_by_modality: if "image" in items_by_modality:
mm_inputs["image"] = items_by_modality["image"] # A list of images mm_inputs["image"] = items_by_modality["image"] # A list of images
elif "audio" in items_by_modality: if "audio" in items_by_modality:
mm_inputs["audio"] = items_by_modality["audio"] # A list of audios mm_inputs["audio"] = items_by_modality["audio"] # A list of audios
elif "video" in items_by_modality: if "video" in items_by_modality:
mm_inputs["video"] = items_by_modality["video"] # A list of videos mm_inputs["video"] = items_by_modality["video"] # A list of videos
return mm_inputs return mm_inputs