From d2f816d6ff99ec0623f6596b90925f8164e6c7a6 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Tue, 14 Oct 2025 17:36:21 +0800
Subject: [PATCH] [Bugfix] Standardize merging multimodal embeddings (#26771)

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/ernie45_vl.py             |  6 +++---
 vllm/model_executor/models/glm4_1v.py                |  6 +++---
 vllm/model_executor/models/hyperclovax_vision.py     |  6 +++---
 vllm/model_executor/models/interns1.py               |  6 +++---
 vllm/model_executor/models/internvl.py               |  6 +++---
 vllm/model_executor/models/keye.py                   |  6 +++---
 vllm/model_executor/models/llava_onevision.py        |  4 ++--
 vllm/model_executor/models/minicpmo.py               |  4 ++--
 vllm/model_executor/models/minicpmv.py               |  8 ++++----
 vllm/model_executor/models/nano_nemotron_vl.py       |  6 +++---
 vllm/model_executor/models/nemotron_vl.py            |  4 ++--
 vllm/model_executor/models/ovis2_5.py                |  6 +++---
 vllm/model_executor/models/phi4_multimodal.py        |  4 ++--
 vllm/model_executor/models/phi4mm.py                 |  4 ++--
 vllm/model_executor/models/qwen2_5_omni_thinker.py   |  8 ++++----
 vllm/model_executor/models/qwen2_5_vl.py             | 10 +++++-----
 vllm/model_executor/models/qwen2_vl.py               |  6 +++---
 vllm/model_executor/models/qwen3_omni_moe_thinker.py |  8 ++++----
 vllm/model_executor/models/qwen3_vl.py               |  6 +++---
 19 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index dc465c87cf4b9..f40bd01deccd5 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -1645,12 +1645,12 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_video_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 6e58f8c32f8ad..132f26253b367 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -1608,11 +1608,11 @@ class Glm4vForConditionalGeneration(
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
 
     def forward(
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index ad39443f93daa..3d28ba951b94e 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -749,12 +749,12 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_video_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index 28a4a1e8d2596..176aa3252d67b 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -753,12 +753,12 @@ class InternS1ForConditionalGeneration(
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_vision_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_vision_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_vision_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 28a35595f43aa..05b822d6fdbf5 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -1358,12 +1358,12 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA)
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_vision_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_vision_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_vision_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 028162fdbf110..292a07c00d07b 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1459,12 +1459,12 @@ class BaseKeyeModule(nn.Module):
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_video_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
 
     def forward(
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index c9a27728eb735..c4cae240ea469 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -881,8 +881,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
-                multimodal_embeddings += tuple(vision_embeddings)
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_pixels(multimodal_input)
                 multimodal_embeddings += tuple(video_embeddings)
diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index 371c9607c5c5b..fa2feb0ba10b4 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -762,7 +762,7 @@ class MiniCPMO(MiniCPMV2_6):
         for modality in modalities:
             if modality == "audios":
                 audio_input = modalities["audios"]
-                audio_features = self._process_audio_input(audio_input)
-                multimodal_embeddings += tuple(audio_features)
+                audio_embeddings = self._process_audio_input(audio_input)
+                multimodal_embeddings += tuple(audio_embeddings)
 
         return multimodal_embeddings
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 173cab3bffc10..ef2bbac756541 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1129,12 +1129,12 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                image_features = self._process_vision_input(image_input)
-                multimodal_embeddings += tuple(image_features)
+                image_embeddings = self._process_vision_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
-                video_features = self._process_vision_input(video_input)
-                multimodal_embeddings += tuple(video_features)
+                video_embeddings = self._process_vision_input(video_input)
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index dfb7cb7fe6bd4..e874aaa0fc7ad 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -1263,12 +1263,12 @@ class NemotronH_Nano_VL_V2(
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_video_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py
index 42f70ef105a5d..2f78e2f60c93b 100644
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -575,8 +575,8 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
 
         return multimodal_embeddings
 
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index b4e2f42be5979..758611afb9a46 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -616,12 +616,12 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_visual_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_visual_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_visual_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py
index b99e3a5a1fd84..207bd000c5b7a 100644
--- a/vllm/model_executor/models/phi4_multimodal.py
+++ b/vllm/model_executor/models/phi4_multimodal.py
@@ -1430,8 +1430,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
             if modality == "images":
                 audio_projection_mode = "vision"
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += tuple(vision_embeddings)
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "audios":
                 audio_input = modalities["audios"]
                 audio_embeddings = self._process_audio_input(
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index dce31f9d0aac6..a54d4d15ba9bb 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1248,8 +1248,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
             if modality == "images":
                 audio_projection_mode = "vision"
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += tuple(vision_embeddings)
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "audios":
                 audio_input = modalities["audios"]
                 audio_embeddings = self._process_audio_input(
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 07f814ef64187..c40b97a2c4e09 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -1210,14 +1210,14 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
             if modality == "audio":
                 audio_embeddings = self._process_audio_input(multimodal_input)
-                multimodal_embeddings += audio_embeddings
+                multimodal_embeddings += tuple(audio_embeddings)
         return multimodal_embeddings
 
     # TODO (ywang96): support overlapping modality embeddings so that
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 3f205307cb225..3079d3b9a41aa 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1586,19 +1586,19 @@ class Qwen2_5_VLForConditionalGeneration(
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
+                image_embeddings = self._process_image_input(multimodal_input)
                 if self.is_multimodal_pruning_enabled:
-                    vision_embeddings = self._postprocess_image_embeds_evs(
-                        vision_embeddings, multimodal_input
+                    image_embeddings = self._postprocess_image_embeds_evs(
+                        image_embeddings, multimodal_input
                     )
-                multimodal_embeddings += vision_embeddings
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
                         video_embeddings, multimodal_input
                     )
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
 
     def forward(
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 8069039b0c560..821a9d13dc6f7 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1561,12 +1561,12 @@ class Qwen2VLForConditionalGeneration(
         for modality in modalities:
             if modality == "images":
                 image_input = modalities["images"]
-                vision_embeddings = self._process_image_input(image_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "videos":
                 video_input = modalities["videos"]
                 video_embeddings = self._process_video_input(video_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index b1eceaa6ef41d..d565a0108432a 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -1260,14 +1260,14 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
             if modality == "audio":
                 audio_embeddings = self._process_audio_input(multimodal_input)
-                multimodal_embeddings += audio_embeddings
+                multimodal_embeddings += tuple(audio_embeddings)
         return multimodal_embeddings
 
     def get_input_embeddings(
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 39714faf9833e..f114aae25c51b 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -1601,11 +1601,11 @@ class Qwen3VLForConditionalGeneration(
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
 
     def _compute_deepstack_embeds(
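The minimal sketch below is not part of the patch; it only illustrates the accumulation pattern the diff standardizes across the 19 models. The function process_modality_input is a hypothetical stand-in for the per-model _process_*_input helpers; the assumption being illustrated is that such a helper may return a list (or another sequence) of per-item tensors rather than a tuple, in which case "multimodal_embeddings += embeddings" raises TypeError, while "multimodal_embeddings += tuple(embeddings)" always extends the flat tuple of tensors.

# Illustrative sketch only (hypothetical names, not the vLLM implementation).
import torch


def process_modality_input(items: list[torch.Tensor]) -> list[torch.Tensor]:
    # Hypothetical stand-in: returns one embedding tensor per multimodal item,
    # as a list rather than a tuple.
    return [item.float() for item in items]


def merge_multimodal_embeddings(
    image_items: list[torch.Tensor],
    video_items: list[torch.Tensor],
) -> tuple[torch.Tensor, ...]:
    multimodal_embeddings: tuple[torch.Tensor, ...] = ()

    image_embeddings = process_modality_input(image_items)
    # Without the tuple(...) coercion, "+= image_embeddings" would raise
    # TypeError when the helper returns a list, because a tuple can only be
    # concatenated with another tuple.
    multimodal_embeddings += tuple(image_embeddings)

    video_embeddings = process_modality_input(video_items)
    multimodal_embeddings += tuple(video_embeddings)

    return multimodal_embeddings


# Usage: the result stays a flat tuple with one tensor per multimodal item.
merged = merge_multimodal_embeddings(
    [torch.zeros(4, 8)],
    [torch.ones(2, 8), torch.ones(3, 8)],
)
assert isinstance(merged, tuple) and len(merged) == 3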