diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 4fe6a7b9e938..eb7435d6e1d8 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -620,7 +620,8 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.config.image_token_index)
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 7c02d245db8b..a48631ad709f 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -430,7 +430,8 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids=input_ids,
                 inputs_embeds=inputs_embeds,
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 87fc6b5b0240..3c3955161daa 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -641,7 +641,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 _IMAGE_TOKEN_ID)
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 21f29dc43c26..d538ba09c65c 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -1005,7 +1005,8 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
     ) -> torch.Tensor:
         inputs_embeds = self.model.get_input_embeddings(input_ids)

-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.model.vocabulary_mapping.image_token_id)
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 6341c65a5d4c..da5452409d2f 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -600,7 +600,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.image_token_id)
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
index 4b220ea483e8..425407c19ab5 100644
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -1046,7 +1046,8 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.pad_token_id)
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 9692899f7b99..7e03982e78e6 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -345,7 +345,8 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 415a8dbdcf87..3a1c14978b45 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -592,7 +592,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index e9271367a472..70916c45c0e0 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -609,7 +609,8 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
     ) -> torch.Tensor:
         inputs_embeds = self.transformer.get_input_embeddings(input_ids)

-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids=input_ids,
                 inputs_embeds=inputs_embeds,
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index 137aad926cb9..f2dc5708028b 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -721,7 +721,8 @@ class GraniteSpeechForConditionalGeneration(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         """Compute the merged LLM / audio embeddings."""
-        if multimodal_embeddings is None:
+        if multimodal_embeddings is None \
+            or len(multimodal_embeddings) == 0:
             return self.language_model.get_input_embeddings(input_ids)

         inputs_embeds = embed_multimodal(
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index be04ad0422df..b1d0626217a0 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -720,7 +720,8 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 9d5cceccff2f..bb71177ecad8 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -1336,7 +1336,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             context_token_ids = [
                 token_id for token_id in (self.img_context_token_id,
                                           self.video_context_token_id)
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index 351d1fbdc744..f32c2075f6a8 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -393,7 +393,8 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal):
         # model as one of the requirements of basic vLLM model implementation.
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)

-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None and len(
+                multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids=input_ids,
                 inputs_embeds=inputs_embeds,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index f70ad37a3d3a..1c35bf5206db 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -683,7 +683,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index bc792be19dbf..142d5740f077 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -502,7 +502,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:

-        if not multimodal_embeddings:
+        if multimodal_embeddings is None \
+            or len(multimodal_embeddings) == 0:
             return self.language_model.get_input_embeddings(input_ids)

         inputs_embeds = embed_multimodal(
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index c13e8e9b2414..f930f3ce8a16 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -426,7 +426,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.config.video_token_index)
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 373b0a2a7d5e..c5403762f539 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -881,7 +881,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 [self.config.image_token_index, self.config.video_token_index])
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index b923287dca3e..9dc03c800182 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -892,7 +892,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.llm.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             assert len(self.mm_token_ids) > 0
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py
index bc00af2ec6b9..8ce94540e87f 100644
--- a/vllm/model_executor/models/minimax_vl_01.py
+++ b/vllm/model_executor/models/minimax_vl_01.py
@@ -201,7 +201,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index ebc176e2c724..04d6d347cb84 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -521,7 +521,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index bf4bd309eea2..a420e757e219 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -808,7 +808,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)

-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None and len(
+                multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 70c60c6d528b..bb08cd59f6fc 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1487,7 +1487,8 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             assert self.img_patch_id is not None

             inputs_embeds = merge_multimodal_embeddings(
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index 900a1f5de458..6eecd4499fb9 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -515,7 +515,8 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.llm.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.image_pad_token_id)
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 103a267c41f5..e1de8cf45878 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -364,7 +364,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.config.image_token_index)
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 9cec7831ae0c..0a7adf91e488 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -669,7 +669,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.embed_tokens(input_ids)
-        if multimodal_embeddings:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.image_token_id)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index a3ca72d1f5cf..5d1f0775b07f 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1148,7 +1148,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.model.embed_tokens(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None and len(
+                multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 320c0e10d06a..709ac1d9df94 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -423,7 +423,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index ad1e8fcb39d5..9344bf8e03a7 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -805,7 +805,8 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:

             # TODO (ywang96): support overlapping modalitiy embeddings so that
             # `use_audio_in_video` will work on V1.
@@ -845,7 +846,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         multimodal_embeddings: Optional[NestedTensors] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is None:
+        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
             return inputs_embeds

         for embeddings, modality in multimodal_embeddings:
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 202cd5e860d1..01e85ae80577 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1046,7 +1046,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 [self.config.image_token_id, self.config.video_token_id])
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index e77a8e05d200..aefa1db24628 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -364,7 +364,8 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.config.audio_token_index)
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 49b709069cd2..d5e297ea66da 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1289,7 +1289,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 [self.config.image_token_id, self.config.video_token_id])
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index 546737621a7c..fc29785af95a 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -754,7 +754,8 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
     ) -> torch.Tensor:
         inputs_embeds = self.transformer.get_input_embeddings(input_ids)

-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.transformer.visual.image_pad_id)
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index 9fba24ac5cec..28f181dde215 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -883,7 +883,8 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             assert self.img_context_token_id is not None
             self._set_visual_token_mask(input_ids)
             inputs_embeds = merge_multimodal_embeddings(
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
index 2645e700fcda..a5736f124f25 100644
--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -598,7 +598,8 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index f6b9d19694ef..94f5e03fd446 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -560,7 +560,8 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:

             # TODO(ywang96): remove this block after v0 is deprecated.
             if not envs.VLLM_USE_V1: