From b765adccd7bf5f09ecb4b82e80812757e621eb60 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Mon, 29 Sep 2025 14:09:18 +0800
Subject: [PATCH] [V0 Deprecation][Models] Remove all V0 condition for mm embeddings merge (#25331)

Signed-off-by: Isotr0py
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: yewentao256
---
 vllm/model_executor/models/aya_vision.py | 11 ---
 vllm/model_executor/models/blip2.py | 11 ---
 vllm/model_executor/models/chameleon.py | 12 ---
 vllm/model_executor/models/cohere2_vision.py | 11 ---
 vllm/model_executor/models/deepseek_vl2.py | 11 ---
 vllm/model_executor/models/fuyu.py | 11 ---
 vllm/model_executor/models/gemma3_mm.py | 19 ----
 vllm/model_executor/models/glm4_1v.py | 50 +---
 vllm/model_executor/models/glm4v.py | 17 +---
 vllm/model_executor/models/granite_speech.py | 11 ---
 .../models/hyperclovax_vision.py | 15 +--
 vllm/model_executor/models/idefics3.py | 11 ---
 vllm/model_executor/models/interns1.py | 18 +---
 vllm/model_executor/models/internvl.py | 18 +---
 vllm/model_executor/models/kimi_vl.py | 16 ----
 vllm/model_executor/models/llava.py | 11 ---
 vllm/model_executor/models/llava_next.py | 11 ---
 .../model_executor/models/llava_next_video.py | 11 ---
 vllm/model_executor/models/llava_onevision.py | 46 +---
 vllm/model_executor/models/minicpmv.py | 15 +--
 vllm/model_executor/models/mistral3.py | 11 ---
 vllm/model_executor/models/mllama4.py | 11 ---
 vllm/model_executor/models/molmo.py | 11 ---
 .../model_executor/models/nano_nemotron_vl.py | 18 +---
 vllm/model_executor/models/nemotron_vl.py | 11 ---
 vllm/model_executor/models/ovis.py | 11 ---
 vllm/model_executor/models/ovis2_5.py | 12 ---
 vllm/model_executor/models/paligemma.py | 11 ---
 vllm/model_executor/models/phi3v.py | 11 ---
 vllm/model_executor/models/phi4_multimodal.py | 16 ----
 vllm/model_executor/models/phi4mm.py | 48 +---
 vllm/model_executor/models/pixtral.py | 11 ---
 .../models/qwen2_5_omni_thinker.py | 31 +------
 vllm/model_executor/models/qwen2_5_vl.py | 58 +-----
 vllm/model_executor/models/qwen2_audio.py | 11 ---
 vllm/model_executor/models/qwen2_vl.py | 50 +---
 vllm/model_executor/models/qwen3_vl.py | 93 +------
 vllm/model_executor/models/qwen_vl.py | 12 ---
 vllm/model_executor/models/skyworkr1v.py | 11 ---
 vllm/model_executor/models/transformers.py | 13 ---
 vllm/model_executor/models/ultravox.py | 12 ---
 vllm/model_executor/models/voxtral.py | 13 ---
 42 files changed, 13 insertions(+), 809 deletions(-)

diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index eab996e9ba22b..f6dfa435ddd4e 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -427,17 +427,6 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 4d1850d07b28e..334743a7358c6 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -672,17 +672,6 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == _IMAGE_TOKEN_ID, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index f9740adb151b5..86dbf63fa5df6 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1014,18 +1014,6 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - image_token_id = self.model.vocabulary_mapping.image_token_id - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == image_token_id, - ) - input_ids = None - hidden_states = self.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 99edcba4d874a..7162571c08d91 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -440,17 +440,6 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_id, - ) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index b98008c83bdcc..0f87fb34bf329 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -614,17 +614,6 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.image_token_id, - ) - input_ids = None - hidden_states = self.language_model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index b99fe33a1dcce..9e491c0b50d24 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -352,17 +352,6 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == _IMAGE_TOKEN_ID, - ) - input_ids = None - hidden_states = self.language_model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index be75e36fe23b5..36f8651371baa 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -596,25 +596,6 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - if (vision_embeddings is not None) and len(vision_embeddings) != 0: - kwargs = self.prepare_attn_masks( - input_ids, - positions, - mask_dtype=self.dtype, - **kwargs, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index dbb5431ae4919..722f1e428be7b 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -71,7 +71,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import uses_mrope from vllm.utils.tensor_schema import TensorSchema, TensorShape from ..layers.activation import SiluAndMul @@ -80,8 +79,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, from .qwen2_vl import (_create_qwen2vl_field_factory, apply_rotary_pos_emb_vision) from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) @@ -1552,32 +1550,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, multimodal_embeddings += video_embeddings return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Glm4vImageInputs] = None, - video_input: Optional[Glm4vVideoInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1604,26 +1576,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. 
- elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index ace9c05daf15a..22ddb1d751607 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -43,7 +43,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .chatglm import ChatGLMBaseModel, ChatGLMModel from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) -from .utils import flatten_bn, isin_list +from .utils import flatten_bn class GLMVImagePixelInputs(TensorSchema): @@ -618,21 +618,6 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=isin_list(input_ids, [ - self.config.boi_token_id, - self.config.pad_token_id, - self.config.eoi_token_id, - ]), - ) - input_ids = None - hidden_states = self.transformer(input_ids, positions, intermediate_tensors, inputs_embeds) diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 8a02da58ea0b9..0ec451356f5ee 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -765,17 +765,6 @@ class GraniteSpeechForConditionalGeneration( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - audio_embeds = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - audio_embeds, - is_multimodal=input_ids == self.config.audio_token_index, - ) - input_ids = None - model_output = self.language_model(input_ids, positions, intermediate_tensors, inputs_embeds) return model_output diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index f851688bf7bab..b0f9d5e2657e4 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -45,8 +45,7 @@ from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel -from .utils import (AutoWeightsLoader, init_vllm_registered_model, isin_list, - maybe_prefix) +from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix from .vision import get_vision_encoder_info EOT = "<|endofturn|>" @@ -747,18 +746,6 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - multimodal_embeddings, - is_multimodal=isin_list( - input_ids, - [self.config.image_token_id, self.config.video_token_id]), - ) - input_ids = None hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 3334ee2242531..dddf1c6fb6269 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -702,17 +702,6 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_id, - ) - input_ids = None - hidden_states = self.model.text_model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index 545dad1a96f5e..0292845f819c8 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -40,7 +40,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, isin_list, maybe_prefix) + init_vllm_registered_model, maybe_prefix) class InternS1MultiModalProjector(nn.Module): @@ -798,22 +798,6 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal, input_ids = None inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - context_token_ids = [ - token_id for token_id in (self.img_context_token_id, - self.video_context_token_id) - if token_id is not None - ] - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=isin_list(input_ids, context_token_ids), - ) - input_ids = None - forward_kwargs = { "input_ids": input_ids, "positions": positions, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 78aac85414344..0c95c49f90b18 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -43,7 +43,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - isin_list, maybe_prefix) + maybe_prefix) IMG_START = '' IMG_END = '' @@ -1371,22 +1371,6 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, input_ids = None inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - context_token_ids = [ - token_id for token_id in (self.img_context_token_id, - self.video_context_token_id) - if token_id is not None - ] - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=isin_list(input_ids, context_token_ids), - ) - input_ids = None - forward_kwargs = { "input_ids": input_ids, "positions": positions, diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index db032736f9148..30ec9029f74f1 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -433,22 +433,6 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, ) -> IntermediateTensors: if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - inputs_embeds = None - else: - image_embeds = self._process_image_input(image_input) - inputs_embeds = self.get_input_embeddings( - input_ids, - image_embeds, - is_multimodal=input_ids == - self.config.media_placeholder_token_id, - ) - input_ids = None hidden_states = self.language_model( input_ids=input_ids, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6f3cfd88aee23..46cf93be191ea 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -723,17 +723,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index e132389c4f061..c4f1daaab9bf2 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -547,17 +547,6 @@ model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens]. if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 2642d8c77cf3b..aebc661d53f8b 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -431,17 +431,6 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.video_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 906858f4e2f47..6088195c91d5c 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -30,8 +30,7 @@ from .llava_next import (BaseLlavaNextMultiModalProcessor, LlavaNextLikeConfig, LlavaNextProcessingInfo) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) # For profile run _MAX_FRAMES_PER_VIDEO = 16 @@ -850,33 +849,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[LlavaOnevisionImagePixelInputs] = None, - video_input: Optional[LlavaOnevisionVideoPixelInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_index, - ) - - if video_input is not None: - video_embeds = self._process_video_pixels(video_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - 
placeholder_token_id=self.config.video_token_index, - ) - - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -894,22 +866,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index bffc9a0c125ea..eaa3839af37b3 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -71,7 +71,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) -from .utils import AutoWeightsLoader, flatten_bn, isin_list, maybe_prefix +from .utils import AutoWeightsLoader, flatten_bn, maybe_prefix # For profile run _MAX_FRAMES_PER_VIDEO = 16 @@ -1154,19 +1154,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=isin_list(input_ids, list(self.mm_token_ids)), - ) - input_ids = None - hidden_states = self.llm.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 31571ce962d18..e932f7f007f5f 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -571,17 +571,6 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 3af5267928cde..db5a9fbc6a33b 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -823,17 +823,6 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, - # this condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - return self.language_model(input_ids, positions, intermediate_tensors, inputs_embeds) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 054caee9e8a4f..0227a83a1f550 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1490,17 +1490,6 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.img_patch_id, - ) - input_ids = None - hidden_states = self.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 505806a15c891..2d0ebdc902778 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -35,7 +35,7 @@ from vllm.model_executor.models.nemotron_h import NemotronHForCausalLM from vllm.model_executor.models.radio import RadioModel from vllm.model_executor.models.utils import (flatten_bn, init_vllm_registered_model, - isin_list, maybe_prefix) + maybe_prefix) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, MultiModalKwargsItems, @@ -1135,22 +1135,6 @@ class NemotronH_Nano_VL_V2(nn.Module, HasInnerState, IsHybrid, input_ids = None inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - context_token_ids = [ - token_id for token_id in (self.img_context_token_id, - self.video_context_token_id) - if token_id is not None - ] - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=isin_list(input_ids, context_token_ids), - ) - input_ids = None - hidden_states = self.language_model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 2627a262e9582..0e7ec8e458cfe 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -608,17 +608,6 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, input_ids = None inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.img_context_token_id, - ) - input_ids = None - forward_kwargs = { "input_ids": input_ids, "positions": positions, diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 8503d3f71d1c9..2f9c6ddfc6618 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -511,17 +511,6 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP): if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.image_pad_token_id, - ) - input_ids = None - # up until here we have an inputs_embeds 100% numerical identity # between the OG HF Transformers implementation and ours hidden_states = self.llm( diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 2ecc7bff07e07..86ce7e9eab274 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -596,18 +596,6 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP): if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.image_pad_token_id, - ) - input_ids = None - # up until here we have a inputs_embeds 100% numerical identity # between the OG HF Transformers implementation and ours hidden_states = self.llm( diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index f07f444819f4c..d118e6c89ab56 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -370,17 +370,6 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index ea34c8d92f136..59977796e2af9 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -679,17 +679,6 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=self.image_token_id, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index e8b79717d75d0..3dbb67d280652 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -1411,22 +1411,6 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - audio_input = self._parse_and_validate_audio_input(**kwargs) - - if image_input is None and audio_input is None: - inputs_embeds = None - else: - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - audio_input=audio_input) - input_ids = None - hidden_states = self.language_model( input_ids, positions, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 15b09c7ae2bc9..8ccc7129ddb2e 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -35,8 +35,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal from .phi4mm_audio import AudioEmbedding -from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix, - merge_multimodal_embeddings) +from .utils import AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix # <|endoftext10|> (see vocab.json in hf model) _IMAGE_PLACEHOLDER_TOKEN_ID = 200010 @@ -1174,35 +1173,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Phi4MMImagePixelInputs] = None, - audio_input: Optional[Phi4MMAudioFeatureInputs] = None, - ) -> torch.Tensor: - audio_projection_mode = 'speech' - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - 
placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID, - ) - audio_projection_mode = 'vision' - - if audio_input is not None: - audio_embeds = self._process_audio_input( - audio_input, audio_projection_mode=audio_projection_mode) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - audio_embeds, - placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1214,22 +1184,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - audio_input = self._parse_and_validate_audio_input(**kwargs) - - if image_input is None and audio_input is None: - inputs_embeds = None - else: - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - audio_input=audio_input) - input_ids = None - hidden_states = self.model( input_ids, positions, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 2c04b6f0f4f90..6344fc3948339 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -444,17 +444,6 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.vision_args.image_token_id, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index bfa398ee43b56..8f069710b0f92 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -69,8 +69,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) try: import flash_attn @@ -908,26 +907,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration( multimodal_embeddings.append((video_embeds, "video")) return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is None or len(multimodal_embeddings) == 0: - return inputs_embeds - - for embeddings, modality in multimodal_embeddings: - if modality == "audio": - placeholder_token_id = self.config.audio_token_index - if modality == "image": - placeholder_token_id = self.config.image_token_index - if modality == "video": - placeholder_token_id = self.config.video_token_index - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, embeddings, placeholder_token_id) - return 
inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -939,14 +918,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings_v0(**kwargs) - inputs_embeds = self.get_input_embeddings_v0( - input_ids, multimodal_embeddings) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 5b092b42205fa..da3889d31a7d1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -64,7 +64,6 @@ from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import PromptReplacement, PromptUpdate from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import uses_mrope from vllm.utils import is_pin_memory_available from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -75,8 +74,7 @@ from .qwen2_vl import Qwen2VLDummyInputsBuilder as Qwen2_5_VLDummyInputsBuilder from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, apply_rotary_pos_emb_vision) from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) @@ -1365,40 +1363,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, multimodal_embeddings += video_embeddings return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Qwen2_5_VLImageInputs] = None, - video_input: Optional[Qwen2_5_VLVideoInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - if self.is_multimodal_pruning_enabled: - image_embeds = self._postprocess_image_embeds_evs( - image_embeds, image_input - ) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - if self.is_multimodal_pruning_enabled: - video_embeds = self._postprocess_video_embeds_evs( - video_embeds, video_input - ) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1421,26 +1385,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. 
- elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 9dfa29eef5ce7..f9136863b8d65 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -449,17 +449,6 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - multimodal_embeddings, - is_multimodal=input_ids == self.config.audio_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 6ef01f333554e..f83a411459cc9 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -65,15 +65,13 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMRoPE, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) @@ -1464,32 +1462,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Qwen2VLImagePixelInputs] = None, - video_input: Optional[Qwen2VLVideoPixelInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1515,26 +1487,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, if 
intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 6d2a6019ef6f4..ce92557d64249 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -68,7 +68,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import uses_mrope from vllm.utils import is_list_of from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -82,8 +81,7 @@ from .qwen2_5_vl import (Qwen2_5_VisionAttention, from .qwen2_vl import Qwen2VLProcessingInfo from .qwen3 import Qwen3ForCausalLM, Qwen3Model from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - _merge_multimodal_embeddings, maybe_prefix, - merge_multimodal_embeddings) + _merge_multimodal_embeddings, maybe_prefix) from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) @@ -1464,75 +1462,6 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, return inputs_embeds - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Qwen2_5_VLImageInputs] = None, - video_input: Optional[Qwen2_5_VLVideoInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - - if self.use_deepstack: - visual_dim = inputs_embeds.shape[-1] - deepstack_input_embeds = None - if image_input is not None or video_input is not None: - deepstack_input_embeds = torch.zeros_like( - inputs_embeds).unsqueeze(1).repeat( - 1, self.deepstack_num_level, 1).flatten(1) - - if image_input is not None: - image_embeds = self._process_image_input(image_input) - if self.use_deepstack: - image_embeds = torch.cat(image_embeds) - - image_embeds, image_embeds_multiscale = image_embeds.split( - [visual_dim, visual_dim * self.deepstack_num_level], - dim=-1) - - deepstack_input_embeds = merge_multimodal_embeddings( - input_ids, - deepstack_input_embeds, - image_embeds_multiscale, - placeholder_token_id=self.config.image_token_id, - ) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - if self.use_deepstack: - video_embeds = torch.cat(video_embeds) - - video_embeds, video_embeds_multiscale = video_embeds.split( - [visual_dim, visual_dim * self.deepstack_num_level], - dim=-1) - - deepstack_input_embeds = merge_multimodal_embeddings( 
- input_ids, - deepstack_input_embeds, - video_embeds_multiscale, - placeholder_token_id=self.config.video_token_id, - ) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - - if self.use_deepstack and deepstack_input_embeds is not None: - deepstack_input_embeds = deepstack_input_embeds.view( - inputs_embeds.shape[0], self.deepstack_num_level, - visual_dim).permute(1, 0, 2).contiguous() - self._set_deepstack_input_embeds(deepstack_input_embeds) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1568,26 +1497,6 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - if self.use_deepstack and inputs_embeds is not None and get_pp_group( ).is_first_rank: deepstack_input_embeds = self._get_deepstack_input_embeds( diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index dc11b60604a91..924119ed63aba 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -767,18 +767,6 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == - self.transformer.visual.image_pad_id, - ) - input_ids = None - hidden_states = self.transformer(input_ids, positions, intermediate_tensors, inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index f9a107c06085b..f03022aa719ce 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -874,17 +874,6 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP): input_ids = None inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.img_context_token_id, - ) - input_ids = None - forward_kwargs = { "input_ids": input_ids, "positions": positions, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 7cfb639f675d5..00d87f560e70a 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -881,19 +881,6 @@ class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal): inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - if inputs_embeds is None: - multimodal_embeds = self.get_multimodal_embeddings(**kwargs) - if multimodal_embeds is not None: - inputs_embeds = self.get_input_embeddings( - input_ids, - multimodal_embeds, - is_multimodal=input_ids == self.config.image_token_id, - ) - input_ids = None - model_output = super().forward(input_ids, positions, intermediate_tensors, inputs_embeds) return model_output diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 77e886c22e634..70aabf6dfe789 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -597,18 +597,6 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - - inputs_embeds = self.get_input_embeddings( - input_ids, - multimodal_embeddings, - is_multimodal=input_ids == self.config.audio_token_index, - ) - input_ids = None - language_model = self.language_model if hasattr(language_model, "language_model"): language_model = language_model.language_model diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index f93e7ccfd06ff..1edeaeb0f319d 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -371,19 +371,6 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - audio_encoder = self.tokenizer.instruct.audio_encoder - audio_tok_id = audio_encoder.audio_token - audio_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - audio_embeddings, - is_multimodal=input_ids == audio_tok_id, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors,
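For reference, the embedding-merge step that these removed V0 branches used to perform inside each model's forward now happens once before the model is called: the V1 model runner invokes get_multimodal_embeddings(**kwargs) and then get_input_embeddings(input_ids, mm_embeddings, is_multimodal=...), which scatters the multimodal embeddings into the text embeddings at the placeholder-token positions. The sketch below illustrates that scatter in isolation; it is a minimal stand-alone example with hypothetical tensor shapes and a made-up placeholder id, not code taken from this patch or from vLLM's helpers.

# Illustrative sketch (assumption-laden, not from this patch): merging
# multimodal embeddings into text embeddings at placeholder positions,
# in the spirit of get_input_embeddings(..., is_multimodal=...).
import torch

def merge_mm_embeddings(
    input_ids: torch.Tensor,       # (num_tokens,)
    text_embeds: torch.Tensor,     # (num_tokens, hidden_size)
    mm_embeds: torch.Tensor,       # (num_mm_tokens, hidden_size)
    placeholder_token_id: int,
) -> torch.Tensor:
    is_multimodal = input_ids == placeholder_token_id
    # Each placeholder position receives exactly one multimodal embedding,
    # in the order the embeddings were produced.
    assert int(is_multimodal.sum()) == mm_embeds.shape[0]
    merged = text_embeds.clone()
    merged[is_multimodal] = mm_embeds.to(dtype=merged.dtype)
    return merged

# Toy usage with a hypothetical placeholder id (32000):
ids = torch.tensor([101, 32000, 32000, 102])
text = torch.zeros(4, 8)
vision = torch.ones(2, 8)
out = merge_mm_embeddings(ids, text, vision, placeholder_token_id=32000)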