diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index 2b25dc7666c3..0adb32a7ac33 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -443,6 +443,8 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
         print(generated_text)
     ```
 
+For Qwen3-VL, the `image_embeds` should contain both the base image embedding and deepstack features.
+
 #### Audio Embeddings
 
 You can pass pre-computed audio embeddings similar to image embeddings:
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 1d3929b936a9..58721303dfc8 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -103,7 +103,7 @@ from .qwen2_5_vl import (
     Qwen2_5_VLVideoInputs,
     Qwen2_5_VLVideoPixelInputs,
 )
-from .qwen2_vl import Qwen2VLProcessingInfo
+from .qwen2_vl import Qwen2VLMultiModalDataParser, Qwen2VLProcessingInfo
 from .qwen3 import Qwen3ForCausalLM, Qwen3Model
 from .utils import (
     AutoWeightsLoader,
@@ -884,7 +884,10 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
 class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]):
 
     def _get_data_parser(self) -> MultiModalDataParser:
-        return MultiModalDataParser(video_needs_metadata=True)
+        return Qwen2VLMultiModalDataParser(
+            self.info.get_hf_config().vision_config.spatial_merge_size,
+            video_needs_metadata=True,
+        )
 
     def _call_hf_processor(
         self,
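
The documentation note added above says that for Qwen3-VL the `image_embeds` tensor must carry both the base image embedding and the deepstack features. The sketch below illustrates how such a tensor might be passed through vLLM's existing embedding-input path (the same `image_embeds` / `image_grid_thw` dict documented for Qwen2-VL). The model name, the tensor shapes, and the assumption that the deepstack features are concatenated onto the base embedding along the feature dimension are illustrative only and should be verified against the Qwen3-VL vision encoder's actual output.

```python
# Hedged sketch: passing precomputed Qwen3-VL image embeddings to vLLM.
# The model name, shapes, and the feature-dimension concatenation layout
# are assumptions, not the confirmed Qwen3-VL format.
import torch
from vllm import LLM

llm = LLM(model="Qwen/Qwen3-VL-4B-Instruct")  # model name is an assumption

hidden_size = 2560          # assumed text hidden size
num_deepstack_layers = 3    # assumed number of deepstack feature levels
num_image_tokens = 64       # e.g. a 1x16x16 patch grid merged 2x2 -> 64 tokens

base_embeds = torch.randn(num_image_tokens, hidden_size)
deepstack_feats = torch.randn(num_image_tokens, hidden_size * num_deepstack_layers)
# Assumed layout: base embedding first, deepstack features appended along dim -1.
image_embeds = torch.cat([base_embeds, deepstack_feats], dim=-1)

outputs = llm.generate({
    "prompt": "<|vision_start|><|image_pad|><|vision_end|>Describe the image.",
    "multi_modal_data": {
        "image": {
            "image_embeds": image_embeds,
            # Grid metadata is passed alongside the embeddings, as for Qwen2-VL.
            "image_grid_thw": torch.tensor([[1, 16, 16]]),
        }
    },
})
print(outputs[0].outputs[0].text)
```

The switch to `Qwen2VLMultiModalDataParser` with `spatial_merge_size` in the diff presumably lets the parser relate `image_grid_thw` to the expected number of embedding tokens when such embeddings are supplied directly.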