diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d530cf629f4ad..13014bebb1054 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -820,34 +820,7 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser): return super()._parse_video_data(data) -class QwenVLSeriesProcessingInfoMixin: - """ - Mixin that provides get_num_mm_encoder_tokens() - and get_num_mm_connector_tokens() methods for - QwenVL series models without affecting other multi-modal models. - """ - - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - - return num_image_tokens * merge_size**2 - - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - return num_vision_tokens // merge_size**2 - - -class Qwen2VLProcessingInfo(QwenVLSeriesProcessingInfoMixin, BaseProcessingInfo): +class Qwen2VLProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2VLConfig) @@ -1131,6 +1104,25 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]) for modality in ("image", "video") ] + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + + return num_image_tokens * merge_size**2 + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + return num_vision_tokens // merge_size**2 + def _get_mm_fields_config( self, hf_inputs: BatchFeature, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 0390773783961..056eee502448c 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1412,6 +1412,28 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ raise NotImplementedError + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + """ + Implement this function to enable LoRA support + for the tower module of the multi-modal model + Given the number of image tokens, output the number of multi-modal encoder tokens + """ + raise NotImplementedError + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + """ + Implement this function to enable LoRA support + for the connector module of the multi-modal model + Given the number of vision tokens, output the number of multi-modal connector tokens + """ + raise NotImplementedError + def _bind_and_group_updates( self, prompt_updates: Sequence[PromptUpdate],