From b03d1a04a867c5392afae8e374951ef079f5f6ba Mon Sep 17 00:00:00 2001 From: Anexdeus <5142168@mail.ru> Date: Sat, 20 Dec 2025 12:29:46 +0300 Subject: [PATCH] added ProcessingInfoMixin for QwenVL series models --- vllm/model_executor/models/qwen2_vl.py | 67 +++++++++++--------------- vllm/multimodal/processing.py | 52 -------------------- 2 files changed, 28 insertions(+), 91 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 44f076e8d70f3..d530cf629f4ad 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -820,7 +820,34 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser): return super()._parse_video_data(data) -class Qwen2VLProcessingInfo(BaseProcessingInfo): +class QwenVLSeriesProcessingInfoMixin: + """ + Mixin that provides get_num_mm_encoder_tokens() + and get_num_mm_connector_tokens() methods for + QwenVL series models without affecting other multi-modal models. + """ + + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + + return num_image_tokens * merge_size**2 + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + return num_vision_tokens // merge_size**2 + + +class Qwen2VLProcessingInfo(QwenVLSeriesProcessingInfoMixin, BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2VLConfig) @@ -1017,25 +1044,6 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): image_processor=None, ) - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - - return num_image_tokens * merge_size**2 - - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - return num_vision_tokens // merge_size**2 - class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: @@ -1132,25 +1140,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]) self.info.get_hf_config().vision_config.spatial_merge_size )(hf_inputs) - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - hf_config = self.info.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - - return num_image_tokens * merge_size**2 - - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - hf_config = self.info.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - return num_vision_tokens // merge_size**2 - @MULTIMODAL_REGISTRY.register_processor( Qwen2VLMultiModalProcessor, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 99ecaf61badd2..0390773783961 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1185,32 +1185,6 @@ class BaseProcessingInfo: """ return self.ctx.get_hf_processor(**kwargs) - @abstractmethod - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - """ - Implement this function to enable LoRA support - for the tower module of the multi-modal model - - Given the number of image tokens, output the number of multi-modal encoder tokens - """ - raise NotImplementedError - - @abstractmethod - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - """ - Implement this function to enable LoRA support - for the connector module of the multi-modal model - - Given the number of vision tokens, output the number of multi-modal connector tokens - """ - raise NotImplementedError - @abstractmethod def get_supported_mm_limits(self) -> Mapping[str, int | None]: """ @@ -1415,32 +1389,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """Given the HF-processed data, output the metadata of each field.""" raise NotImplementedError - @abstractmethod - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - """ - Implement this function to enable LoRA support - for the tower module of the multi-modal model - - Given the number of image tokens, output the number of multi-modal encoder tokens - """ - raise NotImplementedError - - @abstractmethod - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - """ - Implement this function to enable LoRA support - for the connector module of the multi-modal model - - Given the number of vision tokens, output the number of multi-modal connector tokens - """ - raise NotImplementedError - @abstractmethod def _get_prompt_updates( self,