From 36121c6db040732873b03b3c726d1b58097689e8 Mon Sep 17 00:00:00 2001
From: Anexdeus <5142168@mail.ru>
Date: Wed, 17 Dec 2025 01:31:34 +0300
Subject: [PATCH] fixed property bug in processor and added abstract methods in
 BaseProcessingInfo

---
 vllm/model_executor/models/qwen2_5_vl.py | 19 -----------
 vllm/model_executor/models/qwen2_vl.py   | 19 +++++++++++
 vllm/multimodal/processing.py            | 40 ++++++++++++++++++++++--
 3 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 0799f4500a351..02fac0b78a4b4 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1007,25 +1007,6 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
             for modality in ("image", "video")
         ]
 
-    def get_num_mm_encoder_tokens(
-        self,
-        num_image_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-
-        return num_image_tokens * merge_size**2
-
-    def get_num_mm_connector_tokens(
-        self,
-        num_vision_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-        return num_vision_tokens // merge_size**2
-
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2_5_VLMultiModalProcessor,
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 6e5560b945f2f..44f076e8d70f3 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1132,6 +1132,25 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo])
             self.info.get_hf_config().vision_config.spatial_merge_size
         )(hf_inputs)
 
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.info.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.info.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
+
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2VLMultiModalProcessor,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index f337bc9b0f7ba..99ecaf61badd2 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1185,6 +1185,32 @@ class BaseProcessingInfo:
         """
         return self.ctx.get_hf_processor(**kwargs)
 
+    @abstractmethod
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        """
+        Implement this function to enable LoRA support
+        for the tower module of the multi-modal model
+
+        Given the number of image tokens, output the number of multi-modal encoder tokens
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        """
+        Implement this function to enable LoRA support
+        for the connector module of the multi-modal model
+
+        Given the number of vision tokens, output the number of multi-modal connector tokens
+        """
+        raise NotImplementedError
+
     @abstractmethod
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         """
@@ -1394,7 +1420,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         self,
         num_image_tokens: int,
     ) -> int:
-        """Given the number of image tokens, output the number of multi-modal encoder tokens"""
+        """
+        Implement this function to enable LoRA support
+        for the tower module of the multi-modal model
+
+        Given the number of image tokens, output the number of multi-modal encoder tokens
+        """
         raise NotImplementedError
 
     @abstractmethod
@@ -1402,7 +1433,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         self,
         num_vision_tokens: int,
     ) -> int:
-        """Given the number of vision tokens, output the number of multi-modal connector tokens"""
+        """
+        Implement this function to enable LoRA support
+        for the connector module of the multi-modal model
+
+        Given the number of vision tokens, output the number of multi-modal connector tokens
+        """
         raise NotImplementedError
 
     @abstractmethod