From b03d1a04a867c5392afae8e374951ef079f5f6ba Mon Sep 17 00:00:00 2001
From: Anexdeus <5142168@mail.ru>
Date: Sat, 20 Dec 2025 12:29:46 +0300
Subject: [PATCH] Add QwenVLSeriesProcessingInfoMixin for QwenVL series models

---
 vllm/model_executor/models/qwen2_vl.py | 67 +++++++++++---------------
 vllm/multimodal/processing.py          | 52 --------------------
 2 files changed, 28 insertions(+), 91 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 44f076e8d70f3..d530cf629f4ad 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -820,7 +820,34 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser):
         return super()._parse_video_data(data)
 
 
-class Qwen2VLProcessingInfo(BaseProcessingInfo):
+class QwenVLSeriesProcessingInfoMixin:
+    """Token-count helpers for LoRA on QwenVL-series tower/connector modules.
+
+    A mixin, so other multi-modal models are unaffected. The host class must
+    supply get_hf_config() whose vision_config has spatial_merge_size.
+    """
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        # Image tokens -> encoder tokens: scale up by merge_size squared.
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2  # floor division, remainder dropped
+
+
+class Qwen2VLProcessingInfo(QwenVLSeriesProcessingInfoMixin, BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2VLConfig)
 
@@ -1017,25 +1044,6 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
             image_processor=None,
         )
 
-    def get_num_mm_encoder_tokens(
-        self,
-        num_image_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-
-        return num_image_tokens * merge_size**2
-
-    def get_num_mm_connector_tokens(
-        self,
-        num_vision_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-        return num_vision_tokens // merge_size**2
-
 
 class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
@@ -1132,25 +1140,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo])
             self.info.get_hf_config().vision_config.spatial_merge_size
         )(hf_inputs)
 
-    def get_num_mm_encoder_tokens(
-        self,
-        num_image_tokens: int,
-    ) -> int:
-        hf_config = self.info.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-
-        return num_image_tokens * merge_size**2
-
-    def get_num_mm_connector_tokens(
-        self,
-        num_vision_tokens: int,
-    ) -> int:
-        hf_config = self.info.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-        return num_vision_tokens // merge_size**2
-
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2VLMultiModalProcessor,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 99ecaf61badd2..0390773783961 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1185,32 +1185,6 @@ class BaseProcessingInfo:
         """
         return self.ctx.get_hf_processor(**kwargs)
 
-    @abstractmethod
-    def get_num_mm_encoder_tokens(
-        self,
-        num_image_tokens: int,
-    ) -> int:
-        """
-        Implement this function to enable LoRA support 
-        for the tower module of the multi-modal model
-
-        Given the number of image tokens, output the number of multi-modal encoder tokens
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_num_mm_connector_tokens(
-        self,
-        num_vision_tokens: int,
-    ) -> int:
-        """
-        Implement this function to enable LoRA support
-        for the connector module of the multi-modal model
-
-        Given the number of vision tokens, output the number of multi-modal connector tokens
-        """
-        raise NotImplementedError
-
     @abstractmethod
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         """
@@ -1415,32 +1389,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         """Given the HF-processed data, output the metadata of each field."""
         raise NotImplementedError
 
-    @abstractmethod
-    def get_num_mm_encoder_tokens(
-        self,
-        num_image_tokens: int,
-    ) -> int:
-        """
-        Implement this function to enable LoRA support 
-        for the tower module of the multi-modal model
-
-        Given the number of image tokens, output the number of multi-modal encoder tokens
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_num_mm_connector_tokens(
-        self,
-        num_vision_tokens: int,
-    ) -> int:
-        """
-        Implement this function to enable LoRA support
-        for the connector module of the multi-modal model
-
-        Given the number of vision tokens, output the number of multi-modal connector tokens
-        """
-        raise NotImplementedError
-
     @abstractmethod
     def _get_prompt_updates(
         self,