From 36121c6db040732873b03b3c726d1b58097689e8 Mon Sep 17 00:00:00 2001
From: Anexdeus <5142168@mail.ru>
Date: Wed, 17 Dec 2025 01:31:34 +0300
Subject: [PATCH] fixed property bug in processor and added abstract methods in
 BaseProcessingInfo

---
 vllm/model_executor/models/qwen2_5_vl.py | 19 -----------
 vllm/model_executor/models/qwen2_vl.py   | 19 +++++++++++
 vllm/multimodal/processing.py            | 40 ++++++++++++++++++++++--
 3 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 0799f4500a351..02fac0b78a4b4 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1007,25 +1007,6 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
             for modality in ("image", "video")
         ]
 
-    def get_num_mm_encoder_tokens(
-        self,
-        num_image_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-
-        return num_image_tokens * merge_size**2
-
-    def get_num_mm_connector_tokens(
-        self,
-        num_vision_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-        return num_vision_tokens // merge_size**2
-
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2_5_VLMultiModalProcessor,
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 6e5560b945f2f..44f076e8d70f3 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1132,6 +1132,25 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo])
             self.info.get_hf_config().vision_config.spatial_merge_size
         )(hf_inputs)
 
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.info.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.info.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
+
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2VLMultiModalProcessor,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index f337bc9b0f7ba..99ecaf61badd2 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1185,6 +1185,32 @@ class BaseProcessingInfo:
         """
         return self.ctx.get_hf_processor(**kwargs)
 
+    @abstractmethod
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        """
+        Implement this function to enable LoRA support
+        for the tower module of the multi-modal model
+
+        Given the number of image tokens, output the number of multi-modal encoder tokens
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        """
+        Implement this function to enable LoRA support
+        for the connector module of the multi-modal model
+
+        Given the number of vision tokens, output the number of multi-modal connector tokens
+        """
+        raise NotImplementedError
+
     @abstractmethod
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         """
@@ -1394,7 +1420,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         self,
         num_image_tokens: int,
     ) -> int:
-        """Given the number of image tokens, output the number of multi-modal encoder tokens"""
+        """
+        Implement this function to enable LoRA support
+        for the tower module of the multi-modal model
+
+        Given the number of image tokens, output the number of multi-modal encoder tokens
+        """
         raise NotImplementedError
 
     @abstractmethod
@@ -1402,7 +1433,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         self,
         num_vision_tokens: int,
     ) -> int:
-        """Given the number of vision tokens, output the number of multi-modal connector tokens"""
+        """
+        Implement this function to enable LoRA support
+        for the connector module of the multi-modal model
+
+        Given the number of vision tokens, output the number of multi-modal connector tokens
+        """
         raise NotImplementedError
 
     @abstractmethod