diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index cb99d57e8b8c7..ae119969b5846 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -141,6 +141,22 @@ class SupportsMultiModal(Protocol):
         """
         ...
 
+    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
+        """
+        Implement this method to enable LoRA support for the tower
+        module of the multi-modal model. Given the number of image
+        tokens, return the number of multi-modal encoder tokens.
+        """
+        ...
+
+    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
+        """
+        Implement this method to enable LoRA support for the connector
+        module of the multi-modal model. Given the number of vision
+        tokens, return the number of multi-modal connector tokens.
+        """
+        ...
+
     @overload
     def embed_input_ids(self, input_ids: Tensor) -> Tensor: ...
 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 0799f4500a351..998cefd33e801 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1007,25 +1007,6 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
             for modality in ("image", "video")
         ]
 
-    def get_num_mm_encoder_tokens(
-        self,
-        num_image_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-
-        return num_image_tokens * merge_size**2
-
-    def get_num_mm_connector_tokens(
-        self,
-        num_vision_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-        return num_vision_tokens // merge_size**2
-
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2_5_VLMultiModalProcessor,
@@ -1587,3 +1568,22 @@ class Qwen2_5_VLForConditionalGeneration(
         connector="visual.merger.",
         tower_model="visual.",
     )
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 6e5560b945f2f..cd9ddaa532490 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1017,25 +1017,6 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
             image_processor=None,
         )
 
-    def get_num_mm_encoder_tokens(
-        self,
-        num_image_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-
-        return num_image_tokens * merge_size**2
-
-    def get_num_mm_connector_tokens(
-        self,
-        num_vision_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-        return num_vision_tokens // merge_size**2
-
 
 class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
@@ -1510,6 +1491,25 @@ class Qwen2VLForConditionalGeneration(
         tower_model="visual.",
     )
 
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
+
 
 class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor):
     pass
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 80e951257e536..be0e5f8759d17 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -2091,3 +2091,22 @@ class Qwen3VLForConditionalGeneration(
         connector=["visual.merger", "visual.deepstack_merger_list"],
         tower_model="visual.",
     )
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.config
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
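For reviewers, a minimal standalone sketch of the arithmetic the moved methods implement. The function names here are hypothetical, and merge_size = 2 assumes the spatial_merge_size shipped in the released Qwen2-VL / Qwen2.5-VL vision configs: the merger ("connector") folds each merge_size x merge_size block of vision-encoder patch tokens into one LLM-facing token, so the two helpers are inverses of each other up to integer division.

# Illustrative, self-contained sketch; names are hypothetical and
# merge_size = 2 assumes the spatial_merge_size used by released
# Qwen2-VL / Qwen2.5-VL vision configs.
MERGE_SIZE = 2


def num_mm_encoder_tokens(num_image_tokens: int, merge_size: int = MERGE_SIZE) -> int:
    # Each merged (LLM-facing) image token aggregates merge_size**2
    # patch tokens inside the vision encoder.
    return num_image_tokens * merge_size**2


def num_mm_connector_tokens(num_vision_tokens: int, merge_size: int = MERGE_SIZE) -> int:
    # Inverse mapping: connector outputs produced from encoder patch tokens.
    return num_vision_tokens // merge_size**2


# Round trip: 100 placeholder tokens -> 400 encoder tokens -> 100 connector tokens.
assert num_mm_encoder_tokens(100) == 400
assert num_mm_connector_tokens(num_mm_encoder_tokens(100)) == 100

Hosting these helpers on the model classes (rather than on the processing info) matches the SupportsMultiModal protocol methods added in interfaces.py above.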