From 20402090b8501942671461d0a2cb463527aea206 Mon Sep 17 00:00:00 2001
From: bk-201
Date: Sun, 21 Dec 2025 03:34:32 +0000
Subject: [PATCH] move mm-token-functions to model

Signed-off-by: bk-201
---
 vllm/lora/model_manager.py               |  8 ++++----
 vllm/model_executor/models/interfaces.py | 12 +++++++-----
 vllm/model_executor/models/qwen2_5_vl.py |  6 +++---
 vllm/model_executor/models/qwen2_vl.py   |  4 ++--
 vllm/model_executor/models/qwen3_vl.py   |  4 ++--
 vllm/v1/worker/gpu_model_runner.py       | 13 +++++--------
 6 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py
index e2f65c2b2ce1b..4506b0a4461ec 100644
--- a/vllm/lora/model_manager.py
+++ b/vllm/lora/model_manager.py
@@ -158,7 +158,7 @@ class LoRAModelManager:
             model_config
         ).info
         self.supports_tower_connector_lora = self.supports_mm and hasattr(
-            self.mm_processor_info, "get_num_mm_encoder_tokens"
+            self.model, "get_num_mm_encoder_tokens"
         )
         if not self.supports_tower_connector_lora:
             return
@@ -177,7 +177,7 @@ class LoRAModelManager:
         limit_per_prompt: int = max(
             self.mm_processor_info.get_allowed_mm_limits().values()
         )
-        num_encoder_tokens = self.mm_processor_info.get_num_mm_encoder_tokens(
+        num_encoder_tokens = self.model.get_num_mm_encoder_tokens(
            mm_budget.get_encoder_budget()
         )
 
@@ -193,8 +193,8 @@ class LoRAModelManager:
 
         # Use wrapper for connector if present.
         if self.mm_mapping.connector:
-            if hasattr(self.mm_processor_info, "get_num_mm_connector_tokens"):
-                connector_tokens = self.mm_processor_info.get_num_mm_connector_tokens(
+            if hasattr(self.model, "get_num_mm_connector_tokens"):
+                connector_tokens = self.model.get_num_mm_connector_tokens(
                     num_encoder_tokens
                 )
                 connector_punica_wrapper = get_punica_wrapper(
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index ae119969b5846..031a9cb40e3ff 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -143,17 +143,19 @@ class SupportsMultiModal(Protocol):
 
     def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
         """
-        Implement this function to enable LoRA support
-        for the tower module of the multi-modal model
-        Given the number of image tokens, output the number of multi-modal encoder tokens
+        Implement this function to enable LoRA support
+        for the tower module of the multi-modal model.
+        Given the number of image tokens, output the number of
+        multi-modal encoder tokens.
         """
         ...
 
     def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
         """
         Implement this function to enable LoRA support
-        for the connector module of the multi-modal model
-        Given the number of vision tokens, output the number of multi-modal connector tokens
+        for the connector module of the multi-modal model.
+        Given the number of vision tokens, output the number of
+        multi-modal connector tokens.
         """
         ...
 
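For reference, the duck-typing the LoRAModelManager hunk above relies on can be sketched in isolation: a model opts in to tower/connector LoRA simply by defining the optional protocol methods, and the manager probes for them with hasattr. A minimal, self-contained sketch (class names and the merge factor are hypothetical, and the encoder-side formula is an assumed inverse of the connector math visible in the Qwen hunks below):

    class TextOnlyModel:
        pass

    class VisionModel:
        spatial_merge_size = 2  # assumed; real models read their vision config

        def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
            # Assumption: placeholder tokens expand to pre-merge patch tokens.
            return num_image_tokens * self.spatial_merge_size**2

        def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
            # Mirrors the Qwen pattern: each merge_size x merge_size group
            # of vision-tower tokens collapses into one connector token.
            return num_vision_tokens // self.spatial_merge_size**2

    for model in (TextOnlyModel(), VisionModel()):
        print(type(model).__name__, hasattr(model, "get_num_mm_encoder_tokens"))
    # TextOnlyModel False  -> tower/connector LoRA is skipped
    # VisionModel  True    -> tower/connector LoRA is enabled
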
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 998cefd33e801..1c8024cf12725 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1568,12 +1568,12 @@ class Qwen2_5_VLForConditionalGeneration(
             connector="visual.merger.",
             tower_model="visual.",
         )
-
+
     def get_num_mm_encoder_tokens(
         self,
         num_image_tokens: int,
     ) -> int:
-        hf_config = self.get_hf_config()
+        hf_config = self.config
         vision_config = hf_config.vision_config
         merge_size = vision_config.spatial_merge_size
 
@@ -1583,7 +1583,7 @@ class Qwen2_5_VLForConditionalGeneration(
         self,
         num_vision_tokens: int,
     ) -> int:
-        hf_config = self.get_hf_config()
+        hf_config = self.config
         vision_config = hf_config.vision_config
         merge_size = vision_config.spatial_merge_size
         return num_vision_tokens // merge_size**2
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index cd9ddaa532490..379e50742bb84 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1495,7 +1495,7 @@ class Qwen2VLForConditionalGeneration(
         self,
         num_image_tokens: int,
     ) -> int:
-        hf_config = self.get_hf_config()
+        hf_config = self.config
         vision_config = hf_config.vision_config
         merge_size = vision_config.spatial_merge_size
 
@@ -1505,7 +1505,7 @@ class Qwen2VLForConditionalGeneration(
         self,
         num_vision_tokens: int,
     ) -> int:
-        hf_config = self.get_hf_config()
+        hf_config = self.config
         vision_config = hf_config.vision_config
         merge_size = vision_config.spatial_merge_size
         return num_vision_tokens // merge_size**2
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index be0e5f8759d17..1daba20a95676 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -2096,7 +2096,7 @@ class Qwen3VLForConditionalGeneration(
         self,
         num_image_tokens: int,
     ) -> int:
-        hf_config = self.get_hf_config()
+        hf_config = self.config
         vision_config = hf_config.vision_config
         merge_size = vision_config.spatial_merge_size
 
@@ -2106,7 +2106,7 @@ class Qwen3VLForConditionalGeneration(
         self,
         num_vision_tokens: int,
     ) -> int:
-        hf_config = self.get_hf_config()
+        hf_config = self.config
         vision_config = hf_config.vision_config
         merge_size = vision_config.spatial_merge_size
         return num_vision_tokens // merge_size**2
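The three Qwen implementations above share one piece of arithmetic: the merger (connector) collapses each spatial_merge_size x spatial_merge_size group of vision-tower tokens into a single LLM-side token, which is why get_num_mm_connector_tokens divides by merge_size**2. A worked example (merge_size=2 matches the Qwen-VL vision configs; the encoder-side return value falls outside the diff context above, so the multiplication shown is an assumption):

    merge_size = 2  # spatial_merge_size in the Qwen-VL vision configs

    num_vision_tokens = 1024                   # tokens leaving the vision tower
    print(num_vision_tokens // merge_size**2)  # 256 tokens after the merger

    # Assumed inverse for get_num_mm_encoder_tokens (not shown in the hunks):
    num_image_tokens = 256                     # placeholder tokens in the prompt
    print(num_image_tokens * merge_size**2)    # 1024 encoder tokens
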
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d2885deece9c6..31acbe5e20538 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2160,15 +2160,12 @@ class GPUModelRunner(
         # encoder outputs.
         model = cast(SupportsMultiModal, self.model)
 
-        if self.lora_manager.supports_tower_connector_lora():
+        if self.lora_config and self.lora_manager.supports_tower_connector_lora():
             # Build LoRA mappings independently for encoder inputs
             # (encoder batch structure is different from main batch)
             prompt_lora_mapping = []
             token_lora_mapping = []
             lora_requests = set()
-            # This implementation is a bit hacky, but it's mainly to retrieve
-            # the get_num_mm_*_tokens helper functions from ProcessingInfo.
-            mm_processor_info = self.lora_manager._adapter_manager.mm_processor_info
 
             for req_id, (_, pos_info) in zip(encoder_req_ids, mm_hashes_pos):
                 req_idx = self.input_batch.req_id_to_index[req_id]
@@ -2176,7 +2173,7 @@ class GPUModelRunner(
                 # Prefer pos_info.is_embed to count actual MM embedding tokens.
                 # pos_info.length may overcount (e.g., special tokens in Qwen-VL).
                 # Fall back to length if is_embed is None.
-                num_tokens = mm_processor_info.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
+                num_tokens = self.model.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
                     pos_info.get_num_embeds
                 )
                 prompt_lora_mapping.append(lora_id)
@@ -2196,13 +2193,13 @@ class GPUModelRunner(
             )
             self.lora_manager.set_active_adapters(lora_requests, lora_mapping)
 
-            if hasattr(mm_processor_info, "get_num_mm_connector_tokens"):
+            if hasattr(self.model, "get_num_mm_connector_tokens"):
                 num_post_op_tokens = []
                 for _, pos_info in mm_hashes_pos:
-                    mm_token_count = mm_processor_info.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
+                    mm_token_count = self.model.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
                         pos_info.length
                     )
-                    post_op_count = mm_processor_info.get_num_mm_connector_tokens(  # type: ignore[attr-defined]
+                    post_op_count = self.model.get_num_mm_connector_tokens(  # type: ignore[attr-defined]
                         mm_token_count
                     )
                     num_post_op_tokens.append(post_op_count)
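To make the runner-side bookkeeping concrete: the counts returned by the model hooks determine how many encoder tokens get routed to each LoRA adapter when the encoder-batch mappings are built. A simplified sketch of that mapping logic (the request data, names, and expansion factor are illustrative, not the runner's actual structures):

    # Hypothetical encoder batch: (lora_id, num_image_placeholder_tokens)
    encoder_reqs = [(1, 64), (2, 256), (1, 16)]
    merge_size = 2

    prompt_lora_mapping: list[int] = []
    token_lora_mapping: list[int] = []

    for lora_id, num_image_tokens in encoder_reqs:
        # Assumed expansion: each placeholder maps to merge_size**2
        # pre-merge encoder tokens (see the earlier sketch).
        num_tokens = num_image_tokens * merge_size**2
        prompt_lora_mapping.append(lora_id)
        # Route every encoder token of this request to its adapter.
        token_lora_mapping.extend([lora_id] * num_tokens)

    print(len(token_lora_mapping))  # 1344 encoder tokens mapped in total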