diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py
index d9cdabd086d6f..135c5593ed698 100644
--- a/vllm/lora/model_manager.py
+++ b/vllm/lora/model_manager.py
@@ -154,9 +154,8 @@ class LoRAModelManager:
                 self.punica_wrapper_mapping[lm_prefix] = llm_punica_wrapper
 
         if self.lora_config.enable_tower_connector_lora:
-            self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info
             self.supports_tower_connector_lora = self.supports_mm and hasattr(
-                self.info, "get_num_mm_encoder_tokens"
+                self.model, "get_num_mm_encoder_tokens"
             )
             if not self.supports_tower_connector_lora:
                 return
@@ -172,8 +171,8 @@ class LoRAModelManager:
                 vllm_config.scheduler_config,
                 MULTIMODAL_REGISTRY,
             )
-            limit_per_prompt: int = max(self.info.get_allowed_mm_limits().values())
-            num_encoder_tokens = self.info.get_num_mm_encoder_tokens(
+            limit_per_prompt: int = max(self.model.get_allowed_mm_limits().values())
+            num_encoder_tokens = self.model.get_num_mm_encoder_tokens(
                 mm_budget.get_encoder_budget()
             )
 
@@ -189,8 +188,8 @@ class LoRAModelManager:
 
         # Use wrapper for connector if present.
         if self.mm_mapping.connector:
-            if hasattr(self.info, "get_num_mm_connector_tokens"):
-                connector_tokens = self.info.get_num_mm_connector_tokens(
+            if hasattr(self.model, "get_num_mm_connector_tokens"):
+                connector_tokens = self.model.get_num_mm_connector_tokens(
                     num_encoder_tokens
                 )
                 connector_punica_wrapper = get_punica_wrapper(
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index cb99d57e8b8c7..ae119969b5846 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -141,6 +141,22 @@ class SupportsMultiModal(Protocol):
         """
         ...
 
+    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
+        """
+        Implement this function to enable LoRA support
+        for the tower module of the multi-modal model.
+        Given the number of image tokens, return the number of multi-modal encoder tokens.
+        """
+        ...
+
+    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
+        """
+        Implement this function to enable LoRA support
+        for the connector module of the multi-modal model.
+        Given the number of vision tokens, return the number of multi-modal connector tokens.
+        """
+        ...
+
     @overload
     def embed_input_ids(self, input_ids: Tensor) -> Tensor: ...
 
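Note for implementers: both hooks are now optional methods on the model class itself (via `SupportsMultiModal`), so tower/connector LoRA is gated purely by `hasattr` checks on the model. Below is a minimal sketch of a conforming class; the merge factor of 4 is made up for illustration and does not come from any real vision config.

```python
# Hedged sketch of a class satisfying the two new optional hooks.
# MERGE_FACTOR is illustrative; real models derive it from their
# vision config (e.g. spatial_merge_size**2 in the Qwen-VL family).
class ToyVLModel:
    MERGE_FACTOR = 4  # hypothetical patches per merged embedding

    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
        # The vision tower operates on pre-merge patch tokens.
        return num_image_tokens * self.MERGE_FACTOR

    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
        # The connector emits post-merge tokens fed to the LLM.
        return num_vision_tokens // self.MERGE_FACTOR


assert ToyVLModel().get_num_mm_encoder_tokens(256) == 1024
assert ToyVLModel().get_num_mm_connector_tokens(1024) == 256
```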
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 02fac0b78a4b4..9d42ace2c8e8e 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1568,3 +1568,39 @@ class Qwen2_5_VLForConditionalGeneration(
             connector="visual.merger.",
             tower_model="visual.",
         )
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
+
+    def get_allowed_mm_limits(self) -> Mapping[str, int]:
+        """Return the maximum allowed number of items for each modality."""
+        supported_mm_limits = self.get_supported_mm_limits()
+        mm_config = self.ctx.get_mm_config()
+
+        allowed_limits = dict[str, int]()
+        for modality, supported_limit in supported_mm_limits.items():
+            user_limit = mm_config.get_limit_per_prompt(modality)
+
+            allowed_limits[modality] = (
+                user_limit
+                if supported_limit is None
+                else min(user_limit, supported_limit)
+            )
+
+        return allowed_limits
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 13014bebb1054..a13859a2a71c3 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1104,25 +1104,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo])
             for modality in ("image", "video")
         ]
 
-    def get_num_mm_encoder_tokens(
-        self,
-        num_image_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-
-        return num_image_tokens * merge_size**2
-
-    def get_num_mm_connector_tokens(
-        self,
-        num_vision_tokens: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        vision_config = hf_config.vision_config
-        merge_size = vision_config.spatial_merge_size
-        return num_vision_tokens // merge_size**2
-
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -1510,6 +1491,42 @@ class Qwen2VLForConditionalGeneration(
             tower_model="visual.",
         )
 
+    def get_allowed_mm_limits(self) -> Mapping[str, int]:
+        """Return the maximum allowed number of items for each modality."""
+        supported_mm_limits = self.get_supported_mm_limits()
+        mm_config = self.ctx.get_mm_config()
+
+        allowed_limits = dict[str, int]()
+        for modality, supported_limit in supported_mm_limits.items():
+            user_limit = mm_config.get_limit_per_prompt(modality)
+
+            allowed_limits[modality] = (
+                user_limit
+                if supported_limit is None
+                else min(user_limit, supported_limit)
+            )
+
+        return allowed_limits
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
+
 
 class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor):
     pass
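For context on the arithmetic in the Qwen-VL implementations above: `spatial_merge_size` is 2 in the released Qwen2-VL-family configs (an assumption here, not stated in the diff), so the two hooks are inverse scalings by `merge_size**2`:

```python
# Worked example of the Qwen-VL token accounting, assuming
# spatial_merge_size == 2 (typical for released Qwen2-VL configs).
merge_size = 2

def num_encoder_tokens(num_image_tokens: int) -> int:
    # LLM-side image placeholder tokens -> patch tokens seen by the tower.
    return num_image_tokens * merge_size**2

def num_connector_tokens(num_vision_tokens: int) -> int:
    # Tower patch tokens -> merged tokens produced by the connector.
    return num_vision_tokens // merge_size**2

# 100 placeholders -> 400 tower tokens -> back to 100 connector outputs.
assert num_connector_tokens(num_encoder_tokens(100)) == 100
```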
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 80e951257e536..18c0fd68afdc4 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -2091,3 +2091,39 @@ class Qwen3VLForConditionalGeneration(
             connector=["visual.merger", "visual.deepstack_merger_list"],
             tower_model="visual.",
         )
+
+    def get_num_mm_encoder_tokens(
+        self,
+        num_image_tokens: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        return num_image_tokens * merge_size**2
+
+    def get_num_mm_connector_tokens(
+        self,
+        num_vision_tokens: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        merge_size = vision_config.spatial_merge_size
+        return num_vision_tokens // merge_size**2
+
+    def get_allowed_mm_limits(self) -> Mapping[str, int]:
+        """Return the maximum allowed number of items for each modality."""
+        supported_mm_limits = self.get_supported_mm_limits()
+        mm_config = self.ctx.get_mm_config()
+
+        allowed_limits = dict[str, int]()
+        for modality, supported_limit in supported_mm_limits.items():
+            user_limit = mm_config.get_limit_per_prompt(modality)
+
+            allowed_limits[modality] = (
+                user_limit
+                if supported_limit is None
+                else min(user_limit, supported_limit)
+            )
+
+        return allowed_limits
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 51143822fe0bb..3bbdab3b393c5 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1420,28 +1420,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         """
         raise NotImplementedError
 
-    def get_num_mm_encoder_tokens(
-        self,
-        num_image_tokens: int,
-    ) -> int:
-        """
-        Implement this function to enable LoRA support
-        for the tower module of the multi-modal model
-        Given the number of image tokens, output the number of multi-modal encoder tokens
-        """
-        raise NotImplementedError
-
-    def get_num_mm_connector_tokens(
-        self,
-        num_vision_tokens: int,
-    ) -> int:
-        """
-        Implement this function to enable LoRA support
-        for the connector module of the multi-modal model
-        Given the number of vision tokens, output the number of multi-modal connector tokens
-        """
-        raise NotImplementedError
-
     def _bind_and_group_updates(
         self,
         prompt_updates: Sequence[PromptUpdate],
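The `get_allowed_mm_limits` body copied into each of the three model classes above clamps the user-configured per-prompt limit to the model's supported limit, treating a supported limit of `None` as unbounded. A standalone restatement of that logic, with plain dicts standing in for the vLLM config objects:

```python
# Standalone restatement of the copied get_allowed_mm_limits() logic.
# Plain dicts stand in for the model/config objects used in vLLM.
def allowed_mm_limits(
    supported: dict[str, int | None],  # model caps; None means unbounded
    user_limits: dict[str, int],       # user-configured per-prompt limits
) -> dict[str, int]:
    allowed: dict[str, int] = {}
    for modality, cap in supported.items():
        user_limit = user_limits[modality]
        allowed[modality] = user_limit if cap is None else min(user_limit, cap)
    return allowed

print(allowed_mm_limits({"image": None, "video": 1}, {"image": 5, "video": 2}))
# -> {'image': 5, 'video': 1}
```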
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index f4ed37a7b6771..a3e64e89ff60c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -593,9 +593,9 @@ class GPUModelRunner(
         # Multimodal LoRA support
         self.enable_tower_connector_lora = False
         if self.supports_mm_inputs and self.lora_config:
-            self.info = self.mm_registry.create_processor(self.model_config).info
+            self.mm_model_cls = self.mm_registry._get_model_cls(model_config)
            self.enable_tower_connector_lora = (
-                hasattr(self.info, "get_num_mm_encoder_tokens")
+                hasattr(self.mm_model_cls, "get_num_mm_encoder_tokens")
                 and self.lora_config.enable_tower_connector_lora
             )
 
@@ -2183,7 +2183,7 @@ class GPUModelRunner(
                 # Prefer pos_info.is_embed to count actual MM embedding tokens.
                 # pos_info.length may overcount (e.g., special tokens in Qwen-VL).
                 # Fall back to length if is_embed is None.
-                num_tokens = self.info.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
+                num_tokens = model.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
                     pos_info.get_num_embeds
                 )
                 prompt_lora_mapping.append(lora_id)
@@ -2202,13 +2202,13 @@ class GPUModelRunner(
             )
             self.lora_manager.set_active_adapters(lora_requests, lora_mapping)
 
-            if hasattr(self.info, "get_num_mm_connector_tokens"):
+            if hasattr(model, "get_num_mm_connector_tokens"):
                 num_post_op_tokens = []
                 for _, pos_info in mm_hashes_pos:
-                    mm_token_count = self.info.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
+                    mm_token_count = model.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
                         pos_info.length
                     )
-                    post_op_count = self.info.get_num_mm_connector_tokens(  # type: ignore[attr-defined]
+                    post_op_count = model.get_num_mm_connector_tokens(  # type: ignore[attr-defined]
                         mm_token_count
                     )
                     num_post_op_tokens.append(post_op_count)
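Taken together, the runner-side change means per-item connector ("post-op") token counts are derived directly from the loaded model, with no processor instantiation. A simplified sketch of that loop, using a hypothetical stand-in for the placeholder metadata (the real object carries more fields, e.g. `is_embed` and offsets):

```python
from dataclasses import dataclass

# Hypothetical stand-in for the scheduler's placeholder metadata.
@dataclass
class PosInfo:
    length: int

def post_op_token_counts(model, placeholders: list[PosInfo]) -> list[int]:
    """Mirror of the hunk above: encoder count -> connector count per item."""
    counts: list[int] = []
    for pos_info in placeholders:
        encoder_tokens = model.get_num_mm_encoder_tokens(pos_info.length)
        counts.append(model.get_num_mm_connector_tokens(encoder_tokens))
    return counts

# e.g. with the ToyVLModel sketch from earlier:
# post_op_token_counts(ToyVLModel(), [PosInfo(100)]) -> [100]
```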