From e7026a7c50f4049ac0e071a3af2d4d991ec1fabf Mon Sep 17 00:00:00 2001 From: bk-201 Date: Thu, 22 May 2025 00:31:56 +0800 Subject: [PATCH 01/53] add mm_punica_warpper Signed-off-by: bk-201 --- requirements/test.txt | 22 ++++- vllm/lora/layers.py | 8 ++ vllm/lora/models.py | 97 +++++++++++++++++++---- vllm/lora/worker_manager.py | 4 +- vllm/model_executor/models/idefics3.py | 9 +++ vllm/model_executor/models/qwen2_vl.py | 10 +++ vllm/multimodal/profiling.py | 4 + vllm/v1/worker/gpu_model_runner.py | 55 ++++++++++--- vllm/v1/worker/lora_model_runner_mixin.py | 27 ++++--- 9 files changed, 200 insertions(+), 36 deletions(-) diff --git a/requirements/test.txt b/requirements/test.txt index 89d477017342e..df3770856022f 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -27,6 +27,10 @@ argcomplete==3.5.1 # via datamodel-code-generator arrow==1.3.0 # via isoduration +async-timeout==5.0.1 + # via + # aiohttp + # redis attrs==24.2.0 # via # aiohttp @@ -129,6 +133,11 @@ eval-type-backport==0.2.2 # via mteb evaluate==0.4.3 # via lm-eval +exceptiongroup==1.3.0 + # via + # anyio + # hypothesis + # pytest fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -640,7 +649,6 @@ setuptools==77.0.3 # via # mamba-ssm # pytablewriter - # torch # triton shellingham==1.5.4 # via typer @@ -700,8 +708,13 @@ tokenizers==0.21.1 # via # -r requirements/test.in # transformers +toml==0.10.2 + # via datamodel-code-generator tomli==2.2.1 - # via schemathesis + # via + # black + # pytest + # schemathesis tomli-w==1.2.0 # via schemathesis torch==2.7.0+cu128 @@ -775,13 +788,18 @@ types-python-dateutil==2.9.0.20241206 # via arrow typing-extensions==4.12.2 # via + # anyio + # black + # exceptiongroup # huggingface-hub # librosa # mistral-common # mteb + # multidict # pqdm # pydantic # pydantic-core + # rich # torch # typer tzdata==2024.2 diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 023c8e9c9a864..4a1f860b3bbb8 100644 --- a/vllm/lora/layers.py +++ 
b/vllm/lora/layers.py @@ -77,6 +77,7 @@ def _not_fully_sharded_can_replace(can_replace): @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False + is_mm_input: bool = False class BaseLayerWithLoRA(nn.Module): @@ -410,6 +411,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + # Store original shape for later reshaping + original_shape = output.shape if output.ndim == 3 else None + # In transformers backend, x and output have extra batch dimension like # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim), # therefore we need to flatten the batch dimensions. @@ -424,6 +428,10 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): if not current_platform.can_update_inplace(): output = lora_output + # Restore original shape if it was flattened + if original_shape is not None: + output = output.reshape(original_shape) + return output @property diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 959fe4a672a6d..9556579ca5a6c 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -17,14 +17,14 @@ from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel, from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter, get_adapter, list_adapters, remove_adapter, set_adapter_mapping) -from vllm.config import LoRAConfig +from vllm.config import LoRAConfig, ModelConfig from vllm.logger import init_logger from vllm.lora.layers import (BaseLayerWithLoRA, LinearScalingRotaryEmbeddingWithLoRA, LoRAMapping) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.peft_helper import PEFTHelper -from vllm.lora.punica_wrapper import get_punica_wrapper +from vllm.lora.punica_wrapper import PunicaWrapperBase, get_punica_wrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, get_supported_lora_modules, is_regex_target_modules, @@ -33,6 +33,7 @@ 
from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.interfaces import is_pooling_model from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -311,6 +312,7 @@ class LoRAModelManager(AdapterModelManager): max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, + model_config: Optional[ModelConfig], device: torch.device, ): """Create a LoRAModelManager and adapter for a given model. @@ -357,6 +359,30 @@ class LoRAModelManager(AdapterModelManager): # In case the model only supports LoRA for # text modules (e.g. ChatGLM) and hasattr(self.model, "get_mm_mapping")) + # For v0 compatibility + if model_config is not None: + self.mm_registry = MULTIMODAL_REGISTRY + self.info = self.mm_registry.create_processor( + model_config, disable_cache=True).info + self.supports_mm_lora = self.supports_mm and hasattr( + self.info, "get_num_mm_encoder_tokens") + else: + self.supports_mm_lora = False + if self.supports_mm_lora: + self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() + self.mm_punica_wrapper_mapping = { + name: + get_punica_wrapper( + self.info.get_num_mm_encoder_tokens( + max_num_batched_tokens), + max_batches=self.max_num_seqs, # TODO + device=self.device, + max_loras=self.lora_config.max_loras, + ) + for name in self.mm_mapping.tower_model + } + self.mm_punica_wrapper_mapping[ + self.mm_mapping.language_model[0]] = self.punica_wrapper self.is_pooling_model = is_pooling_model(self.model) self.packed_modules: dict[str, list[str]] = {} self.modules: dict[str, BaseLayerWithLoRA] = {} @@ -452,14 +478,35 @@ class LoRAModelManager(AdapterModelManager): def _set_adapter_mapping(self, mapping: LoRAMapping) -> None: # update lora states - self.punica_wrapper.update_metadata( - mapping, - 
self.lora_index_to_id, - self.lora_slots + 1, - self.vocab_size, - self.lora_config.lora_extra_vocab_size, - self.long_lora_context, - ) + if not self.supports_mm_lora: + self.punica_wrapper.update_metadata( + mapping, + self.lora_index_to_id, + self.lora_slots + 1, + self.vocab_size, + self.lora_config.lora_extra_vocab_size, + self.long_lora_context, + ) + elif mapping.is_mm_input: + self.mm_punica_wrapper_mapping[ + self.mm_mapping.tower_model[0]].update_metadata( + mapping, + self.lora_index_to_id, + self.lora_slots + 1, + self.vocab_size, + self.lora_config.lora_extra_vocab_size, + self.long_lora_context, + ) + else: + self.mm_punica_wrapper_mapping[ + self.mm_mapping.language_model[0]].update_metadata( + mapping, + self.lora_index_to_id, + self.lora_slots + 1, + self.vocab_size, + self.lora_config.lora_extra_vocab_size, + self.long_lora_context, + ) def remove_all_adapters(self): """Remove all LoRAModels from the manager.""" @@ -476,7 +523,9 @@ class LoRAModelManager(AdapterModelManager): continue # A temporary approach for multimodal models to support LoRA # TODO: Remove this restriction - if self._filter_unsupported_mm_module(module_name): + if (self._filter_unsupported_mm_module(module_name) + and not self.supports_mm_lora + or self._get_mm_punica_wrapper(module_name) is None): logger.warning( "Regarding multimodal models, vLLM currently only supports " "adding LoRA to language model, %s will be ignored.", @@ -519,7 +568,11 @@ class LoRAModelManager(AdapterModelManager): self.register_module(module_name, new_module) self._register_packed_modules(module_name) # All lora layers share the same punica_wrapper based on reference. 
- new_module.set_mapping(self.punica_wrapper) + if self.supports_mm_lora: + new_module.set_mapping( + self._get_mm_punica_wrapper(module_name)) + else: + new_module.set_mapping(self.punica_wrapper) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA) @@ -615,6 +668,19 @@ class LoRAModelManager(AdapterModelManager): [module_name.startswith(prefix) for prefix in prefix_lst]) return False + def _get_mm_punica_wrapper(self, module_name: str) -> PunicaWrapperBase: + """ + TODO + """ + if self.supports_mm_lora: + for ( + prefix, + punica_wrapper, + ) in self.mm_punica_wrapper_mapping.items(): + if module_name.startswith(prefix): + return punica_wrapper + return None + def _register_packed_modules(self, module_full_name: str) -> None: parts = module_full_name.split(".") module_name = parts[-1] @@ -713,9 +779,10 @@ class LRUCacheLoRAModelManager(LoRAModelManager): def __init__(self, model: nn.Module, max_num_seqs: int, max_num_batched_tokens: int, vocab_size: int, - lora_config: LoRAConfig, device: torch.device): + lora_config: LoRAConfig, model_config: ModelConfig, + device: torch.device): super().__init__(model, max_num_seqs, max_num_batched_tokens, - vocab_size, lora_config, device) + vocab_size, lora_config, model_config, device) self._registered_adapters: LoRALRUCache = LoRALRUCache( self.capacity, self.deactivate_adapter) self._active_adapters: LoRALRUCache = LoRALRUCache( @@ -785,6 +852,7 @@ def create_lora_manager( max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, + model_config: ModelConfig, device: torch.device, lora_manager_cls: type[LoRAModelManager] = LoRAModelManager, **kwargs) -> LoRAModelManager: @@ -797,6 +865,7 @@ def create_lora_manager( max_num_batched_tokens=max_num_batched_tokens, vocab_size=vocab_size, lora_config=lora_config, + model_config=model_config, device=device, **kwargs) return lora_manager diff --git a/vllm/lora/worker_manager.py 
b/vllm/lora/worker_manager.py index 8e5bc61066593..016de3cbc0f25 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -10,7 +10,7 @@ from vllm.adapter_commons.utils import (add_adapter_worker, list_adapters_worker, set_active_adapters_worker) from vllm.adapter_commons.worker_manager import AbstractWorkerManager -from vllm.config import LoRAConfig +from vllm.config import LoRAConfig, ModelConfig from vllm.logger import init_logger from vllm.lora.models import (LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager, create_lora_manager) @@ -200,6 +200,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): def create_lora_manager( self, model: torch.nn.Module, + model_config: Optional[ModelConfig] = None, ) -> Any: lora_manager = create_lora_manager( model, @@ -209,6 +210,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): lora_config=self.lora_config, device=self.device, max_num_batched_tokens=self.max_num_batched_tokens, + model_config=model_config, ) self._adapter_manager = lora_manager return lora_manager.model diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index fdb128ef5b541..96602848dd53f 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -279,6 +279,15 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): height=image_processor.size["longest_edge"], ) + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + scale_factor = hf_config.scale_factor + + return num_image_tokens * scale_factor**2 + class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo] ): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 0ff0836b08975..5da8798620585 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -962,6 +962,16 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): 
image_processor=None, ) + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + + return num_image_tokens * merge_size**2 + class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]): diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index b5875124c1266..8e6e53413d923 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +import copy from abc import ABC from collections.abc import Mapping from dataclasses import dataclass, field @@ -44,6 +45,7 @@ class DummyDecoderData(NamedTuple): prompt_token_ids: list[int] multi_modal_data: MultiModalKwargs multi_modal_placeholders: MultiModalPlaceholderDict + multi_modal_token_ids: list[int] _I = TypeVar("_I", bound=BaseProcessingInfo) @@ -249,6 +251,7 @@ class MultiModalProfiler(Generic[_I]): str(self._get_mm_num_tokens(mm_inputs)), ) + multi_modal_token_ids = copy.deepcopy(prompt_token_ids) if total_len < seq_len: prompt_token_ids.extend([0] * (seq_len - total_len)) @@ -256,6 +259,7 @@ class MultiModalProfiler(Generic[_I]): prompt_token_ids=prompt_token_ids, multi_modal_data=mm_inputs["mm_kwargs"], multi_modal_placeholders=mm_inputs["mm_placeholders"], + multi_modal_token_ids=multi_modal_token_ids, ) def get_mm_max_tokens( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 201796c96ee5c..98d76d6afe68d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -263,6 +263,15 @@ class GPUModelRunner(LoRAModelRunnerMixin): pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() + # Multimodal LoRA support + if self.is_multimodal_model: + self.info = self.mm_registry.create_processor( + self.model_config, disable_cache=True).info + self.supports_mm_lora = hasattr(self.info, + 
"get_num_mm_encoder_tokens") + else: + self.supports_mm_lora = False + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool: """ Update the order of requests in the batch based on the attention @@ -892,12 +901,14 @@ class GPUModelRunner(LoRAModelRunnerMixin): return # Batch the multi-modal inputs. + mm_tokens = list[int]() mm_inputs = list[MultiModalKwargs]() req_ids_pos = list[tuple[str, int, PlaceholderRange]]() for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): req_state = self.requests[req_id] for mm_input_id in encoder_input_ids: + mm_tokens.append(req_state.mm_positions[mm_input_id].length) mm_inputs.append(req_state.mm_inputs[mm_input_id]) req_ids_pos.append( (req_id, mm_input_id, req_state.mm_positions[mm_input_id])) @@ -911,6 +922,16 @@ class GPUModelRunner(LoRAModelRunnerMixin): # encoder outputs. grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs) + if self.lora_config and self.supports_mm_lora: + mm_tokens = [ + self.info.get_num_mm_encoder_tokens(num_token) + for num_token in mm_tokens + ] + num_scheduled_tokens = np.array(mm_tokens, dtype=np.int32) + self.set_active_loras(self.input_batch, + num_scheduled_tokens, + is_mm_input=True) + encoder_outputs = [] for grouped_mm_inputs in grouped_mm_inputs_list: batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) @@ -1826,22 +1847,38 @@ class GPUModelRunner(LoRAModelRunnerMixin): encoder_budget, max_num_mm_items, dummy_data_modality) # Create dummy batch of multimodal inputs. 
- dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data( + dummy_mm_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, - mm_counts={ - dummy_data_modality: 1 - }, - ).multi_modal_data + mm_counts={dummy_data_modality: 1}, + ) + dummy_mm_kwargs = dummy_mm_data.multi_modal_data + dummy_mm_token_ids = dummy_mm_data.multi_modal_token_ids + max_num_mm_items = 1 # temporary batched_dummy_mm_inputs = MultiModalKwargs.batch( - [dummy_mm_kwargs] * max_num_mm_items) + [dummy_mm_kwargs] * max_num_mm_items) # ??? batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device) - # Run multimodal encoder. - dummy_encoder_outputs = self.model.get_multimodal_embeddings( - **batched_dummy_mm_inputs) + if self.supports_mm_lora: + num_scheduled_tokens_list = [ + self.info.get_num_mm_encoder_tokens( + len(dummy_mm_token_ids)) + ] * max_num_mm_items + num_scheduled_tokens = np.array(num_scheduled_tokens_list, + dtype=np.int32) + lora_config = self.lora_config + else: + num_scheduled_tokens = None + lora_config = None + + with self.maybe_dummy_run_with_lora(lora_config, + num_scheduled_tokens, + is_mm_input=True): + # Run multimodal encoder. 
+ dummy_encoder_outputs = self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) sanity_check_mm_encoder_outputs( dummy_encoder_outputs, diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 3cbab840e9693..41a795d91b6f0 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -50,11 +50,13 @@ class LoRAModelRunnerMixin: model.embedding_padding_modules, max_position_embeddings=text_config.max_position_embeddings, ) - return self.lora_manager.create_lora_manager(model) + return self.lora_manager.create_lora_manager(model, model_config) - def _set_active_loras(self, prompt_lora_mapping: tuple[int, ...], + def _set_active_loras(self, + prompt_lora_mapping: tuple[int, ...], token_lora_mapping: tuple[int, ...], - lora_requests: set[LoRARequest]) -> None: + lora_requests: set[LoRARequest], + is_mm_input: bool = False) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -64,11 +66,14 @@ class LoRAModelRunnerMixin: # decode and this flag is generally ignored. lora_mapping = LoRAMapping(token_lora_mapping, prompt_lora_mapping, - is_prefill=True) + is_prefill=True, + is_mm_input=is_mm_input) self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - def set_active_loras(self, input_batch: InputBatch, - num_scheduled_tokens: np.ndarray) -> None: + def set_active_loras(self, + input_batch: InputBatch, + num_scheduled_tokens: np.ndarray, + is_mm_input: bool = False) -> None: prompt_lora_mapping: tuple[int, ...] 
# of size input_batch.num_reqs token_lora_mapping: tuple[int, @@ -77,11 +82,13 @@ class LoRAModelRunnerMixin: prompt_lora_mapping, token_lora_mapping, lora_requests = \ input_batch.make_lora_inputs(num_scheduled_tokens) return self._set_active_loras(prompt_lora_mapping, token_lora_mapping, - lora_requests) + lora_requests, is_mm_input) @contextmanager - def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig, - num_scheduled_tokens: np.ndarray): + def maybe_dummy_run_with_lora(self, + lora_config: LoRAConfig, + num_scheduled_tokens: np.ndarray, + is_mm_input: bool = False): if lora_config is None: yield else: @@ -117,7 +124,7 @@ class LoRAModelRunnerMixin: self._set_active_loras(tuple(prompt_lora_mapping), tuple(token_lora_mapping), - lora_requests) + lora_requests, is_mm_input) yield From 7db0d5990aa644d1d022ec9ea2079bf2109fda4e Mon Sep 17 00:00:00 2001 From: bk-201 Date: Wed, 9 Jul 2025 13:55:45 +0800 Subject: [PATCH 02/53] update mm filter Signed-off-by: bk-201 --- vllm/lora/models.py | 35 ++++++++++++++++------- vllm/v1/worker/lora_model_runner_mixin.py | 5 ++-- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b9e6b3af3309d..2d65008074e01 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -386,19 +386,33 @@ class LoRAModelManager(AdapterModelManager): self.supports_mm_lora = False if self.supports_mm_lora: self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() + self.mm_config = model_config.multimodal_config + limit_per_prompt: int = max( + [1] + \ + list(self.mm_config.limit_per_prompt.values()) + ) + + # For vision tower self.mm_punica_wrapper_mapping = { name: get_punica_wrapper( self.info.get_num_mm_encoder_tokens( max_num_batched_tokens), - max_batches=self.max_num_seqs, # TODO + max_batches=self.max_num_seqs * limit_per_prompt, device=self.device, max_loras=self.lora_config.max_loras, ) for name in self.mm_mapping.tower_model } - self.mm_punica_wrapper_mapping[ - 
self.mm_mapping.language_model[0]] = self.punica_wrapper + # For language model + self.mm_punica_wrapper_mapping.update( + { + self.mm_mapping.language_model[0]: self.punica_wrapper + } + ) + # For other + # TODO + self.is_pooling_model = is_pooling_model(self.model) self.packed_modules: dict[str, list[str]] = {} self.modules: dict[str, BaseLayerWithLoRA] = {} @@ -539,9 +553,7 @@ class LoRAModelManager(AdapterModelManager): continue # A temporary approach for multimodal models to support LoRA # TODO: Remove this restriction - if (self._filter_unsupported_mm_module(module_name) - and not self.supports_mm_lora - or self._get_mm_punica_wrapper(module_name) is None): + if self._filter_unsupported_mm_module(module_name): logger.warning( "Regarding multimodal models, vLLM currently only supports " "adding LoRA to language model, %s will be ignored.", @@ -678,10 +690,13 @@ class LoRAModelManager(AdapterModelManager): be filtered out. """ if self.supports_mm: - module_mapping: MultiModelKeys = self.model.get_mm_mapping() - prefix_lst = module_mapping.connector + module_mapping.tower_model - return any( - [module_name.startswith(prefix) for prefix in prefix_lst]) + prefix_lst = self.mm_mapping.connector + self.mm_mapping.tower_model + if self.supports_mm_lora: + + return self._get_mm_punica_wrapper(module_name) is None + else: + return any( + [module_name.startswith(prefix) for prefix in prefix_lst]) return False def _get_mm_punica_wrapper(self, module_name: str) -> PunicaWrapperBase: diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index e9a2ac4792e81..0634e274717e0 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -157,10 +157,11 @@ class LoRAModelRunnerMixin: @contextmanager def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig, - num_scheduled_tokens: np.ndarray): + num_scheduled_tokens: np.ndarray, + is_mm_input: bool = False): with 
self.maybe_setup_dummy_loras( lora_config), self.maybe_select_dummy_loras( - lora_config, num_scheduled_tokens): + lora_config, num_scheduled_tokens, is_mm_input): yield def add_lora(self, lora_request: LoRARequest) -> bool: From 891df1db6fae9e98e822ee2234f1ce3222a11e9a Mon Sep 17 00:00:00 2001 From: bk-201 Date: Sat, 11 Oct 2025 02:07:28 +0000 Subject: [PATCH 03/53] update Signed-off-by: bk-201 --- vllm/lora/models.py | 34 ++++++++++++++++++------------ vllm/v1/worker/gpu_model_runner.py | 4 +++- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 46cce598121cd..6afc73d2b04e5 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -385,15 +385,16 @@ class LoRAModelManager(AdapterModelManager): self.info, "get_num_mm_encoder_tokens") else: self.supports_mm_lora = False - if self.supports_mm_lora: + if self.supports_mm_lora: # 从init传进来就可以了,不需要model_config了 self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() self.mm_config = model_config.multimodal_config - limit_per_prompt: int = max( - [1] + \ - list(self.mm_config.limit_per_prompt.values()) - ) + # limit_per_prompt: int = max( + # self.info.get_allowed_mm_limits().values()) + limit_per_prompt = 5 # For vision tower + # max_num_batched_tokens = encoder_budget + # max_batches = max_batches * limit_per_prompt self.mm_punica_wrapper_mapping = { name: get_punica_wrapper( @@ -411,8 +412,13 @@ class LoRAModelManager(AdapterModelManager): self.mm_mapping.language_model[0]: self.punica_wrapper } ) - # For other - # TODO + # TODO Connector is not supported at the moment. 
+ self.mm_punica_wrapper_mapping.update( + { + name: None + for name in self.mm_mapping.connector + } + ) self.is_pooling_model = is_pooling_model(self.model) self.packed_modules: dict[str, list[str]] = {} @@ -702,15 +708,17 @@ class LoRAModelManager(AdapterModelManager): def _get_mm_punica_wrapper(self, module_name: str) -> PunicaWrapperBase: """ - TODO + Match the corresponding punica_wrapper based on module_name, + and return None if lora is not supported for this module. """ if self.supports_mm_lora: - for ( - prefix, - punica_wrapper, - ) in self.mm_punica_wrapper_mapping.items(): + # Ensure matching by the longest prefix. + sorted_prefixes = sorted(self.mm_punica_wrapper_mapping.keys(), + key=lambda x: len(x), reverse=True) + + for prefix in sorted_prefixes: if module_name.startswith(prefix): - return punica_wrapper + return self.mm_punica_wrapper_mapping[prefix] return None def _register_packed_modules(self, module_full_name: str) -> None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 43620a0cc2d67..a8a513fd57aad 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2271,7 +2271,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): mm_counts={ dummy_data_modality: 1 }, - ).multi_modal_data + ) + dummy_mm_kwargs = dummy_mm_data.multi_modal_data + dummy_mm_token_ids = dummy_mm_data.multi_modal_token_ids batched_dummy_mm_inputs = MultiModalKwargs.batch( [dummy_mm_kwargs] * max_num_mm_items, From cb1a6f074a9dba3876c7c72850dbdfef8534f8d8 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Mon, 13 Oct 2025 02:14:36 +0000 Subject: [PATCH 04/53] update Signed-off-by: bk-201 --- requirements/test.txt | 41 +------ vllm/lora/layers/__init__.py | 3 +- vllm/lora/layers/base_linear.py | 7 ++ vllm/lora/layers/utils.py | 1 + vllm/lora/models.py | 127 ++++++++++++---------- vllm/lora/worker_manager.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 31 ++++-- vllm/v1/worker/lora_model_runner_mixin.py | 36 
++++-- 8 files changed, 132 insertions(+), 118 deletions(-) diff --git a/requirements/test.txt b/requirements/test.txt index ba72502ff43d1..01a501badb1fc 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -17,7 +17,6 @@ aiohttp==3.13.0 # aiohttp-cors # datasets # fsspec - # gpt-oss # lm-eval # ray aiohttp-cors==0.8.1 @@ -45,9 +44,7 @@ argcomplete==3.5.1 arrow==1.3.0 # via isoduration async-timeout==5.0.1 - # via - # aiohttp - # redis + # via redis attrs==24.2.0 # via # aiohttp @@ -108,8 +105,6 @@ chardet==5.2.0 # via mbstrdecoder charset-normalizer==3.4.0 # via requests -chz==0.3.0 - # via gpt-oss click==8.1.7 # via # black @@ -180,9 +175,7 @@ distlib==0.3.9 dnspython==2.7.0 # via email-validator docker==7.1.0 - # via - # gpt-oss - # mlflow + # via mlflow docopt==0.6.2 # via num2words docstring-parser==0.17.0 @@ -208,9 +201,7 @@ eval-type-backport==0.2.2 evaluate==0.4.3 # via lm-eval fastapi==0.116.1 - # via - # gpt-oss - # mlflow-skinny + # via mlflow-skinny fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -285,8 +276,6 @@ google-resumable-media==2.7.2 # via google-cloud-storage googleapis-common-protos==1.70.0 # via google-api-core -gpt-oss==0.0.8 - # via -r requirements/test.in graphene==3.4.3 # via mlflow graphql-core==3.2.6 @@ -314,8 +303,6 @@ hf-xet==1.1.7 # via huggingface-hub hiredis==3.0.0 # via tensorizer -html2text==2025.4.15 - # via gpt-oss httpcore==1.0.6 # via httpx httpx==0.27.2 @@ -450,7 +437,6 @@ lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b772215 lxml==5.3.0 # via # blobfile - # gpt-oss # sacrebleu mako==1.3.10 # via alembic @@ -620,8 +606,6 @@ omegaconf==2.3.0 # lightning open-clip-torch==2.32.0 # via -r requirements/test.in -openai-harmony==0.0.4 - # via gpt-oss opencensus==0.11.4 # via ray opencensus-context==0.1.3 @@ -793,12 +777,10 @@ pydantic==2.12.0 # albumentations # datamodel-code-generator # fastapi - # gpt-oss # lightly # mistral-common # mlflow-skinny # mteb - # openai-harmony # 
pydantic-extra-types # ray pydantic-core==2.41.1 @@ -929,7 +911,6 @@ requests==2.32.3 # evaluate # google-api-core # google-cloud-storage - # gpt-oss # huggingface-hub # lightly # lm-eval @@ -1072,8 +1053,6 @@ starlette-testclient==0.4.1 # via schemathesis statsmodels==0.14.4 # via genai-perf -structlog==25.4.0 - # via gpt-oss sympy==1.13.3 # via # einx @@ -1088,15 +1067,12 @@ tcolorpy==0.1.6 # via pytablewriter tenacity==9.1.2 # via - # gpt-oss # lm-eval # plotly tensorboardx==2.6.4 # via lightning tensorizer==2.10.1 # via -r requirements/test.in -termcolor==3.1.0 - # via gpt-oss terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e # via -r requirements/test.in threadpoolctl==3.5.0 @@ -1107,7 +1083,6 @@ tifffile==2025.3.30 # terratorch tiktoken==0.12.0 # via - # gpt-oss # lm-eval # mistral-common timm==1.0.17 @@ -1121,12 +1096,9 @@ tokenizers==0.22.0 # via # -r requirements/test.in # transformers -toml==0.10.2 - # via datamodel-code-generator tomli==2.2.1 # via - # black - # pytest + # coverage # schemathesis tomli-w==1.2.0 # via schemathesis @@ -1235,7 +1207,6 @@ typing-extensions==4.15.0 # aiosignal # albumentations # alembic - # chz # fastapi # graphene # huggingface-hub @@ -1275,9 +1246,7 @@ urllib3==2.2.3 # responses # tritonclient uvicorn==0.35.0 - # via - # gpt-oss - # mlflow-skinny + # via mlflow-skinny vector-quantize-pytorch==1.21.2 # via -r requirements/test.in virtualenv==20.31.2 diff --git a/vllm/lora/layers/__init__.py b/vllm/lora/layers/__init__.py index cfbf51922505f..4915ef85f4f73 100644 --- a/vllm/lora/layers/__init__.py +++ b/vllm/lora/layers/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.lora.layers.base import BaseLayerWithLoRA, PunicaWrapperBase +from vllm.lora.layers.base import BaseLayerWithLoRA from vllm.lora.layers.column_parallel_linear import ( ColumnParallelLinearWithLoRA, 
ColumnParallelLinearWithShardedLoRA, @@ -36,5 +36,4 @@ __all__ = [ "RowParallelLinearWithShardedLoRA", "ReplicatedLinearWithLoRA", "LoRAMapping", - "PunicaWrapperBase", ] diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index da053f0923aba..e2b7a247f9d36 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -124,6 +124,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): ) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + # Store original shape for later reshaping + original_shape = output.shape if output.ndim == 3 else None + # In transformers backend, x and output have extra batch dimension like # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim), # therefore we need to flatten the batch dimensions. @@ -137,6 +140,10 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): if not current_platform.can_update_inplace(): output = lora_output + # Restore original shape if it was flattened + if original_shape is not None: + output = output.reshape(original_shape) + return output @property diff --git a/vllm/lora/layers/utils.py b/vllm/lora/layers/utils.py index 2da90f180ee74..002dc934636b9 100644 --- a/vllm/lora/layers/utils.py +++ b/vllm/lora/layers/utils.py @@ -12,6 +12,7 @@ class LoRAMapping: index_mapping: tuple[int, ...] prompt_mapping: tuple[int, ...] 
is_prefill: bool = False + is_mm_input: bool = False def __post_init__(self): self.index_mapping = tuple(self.index_mapping) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b78a7d8332fcd..2b27a67680624 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -12,10 +12,10 @@ from torch import nn from vllm.config.lora import LoRAConfig, ModelConfig from vllm.logger import init_logger -from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, PunicaWrapperBase +from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.peft_helper import PEFTHelper -from vllm.lora.punica_wrapper import get_punica_wrapper +from vllm.lora.punica_wrapper import PunicaWrapperBase, get_punica_wrapper from vllm.lora.utils import ( from_layer, from_layer_logits_processor, @@ -30,8 +30,8 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.interfaces import is_pooling_model from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper -from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.model_executor.utils import get_packed_modules_mapping +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.utils import is_pin_memory_available from vllm.utils.cache import LRUCache @@ -378,17 +378,18 @@ class LoRAModelManager: supports_multimodal(self.model) # In case the model only supports LoRA for # text modules (e.g. 
ChatGLM) - and hasattr(self.model, "get_mm_mapping")) + and hasattr(self.model, "get_mm_mapping") + ) # For v0 compatibility if model_config is not None: self.mm_registry = MULTIMODAL_REGISTRY - self.info = self.mm_registry.create_processor( - model_config, disable_cache=True).info + self.info = self.mm_registry.create_processor(model_config).info self.supports_mm_lora = self.supports_mm and hasattr( - self.info, "get_num_mm_encoder_tokens") + self.info, "get_num_mm_encoder_tokens" + ) else: self.supports_mm_lora = False - if self.supports_mm_lora: # 从init传进来就可以了,不需要model_config了 + if self.supports_mm_lora: # 从init传进来就可以了,不需要model_config了 self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() self.mm_config = model_config.multimodal_config # limit_per_prompt: int = max( @@ -399,10 +400,8 @@ class LoRAModelManager: # max_num_batched_tokens = encoder_budget # max_batches = max_batches * limit_per_prompt self.mm_punica_wrapper_mapping = { - name: - get_punica_wrapper( - self.info.get_num_mm_encoder_tokens( - max_num_batched_tokens), + name: get_punica_wrapper( + self.info.get_num_mm_encoder_tokens(max_num_batched_tokens), max_batches=self.max_num_seqs * limit_per_prompt, device=self.device, max_loras=self.lora_config.max_loras, @@ -411,16 +410,11 @@ class LoRAModelManager: } # For language model self.mm_punica_wrapper_mapping.update( - { - self.mm_mapping.language_model[0]: self.punica_wrapper - } + {self.mm_mapping.language_model[0]: self.punica_wrapper} ) # TODO Connector is not supported at the moment. 
self.mm_punica_wrapper_mapping.update( - { - name: None - for name in self.mm_mapping.connector - } + {name: None for name in self.mm_mapping.connector} ) self.is_pooling_model = is_pooling_model(self.model) @@ -512,28 +506,27 @@ class LoRAModelManager: self.lora_slots + 1, self.vocab_size, self.lora_config.lora_extra_vocab_size, - self.long_lora_context, ) elif mapping.is_mm_input: self.mm_punica_wrapper_mapping[ - self.mm_mapping.tower_model[0]].update_metadata( - mapping, - self.lora_index_to_id, - self.lora_slots + 1, - self.vocab_size, - self.lora_config.lora_extra_vocab_size, - self.long_lora_context, - ) + self.mm_mapping.tower_model[0] + ].update_metadata( + mapping, + self.lora_index_to_id, + self.lora_slots + 1, + self.vocab_size, + self.lora_config.lora_extra_vocab_size, + ) else: self.mm_punica_wrapper_mapping[ - self.mm_mapping.language_model[0]].update_metadata( - mapping, - self.lora_index_to_id, - self.lora_slots + 1, - self.vocab_size, - self.lora_config.lora_extra_vocab_size, - self.long_lora_context, - ) + self.mm_mapping.language_model[0] + ].update_metadata( + mapping, + self.lora_index_to_id, + self.lora_slots + 1, + self.vocab_size, + self.lora_config.lora_extra_vocab_size, + ) def remove_all_adapters(self): """Remove all LoRAModels from the manager.""" @@ -613,8 +606,7 @@ class LoRAModelManager: self._register_packed_modules(module_name) # All lora layers share the same punica_wrapper based on reference. 
if self.supports_mm_lora: - new_module.set_mapping( - self._get_mm_punica_wrapper(module_name)) + new_module.set_mapping(self._get_mm_punica_wrapper(module_name)) else: new_module.set_mapping(self.punica_wrapper) @@ -711,22 +703,23 @@ class LoRAModelManager: if self.supports_mm: prefix_lst = self.mm_mapping.connector + self.mm_mapping.tower_model if self.supports_mm_lora: - return self._get_mm_punica_wrapper(module_name) is None else: - return any( - [module_name.startswith(prefix) for prefix in prefix_lst]) + return any([module_name.startswith(prefix) for prefix in prefix_lst]) return False - def _get_mm_punica_wrapper(self, module_name: str) -> PunicaWrapperBase: + def _get_mm_punica_wrapper(self, module_name: str) -> Optional[PunicaWrapperBase]: """ - Match the corresponding punica_wrapper based on module_name, + Match the corresponding punica_wrapper based on module_name, and return None if lora is not supported for this module. """ if self.supports_mm_lora: # Ensure matching by the longest prefix. 
- sorted_prefixes = sorted(self.mm_punica_wrapper_mapping.keys(), - key=lambda x: len(x), reverse=True) + sorted_prefixes = sorted( + self.mm_punica_wrapper_mapping.keys(), + key=lambda x: len(x), + reverse=True, + ) for prefix in sorted_prefixes: if module_name.startswith(prefix): @@ -834,12 +827,25 @@ class LoRALRUCache(AdapterLRUCache[LoRAModel]): class LRUCacheLoRAModelManager(LoRAModelManager): """A model manager that manages multiple LoRAs with LRU cache.""" - def __init__(self, model: nn.Module, max_num_seqs: int, - max_num_batched_tokens: int, vocab_size: int, - lora_config: LoRAConfig, model_config: ModelConfig, - device: torch.device): - super().__init__(model, max_num_seqs, max_num_batched_tokens, - vocab_size, lora_config, model_config, device) + def __init__( + self, + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + model_config: ModelConfig, + device: torch.device, + ): + super().__init__( + model, + max_num_seqs, + max_num_batched_tokens, + vocab_size, + lora_config, + model_config, + device, + ) self._registered_adapters: LoRALRUCache = LoRALRUCache( self.capacity, self.deactivate_adapter ) @@ -906,15 +912,16 @@ class LRUCacheLoRAModelManager(LoRAModelManager): def create_lora_manager( - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, - model_config: ModelConfig, - device: torch.device, - lora_manager_cls: type[LoRAModelManager] = LoRAModelManager, - **kwargs) -> LoRAModelManager: + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + model_config: ModelConfig, + device: torch.device, + lora_manager_cls: type[LoRAModelManager] = LoRAModelManager, + **kwargs, +) -> LoRAModelManager: """Create a LoRA adapter for a given model.""" if not isinstance(model, SupportsLoRA): raise ValueError(f"Model {type(model)} is not supported for LoRA.") diff --git 
a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 4a4772037ee72..d717e81792fed 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -6,7 +6,7 @@ from typing import Any, Literal, Optional, Union import torch -from vllm.config import VllmConfig, ModelConfig +from vllm.config import ModelConfig, VllmConfig from vllm.logger import init_logger from vllm.lora.models import ( LoRAModel, @@ -71,6 +71,7 @@ class WorkerLoRAManager: def create_lora_manager( self, model: torch.nn.Module, + model_config: Optional[ModelConfig] = None, ) -> Any: lora_manager = create_lora_manager( model, @@ -80,6 +81,7 @@ class WorkerLoRAManager: lora_config=self.lora_config, device=self.device, lora_manager_cls=self._manager_cls, + model_config=model_config, ) self._adapter_manager = lora_manager return lora_manager.model diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c270fd9bce23a..c446103aac629 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -512,6 +512,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): pin_memory=self.pin_memory, ) + # Multimodal LoRA support + if self.supports_mm_inputs: + self.info = self.mm_registry.create_processor(self.model_config).info + self.supports_mm_lora = hasattr(self.info, "get_num_mm_encoder_tokens") + else: + self.supports_mm_lora = False + def reset_mm_cache(self) -> None: if self.mm_budget: self.mm_budget.reset_cache() @@ -571,15 +578,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) return model_kwargs - # Multimodal LoRA support - if self.is_multimodal_model: - self.info = self.mm_registry.create_processor( - self.model_config, disable_cache=True).info - self.supports_mm_lora = hasattr(self.info, - "get_num_mm_encoder_tokens") - else: - self.supports_mm_lora = False - def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: """ Update the order of 
requests in the batch based on the attention @@ -1751,6 +1749,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # encoder outputs. model = cast(SupportsMultiModal, self.model) encoder_outputs = [] + + if self.lora_config and self.supports_mm_lora: + mm_tokens = [ + self.info.get_num_mm_encoder_tokens(pos_info.length) + for _, pos_info in mm_hashes_pos + ] + num_scheduled_tokens = np.array(mm_tokens, dtype=np.int32) + self.set_active_loras( + self.input_batch, + num_scheduled_tokens, + is_mm_input=True, + ) + for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality( mm_kwargs, device=self.device, @@ -2903,7 +2914,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) if self.lora_config: self.model = self.load_lora_model( - self.model, self.vllm_config, self.device + self.model, self.vllm_config, self.device, self.model_config ) if hasattr(self, "drafter"): logger.info("Loading drafter model...") diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 36a2a0124fbfc..48ae88a9850b7 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -11,7 +11,7 @@ import numpy as np import torch import torch.nn as nn -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -29,7 +29,11 @@ logger = init_logger(__name__) # Defined as a mixin for GPUModelRunner class LoRAModelRunnerMixin: def load_lora_model( - self, model: nn.Module, vllm_config: VllmConfig, device: torch.device + self, + model: nn.Module, + vllm_config: VllmConfig, + device: torch.device, + model_config: ModelConfig = None, ) -> nn.Module: if not supports_lora(model): raise ValueError(f"{model.__class__.__name__} does not support LoRA yet.") @@ -54,7 +58,7 @@ class LoRAModelRunnerMixin: 
prompt_lora_mapping: tuple[int, ...], token_lora_mapping: tuple[int, ...], lora_requests: set[LoRARequest], - is_mm_input: bool = False + is_mm_input: bool = False, ) -> None: self._ensure_lora_enabled() @@ -63,7 +67,10 @@ class LoRAModelRunnerMixin: # On cuda platforms we use the same kernels for prefill and # decode and this flag is generally ignored. lora_mapping = LoRAMapping( - token_lora_mapping, prompt_lora_mapping, is_prefill=True, is_mm_input=is_mm_input + token_lora_mapping, + prompt_lora_mapping, + is_prefill=True, + is_mm_input=is_mm_input, ) self.lora_manager.set_active_adapters(lora_requests, lora_mapping) @@ -72,7 +79,10 @@ class LoRAModelRunnerMixin: raise RuntimeError("LoRA is not enabled. Use --enable-lora to enable LoRA.") def set_active_loras( - self, input_batch: InputBatch, num_scheduled_tokens: np.ndarray, is_mm_input: bool = False + self, + input_batch: InputBatch, + num_scheduled_tokens: np.ndarray, + is_mm_input: bool = False, ) -> None: prompt_lora_mapping: tuple[int, ...] # of size input_batch.num_reqs token_lora_mapping: tuple[int, ...] 
# of size np.sum(num_scheduled_tokens) @@ -122,7 +132,10 @@ class LoRAModelRunnerMixin: @contextmanager def maybe_select_dummy_loras( - self, lora_config: Optional[LoRAConfig], num_scheduled_tokens: np.ndarray, is_mm_input: bool = False + self, + lora_config: Optional[LoRAConfig], + num_scheduled_tokens: np.ndarray, + is_mm_input: bool = False, ): if lora_config is None: yield @@ -151,7 +164,10 @@ class LoRAModelRunnerMixin: } self._set_active_loras( - tuple(prompt_lora_mapping), tuple(token_lora_mapping), lora_requests, is_mm_input + tuple(prompt_lora_mapping), + tuple(token_lora_mapping), + lora_requests, + is_mm_input, ) yield @@ -162,11 +178,13 @@ class LoRAModelRunnerMixin: lora_config: Optional[LoRAConfig], num_scheduled_tokens: np.ndarray, remove_lora: bool = True, - is_mm_input: bool = False + is_mm_input: bool = False, ): with ( self.maybe_setup_dummy_loras(lora_config, remove_lora), - self.maybe_select_dummy_loras(lora_config, num_scheduled_tokens, is_mm_input), + self.maybe_select_dummy_loras( + lora_config, num_scheduled_tokens, is_mm_input + ), ): yield From 882213cca2af7b6a61e46b3a7e665e249efc8396 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Mon, 13 Oct 2025 02:39:01 +0000 Subject: [PATCH 05/53] update Signed-off-by: bk-201 --- vllm/lora/models.py | 4 ++-- vllm/lora/worker_manager.py | 4 ++-- vllm/v1/worker/lora_model_runner_mixin.py | 5 ++++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index f7b35802f55b8..bfb65bbab9015 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -336,7 +336,7 @@ class LoRAModelManager: max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, - model_config: Optional[ModelConfig], + model_config: ModelConfig | None, device: torch.device, ): """Create a LoRAModelManager and adapter for a given model. 
@@ -709,7 +709,7 @@ class LoRAModelManager: return any([module_name.startswith(prefix) for prefix in prefix_lst]) return False - def _get_mm_punica_wrapper(self, module_name: str) -> Optional[PunicaWrapperBase]: + def _get_mm_punica_wrapper(self, module_name: str) -> PunicaWrapperBase | None: """ Match the corresponding punica_wrapper based on module_name, and return None if lora is not supported for this module. diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 49d4c29113760..dc8200fa8d006 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -71,7 +71,7 @@ class WorkerLoRAManager: def create_lora_manager( self, model: torch.nn.Module, - model_config: Optional[ModelConfig] = None, + model_config: ModelConfig | None = None, ) -> Any: lora_manager = create_lora_manager( model, @@ -222,7 +222,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): def create_lora_manager( self, model: torch.nn.Module, - model_config: Optional[ModelConfig] = None, + model_config: ModelConfig | None = None, ) -> Any: lora_manager = create_lora_manager( model, diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 539f413d34a9b..31094dcbda124 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -131,7 +131,10 @@ class LoRAModelRunnerMixin: @contextmanager def maybe_select_dummy_loras( - self, lora_config: LoRAConfig | None, num_scheduled_tokens: np.ndarray, is_mm_input: bool = False + self, + lora_config: LoRAConfig | None, + num_scheduled_tokens: np.ndarray, + is_mm_input: bool = False, ): if lora_config is None: yield From 8863bd2b74501cf79037fee9da269e6b0b2d5d34 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Mon, 13 Oct 2025 02:41:29 +0000 Subject: [PATCH 06/53] update Signed-off-by: bk-201 --- vllm/lora/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 
bfb65bbab9015..0de2b4ceec9bf 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -390,12 +390,12 @@ class LoRAModelManager: ) else: self.supports_mm_lora = False - if self.supports_mm_lora: # 从init传进来就可以了,不需要model_config了 + if self.supports_mm_lora: self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() self.mm_config = model_config.multimodal_config # limit_per_prompt: int = max( # self.info.get_allowed_mm_limits().values()) - limit_per_prompt = 5 + limit_per_prompt = 5 # TODO # For vision tower # max_num_batched_tokens = encoder_budget From 5c156c9f09065f4ee4a401412c2fd7f4782253df Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 13 Oct 2025 03:30:43 +0000 Subject: [PATCH 07/53] Init Signed-off-by: Jee Jee Li --- requirements/test.txt | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/requirements/test.txt b/requirements/test.txt index 01a501badb1fc..03fbdcc8d453b 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -17,6 +17,7 @@ aiohttp==3.13.0 # aiohttp-cors # datasets # fsspec + # gpt-oss # lm-eval # ray aiohttp-cors==0.8.1 @@ -43,8 +44,6 @@ argcomplete==3.5.1 # via datamodel-code-generator arrow==1.3.0 # via isoduration -async-timeout==5.0.1 - # via redis attrs==24.2.0 # via # aiohttp @@ -105,6 +104,8 @@ chardet==5.2.0 # via mbstrdecoder charset-normalizer==3.4.0 # via requests +chz==0.3.0 + # via gpt-oss click==8.1.7 # via # black @@ -175,7 +176,9 @@ distlib==0.3.9 dnspython==2.7.0 # via email-validator docker==7.1.0 - # via mlflow + # via + # gpt-oss + # mlflow docopt==0.6.2 # via num2words docstring-parser==0.17.0 @@ -201,7 +204,9 @@ eval-type-backport==0.2.2 evaluate==0.4.3 # via lm-eval fastapi==0.116.1 - # via mlflow-skinny + # via + # gpt-oss + # mlflow-skinny fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -276,6 +281,8 @@ google-resumable-media==2.7.2 # via google-cloud-storage googleapis-common-protos==1.70.0 # via google-api-core +gpt-oss==0.0.8 + 
# via -r requirements/test.in graphene==3.4.3 # via mlflow graphql-core==3.2.6 @@ -303,6 +310,8 @@ hf-xet==1.1.7 # via huggingface-hub hiredis==3.0.0 # via tensorizer +html2text==2025.4.15 + # via gpt-oss httpcore==1.0.6 # via httpx httpx==0.27.2 @@ -437,6 +446,7 @@ lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b772215 lxml==5.3.0 # via # blobfile + # gpt-oss # sacrebleu mako==1.3.10 # via alembic @@ -606,6 +616,8 @@ omegaconf==2.3.0 # lightning open-clip-torch==2.32.0 # via -r requirements/test.in +openai-harmony==0.0.4 + # via gpt-oss opencensus==0.11.4 # via ray opencensus-context==0.1.3 @@ -777,10 +789,12 @@ pydantic==2.12.0 # albumentations # datamodel-code-generator # fastapi + # gpt-oss # lightly # mistral-common # mlflow-skinny # mteb + # openai-harmony # pydantic-extra-types # ray pydantic-core==2.41.1 @@ -911,6 +925,7 @@ requests==2.32.3 # evaluate # google-api-core # google-cloud-storage + # gpt-oss # huggingface-hub # lightly # lm-eval @@ -1001,6 +1016,7 @@ setuptools==77.0.3 # via # lightning-utilities # pytablewriter + # torch # triton shapely==2.1.1 # via @@ -1053,6 +1069,8 @@ starlette-testclient==0.4.1 # via schemathesis statsmodels==0.14.4 # via genai-perf +structlog==25.4.0 + # via gpt-oss sympy==1.13.3 # via # einx @@ -1067,12 +1085,15 @@ tcolorpy==0.1.6 # via pytablewriter tenacity==9.1.2 # via + # gpt-oss # lm-eval # plotly tensorboardx==2.6.4 # via lightning tensorizer==2.10.1 # via -r requirements/test.in +termcolor==3.1.0 + # via gpt-oss terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e # via -r requirements/test.in threadpoolctl==3.5.0 @@ -1083,6 +1104,7 @@ tifffile==2025.3.30 # terratorch tiktoken==0.12.0 # via + # gpt-oss # lm-eval # mistral-common timm==1.0.17 @@ -1097,9 +1119,7 @@ tokenizers==0.22.0 # -r requirements/test.in # transformers tomli==2.2.1 - # via - # coverage - # schemathesis + # via schemathesis tomli-w==1.2.0 # via schemathesis torch==2.8.0+cu128 
@@ -1207,6 +1227,7 @@ typing-extensions==4.15.0 # aiosignal # albumentations # alembic + # chz # fastapi # graphene # huggingface-hub @@ -1246,7 +1267,9 @@ urllib3==2.2.3 # responses # tritonclient uvicorn==0.35.0 - # via mlflow-skinny + # via + # gpt-oss + # mlflow-skinny vector-quantize-pytorch==1.21.2 # via -r requirements/test.in virtualenv==20.31.2 From a69bde7e8fb7036eb7ecbb58593e5233d1c08723 Mon Sep 17 00:00:00 2001 From: prashanth058 Date: Thu, 20 Nov 2025 15:04:33 +0000 Subject: [PATCH 08/53] [feat] add connector support Signed-off-by: prashanth058 --- tests/lora/conftest.py | 15 ++ tests/lora/test_qwen2vl.py | 78 +++++++++- vllm/lora/layers/__init__.py | 3 +- vllm/lora/layers/row_parallel_linear.py | 25 ++-- vllm/lora/layers/utils.py | 9 +- vllm/lora/models.py | 173 +++++++++++----------- vllm/model_executor/models/qwen2_5_vl.py | 19 +++ vllm/model_executor/models/qwen2_vl.py | 9 ++ vllm/v1/worker/gpu_model_runner.py | 87 +++++++++-- vllm/v1/worker/lora_model_runner_mixin.py | 29 ++-- vllm/v1/worker/tpu_model_runner.py | 11 +- 11 files changed, 325 insertions(+), 133 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index f805a74a4dba8..e5328cde3a046 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -225,6 +225,21 @@ def qwen25vl_lora_files(): return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon") +@pytest.fixture(scope="session") +def qwen2vl_language_lora_files(): + return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-language") + + +@pytest.fixture(scope="session") +def qwen2vl_vision_tower_connector_lora_files(): + return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower-connector") + + +@pytest.fixture(scope="session") +def qwen2vl_vision_tower_lora_files(): + return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower") + + @pytest.fixture(scope="session") def tinyllama_lora_files(): return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") 
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 1800ca107a426..a323bd642b7c2 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -79,7 +79,6 @@ class Qwen2VLTester: lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path) outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request) generated_texts = [output.outputs[0].text.strip() for output in outputs] - # Validate outputs for generated, expected in zip(generated_texts, expected_outputs): assert expected.startswith(generated), ( @@ -130,6 +129,22 @@ EXPECTED_OUTPUTS = [ "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501 ] +EXPECTED_OUTPUTS_LANGUAGE = [ + "A stop sign is shown in an Asian city, with buildings and a car in the " + "background.", + "The Tokyo Skytree can be seen behind the pink blossoms of the cherry trees.", +] + +EXPECTED_OUTPUTS_VISION = [ + "A stop sign in front of oriental buildings.", + "A tree with pink flowers in front of it and a blue sky behind the flowers.", +] + +EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [ + "A stop sign is located on the street of a Chinese neighborhood.", + "A closeup shot of the Tokyo Skytree with pink flowers in the foreground.", +] + # NOTE - beam search .text contains the whole text EXPECTED_BEAM_SEARCH_OUTPUTS = [ [ @@ -190,3 +205,64 @@ def test_qwen25vl_lora(qwen25vl_lora_files): # Test with different LoRA IDs for lora_id in [1, 2]: tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id) + + +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm", +) +def test_qwen2vl_language_lora(qwen2vl_language_lora_files): + """ + Test language-only LoRA adapter. 
+ """ + config = TestConfig( + model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_language_lora_files + ) + tester = Qwen2VLTester(config) + for lora_id in [1, 2]: + tester.run_test( + TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS_LANGUAGE, lora_id=lora_id + ) + + +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm", +) +def test_qwen2vl_vision_lora(qwen2vl_vision_tower_connector_lora_files): + """ + Test vision tower + connector LoRA adapter. + """ + config = TestConfig( + model_path=QWEN2VL_MODEL_PATH, + lora_path=qwen2vl_vision_tower_connector_lora_files, + ) + tester = Qwen2VLTester(config) + for lora_id in [1, 2]: + tester.run_test( + TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS_VISION, lora_id=lora_id + ) + + +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm", +) +def test_qwen2vl_vision_no_connector_lora( + qwen2vl_vision_tower_lora_files, +): + """ + Test vision tower only LoRA adapter. 
+ + """ + config = TestConfig( + model_path=QWEN2VL_MODEL_PATH, + lora_path=qwen2vl_vision_tower_lora_files, + ) + tester = Qwen2VLTester(config) + for lora_id in [1, 2]: + tester.run_test( + TEST_IMAGES, + expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR, + lora_id=lora_id, + ) diff --git a/vllm/lora/layers/__init__.py b/vllm/lora/layers/__init__.py index 4915ef85f4f73..80dc5b382031e 100644 --- a/vllm/lora/layers/__init__.py +++ b/vllm/lora/layers/__init__.py @@ -17,7 +17,7 @@ from vllm.lora.layers.row_parallel_linear import ( RowParallelLinearWithLoRA, RowParallelLinearWithShardedLoRA, ) -from vllm.lora.layers.utils import LoRAMapping +from vllm.lora.layers.utils import LoRAMapping, LoRAMappingType from vllm.lora.layers.vocal_parallel_embedding import VocabParallelEmbeddingWithLoRA __all__ = [ @@ -36,4 +36,5 @@ __all__ = [ "RowParallelLinearWithShardedLoRA", "ReplicatedLinearWithLoRA", "LoRAMapping", + "LoRAMappingType", ] diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index 2ef1bd98fc612..d74e403ca39c3 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -63,22 +63,25 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. - output_parallel = self.apply(input_parallel) + # Only fuse bias add into GEMM for rank 0 (matches base + # RowParallelLinear behavior). 
This ensures bias will not get + # added more than once in TP>1 case and matches the numerical + # behavior of the unwrapped layer + bias_ = ( + None + if (self.tp_rank > 0 or self.base_layer.skip_bias_add) + else self.base_layer.bias + ) + output_parallel = self.apply(input_parallel, bias_) + if self.base_layer.reduce_results and self.tp_size > 1: output_ = tensor_model_parallel_all_reduce(output_parallel) else: output_ = output_parallel - if not self.base_layer.skip_bias_add: - output = ( - output_ + self.base_layer.bias - if self.base_layer.bias is not None - else output_ - ) - output_bias = None - else: - output = output_ - output_bias = self.base_layer.bias + # Bias was already added by rank 0 in apply(), no need to add again + output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None + output = output_ if not self.base_layer.return_bias: return output diff --git a/vllm/lora/layers/utils.py b/vllm/lora/layers/utils.py index 002dc934636b9..3f89f77b663c8 100644 --- a/vllm/lora/layers/utils.py +++ b/vllm/lora/layers/utils.py @@ -2,17 +2,24 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass +from enum import Enum import torch import torch.nn as nn +class LoRAMappingType(Enum): + LANGUAGE = 1 + TOWER = 2 + CONNECTOR = 3 + + @dataclass class LoRAMapping: index_mapping: tuple[int, ...] prompt_mapping: tuple[int, ...] 
is_prefill: bool = False - is_mm_input: bool = False + type: LoRAMappingType = LoRAMappingType.LANGUAGE def __post_init__(self): self.index_mapping = tuple(self.index_mapping) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 0de2b4ceec9bf..0c536d8ea192e 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -13,7 +13,7 @@ from torch import nn from vllm.config.lora import LoRAConfig, ModelConfig from vllm.logger import init_logger -from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping +from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, LoRAMappingType from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import PunicaWrapperBase, get_punica_wrapper @@ -374,50 +374,7 @@ class LoRAModelManager: f" {self.model.__class__.__name__}." self.packed_modules_mapping = get_packed_modules_mapping(self.model) - # Used to indicate whether the model is a multimodal model - self.supports_mm: bool = ( - supports_multimodal(self.model) - # In case the model only supports LoRA for - # text modules (e.g. 
ChatGLM) - and hasattr(self.model, "get_mm_mapping") - ) - # For v0 compatibility - if model_config is not None: - self.mm_registry = MULTIMODAL_REGISTRY - self.info = self.mm_registry.create_processor(model_config).info - self.supports_mm_lora = self.supports_mm and hasattr( - self.info, "get_num_mm_encoder_tokens" - ) - else: - self.supports_mm_lora = False - if self.supports_mm_lora: - self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() - self.mm_config = model_config.multimodal_config - # limit_per_prompt: int = max( - # self.info.get_allowed_mm_limits().values()) - limit_per_prompt = 5 # TODO - - # For vision tower - # max_num_batched_tokens = encoder_budget - # max_batches = max_batches * limit_per_prompt - self.mm_punica_wrapper_mapping = { - name: get_punica_wrapper( - self.info.get_num_mm_encoder_tokens(max_num_batched_tokens), - max_batches=self.max_num_seqs * limit_per_prompt, - device=self.device, - max_loras=self.lora_config.max_loras, - ) - for name in self.mm_mapping.tower_model - } - # For language model - self.mm_punica_wrapper_mapping.update( - {self.mm_mapping.language_model[0]: self.punica_wrapper} - ) - # TODO Connector is not supported at the moment. - self.mm_punica_wrapper_mapping.update( - {name: None for name in self.mm_mapping.connector} - ) - + self._init_multimodal_config(model_config) self.is_pooling_model = is_pooling_model(self.model) self.is_moe_model = is_moe_model(self.model) self.packed_modules: dict[str, list[str]] = {} @@ -427,6 +384,72 @@ class LoRAModelManager: self._create_lora_modules() self.model.lora_manager = self + def _init_multimodal_config(self, model_config): + # Used to indicate whether the model is a multimodal model + self.supports_mm: bool = ( + supports_multimodal(self.model) + # In case the model only supports LoRA for + # text modules (e.g. 
ChatGLM) + and hasattr(self.model, "get_mm_mapping") + ) + # For v0 compatibility + self.supports_mm_lora = False + if model_config is not None: + self.mm_registry = MULTIMODAL_REGISTRY + self.info = self.mm_registry.create_processor(model_config).info + self.supports_mm_lora = self.supports_mm and hasattr( + self.info, "get_num_mm_encoder_tokens" + ) + + if not self.supports_mm_lora: + return + + self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() + self.mm_config = model_config.multimodal_config + limit_per_prompt: int = max(self.info.get_allowed_mm_limits().values()) + + # For vision tower + num_encoder_tokens = self.info.get_num_mm_encoder_tokens( + self.max_num_batched_tokens + ) + self.mm_punica_wrapper_mapping = { + name: get_punica_wrapper( + num_encoder_tokens, + max_batches=self.max_num_seqs * limit_per_prompt, + device=self.device, + max_loras=self.lora_config.max_loras, + ) + for name in self.mm_mapping.tower_model + } + # For language model + self.mm_punica_wrapper_mapping.update( + {self.mm_mapping.language_model[0]: self.punica_wrapper} + ) + # Use wrapper for connector if present. + if self.mm_mapping.connector: + if hasattr(self.info, "get_num_mm_connector_tokens"): + connector_tokens = self.info.get_num_mm_connector_tokens( + num_encoder_tokens + ) + connector_punica_wrapper = get_punica_wrapper( + connector_tokens, + max_batches=self.max_num_seqs * limit_per_prompt, + device=self.device, + max_loras=self.lora_config.max_loras, + ) + self.mm_punica_wrapper_mapping.update( + { + name: connector_punica_wrapper + for name in self.mm_mapping.connector + } + ) + else: + logger.warning_once( + "Connector LoRA support disabled: model does not implement " + "get_num_mm_connector_tokens(). This method is required to " + "determine the connector's token budget for LoRA operations." 
+ ) + def __len__(self) -> int: return len(self._registered_adapters) @@ -499,35 +522,27 @@ class LoRAModelManager: ) # type: ignore def _set_adapter_mapping(self, mapping: LoRAMapping) -> None: - # update lora states - if not self.supports_mm_lora: - self.punica_wrapper.update_metadata( - mapping, - self.lora_index_to_id, - self.lora_slots + 1, - self.vocab_size, - self.lora_config.lora_extra_vocab_size, - ) - elif mapping.is_mm_input: - self.mm_punica_wrapper_mapping[ - self.mm_mapping.tower_model[0] - ].update_metadata( - mapping, - self.lora_index_to_id, - self.lora_slots + 1, - self.vocab_size, - self.lora_config.lora_extra_vocab_size, - ) - else: - self.mm_punica_wrapper_mapping[ - self.mm_mapping.language_model[0] - ].update_metadata( - mapping, - self.lora_index_to_id, - self.lora_slots + 1, - self.vocab_size, - self.lora_config.lora_extra_vocab_size, - ) + # Default to the main language model wrapper + target_wrapper = self.punica_wrapper + + if self.supports_mm_lora: + if mapping.type == LoRAMappingType.TOWER: + target_name = self.mm_mapping.tower_model[0] + target_wrapper = self.mm_punica_wrapper_mapping[target_name] + elif mapping.type == LoRAMappingType.CONNECTOR: + target_name = self.mm_mapping.connector[0] + target_wrapper = self.mm_punica_wrapper_mapping[target_name] + else: + target_name = self.mm_mapping.language_model[0] + target_wrapper = self.mm_punica_wrapper_mapping[target_name] + + target_wrapper.update_metadata( + mapping, + self.lora_index_to_id, + self.lora_slots + 1, + self.vocab_size, + self.lora_config.lora_extra_vocab_size, + ) def remove_all_adapters(self): """Remove all LoRAModels from the manager.""" @@ -548,15 +563,6 @@ class LoRAModelManager: continue if not self._match_target_modules(module_name): continue - # A temporary approach for multimodal models to support LoRA - # TODO: Remove this restriction - if self._filter_unsupported_mm_module(module_name): - logger.warning( - "Regarding multimodal models, vLLM currently only 
supports " - "adding LoRA to language model, %s will be ignored.", - module_name, - ) - continue parts = module_name.split(".")[-1] packed_moduled_lst = self.packed_modules_mapping.get(parts, []) new_module = replace_submodule( @@ -604,6 +610,7 @@ class LoRAModelManager: if self.supports_mm and not isinstance(new_module, BaseLayerWithLoRA): continue self.register_module(module_name, new_module) + self._register_packed_modules(module_name) # All lora layers share the same punica_wrapper based on reference. if self.supports_mm_lora: diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 3f205307cb225..dd06431b54eea 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1040,6 +1040,25 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): for modality in ("image", "video") ] + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + + return num_image_tokens * merge_size**2 + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + return num_vision_tokens // merge_size**2 + @MULTIMODAL_REGISTRY.register_processor( Qwen2_5_VLMultiModalProcessor, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 004eacc1b4b79..287a55a66c6bc 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1094,6 +1094,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): return num_image_tokens * merge_size**2 + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = 
vision_config.spatial_merge_size + return num_vision_tokens // merge_size**2 + class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b776870763956..052b85d5cb336 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -43,6 +43,7 @@ from vllm.distributed.parallel_state import ( ) from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping, LoRAMappingType from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding @@ -1689,7 +1690,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _batch_mm_kwargs_from_scheduler( self, scheduler_output: "SchedulerOutput", - ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]: + ) -> tuple[ + list[MultiModalKwargsItem], + list[tuple[str, PlaceholderRange]], + list[str], + ]: """Batch multimodal kwargs from scheduled encoder inputs. Args: @@ -1697,17 +1702,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): inputs. Returns: - A tuple of (mm_kwargs, req_ids_pos) where: + A tuple of (mm_kwargs, mm_hashes_pos, req_ids) where: - mm_kwargs: List of multimodal kwargs items to be batched - mm_hashes_pos: List of (mm_hash, position_info) tuples + - req_ids: List of request IDs for each encoder input """ scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs if not scheduled_encoder_inputs: - return [], [] + return [], [], [] # Batch the multi-modal inputs. 
mm_kwargs = list[MultiModalKwargsItem]() # list of tuple (mm_hash, position_info) mm_hashes_pos = list[tuple[str, PlaceholderRange]]() + # list of request IDs for each encoder input + req_ids = list[str]() for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): req_state = self.requests[req_id] @@ -1716,13 +1724,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): mm_hash = mm_feature.identifier mm_kwargs.append(mm_feature.data) mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) + req_ids.append(req_id) - return mm_kwargs, mm_hashes_pos + return mm_kwargs, mm_hashes_pos, req_ids def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): # Batch the multi-modal inputs using the helper method. - mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler( - scheduler_output + mm_kwargs, mm_hashes_pos, encoder_req_ids = ( + self._batch_mm_kwargs_from_scheduler(scheduler_output) ) if not mm_kwargs: @@ -1739,16 +1748,62 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): encoder_outputs = [] if self.lora_config and self.supports_mm_lora: - mm_tokens = [ - self.info.get_num_mm_encoder_tokens(pos_info.length) - for _, pos_info in mm_hashes_pos - ] - num_scheduled_tokens = np.array(mm_tokens, dtype=np.int32) - self.set_active_loras( - self.input_batch, - num_scheduled_tokens, - is_mm_input=True, + # Build LoRA mappings independently for encoder inputs + # (encoder batch structure is different from main batch) + prompt_lora_mapping = [] + token_lora_mapping = [] + lora_requests = set() + + for req_id, (_, pos_info) in zip(encoder_req_ids, mm_hashes_pos): + req_idx = self.input_batch.req_id_to_index[req_id] + lora_id = int(self.input_batch.request_lora_mapping[req_idx]) + + num_tokens = self.info.get_num_mm_encoder_tokens(pos_info.length) + prompt_lora_mapping.append(lora_id) + token_lora_mapping.extend([lora_id] * num_tokens) + + if lora_id > 0: + lora_request = 
self.input_batch.lora_id_to_lora_request.get(lora_id) + if lora_request is not None: + lora_requests.add(lora_request) + + lora_mapping = LoRAMapping( + tuple(token_lora_mapping), + tuple(prompt_lora_mapping), + is_prefill=True, + type=LoRAMappingType.TOWER, ) + self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + + if hasattr(self.info, "get_num_mm_connector_tokens"): + num_post_op_tokens = [] + for _, pos_info in mm_hashes_pos: + mm_token_count = self.info.get_num_mm_encoder_tokens( + pos_info.length + ) + post_op_count = self.info.get_num_mm_connector_tokens( + mm_token_count + ) + num_post_op_tokens.append(post_op_count) + + lora_ids = np.array( + self.lora_manager._adapter_manager._last_mapping.prompt_mapping, + dtype=np.int32, + ) + post_op_counts_np = np.array(num_post_op_tokens, dtype=np.int32) + new_token_indices = lora_ids.repeat(post_op_counts_np) + + connector_mapping = LoRAMapping( + index_mapping=tuple(new_token_indices.tolist()), + prompt_mapping=self.lora_manager._adapter_manager._last_mapping.prompt_mapping, + is_prefill=self.lora_manager._adapter_manager._last_mapping.is_prefill, + type=LoRAMappingType.CONNECTOR, + ) + + self.lora_manager.set_active_adapters( + lora_requests, + connector_mapping, + ) for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality( mm_kwargs, @@ -1898,7 +1953,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): inputs and formats them for the encoder-decoder model forward pass. """ # Batch the multi-modal inputs using the helper method. 
- mm_kwargs, _ = self._batch_mm_kwargs_from_scheduler(scheduler_output) + mm_kwargs, _, _ = self._batch_mm_kwargs_from_scheduler(scheduler_output) if not mm_kwargs: return {} diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 31094dcbda124..98f2825e483a9 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -5,6 +5,7 @@ Define LoRA functionality mixin for model runners. """ from contextlib import contextmanager +from typing import TypeAlias import numpy as np import torch @@ -13,14 +14,14 @@ import torch.nn as nn from vllm.config import ModelConfig, VllmConfig from vllm.config.lora import LoRAConfig from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping +from vllm.lora.layers import LoRAMapping, LoRAMappingType from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager -from vllm.model_executor.models import supports_lora, supports_multimodal +from vllm.model_executor.models import supports_lora from vllm.v1.worker.gpu_input_batch import InputBatch as GPUInputBatch from vllm.v1.worker.tpu_input_batch import InputBatch as TPUInputBatch -InputBatch = TPUInputBatch | GPUInputBatch +InputBatch: TypeAlias = TPUInputBatch | GPUInputBatch logger = init_logger(__name__) @@ -37,12 +38,6 @@ class LoRAModelRunnerMixin: if not supports_lora(model): raise ValueError(f"{model.__class__.__name__} does not support LoRA yet.") - if supports_multimodal(model): - logger.warning( - "Regarding multimodal models, vLLM currently " - "only supports adding LoRA to language model." 
- ) - # Add LoRA Manager to the Model Runner self.lora_manager = LRUCacheWorkerLoRAManager( vllm_config, @@ -57,7 +52,7 @@ class LoRAModelRunnerMixin: prompt_lora_mapping: tuple[int, ...], token_lora_mapping: tuple[int, ...], lora_requests: set[LoRARequest], - is_mm_input: bool = False, + mapping_type: LoRAMappingType = LoRAMappingType.LANGUAGE, ) -> None: self._ensure_lora_enabled() @@ -69,7 +64,7 @@ class LoRAModelRunnerMixin: token_lora_mapping, prompt_lora_mapping, is_prefill=True, - is_mm_input=is_mm_input, + type=mapping_type, ) self.lora_manager.set_active_adapters(lora_requests, lora_mapping) @@ -81,7 +76,7 @@ class LoRAModelRunnerMixin: self, input_batch: InputBatch, num_scheduled_tokens: np.ndarray, - is_mm_input: bool = False, + mapping_type: LoRAMappingType = LoRAMappingType.LANGUAGE, ) -> None: prompt_lora_mapping: tuple[int, ...] # of size input_batch.num_reqs token_lora_mapping: tuple[int, ...] # of size np.sum(num_scheduled_tokens) @@ -90,7 +85,7 @@ class LoRAModelRunnerMixin: input_batch.make_lora_inputs(num_scheduled_tokens) ) return self._set_active_loras( - prompt_lora_mapping, token_lora_mapping, lora_requests, is_mm_input + prompt_lora_mapping, token_lora_mapping, lora_requests, mapping_type ) @contextmanager @@ -134,7 +129,7 @@ class LoRAModelRunnerMixin: self, lora_config: LoRAConfig | None, num_scheduled_tokens: np.ndarray, - is_mm_input: bool = False, + mapping_type: LoRAMappingType = LoRAMappingType.LANGUAGE, ): if lora_config is None: yield @@ -166,7 +161,7 @@ class LoRAModelRunnerMixin: tuple(prompt_lora_mapping), tuple(token_lora_mapping), lora_requests, - is_mm_input, + mapping_type, ) yield @@ -177,12 +172,12 @@ class LoRAModelRunnerMixin: lora_config: LoRAConfig | None, num_scheduled_tokens: np.ndarray, remove_lora: bool = True, - is_mm_input: bool = False, + mapping_type: LoRAMappingType = LoRAMappingType.LANGUAGE, ): with ( self.maybe_setup_dummy_loras(lora_config, remove_lora), self.maybe_select_dummy_loras( - lora_config, 
num_scheduled_tokens, is_mm_input + lora_config, num_scheduled_tokens, mapping_type ), ): yield diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 6fd71259dbcbf..88cd19ba3935d 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -32,7 +32,8 @@ from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_ from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks from vllm.forward_context import set_forward_context from vllm.logger import init_logger -from vllm.lora.layers import BaseLayerWithLoRA +from vllm.lora.layers import BaseLayerWithLoRA, LoRAMappingType +from vllm.lora.request import LoRARequest from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.model_loader import get_model_loader from vllm.model_executor.model_loader.tpu import TPUModelLoader @@ -1422,11 +1423,15 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self._hidden_states_dtype = out.dtype def _set_active_loras( - self, prompt_lora_mapping, token_lora_mapping, lora_requests + self, + prompt_lora_mapping: tuple[int, ...], + token_lora_mapping: tuple[int, ...], + lora_requests: set[LoRARequest], + mapping_type: LoRAMappingType = LoRAMappingType.LANGUAGE, ) -> None: torch_xla.sync(wait=False) # Captures input updates super()._set_active_loras( - prompt_lora_mapping, token_lora_mapping, lora_requests + prompt_lora_mapping, token_lora_mapping, lora_requests, mapping_type ) torch_xla.sync(wait=False) # Captures metadata updates From a3647878c2dd8909af4a634236c68cd5ca8e22e9 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Tue, 25 Nov 2025 14:45:49 +0000 Subject: [PATCH 09/53] fix pre-commit Signed-off-by: bk-201 --- vllm/lora/models.py | 9 ++++++--- vllm/v1/worker/lora_model_runner_mixin.py | 14 +++++++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 
97739d96bea32..27a85a3ca2297 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -13,7 +13,12 @@ from torch import nn from vllm.config.lora import LoRAConfig, ModelConfig from vllm.logger import init_logger -from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, LoRAMappingType +from vllm.lora.layers import ( + BaseLayerWithLoRA, + FusedMoEWithLoRA, + LoRAMapping, + LoRAMappingType, +) from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import PunicaWrapperBase, get_punica_wrapper @@ -25,7 +30,6 @@ from vllm.lora.utils import ( is_moe_model, is_regex_target_modules, parse_fine_tuned_lora_name, - process_packed_modules_mapping, replace_submodule, ) from vllm.model_executor.layers.fused_moe import FusedMoE @@ -36,7 +40,6 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper from vllm.model_executor.utils import get_packed_modules_mapping from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.utils import is_pin_memory_available from vllm.utils.cache import LRUCache from vllm.utils.platform_utils import is_pin_memory_available diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 2e40de8491217..ed6b5525fa1a7 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -175,10 +175,10 @@ class LoRAModelRunnerMixin: } self._set_active_loras( - tuple(sample_lora_mapping), - tuple(token_lora_mapping), - lora_requests, - mapping_type + tuple(sample_lora_mapping), + tuple(token_lora_mapping), + lora_requests, + mapping_type, ) yield @@ -196,7 +196,11 @@ class LoRAModelRunnerMixin: with ( self.maybe_setup_dummy_loras(lora_config, remove_lora), self.maybe_select_dummy_loras( - lora_config, num_scheduled_tokens, mapping_type, num_sampled_tokens, activate_lora + lora_config, 
+ num_scheduled_tokens, + mapping_type, + num_sampled_tokens, + activate_lora, ), ): yield From 81573635da2f88e2487ae7a7646c7d8bc80645e0 Mon Sep 17 00:00:00 2001 From: prashanth058 Date: Tue, 25 Nov 2025 21:37:17 +0000 Subject: [PATCH 10/53] qwen2.5 & 3 vl fixes and tests Signed-off-by: prashanth058 --- tests/lora/conftest.py | 10 ++++ .../lora/{test_qwen2vl.py => test_qwenvl.py} | 57 ++++++++++++++----- vllm/lora/layers/column_parallel_linear.py | 7 ++- vllm/lora/models.py | 6 +- vllm/model_executor/models/qwen3_vl.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 16 ++++-- vllm/v1/worker/lora_model_runner_mixin.py | 2 +- 7 files changed, 75 insertions(+), 25 deletions(-) rename tests/lora/{test_qwen2vl.py => test_qwenvl.py} (85%) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 1b301ee75f802..74e2fe22414d2 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -240,6 +240,16 @@ def qwen2vl_vision_tower_lora_files(): return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower") +@pytest.fixture(scope="session") +def qwen25vl_vision_lora_files(): + return snapshot_download(repo_id="prashanth058/qwen2.5-3b-vl-flickr-lora-vision") + + +@pytest.fixture(scope="session") +def qwen3vl_vision_lora_files(): + return snapshot_download(repo_id="prashanth058/qwen3-4b-vl-lora-vision-connector") + + @pytest.fixture(scope="session") def tinyllama_lora_files(): return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwenvl.py similarity index 85% rename from tests/lora/test_qwen2vl.py rename to tests/lora/test_qwenvl.py index 41c06cca36b27..ec9990fee0f67 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwenvl.py @@ -14,8 +14,9 @@ class TestConfig: lora_path: str max_num_seqs: int = 2 max_loras: int = 2 - max_lora_rank: int = 16 - max_model_len: int = 4096 + max_lora_rank: int = 32 + max_model_len: int = 8192 + gpu_memory_utilization: float = 0.85 
mm_processor_kwargs: dict[str, int] | None = None def __post_init__(self): @@ -49,6 +50,7 @@ class Qwen2VLTester: max_loras=self.config.max_loras, max_lora_rank=self.config.max_lora_rank, trust_remote_code=True, + gpu_memory_utilization=self.config.gpu_memory_utilization, mm_processor_kwargs=self.config.mm_processor_kwargs, max_model_len=self.config.max_model_len, ) @@ -142,6 +144,16 @@ EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [ "A closeup shot of the Tokyo Skytree with pink flowers in the foreground.", ] +EXPECTED_OUTPUTS_VISION_QWEN2_5_VL = [ + "A black car is driving past a stop sign and a large red and gold arch.", + "A view of the Tokyo Skytree through the branches of a cherry blossom tree.", +] + +EXPECTED_OUTPUTS_VISION_QWEN3_VL = [ + "A black SUV drives past a stop sign in front of a Chinese gate.", + "A white tower is seen through pink flowers.", +] + # NOTE - beam search .text contains the whole text EXPECTED_BEAM_SEARCH_OUTPUTS = [ [ @@ -152,6 +164,7 @@ EXPECTED_BEAM_SEARCH_OUTPUTS = [ QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct" QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct" +QWEN3VL_MODEL_PATH = "Qwen/Qwen3-VL-4B-Instruct" def test_qwen2vl_lora(qwen2vl_lora_files): @@ -192,10 +205,6 @@ def test_qwen25vl_lora(qwen25vl_lora_files): tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id) -@pytest.mark.xfail( - current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm", -) def test_qwen2vl_language_lora(qwen2vl_language_lora_files): """ Test language-only LoRA adapter. @@ -210,10 +219,6 @@ def test_qwen2vl_language_lora(qwen2vl_language_lora_files): ) -@pytest.mark.xfail( - current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm", -) def test_qwen2vl_vision_lora(qwen2vl_vision_tower_connector_lora_files): """ Test vision tower + connector LoRA adapter. 
@@ -229,10 +234,6 @@ def test_qwen2vl_vision_lora(qwen2vl_vision_tower_connector_lora_files): ) -@pytest.mark.xfail( - current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm", -) def test_qwen2vl_vision_no_connector_lora( qwen2vl_vision_tower_lora_files, ): @@ -251,3 +252,31 @@ def test_qwen2vl_vision_no_connector_lora( expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR, lora_id=lora_id, ) + + +def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files): + config = TestConfig( + model_path=QWEN25VL_MODEL_PATH, + lora_path=qwen25vl_vision_lora_files, + ) + tester = Qwen2VLTester(config) + for lora_id in [1, 2]: + tester.run_test( + TEST_IMAGES, + expected_outputs=EXPECTED_OUTPUTS_VISION_QWEN2_5_VL, + lora_id=lora_id, + ) + + +def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files): + config = TestConfig( + model_path=QWEN3VL_MODEL_PATH, + lora_path=qwen3vl_vision_lora_files, + ) + tester = Qwen2VLTester(config) + for lora_id in [1, 2]: + tester.run_test( + TEST_IMAGES, + expected_outputs=EXPECTED_OUTPUTS_VISION_QWEN3_VL, + lora_id=lora_id, + ) diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 3e21d426c304a..f6f610669056d 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -340,7 +340,12 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): packed_modules_list: list, model_config: PretrainedConfig | None, ) -> bool: - return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 1 + # Vision tower QKV has packed_modules_list=[] (already packed in checkpoint) + # Language models have packed_modules_list=[module_name] + # (single LoRA for qkv_proj) + return type(source_layer) is QKVParallelLinear and ( + len(packed_modules_list) <= 1 + ) class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 
27a85a3ca2297..8402cd73b9263 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -562,10 +562,12 @@ class LoRAModelManager: target_wrapper = self.punica_wrapper if self.supports_mm_lora: - if mapping.type == LoRAMappingType.TOWER: + if mapping.type == LoRAMappingType.TOWER and self.mm_mapping.tower_model: target_name = self.mm_mapping.tower_model[0] target_wrapper = self.mm_punica_wrapper_mapping[target_name] - elif mapping.type == LoRAMappingType.CONNECTOR: + elif ( + mapping.type == LoRAMappingType.CONNECTOR and self.mm_mapping.connector + ): target_name = self.mm_mapping.connector[0] target_wrapper = self.mm_punica_wrapper_mapping[target_name] else: diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 4cd6fa14c32df..181c2957565ea 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1675,6 +1675,6 @@ class Qwen3VLForConditionalGeneration( """ return MultiModelKeys.from_string_field( language_model="language_model", - connector="visual.merger", + connector=["visual.merger", "visual.deepstack_merger_list"], tower_model="visual.", ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3126687fb4baf..9c4eff8b16389 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2075,7 +2075,9 @@ class GPUModelRunner( req_idx = self.input_batch.req_id_to_index[req_id] lora_id = int(self.input_batch.request_lora_mapping[req_idx]) - num_tokens = self.info.get_num_mm_encoder_tokens(pos_info.length) + num_tokens = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] + pos_info.length + ) prompt_lora_mapping.append(lora_id) token_lora_mapping.extend([lora_id] * num_tokens) @@ -2095,16 +2097,18 @@ class GPUModelRunner( if hasattr(self.info, "get_num_mm_connector_tokens"): num_post_op_tokens = [] for _, pos_info in mm_hashes_pos: - mm_token_count = self.info.get_num_mm_encoder_tokens( + 
mm_token_count = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] pos_info.length ) - post_op_count = self.info.get_num_mm_connector_tokens( + post_op_count = self.info.get_num_mm_connector_tokens( # type: ignore[attr-defined] mm_token_count ) num_post_op_tokens.append(post_op_count) + last_mapping = self.lora_manager._adapter_manager._last_mapping + assert last_mapping is not None lora_ids = np.array( - self.lora_manager._adapter_manager._last_mapping.prompt_mapping, + last_mapping.prompt_mapping, dtype=np.int32, ) post_op_counts_np = np.array(num_post_op_tokens, dtype=np.int32) @@ -2112,8 +2116,8 @@ class GPUModelRunner( connector_mapping = LoRAMapping( index_mapping=tuple(new_token_indices.tolist()), - prompt_mapping=self.lora_manager._adapter_manager._last_mapping.prompt_mapping, - is_prefill=self.lora_manager._adapter_manager._last_mapping.is_prefill, + prompt_mapping=last_mapping.prompt_mapping, + is_prefill=last_mapping.is_prefill, type=LoRAMappingType.CONNECTOR, ) diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index ed6b5525fa1a7..efb184d060d43 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -33,7 +33,7 @@ class LoRAModelRunnerMixin: model: nn.Module, vllm_config: VllmConfig, device: torch.device, - model_config: ModelConfig = None, + model_config: ModelConfig | None = None, ) -> nn.Module: if not supports_lora(model): raise ValueError(f"{model.__class__.__name__} does not support LoRA yet.") From 181b5f85d3e18d412136131308bf2e066fb98fb4 Mon Sep 17 00:00:00 2001 From: prashanth058 Date: Wed, 26 Nov 2025 02:11:37 +0000 Subject: [PATCH 11/53] remove redundant assingments Signed-off-by: prashanth058 --- vllm/lora/layers/row_parallel_linear.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index a93cd2d220093..e8dcdfa62cd75 100644 
--- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -76,11 +76,7 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): if self.base_layer.reduce_results and self.tp_size > 1: output = tensor_model_parallel_all_reduce(output_parallel) else: - output_ = output_parallel - - # Bias was already added by rank 0 in apply(), no need to add again - output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None - output = output_ + output = output_parallel output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None if not self.base_layer.return_bias: From 92ed13c1c8b02244c7afbd61cd84521de1cf1b29 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Wed, 26 Nov 2025 02:46:18 +0000 Subject: [PATCH 12/53] fix bug Signed-off-by: bk-201 --- vllm/lora/models.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 8402cd73b9263..604f9925d00d9 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -601,6 +601,13 @@ class LoRAModelManager: if not self._match_target_modules(module_name): continue + + if ( + self.supports_mm_lora + and self._get_mm_punica_wrapper(module_name) is None + ): + continue + parts = module_name.split(".")[-1] packed_moduled_lst = self.packed_modules_mapping.get(parts, []) if isinstance(module, FusedMoE): From 598052b04eb8c842cb55c1a04b6a58b2132aa23c Mon Sep 17 00:00:00 2001 From: bk-201 Date: Thu, 4 Dec 2025 16:57:49 +0000 Subject: [PATCH 13/53] fix bug Signed-off-by: bk-201 --- vllm/lora/models.py | 38 +++++++++++++---------- vllm/lora/worker_manager.py | 10 +++--- vllm/multimodal/profiling.py | 5 --- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/lora_model_runner_mixin.py | 5 ++- 5 files changed, 29 insertions(+), 31 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 59167351a0fa3..d0081ff72eba6 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -11,6 +11,7 @@ import 
safetensors.torch import torch from torch import nn +from vllm.config import VllmConfig from vllm.config.lora import LoRAConfig, ModelConfig from vllm.logger import init_logger from vllm.lora.layers import ( @@ -42,6 +43,7 @@ from vllm.model_executor.utils import get_packed_modules_mapping from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.utils.cache import LRUCache from vllm.utils.platform_utils import is_pin_memory_available +from vllm.v1.worker.utils import MultiModalBudget logger = init_logger(__name__) @@ -302,7 +304,7 @@ class LoRAModelManager: max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, - model_config: ModelConfig | None, + vllm_config: VllmConfig, device: torch.device, ): """Create a LoRAModelManager and adapter for a given model. @@ -340,7 +342,7 @@ class LoRAModelManager: f" {self.model.__class__.__name__}." self.packed_modules_mapping = get_packed_modules_mapping(self.model) - self._init_multimodal_config(model_config) + self._init_multimodal_config(vllm_config) self.is_pooling_model = is_pooling_model(self.model) self.packed_modules: dict[str, list[str]] = {} self.modules: dict[str, BaseLayerWithLoRA] = {} @@ -351,7 +353,7 @@ class LoRAModelManager: self.model.lora_manager = self - def _init_multimodal_config(self, model_config): + def _init_multimodal_config(self, vllm_config: VllmConfig): # Used to indicate whether the model is a multimodal model self.supports_mm: bool = ( supports_multimodal(self.model) @@ -359,25 +361,27 @@ class LoRAModelManager: # text modules (e.g. 
ChatGLM) and hasattr(self.model, "get_mm_mapping") ) - # For v0 compatibility - self.supports_mm_lora = False - if model_config is not None: - self.mm_registry = MULTIMODAL_REGISTRY - self.info = self.mm_registry.create_processor(model_config).info - self.supports_mm_lora = self.supports_mm and hasattr( - self.info, "get_num_mm_encoder_tokens" - ) + + model_config: ModelConfig = vllm_config.model_config + self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info + self.supports_mm_lora = self.supports_mm and hasattr( + self.info, "get_num_mm_encoder_tokens" + ) if not self.supports_mm_lora: return + mm_budget = MultiModalBudget( + model_config, + vllm_config.scheduler_config, + MULTIMODAL_REGISTRY, + ) self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() - self.mm_config = model_config.multimodal_config limit_per_prompt: int = max(self.info.get_allowed_mm_limits().values()) # For vision tower num_encoder_tokens = self.info.get_num_mm_encoder_tokens( - self.max_num_batched_tokens + mm_budget.get_encoder_budget() ) self.mm_punica_wrapper_mapping = { name: get_punica_wrapper( @@ -911,7 +915,7 @@ class LRUCacheLoRAModelManager(LoRAModelManager): max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, - model_config: ModelConfig, + vllm_config: VllmConfig, device: torch.device, ): super().__init__( @@ -920,7 +924,7 @@ class LRUCacheLoRAModelManager(LoRAModelManager): max_num_batched_tokens, vocab_size, lora_config, - model_config, + vllm_config, device, ) self._registered_adapters: LoRALRUCache = LoRALRUCache( @@ -994,7 +998,7 @@ def create_lora_manager( max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, - model_config: ModelConfig, + vllm_config: VllmConfig, device: torch.device, lora_manager_cls: type[LoRAModelManager] = LoRAModelManager, **kwargs, @@ -1008,7 +1012,7 @@ def create_lora_manager( max_num_batched_tokens=max_num_batched_tokens, vocab_size=vocab_size, lora_config=lora_config, - 
model_config=model_config, + vllm_config=vllm_config, device=device, **kwargs, ) diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 7448963df7c74..6b45683b0bd6b 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -6,7 +6,7 @@ from typing import Any, Literal import torch -from vllm.config import ModelConfig, VllmConfig +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.models import ( LoRAModel, @@ -69,7 +69,7 @@ class WorkerLoRAManager: def create_lora_manager( self, model: torch.nn.Module, - model_config: ModelConfig | None = None, + vllm_config: VllmConfig, ) -> Any: lora_manager = create_lora_manager( model, @@ -79,7 +79,7 @@ class WorkerLoRAManager: lora_config=self.lora_config, device=self.device, lora_manager_cls=self._manager_cls, - model_config=model_config, + vllm_config=vllm_config, ) self._adapter_manager = lora_manager return lora_manager.model @@ -212,7 +212,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): def create_lora_manager( self, model: torch.nn.Module, - model_config: ModelConfig | None = None, + vllm_config: VllmConfig, ) -> Any: lora_manager = create_lora_manager( model, @@ -222,7 +222,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): lora_config=self.lora_config, device=self.device, max_num_batched_tokens=self.max_num_batched_tokens, - model_config=model_config, + vllm_config=vllm_config, ) self._adapter_manager = lora_manager return lora_manager.model diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 7acd9e556729d..cb70041e9744f 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import copy from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field @@ -59,7 +58,6 @@ class DummyDecoderData(NamedTuple): 
prompt_token_ids: list[int] multi_modal_data: MultiModalKwargsItems multi_modal_placeholders: MultiModalPlaceholderDict - multi_modal_token_ids: list[int] _I = TypeVar("_I", bound=BaseProcessingInfo) @@ -324,13 +322,10 @@ class MultiModalProfiler(Generic[_I]): if total_len < seq_len: prompt_token_ids.extend([0] * (seq_len - total_len)) - multi_modal_token_ids = copy.deepcopy(prompt_token_ids) - return DummyDecoderData( prompt_token_ids=prompt_token_ids, multi_modal_data=mm_inputs["mm_kwargs"].require_data(), multi_modal_placeholders=mm_inputs["mm_placeholders"], - multi_modal_token_ids=multi_modal_token_ids, ) def _get_mm_max_tokens( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 824f6c8f991de..ea21bb16f6c74 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3620,7 +3620,7 @@ class GPUModelRunner( ) if self.lora_config: self.model = self.load_lora_model( - self.model, self.vllm_config, self.device, self.model_config + self.model, self.vllm_config, self.device ) if hasattr(self, "drafter"): logger.info_once("Loading drafter model...") diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 85badf3a9755e..b7d488ea1c182 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -11,7 +11,7 @@ import numpy as np import torch import torch.nn as nn -from vllm.config import ModelConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping, LoRAMappingType @@ -33,7 +33,6 @@ class LoRAModelRunnerMixin: model: nn.Module, vllm_config: VllmConfig, device: torch.device, - model_config: ModelConfig | None = None, ) -> nn.Module: if not supports_lora(model): raise ValueError(f"{model.__class__.__name__} does not support LoRA yet.") @@ -44,7 +43,7 @@ class LoRAModelRunnerMixin: device, 
model.embedding_modules, ) - return self.lora_manager.create_lora_manager(model, model_config) + return self.lora_manager.create_lora_manager(model, vllm_config) def _set_active_loras( self, From 1745bb935325124ace38fb7bf5496cf53ea25920 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Fri, 5 Dec 2025 11:14:53 +0000 Subject: [PATCH 14/53] address the ci issue Signed-off-by: bk-201 --- vllm/lora/models.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b961a35d3b84f..e8b8ddb6841e4 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -362,11 +362,15 @@ class LoRAModelManager: and hasattr(self.model, "get_mm_mapping") ) - model_config: ModelConfig = vllm_config.model_config - self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info - self.supports_mm_lora = self.supports_mm and hasattr( - self.info, "get_num_mm_encoder_tokens" - ) + self.supports_mm_lora = False + + if self.supports_mm: + model_config: ModelConfig = vllm_config.model_config + self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() + self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info + self.supports_mm_lora = self.supports_mm and hasattr( + self.info, "get_num_mm_encoder_tokens" + ) if not self.supports_mm_lora: return @@ -591,10 +595,12 @@ class LoRAModelManager: if not self._match_target_modules(module_name): continue - if ( - self.supports_mm_lora - and self._get_mm_punica_wrapper(module_name) is None - ): + if self._filter_unsupported_mm_module(module_name): + logger.warning( + "Module %s does not support adding LoRA for " + "now and has been ignored.", + module_name, + ) continue parts = module_name.split(".")[-1] From 113eb2e0b8376a56cc0a9e00e4f5ee2c6b09b940 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Fri, 5 Dec 2025 12:14:53 +0000 Subject: [PATCH 15/53] add a enable option Signed-off-by: bk-201 --- tests/lora/test_qwenvl.py | 2 ++ vllm/config/lora.py | 3 +++ 
vllm/engine/arg_utils.py | 7 +++++++ vllm/lora/models.py | 10 +++++----- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/lora/test_qwenvl.py b/tests/lora/test_qwenvl.py index ec9990fee0f67..dfddbbd168894 100644 --- a/tests/lora/test_qwenvl.py +++ b/tests/lora/test_qwenvl.py @@ -15,6 +15,7 @@ class TestConfig: max_num_seqs: int = 2 max_loras: int = 2 max_lora_rank: int = 32 + enable_mm_lora: bool = True max_model_len: int = 8192 gpu_memory_utilization: float = 0.85 mm_processor_kwargs: dict[str, int] | None = None @@ -49,6 +50,7 @@ class Qwen2VLTester: enable_lora=True, max_loras=self.config.max_loras, max_lora_rank=self.config.max_lora_rank, + enable_mm_lora=self.config.enable_mm_lora, trust_remote_code=True, gpu_memory_utilization=self.config.gpu_memory_utilization, mm_processor_kwargs=self.config.mm_processor_kwargs, diff --git a/vllm/config/lora.py b/vllm/config/lora.py index 6a8fd6359aadd..23a46b9632cd1 100644 --- a/vllm/config/lora.py +++ b/vllm/config/lora.py @@ -55,6 +55,9 @@ class LoRAConfig: per prompt. When run in offline mode, the lora IDs for n modalities will be automatically assigned to 1-n with the names of the modalities in alphabetic order.""" + enable_mm_lora: bool = False + """If `True`, LoRA support for multimodal models will be enabled. Currently, + only the qwenvl series models support this feature. 
The default is False.""" def compute_hash(self) -> str: """ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fd07cded7bc51..978f553d7b8a1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -484,6 +484,7 @@ class EngineArgs: fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras max_cpu_loras: int | None = LoRAConfig.max_cpu_loras lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype + enable_mm_lora: bool = LoRAConfig.enable_mm_lora ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override @@ -985,6 +986,11 @@ class EngineArgs: "--lora-dtype", **lora_kwargs["lora_dtype"], ) + lora_group.add_argument( + "--enable-mm-lora", + action=argparse.BooleanOptionalAction, + **lora_kwargs["enable_mm_lora"], + ) lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"]) lora_group.add_argument( "--fully-sharded-loras", **lora_kwargs["fully_sharded_loras"] @@ -1660,6 +1666,7 @@ class EngineArgs: default_mm_loras=self.default_mm_loras, fully_sharded_loras=self.fully_sharded_loras, lora_dtype=self.lora_dtype, + enable_mm_lora=self.enable_mm_lora, max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else None, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e8b8ddb6841e4..dc6b0790f36d8 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -367,10 +367,11 @@ class LoRAModelManager: if self.supports_mm: model_config: ModelConfig = vllm_config.model_config self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() - self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info - self.supports_mm_lora = self.supports_mm and hasattr( - self.info, "get_num_mm_encoder_tokens" - ) + if self.lora_config.enable_mm_lora: + self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info + self.supports_mm_lora = self.supports_mm and hasattr( + self.info, 
"get_num_mm_encoder_tokens" + ) if not self.supports_mm_lora: return @@ -380,7 +381,6 @@ class LoRAModelManager: vllm_config.scheduler_config, MULTIMODAL_REGISTRY, ) - self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() limit_per_prompt: int = max(self.info.get_allowed_mm_limits().values()) # For vision tower From 3e33423eb27ad4bf172b5d13c343d67173ca6fea Mon Sep 17 00:00:00 2001 From: bk-201 Date: Fri, 5 Dec 2025 14:08:44 +0000 Subject: [PATCH 16/53] address ci issue Signed-off-by: bk-201 --- vllm/engine/arg_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8065b609ca5b8..e49eaabe30e04 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -991,7 +991,6 @@ class EngineArgs: ) lora_group.add_argument( "--enable-mm-lora", - action=argparse.BooleanOptionalAction, **lora_kwargs["enable_mm_lora"], ) lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"]) From d3c2f3dbe3f99abf3677fad398ac8e40b594cfaf Mon Sep 17 00:00:00 2001 From: bk-201 Date: Wed, 10 Dec 2025 15:25:23 +0000 Subject: [PATCH 17/53] address ci issue Signed-off-by: bk-201 --- vllm/lora/model_manager.py | 10 +++++----- vllm/lora/worker_manager.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 10164654c816a..7c0903674ce53 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -65,8 +65,8 @@ class LoRAModelManager: max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, - vllm_config: VllmConfig, device: torch.device, + vllm_config: VllmConfig | None = None, ): """Create a LoRAModelManager and adapter for a given model. 
@@ -114,7 +114,7 @@ class LoRAModelManager: self.model.lora_manager = self - def _init_multimodal_config(self, vllm_config: VllmConfig): + def _init_multimodal_config(self, vllm_config: VllmConfig | None = None): # Used to indicate whether the model is a multimodal model self.supports_mm: bool = ( supports_multimodal(self.model) @@ -125,7 +125,7 @@ class LoRAModelManager: self.supports_mm_lora = False - if self.supports_mm: + if self.supports_mm and vllm_config is not None: model_config: ModelConfig = vllm_config.model_config self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() if self.lora_config.enable_mm_lora: @@ -708,8 +708,8 @@ class LRUCacheLoRAModelManager(LoRAModelManager): max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, - vllm_config: VllmConfig, device: torch.device, + vllm_config: VllmConfig | None = None, ): super().__init__( model, @@ -717,8 +717,8 @@ class LRUCacheLoRAModelManager(LoRAModelManager): max_num_batched_tokens, vocab_size, lora_config, - vllm_config, device, + vllm_config, ) self._registered_adapters: LoRALRUCache = LoRALRUCache( self.capacity, self.deactivate_adapter diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 170c9ec226ab0..967ce458995c6 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -69,7 +69,7 @@ class WorkerLoRAManager: def create_lora_manager( self, model: torch.nn.Module, - vllm_config: VllmConfig, + vllm_config: VllmConfig | None = None, ) -> Any: lora_manager = create_lora_manager( model, @@ -212,7 +212,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): def create_lora_manager( self, model: torch.nn.Module, - vllm_config: VllmConfig, + vllm_config: VllmConfig | None = None, ) -> Any: lora_manager = create_lora_manager( model, From 1d2c53973401b67c07d0fd23c98de16adce19409 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Wed, 10 Dec 2025 16:30:49 +0000 Subject: [PATCH 18/53] address ci issue Signed-off-by: bk-201 --- 
vllm/lora/model_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 7c0903674ce53..a1270f032d2b8 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -26,6 +26,7 @@ from vllm.lora.utils import ( from_layer_logits_processor, get_supported_lora_modules, is_moe_model, + process_packed_modules_mapping, replace_submodule, ) from vllm.model_executor.layers.fused_moe import FusedMoE @@ -33,7 +34,6 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.interfaces import is_pooling_model from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.utils import PPMissingLayer -from vllm.model_executor.utils import get_packed_modules_mapping from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.utils.cache import LRUCache from vllm.utils.platform_utils import is_pin_memory_available @@ -102,7 +102,7 @@ class LoRAModelManager: assert self.supported_lora_modules, "No supported LoRA modules found in" f" {self.model.__class__.__name__}." 
- self.packed_modules_mapping = get_packed_modules_mapping(self.model) + self.packed_modules_mapping = process_packed_modules_mapping(self.model) self._init_multimodal_config(vllm_config) self.is_pooling_model = is_pooling_model(self.model) self.packed_modules: dict[str, list[str]] = {} From e2ea025ee36468fe5b9f2f2a3e91d2741baeeffe Mon Sep 17 00:00:00 2001 From: bk-201 Date: Thu, 11 Dec 2025 03:46:27 +0000 Subject: [PATCH 19/53] address pre-commit & ci issue Signed-off-by: bk-201 --- tests/lora/test_qwenvl.py | 2 +- vllm/lora/model_manager.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/lora/test_qwenvl.py b/tests/lora/test_qwenvl.py index dfddbbd168894..2780f5e71dc21 100644 --- a/tests/lora/test_qwenvl.py +++ b/tests/lora/test_qwenvl.py @@ -153,7 +153,7 @@ EXPECTED_OUTPUTS_VISION_QWEN2_5_VL = [ EXPECTED_OUTPUTS_VISION_QWEN3_VL = [ "A black SUV drives past a stop sign in front of a Chinese gate.", - "A white tower is seen through pink flowers.", + "A tall white tower is seen through pink flowers.", ] # NOTE - beam search .text contains the whole text diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index a1270f032d2b8..c308e2958cc25 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -137,6 +137,9 @@ class LoRAModelManager: if not self.supports_mm_lora: return + assert vllm_config is not None, ( + "vllm_config should not be None when supports_mm_lora is True" + ) mm_budget = MultiModalBudget( model_config, vllm_config.scheduler_config, From 27448490f1c35b05abb40029d6267fd293054308 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Thu, 11 Dec 2025 06:46:53 +0000 Subject: [PATCH 20/53] update argument name Signed-off-by: bk-201 --- tests/lora/test_qwenvl.py | 4 ++-- vllm/config/lora.py | 2 +- vllm/engine/arg_utils.py | 8 ++++---- vllm/lora/model_manager.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/lora/test_qwenvl.py b/tests/lora/test_qwenvl.py index 
2780f5e71dc21..4b3b92447789f 100644 --- a/tests/lora/test_qwenvl.py +++ b/tests/lora/test_qwenvl.py @@ -15,7 +15,7 @@ class TestConfig: max_num_seqs: int = 2 max_loras: int = 2 max_lora_rank: int = 32 - enable_mm_lora: bool = True + enable_tower_connector_lora: bool = True max_model_len: int = 8192 gpu_memory_utilization: float = 0.85 mm_processor_kwargs: dict[str, int] | None = None @@ -50,7 +50,7 @@ class Qwen2VLTester: enable_lora=True, max_loras=self.config.max_loras, max_lora_rank=self.config.max_lora_rank, - enable_mm_lora=self.config.enable_mm_lora, + enable_tower_connector_lora=self.config.enable_tower_connector_lora, trust_remote_code=True, gpu_memory_utilization=self.config.gpu_memory_utilization, mm_processor_kwargs=self.config.mm_processor_kwargs, diff --git a/vllm/config/lora.py b/vllm/config/lora.py index 23a46b9632cd1..97915ce925bc8 100644 --- a/vllm/config/lora.py +++ b/vllm/config/lora.py @@ -55,7 +55,7 @@ class LoRAConfig: per prompt. When run in offline mode, the lora IDs for n modalities will be automatically assigned to 1-n with the names of the modalities in alphabetic order.""" - enable_mm_lora: bool = False + enable_tower_connector_lora: bool = False """If `True`, LoRA support for multimodal models will be enabled. Currently, only the qwenvl series models support this feature. 
The default is False.""" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2ba4cb0fbdca0..00b439a6ab9a1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -486,7 +486,7 @@ class EngineArgs: fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras max_cpu_loras: int | None = LoRAConfig.max_cpu_loras lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype - enable_mm_lora: bool = LoRAConfig.enable_mm_lora + enable_tower_connector_lora: bool = LoRAConfig.enable_tower_connector_lora ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override @@ -1008,8 +1008,8 @@ class EngineArgs: **lora_kwargs["lora_dtype"], ) lora_group.add_argument( - "--enable-mm-lora", - **lora_kwargs["enable_mm_lora"], + "--enable-tower-connector-lora", + **lora_kwargs["enable_tower_connector_lora"], ) lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"]) lora_group.add_argument( @@ -1687,7 +1687,7 @@ class EngineArgs: default_mm_loras=self.default_mm_loras, fully_sharded_loras=self.fully_sharded_loras, lora_dtype=self.lora_dtype, - enable_mm_lora=self.enable_mm_lora, + enable_tower_connector_lora=self.enable_tower_connector_lora, max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else None, diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index c308e2958cc25..479dcf88d02b3 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -128,7 +128,7 @@ class LoRAModelManager: if self.supports_mm and vllm_config is not None: model_config: ModelConfig = vllm_config.model_config self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() - if self.lora_config.enable_mm_lora: + if self.lora_config.enable_tower_connector_lora: self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info self.supports_mm_lora = self.supports_mm and hasattr( self.info, "get_num_mm_encoder_tokens" 
From 65e403d335159a0e4829c83ddbc44c612b66c49e Mon Sep 17 00:00:00 2001 From: bk-201 Date: Thu, 11 Dec 2025 16:00:16 +0000 Subject: [PATCH 21/53] remove outdated comment Signed-off-by: bk-201 --- vllm/lora/layers/row_parallel_linear.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index d860359877377..958aa6af36746 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -63,10 +63,6 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. - # Only fuse bias add into GEMM for rank 0 (matches base - # RowParallelLinear behavior). This ensures bias will not get - # added more than once in TP>1 case and matches the numerical - # behavior of the unwrapped layer bias_ = ( None if (self.tp_rank > 0 or self.base_layer.skip_bias_add) From 208dc0c954091085599dbc255bed3828807ba270 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 12 Dec 2025 00:05:07 +0000 Subject: [PATCH 22/53] Fix comments Signed-off-by: Jee Jee Li --- vllm/config/lora.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/config/lora.py b/vllm/config/lora.py index 97915ce925bc8..12693acec0e95 100644 --- a/vllm/config/lora.py +++ b/vllm/config/lora.py @@ -56,8 +56,10 @@ class LoRAConfig: will be automatically assigned to 1-n with the names of the modalities in alphabetic order.""" enable_tower_connector_lora: bool = False - """If `True`, LoRA support for multimodal models will be enabled. Currently, - only the qwenvl series models support this feature. The default is False.""" + """If `True`, LoRA support for the tower (vision encoder) and connector + of multimodal models will be enabled. This is an experimental feature and + currently only supports some MM models such as the Qwen VL series. 
The default + is False.""" def compute_hash(self) -> str: """ From d4f39dc38ab95d4d99ab621db55a49632f7e6d9b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 12 Dec 2025 00:41:48 +0000 Subject: [PATCH 23/53] Move forward Signed-off-by: Jee Jee Li --- vllm/lora/model_manager.py | 49 ++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 479dcf88d02b3..6175afdae46c1 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -103,7 +103,7 @@ class LoRAModelManager: f" {self.model.__class__.__name__}." self.packed_modules_mapping = process_packed_modules_mapping(self.model) - self._init_multimodal_config(vllm_config) + self._maybe_init_mm(vllm_config) self.is_pooling_model = is_pooling_model(self.model) self.packed_modules: dict[str, list[str]] = {} self.modules: dict[str, BaseLayerWithLoRA] = {} @@ -114,7 +114,7 @@ class LoRAModelManager: self.model.lora_manager = self - def _init_multimodal_config(self, vllm_config: VllmConfig | None = None): + def _maybe_init_mm(self, vllm_config: VllmConfig): # Used to indicate whether the model is a multimodal model self.supports_mm: bool = ( supports_multimodal(self.model) @@ -122,24 +122,26 @@ class LoRAModelManager: # text modules (e.g. 
ChatGLM) and hasattr(self.model, "get_mm_mapping") ) - - self.supports_mm_lora = False - - if self.supports_mm and vllm_config is not None: - model_config: ModelConfig = vllm_config.model_config - self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() - if self.lora_config.enable_tower_connector_lora: - self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info - self.supports_mm_lora = self.supports_mm and hasattr( - self.info, "get_num_mm_encoder_tokens" - ) - - if not self.supports_mm_lora: + if not self.supports_mm: return - assert vllm_config is not None, ( - "vllm_config should not be None when supports_mm_lora is True" + self.supports_tower_connector_lora = False + model_config: ModelConfig = vllm_config.model_config + + self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() + if self.lora_config.enable_tower_connector_lora: + self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info + self.supports_tower_connector_lora = self.supports_mm and hasattr( + self.info, "get_num_mm_encoder_tokens" + ) + if not self.supports_tower_connector_lora: + return + logger.warning( + "LoRA for the tower and connector of multimodal models is " + "experimental and may contain bugs. Please report any related issues on " + "GitHub if you encounter them." 
) + mm_budget = MultiModalBudget( model_config, vllm_config.scheduler_config, @@ -318,7 +320,7 @@ class LoRAModelManager: # Default to the main language model wrapper target_wrapper = self.punica_wrapper - if self.supports_mm_lora: + if self.supports_tower_connector_lora: if mapping.type == LoRAMappingType.TOWER and self.mm_mapping.tower_model: target_name = self.mm_mapping.tower_model[0] target_wrapper = self.mm_punica_wrapper_mapping[target_name] @@ -361,8 +363,9 @@ class LoRAModelManager: if self._filter_unsupported_mm_module(module_name): logger.warning( - "Module %s does not support adding LoRA for " - "now and has been ignored.", + "Regarding %s, vLLM currently only supports adding LoRA to" + " language model, {module_name} will be ignored.", + self.model.__class__.__name__, module_name, ) continue @@ -424,7 +427,7 @@ class LoRAModelManager: self._register_packed_modules(module_name) # All lora layers share the same punica_wrapper based on reference. - if self.supports_mm_lora: + if self.supports_tower_connector_lora: new_module.set_mapping(self._get_mm_punica_wrapper(module_name)) else: new_module.set_mapping(self.punica_wrapper) @@ -545,7 +548,7 @@ class LoRAModelManager: """ if self.supports_mm: prefix_lst = self.mm_mapping.connector + self.mm_mapping.tower_model - if self.supports_mm_lora: + if self.supports_tower_connector_lora: return self._get_mm_punica_wrapper(module_name) is None else: return any([module_name.startswith(prefix) for prefix in prefix_lst]) @@ -556,7 +559,7 @@ class LoRAModelManager: Match the corresponding punica_wrapper based on module_name, and return None if lora is not supported for this module. """ - if self.supports_mm_lora: + if self.supports_tower_connector_lora: # Ensure matching by the longest prefix. 
sorted_prefixes = sorted( self.mm_punica_wrapper_mapping.keys(), From 064261071974dcd1608c8d6a1ae178af84b169c8 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 12 Dec 2025 01:58:29 +0000 Subject: [PATCH 24/53] Move forward Signed-off-by: Jee Jee Li --- vllm/lora/model_manager.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 6175afdae46c1..9bffa17b36712 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -80,6 +80,10 @@ class LoRAModelManager: lora_config: the LoRA configuration. """ self.model: SupportsLoRA = model + self.supported_lora_modules = get_supported_lora_modules(self.model) + assert self.supported_lora_modules, "No supported LoRA modules found in" + f" {self.model.__class__.__name__}." + self._registered_adapters: dict[int, LoRAModel] = {} # Dict instead of a set for compatibility with LRUCache. self._active_adapters: dict[int, None] = {} @@ -91,30 +95,31 @@ class LoRAModelManager: self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 self.lora_index_to_id: list[int | None] = [None] * self.lora_slots self.vocab_size = vocab_size - self.punica_wrapper = get_punica_wrapper( - max_num_batched_tokens, - max_batches=self.max_num_seqs, - device=self.device, - max_loras=self.lora_config.max_loras, - ) - - self.supported_lora_modules = get_supported_lora_modules(self.model) - assert self.supported_lora_modules, "No supported LoRA modules found in" - f" {self.model.__class__.__name__}." - self.packed_modules_mapping = process_packed_modules_mapping(self.model) - self._maybe_init_mm(vllm_config) + self.is_pooling_model = is_pooling_model(self.model) self.packed_modules: dict[str, list[str]] = {} self.modules: dict[str, BaseLayerWithLoRA] = {} # Dict instead of a set for compatibility with LRUCache. 
self._last_mapping: LoRAMapping | None = None self._is_3d_moe_model = is_moe_model(self.model) and self.model.is_3d_moe_weight + self._init_punica_wrapper(max_num_batched_tokens, vllm_config) self._create_lora_modules() self.model.lora_manager = self - def _maybe_init_mm(self, vllm_config: VllmConfig) -> None: + def _init_punica_wrapper( + self, max_num_batched_tokens: int, vllm_config: VllmConfig + ) -> None: + self.punica_wrapper = get_punica_wrapper( + max_num_batched_tokens, + max_batches=self.max_num_seqs, + device=self.device, + max_loras=self.lora_config.max_loras, + ) + self._maybe_init_mm(vllm_config) + + def _maybe_init_mm(self, vllm_config: VllmConfig) -> None: # Used to indicate whether the model is a multimodal model self.supports_mm: bool = ( supports_multimodal(self.model) @@ -320,7 +325,7 @@ class LoRAModelManager: # Default to the main language model wrapper target_wrapper = self.punica_wrapper - if self.supports_tower_connector_lora: + if self.supports_mm and self.supports_tower_connector_lora: if mapping.type == LoRAMappingType.TOWER and self.mm_mapping.tower_model: target_name = self.mm_mapping.tower_model[0] target_wrapper = self.mm_punica_wrapper_mapping[target_name] @@ -363,7 +368,7 @@ class LoRAModelManager: if self._filter_unsupported_mm_module(module_name): logger.warning( - "Regarding %s, vLLM currently only supports adding LoRA to" - " language model, {module_name} will be ignored.", + "Regarding %s, vLLM currently only supports adding LoRA to the" + " language model, %s will be ignored.", self.model.__class__.__name__, module_name, @@ -427,7 +432,7 @@ class LoRAModelManager: self._register_packed_modules(module_name) # All lora layers share the same punica_wrapper based on reference.
- if self.supports_tower_connector_lora: + if self.supports_mm and self.supports_tower_connector_lora: new_module.set_mapping(self._get_mm_punica_wrapper(module_name)) else: new_module.set_mapping(self.punica_wrapper) From 5e78570cce8be9af020c9b40996c68cf0ffb199a Mon Sep 17 00:00:00 2001 From: prashanth058 Date: Thu, 11 Dec 2025 23:55:32 -0600 Subject: [PATCH 25/53] update packed modules mapping (#11) Signed-off-by: prashanth058 --- vllm/lora/layers/column_parallel_linear.py | 7 +------ vllm/model_executor/models/qwen2_5_vl.py | 1 + vllm/model_executor/models/qwen3_vl.py | 1 + 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 8273046bb6ecd..904025901fba7 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -340,12 +340,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): packed_modules_list: list, model_config: PretrainedConfig | None = None, ) -> bool: - # Vision tower QKV has packed_modules_list=[] (already packed in checkpoint) - # Language models have packed_modules_list=[module_name] - # (single LoRA for qkv_proj) - return type(source_layer) is QKVParallelLinear and ( - len(packed_modules_list) <= 1 - ) + return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 1 class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 69fac625dde4c..76e2627b65ca6 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1061,6 +1061,7 @@ class Qwen2_5_VLForConditionalGeneration( packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], + "qkv": ["qkv"], # For vision tower's already-packed QKV } # To ensure correct weight loading and mapping. 
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index eca21bad718f0..8a34e6e77faf6 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1201,6 +1201,7 @@ class Qwen3VLForConditionalGeneration( "gate_proj", "up_proj", ], + "qkv": ["qkv"], # For vision tower's already-packed QKV } supports_encoder_tp_data = True From 1cb35461fc864288b1b3c31f4e389563c994c9ba Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 12 Dec 2025 07:00:24 +0000 Subject: [PATCH 26/53] Move forward Signed-off-by: Jee Jee Li --- docs/features/lora.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/features/lora.md b/docs/features/lora.md index d42a3cef76bde..f38bf4088db52 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -275,6 +275,10 @@ The new format of `--lora-modules` is mainly to support the display of parent mo } ``` +## LoRA Support for Tower and Connector of Multi-Modal Model + +Currently, vLLM experimentally supports LoRA for the Tower and Connector components of multi-modal models. To enable this feature, you need to implement the corresponding token helper functions for the tower and connector. For more details on the rationale behind this approach, please refer to [PR 26674](https://github.com/vllm-project/vllm/pull/26674). We welcome contributions to extend LoRA support to additional models' tower and connector. + ## Default LoRA Models For Multimodal Models Some models, e.g., [Granite Speech](https://huggingface.co/ibm-granite/granite-speech-3.3-8b) and [Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) multimodal, contain LoRA adapter(s) that are expected to always be applied when a given modality is present. 
This can be a bit tedious to manage with the above approaches, as it requires the user to send the `LoRARequest` (offline) or to filter requests between the base model and LoRA model (server) depending on the content of the request's multimodal data. @@ -347,8 +351,11 @@ vllm serve ibm-granite/granite-speech-3.3-2b \ --max-lora-rank 64 ``` + + Note: Default multimodal LoRAs are currently only available for `.generate` and chat completions. + ## Using Tips ### Configuring `max_lora_rank` From 35acd22a5de28f1e3c3e964db939dcd7259dc1cc Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 12 Dec 2025 08:53:09 +0000 Subject: [PATCH 27/53] Move forward Signed-off-by: Jee Jee Li --- vllm/config/lora.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config/lora.py b/vllm/config/lora.py index 12693acec0e95..81ee135a9a4b0 100644 --- a/vllm/config/lora.py +++ b/vllm/config/lora.py @@ -78,6 +78,7 @@ class LoRAConfig: factors.append(self.max_loras) factors.append(self.fully_sharded_loras) factors.append(self.lora_dtype) + factors.append(self.enable_tower_connector_lora) hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str From 6a3f0a5abc1509a31fa8cc2a34fc1b9df474abad Mon Sep 17 00:00:00 2001 From: bk-201 Date: Sat, 13 Dec 2025 16:45:54 +0000 Subject: [PATCH 28/53] fix the issue with the MM token count Signed-off-by: bk-201 --- vllm/lora/layers/base_linear.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index 0ed3508510914..fc79aec5d650f 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -122,7 +122,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # Store original shape for later reshaping 
original_shape = output.shape if output.ndim == 3 else None # In transformers backend, x and output have extra batch dimension like @@ -138,7 +137,8 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): if not current_platform.can_update_inplace(): output = lora_output - # Restore original shape if it was flattened + # Reshape the flattened output back to its original shape, + # as some MM encoders cannot handle flattened inputs. if original_shape is not None: output = output.reshape(original_shape) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 17f77000f5305..d4f214a20595c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2157,8 +2157,13 @@ class GPUModelRunner( req_idx = self.input_batch.req_id_to_index[req_id] lora_id = int(self.input_batch.request_lora_mapping[req_idx]) + # Prefer pos_info.is_embed to count actual MM embedding tokens. + # pos_info.length may overcount (e.g., special tokens in Qwen-VL). + # Fall back to length if is_embed is None. num_tokens = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] pos_info.length + if pos_info.is_embed is None + else pos_info.is_embed.sum() ) prompt_lora_mapping.append(lora_id) token_lora_mapping.extend([lora_id] * num_tokens) From c1bb71ef6bc7665fa9d8d08447d26da720a088cd Mon Sep 17 00:00:00 2001 From: bk-201 Date: Sat, 13 Dec 2025 17:03:47 +0000 Subject: [PATCH 29/53] fix pre-commit Signed-off-by: bk-201 --- docs/features/lora.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/features/lora.md b/docs/features/lora.md index f38bf4088db52..eb9f446385431 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -351,11 +351,8 @@ vllm serve ibm-granite/granite-speech-3.3-2b \ --max-lora-rank 64 ``` - - Note: Default multimodal LoRAs are currently only available for `.generate` and chat completions. 
- ## Using Tips ### Configuring `max_lora_rank` From 58d2c47b9a65e43b464a6737caad0d4bce4e2375 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Mon, 15 Dec 2025 07:49:52 +0000 Subject: [PATCH 30/53] update punica_wrapper_mapping Signed-off-by: bk-201 --- vllm/lora/model_manager.py | 98 +++++++++++++++++------------- vllm/v1/worker/gpu_model_runner.py | 4 +- 2 files changed, 56 insertions(+), 46 deletions(-) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 9bffa17b36712..33e147195a6f7 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -42,6 +42,7 @@ from vllm.v1.worker.utils import MultiModalBudget logger = init_logger(__name__) T = TypeVar("T") +DEFAULT_WRAPPER_KEY = "__default__" class AdapterLRUCache(LRUCache[int, T]): @@ -117,6 +118,11 @@ class LoRAModelManager: device=self.device, max_loras=self.lora_config.max_loras, ) + + self.punica_wrapper_mapping: dict[str, PunicaWrapperBase] = { + DEFAULT_WRAPPER_KEY: self.punica_wrapper + } + self._maybe_init_mm(vllm_config) def _maybe_init_mm(self, vllm_config: VllmConfig) -> None: @@ -132,8 +138,8 @@ class LoRAModelManager: self.supports_tower_connector_lora = False model_config: ModelConfig = vllm_config.model_config - self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() + if self.lora_config.enable_tower_connector_lora: self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info self.supports_tower_connector_lora = self.supports_mm and hasattr( @@ -153,24 +159,26 @@ class LoRAModelManager: MULTIMODAL_REGISTRY, ) limit_per_prompt: int = max(self.info.get_allowed_mm_limits().values()) - - # For vision tower num_encoder_tokens = self.info.get_num_mm_encoder_tokens( mm_budget.get_encoder_budget() ) - self.mm_punica_wrapper_mapping = { - name: get_punica_wrapper( + + self.punica_wrapper_mapping = {} + + # Tower wrappers + for name in self.mm_mapping.tower_model: + self.punica_wrapper_mapping[name] = get_punica_wrapper( num_encoder_tokens, 
max_batches=self.max_num_seqs * limit_per_prompt, device=self.device, max_loras=self.lora_config.max_loras, ) - for name in self.mm_mapping.tower_model - } - # For language model - self.mm_punica_wrapper_mapping.update( - {self.mm_mapping.language_model[0]: self.punica_wrapper} + + # Language wrapper + self.punica_wrapper_mapping[self.mm_mapping.language_model[0]] = ( + self.punica_wrapper ) + # Use wrapper for connector if present. if self.mm_mapping.connector: if hasattr(self.info, "get_num_mm_connector_tokens"): @@ -183,7 +191,7 @@ class LoRAModelManager: device=self.device, max_loras=self.lora_config.max_loras, ) - self.mm_punica_wrapper_mapping.update( + self.punica_wrapper_mapping.update( { name: connector_punica_wrapper for name in self.mm_mapping.connector @@ -323,20 +331,19 @@ class LoRAModelManager: def _set_adapter_mapping(self, mapping: LoRAMapping) -> None: # Default to the main language model wrapper - target_wrapper = self.punica_wrapper - - if self.supports_mm and self.supports_tower_connector_lora: + if not (self.supports_mm and self.supports_tower_connector_lora): + target_wrapper = self.punica_wrapper_mapping[DEFAULT_WRAPPER_KEY] + else: if mapping.type == LoRAMappingType.TOWER and self.mm_mapping.tower_model: - target_name = self.mm_mapping.tower_model[0] - target_wrapper = self.mm_punica_wrapper_mapping[target_name] + target_prefix = self.mm_mapping.tower_model[0] elif ( mapping.type == LoRAMappingType.CONNECTOR and self.mm_mapping.connector ): - target_name = self.mm_mapping.connector[0] - target_wrapper = self.mm_punica_wrapper_mapping[target_name] + target_prefix = self.mm_mapping.connector[0] else: - target_name = self.mm_mapping.language_model[0] - target_wrapper = self.mm_punica_wrapper_mapping[target_name] + target_prefix = self.mm_mapping.language_model[0] + + target_wrapper = self.punica_wrapper_mapping[target_prefix] target_wrapper.update_metadata( mapping, @@ -369,7 +376,7 @@ class LoRAModelManager: if 
self._filter_unsupported_mm_module(module_name): logger.warning( "Regarding %s, vLLM currently only supports adding LoRA to" - " language model, {module_name} will be ignored.", + " language model, %s will be ignored.", self.model.__class__.__name__, module_name, ) @@ -432,10 +439,10 @@ class LoRAModelManager: self._register_packed_modules(module_name) # All lora layers share the same punica_wrapper based on reference. - if self.supports_mm and self.supports_tower_connector_lora: - new_module.set_mapping(self._get_mm_punica_wrapper(module_name)) - else: - new_module.set_mapping(self.punica_wrapper) + wrapper = self._get_punica_wrapper_for_module(module_name) + if wrapper is None: + continue + new_module.set_mapping(wrapper) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA), ( @@ -551,31 +558,36 @@ class LoRAModelManager: language model. LoRA for other modules, such as the vision tower, will be filtered out. """ - if self.supports_mm: - prefix_lst = self.mm_mapping.connector + self.mm_mapping.tower_model - if self.supports_tower_connector_lora: - return self._get_mm_punica_wrapper(module_name) is None - else: - return any([module_name.startswith(prefix) for prefix in prefix_lst]) - return False + if not self.supports_mm: + return False - def _get_mm_punica_wrapper(self, module_name: str) -> PunicaWrapperBase | None: + if self.supports_tower_connector_lora: + return self._get_punica_wrapper_for_module(module_name) is None + + prefix_lst = self.mm_mapping.connector + self.mm_mapping.tower_model + return any(module_name.startswith(prefix) for prefix in prefix_lst) + + def _get_punica_wrapper_for_module( + self, module_name: str + ) -> PunicaWrapperBase | None: """ Match the corresponding punica_wrapper based on module_name, and return None if lora is not supported for this module. 
""" - if self.supports_tower_connector_lora: + best_prefix = None + for prefix in self.punica_wrapper_mapping: + if prefix == DEFAULT_WRAPPER_KEY: + continue # Ensure matching by the longest prefix. - sorted_prefixes = sorted( - self.mm_punica_wrapper_mapping.keys(), - key=lambda x: len(x), - reverse=True, - ) + if module_name.startswith(prefix) and ( + best_prefix is None or len(prefix) > len(best_prefix) + ): + best_prefix = prefix - for prefix in sorted_prefixes: - if module_name.startswith(prefix): - return self.mm_punica_wrapper_mapping[prefix] - return None + if best_prefix is not None: + return self.punica_wrapper_mapping[best_prefix] + + return self.punica_wrapper_mapping.get(DEFAULT_WRAPPER_KEY) def _register_packed_modules(self, module_full_name: str) -> None: parts = module_full_name.split(".") diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d4f214a20595c..ce5bc48ebaafb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2161,9 +2161,7 @@ class GPUModelRunner( # pos_info.length may overcount (e.g., special tokens in Qwen-VL). # Fall back to length if is_embed is None. 
num_tokens = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] - pos_info.length - if pos_info.is_embed is None - else pos_info.is_embed.sum() + pos_info.get_num_embeds() ) prompt_lora_mapping.append(lora_id) token_lora_mapping.extend([lora_id] * num_tokens) From 57917818a444a9b5d7e53322e323e701a60ce191 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Tue, 16 Dec 2025 16:30:20 +0000 Subject: [PATCH 31/53] fix bug Signed-off-by: bk-201 --- vllm/v1/worker/gpu_model_runner.py | 43 +++++------------------------- 1 file changed, 7 insertions(+), 36 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0c6deaad3deef..9af9aa7ad2a2c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -593,11 +593,13 @@ class GPUModelRunner( ) # Multimodal LoRA support - if self.supports_mm_inputs: + self.enable_tower_connector_lora = False + if self.supports_mm_inputs and self.lora_config: self.info = self.mm_registry.create_processor(self.model_config).info - self.supports_mm_lora = hasattr(self.info, "get_num_mm_encoder_tokens") - else: - self.supports_mm_lora = False + self.enable_tower_connector_lora = ( + hasattr(self.info, "get_num_mm_encoder_tokens") + and self.lora_config.enable_tower_connector_lora + ) # Pre-allocated tensor for copying valid sampled token counts to CPU, # with dedicated stream for overlapping and event for coordination. @@ -2148,7 +2150,7 @@ class GPUModelRunner( # encoder outputs. 
model = cast(SupportsMultiModal, self.model) - if self.lora_config and self.supports_mm_lora: + if self.enable_tower_connector_lora: # Build LoRA mappings independently for encoder inputs # (encoder batch structure is different from main batch) prompt_lora_mapping = [] @@ -2371,37 +2373,6 @@ class GPUModelRunner( return mm_embeds, is_mm_embed - def _extract_encoder_inputs( - self, - scheduler_output: "SchedulerOutput", - ) -> dict[str, torch.Tensor]: - """Extract encoder inputs for encoder-decoder models. - - This method extracts multimodal input features from scheduled encoder - inputs and formats them for the encoder-decoder model forward pass. - """ - # Batch the multi-modal inputs using the helper method. - mm_kwargs, _, _ = self._batch_mm_kwargs_from_scheduler(scheduler_output) - - if not mm_kwargs: - return {} - - # Group MM kwargs by modality and extract features - model = cast(SupportsMultiModal, self.model) - encoder_features = {} - for _, _, mm_kwargs_group in group_mm_kwargs_by_modality( - mm_kwargs, - device=self.device, - pin_memory=self.pin_memory, - merge_by_field_config=model.merge_by_field_config, - ): - # Add the grouped features to encoder_features dict - # This allows the model to receive them as kwargs (e.g., - # input_features=...) - encoder_features.update(mm_kwargs_group) - - return encoder_features - def get_model(self) -> nn.Module: # get raw model out of the cudagraph wrapper. 
if isinstance(self.model, (CUDAGraphWrapper, UBatchWrapper)): From da0adea88e6cab78128c0962d47c0de1fb27ccf9 Mon Sep 17 00:00:00 2001 From: Anexdeus <5142168@mail.ru> Date: Tue, 16 Dec 2025 22:40:40 +0300 Subject: [PATCH 32/53] added abstract methods to the base class --- vllm/multimodal/processing.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 0390773783961..f337bc9b0f7ba 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1389,6 +1389,22 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """Given the HF-processed data, output the metadata of each field.""" raise NotImplementedError + @abstractmethod + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + """Given the number of image tokens, output the number of multi-modal encoder tokens""" + raise NotImplementedError + + @abstractmethod + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + """Given the number of vision tokens, output the number of multi-modal connector tokens""" + raise NotImplementedError + @abstractmethod def _get_prompt_updates( self, From 36121c6db040732873b03b3c726d1b58097689e8 Mon Sep 17 00:00:00 2001 From: Anexdeus <5142168@mail.ru> Date: Wed, 17 Dec 2025 01:31:34 +0300 Subject: [PATCH 33/53] fixed property bug in processor and added abstract methods in BaseProcessingInfo --- vllm/model_executor/models/qwen2_5_vl.py | 19 ----------- vllm/model_executor/models/qwen2_vl.py | 19 +++++++++++ vllm/multimodal/processing.py | 40 ++++++++++++++++++++++-- 3 files changed, 57 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 0799f4500a351..02fac0b78a4b4 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1007,25 +1007,6 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): for 
modality in ("image", "video") ] - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - - return num_image_tokens * merge_size**2 - - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - return num_vision_tokens // merge_size**2 - @MULTIMODAL_REGISTRY.register_processor( Qwen2_5_VLMultiModalProcessor, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 6e5560b945f2f..44f076e8d70f3 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1132,6 +1132,25 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]) self.info.get_hf_config().vision_config.spatial_merge_size )(hf_inputs) + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.info.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + + return num_image_tokens * merge_size**2 + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.info.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + return num_vision_tokens // merge_size**2 + @MULTIMODAL_REGISTRY.register_processor( Qwen2VLMultiModalProcessor, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index f337bc9b0f7ba..99ecaf61badd2 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1185,6 +1185,32 @@ class BaseProcessingInfo: """ return self.ctx.get_hf_processor(**kwargs) + @abstractmethod + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + """ + Implement this function 
to enable LoRA support + for the tower module of the multi-modal model + + Given the number of image tokens, output the number of multi-modal encoder tokens + """ + raise NotImplementedError + + @abstractmethod + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + """ + Implement this function to enable LoRA support + for the connector module of the multi-modal model + + Given the number of vision tokens, output the number of multi-modal connector tokens + """ + raise NotImplementedError + @abstractmethod def get_supported_mm_limits(self) -> Mapping[str, int | None]: """ @@ -1394,7 +1420,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): self, num_image_tokens: int, ) -> int: - """Given the number of image tokens, output the number of multi-modal encoder tokens""" + """ + Implement this function to enable LoRA support + for the tower module of the multi-modal model + + Given the number of image tokens, output the number of multi-modal encoder tokens + """ raise NotImplementedError @abstractmethod @@ -1402,7 +1433,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): self, num_vision_tokens: int, ) -> int: - """Given the number of vision tokens, output the number of multi-modal connector tokens""" + """ + Implement this function to enable LoRA support + for the connector module of the multi-modal model + + Given the number of vision tokens, output the number of multi-modal connector tokens + """ raise NotImplementedError @abstractmethod From 3d39188d38dc93030f81f4c011620b856b32cb78 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 17 Dec 2025 04:24:25 +0000 Subject: [PATCH 34/53] Fix Signed-off-by: Jee Jee Li --- tests/lora/test_lora_manager.py | 19 ++++++++++++------- vllm/lora/model_manager.py | 18 ++++++------------ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 50f17ced5dd74..b4bb23a9e6b4f 100644 --- a/tests/lora/test_lora_manager.py +++ 
b/tests/lora/test_lora_manager.py @@ -18,6 +18,7 @@ from vllm.lora.layers import ( from vllm.lora.lora_model import LoRAModel from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.model_manager import ( + DEFAULT_WRAPPER_KEY, LoRAMapping, LoRAModelManager, LRUCacheLoRAModelManager, @@ -183,9 +184,8 @@ def test_lora_model_manager(dist_init, dummy_model, device): assert manager.activate_adapter(2) assert manager.lora_index_to_id[0] == 3 assert manager.lora_index_to_id[1] == 2 - assert manager.device == device - assert manager.punica_wrapper.device == device + assert manager.punica_wrapper_mapping.get(DEFAULT_WRAPPER_KEY).device == device assert hasattr(manager, "supported_lora_modules") assert sorted(manager.supported_lora_modules) == [ "dense1", @@ -278,8 +278,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): assert manager.remove_adapter(3) with pytest.raises(ValueError): assert manager.pin_adapter(3) - - assert manager.punica_wrapper.device == device + assert manager.punica_wrapper_mapping.get(DEFAULT_WRAPPER_KEY).device == device assert manager.device == device @@ -402,7 +401,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): assert manager.remove_oldest_adapter() assert set(manager.list_adapters()) == {1} - assert manager.punica_wrapper.device == device + assert manager.punica_wrapper_mapping.get(DEFAULT_WRAPPER_KEY).device == device assert manager.device == device @@ -514,7 +513,10 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa ) assert worker_adapter_manager.device == device - assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device + punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get( + DEFAULT_WRAPPER_KEY + ) + assert punica_wrapper.device == device @pytest.mark.parametrize("device", DEVICES) @@ -618,7 +620,10 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, 
tmp_path ) assert worker_adapter_manager.device == device - assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device + punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get( + DEFAULT_WRAPPER_KEY + ) + assert punica_wrapper.device == device @pytest.mark.parametrize("device", DEVICES) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 33e147195a6f7..c9297d0071f13 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -42,7 +42,7 @@ from vllm.v1.worker.utils import MultiModalBudget logger = init_logger(__name__) T = TypeVar("T") -DEFAULT_WRAPPER_KEY = "__default__" +DEFAULT_WRAPPER_KEY = "language_model" class AdapterLRUCache(LRUCache[int, T]): @@ -112,15 +112,15 @@ class LoRAModelManager: def _init_punica_wrapper( self, max_num_batched_tokens: int, vllm_config: VllmConfig ) -> None: - self.punica_wrapper = get_punica_wrapper( + llm_punica_wrapper = get_punica_wrapper( max_num_batched_tokens, max_batches=self.max_num_seqs, device=self.device, max_loras=self.lora_config.max_loras, ) - + # NOTE This assumes the existence of a language model LoRA self.punica_wrapper_mapping: dict[str, PunicaWrapperBase] = { - DEFAULT_WRAPPER_KEY: self.punica_wrapper + DEFAULT_WRAPPER_KEY: llm_punica_wrapper } self._maybe_init_mm(vllm_config) @@ -163,8 +163,8 @@ class LoRAModelManager: mm_budget.get_encoder_budget() ) - self.punica_wrapper_mapping = {} - + # Only one language model can be included in the model. + assert len(self.mm_mapping.language_model == 1) # Tower wrappers for name in self.mm_mapping.tower_model: self.punica_wrapper_mapping[name] = get_punica_wrapper( @@ -173,12 +173,6 @@ class LoRAModelManager: device=self.device, max_loras=self.lora_config.max_loras, ) - - # Language wrapper - self.punica_wrapper_mapping[self.mm_mapping.language_model[0]] = ( - self.punica_wrapper - ) - # Use wrapper for connector if present. 
if self.mm_mapping.connector: if hasattr(self.info, "get_num_mm_connector_tokens"): From 1c8e3c4486b0d5ffda82c784e36a587fc590edf6 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Wed, 17 Dec 2025 06:57:34 +0000 Subject: [PATCH 35/53] fix pre-commit Signed-off-by: bk-201 --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8cb814a6d053f..affe295e747b8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2185,7 +2185,7 @@ class GPUModelRunner( # pos_info.length may overcount (e.g., special tokens in Qwen-VL). # Fall back to length if is_embed is None. num_tokens = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] - pos_info.get_num_embeds() + pos_info.get_num_embeds ) prompt_lora_mapping.append(lora_id) token_lora_mapping.extend([lora_id] * num_tokens) From df3ec2210655b6787068952ea19d7e41d34b5db0 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Thu, 18 Dec 2025 16:45:41 +0000 Subject: [PATCH 36/53] remove hacky code Signed-off-by: bk-201 --- vllm/lora/model_manager.py | 131 +++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 70 deletions(-) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index c9297d0071f13..3805e3c72f9f7 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -3,6 +3,7 @@ import math from collections.abc import Callable +from dataclasses import dataclass from typing import TypeVar import regex as re @@ -42,7 +43,13 @@ from vllm.v1.worker.utils import MultiModalBudget logger = init_logger(__name__) T = TypeVar("T") -DEFAULT_WRAPPER_KEY = "language_model" +DEFAULT_LANGUAGE_WRAPPER_KEY = "language_model" + + +@dataclass(frozen=True) +class LoRATarget: + wrapper: PunicaWrapperBase + prefix: str class AdapterLRUCache(LRUCache[int, T]): @@ -112,17 +119,16 @@ class LoRAModelManager: def _init_punica_wrapper( self, 
max_num_batched_tokens: int, vllm_config: VllmConfig ) -> None: + self._lora_targets: list[tuple[str, PunicaWrapperBase]] = [] llm_punica_wrapper = get_punica_wrapper( max_num_batched_tokens, max_batches=self.max_num_seqs, device=self.device, max_loras=self.lora_config.max_loras, ) - # NOTE This assumes the existence of a language model LoRA - self.punica_wrapper_mapping: dict[str, PunicaWrapperBase] = { - DEFAULT_WRAPPER_KEY: llm_punica_wrapper - } + # NOTE This assumes the existence of a language model LoRA + self._lora_targets.append((DEFAULT_LANGUAGE_WRAPPER_KEY, llm_punica_wrapper)) self._maybe_init_mm(vllm_config) def _maybe_init_mm(self, vllm_config: VllmConfig) -> None: @@ -164,15 +170,27 @@ class LoRAModelManager: ) # Only one language model can be included in the model. - assert len(self.mm_mapping.language_model == 1) + assert len(self.mm_mapping.language_model) == 1 + + # Update prefix of language model + lm_prefix = ( + self.mm_mapping.language_model[0] + if self.supports_mm + else DEFAULT_LANGUAGE_WRAPPER_KEY + ) + _, llm_punica_wrapper = self._lora_targets.pop() + self._lora_targets.append((lm_prefix, llm_punica_wrapper)) + # Tower wrappers - for name in self.mm_mapping.tower_model: - self.punica_wrapper_mapping[name] = get_punica_wrapper( - num_encoder_tokens, - max_batches=self.max_num_seqs * limit_per_prompt, - device=self.device, - max_loras=self.lora_config.max_loras, - ) + tower_punica_wrapper = get_punica_wrapper( + num_encoder_tokens, + max_batches=self.max_num_seqs * limit_per_prompt, + device=self.device, + max_loras=self.lora_config.max_loras, + ) + for prefix in self.mm_mapping.tower_model: + self._lora_targets.append((prefix, tower_punica_wrapper)) + # Use wrapper for connector if present. 
if self.mm_mapping.connector: if hasattr(self.info, "get_num_mm_connector_tokens"): @@ -185,12 +203,8 @@ class LoRAModelManager: device=self.device, max_loras=self.lora_config.max_loras, ) - self.punica_wrapper_mapping.update( - { - name: connector_punica_wrapper - for name in self.mm_mapping.connector - } - ) + for prefix in self.mm_mapping.connector: + self._lora_targets.append((prefix, connector_punica_wrapper)) else: logger.warning_once( "Connector LoRA support disabled: model does not implement " @@ -198,6 +212,11 @@ class LoRAModelManager: "determine the connector's token budget for LoRA operations." ) + # Longest-prefix-first + self._lora_targets = sorted( + self._lora_targets, key=lambda x: len(x[0]), reverse=True + ) + def __len__(self) -> int: return len(self._registered_adapters) @@ -326,20 +345,22 @@ class LoRAModelManager: def _set_adapter_mapping(self, mapping: LoRAMapping) -> None: # Default to the main language model wrapper if not (self.supports_mm and self.supports_tower_connector_lora): - target_wrapper = self.punica_wrapper_mapping[DEFAULT_WRAPPER_KEY] + target_prefix = ( + self.mm_mapping.language_model[0] + if self.supports_mm + else DEFAULT_LANGUAGE_WRAPPER_KEY + ) + elif mapping.type == LoRAMappingType.TOWER and self.mm_mapping.tower_model: + target_prefix = self.mm_mapping.tower_model[0] + elif mapping.type == LoRAMappingType.CONNECTOR and self.mm_mapping.connector: + target_prefix = self.mm_mapping.connector[0] else: - if mapping.type == LoRAMappingType.TOWER and self.mm_mapping.tower_model: - target_prefix = self.mm_mapping.tower_model[0] - elif ( - mapping.type == LoRAMappingType.CONNECTOR and self.mm_mapping.connector - ): - target_prefix = self.mm_mapping.connector[0] - else: - target_prefix = self.mm_mapping.language_model[0] + target_prefix = self.mm_mapping.language_model[0] - target_wrapper = self.punica_wrapper_mapping[target_prefix] + target = self._get_lora_target(target_prefix) + assert target is not None - 
target_wrapper.update_metadata( + target.wrapper.update_metadata( mapping, self.lora_index_to_id, self.lora_slots + 1, @@ -367,7 +388,8 @@ class LoRAModelManager: if not self._match_target_modules(module_name): continue - if self._filter_unsupported_mm_module(module_name): + target = self._get_lora_target(module_name) + if target is None: logger.warning( "Regarding %s, vLLM currently only supports adding LoRA to" " language model, %s will be ignored.", @@ -433,10 +455,7 @@ class LoRAModelManager: self._register_packed_modules(module_name) # All lora layers share the same punica_wrapper based on reference. - wrapper = self._get_punica_wrapper_for_module(module_name) - if wrapper is None: - continue - new_module.set_mapping(wrapper) + new_module.set_mapping(target.wrapper) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA), ( @@ -457,7 +476,7 @@ class LoRAModelManager: if ( not self._match_target_modules(module_name) or not isinstance(module, BaseLayerWithLoRA) - or self._filter_unsupported_mm_module(module_name) + or self._get_lora_target(module_name) is None ): continue parts = module_name.split(".") @@ -546,42 +565,14 @@ class LoRAModelManager: for target_module in self.supported_lora_modules ) - def _filter_unsupported_mm_module(self, module_name: str) -> bool: + def _get_lora_target(self, module_name: str) -> LoRATarget | None: """ - Regarding multimodal models, vLLM currently only supports adding LoRA to - language model. LoRA for other modules, such as the vision tower, will - be filtered out. + Determine whether this module supports LoRA and which wrapper to use. 
""" - if not self.supports_mm: - return False - - if self.supports_tower_connector_lora: - return self._get_punica_wrapper_for_module(module_name) is None - - prefix_lst = self.mm_mapping.connector + self.mm_mapping.tower_model - return any(module_name.startswith(prefix) for prefix in prefix_lst) - - def _get_punica_wrapper_for_module( - self, module_name: str - ) -> PunicaWrapperBase | None: - """ - Match the corresponding punica_wrapper based on module_name, - and return None if lora is not supported for this module. - """ - best_prefix = None - for prefix in self.punica_wrapper_mapping: - if prefix == DEFAULT_WRAPPER_KEY: - continue - # Ensure matching by the longest prefix. - if module_name.startswith(prefix) and ( - best_prefix is None or len(prefix) > len(best_prefix) - ): - best_prefix = prefix - - if best_prefix is not None: - return self.punica_wrapper_mapping[best_prefix] - - return self.punica_wrapper_mapping.get(DEFAULT_WRAPPER_KEY) + for prefix, wrapper in self._lora_targets: + if module_name.startswith(prefix): + return LoRATarget(wrapper=wrapper, prefix=prefix) + return None def _register_packed_modules(self, module_full_name: str) -> None: parts = module_full_name.split(".") From 764aa451403a691197a7f4c8a3ccc85e493de304 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Fri, 19 Dec 2025 16:57:25 +0000 Subject: [PATCH 37/53] fix bug Signed-off-by: bk-201 --- tests/lora/test_lora_manager.py | 21 ++++++++++++----- vllm/lora/model_manager.py | 40 ++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index b4bb23a9e6b4f..d401db6fdde2a 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -18,7 +18,7 @@ from vllm.lora.layers import ( from vllm.lora.lora_model import LoRAModel from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.model_manager import ( - DEFAULT_WRAPPER_KEY, + 
DEFAULT_LANGUAGE_WRAPPER_KEY, LoRAMapping, LoRAModelManager, LRUCacheLoRAModelManager, @@ -185,7 +185,10 @@ def test_lora_model_manager(dist_init, dummy_model, device): assert manager.lora_index_to_id[0] == 3 assert manager.lora_index_to_id[1] == 2 assert manager.device == device - assert manager.punica_wrapper_mapping.get(DEFAULT_WRAPPER_KEY).device == device + assert ( + manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device + == device + ) assert hasattr(manager, "supported_lora_modules") assert sorted(manager.supported_lora_modules) == [ "dense1", @@ -278,7 +281,10 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): assert manager.remove_adapter(3) with pytest.raises(ValueError): assert manager.pin_adapter(3) - assert manager.punica_wrapper_mapping.get(DEFAULT_WRAPPER_KEY).device == device + assert ( + manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device + == device + ) assert manager.device == device @@ -401,7 +407,10 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): assert manager.remove_oldest_adapter() assert set(manager.list_adapters()) == {1} - assert manager.punica_wrapper_mapping.get(DEFAULT_WRAPPER_KEY).device == device + assert ( + manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device + == device + ) assert manager.device == device @@ -514,7 +523,7 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa assert worker_adapter_manager.device == device punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get( - DEFAULT_WRAPPER_KEY + DEFAULT_LANGUAGE_WRAPPER_KEY ) assert punica_wrapper.device == device @@ -621,7 +630,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path assert worker_adapter_manager.device == device punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get( - DEFAULT_WRAPPER_KEY + DEFAULT_LANGUAGE_WRAPPER_KEY ) assert 
punica_wrapper.device == device diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 3805e3c72f9f7..f685db14af17c 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math +from collections import OrderedDict from collections.abc import Callable from dataclasses import dataclass from typing import TypeVar @@ -119,7 +120,7 @@ class LoRAModelManager: def _init_punica_wrapper( self, max_num_batched_tokens: int, vllm_config: VllmConfig ) -> None: - self._lora_targets: list[tuple[str, PunicaWrapperBase]] = [] + self.punica_wrapper_mapping: OrderedDict[str, PunicaWrapperBase] = OrderedDict() llm_punica_wrapper = get_punica_wrapper( max_num_batched_tokens, max_batches=self.max_num_seqs, @@ -128,7 +129,9 @@ class LoRAModelManager: ) # NOTE This assumes the existence of a language model LoRA - self._lora_targets.append((DEFAULT_LANGUAGE_WRAPPER_KEY, llm_punica_wrapper)) + self.punica_wrapper_mapping.setdefault( + DEFAULT_LANGUAGE_WRAPPER_KEY, llm_punica_wrapper + ) self._maybe_init_mm(vllm_config) def _maybe_init_mm(self, vllm_config: VllmConfig) -> None: @@ -178,8 +181,10 @@ class LoRAModelManager: if self.supports_mm else DEFAULT_LANGUAGE_WRAPPER_KEY ) - _, llm_punica_wrapper = self._lora_targets.pop() - self._lora_targets.append((lm_prefix, llm_punica_wrapper)) + llm_punica_wrapper = self.punica_wrapper_mapping.pop( + DEFAULT_LANGUAGE_WRAPPER_KEY + ) + self.punica_wrapper_mapping[lm_prefix] = llm_punica_wrapper # Tower wrappers tower_punica_wrapper = get_punica_wrapper( @@ -189,7 +194,7 @@ class LoRAModelManager: max_loras=self.lora_config.max_loras, ) for prefix in self.mm_mapping.tower_model: - self._lora_targets.append((prefix, tower_punica_wrapper)) + self.punica_wrapper_mapping[prefix] = tower_punica_wrapper # Use wrapper for connector if present. 
if self.mm_mapping.connector: @@ -204,7 +209,7 @@ class LoRAModelManager: max_loras=self.lora_config.max_loras, ) for prefix in self.mm_mapping.connector: - self._lora_targets.append((prefix, connector_punica_wrapper)) + self.punica_wrapper_mapping[prefix] = connector_punica_wrapper else: logger.warning_once( "Connector LoRA support disabled: model does not implement " @@ -213,8 +218,12 @@ class LoRAModelManager: ) # Longest-prefix-first - self._lora_targets = sorted( - self._lora_targets, key=lambda x: len(x[0]), reverse=True + self.punica_wrapper_mapping = OrderedDict( + sorted( + self.punica_wrapper_mapping.items(), + key=lambda x: len(x[0]), + reverse=True, + ) ) def __len__(self) -> int: @@ -569,9 +578,20 @@ class LoRAModelManager: """ Determine whether this module supports LoRA and which wrapper to use. """ - for prefix, wrapper in self._lora_targets: - if module_name.startswith(prefix): + # For language Model (early return) + if not self.supports_mm: + wrapper = list(self.punica_wrapper_mapping.values())[0] + return LoRATarget(wrapper=wrapper, prefix=DEFAULT_LANGUAGE_WRAPPER_KEY) + + # For multimodal model + for prefix, wrapper in self.punica_wrapper_mapping.items(): + is_language_model = ( + prefix == DEFAULT_LANGUAGE_WRAPPER_KEY + and module_name.startswith(self.mm_mapping.language_model[0]) + ) + if is_language_model or module_name.startswith(prefix): return LoRATarget(wrapper=wrapper, prefix=prefix) + return None def _register_packed_modules(self, module_full_name: str) -> None: From d053aa73e1eae20342235eea9715ff0c380dc264 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 20 Dec 2025 01:47:11 +0000 Subject: [PATCH 38/53] Fix Signed-off-by: Jee Jee Li --- vllm/lora/model_manager.py | 112 ++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 65 deletions(-) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index f685db14af17c..05713bda91236 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ 
-2,9 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math -from collections import OrderedDict from collections.abc import Callable -from dataclasses import dataclass from typing import TypeVar import regex as re @@ -47,12 +45,6 @@ T = TypeVar("T") DEFAULT_LANGUAGE_WRAPPER_KEY = "language_model" -@dataclass(frozen=True) -class LoRATarget: - wrapper: PunicaWrapperBase - prefix: str - - class AdapterLRUCache(LRUCache[int, T]): def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]): super().__init__(capacity) @@ -120,21 +112,6 @@ class LoRAModelManager: def _init_punica_wrapper( self, max_num_batched_tokens: int, vllm_config: VllmConfig ) -> None: - self.punica_wrapper_mapping: OrderedDict[str, PunicaWrapperBase] = OrderedDict() - llm_punica_wrapper = get_punica_wrapper( - max_num_batched_tokens, - max_batches=self.max_num_seqs, - device=self.device, - max_loras=self.lora_config.max_loras, - ) - - # NOTE This assumes the existence of a language model LoRA - self.punica_wrapper_mapping.setdefault( - DEFAULT_LANGUAGE_WRAPPER_KEY, llm_punica_wrapper - ) - self._maybe_init_mm(vllm_config) - - def _maybe_init_mm(self, vllm_config: VllmConfig) -> None: # Used to indicate whether the model is a multimodal model self.supports_mm: bool = ( supports_multimodal(self.model) @@ -142,13 +119,39 @@ class LoRAModelManager: # text modules (e.g. 
ChatGLM) and hasattr(self.model, "get_mm_mapping") ) - if not self.supports_mm: - return + self.punica_wrapper_mapping: dict[str, PunicaWrapperBase] = {} + if self.supports_mm: + self._maybe_init_mm(vllm_config,max_num_batched_tokens) + else: + llm_punica_wrapper = get_punica_wrapper( + max_num_batched_tokens, + max_batches=self.max_num_seqs, + device=self.device, + max_loras=self.lora_config.max_loras, + ) + self.punica_wrapper_mapping[DEFAULT_LANGUAGE_WRAPPER_KEY] = ( + llm_punica_wrapper + ) + + def _maybe_init_mm(self, vllm_config: VllmConfig, max_num_batched_tokens) -> None: self.supports_tower_connector_lora = False model_config: ModelConfig = vllm_config.model_config self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping() + # Only one language model can be included in the model. + assert len(self.mm_mapping.language_model) == 1 + + # Language model punica wrapper + llm_punica_wrapper = get_punica_wrapper( + max_num_batched_tokens, + max_batches=self.max_num_seqs, + device=self.device, + max_loras=self.lora_config.max_loras, + ) + lm_prefix = self.mm_mapping.language_model[0] + self.punica_wrapper_mapping[lm_prefix] = llm_punica_wrapper + if self.lora_config.enable_tower_connector_lora: self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info self.supports_tower_connector_lora = self.supports_mm and hasattr( @@ -156,6 +159,7 @@ class LoRAModelManager: ) if not self.supports_tower_connector_lora: return + logger.warning( "LoRA for the tower and connector of multimodal models is " "experimental and may contain bugs. Please report any related issues on " @@ -172,20 +176,6 @@ class LoRAModelManager: mm_budget.get_encoder_budget() ) - # Only one language model can be included in the model. 
- assert len(self.mm_mapping.language_model) == 1 - - # Update prefix of language model - lm_prefix = ( - self.mm_mapping.language_model[0] - if self.supports_mm - else DEFAULT_LANGUAGE_WRAPPER_KEY - ) - llm_punica_wrapper = self.punica_wrapper_mapping.pop( - DEFAULT_LANGUAGE_WRAPPER_KEY - ) - self.punica_wrapper_mapping[lm_prefix] = llm_punica_wrapper - # Tower wrappers tower_punica_wrapper = get_punica_wrapper( num_encoder_tokens, @@ -217,15 +207,6 @@ class LoRAModelManager: "determine the connector's token budget for LoRA operations." ) - # Longest-prefix-first - self.punica_wrapper_mapping = OrderedDict( - sorted( - self.punica_wrapper_mapping.items(), - key=lambda x: len(x[0]), - reverse=True, - ) - ) - def __len__(self) -> int: return len(self._registered_adapters) @@ -366,10 +347,10 @@ class LoRAModelManager: else: target_prefix = self.mm_mapping.language_model[0] - target = self._get_lora_target(target_prefix) - assert target is not None + punica_wrapper = self._get_punica_wrapper(target_prefix) + assert punica_wrapper is not None - target.wrapper.update_metadata( + punica_wrapper.wrapper.update_metadata( mapping, self.lora_index_to_id, self.lora_slots + 1, @@ -397,8 +378,8 @@ class LoRAModelManager: if not self._match_target_modules(module_name): continue - target = self._get_lora_target(module_name) - if target is None: + punica_wrapper = self._get_punica_wrapper(module_name) + if punica_wrapper is None: logger.warning( "Regarding %s, vLLM currently only supports adding LoRA to" " language model, %s will be ignored.", @@ -464,7 +445,7 @@ class LoRAModelManager: self._register_packed_modules(module_name) # All lora layers share the same punica_wrapper based on reference. 
- new_module.set_mapping(target.wrapper) + new_module.set_mapping(punica_wrapper) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA), ( @@ -485,7 +466,7 @@ class LoRAModelManager: if ( not self._match_target_modules(module_name) or not isinstance(module, BaseLayerWithLoRA) - or self._get_lora_target(module_name) is None + or self._get_punica_wrapper(module_name) is None ): continue parts = module_name.split(".") @@ -574,23 +555,24 @@ class LoRAModelManager: for target_module in self.supported_lora_modules ) - def _get_lora_target(self, module_name: str) -> LoRATarget | None: + def _get_punica_wrapper(self, module_name: str) -> PunicaWrapperBase | None: """ Determine whether this module supports LoRA and which wrapper to use. """ # For language Model (early return) if not self.supports_mm: - wrapper = list(self.punica_wrapper_mapping.values())[0] - return LoRATarget(wrapper=wrapper, prefix=DEFAULT_LANGUAGE_WRAPPER_KEY) + return self.punica_wrapper_mapping[DEFAULT_LANGUAGE_WRAPPER_KEY] + + # For multimodal model - for prefix, wrapper in self.punica_wrapper_mapping.items(): - is_language_model = ( - prefix == DEFAULT_LANGUAGE_WRAPPER_KEY - and module_name.startswith(self.mm_mapping.language_model[0]) - ) - if is_language_model or module_name.startswith(prefix): - return LoRATarget(wrapper=wrapper, prefix=prefix) + # for prefix, wrapper in self.punica_wrapper_mapping.items(): + # is_language_model = ( + # prefix == DEFAULT_LANGUAGE_WRAPPER_KEY + # and module_name.startswith(self.mm_mapping.language_model[0]) + # ) + # if is_language_model or module_name.startswith(prefix): + # return LoRATarget(wrapper=wrapper, prefix=prefix) return None From 9c9950c08037e14df2316b2aaa844f0f52f13c6c Mon Sep 17 00:00:00 2001 From: bk-201 Date: Sat, 20 Dec 2025 04:05:59 +0000 Subject: [PATCH 39/53] fix Signed-off-by: bk-201 --- vllm/lora/model_manager.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 
deletions(-) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 05713bda91236..b7f88ed69e152 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -121,7 +121,7 @@ class LoRAModelManager: ) self.punica_wrapper_mapping: dict[str, PunicaWrapperBase] = {} if self.supports_mm: - self._maybe_init_mm(vllm_config,max_num_batched_tokens) + self._maybe_init_mm(vllm_config, max_num_batched_tokens) else: llm_punica_wrapper = get_punica_wrapper( max_num_batched_tokens, @@ -350,7 +350,7 @@ class LoRAModelManager: punica_wrapper = self._get_punica_wrapper(target_prefix) assert punica_wrapper is not None - punica_wrapper.wrapper.update_metadata( + punica_wrapper.update_metadata( mapping, self.lora_index_to_id, self.lora_slots + 1, @@ -563,16 +563,12 @@ class LoRAModelManager: if not self.supports_mm: return self.punica_wrapper_mapping[DEFAULT_LANGUAGE_WRAPPER_KEY] - - # For multimodal model - # for prefix, wrapper in self.punica_wrapper_mapping.items(): - # is_language_model = ( - # prefix == DEFAULT_LANGUAGE_WRAPPER_KEY - # and module_name.startswith(self.mm_mapping.language_model[0]) - # ) - # if is_language_model or module_name.startswith(prefix): - # return LoRATarget(wrapper=wrapper, prefix=prefix) + # NOTE Sort by prefix length (descending) to match the longest prefix first + # e.g., 'visual.merger' should match 'visual.merger' instead of 'visual.' 
+ for prefix in sorted(self.punica_wrapper_mapping.keys(), key=len, reverse=True): + if module_name.startswith(prefix): + return self.punica_wrapper_mapping[prefix] return None From 4c2e95ad565e95c46a9e1117de21571caa49d7ff Mon Sep 17 00:00:00 2001 From: bk-201 Date: Sat, 20 Dec 2025 06:23:33 +0000 Subject: [PATCH 40/53] correct f-string formatting Signed-off-by: bk-201 --- vllm/lora/model_manager.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index b7f88ed69e152..d9cdabd086d6f 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -82,8 +82,9 @@ class LoRAModelManager: """ self.model: SupportsLoRA = model self.supported_lora_modules = get_supported_lora_modules(self.model) - assert self.supported_lora_modules, "No supported LoRA modules found in" - f" {self.model.__class__.__name__}." + assert self.supported_lora_modules, ( + f"No supported LoRA modules found in {self.model.__class__.__name__}." + ) self._registered_adapters: dict[int, LoRAModel] = {} # Dict instead of a set for compatibility with LRUCache. @@ -559,7 +560,7 @@ class LoRAModelManager: """ Determine whether this module supports LoRA and which wrapper to use. 
""" - # For language Model (early return) + # For language model (early return) if not self.supports_mm: return self.punica_wrapper_mapping[DEFAULT_LANGUAGE_WRAPPER_KEY] From b03d1a04a867c5392afae8e374951ef079f5f6ba Mon Sep 17 00:00:00 2001 From: Anexdeus <5142168@mail.ru> Date: Sat, 20 Dec 2025 12:29:46 +0300 Subject: [PATCH 41/53] added ProcessingInfoMixin for QwenVL series models --- vllm/model_executor/models/qwen2_vl.py | 67 +++++++++++--------------- vllm/multimodal/processing.py | 52 -------------------- 2 files changed, 28 insertions(+), 91 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 44f076e8d70f3..d530cf629f4ad 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -820,7 +820,34 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser): return super()._parse_video_data(data) -class Qwen2VLProcessingInfo(BaseProcessingInfo): +class QwenVLSeriesProcessingInfoMixin: + """ + Mixin that provides get_num_mm_encoder_tokens() + and get_num_mm_connector_tokens() methods for + QwenVL series models without affecting other multi-modal models. 
+ """ + + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + + return num_image_tokens * merge_size**2 + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + return num_vision_tokens // merge_size**2 + + +class Qwen2VLProcessingInfo(QwenVLSeriesProcessingInfoMixin, BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2VLConfig) @@ -1017,25 +1044,6 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): image_processor=None, ) - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - - return num_image_tokens * merge_size**2 - - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - return num_vision_tokens // merge_size**2 - class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: @@ -1132,25 +1140,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]) self.info.get_hf_config().vision_config.spatial_merge_size )(hf_inputs) - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - hf_config = self.info.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - - return num_image_tokens * merge_size**2 - - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - hf_config = self.info.get_hf_config() - vision_config = hf_config.vision_config - 
merge_size = vision_config.spatial_merge_size - return num_vision_tokens // merge_size**2 - @MULTIMODAL_REGISTRY.register_processor( Qwen2VLMultiModalProcessor, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 99ecaf61badd2..0390773783961 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1185,32 +1185,6 @@ class BaseProcessingInfo: """ return self.ctx.get_hf_processor(**kwargs) - @abstractmethod - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - """ - Implement this function to enable LoRA support - for the tower module of the multi-modal model - - Given the number of image tokens, output the number of multi-modal encoder tokens - """ - raise NotImplementedError - - @abstractmethod - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - """ - Implement this function to enable LoRA support - for the connector module of the multi-modal model - - Given the number of vision tokens, output the number of multi-modal connector tokens - """ - raise NotImplementedError - @abstractmethod def get_supported_mm_limits(self) -> Mapping[str, int | None]: """ @@ -1415,32 +1389,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """Given the HF-processed data, output the metadata of each field.""" raise NotImplementedError - @abstractmethod - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - """ - Implement this function to enable LoRA support - for the tower module of the multi-modal model - - Given the number of image tokens, output the number of multi-modal encoder tokens - """ - raise NotImplementedError - - @abstractmethod - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - """ - Implement this function to enable LoRA support - for the connector module of the multi-modal model - - Given the number of vision tokens, output the number of multi-modal connector tokens - """ - raise NotImplementedError 
- @abstractmethod def _get_prompt_updates( self, From d525556a250a795cd8f5ec179b6853e64c842311 Mon Sep 17 00:00:00 2001 From: Anexdeus <5142168@mail.ru> Date: Sat, 20 Dec 2025 13:31:53 +0300 Subject: [PATCH 42/53] Revert the mixin changes --- vllm/model_executor/models/qwen2_vl.py | 48 +++++++++++--------------- vllm/multimodal/processing.py | 22 ++++++++++++ 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d530cf629f4ad..13014bebb1054 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -820,34 +820,7 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser): return super()._parse_video_data(data) -class QwenVLSeriesProcessingInfoMixin: - """ - Mixin that provides get_num_mm_encoder_tokens() - and get_num_mm_connector_tokens() methods for - QwenVL series models without affecting other multi-modal models. - """ - - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - - return num_image_tokens * merge_size**2 - - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - return num_vision_tokens // merge_size**2 - - -class Qwen2VLProcessingInfo(QwenVLSeriesProcessingInfoMixin, BaseProcessingInfo): +class Qwen2VLProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2VLConfig) @@ -1131,6 +1104,25 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]) for modality in ("image", "video") ] + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = 
vision_config.spatial_merge_size + + return num_image_tokens * merge_size**2 + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + return num_vision_tokens // merge_size**2 + def _get_mm_fields_config( self, hf_inputs: BatchFeature, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 0390773783961..056eee502448c 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1412,6 +1412,28 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ raise NotImplementedError + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + """ + Implement this function to enable LoRA support + for the tower module of the multi-modal model + Given the number of image tokens, output the number of multi-modal encoder tokens + """ + raise NotImplementedError + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + """ + Implement this function to enable LoRA support + for the connector module of the multi-modal model + Given the number of vision tokens, output the number of multi-modal connector tokens + """ + raise NotImplementedError + def _bind_and_group_updates( self, prompt_updates: Sequence[PromptUpdate], From c6831e793d13fd390092049c009aa2e3e3bcd6c0 Mon Sep 17 00:00:00 2001 From: Anexdeus <5142168@mail.ru> Date: Sat, 20 Dec 2025 17:22:41 +0300 Subject: [PATCH 43/53] extended SupportsMultiModal --- vllm/lora/model_manager.py | 11 +++-- vllm/model_executor/models/interfaces.py | 16 +++++++ vllm/model_executor/models/qwen2_5_vl.py | 36 ++++++++++++++++ vllm/model_executor/models/qwen2_vl.py | 55 ++++++++++++++++-------- vllm/model_executor/models/qwen3_vl.py | 36 ++++++++++++++++ vllm/multimodal/processing.py | 22 ---------- vllm/v1/worker/gpu_model_runner.py | 12 +++--- 7 files changed, 135 insertions(+), 53 deletions(-) diff 
--git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index d9cdabd086d6f..135c5593ed698 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -154,9 +154,8 @@ class LoRAModelManager: self.punica_wrapper_mapping[lm_prefix] = llm_punica_wrapper if self.lora_config.enable_tower_connector_lora: - self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info self.supports_tower_connector_lora = self.supports_mm and hasattr( - self.info, "get_num_mm_encoder_tokens" + self.model, "get_num_mm_encoder_tokens" ) if not self.supports_tower_connector_lora: return @@ -172,8 +171,8 @@ class LoRAModelManager: vllm_config.scheduler_config, MULTIMODAL_REGISTRY, ) - limit_per_prompt: int = max(self.info.get_allowed_mm_limits().values()) - num_encoder_tokens = self.info.get_num_mm_encoder_tokens( + limit_per_prompt: int = max(self.model.get_allowed_mm_limits().values()) + num_encoder_tokens = self.model.get_num_mm_encoder_tokens( mm_budget.get_encoder_budget() ) @@ -189,8 +188,8 @@ class LoRAModelManager: # Use wrapper for connector if present. if self.mm_mapping.connector: - if hasattr(self.info, "get_num_mm_connector_tokens"): - connector_tokens = self.info.get_num_mm_connector_tokens( + if hasattr(self.model, "get_num_mm_connector_tokens"): + connector_tokens = self.model.get_num_mm_connector_tokens( num_encoder_tokens ) connector_punica_wrapper = get_punica_wrapper( diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index cb99d57e8b8c7..ae119969b5846 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -141,6 +141,22 @@ class SupportsMultiModal(Protocol): """ ... + def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int: + """ + Implement this function to enable LoRA support + for the tower module of the multi-modal model + Given the number of image tokens, output the number of multi-modal encoder tokens + """ + ... 
+ + def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int: + """ + Implement this function to enable LoRA support + for the connector module of the multi-modal model + Given the number of vision tokens, output the number of multi-modal connector tokens + """ + ... + @overload def embed_input_ids(self, input_ids: Tensor) -> Tensor: ... diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 02fac0b78a4b4..9d42ace2c8e8e 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1568,3 +1568,39 @@ class Qwen2_5_VLForConditionalGeneration( connector="visual.merger.", tower_model="visual.", ) + + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + + return num_image_tokens * merge_size**2 + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + return num_vision_tokens // merge_size**2 + + def get_allowed_mm_limits(self) -> Mapping[str, int]: + """Return the maximum allowed number of items for each modality.""" + supported_mm_limits = self.get_supported_mm_limits() + mm_config = self.ctx.get_mm_config() + + allowed_limits = dict[str, int]() + for modality, supported_limit in supported_mm_limits.items(): + user_limit = mm_config.get_limit_per_prompt(modality) + + allowed_limits[modality] = ( + user_limit + if supported_limit is None + else min(user_limit, supported_limit) + ) + + return allowed_limits diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 13014bebb1054..a13859a2a71c3 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1104,25 +1104,6 @@ class 
Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]) for modality in ("image", "video") ] - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - - return num_image_tokens * merge_size**2 - - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - merge_size = vision_config.spatial_merge_size - return num_vision_tokens // merge_size**2 - def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -1510,6 +1491,42 @@ class Qwen2VLForConditionalGeneration( tower_model="visual.", ) + def get_allowed_mm_limits(self) -> Mapping[str, int]: + """Return the maximum allowed number of items for each modality.""" + supported_mm_limits = self.get_supported_mm_limits() + mm_config = self.ctx.get_mm_config() + + allowed_limits = dict[str, int]() + for modality, supported_limit in supported_mm_limits.items(): + user_limit = mm_config.get_limit_per_prompt(modality) + + allowed_limits[modality] = ( + user_limit + if supported_limit is None + else min(user_limit, supported_limit) + ) + + return allowed_limits + + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + + return num_image_tokens * merge_size**2 + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + return num_vision_tokens // merge_size**2 + class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor): pass diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 80e951257e536..18c0fd68afdc4 100644 --- 
a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -2091,3 +2091,39 @@ class Qwen3VLForConditionalGeneration( connector=["visual.merger", "visual.deepstack_merger_list"], tower_model="visual.", ) + + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + + return num_image_tokens * merge_size**2 + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + merge_size = vision_config.spatial_merge_size + return num_vision_tokens // merge_size**2 + + def get_allowed_mm_limits(self) -> Mapping[str, int]: + """Return the maximum allowed number of items for each modality.""" + supported_mm_limits = self.get_supported_mm_limits() + mm_config = self.ctx.get_mm_config() + + allowed_limits = dict[str, int]() + for modality, supported_limit in supported_mm_limits.items(): + user_limit = mm_config.get_limit_per_prompt(modality) + + allowed_limits[modality] = ( + user_limit + if supported_limit is None + else min(user_limit, supported_limit) + ) + + return allowed_limits diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 51143822fe0bb..3bbdab3b393c5 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1420,28 +1420,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ raise NotImplementedError - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - """ - Implement this function to enable LoRA support - for the tower module of the multi-modal model - Given the number of image tokens, output the number of multi-modal encoder tokens - """ - raise NotImplementedError - - def get_num_mm_connector_tokens( - self, - num_vision_tokens: int, - ) -> int: - """ - Implement this function to enable LoRA support - for the 
connector module of the multi-modal model - Given the number of vision tokens, output the number of multi-modal connector tokens - """ - raise NotImplementedError - def _bind_and_group_updates( self, prompt_updates: Sequence[PromptUpdate], diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f4ed37a7b6771..a3e64e89ff60c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -593,9 +593,9 @@ class GPUModelRunner( # Multimodal LoRA support self.enable_tower_connector_lora = False if self.supports_mm_inputs and self.lora_config: - self.info = self.mm_registry.create_processor(self.model_config).info + self.mm_model_cls = self.mm_registry._get_model_cls(model_config) self.enable_tower_connector_lora = ( - hasattr(self.info, "get_num_mm_encoder_tokens") + hasattr(self.mm_model_cls, "get_num_mm_encoder_tokens") and self.lora_config.enable_tower_connector_lora ) @@ -2183,7 +2183,7 @@ class GPUModelRunner( # Prefer pos_info.is_embed to count actual MM embedding tokens. # pos_info.length may overcount (e.g., special tokens in Qwen-VL). # Fall back to length if is_embed is None. 
- num_tokens = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] + num_tokens = model.get_num_mm_encoder_tokens( # type: ignore[attr-defined] pos_info.get_num_embeds ) prompt_lora_mapping.append(lora_id) @@ -2202,13 +2202,13 @@ class GPUModelRunner( ) self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - if hasattr(self.info, "get_num_mm_connector_tokens"): + if hasattr(model, "get_num_mm_connector_tokens"): num_post_op_tokens = [] for _, pos_info in mm_hashes_pos: - mm_token_count = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] + mm_token_count = model.get_num_mm_encoder_tokens( # type: ignore[attr-defined] pos_info.length ) - post_op_count = self.info.get_num_mm_connector_tokens( # type: ignore[attr-defined] + post_op_count = model.get_num_mm_connector_tokens( # type: ignore[attr-defined] mm_token_count ) num_post_op_tokens.append(post_op_count) From 68116edfe2a3581534ed18033b27ca887f5873b0 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Sat, 20 Dec 2025 16:20:12 +0000 Subject: [PATCH 44/53] fix bug Signed-off-by: bk-201 --- vllm/inputs/preprocess.py | 12 ++++++++++++ vllm/lora/model_manager.py | 16 ++++++++++------ vllm/lora/worker_manager.py | 6 ++++++ vllm/multimodal/processing.py | 11 +++++++++++ vllm/v1/engine/input_processor.py | 13 +++++++++++++ vllm/v1/worker/gpu_model_runner.py | 22 ++++++++-------------- 6 files changed, 60 insertions(+), 20 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 0372b06d0017f..8707fc310033c 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -246,6 +246,7 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, + lora_kwargs: dict[str, Any] | None = None, ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, @@ -262,6 +263,7 @@ class InputPreprocessor: hf_processor_mm_kwargs=mm_processor_kwargs, 
tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, + lora_kwargs=lora_kwargs, ) mm_hashes = mm_input["mm_hashes"] @@ -359,6 +361,7 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, + lora_kwargs: dict[str, Any] | None = None, ) -> TokenInputs | MultiModalInputs: prompt_text = parsed_content["prompt"] @@ -370,6 +373,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs") or {}, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, + lora_kwargs=lora_kwargs, ) else: prompt_token_ids = self._tokenize_prompt( @@ -389,6 +393,7 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, + lora_kwargs: dict[str, Any] | None = None, ) -> SingletonInputs: """ Extract the singleton inputs from a prompt. @@ -415,6 +420,7 @@ class InputPreprocessor: parsed["content"], tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, + lora_kwargs=lora_kwargs, ) if parsed["type"] == "str": return self._process_text( @@ -626,6 +632,7 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, + lora_kwargs: dict[str, Any] | None = None, ) -> DecoderOnlyInputs: """ For decoder-only models: @@ -645,6 +652,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, + lora_kwargs=lora_kwargs, ) return self._build_decoder_only_llm_inputs(prompt_comps) @@ -655,6 +663,7 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, + lora_kwargs: dict[str, Any] | None = None, ) -> ProcessorInputs: if self.model_config.is_encoder_decoder: # Encoder-decoder model requires special mapping of @@ -676,6 +685,7 @@ class InputPreprocessor: cast(SingletonPrompt, prompt), tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, + lora_kwargs=lora_kwargs, ) def 
preprocess( @@ -684,12 +694,14 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, + lora_kwargs: dict[str, Any] | None = None, ) -> ProcessorInputs: """Preprocess the input prompt.""" res = self._preprocess( prompt, tokenization_kwargs, mm_uuids=mm_uuids, + lora_kwargs=lora_kwargs, ) if self.mm_processor_cache and self.mm_cache_stats is not None: diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index d9cdabd086d6f..e2f65c2b2ce1b 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -154,9 +154,11 @@ class LoRAModelManager: self.punica_wrapper_mapping[lm_prefix] = llm_punica_wrapper if self.lora_config.enable_tower_connector_lora: - self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info + self.mm_processor_info = MULTIMODAL_REGISTRY.create_processor( + model_config + ).info self.supports_tower_connector_lora = self.supports_mm and hasattr( - self.info, "get_num_mm_encoder_tokens" + self.mm_processor_info, "get_num_mm_encoder_tokens" ) if not self.supports_tower_connector_lora: return @@ -172,8 +174,10 @@ class LoRAModelManager: vllm_config.scheduler_config, MULTIMODAL_REGISTRY, ) - limit_per_prompt: int = max(self.info.get_allowed_mm_limits().values()) - num_encoder_tokens = self.info.get_num_mm_encoder_tokens( + limit_per_prompt: int = max( + self.mm_processor_info.get_allowed_mm_limits().values() + ) + num_encoder_tokens = self.mm_processor_info.get_num_mm_encoder_tokens( mm_budget.get_encoder_budget() ) @@ -189,8 +193,8 @@ class LoRAModelManager: # Use wrapper for connector if present. 
if self.mm_mapping.connector: - if hasattr(self.info, "get_num_mm_connector_tokens"): - connector_tokens = self.info.get_num_mm_connector_tokens( + if hasattr(self.mm_processor_info, "get_num_mm_connector_tokens"): + connector_tokens = self.mm_processor_info.get_num_mm_connector_tokens( num_encoder_tokens ) connector_punica_wrapper = get_punica_wrapper( diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 967ce458995c6..277e462a39e00 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -163,6 +163,12 @@ class WorkerLoRAManager: if mapping is not None: self._adapter_manager.set_adapter_mapping(mapping) + def supports_tower_connector_lora(self) -> bool: + return ( + self._adapter_manager.supports_mm + and self._adapter_manager.supports_tower_connector_lora + ) + def _apply_adapters(self, adapter_requests: set[Any]) -> None: existing_adapters = self.list_adapters() models_map = { diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 3bbdab3b393c5..39e476156542c 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1672,6 +1672,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): tokenization_kwargs: Mapping[str, object], *, mm_uuids: MultiModalUUIDDict | None = None, + lora_kwargs: dict[str, Any] | None = None, ) -> MultiModalHashes: """Create MM hashes to be returned. @@ -1683,6 +1684,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hashes: MultiModalHashes = {} mm_uuids = mm_uuids or {} + lora_kwargs = lora_kwargs or {} for modality, items in mm_items.items(): if modality in mm_uuids: @@ -1703,6 +1705,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): item_uuid is None or hf_processor_mm_kwargs or tokenization_kwargs + or lora_kwargs ): # NOTE: use provided hash string to hash with kwargs # if available for better performance. 
@@ -1713,6 +1716,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): **{modality: item}, **hf_processor_mm_kwargs, **tokenization_kwargs, + **lora_kwargs, ) ) else: @@ -1725,6 +1729,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): **{modality: item}, **hf_processor_mm_kwargs, **tokenization_kwargs, + **lora_kwargs, ) for item in items ] @@ -1883,6 +1888,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): tokenization_kwargs: Mapping[str, object], *, mm_uuids: MultiModalUUIDDict | None = None, + lora_kwargs: dict[str, Any] | None = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1905,6 +1911,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids, + lora_kwargs=lora_kwargs, ) mm_is_cached, mm_missing_data_items = self._get_cache_missing_items( @@ -2115,6 +2122,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): tokenization_kwargs: Mapping[str, object] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, + lora_kwargs: dict[str, Any] | None = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -2144,6 +2152,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, + lora_kwargs=lora_kwargs, ) # NOTE: tokenization_kwargs are not required to init processor @@ -2224,6 +2233,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): tokenization_kwargs: Mapping[str, object] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, + lora_kwargs: dict[str, Any] | None = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. 
@@ -2239,6 +2249,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids, + lora_kwargs=lora_kwargs, ) return self._get_enc_dec_inputs( diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 29293877cb69d..acae5c5108afa 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -5,6 +5,8 @@ import time from collections.abc import Mapping from typing import Any, Literal, cast +import msgspec + from vllm.config import VllmConfig from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs from vllm.inputs.parse import split_enc_dec_inputs @@ -458,6 +460,16 @@ class InputProcessor: else: mm_uuids = None + # When enable_tower_connector_lora is True, multi-modal embeddings + # vary depending on the LoRA request. Therefore, the mm_hash must be + # generated based on the LoRA request to prevent incorrect cache hits. + lora_kwargs = ( + msgspec.structs.asdict(lora_request) + if lora_request and self.lora_config.enable_tower_connector_lora + else {} + ) + lora_kwargs = {k: v for k, v in lora_kwargs.items() if v is not None} + # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. # 2. 
For multimodal models with a merged preprocessor, preprocess @@ -466,6 +478,7 @@ class InputProcessor: prompt, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, + lora_kwargs=lora_kwargs, ) from vllm.platforms import current_platform diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f4ed37a7b6771..d2885deece9c6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -590,15 +590,6 @@ class GPUModelRunner( pin_memory=self.pin_memory, ) - # Multimodal LoRA support - self.enable_tower_connector_lora = False - if self.supports_mm_inputs and self.lora_config: - self.info = self.mm_registry.create_processor(self.model_config).info - self.enable_tower_connector_lora = ( - hasattr(self.info, "get_num_mm_encoder_tokens") - and self.lora_config.enable_tower_connector_lora - ) - # Pre-allocated tensor for copying valid sampled token counts to CPU, # with dedicated stream for overlapping and event for coordination. self.valid_sampled_token_count_event: torch.Event | None = None @@ -2169,12 +2160,15 @@ class GPUModelRunner( # encoder outputs. model = cast(SupportsMultiModal, self.model) - if self.enable_tower_connector_lora: + if self.lora_manager.supports_tower_connector_lora(): # Build LoRA mappings independently for encoder inputs # (encoder batch structure is different from main batch) prompt_lora_mapping = [] token_lora_mapping = [] lora_requests = set() + # This implementation is a bit hacky, but it's mainly to retrieve + # the get_num_mm_*_tokens helper functions from ProcessingInfo. + mm_processor_info = self.lora_manager._adapter_manager.mm_processor_info for req_id, (_, pos_info) in zip(encoder_req_ids, mm_hashes_pos): req_idx = self.input_batch.req_id_to_index[req_id] @@ -2183,7 +2177,7 @@ class GPUModelRunner( # Prefer pos_info.is_embed to count actual MM embedding tokens. # pos_info.length may overcount (e.g., special tokens in Qwen-VL). # Fall back to length if is_embed is None. 
- num_tokens = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] + num_tokens = mm_processor_info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] pos_info.get_num_embeds ) prompt_lora_mapping.append(lora_id) @@ -2202,13 +2196,13 @@ class GPUModelRunner( ) self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - if hasattr(self.info, "get_num_mm_connector_tokens"): + if hasattr(mm_processor_info, "get_num_mm_connector_tokens"): num_post_op_tokens = [] for _, pos_info in mm_hashes_pos: - mm_token_count = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] + mm_token_count = mm_processor_info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] pos_info.length ) - post_op_count = self.info.get_num_mm_connector_tokens( # type: ignore[attr-defined] + post_op_count = mm_processor_info.get_num_mm_connector_tokens( # type: ignore[attr-defined] mm_token_count ) num_post_op_tokens.append(post_op_count) From cb72a0ef0135060c4bd3c9187c7532d710786c0a Mon Sep 17 00:00:00 2001 From: bk-201 Date: Sat, 20 Dec 2025 16:36:13 +0000 Subject: [PATCH 45/53] fix pre-commit Signed-off-by: bk-201 --- vllm/v1/engine/input_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index acae5c5108afa..01d0b7f50f45e 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -463,9 +463,10 @@ class InputProcessor: # When enable_tower_connector_lora is True, multi-modal embeddings # vary depending on the LoRA request. Therefore, the mm_hash must be # generated based on the LoRA request to prevent incorrect cache hits. 
+ lora_config = self.lora_config lora_kwargs = ( msgspec.structs.asdict(lora_request) - if lora_request and self.lora_config.enable_tower_connector_lora + if lora_request and lora_config and lora_config.enable_tower_connector_lora else {} ) lora_kwargs = {k: v for k, v in lora_kwargs.items() if v is not None} From 86c6c5cf005d9a1a2c02b93b562d32b20baa27d5 Mon Sep 17 00:00:00 2001 From: Anexdeus <5142168@mail.ru> Date: Sat, 20 Dec 2025 21:56:07 +0300 Subject: [PATCH 46/53] removed get_allowed_mm_limits() from models --- vllm/model_executor/models/qwen2_5_vl.py | 17 ----------------- vllm/model_executor/models/qwen2_vl.py | 17 ----------------- vllm/model_executor/models/qwen3_vl.py | 17 ----------------- 3 files changed, 51 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 9d42ace2c8e8e..998cefd33e801 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1587,20 +1587,3 @@ class Qwen2_5_VLForConditionalGeneration( vision_config = hf_config.vision_config merge_size = vision_config.spatial_merge_size return num_vision_tokens // merge_size**2 - - def get_allowed_mm_limits(self) -> Mapping[str, int]: - """Return the maximum allowed number of items for each modality.""" - supported_mm_limits = self.get_supported_mm_limits() - mm_config = self.ctx.get_mm_config() - - allowed_limits = dict[str, int]() - for modality, supported_limit in supported_mm_limits.items(): - user_limit = mm_config.get_limit_per_prompt(modality) - - allowed_limits[modality] = ( - user_limit - if supported_limit is None - else min(user_limit, supported_limit) - ) - - return allowed_limits diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a13859a2a71c3..cd9ddaa532490 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1491,23 +1491,6 @@ class Qwen2VLForConditionalGeneration( 
tower_model="visual.", ) - def get_allowed_mm_limits(self) -> Mapping[str, int]: - """Return the maximum allowed number of items for each modality.""" - supported_mm_limits = self.get_supported_mm_limits() - mm_config = self.ctx.get_mm_config() - - allowed_limits = dict[str, int]() - for modality, supported_limit in supported_mm_limits.items(): - user_limit = mm_config.get_limit_per_prompt(modality) - - allowed_limits[modality] = ( - user_limit - if supported_limit is None - else min(user_limit, supported_limit) - ) - - return allowed_limits - def get_num_mm_encoder_tokens( self, num_image_tokens: int, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 18c0fd68afdc4..be0e5f8759d17 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -2110,20 +2110,3 @@ class Qwen3VLForConditionalGeneration( vision_config = hf_config.vision_config merge_size = vision_config.spatial_merge_size return num_vision_tokens // merge_size**2 - - def get_allowed_mm_limits(self) -> Mapping[str, int]: - """Return the maximum allowed number of items for each modality.""" - supported_mm_limits = self.get_supported_mm_limits() - mm_config = self.ctx.get_mm_config() - - allowed_limits = dict[str, int]() - for modality, supported_limit in supported_mm_limits.items(): - user_limit = mm_config.get_limit_per_prompt(modality) - - allowed_limits[modality] = ( - user_limit - if supported_limit is None - else min(user_limit, supported_limit) - ) - - return allowed_limits From 20402090b8501942671461d0a2cb463527aea206 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Sun, 21 Dec 2025 03:34:32 +0000 Subject: [PATCH 47/53] move mm-token-functions to model Signed-off-by: bk-201 --- vllm/lora/model_manager.py | 8 ++++---- vllm/model_executor/models/interfaces.py | 12 +++++++----- vllm/model_executor/models/qwen2_5_vl.py | 6 +++--- vllm/model_executor/models/qwen2_vl.py | 4 ++-- vllm/model_executor/models/qwen3_vl.py | 4 
++-- vllm/v1/worker/gpu_model_runner.py | 13 +++++-------- 6 files changed, 23 insertions(+), 24 deletions(-) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index e2f65c2b2ce1b..4506b0a4461ec 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -158,7 +158,7 @@ class LoRAModelManager: model_config ).info self.supports_tower_connector_lora = self.supports_mm and hasattr( - self.mm_processor_info, "get_num_mm_encoder_tokens" + self.model, "get_num_mm_encoder_tokens" ) if not self.supports_tower_connector_lora: return @@ -177,7 +177,7 @@ class LoRAModelManager: limit_per_prompt: int = max( self.mm_processor_info.get_allowed_mm_limits().values() ) - num_encoder_tokens = self.mm_processor_info.get_num_mm_encoder_tokens( + num_encoder_tokens = self.model.get_num_mm_encoder_tokens( mm_budget.get_encoder_budget() ) @@ -193,8 +193,8 @@ class LoRAModelManager: # Use wrapper for connector if present. if self.mm_mapping.connector: - if hasattr(self.mm_processor_info, "get_num_mm_connector_tokens"): - connector_tokens = self.mm_processor_info.get_num_mm_connector_tokens( + if hasattr(self.model, "get_num_mm_connector_tokens"): + connector_tokens = self.model.get_num_mm_connector_tokens( num_encoder_tokens ) connector_punica_wrapper = get_punica_wrapper( diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index ae119969b5846..031a9cb40e3ff 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -143,17 +143,19 @@ class SupportsMultiModal(Protocol): def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int: """ - Implement this function to enable LoRA support - for the tower module of the multi-modal model - Given the number of image tokens, output the number of multi-modal encoder tokens + Implement this function to enable LoRA support + for the tower module of the multi-modal model. 
+ Given the number of image tokens, output the number of + multi-modal encoder tokens. """ ... def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int: """ Implement this function to enable LoRA support - for the connector module of the multi-modal model - Given the number of vision tokens, output the number of multi-modal connector tokens + for the connector module of the multi-modal model. + Given the number of vision tokens, output the number of + multi-modal connector tokens. """ ... diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 998cefd33e801..1c8024cf12725 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1568,12 +1568,12 @@ class Qwen2_5_VLForConditionalGeneration( connector="visual.merger.", tower_model="visual.", ) - + def get_num_mm_encoder_tokens( self, num_image_tokens: int, ) -> int: - hf_config = self.get_hf_config() + hf_config = self.config vision_config = hf_config.vision_config merge_size = vision_config.spatial_merge_size @@ -1583,7 +1583,7 @@ class Qwen2_5_VLForConditionalGeneration( self, num_vision_tokens: int, ) -> int: - hf_config = self.get_hf_config() + hf_config = self.config vision_config = hf_config.vision_config merge_size = vision_config.spatial_merge_size return num_vision_tokens // merge_size**2 diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index cd9ddaa532490..379e50742bb84 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1495,7 +1495,7 @@ class Qwen2VLForConditionalGeneration( self, num_image_tokens: int, ) -> int: - hf_config = self.get_hf_config() + hf_config = self.config vision_config = hf_config.vision_config merge_size = vision_config.spatial_merge_size @@ -1505,7 +1505,7 @@ class Qwen2VLForConditionalGeneration( self, num_vision_tokens: int, ) -> int: - hf_config = self.get_hf_config() + hf_config = 
self.config vision_config = hf_config.vision_config merge_size = vision_config.spatial_merge_size return num_vision_tokens // merge_size**2 diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index be0e5f8759d17..1daba20a95676 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -2096,7 +2096,7 @@ class Qwen3VLForConditionalGeneration( self, num_image_tokens: int, ) -> int: - hf_config = self.get_hf_config() + hf_config = self.config vision_config = hf_config.vision_config merge_size = vision_config.spatial_merge_size @@ -2106,7 +2106,7 @@ class Qwen3VLForConditionalGeneration( self, num_vision_tokens: int, ) -> int: - hf_config = self.get_hf_config() + hf_config = self.config vision_config = hf_config.vision_config merge_size = vision_config.spatial_merge_size return num_vision_tokens // merge_size**2 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d2885deece9c6..31acbe5e20538 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2160,15 +2160,12 @@ class GPUModelRunner( # encoder outputs. model = cast(SupportsMultiModal, self.model) - if self.lora_manager.supports_tower_connector_lora(): + if self.lora_config and self.lora_manager.supports_tower_connector_lora(): # Build LoRA mappings independently for encoder inputs # (encoder batch structure is different from main batch) prompt_lora_mapping = [] token_lora_mapping = [] lora_requests = set() - # This implementation is a bit hacky, but it's mainly to retrieve - # the get_num_mm_*_tokens helper functions from ProcessingInfo. - mm_processor_info = self.lora_manager._adapter_manager.mm_processor_info for req_id, (_, pos_info) in zip(encoder_req_ids, mm_hashes_pos): req_idx = self.input_batch.req_id_to_index[req_id] @@ -2177,7 +2174,7 @@ class GPUModelRunner( # Prefer pos_info.is_embed to count actual MM embedding tokens. 
# pos_info.length may overcount (e.g., special tokens in Qwen-VL). # Fall back to length if is_embed is None. - num_tokens = mm_processor_info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] + num_tokens = self.model.get_num_mm_encoder_tokens( # type: ignore[attr-defined] pos_info.get_num_embeds ) prompt_lora_mapping.append(lora_id) @@ -2196,13 +2193,13 @@ class GPUModelRunner( ) self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - if hasattr(mm_processor_info, "get_num_mm_connector_tokens"): + if hasattr(self.model, "get_num_mm_connector_tokens"): num_post_op_tokens = [] for _, pos_info in mm_hashes_pos: - mm_token_count = mm_processor_info.get_num_mm_encoder_tokens( # type: ignore[attr-defined] + mm_token_count = self.model.get_num_mm_encoder_tokens( # type: ignore[attr-defined] pos_info.length ) - post_op_count = mm_processor_info.get_num_mm_connector_tokens( # type: ignore[attr-defined] + post_op_count = self.model.get_num_mm_connector_tokens( # type: ignore[attr-defined] mm_token_count ) num_post_op_tokens.append(post_op_count) From 81b5ace12810dce1c61bd67636f38846fb9b9e90 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Sun, 21 Dec 2025 04:14:11 +0000 Subject: [PATCH 48/53] revert lora_kwargs change Signed-off-by: bk-201 --- vllm/inputs/preprocess.py | 12 ------------ vllm/multimodal/processing.py | 11 ----------- vllm/v1/engine/input_processor.py | 14 -------------- 3 files changed, 37 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 8707fc310033c..0372b06d0017f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -246,7 +246,6 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, - lora_kwargs: dict[str, Any] | None = None, ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, @@ -263,7 +262,6 @@ class InputPreprocessor: hf_processor_mm_kwargs=mm_processor_kwargs, 
tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, - lora_kwargs=lora_kwargs, ) mm_hashes = mm_input["mm_hashes"] @@ -361,7 +359,6 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, - lora_kwargs: dict[str, Any] | None = None, ) -> TokenInputs | MultiModalInputs: prompt_text = parsed_content["prompt"] @@ -373,7 +370,6 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs") or {}, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, - lora_kwargs=lora_kwargs, ) else: prompt_token_ids = self._tokenize_prompt( @@ -393,7 +389,6 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, - lora_kwargs: dict[str, Any] | None = None, ) -> SingletonInputs: """ Extract the singleton inputs from a prompt. @@ -420,7 +415,6 @@ class InputPreprocessor: parsed["content"], tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, - lora_kwargs=lora_kwargs, ) if parsed["type"] == "str": return self._process_text( @@ -632,7 +626,6 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, - lora_kwargs: dict[str, Any] | None = None, ) -> DecoderOnlyInputs: """ For decoder-only models: @@ -652,7 +645,6 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, - lora_kwargs=lora_kwargs, ) return self._build_decoder_only_llm_inputs(prompt_comps) @@ -663,7 +655,6 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, - lora_kwargs: dict[str, Any] | None = None, ) -> ProcessorInputs: if self.model_config.is_encoder_decoder: # Encoder-decoder model requires special mapping of @@ -685,7 +676,6 @@ class InputPreprocessor: cast(SingletonPrompt, prompt), tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, - lora_kwargs=lora_kwargs, ) def 
preprocess( @@ -694,14 +684,12 @@ class InputPreprocessor: tokenization_kwargs: dict[str, Any] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, - lora_kwargs: dict[str, Any] | None = None, ) -> ProcessorInputs: """Preprocess the input prompt.""" res = self._preprocess( prompt, tokenization_kwargs, mm_uuids=mm_uuids, - lora_kwargs=lora_kwargs, ) if self.mm_processor_cache and self.mm_cache_stats is not None: diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 39e476156542c..3bbdab3b393c5 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1672,7 +1672,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): tokenization_kwargs: Mapping[str, object], *, mm_uuids: MultiModalUUIDDict | None = None, - lora_kwargs: dict[str, Any] | None = None, ) -> MultiModalHashes: """Create MM hashes to be returned. @@ -1684,7 +1683,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hashes: MultiModalHashes = {} mm_uuids = mm_uuids or {} - lora_kwargs = lora_kwargs or {} for modality, items in mm_items.items(): if modality in mm_uuids: @@ -1705,7 +1703,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): item_uuid is None or hf_processor_mm_kwargs or tokenization_kwargs - or lora_kwargs ): # NOTE: use provided hash string to hash with kwargs # if available for better performance. 
@@ -1716,7 +1713,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): **{modality: item}, **hf_processor_mm_kwargs, **tokenization_kwargs, - **lora_kwargs, ) ) else: @@ -1729,7 +1725,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): **{modality: item}, **hf_processor_mm_kwargs, **tokenization_kwargs, - **lora_kwargs, ) for item in items ] @@ -1888,7 +1883,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): tokenization_kwargs: Mapping[str, object], *, mm_uuids: MultiModalUUIDDict | None = None, - lora_kwargs: dict[str, Any] | None = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1911,7 +1905,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids, - lora_kwargs=lora_kwargs, ) mm_is_cached, mm_missing_data_items = self._get_cache_missing_items( @@ -2122,7 +2115,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): tokenization_kwargs: Mapping[str, object] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, - lora_kwargs: dict[str, Any] | None = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -2152,7 +2144,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, - lora_kwargs=lora_kwargs, ) # NOTE: tokenization_kwargs are not required to init processor @@ -2233,7 +2224,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): tokenization_kwargs: Mapping[str, object] | None = None, *, mm_uuids: MultiModalUUIDDict | None = None, - lora_kwargs: dict[str, Any] | None = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. 
@@ -2249,7 +2239,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids, - lora_kwargs=lora_kwargs, ) return self._get_enc_dec_inputs( diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 01d0b7f50f45e..29293877cb69d 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -5,8 +5,6 @@ import time from collections.abc import Mapping from typing import Any, Literal, cast -import msgspec - from vllm.config import VllmConfig from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs from vllm.inputs.parse import split_enc_dec_inputs @@ -460,17 +458,6 @@ class InputProcessor: else: mm_uuids = None - # When enable_tower_connector_lora is True, multi-modal embeddings - # vary depending on the LoRA request. Therefore, the mm_hash must be - # generated based on the LoRA request to prevent incorrect cache hits. - lora_config = self.lora_config - lora_kwargs = ( - msgspec.structs.asdict(lora_request) - if lora_request and lora_config and lora_config.enable_tower_connector_lora - else {} - ) - lora_kwargs = {k: v for k, v in lora_kwargs.items() if v is not None} - # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. # 2. 
For multimodal models with a merged preprocessor, preprocess @@ -479,7 +466,6 @@ class InputProcessor: prompt, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, - lora_kwargs=lora_kwargs, ) from vllm.platforms import current_platform From fa6dd854216910b6ee81775c18093c629937a016 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Sun, 21 Dec 2025 04:25:59 +0000 Subject: [PATCH 49/53] fix Signed-off-by: bk-201 --- vllm/model_executor/models/idefics3.py | 27 +++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index dbdb2d96b7b24..459043e91da4a 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -291,15 +291,6 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): height=image_processor.size["longest_edge"], ) - def get_num_mm_encoder_tokens( - self, - num_image_tokens: int, - ) -> int: - hf_config = self.get_hf_config() - scale_factor = hf_config.scale_factor - - return num_image_tokens * scale_factor**2 - class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: @@ -723,3 +714,21 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLo connector="model.connector", tower_model="model.vision_model", ) + + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + hf_config = self.config + scale_factor = hf_config.scale_factor + + return num_image_tokens * scale_factor**2 + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + hf_config = self.config + scale_factor = hf_config.scale_factor + + return num_vision_tokens // scale_factor**2 From f3a55ff958eb850c60cdf2d7861db2b14b4e5694 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Mon, 22 Dec 2025 13:53:52 +0000 Subject: [PATCH 50/53] fix mm_hash Signed-off-by: bk-201 --- 
vllm/v1/engine/input_processor.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 29293877cb69d..3bccebe612571 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -458,6 +458,28 @@ class InputProcessor: else: mm_uuids = None + # When enable_tower_connector_lora is True, multi-modal embeddings + # vary depending on the LoRA request. Therefore, the mm_hash must be + # generated based on the LoRA request to prevent incorrect cache hits. + lora_config = self.lora_config + if ( + mm_uuids + and lora_request + and lora_config + and lora_config.enable_tower_connector_lora + ): + + def add_mm_lora_prefix(val): + if isinstance(val, list): + return [ + f"{lora_request.lora_name}:{v}" if v is not None else None + for v in val + ] + else: + return f"{lora_request.lora_name}:{val}" + + mm_uuids = {k: add_mm_lora_prefix(v) for k, v in mm_uuids.items()} + # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. # 2. 
For multimodal models with a merged preprocessor, preprocess From f114b4e14346c1466531f7928be718735dd45eb7 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Tue, 23 Dec 2025 01:47:18 +0000 Subject: [PATCH 51/53] disable mm cache when enable_tower_connector_lora Signed-off-by: bk-201 --- tests/lora/test_qwenvl.py | 126 ++++++++++++++++++------------ vllm/engine/arg_utils.py | 13 +++ vllm/v1/engine/input_processor.py | 41 +++++----- 3 files changed, 106 insertions(+), 74 deletions(-) diff --git a/tests/lora/test_qwenvl.py b/tests/lora/test_qwenvl.py index 4b3b92447789f..5f84bfee2c1e2 100644 --- a/tests/lora/test_qwenvl.py +++ b/tests/lora/test_qwenvl.py @@ -15,10 +15,11 @@ class TestConfig: max_num_seqs: int = 2 max_loras: int = 2 max_lora_rank: int = 32 - enable_tower_connector_lora: bool = True + enable_tower_connector_lora: bool = False max_model_len: int = 8192 gpu_memory_utilization: float = 0.85 mm_processor_kwargs: dict[str, int] | None = None + mm_processor_cache_gb: float = 4 def __post_init__(self): if self.mm_processor_kwargs is None: @@ -54,6 +55,7 @@ class Qwen2VLTester: trust_remote_code=True, gpu_memory_utilization=self.config.gpu_memory_utilization, mm_processor_kwargs=self.config.mm_processor_kwargs, + mm_processor_cache_gb=self.config.mm_processor_cache_gb, max_model_len=self.config.max_model_len, ) @@ -62,6 +64,7 @@ class Qwen2VLTester: images: list[ImageAsset], expected_outputs: list[str], lora_id: int | None = None, + lora_name: str | None = None, temperature: float = 0, max_tokens: int = 5, ): @@ -77,7 +80,9 @@ class Qwen2VLTester: for asset in images ] - lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path) + lora_request = LoRARequest( + lora_name if lora_name else str(lora_id), lora_id, self.config.lora_path + ) outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request) generated_texts = [output.outputs[0].text.strip() for output in outputs] # Validate outputs @@ -207,59 +212,15 @@ def 
test_qwen25vl_lora(qwen25vl_lora_files): tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id) -def test_qwen2vl_language_lora(qwen2vl_language_lora_files): - """ - Test language-only LoRA adapter. - """ - config = TestConfig( - model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_language_lora_files - ) - tester = Qwen2VLTester(config) - for lora_id in [1, 2]: - tester.run_test( - TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS_LANGUAGE, lora_id=lora_id - ) - - -def test_qwen2vl_vision_lora(qwen2vl_vision_tower_connector_lora_files): - """ - Test vision tower + connector LoRA adapter. - """ - config = TestConfig( - model_path=QWEN2VL_MODEL_PATH, - lora_path=qwen2vl_vision_tower_connector_lora_files, - ) - tester = Qwen2VLTester(config) - for lora_id in [1, 2]: - tester.run_test( - TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS_VISION, lora_id=lora_id - ) - - -def test_qwen2vl_vision_no_connector_lora( - qwen2vl_vision_tower_lora_files, -): - """ - Test vision tower only LoRA adapter. - - """ - config = TestConfig( - model_path=QWEN2VL_MODEL_PATH, - lora_path=qwen2vl_vision_tower_lora_files, - ) - tester = Qwen2VLTester(config) - for lora_id in [1, 2]: - tester.run_test( - TEST_IMAGES, - expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR, - lora_id=lora_id, - ) - - def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files): config = TestConfig( model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_vision_lora_files, + # Currently, tower_connector_lora is incompatible with + # the multi-modal processor cache. + # TODO: Remove this restriction + mm_processor_cache_gb=0, + enable_tower_connector_lora=True, ) tester = Qwen2VLTester(config) for lora_id in [1, 2]: @@ -274,6 +235,11 @@ def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files): config = TestConfig( model_path=QWEN3VL_MODEL_PATH, lora_path=qwen3vl_vision_lora_files, + # Currently, tower_connector_lora is incompatible with + # the multi-modal processor cache. 
+ # TODO: Remove this restriction + mm_processor_cache_gb=0, + enable_tower_connector_lora=True, ) tester = Qwen2VLTester(config) for lora_id in [1, 2]: @@ -282,3 +248,61 @@ def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files): expected_outputs=EXPECTED_OUTPUTS_VISION_QWEN3_VL, lora_id=lora_id, ) + + +def test_qwen2vl_multiple_lora_types( + qwen2vl_language_lora_files, + qwen2vl_vision_tower_connector_lora_files, + qwen2vl_vision_tower_lora_files, +): + """ + Test multiple LoRA adapter types (language, vision tower + connector, + vision tower only) using the same LLM instance to verify mm_encoder_cache + behavior with different LoRA requests. + + By reusing the same LLM instance across different LoRA requests, we ensure that + the multimodal encoder cache correctly manages state transitions between + language-only and vision-enabled LoRA adapters. + """ + config = TestConfig( + model_path=QWEN2VL_MODEL_PATH, + # We'll override the lora_path for each specific test, but need to provide + # an initial path for initialization + lora_path=qwen2vl_language_lora_files, + # Currently, tower_connector_lora is incompatible with + # the multi-modal processor cache. 
+ # TODO: Remove this restriction + mm_processor_cache_gb=0, + enable_tower_connector_lora=True, + ) + tester = Qwen2VLTester(config) + + # Test 1: Language-only LoRA adapter + tester.config.lora_path = qwen2vl_language_lora_files + for lora_id in [1, 2]: + tester.run_test( + TEST_IMAGES, + expected_outputs=EXPECTED_OUTPUTS_LANGUAGE, + lora_id=lora_id, + lora_name="language_only", + ) + + # Test 2: Vision tower + connector LoRA adapter + tester.config.lora_path = qwen2vl_vision_tower_connector_lora_files + for lora_id in [3, 4]: + tester.run_test( + TEST_IMAGES, + expected_outputs=EXPECTED_OUTPUTS_VISION, + lora_id=lora_id, + lora_name="vision_tower_connector", + ) + + # Test 3: Vision tower only LoRA adapter (no connector) + tester.config.lora_path = qwen2vl_vision_tower_lora_files + for lora_id in [5, 6]: + tester.run_test( + TEST_IMAGES, + expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR, + lora_id=lora_id, + lora_name="vision_tower", + ) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b222b63853c9f..24c4f1d91638e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1647,6 +1647,19 @@ class EngineArgs: else None ) + if ( + lora_config is not None + and lora_config.enable_tower_connector_lora + and self.mm_processor_cache_gb != 0 + ): + raise ValueError( + "Currently, enable_tower_connector_lora is " + "incompatible with the multi-modal processor cache. 
" + "When enable_tower_connector_lora is set, " + "mm_processor_cache_gb must be 0, got %s", + self.mm_processor_cache_gb, + ) + if ( lora_config is not None and speculative_config is not None diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 3bccebe612571..8717e7e24d7ae 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -406,6 +406,20 @@ class InputProcessor: mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)] return mm_uuids + def _get_mm_identifier( + self, + mm_hash: str, + lora_request: LoRARequest | None, + ) -> str: + """ + When enable_tower_connector_lora is True, multi-modal embeddings + vary depending on the LoRA request. Therefore, the mm_hash must be + generated based on the LoRA request to prevent incorrect cache hits. + """ + if lora_request is None or not self.lora_config.enable_tower_connector_lora: + return mm_hash + return f"{lora_request.lora_name}:{mm_hash}" + def process_inputs( self, request_id: str, @@ -458,28 +472,6 @@ class InputProcessor: else: mm_uuids = None - # When enable_tower_connector_lora is True, multi-modal embeddings - # vary depending on the LoRA request. Therefore, the mm_hash must be - # generated based on the LoRA request to prevent incorrect cache hits. - lora_config = self.lora_config - if ( - mm_uuids - and lora_request - and lora_config - and lora_config.enable_tower_connector_lora - ): - - def add_mm_lora_prefix(val): - if isinstance(val, list): - return [ - f"{lora_request.lora_name}:{v}" if v is not None else None - for v in val - ] - else: - return f"{lora_request.lora_name}:{val}" - - mm_uuids = {k: add_mm_lora_prefix(v) for k, v in mm_uuids.items()} - # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. # 2. 
For multimodal models with a merged preprocessor, preprocess @@ -548,7 +540,10 @@ class InputProcessor: MultiModalFeatureSpec( data=decoder_mm_inputs[modality][idx], modality=modality, - identifier=decoder_mm_hashes[modality][idx], + identifier=self._get_mm_identifier( + decoder_mm_hashes[modality][idx], + lora_request, + ), mm_position=decoder_mm_positions[modality][idx], ) ) From 390ac9a4a48d09ffa7cba861a942fe00330ec3b8 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Tue, 23 Dec 2025 02:48:23 +0000 Subject: [PATCH 52/53] fix pre-commit Signed-off-by: bk-201 --- vllm/v1/engine/input_processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 8717e7e24d7ae..2cbf39c859d6e 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -416,7 +416,11 @@ class InputProcessor: vary depending on the LoRA request. Therefore, the mm_hash must be generated based on the LoRA request to prevent incorrect cache hits. 
""" - if lora_request is None or not self.lora_config.enable_tower_connector_lora: + if ( + lora_request is None + or self.lora_config is None + or not self.lora_config.enable_tower_connector_lora + ): return mm_hash return f"{lora_request.lora_name}:{mm_hash}" From 57d7267fee3c21f36547f3a6cff4675552879ae2 Mon Sep 17 00:00:00 2001 From: bk-201 Date: Wed, 24 Dec 2025 04:43:05 +0000 Subject: [PATCH 53/53] cleanup Signed-off-by: bk-201 --- vllm/v1/worker/gpu_model_runner.py | 41 ++++++++++++------------------ 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 160b882c6beb7..6916b2c867aa4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2167,57 +2167,48 @@ class GPUModelRunner( prompt_lora_mapping = [] token_lora_mapping = [] lora_requests = set() + encoder_token_counts = [] for req_id, pos_info in mm_lora_refs: req_idx = self.input_batch.req_id_to_index[req_id] lora_id = int(self.input_batch.request_lora_mapping[req_idx]) - # Prefer pos_info.is_embed to count actual MM embedding tokens. - # pos_info.length may overcount (e.g., special tokens in Qwen-VL). - # Fall back to length if is_embed is None. + # Prefer pos_info.get_num_embeds to count precise MM embedding tokens. 
num_tokens = self.model.get_num_mm_encoder_tokens( # type: ignore[attr-defined] pos_info.get_num_embeds ) prompt_lora_mapping.append(lora_id) token_lora_mapping.extend([lora_id] * num_tokens) + encoder_token_counts.append(num_tokens) if lora_id > 0: lora_request = self.input_batch.lora_id_to_lora_request.get(lora_id) if lora_request is not None: lora_requests.add(lora_request) - lora_mapping = LoRAMapping( + # Set tower adapter mapping + tower_mapping = LoRAMapping( tuple(token_lora_mapping), tuple(prompt_lora_mapping), is_prefill=True, type=LoRAMappingType.TOWER, ) - self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + self.lora_manager.set_active_adapters(lora_requests, tower_mapping) if hasattr(self.model, "get_num_mm_connector_tokens"): - num_post_op_tokens = [] - for _, pos_info in mm_lora_refs: - mm_token_count = self.model.get_num_mm_encoder_tokens( # type: ignore[attr-defined] - pos_info.length - ) - post_op_count = self.model.get_num_mm_connector_tokens( # type: ignore[attr-defined] - mm_token_count - ) - num_post_op_tokens.append(post_op_count) + post_op_counts = [ + self.model.get_num_mm_connector_tokens(num_tokens) # type: ignore[attr-defined] + for num_tokens in encoder_token_counts + ] - last_mapping = self.lora_manager._adapter_manager._last_mapping - assert last_mapping is not None - lora_ids = np.array( - last_mapping.prompt_mapping, - dtype=np.int32, + connector_token_mapping = np.repeat( + np.array(prompt_lora_mapping, dtype=np.int32), + np.array(post_op_counts, dtype=np.int32), ) - post_op_counts_np = np.array(num_post_op_tokens, dtype=np.int32) - new_token_indices = lora_ids.repeat(post_op_counts_np) - connector_mapping = LoRAMapping( - index_mapping=tuple(new_token_indices.tolist()), - prompt_mapping=last_mapping.prompt_mapping, - is_prefill=last_mapping.is_prefill, + index_mapping=tuple(connector_token_mapping.tolist()), + prompt_mapping=tuple(prompt_lora_mapping), + is_prefill=True, type=LoRAMappingType.CONNECTOR, )