From 2f5f9acd551cfb737997a1f7f86982ec74aabf79 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 27 Nov 2025 21:56:28 +0800 Subject: [PATCH] [LoRA] Continue optimizing MoE LoRA weight loading (#29322) Signed-off-by: Jee Jee Li --- tests/lora/test_lora_checkpoints.py | 15 +- tests/lora/test_lora_huggingface.py | 8 +- vllm/lora/layers/base.py | 2 +- vllm/lora/layers/column_parallel_linear.py | 16 +- vllm/lora/layers/fused_moe.py | 202 ++++++++++--------- vllm/lora/layers/logits_processor.py | 2 +- vllm/lora/layers/replicated_linear.py | 2 +- vllm/lora/layers/row_parallel_linear.py | 4 +- vllm/lora/layers/vocal_parallel_embedding.py | 2 +- vllm/lora/lora_weights.py | 53 +++++ vllm/lora/models.py | 50 ++--- vllm/lora/utils.py | 17 +- vllm/lora/worker_manager.py | 10 +- vllm/model_executor/models/interfaces.py | 1 + vllm/model_executor/models/qwen3_vl_moe.py | 1 + 15 files changed, 228 insertions(+), 157 deletions(-) diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 2219d470e91a1..b9b1bc59c6ed7 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -28,12 +28,13 @@ def test_load_checkpoints( packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) if lora_name == "baichuan7B": peft_helper = PEFTHelper.from_local_dir( baichuan_lora_files, max_position_embeddings=4096 @@ -103,13 +104,13 @@ def test_lora_weights_mapping(baichuan_lora_files): 
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) - + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.": "language_model.model.", diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 7d20faef541aa..6a787471c74fd 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -26,13 +26,13 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping embedding_modules = LlamaForCausalLM.embedding_modules embed_padding_modules = LlamaForCausalLM.embedding_padding_modules - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in LLAMA_LORA_MODULES: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) - + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) lora_path = get_adapter_absolute_path(lora_name) # lora loading should work for either absolute path and huggingface id. 
diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py index 3bfb88c007622..a4b8fb4d2aec5 100644 --- a/vllm/lora/layers/base.py +++ b/vllm/lora/layers/base.py @@ -60,7 +60,7 @@ class BaseLayerWithLoRA(nn.Module): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" raise NotImplementedError diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 3e21d426c304a..904025901fba7 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -153,7 +153,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is ColumnParallelLinear or ( type(source_layer) is MergedColumnParallelLinear @@ -272,7 +272,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return ( type(source_layer) is MergedColumnParallelLinear @@ -338,7 +338,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 1 @@ -396,7 +396,7 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: 
PretrainedConfig | None = None, ) -> bool: return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 3 @@ -434,7 +434,7 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -480,7 +480,7 @@ class MergedColumnParallelLinearWithShardedLoRA(MergedColumnParallelLinearWithLo source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -516,7 +516,7 @@ class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -565,7 +565,7 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 1b925742c3002..3ad19370962ab 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -401,6 +401,61 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w13_lora_b_stacked[1][lora_id][experts_id] ) + def _slice_w13_a(self, w13_lora_a: torch.Tensor) -> torch.Tensor: + """ + Applies to 
FusedMoEWithLoRA and FusedMoE3DWithLoRA + """ + if self.tp_size == 1 or not self.fully_sharded: + return w13_lora_a + + # w13_lora_a shape (num_experts,rank,input_size) + current_lora_rank = w13_lora_a.shape[1] + assert current_lora_rank % self.tp_size == 0 + # Based on S-LoRA, we slice W13/W1/W3 A along the rank dim. + sliced_rank = current_lora_rank // self.tp_size + start_idx = self.tp_rank * sliced_rank + end_idx = (self.tp_rank + 1) * sliced_rank + return w13_lora_a[:, start_idx:end_idx, :] + + def _slice_w13_b(self, w13_lora_b: torch.Tensor): + if self.tp_size == 1: + return w13_lora_b + + # w13_lora_b shape (num_experts,output_size,rank) + shard_size = self.base_layer.intermediate_size_per_partition + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + + return w13_lora_b[:, start_idx:end_idx, :] + + def _slice_w2_a(self, w2_lora_a: torch.Tensor) -> torch.Tensor: + """ + Applies to FusedMoEWithLoRA and FusedMoE3DWithLoRA + """ + if self.tp_size == 1: + return w2_lora_a + # w2_lora_a shape (num_experts,rank,input_size) + shard_size = self.base_layer.intermediate_size_per_partition + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + + return w2_lora_a[:, :, start_idx:end_idx] + + def _slice_w2_b(self, w2_lora_b: torch.Tensor) -> torch.Tensor: + """ + Applies to FusedMoEWithLoRA and FusedMoE3DWithLoRA + """ + if self.tp_size == 1 or not self.fully_sharded: + return w2_lora_b + # Based on S-LoRA, we slice W2 B along the hidden_size dim. 
+ # w2_lora_b shape (num_experts,output_size,rank) + current_lora_size = w2_lora_b.shape[1] + + sliced_size = current_lora_size // self.tp_size + start_idx = self.tp_rank * sliced_size + end_idx = (self.tp_rank + 1) * sliced_size + return w2_lora_b[:, start_idx:end_idx, :] + def reset_lora(self, index: int): """Resets the lora weights at index back to 0.""" for pos in range(self._w13_slices): @@ -411,6 +466,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w2_lora_b_stacked[0][index] = 0 self.adapter_enabled[index] = 0 + # + def set_lora( self, index: int, @@ -418,69 +475,55 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): lora_b: torch.Tensor | list[torch.Tensor], ): """Overwrites lora tensors at index.""" + # Make mypy happy assert isinstance(lora_a, list) assert isinstance(lora_b, list) + self.reset_lora(index) self.adapter_enabled[index] = 1 - for eid in range(len(lora_a) // 3): - w1_lora_a = lora_a[eid * 3] - w2_lora_a = lora_a[eid * 3 + 1] - w3_lora_a = lora_a[eid * 3 + 2] - w1_lora_b = lora_b[eid * 3] - w2_lora_b = lora_b[eid * 3 + 1] - w3_lora_b = lora_b[eid * 3 + 2] - # Handle the case of adding LoRA to only a subset of experts - if w1_lora_a is None or w2_lora_a is None or w3_lora_a is None: - continue + num_experts = self.w13_lora_a_stacked[0].shape[1] - if self.tp_size > 1: - shard_size = self.base_layer.intermediate_size_per_partition - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size + w1_lora_a, w2_lora_a, w3_lora_a = lora_a + w1_lora_b, w2_lora_b, w3_lora_b = lora_b + assert ( + num_experts + == w1_lora_a.shape[0] + == w2_lora_a.shape[0] + == w3_lora_a.shape[0] + ) - w1_lora_b = w1_lora_b[start_idx:end_idx, :] - w3_lora_b = w3_lora_b[start_idx:end_idx, :] - w2_lora_a = w2_lora_a[:, start_idx:end_idx] + sliced_w1_lora_a = self._slice_w13_a(w1_lora_a) + sliced_w1_lora_b = self._slice_w13_b(w1_lora_b) + sliced_w3_lora_a = self._slice_w13_a(w3_lora_a) + sliced_w3_lora_b = self._slice_w13_b(w3_lora_b) - if 
self.fully_sharded: - # Based on S-LoRA, we slice W1 and W3 A along the rank dim, - # and W2 B along the hidden_size dim. - w13_shard_size = self.w13_lora_a_stacked[0][index, eid].shape[0] - w13_start_idx = self.tp_rank * w13_shard_size - w13_end_idx = (self.tp_rank + 1) * w13_shard_size - w1_lora_a = w1_lora_a[w13_start_idx:w13_end_idx, :] - w3_lora_a = w3_lora_a[w13_start_idx:w13_end_idx, :] + sliced_w2_lora_a = self._slice_w2_a(w2_lora_a) + sliced_w2_lora_b = self._slice_w2_b(w2_lora_b) - w2_shard_size = self.w2_lora_b_stacked[0][index, eid].shape[0] - w2_start_idx = self.tp_rank * w2_shard_size - w2_end_idx = (self.tp_rank + 1) * w2_shard_size - w2_lora_b = w2_lora_b[w2_start_idx:w2_end_idx, :] - # w1 lora_a - self.w13_lora_a_stacked[0][ - index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1] - ].copy_(w1_lora_a, non_blocking=True) - # w3 lora_a - self.w13_lora_a_stacked[1][ - index, eid, : w3_lora_a.shape[0], : w3_lora_a.shape[1] - ].copy_(w3_lora_a, non_blocking=True) + self.w13_lora_a_stacked[0][ + index, :, : sliced_w1_lora_a.shape[1], : sliced_w1_lora_a.shape[2] + ].copy_(sliced_w1_lora_a, non_blocking=True) - # w1 lora_b - self.w13_lora_b_stacked[0][ - index, eid, : w1_lora_b.shape[0], : w1_lora_b.shape[1] - ].copy_(w1_lora_b, non_blocking=True) - # w3 lora_b - self.w13_lora_b_stacked[1][ - index, eid, : w3_lora_b.shape[0], : w3_lora_b.shape[1] - ].copy_(w3_lora_b, non_blocking=True) + self.w13_lora_a_stacked[1][ + index, :, : sliced_w3_lora_a.shape[1], : sliced_w3_lora_a.shape[2] + ].copy_(sliced_w3_lora_a, non_blocking=True) - self.w2_lora_a_stacked[0][ - index, eid, : w2_lora_a.shape[0], : w2_lora_a.shape[1] - ].copy_(w2_lora_a, non_blocking=True) + self.w13_lora_b_stacked[0][ + index, :, : sliced_w1_lora_b.shape[1], : sliced_w1_lora_b.shape[2] + ].copy_(sliced_w1_lora_b, non_blocking=True) - self.w2_lora_b_stacked[0][ - index, eid, : w2_lora_b.shape[0], : w2_lora_b.shape[1] - ].copy_(w2_lora_b, non_blocking=True) + 
self.w13_lora_b_stacked[1][ + index, :, : sliced_w3_lora_b.shape[1], : sliced_w3_lora_b.shape[2] + ].copy_(sliced_w3_lora_b, non_blocking=True) + + self.w2_lora_a_stacked[0][ + index, :, : sliced_w2_lora_a.shape[1], : sliced_w2_lora_a.shape[2] + ].copy_(sliced_w2_lora_a, non_blocking=True) + + self.w2_lora_b_stacked[0][ + index, :, : sliced_w2_lora_b.shape[1], : sliced_w2_lora_b.shape[2] + ].copy_(sliced_w2_lora_b, non_blocking=True) def forward(self, *args, **kwargs): return self.base_layer.forward(*args, **kwargs) @@ -506,12 +549,12 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" - # return type(source_layer) is FusedMoE - return type(source_layer) is FusedMoE and len(packed_modules_list) == 2 + # source_layer is FusedMoE or SharedFusedMoE + return isinstance(source_layer, FusedMoE) and len(packed_modules_list) == 2 class FusedMoE3DWithLoRA(FusedMoEWithLoRA): @@ -555,6 +598,9 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): model_config: PretrainedConfig | None = None, ) -> None: """Initializes lora matrices.""" + + assert isinstance(model_config, PretrainedConfig) + self._base_model = model_config.architectures[0] self.max_loras = lora_config.max_loras self.fully_sharded = lora_config.fully_sharded_loras @@ -565,20 +611,7 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): self._create_lora_a_weights(max_loras, lora_config) self._create_lora_b_weights(max_loras, lora_config) - def _slice_w13_a(self, w13_lora_a: torch.Tensor) -> torch.Tensor: - if self.tp_size == 1 or not self.fully_sharded: - return w13_lora_a - - # w13_lora_a shape (num_experts,rank,input_size) - current_lora_rank = w13_lora_a.shape[1] - assert current_lora_rank % self.tp_size == 0 - - sliced_rank = current_lora_rank // self.tp_size - start_idx = self.tp_rank 
* sliced_rank - end_idx = (self.tp_rank + 1) * sliced_rank - return w13_lora_a[:, start_idx:end_idx, :] - - def _slice_w13_b(self, w13_lora_b: torch.Tensor, is_interleave: bool = True): + def _slice_w13_b(self, w13_lora_b: torch.Tensor): if self.tp_size == 1: return w13_lora_b @@ -586,7 +619,8 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): shard_size = self.base_layer.intermediate_size_per_partition start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size - if is_interleave: + # HACK: Currently, only GPT-OSS is in interleaved order + if self._base_model == "GptOssForCausalLM": # For models like GPT-OSS, the weights of w1 (gate_proj) and w3 (up_proj) # in the interleaved order, and corresponding LoRA need to be processed. w1_lora_b = w13_lora_b[:, ::2, :] @@ -606,28 +640,6 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): return torch.cat([sliced_w1_lora_b, sliced_w3_lora_b], dim=1) - def _slice_w2_a(self, w2_lora_a: torch.Tensor) -> torch.Tensor: - if self.tp_size == 1: - return w2_lora_a - # w2_lora_a shape (num_experts,rank,input_size) - shard_size = self.base_layer.intermediate_size_per_partition - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size - - return w2_lora_a[:, :, start_idx:end_idx] - - def _slice_w2_b(self, w2_lora_b: torch.Tensor) -> torch.Tensor: - if self.tp_size == 1 or not self.fully_sharded: - return w2_lora_b - # Based on S-LoRA, we slice W2 B along the hidden_size dim. 
- # w2_lora_b shape (num_experts,output_size,rank) - current_lora_size = w2_lora_b.shape[1] - - sliced_size = current_lora_size // self.tp_size - start_idx = self.tp_rank * sliced_size - end_idx = (self.tp_rank + 1) * sliced_size - return w2_lora_b[:, start_idx:end_idx, :] - def set_lora( self, index: int, @@ -658,7 +670,7 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): w2_lora_b = w2_lora_b.permute(1, 0, 2) sliced_w13_lora_a = self._slice_w13_a(w13_lora_a) - sliced_w13_lora_b = self._slice_w13_b(w13_lora_b, is_interleave=True) + sliced_w13_lora_b = self._slice_w13_b(w13_lora_b) sliced_w2_lora_a = self._slice_w2_a(w2_lora_a) sliced_w2_lora_b = self._slice_w2_b(w2_lora_b) @@ -711,8 +723,8 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" - - return type(source_layer) is FusedMoE and len(packed_modules_list) == 1 + # source_layer is FusedMoE or SharedFusedMoE + return isinstance(source_layer, FusedMoE) and len(packed_modules_list) == 1 diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py index c01984db4e64c..01515f6136371 100644 --- a/vllm/lora/layers/logits_processor.py +++ b/vllm/lora/layers/logits_processor.py @@ -197,7 +197,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # Special handling for the LogitsProcessor. 
return False diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py index 243736c4ebc65..62bac546ccd1a 100644 --- a/vllm/lora/layers/replicated_linear.py +++ b/vllm/lora/layers/replicated_linear.py @@ -53,7 +53,7 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is ReplicatedLinear diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index 95517b1aee263..958aa6af36746 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -87,7 +87,7 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is RowParallelLinear @@ -164,7 +164,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py index c87ca9e24dece..4c1550d09e5e2 100644 --- a/vllm/lora/layers/vocal_parallel_embedding.py +++ b/vllm/lora/layers/vocal_parallel_embedding.py @@ -131,7 +131,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is VocabParallelEmbedding diff --git 
a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py index f0d8e22194050..15c4a1be63eeb 100644 --- a/vllm/lora/lora_weights.py +++ b/vllm/lora/lora_weights.py @@ -152,6 +152,59 @@ class PackedLoRALayerWeights(LoRALayerWeights): ) return obj + @classmethod + def pack_moe( + cls, loras: GenericSequence[Optional["LoRALayerWeights"]], module_name: str + ) -> "PackedLoRALayerWeights": + """Pack a list of LoRAs into a single LoRA. + + If LoRA is None, it signifies that the submodule does not have a LoRA. + """ + + first_lora = next(lora for lora in loras if lora is not None) + assert first_lora is not None + rank = first_lora.rank + lora_alpha = first_lora.lora_alpha + assert len(loras) % 3 == 0 + w1_lora_a_lst = [] + w2_lora_a_lst = [] + w3_lora_a_lst = [] + w1_lora_b_lst = [] + w2_lora_b_lst = [] + w3_lora_b_lst = [] + # TODO: Consider the case where some experts don't have LoRA added. + for eid in range(len(loras) // 3): + w1_lora = loras[eid * 3] + w2_lora = loras[eid * 3 + 1] + w3_lora = loras[eid * 3 + 2] + assert w1_lora is not None + assert w2_lora is not None + assert w3_lora is not None + + w1_lora_a_lst.append(w1_lora.lora_a) + w2_lora_a_lst.append(w2_lora.lora_a) + w3_lora_a_lst.append(w3_lora.lora_a) + + w1_lora_b_lst.append(w1_lora.lora_b) + w2_lora_b_lst.append(w2_lora.lora_b) + w3_lora_b_lst.append(w3_lora.lora_b) + + w1_lora_a = torch.stack(w1_lora_a_lst, dim=0) # (num_experts,rank,input_size) + w2_lora_a = torch.stack(w2_lora_a_lst, dim=0) + w3_lora_a = torch.stack(w3_lora_a_lst, dim=0) + w1_lora_b = torch.stack(w1_lora_b_lst, dim=0) # (num_experts,output_size,rank) + w2_lora_b = torch.stack(w2_lora_b_lst, dim=0) + w3_lora_b = torch.stack(w3_lora_b_lst, dim=0) + + obj = cls( + module_name, + rank, + [lora_alpha, lora_alpha, lora_alpha], + [w1_lora_a, w2_lora_a, w3_lora_a], + [w1_lora_b, w2_lora_b, w3_lora_b], + ) + return obj + def optimize(self) -> "PackedLoRALayerWeights": """Optimize the LoRA by merging the scaling into lora_b.""" for i in 
range(len(self.lora_b)): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 636f062feb7b0..4caaf0e117cc4 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -13,7 +13,7 @@ from torch import nn from vllm.config.lora import LoRAConfig from vllm.logger import init_logger -from vllm.lora.layers import BaseLayerWithLoRA, FusedMoEWithLoRA, LoRAMapping +from vllm.lora.layers import BaseLayerWithLoRA, FusedMoE3DWithLoRA, LoRAMapping from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import get_punica_wrapper @@ -151,16 +151,13 @@ class LoRAModel: if pin_memory: loras[module_name].lora_b = loras[module_name].lora_b.pin_memory() - for lora in loras.values(): - lora.optimize() - return cls(lora_model_id, peft_helper.r, loras) @classmethod def from_local_checkpoint( cls, lora_dir: str, - expected_lora_modules: list[str], + expected_lora_modules: set[str], peft_helper: PEFTHelper, *, lora_model_id: int | None = None, @@ -190,10 +187,7 @@ class LoRAModel: lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") - # new_embeddings_tensor_path = os.path.join( - # lora_dir, "new_embeddings.safetensors" - # ) - # new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") + tensors: dict[str, torch.Tensor] = {} unexpected_modules: list[list[str] | str] = [] @@ -201,18 +195,19 @@ class LoRAModel: for lora_module in modules.keys(): # noqa if is_base_embeddding_weights(lora_module): continue - module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) - # Handle FSDP file format where experts.base_layer is the + # Handle PEFT file format where experts.base_layer is the # gate_up_proj and experts is the down_proj if "base_layer" in lora_module: continue + module_name, _ = 
parse_fine_tuned_lora_name(lora_module, weights_mapper) # Case for expert lora weights if ".experts" in module_name: - if not any( - module_name.endswith(ele) for ele in expected_lora_modules - ): + expert_idx = module_name.find(".experts") + expert_suffix = module_name[expert_idx + 1 :] + if expert_suffix not in expected_lora_modules: unexpected_modules.append(module_name) - elif module_name.split(".")[-1] not in expected_lora_modules: + + elif module_name.rsplit(".", 1)[-1] not in expected_lora_modules: unexpected_modules.append(module_name) if unexpected_modules: @@ -358,9 +353,7 @@ class LoRAModelManager: self.modules: dict[str, BaseLayerWithLoRA] = {} # Dict instead of a set for compatibility with LRUCache. self._last_mapping: LoRAMapping | None = None - self._is_3d_moe_model = is_moe_model(self.model) and hasattr( - self.model, "is_3d_moe_weight" - ) + self._is_3d_moe_model = is_moe_model(self.model) and self.model.is_3d_moe_weight self._create_lora_modules() self.model.lora_manager = self @@ -411,7 +404,7 @@ class LoRAModelManager: continue # Note (gnovack) - If MOE lora weights are not split into # num_experts chunks, we split them here - if isinstance(module, FusedMoEWithLoRA) and torch.is_tensor( + if isinstance(module, FusedMoE3DWithLoRA) and torch.is_tensor( module_lora.lora_a ): # Handle PEFT file format where experts.base_layer is the @@ -679,7 +672,10 @@ class LoRAModelManager: "cpu", ) subloras.append(lora) - lora = PackedLoRALayerWeights.pack(subloras) + if module.__class__.__name__ == "FusedMoEWithLoRA": + lora = PackedLoRALayerWeights.pack_moe(subloras, module_name) + else: + lora = PackedLoRALayerWeights.pack(subloras) model.loras[module_name] = lora return model @@ -739,13 +735,21 @@ class LoRAModelManager: replaced_module_name = module_name.replace("model.", "") if lora_model.check_lora_name(module_name): module_name = replaced_module_name - lora_model.loras[module_name] = PackedLoRALayerWeights.pack( - replacement_loras - ) + if 
module_name.endswith(".experts"): + lora_model.loras[module_name] = PackedLoRALayerWeights.pack_moe( + replacement_loras, module_name + ) + else: + lora_model.loras[module_name] = PackedLoRALayerWeights.pack( + replacement_loras + ) # Remove the modules that have been replaced. for module in replaced_module: lora_model.loras.pop(module, None) + for lora in lora_model.loras.values(): + lora.optimize() + def _get_lora_layer_weights( self, lora_model: LoRAModel, module_name: str ) -> LoRALayerWeights | None: diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 12524994d4968..47484b2b984df 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -170,16 +170,15 @@ def parse_fine_tuned_lora_name( def is_base_embeddding_weights(name: str) -> bool: # hardcoded subfixes for input & output embedding weights - input_embedding_subfix = ".embed_tokens.base_layer.weight" - output_embedding_subfix = ".lm_head.base_layer.weight" - - return name.endswith(input_embedding_subfix) or name.endswith( - output_embedding_subfix + embedding_suffixes = ( + ".embed_tokens.base_layer.weight", + ".lm_head.base_layer.weight", ) + return name.endswith(embedding_suffixes) def is_regex_target_modules( - load_modules: str | list[str], expected_lora_modules: list[str] + load_modules: str | list[str], expected_lora_modules: set[str] ) -> bool: """ PEFT supports passing `target_modules` in the form of regular expressions, @@ -195,8 +194,8 @@ def is_regex_target_modules( except re.error: return False - def is_subset(sub_list, full_list): - return set(sub_list).issubset(set(full_list)) + def is_subset(sub_list, full_set): + return set(sub_list).issubset(full_set) # Similar to PEFT's processing logic, regex-related operations are only # executed when the load_modules is a `str`. @@ -290,7 +289,7 @@ def process_packed_modules_mapping(model: nn.Module) -> dict[str, list[str]]: # the expert indices are expanded based on the configured number # of routed experts. 
packed_modules_mapping = get_packed_modules_mapping(model) - if not hasattr(model, "is_3d_moe_weight"): + if not model.is_3d_moe_weight: # 3D MoE LoRA does not need `packed_modules_mapping` packed_modules_mapping["experts"] = [ weight_name.rstrip(".") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 4cc201a6414f1..d9a03f0500497 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -88,15 +88,15 @@ class WorkerLoRAManager: try: supported_lora_modules = self._adapter_manager.supported_lora_modules packed_modules_mapping = self._adapter_manager.packed_modules_mapping - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) + expected_lora_lst.append(module) if module == "experts": - expected_lora_modules.append(module) - expected_lora_modules = list(set(expected_lora_modules)) + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) lora_path = get_adapter_absolute_path(lora_request.lora_path) peft_helper = PEFTHelper.from_local_dir( diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 6f6ce32538b71..cee0b79e5e5ac 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -336,6 +336,7 @@ class SupportsLoRA(Protocol): There is no need to redefine this flag if this class is in the MRO of your model class. """ + is_3d_moe_weight: ClassVar[bool] = False # The `embedding_module` and `embedding_padding_modules` # are empty by default. 
embedding_modules: ClassVar[dict[str, str]] = {} diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index e2c129120b1a5..a054bd5b3831e 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -401,6 +401,7 @@ class Qwen3VLMoeMixtureOfExperts(MixtureOfExperts): class Qwen3VLMoeForConditionalGeneration( Qwen3VLForConditionalGeneration, Qwen3VLMoeMixtureOfExperts ): + is_3d_moe_weight: bool = True packed_modules_mapping = { "qkv_proj": [ "q_proj",