diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py
index 1928fc2e3f93..e7bc61329d5a 100644
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -74,7 +74,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
             moe_state_dict["apply_router_weight_on_input"] = kwargs[
                 "apply_router_weight_on_input"
             ]
-            moe_state_dict["max_loras"] = layer.w1_lora_a_stacked.shape[0]
             result = func(*args, **kwargs)
             return result
 
@@ -89,7 +88,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
             curr_topk_ids = moe_state_dict["topk_ids"]
             global_num_experts = moe_state_dict["global_num_experts"]
             expert_map = moe_state_dict["expert_map"]
-            max_loras = moe_state_dict["max_loras"]
 
             config_dtype = _get_config_dtype_str(
                 dtype=hidden_states.dtype,
@@ -110,6 +108,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 block_shape=layer.quant_method.moe_quant_config.block_shape,
             )
 
+            max_loras = self.w1_lora_a_stacked.shape[0]
             config = get_config_func(M)
             (
                 sorted_token_ids_lora,
@@ -161,7 +160,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
         def wrapper(*args, **kwargs):
             hidden_states = moe_state_dict["hidden_states"]
             topk_weights = moe_state_dict["topk_weights"]
-            max_loras = moe_state_dict["max_loras"]
 
             config_dtype = _get_config_dtype_str(
                 dtype=hidden_states.dtype,
@@ -189,7 +187,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 num_tokens_post_padded_lora = moe_state_dict[
                     "num_tokens_post_padded_lora"
                 ]
-
+            max_loras = self.w1_lora_a_stacked.shape[0]
             expert_ids_lora = expert_ids_lora.view(max_loras, -1)
             sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1)
             intermediate_cache2 = moe_state_dict["intermediate_cache2"]
@@ -305,12 +303,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
             device=self.device,
         )
 
-        self.base_layer.w1_lora_a_stacked = self.w1_lora_a_stacked
-        self.base_layer.w1_lora_b_stacked = self.w1_lora_b_stacked
-        self.base_layer.w2_lora_a_stacked = self.w2_lora_a_stacked
-        self.base_layer.w2_lora_b_stacked = self.w2_lora_b_stacked
-        self.base_layer.w3_lora_a_stacked = self.w3_lora_a_stacked
-        self.base_layer.w3_lora_b_stacked = self.w3_lora_b_stacked
         # They will be used by 'LoRALayerWeights.create_dummy_lora_weights'
         # to create a dummy LoRA weights.
         self.lora_a_stacked = []
@@ -343,6 +335,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
         embeddings_tensor: torch.Tensor | None,
         bias: torch.Tensor | None = None,
     ):
+        self.reset_lora(index)
         """Overwrites lora tensors at index."""
         for eid in range(len(lora_a) // 3):
             w1_lora_a = lora_a[eid * 3]
@@ -352,6 +345,10 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
             w2_lora_b = lora_b[eid * 3 + 1]
             w3_lora_b = lora_b[eid * 3 + 2]
 
+            # Handle the case of adding LoRA to only a subset of experts
+            if w1_lora_a is None or w2_lora_a is None or w3_lora_a is None:
+                continue
+
             if self.tp_size > 1:
                 shard_size = self.base_layer.intermediate_size_per_partition
                 start_idx = self.tp_rank * shard_size
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 27d52f17816a..fb2b9bdceddf 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -426,7 +426,6 @@ class LoRAModelManager:
         for module_name, module in self.modules.items():
             module_lora = self._get_lora_layer_weights(lora_model, module_name)
             if module_lora:
-                module_lora.optimize()
                 # Note (gnovack) - If MOE lora weights are not split into
                 # num_experts chunks, we split them here
                 if isinstance(module, FusedMoEWithLoRA) and torch.is_tensor(
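
For readers skimming the patch, here is a minimal, self-contained sketch (not the vLLM implementation) of the two conventions the change relies on: `max_loras` is recovered from the leading dimension of the stacked LoRA-A buffer instead of being threaded through `moe_state_dict`, and experts whose LoRA tensors are `None` are skipped when writing into the stacked buffers. The tensor names mirror the diff; the shapes and helper variables are illustrative assumptions.

```python
import torch

# Illustrative sizes (assumptions, not vLLM defaults).
max_loras, num_experts, rank, hidden = 4, 8, 16, 64

# Stacked buffer: one slot per LoRA adapter, one sub-slot per expert.
w1_lora_a_stacked = torch.zeros(max_loras, num_experts, rank, hidden)

# max_loras is derived where it is needed, as the diff does via
# `self.w1_lora_a_stacked.shape[0]`, rather than stashed in a state dict.
assert w1_lora_a_stacked.shape[0] == max_loras

# Per-expert LoRA-A matrices for one adapter; None means this expert
# has no LoRA weights (LoRA trained on a subset of experts).
lora_a_per_expert = [
    torch.randn(rank, hidden) if eid % 2 == 0 else None
    for eid in range(num_experts)
]

index = 1  # adapter slot being (re)written

# Reset the slot first, then copy only the experts that have weights.
w1_lora_a_stacked[index].zero_()
for eid, w1_lora_a in enumerate(lora_a_per_expert):
    if w1_lora_a is None:
        continue  # subset-of-experts case: leave this expert's slot zeroed
    w1_lora_a_stacked[index, eid].copy_(w1_lora_a)
```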