From 0616d19f8ba8e8f79dc98365cabf6f86b67e75a8 Mon Sep 17 00:00:00 2001
From: Jackmin801 <ongjackm@gmail.com>
Date: Wed, 24 Dec 2025 23:56:52 +0000
Subject: [PATCH] Make LRUCacheWorkerLoRAManager.add_adapter force-reload LoRA
 adapters unless called from _apply_adapters

Signed-off-by: Jackmin801 <ongjackm@gmail.com>
---
 vllm/lora/worker_manager.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 28c2a53d84e42..805f3a6f09862 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -236,21 +236,24 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
                 f"({self._adapter_manager.lora_slots})."
             )
         for lora in loras_map.values():
-            self.add_adapter(lora)
+            self.add_adapter(lora, force_load=False)
 
-    def add_adapter(self, lora_request: LoRARequest) -> bool:
+    def add_adapter(self, lora_request: LoRARequest, force_load: bool = True) -> bool:
         # Note that this method is not thread-safe. It may be invoked multiple
         # times for the same adapter when using multiple API servers.
         # This is ok because it's currently only called from
         # the single-threaded core engine loop.
 
-        if lora_request.lora_int_id not in self.list_adapters():
+        if lora_request.lora_int_id not in self.list_adapters() or force_load:
             # Load the new adapter first to ensure it is actually valid, before
             # evicting any existing adapters.
             # This may cause the # of loaded lora adapters to very temporarily
             # exceed `--max-cpu-loras`.
             lora = self._load_adapter(lora_request)
 
+            # Drop any already-registered copy of this adapter (the force_load case).
+            self._adapter_manager.remove_adapter(lora.id)
+
             # Loading succeeded, now check if we will exceed cache capacity and
             # evict if the oldest adapter if so
             if len(self._adapter_manager) + 1 > self._adapter_manager.capacity: