mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-31 20:07:08 +08:00
Make LRUCacheWorkerLoRAManager force-reload LoRA adapters when not running _apply_adapters
Signed-off-by: Jackmin801 <ongjackm@gmail.com>
This commit is contained in:
parent
3cce5bd947
commit
0616d19f8b
@@ -236,21 +236,24 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
|
||||
f"({self._adapter_manager.lora_slots})."
|
||||
)
|
||||
for lora in loras_map.values():
|
||||
self.add_adapter(lora)
|
||||
self.add_adapter(lora, force_load=False)
|
||||
|
||||
def add_adapter(self, lora_request: LoRARequest) -> bool:
|
||||
def add_adapter(self, lora_request: LoRARequest, force_load: bool = True) -> bool:
|
||||
# Note that this method is not thread-safe. It may be invoked multiple
|
||||
# times for the same adapter when using multiple API servers.
|
||||
# This is ok because it's currently only called from
|
||||
# the single-threaded core engine loop.
|
||||
|
||||
if lora_request.lora_int_id not in self.list_adapters():
|
||||
if lora_request.lora_int_id not in self.list_adapters() or force_load:
|
||||
# Load the new adapter first to ensure it is actually valid, before
|
||||
# evicting any existing adapters.
|
||||
# This may cause the # of loaded lora adapters to very temporarily
|
||||
# exceed `--max-cpu-loras`.
|
||||
lora = self._load_adapter(lora_request)
|
||||
|
||||
# Remove the existing adapter if it exists
|
||||
self._adapter_manager.remove_adapter(lora.id)
|
||||
|
||||
# Loading succeeded, now check if we will exceed cache capacity and
|
||||
# evict the oldest adapter if so
|
||||
if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user