From 3cce5bd9476d44dc00eb414bb6b61036585858ac Mon Sep 17 00:00:00 2001
From: Jackmin801
Date: Wed, 24 Dec 2025 23:43:19 +0000
Subject: [PATCH 1/3] reuse old request when same lora is reloaded

Signed-off-by: Jackmin801
---
 vllm/entrypoints/openai/serving_models.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 6b03fa72fc594..ede418f09937d 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -142,10 +142,14 @@ class OpenAIServingModels:
             return error_check_ret
 
         lora_path = request.lora_path
-        unique_id = self.lora_id_counter.inc(1)
-        lora_request = LoRARequest(
-            lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
-        )
+        if lora_name in self.lora_requests:
+            lora_request = self.lora_requests[lora_name]
+            lora_request.lora_path = lora_path
+        else:
+            unique_id = self.lora_id_counter.inc(1)
+            lora_request = LoRARequest(
+                lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
+            )
 
         if base_model_name is not None and self.is_base_model(base_model_name):
             lora_request.base_model_name = base_model_name
@@ -197,15 +201,6 @@ class OpenAIServingModels:
                 status_code=HTTPStatus.BAD_REQUEST,
             )
 
-        # Check if the lora adapter with the given name already exists
-        if request.lora_name in self.lora_requests:
-            return create_error_response(
-                message=f"The lora adapter '{request.lora_name}' has already been "
-                "loaded.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST,
-            )
-
         return None
 
     async def _check_unload_lora_adapter_request(

From 0616d19f8ba8e8f79dc98365cabf6f86b67e75a8 Mon Sep 17 00:00:00 2001
From: Jackmin801
Date: Wed, 24 Dec 2025 23:56:52 +0000
Subject: [PATCH 2/3] make LRUCacheWorkerLoRAManager force reload lora when not doing _apply_adapters

Signed-off-by: Jackmin801
---
 vllm/lora/worker_manager.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 28c2a53d84e42..805f3a6f09862 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -236,21 +236,24 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
                 f"({self._adapter_manager.lora_slots})."
             )
         for lora in loras_map.values():
-            self.add_adapter(lora)
+            self.add_adapter(lora, force_load=False)
 
-    def add_adapter(self, lora_request: LoRARequest) -> bool:
+    def add_adapter(self, lora_request: LoRARequest, force_load: bool = True) -> bool:
         # Note that this method is not thread-safe. It may be invoked multiple
         # times for the same adapter when using multiple API servers.
         # This is ok because it's currently only called from
         # the single-threaded core engine loop.
-        if lora_request.lora_int_id not in self.list_adapters():
+        if lora_request.lora_int_id not in self.list_adapters() or force_load:
             # Load the new adapter first to ensure it is actually valid, before
             # evicting any existing adapters.
             # This may cause the # of loaded lora adapters to very temporarily
             # exceed `--max-cpu-loras`.
             lora = self._load_adapter(lora_request)
+
+            # Remove the existing adapter if it exists
+            self._adapter_manager.remove_adapter(lora.id)
+
             # Loading succeeded, now check if we will exceed cache capacity and
             # evict if the oldest adapter if so
             if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:

From 0718a782eea404a90f461fed3da6eb48f30f5114 Mon Sep 17 00:00:00 2001
From: Jackmin801
Date: Thu, 25 Dec 2025 00:22:35 +0000
Subject: [PATCH 3/3] use new request object so that errors don't update request

Signed-off-by: Jackmin801
---
 vllm/entrypoints/openai/serving_models.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index ede418f09937d..b610a1f1101ac 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -142,14 +142,14 @@ class OpenAIServingModels:
             return error_check_ret
 
         lora_path = request.lora_path
-        if lora_name in self.lora_requests:
-            lora_request = self.lora_requests[lora_name]
-            lora_request.lora_path = lora_path
-        else:
-            unique_id = self.lora_id_counter.inc(1)
-            lora_request = LoRARequest(
-                lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
-            )
+        lora_int_id = (
+            self.lora_requests[lora_name].lora_int_id
+            if lora_name in self.lora_requests
+            else self.lora_id_counter.inc(1)
+        )
+        lora_request = LoRARequest(
+            lora_name=lora_name, lora_int_id=lora_int_id, lora_path=lora_path
+        )
 
         if base_model_name is not None and self.is_base_model(base_model_name):
             lora_request.base_model_name = base_model_name