mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-25 07:37:52 +08:00
Merge 0718a782eea404a90f461fed3da6eb48f30f5114 into 254f6b986720c92ddf97fbb1a6a6465da8e87e29
This commit is contained in:
commit
0562ae7cd3
@ -142,9 +142,13 @@ class OpenAIServingModels:
|
|||||||
return error_check_ret
|
return error_check_ret
|
||||||
|
|
||||||
lora_path = request.lora_path
|
lora_path = request.lora_path
|
||||||
unique_id = self.lora_id_counter.inc(1)
|
lora_int_id = (
|
||||||
|
self.lora_requests[lora_name].lora_int_id
|
||||||
|
if lora_name in self.lora_requests
|
||||||
|
else self.lora_id_counter.inc(1)
|
||||||
|
)
|
||||||
lora_request = LoRARequest(
|
lora_request = LoRARequest(
|
||||||
lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
|
lora_name=lora_name, lora_int_id=lora_int_id, lora_path=lora_path
|
||||||
)
|
)
|
||||||
if base_model_name is not None and self.is_base_model(base_model_name):
|
if base_model_name is not None and self.is_base_model(base_model_name):
|
||||||
lora_request.base_model_name = base_model_name
|
lora_request.base_model_name = base_model_name
|
||||||
@ -197,15 +201,6 @@ class OpenAIServingModels:
|
|||||||
status_code=HTTPStatus.BAD_REQUEST,
|
status_code=HTTPStatus.BAD_REQUEST,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check if the lora adapter with the given name already exists
|
|
||||||
if request.lora_name in self.lora_requests:
|
|
||||||
return create_error_response(
|
|
||||||
message=f"The lora adapter '{request.lora_name}' has already been "
|
|
||||||
"loaded.",
|
|
||||||
err_type="InvalidUserInput",
|
|
||||||
status_code=HTTPStatus.BAD_REQUEST,
|
|
||||||
)
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def _check_unload_lora_adapter_request(
|
async def _check_unload_lora_adapter_request(
|
||||||
|
|||||||
@ -236,21 +236,24 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
|
|||||||
f"({self._adapter_manager.lora_slots})."
|
f"({self._adapter_manager.lora_slots})."
|
||||||
)
|
)
|
||||||
for lora in loras_map.values():
|
for lora in loras_map.values():
|
||||||
self.add_adapter(lora)
|
self.add_adapter(lora, force_load=False)
|
||||||
|
|
||||||
def add_adapter(self, lora_request: LoRARequest) -> bool:
|
def add_adapter(self, lora_request: LoRARequest, force_load: bool = True) -> bool:
|
||||||
# Note that this method is not thread-safe. It may be invoked multiple
|
# Note that this method is not thread-safe. It may be invoked multiple
|
||||||
# times for the same adapter when using multiple API servers.
|
# times for the same adapter when using multiple API servers.
|
||||||
# This is ok because it's currently only called from
|
# This is ok because it's currently only called from
|
||||||
# the single-threaded core engine loop.
|
# the single-threaded core engine loop.
|
||||||
|
|
||||||
if lora_request.lora_int_id not in self.list_adapters():
|
if lora_request.lora_int_id not in self.list_adapters() or force_load:
|
||||||
# Load the new adapter first to ensure it is actually valid, before
|
# Load the new adapter first to ensure it is actually valid, before
|
||||||
# evicting any existing adapters.
|
# evicting any existing adapters.
|
||||||
# This may cause the # of loaded lora adapters to very temporarily
|
# This may cause the # of loaded lora adapters to very temporarily
|
||||||
# exceed `--max-cpu-loras`.
|
# exceed `--max-cpu-loras`.
|
||||||
lora = self._load_adapter(lora_request)
|
lora = self._load_adapter(lora_request)
|
||||||
|
|
||||||
|
# Remove the existing adapter if it exists
|
||||||
|
self._adapter_manager.remove_adapter(lora.id)
|
||||||
|
|
||||||
# Loading succeeded, now check if we will exceed cache capacity and
|
# Loading succeeded, now check if we will exceed cache capacity and
|
||||||
# evict the oldest adapter if so
|
# evict the oldest adapter if so
|
||||||
if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
|
if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user