mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-27 08:01:19 +08:00
[tech debt] Revisit lora request model checker (#20636)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
This commit is contained in:
parent
0b407479ef
commit
baed180aa0
@ -57,7 +57,8 @@ async def test_load_lora_adapter_success():
|
|||||||
response = await serving_models.load_lora_adapter(request)
|
response = await serving_models.load_lora_adapter(request)
|
||||||
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
|
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
|
||||||
assert len(serving_models.lora_requests) == 1
|
assert len(serving_models.lora_requests) == 1
|
||||||
assert serving_models.lora_requests[0].lora_name == "adapter"
|
assert "adapter" in serving_models.lora_requests
|
||||||
|
assert serving_models.lora_requests["adapter"].lora_name == "adapter"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
|||||||
@ -438,9 +438,7 @@ class OpenAIServing:
|
|||||||
|
|
||||||
if self._is_model_supported(request.model):
|
if self._is_model_supported(request.model):
|
||||||
return None
|
return None
|
||||||
if request.model in [
|
if request.model in self.models.lora_requests:
|
||||||
lora.lora_name for lora in self.models.lora_requests
|
|
||||||
]:
|
|
||||||
return None
|
return None
|
||||||
if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and (
|
if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and (
|
||||||
load_result := await self.models.resolve_lora(request.model)):
|
load_result := await self.models.resolve_lora(request.model)):
|
||||||
@ -466,9 +464,8 @@ class OpenAIServing:
|
|||||||
None, PromptAdapterRequest]]:
|
None, PromptAdapterRequest]]:
|
||||||
if self._is_model_supported(request.model):
|
if self._is_model_supported(request.model):
|
||||||
return None, None
|
return None, None
|
||||||
for lora in self.models.lora_requests:
|
if request.model in self.models.lora_requests:
|
||||||
if request.model == lora.lora_name:
|
return self.models.lora_requests[request.model], None
|
||||||
return lora, None
|
|
||||||
for prompt_adapter in self.models.prompt_adapter_requests:
|
for prompt_adapter in self.models.prompt_adapter_requests:
|
||||||
if request.model == prompt_adapter.prompt_adapter_name:
|
if request.model == prompt_adapter.prompt_adapter_name:
|
||||||
return None, prompt_adapter
|
return None, prompt_adapter
|
||||||
|
|||||||
@ -65,12 +65,13 @@ class OpenAIServingModels:
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.base_model_paths = base_model_paths
|
self.base_model_paths = base_model_paths
|
||||||
|
|
||||||
self.max_model_len = model_config.max_model_len
|
self.max_model_len = model_config.max_model_len
|
||||||
self.engine_client = engine_client
|
self.engine_client = engine_client
|
||||||
self.model_config = model_config
|
self.model_config = model_config
|
||||||
|
|
||||||
self.static_lora_modules = lora_modules
|
self.static_lora_modules = lora_modules
|
||||||
self.lora_requests: list[LoRARequest] = []
|
self.lora_requests: dict[str, LoRARequest] = {}
|
||||||
self.lora_id_counter = AtomicCounter(0)
|
self.lora_id_counter = AtomicCounter(0)
|
||||||
|
|
||||||
self.lora_resolvers: list[LoRAResolver] = []
|
self.lora_resolvers: list[LoRAResolver] = []
|
||||||
@ -138,7 +139,7 @@ class OpenAIServingModels:
|
|||||||
parent=lora.base_model_name if lora.base_model_name else
|
parent=lora.base_model_name if lora.base_model_name else
|
||||||
self.base_model_paths[0].name,
|
self.base_model_paths[0].name,
|
||||||
permission=[ModelPermission()])
|
permission=[ModelPermission()])
|
||||||
for lora in self.lora_requests
|
for lora in self.lora_requests.values()
|
||||||
]
|
]
|
||||||
prompt_adapter_cards = [
|
prompt_adapter_cards = [
|
||||||
ModelCard(id=prompt_adapter.prompt_adapter_name,
|
ModelCard(id=prompt_adapter.prompt_adapter_name,
|
||||||
@ -155,53 +156,60 @@ class OpenAIServingModels:
|
|||||||
request: LoadLoRAAdapterRequest,
|
request: LoadLoRAAdapterRequest,
|
||||||
base_model_name: Optional[str] = None
|
base_model_name: Optional[str] = None
|
||||||
) -> Union[ErrorResponse, str]:
|
) -> Union[ErrorResponse, str]:
|
||||||
error_check_ret = await self._check_load_lora_adapter_request(request)
|
lora_name = request.lora_name
|
||||||
if error_check_ret is not None:
|
|
||||||
return error_check_ret
|
|
||||||
|
|
||||||
lora_name, lora_path = request.lora_name, request.lora_path
|
# Ensure atomicity based on the lora name
|
||||||
unique_id = self.lora_id_counter.inc(1)
|
async with self.lora_resolver_lock[lora_name]:
|
||||||
lora_request = LoRARequest(lora_name=lora_name,
|
error_check_ret = await self._check_load_lora_adapter_request(
|
||||||
lora_int_id=unique_id,
|
request)
|
||||||
lora_path=lora_path)
|
if error_check_ret is not None:
|
||||||
if base_model_name is not None and self.is_base_model(base_model_name):
|
return error_check_ret
|
||||||
lora_request.base_model_name = base_model_name
|
|
||||||
|
|
||||||
# Validate that the adapter can be loaded into the engine
|
lora_path = request.lora_path
|
||||||
# This will also pre-load it for incoming requests
|
unique_id = self.lora_id_counter.inc(1)
|
||||||
try:
|
lora_request = LoRARequest(lora_name=lora_name,
|
||||||
await self.engine_client.add_lora(lora_request)
|
lora_int_id=unique_id,
|
||||||
except BaseException as e:
|
lora_path=lora_path)
|
||||||
error_type = "BadRequestError"
|
if base_model_name is not None and self.is_base_model(
|
||||||
status_code = HTTPStatus.BAD_REQUEST
|
base_model_name):
|
||||||
if "No adapter found" in str(e):
|
lora_request.base_model_name = base_model_name
|
||||||
error_type = "NotFoundError"
|
|
||||||
status_code = HTTPStatus.NOT_FOUND
|
|
||||||
|
|
||||||
return create_error_response(message=str(e),
|
# Validate that the adapter can be loaded into the engine
|
||||||
err_type=error_type,
|
# This will also pre-load it for incoming requests
|
||||||
status_code=status_code)
|
try:
|
||||||
|
await self.engine_client.add_lora(lora_request)
|
||||||
|
except Exception as e:
|
||||||
|
error_type = "BadRequestError"
|
||||||
|
status_code = HTTPStatus.BAD_REQUEST
|
||||||
|
if "No adapter found" in str(e):
|
||||||
|
error_type = "NotFoundError"
|
||||||
|
status_code = HTTPStatus.NOT_FOUND
|
||||||
|
|
||||||
self.lora_requests.append(lora_request)
|
return create_error_response(message=str(e),
|
||||||
logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name,
|
err_type=error_type,
|
||||||
lora_path)
|
status_code=status_code)
|
||||||
return f"Success: LoRA adapter '{lora_name}' added successfully."
|
|
||||||
|
self.lora_requests[lora_name] = lora_request
|
||||||
|
logger.info("Loaded new LoRA adapter: name '%s', path '%s'",
|
||||||
|
lora_name, lora_path)
|
||||||
|
return f"Success: LoRA adapter '{lora_name}' added successfully."
|
||||||
|
|
||||||
async def unload_lora_adapter(
|
async def unload_lora_adapter(
|
||||||
self,
|
self,
|
||||||
request: UnloadLoRAAdapterRequest) -> Union[ErrorResponse, str]:
|
request: UnloadLoRAAdapterRequest) -> Union[ErrorResponse, str]:
|
||||||
error_check_ret = await self._check_unload_lora_adapter_request(request
|
|
||||||
)
|
|
||||||
if error_check_ret is not None:
|
|
||||||
return error_check_ret
|
|
||||||
|
|
||||||
lora_name = request.lora_name
|
lora_name = request.lora_name
|
||||||
self.lora_requests = [
|
|
||||||
lora_request for lora_request in self.lora_requests
|
# Ensure atomicity based on the lora name
|
||||||
if lora_request.lora_name != lora_name
|
async with self.lora_resolver_lock[lora_name]:
|
||||||
]
|
error_check_ret = await self._check_unload_lora_adapter_request(
|
||||||
logger.info("Removed LoRA adapter: name '%s'", lora_name)
|
request)
|
||||||
return f"Success: LoRA adapter '{lora_name}' removed successfully."
|
if error_check_ret is not None:
|
||||||
|
return error_check_ret
|
||||||
|
|
||||||
|
# Safe to delete now since we hold the lock
|
||||||
|
del self.lora_requests[lora_name]
|
||||||
|
logger.info("Removed LoRA adapter: name '%s'", lora_name)
|
||||||
|
return f"Success: LoRA adapter '{lora_name}' removed successfully."
|
||||||
|
|
||||||
async def _check_load_lora_adapter_request(
|
async def _check_load_lora_adapter_request(
|
||||||
self, request: LoadLoRAAdapterRequest) -> Optional[ErrorResponse]:
|
self, request: LoadLoRAAdapterRequest) -> Optional[ErrorResponse]:
|
||||||
@ -213,8 +221,7 @@ class OpenAIServingModels:
|
|||||||
status_code=HTTPStatus.BAD_REQUEST)
|
status_code=HTTPStatus.BAD_REQUEST)
|
||||||
|
|
||||||
# Check if the lora adapter with the given name already exists
|
# Check if the lora adapter with the given name already exists
|
||||||
if any(lora_request.lora_name == request.lora_name
|
if request.lora_name in self.lora_requests:
|
||||||
for lora_request in self.lora_requests):
|
|
||||||
return create_error_response(
|
return create_error_response(
|
||||||
message=
|
message=
|
||||||
f"The lora adapter '{request.lora_name}' has already been "
|
f"The lora adapter '{request.lora_name}' has already been "
|
||||||
@ -227,17 +234,16 @@ class OpenAIServingModels:
|
|||||||
async def _check_unload_lora_adapter_request(
|
async def _check_unload_lora_adapter_request(
|
||||||
self,
|
self,
|
||||||
request: UnloadLoRAAdapterRequest) -> Optional[ErrorResponse]:
|
request: UnloadLoRAAdapterRequest) -> Optional[ErrorResponse]:
|
||||||
# Check if either 'lora_name' or 'lora_int_id' is provided
|
# Return an error if 'lora_name' is not provided
|
||||||
if not request.lora_name and not request.lora_int_id:
|
if not request.lora_name:
|
||||||
return create_error_response(
|
return create_error_response(
|
||||||
message=
|
message=
|
||||||
"either 'lora_name' and 'lora_int_id' needs to be provided.",
|
"'lora_name' needs to be provided to unload a LoRA adapter.",
|
||||||
err_type="InvalidUserInput",
|
err_type="InvalidUserInput",
|
||||||
status_code=HTTPStatus.BAD_REQUEST)
|
status_code=HTTPStatus.BAD_REQUEST)
|
||||||
|
|
||||||
# Check if the lora adapter with the given name exists
|
# Check if the lora adapter with the given name exists
|
||||||
if not any(lora_request.lora_name == request.lora_name
|
if request.lora_name not in self.lora_requests:
|
||||||
for lora_request in self.lora_requests):
|
|
||||||
return create_error_response(
|
return create_error_response(
|
||||||
message=
|
message=
|
||||||
f"The lora adapter '{request.lora_name}' cannot be found.",
|
f"The lora adapter '{request.lora_name}' cannot be found.",
|
||||||
@ -260,9 +266,8 @@ class OpenAIServingModels:
|
|||||||
"""
|
"""
|
||||||
async with self.lora_resolver_lock[lora_name]:
|
async with self.lora_resolver_lock[lora_name]:
|
||||||
# First check if this LoRA is already loaded
|
# First check if this LoRA is already loaded
|
||||||
for existing in self.lora_requests:
|
if lora_name in self.lora_requests:
|
||||||
if existing.lora_name == lora_name:
|
return self.lora_requests[lora_name]
|
||||||
return existing
|
|
||||||
|
|
||||||
base_model_name = self.model_config.model
|
base_model_name = self.model_config.model
|
||||||
unique_id = self.lora_id_counter.inc(1)
|
unique_id = self.lora_id_counter.inc(1)
|
||||||
@ -279,7 +284,7 @@ class OpenAIServingModels:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
await self.engine_client.add_lora(lora_request)
|
await self.engine_client.add_lora(lora_request)
|
||||||
self.lora_requests.append(lora_request)
|
self.lora_requests[lora_name] = lora_request
|
||||||
logger.info(
|
logger.info(
|
||||||
"Resolved and loaded LoRA adapter '%s' using %s",
|
"Resolved and loaded LoRA adapter '%s' using %s",
|
||||||
lora_name, resolver.__class__.__name__)
|
lora_name, resolver.__class__.__name__)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user