[tech debt] Revisit lora request model checker (#20636)

Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
This commit is contained in:
kourosh hakhamaneshi 2025-07-08 18:42:41 -07:00 committed by GitHub
parent 0b407479ef
commit baed180aa0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 61 additions and 58 deletions

View File

@ -57,7 +57,8 @@ async def test_load_lora_adapter_success():
response = await serving_models.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter') assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
assert len(serving_models.lora_requests) == 1 assert len(serving_models.lora_requests) == 1
assert serving_models.lora_requests[0].lora_name == "adapter" assert "adapter" in serving_models.lora_requests
assert serving_models.lora_requests["adapter"].lora_name == "adapter"
@pytest.mark.asyncio @pytest.mark.asyncio

View File

@ -438,9 +438,7 @@ class OpenAIServing:
if self._is_model_supported(request.model): if self._is_model_supported(request.model):
return None return None
if request.model in [ if request.model in self.models.lora_requests:
lora.lora_name for lora in self.models.lora_requests
]:
return None return None
if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and ( if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and (
load_result := await self.models.resolve_lora(request.model)): load_result := await self.models.resolve_lora(request.model)):
@ -466,9 +464,8 @@ class OpenAIServing:
None, PromptAdapterRequest]]: None, PromptAdapterRequest]]:
if self._is_model_supported(request.model): if self._is_model_supported(request.model):
return None, None return None, None
for lora in self.models.lora_requests: if request.model in self.models.lora_requests:
if request.model == lora.lora_name: return self.models.lora_requests[request.model], None
return lora, None
for prompt_adapter in self.models.prompt_adapter_requests: for prompt_adapter in self.models.prompt_adapter_requests:
if request.model == prompt_adapter.prompt_adapter_name: if request.model == prompt_adapter.prompt_adapter_name:
return None, prompt_adapter return None, prompt_adapter

View File

@ -65,12 +65,13 @@ class OpenAIServingModels:
super().__init__() super().__init__()
self.base_model_paths = base_model_paths self.base_model_paths = base_model_paths
self.max_model_len = model_config.max_model_len self.max_model_len = model_config.max_model_len
self.engine_client = engine_client self.engine_client = engine_client
self.model_config = model_config self.model_config = model_config
self.static_lora_modules = lora_modules self.static_lora_modules = lora_modules
self.lora_requests: list[LoRARequest] = [] self.lora_requests: dict[str, LoRARequest] = {}
self.lora_id_counter = AtomicCounter(0) self.lora_id_counter = AtomicCounter(0)
self.lora_resolvers: list[LoRAResolver] = [] self.lora_resolvers: list[LoRAResolver] = []
@ -138,7 +139,7 @@ class OpenAIServingModels:
parent=lora.base_model_name if lora.base_model_name else parent=lora.base_model_name if lora.base_model_name else
self.base_model_paths[0].name, self.base_model_paths[0].name,
permission=[ModelPermission()]) permission=[ModelPermission()])
for lora in self.lora_requests for lora in self.lora_requests.values()
] ]
prompt_adapter_cards = [ prompt_adapter_cards = [
ModelCard(id=prompt_adapter.prompt_adapter_name, ModelCard(id=prompt_adapter.prompt_adapter_name,
@ -155,53 +156,60 @@ class OpenAIServingModels:
request: LoadLoRAAdapterRequest, request: LoadLoRAAdapterRequest,
base_model_name: Optional[str] = None base_model_name: Optional[str] = None
) -> Union[ErrorResponse, str]: ) -> Union[ErrorResponse, str]:
error_check_ret = await self._check_load_lora_adapter_request(request) lora_name = request.lora_name
if error_check_ret is not None:
return error_check_ret
lora_name, lora_path = request.lora_name, request.lora_path # Ensure atomicity based on the lora name
unique_id = self.lora_id_counter.inc(1) async with self.lora_resolver_lock[lora_name]:
lora_request = LoRARequest(lora_name=lora_name, error_check_ret = await self._check_load_lora_adapter_request(
lora_int_id=unique_id, request)
lora_path=lora_path) if error_check_ret is not None:
if base_model_name is not None and self.is_base_model(base_model_name): return error_check_ret
lora_request.base_model_name = base_model_name
# Validate that the adapter can be loaded into the engine lora_path = request.lora_path
# This will also pre-load it for incoming requests unique_id = self.lora_id_counter.inc(1)
try: lora_request = LoRARequest(lora_name=lora_name,
await self.engine_client.add_lora(lora_request) lora_int_id=unique_id,
except BaseException as e: lora_path=lora_path)
error_type = "BadRequestError" if base_model_name is not None and self.is_base_model(
status_code = HTTPStatus.BAD_REQUEST base_model_name):
if "No adapter found" in str(e): lora_request.base_model_name = base_model_name
error_type = "NotFoundError"
status_code = HTTPStatus.NOT_FOUND
return create_error_response(message=str(e), # Validate that the adapter can be loaded into the engine
err_type=error_type, # This will also pre-load it for incoming requests
status_code=status_code) try:
await self.engine_client.add_lora(lora_request)
except Exception as e:
error_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
if "No adapter found" in str(e):
error_type = "NotFoundError"
status_code = HTTPStatus.NOT_FOUND
self.lora_requests.append(lora_request) return create_error_response(message=str(e),
logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name, err_type=error_type,
lora_path) status_code=status_code)
return f"Success: LoRA adapter '{lora_name}' added successfully."
self.lora_requests[lora_name] = lora_request
logger.info("Loaded new LoRA adapter: name '%s', path '%s'",
lora_name, lora_path)
return f"Success: LoRA adapter '{lora_name}' added successfully."
async def unload_lora_adapter( async def unload_lora_adapter(
self, self,
request: UnloadLoRAAdapterRequest) -> Union[ErrorResponse, str]: request: UnloadLoRAAdapterRequest) -> Union[ErrorResponse, str]:
error_check_ret = await self._check_unload_lora_adapter_request(request
)
if error_check_ret is not None:
return error_check_ret
lora_name = request.lora_name lora_name = request.lora_name
self.lora_requests = [
lora_request for lora_request in self.lora_requests # Ensure atomicity based on the lora name
if lora_request.lora_name != lora_name async with self.lora_resolver_lock[lora_name]:
] error_check_ret = await self._check_unload_lora_adapter_request(
logger.info("Removed LoRA adapter: name '%s'", lora_name) request)
return f"Success: LoRA adapter '{lora_name}' removed successfully." if error_check_ret is not None:
return error_check_ret
# Safe to delete now since we hold the lock
del self.lora_requests[lora_name]
logger.info("Removed LoRA adapter: name '%s'", lora_name)
return f"Success: LoRA adapter '{lora_name}' removed successfully."
async def _check_load_lora_adapter_request( async def _check_load_lora_adapter_request(
self, request: LoadLoRAAdapterRequest) -> Optional[ErrorResponse]: self, request: LoadLoRAAdapterRequest) -> Optional[ErrorResponse]:
@ -213,8 +221,7 @@ class OpenAIServingModels:
status_code=HTTPStatus.BAD_REQUEST) status_code=HTTPStatus.BAD_REQUEST)
# Check if the lora adapter with the given name already exists # Check if the lora adapter with the given name already exists
if any(lora_request.lora_name == request.lora_name if request.lora_name in self.lora_requests:
for lora_request in self.lora_requests):
return create_error_response( return create_error_response(
message= message=
f"The lora adapter '{request.lora_name}' has already been " f"The lora adapter '{request.lora_name}' has already been "
@ -227,17 +234,16 @@ class OpenAIServingModels:
async def _check_unload_lora_adapter_request( async def _check_unload_lora_adapter_request(
self, self,
request: UnloadLoRAAdapterRequest) -> Optional[ErrorResponse]: request: UnloadLoRAAdapterRequest) -> Optional[ErrorResponse]:
# Check if either 'lora_name' or 'lora_int_id' is provided # Check if 'lora_name' is not provided return an error
if not request.lora_name and not request.lora_int_id: if not request.lora_name:
return create_error_response( return create_error_response(
message= message=
"either 'lora_name' and 'lora_int_id' needs to be provided.", "'lora_name' needs to be provided to unload a LoRA adapter.",
err_type="InvalidUserInput", err_type="InvalidUserInput",
status_code=HTTPStatus.BAD_REQUEST) status_code=HTTPStatus.BAD_REQUEST)
# Check if the lora adapter with the given name exists # Check if the lora adapter with the given name exists
if not any(lora_request.lora_name == request.lora_name if request.lora_name not in self.lora_requests:
for lora_request in self.lora_requests):
return create_error_response( return create_error_response(
message= message=
f"The lora adapter '{request.lora_name}' cannot be found.", f"The lora adapter '{request.lora_name}' cannot be found.",
@ -260,9 +266,8 @@ class OpenAIServingModels:
""" """
async with self.lora_resolver_lock[lora_name]: async with self.lora_resolver_lock[lora_name]:
# First check if this LoRA is already loaded # First check if this LoRA is already loaded
for existing in self.lora_requests: if lora_name in self.lora_requests:
if existing.lora_name == lora_name: return self.lora_requests[lora_name]
return existing
base_model_name = self.model_config.model base_model_name = self.model_config.model
unique_id = self.lora_id_counter.inc(1) unique_id = self.lora_id_counter.inc(1)
@ -279,7 +284,7 @@ class OpenAIServingModels:
try: try:
await self.engine_client.add_lora(lora_request) await self.engine_client.add_lora(lora_request)
self.lora_requests.append(lora_request) self.lora_requests[lora_name] = lora_request
logger.info( logger.info(
"Resolved and loaded LoRA adapter '%s' using %s", "Resolved and loaded LoRA adapter '%s' using %s",
lora_name, resolver.__class__.__name__) lora_name, resolver.__class__.__name__)