diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py
index bcf127307f730..886267c211243 100644
--- a/tests/entrypoints/openai/test_classification.py
+++ b/tests/entrypoints/openai/test_classification.py
@@ -121,8 +121,7 @@ def test_invalid_truncate_prompt_tokens_error(server: RemoteOpenAIServer,
     error = classification_response.json()
 
     assert classification_response.status_code == 400
-    assert error["object"] == "error"
-    assert "truncate_prompt_tokens" in error["message"]
+    assert "truncate_prompt_tokens" in error["error"]["message"]
 
 
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
@@ -137,7 +136,7 @@ def test_empty_input_error(server: RemoteOpenAIServer, model_name: str):
     error = classification_response.json()
 
     assert classification_response.status_code == 400
-    assert error["object"] == "error"
+    assert "error" in error
 
 
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index d4afdf7751c8f..f4801172580c6 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -160,8 +160,8 @@ async def test_serving_completion_resolver_not_found(mock_serving_setup,
     mock_engine.generate.assert_not_called()
 
     assert isinstance(response, ErrorResponse)
-    assert response.code == HTTPStatus.NOT_FOUND.value
-    assert non_existent_model in response.message
+    assert response.error.code == HTTPStatus.NOT_FOUND.value
+    assert non_existent_model in response.error.message
 
 
 @pytest.mark.asyncio
@@ -190,8 +190,8 @@ async def test_serving_completion_resolver_add_lora_fails(
 
     # Assert the correct error response
     assert isinstance(response, ErrorResponse)
-    assert response.code == HTTPStatus.BAD_REQUEST.value
-    assert invalid_model in response.message
+    assert response.error.code == HTTPStatus.BAD_REQUEST.value
+    assert invalid_model in response.error.message
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index c3b458d717fb2..bc6a0341f59f6 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -66,8 +66,8 @@ async def test_load_lora_adapter_missing_fields():
     request = LoadLoRAAdapterRequest(lora_name="", lora_path="")
     response = await serving_models.load_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
-    assert response.type == "InvalidUserInput"
-    assert response.code == HTTPStatus.BAD_REQUEST
+    assert response.error.type == "InvalidUserInput"
+    assert response.error.code == HTTPStatus.BAD_REQUEST
 
 
 @pytest.mark.asyncio
@@ -84,8 +84,8 @@ async def test_load_lora_adapter_duplicate():
                                      lora_path="/path/to/adapter1")
     response = await serving_models.load_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
-    assert response.type == "InvalidUserInput"
-    assert response.code == HTTPStatus.BAD_REQUEST
+    assert response.error.type == "InvalidUserInput"
+    assert response.error.code == HTTPStatus.BAD_REQUEST
 
     assert len(serving_models.lora_requests) == 1
@@ -110,8 +110,8 @@ async def test_unload_lora_adapter_missing_fields():
     request = UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None)
     response = await serving_models.unload_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
-    assert response.type == "InvalidUserInput"
-    assert response.code == HTTPStatus.BAD_REQUEST
+    assert response.error.type == "InvalidUserInput"
+    assert response.error.code == HTTPStatus.BAD_REQUEST
 
 
 @pytest.mark.asyncio
@@ -120,5 +120,5 @@ async def test_unload_lora_adapter_not_found():
     request = UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter")
     response = await serving_models.unload_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
-    assert response.type == "NotFoundError"
-    assert response.code == HTTPStatus.NOT_FOUND
+    assert response.error.type == "NotFoundError"
+    assert response.error.code == HTTPStatus.NOT_FOUND
diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index a8e2eb40b157f..28fd02171b954 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -116,8 +116,10 @@ async def test_non_asr_model(winning_call):
                                                        file=winning_call,
                                                        language="en",
                                                        temperature=0.0)
-        assert res.code == 400 and not res.text
-        assert res.message == "The model does not support Transcriptions API"
+        err = res.error
+        assert err["code"] == 400 and not res.text
+        assert err[
+            "message"] == "The model does not support Transcriptions API"
 
 
 @pytest.mark.asyncio
@@ -133,12 +135,15 @@ async def test_completion_endpoints():
                 "role": "system",
                 "content": "You are a helpful assistant."
             }])
-        assert res.code == 400
-        assert res.message == "The model does not support Chat Completions API"
+        err = res.error
+        assert err["code"] == 400
+        assert err[
+            "message"] == "The model does not support Chat Completions API"
 
         res = await client.completions.create(model=model_name, prompt="Hello")
-        assert res.code == 400
-        assert res.message == "The model does not support Completions API"
+        err = res.error
+        assert err["code"] == 400
+        assert err["message"] == "The model does not support Completions API"
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py
index 79e769e3a1aa4..bfa9bdef1c001 100644
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -73,8 +73,9 @@ async def test_non_asr_model(foscolo):
         res = await client.audio.translations.create(model=model_name,
                                                      file=foscolo,
                                                      temperature=0.0)
-        assert res.code == 400 and not res.text
-        assert res.message == "The model does not support Translations API"
+        err = res.error
+        assert err["code"] == 400 and not res.text
+        assert err["message"] == "The model does not support Translations API"
 
 
 @pytest.mark.asyncio
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index f6f83223a1488..c695ea8b5a0ef 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -62,7 +62,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DetokenizeRequest,
                                               DetokenizeResponse,
                                               EmbeddingRequest,
-                                              EmbeddingResponse, ErrorResponse,
+                                              EmbeddingResponse, ErrorInfo,
+                                              ErrorResponse,
                                               LoadLoRAAdapterRequest,
                                               PoolingRequest, PoolingResponse,
                                               RerankRequest, RerankResponse,
@@ -506,7 +507,7 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
     elif isinstance(generator, TokenizeResponse):
         return JSONResponse(content=generator.model_dump())
 
@@ -540,7 +541,7 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
     elif isinstance(generator, DetokenizeResponse):
         return JSONResponse(content=generator.model_dump())
 
@@ -556,7 +557,7 @@ def maybe_register_tokenizer_info_endpoint(args):
         """Get comprehensive tokenizer information."""
         result = await tokenization(raw_request).get_tokenizer_info()
         return JSONResponse(content=result.model_dump(),
-                            status_code=result.code if isinstance(
+                            status_code=result.error.code if isinstance(
                                 result, ErrorResponse) else 200)
 
 
@@ -603,7 +604,7 @@ async def create_responses(request: ResponsesRequest, raw_request: Request):
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
     elif isinstance(generator, ResponsesResponse):
         return JSONResponse(content=generator.model_dump())
     return StreamingResponse(content=generator, media_type="text/event-stream")
@@ -620,7 +621,7 @@ async def retrieve_responses(response_id: str, raw_request: Request):
 
     if isinstance(response, ErrorResponse):
         return JSONResponse(content=response.model_dump(),
-                            status_code=response.code)
+                            status_code=response.error.code)
 
     return JSONResponse(content=response.model_dump())
 
@@ -635,7 +636,7 @@ async def cancel_responses(response_id: str, raw_request: Request):
 
     if isinstance(response, ErrorResponse):
         return JSONResponse(content=response.model_dump(),
-                            status_code=response.code)
+                            status_code=response.error.code)
 
     return JSONResponse(content=response.model_dump())
 
@@ -670,7 +671,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
 
     elif isinstance(generator, ChatCompletionResponse):
         return JSONResponse(content=generator.model_dump())
@@ -715,7 +716,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
     elif isinstance(generator, CompletionResponse):
         return JSONResponse(content=generator.model_dump())
 
@@ -744,7 +745,7 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
     elif isinstance(generator, EmbeddingResponse):
         return JSONResponse(content=generator.model_dump())
 
@@ -772,7 +773,7 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
     generator = await handler.create_pooling(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
     elif isinstance(generator, PoolingResponse):
         return JSONResponse(content=generator.model_dump())
 
@@ -792,7 +793,7 @@ async def create_classify(request: ClassificationRequest,
     generator = await handler.create_classify(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
     elif isinstance(generator, ClassificationResponse):
         return JSONResponse(content=generator.model_dump())
 
@@ -821,7 +822,7 @@ async def create_score(request: ScoreRequest, raw_request: Request):
     generator = await handler.create_score(request, raw_request)
    if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
     elif isinstance(generator, ScoreResponse):
         return JSONResponse(content=generator.model_dump())
 
@@ -881,7 +882,7 @@ async def create_transcriptions(raw_request: Request,
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
 
     elif isinstance(generator, TranscriptionResponse):
         return JSONResponse(content=generator.model_dump())
@@ -922,7 +923,7 @@ async def create_translations(request: Annotated[TranslationRequest,
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
 
     elif isinstance(generator, TranslationResponse):
         return JSONResponse(content=generator.model_dump())
@@ -950,7 +951,7 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
    generator = await handler.do_rerank(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
-                            status_code=generator.code)
+                            status_code=generator.error.code)
     elif isinstance(generator, RerankResponse):
         return JSONResponse(content=generator.model_dump())
 
@@ -1175,7 +1176,7 @@ async def invocations(raw_request: Request):
     msg = ("Cannot find suitable handler for request. "
            f"Expected one of: {type_names}")
     res = base(raw_request).create_error_response(message=msg)
-    return JSONResponse(content=res.model_dump(), status_code=res.code)
+    return JSONResponse(content=res.model_dump(), status_code=res.error.code)
 
 
 if envs.VLLM_TORCH_PROFILER_DIR:
@@ -1211,7 +1212,7 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
         response = await handler.load_lora_adapter(request)
         if isinstance(response, ErrorResponse):
             return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
+                                status_code=response.error.code)
 
         return Response(status_code=200, content=response)
 
@@ -1223,7 +1224,7 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
         response = await handler.unload_lora_adapter(request)
         if isinstance(response, ErrorResponse):
             return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
+                                status_code=response.error.code)
 
         return Response(status_code=200, content=response)
 
@@ -1502,9 +1503,10 @@ def build_app(args: Namespace) -> FastAPI:
 
     @app.exception_handler(HTTPException)
     async def http_exception_handler(_: Request, exc: HTTPException):
-        err = ErrorResponse(message=exc.detail,
+        err = ErrorResponse(
+            error=ErrorInfo(message=exc.detail,
                             type=HTTPStatus(exc.status_code).phrase,
-                            code=exc.status_code)
+                            code=exc.status_code))
         return JSONResponse(err.model_dump(), status_code=exc.status_code)
 
     @app.exception_handler(RequestValidationError)
@@ -1518,9 +1520,9 @@ def build_app(args: Namespace) -> FastAPI:
         else:
             message = exc_str
 
-        err = ErrorResponse(message=message,
-                            type=HTTPStatus.BAD_REQUEST.phrase,
-                            code=HTTPStatus.BAD_REQUEST)
+        err = ErrorResponse(error=ErrorInfo(message=message,
+                                            type=HTTPStatus.BAD_REQUEST.phrase,
+                                            code=HTTPStatus.BAD_REQUEST))
         return JSONResponse(err.model_dump(),
                             status_code=HTTPStatus.BAD_REQUEST)
 
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 421927d61bba1..ea2cf57563f48 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -78,14 +78,17 @@ class OpenAIBaseModel(BaseModel):
         return result
 
 
-class ErrorResponse(OpenAIBaseModel):
-    object: str = "error"
+class ErrorInfo(OpenAIBaseModel):
     message: str
     type: str
     param: Optional[str] = None
     code: int
 
 
+class ErrorResponse(OpenAIBaseModel):
+    error: ErrorInfo
+
+
 class ModelPermission(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
     object: str = "model_permission"
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index d146ad485d194..a10d57456ba08 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -302,7 +302,7 @@ async def run_request(serving_engine_func: Callable,
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
             response=BatchResponseData(
-                status_code=response.code,
+                status_code=response.error.code,
                 request_id=f"vllm-batch-{random_uuid()}"),
             error=response,
         )
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 822f1868406c7..efd2f20299d09 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -47,10 +47,10 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               EmbeddingChatRequest,
                                               EmbeddingCompletionRequest,
                                               EmbeddingRequest,
-                                              EmbeddingResponse, ErrorResponse,
-                                              PoolingResponse, RerankRequest,
-                                              ResponsesRequest, ScoreRequest,
-                                              ScoreResponse,
+                                              EmbeddingResponse, ErrorInfo,
+                                              ErrorResponse, PoolingResponse,
+                                              RerankRequest, ResponsesRequest,
+                                              ScoreRequest, ScoreResponse,
                                               TokenizeChatRequest,
                                               TokenizeCompletionRequest,
                                               TokenizeResponse,
@@ -412,21 +412,18 @@ class OpenAIServing:
             message: str,
             err_type: str = "BadRequestError",
             status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
-        return ErrorResponse(message=message,
-                             type=err_type,
-                             code=status_code.value)
+        return ErrorResponse(error=ErrorInfo(
+            message=message, type=err_type, code=status_code.value))
 
     def create_streaming_error_response(
             self,
             message: str,
             err_type: str = "BadRequestError",
             status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str:
-        json_str = json.dumps({
-            "error":
+        json_str = json.dumps(
             self.create_error_response(message=message,
                                        err_type=err_type,
-                                       status_code=status_code).model_dump()
-        })
+                                       status_code=status_code).model_dump())
         return json_str
 
     async def _check_model(
@@ -445,7 +442,7 @@ class OpenAIServing:
             if isinstance(load_result, LoRARequest):
                 return None
             if isinstance(load_result, ErrorResponse) and \
-                    load_result.code == HTTPStatus.BAD_REQUEST.value:
+                    load_result.error.code == HTTPStatus.BAD_REQUEST.value:
                 error_response = load_result
 
         return error_response or self.create_error_response(
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 27614fcb41123..a4efa0815b4e7 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -9,7 +9,7 @@ from typing import Optional, Union
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.openai.protocol import (ErrorResponse,
+from vllm.entrypoints.openai.protocol import (ErrorInfo, ErrorResponse,
                                               LoadLoRAAdapterRequest,
                                               ModelCard, ModelList,
                                               ModelPermission,
@@ -82,7 +82,7 @@ class OpenAIServingModels:
            load_result = await self.load_lora_adapter(
                request=load_request, base_model_name=lora.base_model_name)
            if isinstance(load_result, ErrorResponse):
-                raise ValueError(load_result.message)
+                raise ValueError(load_result.error.message)
 
     def is_base_model(self, model_name) -> bool:
         return any(model.name == model_name for model in self.base_model_paths)
@@ -284,6 +284,5 @@ def create_error_response(
         message: str,
         err_type: str = "BadRequestError",
         status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
-    return ErrorResponse(message=message,
-                         type=err_type,
-                         code=status_code.value)
+    return ErrorResponse(error=ErrorInfo(
+        message=message, type=err_type, code=status_code.value))
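
For reference, the snippet below is a standalone sketch, not part of the diff. It mirrors the reworked schema from protocol.py using plain pydantic BaseModel stand-ins (an assumption for self-containment; the real classes extend vLLM's OpenAIBaseModel) to show the payload shape the updated handlers and tests rely on: the HTTP status is read from the nested object (response.error.code) and serialized bodies carry the fields under a top-level "error" key.

# Standalone sketch of the nested error payload introduced by this change.
from http import HTTPStatus
from typing import Optional

from pydantic import BaseModel


class ErrorInfo(BaseModel):
    message: str
    type: str
    param: Optional[str] = None
    code: int


class ErrorResponse(BaseModel):
    error: ErrorInfo


resp = ErrorResponse(error=ErrorInfo(
    message="The model does not support Transcriptions API",
    type="BadRequestError",
    code=HTTPStatus.BAD_REQUEST.value))

# Handlers now read the status code from the nested object ...
assert resp.error.code == 400
# ... and the serialized body nests the fields under "error", so clients index
# payload["error"]["message"] instead of the old flat payload["message"].
payload = resp.model_dump()
assert payload["error"]["message"].endswith("Transcriptions API")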