diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 233c31a1bd7f7..bc8855a76e2a2 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -909,6 +909,16 @@ def build_app(args: Namespace) -> FastAPI:

     @app.exception_handler(RequestValidationError)
     async def validation_exception_handler(_: Request, exc: RequestValidationError):
+        from vllm.entrypoints.openai.protocol import VLLMValidationError
+
+        param = None
+        for error in exc.errors():
+            if "ctx" in error and "error" in error["ctx"]:
+                ctx_error = error["ctx"]["error"]
+                if isinstance(ctx_error, VLLMValidationError):
+                    param = ctx_error.parameter
+                    break
+
         exc_str = str(exc)
         errors_str = str(exc.errors())

@@ -922,6 +932,7 @@ def build_app(args: Namespace) -> FastAPI:
                 message=message,
                 type=HTTPStatus.BAD_REQUEST.phrase,
                 code=HTTPStatus.BAD_REQUEST,
+                param=param,
             )
         )
         return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index a3c347cb1bd3f..982f5533ad7f9 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -131,6 +131,36 @@ class ErrorResponse(OpenAIBaseModel):
     error: ErrorInfo


+class VLLMValidationError(ValueError):
+    """vLLM-specific validation error for request validation failures.
+
+    Args:
+        message: The error message describing the validation failure.
+        parameter: Optional parameter name that failed validation.
+        value: Optional value that was rejected during validation.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        parameter: str | None = None,
+        value: Any = None,
+    ) -> None:
+        super().__init__(message)
+        self.parameter = parameter
+        self.value = value
+
+    def __str__(self):
+        base = super().__str__()
+        extras = []
+        if self.parameter is not None:
+            extras.append(f"parameter={self.parameter}")
+        if self.value is not None:
+            extras.append(f"value={self.value}")
+        return f"{base} ({', '.join(extras)})" if extras else base
+
+
 class ModelPermission(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
     object: str = "model_permission"
@@ -466,7 +496,9 @@ class ResponsesRequest(OpenAIBaseModel):
     @model_validator(mode="before")
     def validate_prompt(cls, data):
         if data.get("prompt") is not None:
-            raise ValueError("prompt template is not supported")
+            raise VLLMValidationError(
+                "prompt template is not supported", parameter="prompt"
+            )
         return data

     @model_validator(mode="before")
@@ -850,7 +882,10 @@ class ChatCompletionRequest(OpenAIBaseModel):
     @classmethod
     def validate_stream_options(cls, data):
         if data.get("stream_options") and not data.get("stream"):
-            raise ValueError("Stream options can only be defined when `stream=True`.")
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter="stream_options",
+            )
         return data

@@ -859,19 +894,29 @@ def check_logprobs(cls, data):
         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
             if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
-                raise ValueError(
-                    "`prompt_logprobs` are not available when `stream=True`."
+                raise VLLMValidationError(
+                    "`prompt_logprobs` are not available when `stream=True`.",
+                    parameter="prompt_logprobs",
                 )

             if prompt_logprobs < 0 and prompt_logprobs != -1:
-                raise ValueError("`prompt_logprobs` must be a positive value or -1.")
+                raise VLLMValidationError(
+                    "`prompt_logprobs` must be a positive value or -1.",
+                    parameter="prompt_logprobs",
+                    value=prompt_logprobs,
+                )

         if (top_logprobs := data.get("top_logprobs")) is not None:
             if top_logprobs < 0 and top_logprobs != -1:
-                raise ValueError("`top_logprobs` must be a positive value or -1.")
+                raise VLLMValidationError(
+                    "`top_logprobs` must be a positive value or -1.",
+                    parameter="top_logprobs",
+                    value=top_logprobs,
+                )

             if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
-                raise ValueError(
-                    "when using `top_logprobs`, `logprobs` must be set to true."
+                raise VLLMValidationError(
+                    "when using `top_logprobs`, `logprobs` must be set to true.",
+                    parameter="top_logprobs",
                 )

         return data
@@ -1285,9 +1330,10 @@ class CompletionRequest(OpenAIBaseModel):
             for k in ("json", "regex", "choice")
         )
         if count > 1:
-            raise ValueError(
+            raise VLLMValidationError(
                 "You can only use one kind of constraints for structured "
-                "outputs ('json', 'regex' or 'choice')."
+                "outputs ('json', 'regex' or 'choice').",
+                parameter="structured_outputs",
             )
         return data

@@ -1296,14 +1342,23 @@ def check_logprobs(cls, data):
         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
             if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
-                raise ValueError(
-                    "`prompt_logprobs` are not available when `stream=True`."
+                raise VLLMValidationError(
+                    "`prompt_logprobs` are not available when `stream=True`.",
+                    parameter="prompt_logprobs",
                 )

             if prompt_logprobs < 0 and prompt_logprobs != -1:
-                raise ValueError("`prompt_logprobs` must be a positive value or -1.")
+                raise VLLMValidationError(
+                    "`prompt_logprobs` must be a positive value or -1.",
+                    parameter="prompt_logprobs",
+                    value=prompt_logprobs,
+                )

         if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
-            raise ValueError("`logprobs` must be a positive value.")
+            raise VLLMValidationError(
+                "`logprobs` must be a positive value.",
+                parameter="logprobs",
+                value=logprobs,
+            )

         return data
@@ -1311,7 +1366,10 @@
     @classmethod
     def validate_stream_options(cls, data):
         if data.get("stream_options") and not data.get("stream"):
-            raise ValueError("Stream options can only be defined when `stream=True`.")
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter="stream_options",
+            )
         return data

@@ -2138,7 +2196,15 @@ class TranscriptionRequest(OpenAIBaseModel):
         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
         stream = data.get("stream", False)
         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
-            raise ValueError("Stream options can only be defined when `stream=True`.")
+            # Find which specific stream option was set
+            invalid_param = next(
+                (so for so in stream_opts if data.get(so, False)),
+                "stream_include_usage",
+            )
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter=invalid_param,
+            )
         return data

@@ -2351,7 +2417,15 @@ class TranslationRequest(OpenAIBaseModel):
         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
         stream = data.get("stream", False)
         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
-            raise ValueError("Stream options can only be defined when `stream=True`.")
+            # Find which specific stream option was set
+            invalid_param = next(
+                (so for so in stream_opts if data.get(so, False)),
+                "stream_include_usage",
+            )
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter=invalid_param,
+            )
         return data

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 422a8c18e8e98..690fb22e2274d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -417,8 +417,7 @@ class OpenAIServingChat(OpenAIServing):

                 generators.append(generator)
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

         assert len(generators) == 1
         (result_generator,) = generators
@@ -448,8 +447,7 @@ class OpenAIServingChat(OpenAIServing):
         except GenerationError as e:
             return self._convert_generation_error_to_response(e)
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

     def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
         if request.add_generation_prompt:
@@ -682,7 +680,7 @@ class OpenAIServingChat(OpenAIServing):
                 tool_parsers = [None] * num_choices
         except Exception as e:
             logger.exception("Error in tool parser creation.")
-            data = self.create_streaming_error_response(str(e))
+            data = self.create_streaming_error_response(e)
             yield f"data: {data}\n\n"
             yield "data: [DONE]\n\n"
             return
@@ -1328,9 +1326,8 @@ class OpenAIServingChat(OpenAIServing):
         except GenerationError as e:
             yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
         except Exception as e:
-            # TODO: Use a vllm-specific Validation Error
             logger.exception("Error in chat completion stream generator.")
-            data = self.create_streaming_error_response(str(e))
+            data = self.create_streaming_error_response(e)
             yield f"data: {data}\n\n"
         # Send the final done message after all response.n are finished
         yield "data: [DONE]\n\n"
@@ -1354,8 +1351,7 @@ class OpenAIServingChat(OpenAIServing):
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

         assert final_res is not None

diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 265ca9915e5db..d9a8ccb9f851d 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -23,6 +23,7 @@ from vllm.entrypoints.openai.protocol import (
     PromptTokenUsageInfo,
     RequestResponseMetadata,
     UsageInfo,
+    VLLMValidationError,
 )
 from vllm.entrypoints.openai.serving_engine import (
     GenerationError,
@@ -247,8 +248,7 @@ class OpenAIServingCompletion(OpenAIServing):

                 generators.append(generator)
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

         result_generator = merge_async_iterators(*generators)

@@ -308,8 +308,7 @@ class OpenAIServingCompletion(OpenAIServing):
         except GenerationError as e:
             return self._convert_generation_error_to_response(e)
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

         # When user requests streaming but we don't stream, we still need to
         # return a streaming response with a single event.
@@ -510,9 +509,8 @@ class OpenAIServingCompletion(OpenAIServing):
         except GenerationError as e:
             yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
         except Exception as e:
-            # TODO: Use a vllm-specific Validation Error
             logger.exception("Error in completion stream generator.")
-            data = self.create_streaming_error_response(str(e))
+            data = self.create_streaming_error_response(e)
             yield f"data: {data}\n\n"

         yield "data: [DONE]\n\n"
@@ -660,8 +658,11 @@ class OpenAIServingCompletion(OpenAIServing):
                     token = f"token_id:{token_id}"
                 else:
                     if tokenizer is None:
-                        raise ValueError(
-                            "Unable to get tokenizer because `skip_tokenizer_init=True`"
+                        raise VLLMValidationError(
+                            "Unable to get tokenizer because "
+                            "`skip_tokenizer_init=True`",
+                            parameter="skip_tokenizer_init",
+                            value=True,
                         )
                     token = tokenizer.decode(token_id)

@@ -720,6 +721,15 @@ class OpenAIServingCompletion(OpenAIServing):
         request: CompletionRequest,
         max_input_length: int | None = None,
     ) -> RenderConfig:
+        # Validate max_tokens before using it
+        if request.max_tokens is not None and request.max_tokens > self.max_model_len:
+            raise VLLMValidationError(
+                f"'max_tokens' ({request.max_tokens}) cannot be greater than "
+                f"the model's maximum context length ({self.max_model_len}).",
+                parameter="max_tokens",
+                value=request.max_tokens,
+            )
+
         max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
         return RenderConfig(
             max_length=max_input_tokens_len,
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index b9771963c6d4c..5ea2a7a572650 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -57,6 +57,7 @@ from vllm.entrypoints.openai.protocol import (
     TranscriptionRequest,
     TranscriptionResponse,
     TranslationRequest,
+    VLLMValidationError,
 )
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.pooling.classify.protocol import (
@@ -322,8 +323,10 @@ class OpenAIServing:
         input_processor = self.input_processor
         tokenizer = input_processor.tokenizer
         if tokenizer is None:
-            raise ValueError(
-                "You cannot use beam search when `skip_tokenizer_init=True`"
+            raise VLLMValidationError(
+                "You cannot use beam search when `skip_tokenizer_init=True`",
+                parameter="skip_tokenizer_init",
+                value=True,
             )
         eos_token_id: int = tokenizer.eos_token_id  # type: ignore

@@ -706,8 +709,7 @@ class OpenAIServing:

             return None
         except Exception as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

     async def _collect_batch(
         self,
@@ -738,14 +740,43 @@ class OpenAIServing:

             return None
         except Exception as e:
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

     def create_error_response(
         self,
-        message: str,
+        message: str | Exception,
         err_type: str = "BadRequestError",
         status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+        param: str | None = None,
     ) -> ErrorResponse:
+        exc: Exception | None = None
+
+        if isinstance(message, Exception):
+            exc = message
+
+            from vllm.entrypoints.openai.protocol import VLLMValidationError
+
+            if isinstance(exc, VLLMValidationError):
+                err_type = "BadRequestError"
+                status_code = HTTPStatus.BAD_REQUEST
+                param = exc.parameter
+            elif isinstance(exc, (ValueError, TypeError, RuntimeError)):
+                # Common validation errors from user input
+                err_type = "BadRequestError"
+                status_code = HTTPStatus.BAD_REQUEST
+                param = None
+            elif exc.__class__.__name__ == "TemplateError":
+                # jinja2.TemplateError (avoid importing jinja2)
+                err_type = "BadRequestError"
+                status_code = HTTPStatus.BAD_REQUEST
+                param = None
+            else:
+                err_type = "InternalServerError"
+                status_code = HTTPStatus.INTERNAL_SERVER_ERROR
+                param = None
+
+            message = str(exc)
+
         if self.log_error_stack:
             exc_type, _, _ = sys.exc_info()
             if exc_type is not None:
@@ -753,18 +784,27 @@
             else:
                 traceback.print_stack()
         return ErrorResponse(
-            error=ErrorInfo(message=message, type=err_type, code=status_code.value)
+            error=ErrorInfo(
+                message=message,
+                type=err_type,
+                code=status_code.value,
+                param=param,
+            )
         )

     def create_streaming_error_response(
         self,
-        message: str,
+        message: str | Exception,
         err_type: str = "BadRequestError",
         status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+        param: str | None = None,
     ) -> str:
         json_str = json.dumps(
             self.create_error_response(
-                message=message, err_type=err_type, status_code=status_code
+                message=message,
+                err_type=err_type,
+                status_code=status_code,
+                param=param,
             ).model_dump()
         )
         return json_str
@@ -825,6 +865,7 @@ class OpenAIServing:
             message=f"The model `{request.model}` does not exist.",
             err_type="NotFoundError",
             status_code=HTTPStatus.NOT_FOUND,
+            param="model",
         )

     def _get_active_default_mm_loras(self, request: AnyRequest) -> LoRARequest | None:
@@ -991,11 +1032,13 @@ class OpenAIServing:
                 ClassificationChatRequest: "classification",
             }
             operation = operations.get(type(request), "embedding generation")
-            raise ValueError(
+            raise VLLMValidationError(
                 f"This model's maximum context length is "
                 f"{self.max_model_len} tokens. However, you requested "
                 f"{token_num} tokens in the input for {operation}. "
-                f"Please reduce the length of the input."
+                f"Please reduce the length of the input.",
+                parameter="input_tokens",
+                value=token_num,
             )

         return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
@@ -1017,20 +1060,24 @@ class OpenAIServing:
         # Note: input length can be up to model context length - 1 for
         # completion-like requests.
         if token_num >= self.max_model_len:
-            raise ValueError(
+            raise VLLMValidationError(
                 f"This model's maximum context length is "
                 f"{self.max_model_len} tokens. However, your request has "
                 f"{token_num} input tokens. Please reduce the length of "
-                "the input messages."
+                "the input messages.",
+                parameter="input_tokens",
+                value=token_num,
             )

         if max_tokens is not None and token_num + max_tokens > self.max_model_len:
-            raise ValueError(
+            raise VLLMValidationError(
                 "'max_tokens' or 'max_completion_tokens' is too large: "
                 f"{max_tokens}. This model's maximum context length is "
                 f"{self.max_model_len} tokens and your request has "
                 f"{token_num} input tokens ({max_tokens} > {self.max_model_len}"
-                f" - {token_num})."
+                f" - {token_num}).",
+                parameter="max_tokens",
+                value=max_tokens,
             )

         return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 1f9b5704624ab..e9eaaa49275d3 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -94,6 +94,7 @@ from vllm.entrypoints.openai.protocol import (
     ResponsesResponse,
     ResponseUsage,
     StreamingResponsesResponse,
+    VLLMValidationError,
 )
 from vllm.entrypoints.openai.serving_engine import (
     GenerationError,
@@ -271,6 +272,7 @@ class OpenAIServingResponses(OpenAIServing):
                 err_type="invalid_request_error",
                 message=error_message,
                 status_code=HTTPStatus.BAD_REQUEST,
+                param="input",
             )
         return None

@@ -282,6 +284,7 @@ class OpenAIServingResponses(OpenAIServing):
                 err_type="invalid_request_error",
                 message="logprobs are not supported with gpt-oss models",
                 status_code=HTTPStatus.BAD_REQUEST,
+                param="logprobs",
             )
         if request.store and not self.enable_store and request.background:
             return self.create_error_response(
@@ -294,6 +297,7 @@ class OpenAIServingResponses(OpenAIServing):
                     "the vLLM server."
                 ),
                 status_code=HTTPStatus.BAD_REQUEST,
+                param="background",
             )
         if request.previous_input_messages and request.previous_response_id:
             return self.create_error_response(
@@ -301,6 +305,7 @@ class OpenAIServingResponses(OpenAIServing):
                 message="Only one of `previous_input_messages` and "
                 "`previous_response_id` can be set.",
                 status_code=HTTPStatus.BAD_REQUEST,
+                param="previous_response_id",
             )
         return None

@@ -457,8 +462,7 @@ class OpenAIServingResponses(OpenAIServing):
                 )
                 generators.append(generator)
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

         assert len(generators) == 1
         (result_generator,) = generators
@@ -546,7 +550,7 @@ class OpenAIServingResponses(OpenAIServing):
         except GenerationError as e:
             return self._convert_generation_error_to_response(e)
         except Exception as e:
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

     async def _make_request(
         self,
@@ -630,8 +634,7 @@ class OpenAIServingResponses(OpenAIServing):
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

         # NOTE: Implementation of stauts is still WIP, but for now
         # we guarantee that if the status is not "completed", it is accurate.
@@ -1074,7 +1077,7 @@ class OpenAIServingResponses(OpenAIServing):
             response = self._convert_generation_error_to_response(e)
         except Exception as e:
             logger.exception("Background request failed for %s", request.request_id)
-            response = self.create_error_response(str(e))
+            response = self.create_error_response(e)
         finally:
             new_event_signal.set()

@@ -1099,7 +1102,7 @@ class OpenAIServingResponses(OpenAIServing):
             response = self._convert_generation_error_to_response(e)
         except Exception as e:
             logger.exception("Background request failed for %s", request.request_id)
-            response = self.create_error_response(str(e))
+            response = self.create_error_response(e)

         if isinstance(response, ErrorResponse):
             # If the request has failed, update the status to "failed".
@@ -1116,7 +1119,11 @@ class OpenAIServingResponses(OpenAIServing):
         starting_after: int | None = None,
     ) -> AsyncGenerator[StreamingResponsesResponse, None]:
         if response_id not in self.event_store:
-            raise ValueError(f"Unknown response_id: {response_id}")
+            raise VLLMValidationError(
+                f"Unknown response_id: {response_id}",
+                parameter="response_id",
+                value=response_id,
+            )

         event_deque, new_event_signal = self.event_store[response_id]
         start_index = 0 if starting_after is None else starting_after + 1
@@ -1172,6 +1179,7 @@ class OpenAIServingResponses(OpenAIServing):
             return self.create_error_response(
                 err_type="invalid_request_error",
                 message="Cannot cancel a synchronous response.",
+                param="response_id",
             )

         # Update the status to "cancelled".
@@ -1191,6 +1199,7 @@ class OpenAIServingResponses(OpenAIServing):
                 err_type="invalid_request_error",
                 message=f"Response with id '{response_id}' not found.",
                 status_code=HTTPStatus.NOT_FOUND,
+                param="response_id",
            )

     def _make_store_not_supported_error(self) -> ErrorResponse:
@@ -1203,6 +1212,7 @@ class OpenAIServingResponses(OpenAIServing):
                 "starting the vLLM server."
             ),
             status_code=HTTPStatus.BAD_REQUEST,
+            param="store",
         )

     async def _process_simple_streaming_events(
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index 3e648f44f380b..22da46902da14 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -30,6 +30,7 @@ from vllm.entrypoints.openai.protocol import (
     TranslationSegment,
     TranslationStreamResponse,
     UsageInfo,
+    VLLMValidationError,
 )
 from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
@@ -259,7 +260,11 @@ class OpenAISpeechToText(OpenAIServing):
         )

         if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
-            raise ValueError("Maximum file size exceeded.")
+            raise VLLMValidationError(
+                "Maximum file size exceeded",
+                parameter="audio_filesize_mb",
+                value=len(audio_data) / 1024**2,
+            )

         with io.BytesIO(audio_data) as bytes_:
             # NOTE resample to model SR here for efficiency. This is also a
@@ -287,12 +292,18 @@ class OpenAISpeechToText(OpenAIServing):
             )
         if request.response_format == "verbose_json":
             if not isinstance(prompt, dict):
-                raise ValueError(f"Expected prompt to be a dict,got {type(prompt)}")
+                raise VLLMValidationError(
+                    "Expected prompt to be a dict",
+                    parameter="prompt",
+                    value=type(prompt).__name__,
+                )
             prompt_dict = cast(dict, prompt)
             decoder_prompt = prompt.get("decoder_prompt")
             if not isinstance(decoder_prompt, str):
-                raise ValueError(
-                    f"Expected decoder_prompt to bestr, got {type(decoder_prompt)}"
+                raise VLLMValidationError(
+                    "Expected decoder_prompt to be str",
+                    parameter="decoder_prompt",
+                    value=type(decoder_prompt).__name__,
                 )
             prompt_dict["decoder_prompt"] = decoder_prompt.replace(
                 "<|notimestamps|>", "<|0.00|>"
@@ -412,7 +423,7 @@ class OpenAISpeechToText(OpenAIServing):

         except ValueError as e:
             logger.exception("Error in preprocessing prompt inputs")
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

         list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
         try:
@@ -448,8 +459,7 @@ class OpenAISpeechToText(OpenAIServing):
                 for i, prompt in enumerate(prompts)
             ]
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

         if request.stream:
             return stream_generator_method(
@@ -523,8 +533,7 @@ class OpenAISpeechToText(OpenAIServing):
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
         except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
+            return self.create_error_response(e)

     async def _speech_to_text_stream_generator(
         self,
@@ -634,9 +643,8 @@ class OpenAISpeechToText(OpenAIServing):
             )

         except Exception as e:
-            # TODO: Use a vllm-specific Validation Error
             logger.exception("Error in %s stream generator.", self.task_type)
-            data = self.create_streaming_error_response(str(e))
+            data = self.create_streaming_error_response(e)
             yield f"data: {data}\n\n"
         # Send the final done message after all response.n are finished
         yield "data: [DONE]\n\n"
diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py
index 0f89c840be80f..ca2e27fa4428b 100644
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -12,6 +12,7 @@ import torch
 from pydantic import Field

 from vllm.config import ModelConfig
+from vllm.entrypoints.openai.protocol import VLLMValidationError
 from vllm.inputs.data import EmbedsPrompt, TextPrompt, TokensPrompt
 from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
 from vllm.tokenizers import TokenizerLike
@@ -162,8 +163,9 @@ class BaseRenderer(ABC):
     ) -> list[EmbedsPrompt]:
         """Load and validate base64-encoded embeddings into prompt objects."""
         if not self.model_config.enable_prompt_embeds:
-            raise ValueError(
-                "You must set `--enable-prompt-embeds` to input `prompt_embeds`."
+            raise VLLMValidationError(
+                "You must set `--enable-prompt-embeds` to input `prompt_embeds`.",
+                parameter="prompt_embeds",
             )

         def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
@@ -396,10 +398,12 @@ class CompletionRenderer(BaseRenderer):
     ) -> TokensPrompt:
         """Create validated TokensPrompt."""
         if max_length is not None and len(token_ids) > max_length:
-            raise ValueError(
+            raise VLLMValidationError(
                 f"This model's maximum context length is {max_length} tokens. "
                 f"However, your request has {len(token_ids)} input tokens. "
-                "Please reduce the length of the input messages."
+                "Please reduce the length of the input messages.",
+                parameter="input_tokens",
+                value=len(token_ids),
             )

         tokens_prompt = TokensPrompt(prompt_token_ids=token_ids)