From da2705198fa19030a25d0bea437f7be6547d47d4 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 13 Aug 2025 07:22:56 -0700
Subject: [PATCH] [Misc] clear and separate error messages for input too long
 and input + max-tokens too long (#22803)

Signed-off-by: Roger Wang
---
 vllm/entrypoints/openai/serving_engine.py | 31 +++++++++++++----------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index fb9d456df78e..d6f92a63301e 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -585,6 +585,8 @@ class OpenAIServing:
                       (EmbeddingChatRequest, EmbeddingCompletionRequest,
                        ScoreRequest, RerankRequest,
                        ClassificationRequest)):
+            # Note: input length can be up to the entire model context length
+            # since these requests don't generate tokens.
             if token_num > self.max_model_len:
                 operations: dict[type[AnyRequest], str] = {
                     ScoreRequest: "score",
@@ -613,21 +615,24 @@ class OpenAIServing:
             max_tokens = request.max_completion_tokens or request.max_tokens
         else:
             max_tokens = getattr(request, "max_tokens", None)
-        if max_tokens is None:
-            if token_num >= self.max_model_len:
-                raise ValueError(
-                    f"This model's maximum context length is "
-                    f"{self.max_model_len} tokens. However, you requested "
-                    f"{token_num} tokens in the messages, "
-                    f"Please reduce the length of the messages.")
-        elif token_num + max_tokens > self.max_model_len:
+
+        # Note: input length can be up to model context length - 1 for
+        # completion-like requests.
+        if token_num >= self.max_model_len:
             raise ValueError(
                 f"This model's maximum context length is "
-                f"{self.max_model_len} tokens. However, you requested "
-                f"{max_tokens + token_num} tokens "
-                f"({token_num} in the messages, "
-                f"{max_tokens} in the completion). "
-                f"Please reduce the length of the messages or completion.")
+                f"{self.max_model_len} tokens. However, your request has "
+                f"{token_num} input tokens. Please reduce the length of "
+                "the input messages.")
+
+        if max_tokens is not None and \
+            token_num + max_tokens > self.max_model_len:
+            raise ValueError(
+                "'max_tokens' or 'max_completion_tokens' is too large: "
+                f"{max_tokens}. This model's maximum context length is "
+                f"{self.max_model_len} tokens and your request has "
+                f"{token_num} input tokens ({max_tokens} > {self.max_model_len}"
+                f" - {token_num}).")
 
         return TextTokensPrompt(prompt=input_text,
                                 prompt_token_ids=input_ids)
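
Note (not part of the commit): a minimal standalone sketch of the behavior this patch introduces, in which the input-length check and the input-plus-max_tokens check now fail independently, each with its own message. The function name validate_lengths and its parameters are illustrative stand-ins, not vLLM's actual API.

from typing import Optional


def validate_lengths(token_num: int, max_tokens: Optional[int],
                     max_model_len: int) -> None:
    # Check 1: the input alone must be strictly shorter than the context
    # window, so that at least one token can still be generated.
    if token_num >= max_model_len:
        raise ValueError(
            f"This model's maximum context length is {max_model_len} tokens. "
            f"However, your request has {token_num} input tokens. "
            "Please reduce the length of the input messages.")

    # Check 2: the input plus the requested generation budget must also fit
    # within the context window.
    if max_tokens is not None and token_num + max_tokens > max_model_len:
        raise ValueError(
            "'max_tokens' or 'max_completion_tokens' is too large: "
            f"{max_tokens}. This model's maximum context length is "
            f"{max_model_len} tokens and your request has {token_num} input "
            f"tokens ({max_tokens} > {max_model_len} - {token_num}).")


# Example: 7000 input tokens plus max_tokens=2000 exceeds a hypothetical
# 8192-token context, so the second, budget-specific message is raised.
try:
    validate_lengths(token_num=7000, max_tokens=2000, max_model_len=8192)
except ValueError as exc:
    print(exc)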