diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index 9773f3e45b99c..7d823542e3744 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -76,11 +76,11 @@ async def test_tokenize_completions(
                                  })
         response.raise_for_status()
 
-        assert response.json() == {
-            "tokens": tokens,
-            "count": len(tokens),
-            "max_model_len": 8192
-        }
+        result = response.json()
+        assert result["tokens"] == tokens
+        assert result["count"] == len(tokens)
+        assert result["max_model_len"] == 8192
+        assert result["token_strs"] is None
 
 
 @pytest.mark.asyncio
@@ -138,11 +138,11 @@ async def test_tokenize_chat(
                                      })
             response.raise_for_status()
 
-            assert response.json() == {
-                "tokens": tokens,
-                "count": len(tokens),
-                "max_model_len": 8192
-            }
+            result = response.json()
+            assert result["tokens"] == tokens
+            assert result["count"] == len(tokens)
+            assert result["max_model_len"] == 8192
+            assert result["token_strs"] is None
 
 
 @pytest.mark.asyncio
@@ -215,11 +215,46 @@ async def test_tokenize_chat_with_tools(
             )
             response.raise_for_status()
 
-            assert response.json() == {
-                "tokens": tokens,
-                "count": len(tokens),
-                "max_model_len": 8192,
-            }
+            result = response.json()
+            assert result["tokens"] == tokens
+            assert result["count"] == len(tokens)
+            assert result["max_model_len"] == 8192
+            assert result["token_strs"] is None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name, tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
+)
+async def test_tokenize_with_return_token_strs(
+    server: RemoteOpenAIServer,
+    model_name: str,
+    tokenizer_name: str,
+):
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")
+
+    prompt = "This is a token_strs test prompt! vllm1"
+    response = requests.post(
+        server.url_for("tokenize"),
+        json={
+            "prompt": prompt,
+            "model": model_name,
+            "return_token_strs": True
+        },
+    )
+    response.raise_for_status()
+
+    tokens = tokenizer.encode(prompt, add_special_tokens=True)
+    tokens_str = tokenizer.convert_ids_to_tokens(tokens)
+
+    result = response.json()
+    assert result["tokens"] == tokens
+    assert result["count"] == len(tokens)
+    assert result["max_model_len"] == 8192
+    assert result["token_strs"] == tokens_str
 
 
 @pytest.mark.asyncio
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 2f641079e5840..e72c23993ac8c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1563,6 +1563,11 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
             "If true (the default), special tokens (e.g. BOS) will be added to "
             "the prompt."),
     )
+    return_token_strs: Optional[bool] = Field(
+        default=False,
+        description=("If true, also return the token strings "
+                     "corresponding to the token ids."),
+    )
 
 
 class TokenizeChatRequest(OpenAIBaseModel):
@@ -1576,6 +1581,11 @@ class TokenizeChatRequest(OpenAIBaseModel):
         "This is a parameter used by chat template in tokenizer config of the "
         "model."),
     )
+    return_token_strs: Optional[bool] = Field(
+        default=False,
+        description=("If true, also return the token strings "
+                     "corresponding to the token ids."),
+    )
     continue_final_message: bool = Field(
         default=False,
         description=
@@ -1633,6 +1643,7 @@ class TokenizeResponse(OpenAIBaseModel):
     count: int
     max_model_len: int
     tokens: list[int]
+    token_strs: Optional[list[str]] = None
 
 
 class DetokenizeRequest(OpenAIBaseModel):
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 5ef1a486d86c8..0d739bbf9bf22 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -110,7 +110,12 @@ class OpenAIServingTokenization(OpenAIServing):
                           dict) and "prompt_token_ids" in engine_prompt:
                 input_ids.extend(engine_prompt["prompt_token_ids"])
 
+        token_strs = None
+        if request.return_token_strs:
+            token_strs = tokenizer.convert_ids_to_tokens(input_ids)
+
         return TokenizeResponse(tokens=input_ids,
+                                token_strs=token_strs,
                                 count=len(input_ids),
                                 max_model_len=self.max_model_len)
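
For context, here is a minimal client-side sketch of how the new flag is meant to be used against a running vLLM OpenAI-compatible server. The base URL and model name are placeholders for illustration and are not part of this change; the request and response shape mirrors what `test_tokenize_with_return_token_strs` exercises above.

```python
# Minimal usage sketch (assumes a vLLM OpenAI-compatible server is already
# running; the URL and model name below are placeholders, not part of the PR).
import requests

BASE_URL = "http://localhost:8000"  # placeholder address of the running server
MODEL = "my-model"                  # placeholder served model name

resp = requests.post(
    f"{BASE_URL}/tokenize",
    json={
        "model": MODEL,
        "prompt": "This is a token_strs test prompt! vllm1",
        "return_token_strs": True,  # new flag added by this change
    },
)
resp.raise_for_status()
result = resp.json()

# With return_token_strs=True the response also carries the token strings;
# when the flag is omitted or False, "token_strs" is null, as the updated
# tests assert.
print(result["count"])       # number of tokens
print(result["tokens"])      # list of token ids
print(result["token_strs"])  # list of token strings aligned with the ids
print(result["max_model_len"])
```

Note that `token_strs` is produced with `tokenizer.convert_ids_to_tokens`, so the entries are the tokenizer's internal token representations (including any BPE/sentencepiece markers), not detokenized text.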