From dba68f9159e769c2783d96577dabb3aea15ad823 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Thu, 12 Jun 2025 18:50:31 -0400 Subject: [PATCH] [Doc] Unify structured outputs examples (#18196) Signed-off-by: Aaron Pham --- docs/features/reasoning_outputs.md | 45 --- docs/features/structured_outputs.md | 80 ++++-- ...enai_chat_completion_structured_outputs.py | 175 ----------- ...etion_structured_outputs_structural_tag.py | 87 ------ ...etion_structured_outputs_with_reasoning.py | 167 ----------- .../structured_outputs/README.md | 54 ++++ .../structured_outputs/pyproject.toml | 8 + .../structured_outputs/structured_outputs.py | 272 ++++++++++++++++++ 8 files changed, 397 insertions(+), 491 deletions(-) delete mode 100644 examples/online_serving/openai_chat_completion_structured_outputs.py delete mode 100644 examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py delete mode 100644 examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py create mode 100644 examples/online_serving/structured_outputs/README.md create mode 100644 examples/online_serving/structured_outputs/pyproject.toml create mode 100644 examples/online_serving/structured_outputs/structured_outputs.py diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index cbcb246912f4c..59ef10d9c963b 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -142,51 +142,6 @@ for chunk in stream: Remember to check whether the `reasoning_content` exists in the response before accessing it. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). -## Structured output - -The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now. 
- -```bash -vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1 -``` - -The following is an example client: - -```python -from openai import OpenAI -from pydantic import BaseModel - -# Modify OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - -class People(BaseModel): - name: str - age: int - -json_schema = People.model_json_schema() - -prompt = ("Generate a JSON with the name and age of one random person.") -completion = client.chat.completions.create( - model=model, - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={"guided_json": json_schema}, -) -print("reasoning_content: ", completion.choices[0].message.reasoning_content) -print("content: ", completion.choices[0].message.content) -``` - ## Tool Calling The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`. diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index f96b598cff98d..c7abd9d4c2b78 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -39,9 +39,10 @@ client = OpenAI( base_url="http://localhost:8000/v1", api_key="-", ) +model = client.models.list().data[0].id completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", + model=model, messages=[ {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], @@ -54,7 +55,7 @@ The next example shows how to use the `guided_regex`. 
The idea is to generate an ```python completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", + model=model, messages=[ { "role": "user", @@ -92,26 +93,32 @@ class CarDescription(BaseModel): json_schema = CarDescription.model_json_schema() completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", + model=model, messages=[ { "role": "user", "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", } ], - extra_body={"guided_json": json_schema}, + response_format={ + "type": "json_schema", + "json_schema": { + "name": "car-description", + "schema": CarDescription.model_json_schema() + }, + }, ) print(completion.choices[0].message.content) ``` !!! tip While not strictly necessary, normally it´s better to indicate in the prompt the - JSON schema and how the fields should be populated. This can improve the + JSON schema and how the fields should be populated. This can improve the results notably in most cases. Finally we have the `guided_grammar` option, which is probably the most difficult to use, but it´s really powerful. It allows us to define complete -languages like SQL queries. It works by using a context free EBNF grammar. +languages like SQL queries. It works by using a context free EBNF grammar. As an example, we can use to define a specific format of simplified SQL queries: ```python @@ -130,7 +137,7 @@ simplified_sql_grammar = """ """ completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", + model=model, messages=[ { "role": "user", @@ -142,7 +149,48 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -Full example: +See also: [full example](../../examples/online_serving/structured_outputs) + +## Reasoning Outputs + +You can also use structured outputs with reasoning models. 
+ +```bash +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1 +``` + +Note that reasoning can be combined with any of the structured outputs features shown above. The following example uses a JSON schema: + +```python +from pydantic import BaseModel + + +class People(BaseModel): + name: str + age: int + + +completion = client.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": "Generate a JSON with the name and age of one random person.", + } + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "people", + "schema": People.model_json_schema() + } + }, +) +print("reasoning_content: ", completion.choices[0].message.reasoning_content) +print("content: ", completion.choices[0].message.content) +``` + +See also: [full example](../../examples/online_serving/structured_outputs) ## Experimental Automatic Parsing (OpenAI API) @@ -163,14 +211,14 @@ class Info(BaseModel): age: int client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") +model = client.models.list().data[0].id completion = client.beta.chat.completions.parse( - model="meta-llama/Llama-3.1-8B-Instruct", + model=model, messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "My name is Cameron, I'm 28. 
What's my name and age?"}, ], response_format=Info, - extra_body=dict(guided_decoding_backend="outlines"), ) message = completion.choices[0].message @@ -203,15 +251,13 @@ class MathResponse(BaseModel): steps: list[Step] final_answer: str -client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") completion = client.beta.chat.completions.parse( - model="meta-llama/Llama-3.1-8B-Instruct", + model=model, messages=[ {"role": "system", "content": "You are a helpful expert math tutor."}, {"role": "user", "content": "Solve 8x + 31 = 2."}, ], response_format=MathResponse, - extra_body=dict(guided_decoding_backend="outlines"), ) message = completion.choices[0].message @@ -232,11 +278,11 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa Answer: x = -29/8 ``` -An example of using `structural_tag` can be found here: +An example of using `structural_tag` can be found here: ## Offline Inference -Offline inference allows for the same types of guided decoding. +Offline inference allows for the same types of structured outputs. To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`. The main available options inside `GuidedDecodingParams` are: @@ -247,7 +293,7 @@ The main available options inside `GuidedDecodingParams` are: - `structural_tag` These parameters can be used in the same way as the parameters from the Online -Serving examples above. One example for the usage of the `choice` parameter is +Serving examples above. 
One example for the usage of the `choice` parameter is shown below: ```python @@ -265,4 +311,4 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` -Full example: +See also: [full example](../../examples/online_serving/structured_outputs) diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py deleted file mode 100644 index 5c55d53138a8f..0000000000000 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ /dev/null @@ -1,175 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -To run this example, you need to start the vLLM server: - -```bash -vllm serve Qwen/Qwen2.5-3B-Instruct -``` -""" - -from enum import Enum - -from openai import BadRequestError, OpenAI -from pydantic import BaseModel - -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - - -# Guided decoding by Choice (list of possible options) -def guided_choice_completion(client: OpenAI, model: str): - completion = client.chat.completions.create( - model=model, - messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} - ], - extra_body={"guided_choice": ["positive", "negative"]}, - ) - return completion.choices[0].message.content - - -# Guided decoding by Regex -def guided_regex_completion(client: OpenAI, model: str): - prompt = ( - "Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. 
Example result:" - "alan.turing@enigma.com\n" - ) - - completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": prompt, - } - ], - extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, - ) - return completion.choices[0].message.content - - -# Guided decoding by JSON using Pydantic schema -class CarType(str, Enum): - sedan = "sedan" - suv = "SUV" - truck = "Truck" - coupe = "Coupe" - - -class CarDescription(BaseModel): - brand: str - model: str - car_type: CarType - - -def guided_json_completion(client: OpenAI, model: str): - json_schema = CarDescription.model_json_schema() - - prompt = ( - "Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's" - ) - completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": prompt, - } - ], - extra_body={"guided_json": json_schema}, - ) - return completion.choices[0].message.content - - -# Guided decoding by Grammar -def guided_grammar_completion(client: OpenAI, model: str): - simplified_sql_grammar = """ - root ::= select_statement - - select_statement ::= "SELECT " column " from " table " where " condition - - column ::= "col_1 " | "col_2 " - - table ::= "table_1 " | "table_2 " - - condition ::= column "= " number - - number ::= "1 " | "2 " - """ - - prompt = ( - "Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table." - ) - completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": prompt, - } - ], - extra_body={"guided_grammar": simplified_sql_grammar}, - ) - return completion.choices[0].message.content - - -# Extra backend options -def extra_backend_options_completion(client: OpenAI, model: str): - prompt = ( - "Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. 
Example result:" - "alan.turing@enigma.com\n" - ) - - try: - # The guided_decoding_disable_fallback option forces vLLM to use - # xgrammar, so when it fails you get a 400 with the reason why - completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": prompt, - } - ], - extra_body={ - "guided_regex": r"\w+@\w+\.com\n", - "stop": ["\n"], - "guided_decoding_disable_fallback": True, - }, - ) - return completion.choices[0].message.content - except BadRequestError as e: - print("This error is expected:", e) - - -def main(): - client: OpenAI = OpenAI( - base_url=openai_api_base, - api_key=openai_api_key, - ) - - model = client.models.list().data[0].id - - print("Guided Choice Completion:") - print(guided_choice_completion(client, model)) - - print("\nGuided Regex Completion:") - print(guided_regex_completion(client, model)) - - print("\nGuided JSON Completion:") - print(guided_json_completion(client, model)) - - print("\nGuided Grammar Completion:") - print(guided_grammar_completion(client, model)) - - print("\nExtra Backend Options Completion:") - print(extra_backend_options_completion(client, model)) - - -if __name__ == "__main__": - main() diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py deleted file mode 100644 index ec7d8b95472e6..0000000000000 --- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from openai import OpenAI - -# This example demonstrates the `structural_tag` response format. -# It can be used to specify a structured output format that occurs between -# specific tags in the response. 
This example shows how it could be used -# to enforce the format of a tool call response, but it could be used for -# any structured output within a subset of the response. - -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - - -def main(): - client = OpenAI( - base_url=openai_api_base, - api_key=openai_api_key, - ) - - messages = [ - { - "role": "user", - "content": """ -You have access to the following function to retrieve the weather in a city: - - { - "name": "get_weather", - "parameters": { - "city": { - "param_type": "string", - "description": "The city to get the weather for", - "required": True - } - } - } - -If a you choose to call a function ONLY reply in the following format: -<{start_tag}={function_name}>{parameters}{end_tag} -where - -start_tag => ` a JSON dict with the function argument name as key and function - argument value as value. -end_tag => `` - -Here is an example, -{"example_name": "example_value"} - -Reminder: -- Function calls MUST follow the specified format -- Required parameters MUST be specified -- Only call one function at a time -- Put the entire function call reply on one line -- Always add your sources when using search results to answer the user query - -You are a helpful assistant. - -Given the previous instructions, what is the weather in New York City, Boston, -and San Francisco? -""", - } - ] - - response = client.chat.completions.create( - model=client.models.list().data[0].id, - messages=messages, - response_format={ - "type": "structural_tag", - "structures": [ - { - "begin": "", - "schema": { - "type": "object", - "properties": {"city": {"type": "string"}}, - }, - "end": "", - } - ], - "triggers": ["` before running the script. 
+ +## Usage + +Run all constraints, non-streaming: + +```bash +uv run structured_outputs.py +``` + +Run all constraints, streaming: + +```bash +uv run structured_outputs.py --stream +``` + +Run certain constraints, for example `structural_tag` and `regex`, streaming: + +```bash +uv run structured_outputs.py --constraint structural_tag regex --stream +``` + +Run all constraints, with reasoning models and streaming: + +```bash +uv run structured_outputs.py --reasoning --stream +``` diff --git a/examples/online_serving/structured_outputs/pyproject.toml b/examples/online_serving/structured_outputs/pyproject.toml new file mode 100644 index 0000000000000..8f31405ff584a --- /dev/null +++ b/examples/online_serving/structured_outputs/pyproject.toml @@ -0,0 +1,8 @@ +[project] +name = "examples-online-structured-outputs" +requires-python = ">=3.9, <3.13" +dependencies = ["openai==1.78.1", "pydantic==2.11.4"] +version = "0.0.0" + +[project.scripts] +structured-outputs = "structured_outputs:main" diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py new file mode 100644 index 0000000000000..2a8f4637260c2 --- /dev/null +++ b/examples/online_serving/structured_outputs/structured_outputs.py @@ -0,0 +1,272 @@ +# ruff: noqa: E501 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import argparse +import asyncio +import enum +import os +from typing import TYPE_CHECKING, Any, Literal + +import openai +import pydantic + +if TYPE_CHECKING: + from openai.types.chat import ChatCompletionChunk + + +ConstraintsFormat = Literal[ + "choice", + "regex", + "json", + "grammar", + "structural_tag", +] + + +async def print_stream_response( + stream_response: openai.AsyncStream[ChatCompletionChunk], + title: str, + args: argparse.Namespace, +): + print(f"\n\n{title} (Streaming):") + + 
local_reasoning_header_printed = False + local_content_header_printed = False + + async for chunk in stream_response: + delta = chunk.choices[0].delta + + reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None) + content_chunk_text = delta.content + + if args.reasoning: + if reasoning_chunk_text: + if not local_reasoning_header_printed: + print(" Reasoning: ", end="") + local_reasoning_header_printed = True + print(reasoning_chunk_text, end="", flush=True) + + if content_chunk_text: + if not local_content_header_printed: + if local_reasoning_header_printed: + print() + print(" Content: ", end="") + local_content_header_printed = True + print(content_chunk_text, end="", flush=True) + else: + if content_chunk_text: + if not local_content_header_printed: + print(" Content: ", end="") + local_content_header_printed = True + print(content_chunk_text, end="", flush=True) + print() + + +class CarType(str, enum.Enum): + SEDAN = "SEDAN" + SUV = "SUV" + TRUCK = "TRUCK" + COUPE = "COUPE" + + +class CarDescription(pydantic.BaseModel): + brand: str + model: str + car_type: CarType + + +PARAMS: dict[ConstraintsFormat, dict[str, Any]] = { + "choice": { + "messages": [ + { + "role": "user", + "content": "Classify this sentiment: vLLM is wonderful!", + } + ], + "extra_body": {"guided_choice": ["positive", "negative"]}, + }, + "regex": { + "messages": [ + { + "role": "user", + "content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. 
Example result: 'alan.turing@enigma.com\n'", + } + ], + "extra_body": { + "guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n", + }, + }, + "json": { + "messages": [ + { + "role": "user", + "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", + } + ], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "car-description", + "schema": CarDescription.model_json_schema(), + }, + }, + }, + "grammar": { + "messages": [ + { + "role": "user", + "content": "Generate an SQL query to show the 'username' and 'email'from the 'users' table.", + } + ], + "extra_body": { + "guided_grammar": """ +root ::= select_statement + +select_statement ::= "SELECT " column " from " table " where " condition + +column ::= "col_1 " | "col_2 " + +table ::= "table_1 " | "table_2 " + +condition ::= column "= " number + +number ::= "1 " | "2 " +""", + }, + }, + "structural_tag": { + "messages": [ + { + "role": "user", + "content": """ +You have access to the following function to retrieve the weather in a city: + +{ + "name": "get_weather", + "parameters": { + "city": { + "param_type": "string", + "description": "The city to get the weather for", + "required": True + } + } +} + +If a you choose to call a function ONLY reply in the following format: +<{start_tag}={function_name}>{parameters}{end_tag} +where + +start_tag => ` a JSON dict with the function argument name as key and function + argument value as value. +end_tag => `` + +Here is an example, +{"example_name": "example_value"} + +Reminder: +- Function calls MUST follow the specified format +- Required parameters MUST be specified +- Only call one function at a time +- Put the entire function call reply on one line +- Always add your sources when using search results to answer the user query + +You are a helpful assistant. 
+ +Given the previous instructions, what is the weather in New York City, Boston, +and San Francisco?""", + }, + ], + "response_format": { + "type": "structural_tag", + "structures": [ + { + "begin": "", + "schema": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"], + }, + "end": "", + } + ], + "triggers": ["