From dba68f9159e769c2783d96577dabb3aea15ad823 Mon Sep 17 00:00:00 2001
From: Aaron Pham <contact@aarnphm.xyz>
Date: Thu, 12 Jun 2025 18:50:31 -0400
Subject: [PATCH] [Doc] Unify structured outputs examples (#18196)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
---
 docs/features/reasoning_outputs.md            |  45 ---
 docs/features/structured_outputs.md           |  80 ++++--
 ...enai_chat_completion_structured_outputs.py | 175 -----------
 ...etion_structured_outputs_structural_tag.py |  87 ------
 ...etion_structured_outputs_with_reasoning.py | 167 -----------
 .../structured_outputs/README.md              |  54 ++++
 .../structured_outputs/pyproject.toml         |   8 +
 .../structured_outputs/structured_outputs.py  | 272 ++++++++++++++++++
 8 files changed, 397 insertions(+), 491 deletions(-)
 delete mode 100644 examples/online_serving/openai_chat_completion_structured_outputs.py
 delete mode 100644 examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
 delete mode 100644 examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
 create mode 100644 examples/online_serving/structured_outputs/README.md
 create mode 100644 examples/online_serving/structured_outputs/pyproject.toml
 create mode 100644 examples/online_serving/structured_outputs/structured_outputs.py

diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index cbcb246912f4c..59ef10d9c963b 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -142,51 +142,6 @@ for chunk in stream:
 
 Remember to check whether the `reasoning_content` exists in the response before accessing it. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
 
-## Structured output
-
-The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now.
-
-```bash
-vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
-```
-
-The following is an example client:
-
-```python
-from openai import OpenAI
-from pydantic import BaseModel
-
-# Modify OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
-class People(BaseModel):
-    name: str
-    age: int
-
-json_schema = People.model_json_schema()
-
-prompt = ("Generate a JSON with the name and age of one random person.")
-completion = client.chat.completions.create(
-    model=model,
-    messages=[{
-        "role": "user",
-        "content": prompt,
-    }],
-    extra_body={"guided_json": json_schema},
-)
-print("reasoning_content: ", completion.choices[0].message.reasoning_content)
-print("content: ", completion.choices[0].message.content)
-```
-
 ## Tool Calling
 
 The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md
index f96b598cff98d..c7abd9d4c2b78 100644
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -39,9 +39,10 @@ client = OpenAI(
     base_url="http://localhost:8000/v1",
     api_key="-",
 )
+model = client.models.list().data[0].id
 
 completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
     messages=[
         {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
     ],
@@ -54,7 +55,7 @@ The next example shows how to use the `guided_regex`. The idea is to generate an
 
 ```python
 completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
     messages=[
         {
             "role": "user",
@@ -92,26 +93,32 @@ class CarDescription(BaseModel):
 json_schema = CarDescription.model_json_schema()
 
 completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
     messages=[
         {
             "role": "user",
             "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
         }
     ],
-    extra_body={"guided_json": json_schema},
+    response_format={
+        "type": "json_schema",
+        "json_schema": {
+            "name": "car-description",
+            "schema": CarDescription.model_json_schema()
+        },
+    },
 )
 print(completion.choices[0].message.content)
 ```
 
 !!! tip
     While not strictly necessary, normally it´s better to indicate in the prompt the
-    JSON schema and how the fields should be populated.  This can improve the
+    JSON schema and how the fields should be populated. This can improve the
     results notably in most cases.
 
 Finally we have the `guided_grammar` option, which is probably the most
 difficult to use, but it´s really powerful. It allows us to define complete
-languages like SQL queries.  It works by using a context free EBNF grammar.
+languages like SQL queries. It works by using a context free EBNF grammar.
 As an example, we can use to define a specific format of simplified SQL queries:
 
 ```python
@@ -130,7 +137,7 @@ simplified_sql_grammar = """
 """
 
 completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
     messages=[
         {
             "role": "user",
@@ -142,7 +149,48 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
-Full example: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs.py>
+See also: [full example](../../examples/online_serving/structured_outputs)
+
+## Reasoning Outputs
+
+You can also use structured outputs with <project:#reasoning-outputs> for reasoning models.
+
+```bash
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
+```
+
+Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:
+
+```python
+from pydantic import BaseModel
+
+
+class People(BaseModel):
+    name: str
+    age: int
+
+
+completion = client.chat.completions.create(
+    model=model,
+    messages=[
+        {
+            "role": "user",
+            "content": "Generate a JSON with the name and age of one random person.",
+        }
+    ],
+    response_format={
+        "type": "json_schema",
+        "json_schema": {
+            "name": "people",
+            "schema": People.model_json_schema()
+        }
+    },
+)
+print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+print("content: ", completion.choices[0].message.content)
+```
+
+See also: [full example](../../examples/online_serving/structured_outputs)
 
 ## Experimental Automatic Parsing (OpenAI API)
 
@@ -163,14 +211,14 @@ class Info(BaseModel):
     age: int
 
 client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
+model = client.models.list().data[0].id
 completion = client.beta.chat.completions.parse(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model=model,
     messages=[
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
     ],
     response_format=Info,
-    extra_body=dict(guided_decoding_backend="outlines"),
 )
 
 message = completion.choices[0].message
@@ -203,15 +251,13 @@ class MathResponse(BaseModel):
     steps: list[Step]
     final_answer: str
 
-client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
 completion = client.beta.chat.completions.parse(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model=model,
     messages=[
         {"role": "system", "content": "You are a helpful expert math tutor."},
         {"role": "user", "content": "Solve 8x + 31 = 2."},
     ],
     response_format=MathResponse,
-    extra_body=dict(guided_decoding_backend="outlines"),
 )
 
 message = completion.choices[0].message
@@ -232,11 +278,11 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
 Answer: x = -29/8
 ```
 
-An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py>
+An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/structured_outputs>
 
 ## Offline Inference
 
-Offline inference allows for the same types of guided decoding.
+Offline inference allows for the same types of structured outputs.
 To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
 The main available options inside `GuidedDecodingParams` are:
 
@@ -247,7 +293,7 @@ The main available options inside `GuidedDecodingParams` are:
 - `structural_tag`
 
 These parameters can be used in the same way as the parameters from the Online
-Serving examples above.  One example for the usage of the `choice` parameter is
+Serving examples above. One example for the usage of the `choice` parameter is
 shown below:
 
 ```python
@@ -265,4 +311,4 @@ outputs = llm.generate(
 print(outputs[0].outputs[0].text)
 ```
 
-Full example: <gh-file:examples/offline_inference/structured_outputs.py>
+See also: <gh-file:examples/offline_inference/structured_outputs.py>
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py
deleted file mode 100644
index 5c55d53138a8f..0000000000000
--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-To run this example, you need to start the vLLM server:
-
-```bash
-vllm serve Qwen/Qwen2.5-3B-Instruct
-```
-"""
-
-from enum import Enum
-
-from openai import BadRequestError, OpenAI
-from pydantic import BaseModel
-
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-
-# Guided decoding by Choice (list of possible options)
-def guided_choice_completion(client: OpenAI, model: str):
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
-        ],
-        extra_body={"guided_choice": ["positive", "negative"]},
-    )
-    return completion.choices[0].message.content
-
-
-# Guided decoding by Regex
-def guided_regex_completion(client: OpenAI, model: str):
-    prompt = (
-        "Generate an email address for Alan Turing, who works in Enigma."
-        "End in .com and new line. Example result:"
-        "alan.turing@enigma.com\n"
-    )
-
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
-    )
-    return completion.choices[0].message.content
-
-
-# Guided decoding by JSON using Pydantic schema
-class CarType(str, Enum):
-    sedan = "sedan"
-    suv = "SUV"
-    truck = "Truck"
-    coupe = "Coupe"
-
-
-class CarDescription(BaseModel):
-    brand: str
-    model: str
-    car_type: CarType
-
-
-def guided_json_completion(client: OpenAI, model: str):
-    json_schema = CarDescription.model_json_schema()
-
-    prompt = (
-        "Generate a JSON with the brand, model and car_type of"
-        "the most iconic car from the 90's"
-    )
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_json": json_schema},
-    )
-    return completion.choices[0].message.content
-
-
-# Guided decoding by Grammar
-def guided_grammar_completion(client: OpenAI, model: str):
-    simplified_sql_grammar = """
-        root ::= select_statement
-
-        select_statement ::= "SELECT " column " from " table " where " condition
-
-        column ::= "col_1 " | "col_2 "
-
-        table ::= "table_1 " | "table_2 "
-
-        condition ::= column "= " number
-
-        number ::= "1 " | "2 "
-    """
-
-    prompt = (
-        "Generate an SQL query to show the 'username' and 'email'"
-        "from the 'users' table."
-    )
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_grammar": simplified_sql_grammar},
-    )
-    return completion.choices[0].message.content
-
-
-# Extra backend options
-def extra_backend_options_completion(client: OpenAI, model: str):
-    prompt = (
-        "Generate an email address for Alan Turing, who works in Enigma."
-        "End in .com and new line. Example result:"
-        "alan.turing@enigma.com\n"
-    )
-
-    try:
-        # The guided_decoding_disable_fallback option forces vLLM to use
-        # xgrammar, so when it fails you get a 400 with the reason why
-        completion = client.chat.completions.create(
-            model=model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": prompt,
-                }
-            ],
-            extra_body={
-                "guided_regex": r"\w+@\w+\.com\n",
-                "stop": ["\n"],
-                "guided_decoding_disable_fallback": True,
-            },
-        )
-        return completion.choices[0].message.content
-    except BadRequestError as e:
-        print("This error is expected:", e)
-
-
-def main():
-    client: OpenAI = OpenAI(
-        base_url=openai_api_base,
-        api_key=openai_api_key,
-    )
-
-    model = client.models.list().data[0].id
-
-    print("Guided Choice Completion:")
-    print(guided_choice_completion(client, model))
-
-    print("\nGuided Regex Completion:")
-    print(guided_regex_completion(client, model))
-
-    print("\nGuided JSON Completion:")
-    print(guided_json_completion(client, model))
-
-    print("\nGuided Grammar Completion:")
-    print(guided_grammar_completion(client, model))
-
-    print("\nExtra Backend Options Completion:")
-    print(extra_backend_options_completion(client, model))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
deleted file mode 100644
index ec7d8b95472e6..0000000000000
--- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from openai import OpenAI
-
-# This example demonstrates the `structural_tag` response format.
-# It can be used to specify a structured output format that occurs between
-# specific tags in the response. This example shows how it could be used
-# to enforce the format of a tool call response, but it could be used for
-# any structured output within a subset of the response.
-
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-
-def main():
-    client = OpenAI(
-        base_url=openai_api_base,
-        api_key=openai_api_key,
-    )
-
-    messages = [
-        {
-            "role": "user",
-            "content": """
-You have access to the following function to retrieve the weather in a city:
-
-    {
-        "name": "get_weather",
-        "parameters": {
-            "city": {
-                "param_type": "string",
-                "description": "The city to get the weather for",
-                "required": True
-            }
-        }
-    }
-
-If a you choose to call a function ONLY reply in the following format:
-<{start_tag}={function_name}>{parameters}{end_tag}
-where
-
-start_tag => `<function`
-parameters => a JSON dict with the function argument name as key and function
-              argument value as value.
-end_tag => `</function>`
-
-Here is an example,
-<function=example_function_name>{"example_name": "example_value"}</function>
-
-Reminder:
-- Function calls MUST follow the specified format
-- Required parameters MUST be specified
-- Only call one function at a time
-- Put the entire function call reply on one line
-- Always add your sources when using search results to answer the user query
-
-You are a helpful assistant.
-
-Given the previous instructions, what is the weather in New York City, Boston,
-and San Francisco?
-""",
-        }
-    ]
-
-    response = client.chat.completions.create(
-        model=client.models.list().data[0].id,
-        messages=messages,
-        response_format={
-            "type": "structural_tag",
-            "structures": [
-                {
-                    "begin": "<function=get_weather>",
-                    "schema": {
-                        "type": "object",
-                        "properties": {"city": {"type": "string"}},
-                    },
-                    "end": "</function>",
-                }
-            ],
-            "triggers": ["<function="],
-        },
-    )
-    print(response)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
deleted file mode 100644
index bfbee7513874a..0000000000000
--- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-An example shows how to generate structured outputs from reasoning models
-like DeepSeekR1. The thinking process will not be guided by the JSON
-schema provided by the user. Only the final output will be structured.
-
-To run this example, you need to start the vLLM server with the reasoning
-parser:
-
-```bash
-vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --reasoning-parser deepseek_r1
-```
-
-This example demonstrates how to generate chat completions from reasoning models
-using the OpenAI Python client library.
-"""
-
-from enum import Enum
-
-from openai import OpenAI
-from pydantic import BaseModel
-
-# Modify OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-
-def print_completion_details(completion):
-    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
-    print("content: ", completion.choices[0].message.content)
-
-
-# Guided decoding by Regex
-def guided_regex_completion(client: OpenAI, model: str):
-    prompt = "What is the capital of France?"
-
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={
-            "guided_regex": "(Paris|London)",
-        },
-    )
-    print_completion_details(completion)
-
-
-class People(BaseModel):
-    name: str
-    age: int
-
-
-def guided_json_completion(client: OpenAI, model: str):
-    json_schema = People.model_json_schema()
-
-    prompt = "Generate a JSON with the name and age of one random person."
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_json": json_schema},
-    )
-    print_completion_details(completion)
-
-
-# Guided decoding by JSON using Pydantic schema
-class CarType(str, Enum):
-    sedan = "sedan"
-    suv = "SUV"
-    truck = "Truck"
-    coupe = "Coupe"
-
-
-class CarDescription(BaseModel):
-    brand: str
-    model: str
-    car_type: CarType
-
-
-def guided_car_json_completion(client: OpenAI, model: str):
-    json_schema = CarDescription.model_json_schema()
-
-    prompt = (
-        "Generate a JSON with the brand, model and car_type of"
-        "the most iconic car from the 90's"
-    )
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_json": json_schema},
-    )
-    print_completion_details(completion)
-
-
-# Guided decoding by Grammar
-def guided_grammar_completion(client: OpenAI, model: str):
-    simplified_sql_grammar = """
-        root ::= select_statement
-
-        select_statement ::= "SELECT " column " from " table " where " condition
-
-        column ::= "col_1 " | "col_2 "
-
-        table ::= "table_1 " | "table_2 "
-
-        condition ::= column "= " number
-
-        number ::= "1 " | "2 "
-    """
-
-    # This may be very slow https://github.com/vllm-project/vllm/issues/12122
-    prompt = (
-        "Generate an SQL query to show the 'username' and 'email'"
-        "from the 'users' table."
-    )
-    completion = client.chat.completions.create(
-        model=model,
-        messages=[
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        extra_body={"guided_grammar": simplified_sql_grammar},
-    )
-    print_completion_details(completion)
-
-
-def main():
-    client: OpenAI = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-
-    models = client.models.list()
-    model: str = models.data[0].id
-
-    print("Guided Regex Completion:")
-    guided_regex_completion(client, model)
-
-    print("\nGuided JSON Completion (People):")
-    guided_json_completion(client, model)
-
-    print("\nGuided JSON Completion (CarDescription):")
-    guided_car_json_completion(client, model)
-
-    print("\nGuided Grammar Completion:")
-    guided_grammar_completion(client, model)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/online_serving/structured_outputs/README.md b/examples/online_serving/structured_outputs/README.md
new file mode 100644
index 0000000000000..d38feca746373
--- /dev/null
+++ b/examples/online_serving/structured_outputs/README.md
@@ -0,0 +1,54 @@
+# Structured Outputs
+
+This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server.
+It can run an individual constraint type or all of them.
+It supports both streaming responses and concurrent non-streaming requests.
+
+To use this example, you must start a vLLM server with any model of your choice.
+
+```bash
+vllm serve Qwen/Qwen2.5-3B-Instruct
+```
+
+To serve a reasoning model, you can use the following command:
+
+```bash
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
+```
+
+If you want to run this script standalone with `uv`, you can use the following:
+
+```bash
+uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs structured-outputs
+```
+
+See [feature docs](../../../features/structured_outputs.md) for more information.
+
+!!! tip
+    If vLLM is running remotely, then set `OPENAI_BASE_URL=<remote_url>` before running the script.
+
+## Usage
+
+Run all constraints, non-streaming:
+
+```bash
+uv run structured_outputs.py
+```
+
+Run all constraints, streaming:
+
+```bash
+uv run structured_outputs.py --stream
+```
+
+Run certain constraints, for example `structural_tag` and `regex`, streaming:
+
+```bash
+uv run structured_outputs.py --constraint structural_tag regex --stream
+```
+
+Run all constraints, with reasoning models and streaming:
+
+```bash
+uv run structured_outputs.py --reasoning --stream
+```
diff --git a/examples/online_serving/structured_outputs/pyproject.toml b/examples/online_serving/structured_outputs/pyproject.toml
new file mode 100644
index 0000000000000..8f31405ff584a
--- /dev/null
+++ b/examples/online_serving/structured_outputs/pyproject.toml
@@ -0,0 +1,8 @@
+[project]
+name = "examples-online-structured-outputs"
+requires-python = ">=3.9, <3.13"
+dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
+version = "0.0.0"
+
+[project.scripts]
+structured-outputs = "structured_outputs:main"
diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py
new file mode 100644
index 0000000000000..2a8f4637260c2
--- /dev/null
+++ b/examples/online_serving/structured_outputs/structured_outputs.py
@@ -0,0 +1,272 @@
+# ruff: noqa: E501
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import enum
+import os
+from typing import TYPE_CHECKING, Any, Literal
+
+import openai
+import pydantic
+
+if TYPE_CHECKING:
+    from openai.types.chat import ChatCompletionChunk
+
+
+ConstraintsFormat = Literal[
+    "choice",
+    "regex",
+    "json",
+    "grammar",
+    "structural_tag",
+]
+
+
+async def print_stream_response(
+    stream_response: openai.AsyncStream[ChatCompletionChunk],
+    title: str,
+    args: argparse.Namespace,
+):
+    print(f"\n\n{title} (Streaming):")
+
+    local_reasoning_header_printed = False
+    local_content_header_printed = False
+
+    async for chunk in stream_response:
+        delta = chunk.choices[0].delta
+
+        reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None)
+        content_chunk_text = delta.content
+
+        if args.reasoning:
+            if reasoning_chunk_text:
+                if not local_reasoning_header_printed:
+                    print("  Reasoning: ", end="")
+                    local_reasoning_header_printed = True
+                print(reasoning_chunk_text, end="", flush=True)
+
+            if content_chunk_text:
+                if not local_content_header_printed:
+                    if local_reasoning_header_printed:
+                        print()
+                    print("  Content: ", end="")
+                    local_content_header_printed = True
+                print(content_chunk_text, end="", flush=True)
+        else:
+            if content_chunk_text:
+                if not local_content_header_printed:
+                    print("  Content: ", end="")
+                    local_content_header_printed = True
+                print(content_chunk_text, end="", flush=True)
+    print()
+
+
+class CarType(str, enum.Enum):
+    SEDAN = "SEDAN"
+    SUV = "SUV"
+    TRUCK = "TRUCK"
+    COUPE = "COUPE"
+
+
+class CarDescription(pydantic.BaseModel):
+    brand: str
+    model: str
+    car_type: CarType
+
+
+PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
+    "choice": {
+        "messages": [
+            {
+                "role": "user",
+                "content": "Classify this sentiment: vLLM is wonderful!",
+            }
+        ],
+        "extra_body": {"guided_choice": ["positive", "negative"]},
+    },
+    "regex": {
+        "messages": [
+            {
+                "role": "user",
+                "content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: 'alan.turing@enigma.com\n'",
+            }
+        ],
+        "extra_body": {
+            "guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
+        },
+    },
+    "json": {
+        "messages": [
+            {
+                "role": "user",
+                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
+            }
+        ],
+        "response_format": {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "car-description",
+                "schema": CarDescription.model_json_schema(),
+            },
+        },
+    },
+    "grammar": {
+        "messages": [
+            {
+                "role": "user",
+                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
+            }
+        ],
+        "extra_body": {
+            "guided_grammar": """
+root ::= select_statement
+
+select_statement ::= "SELECT " column " from " table " where " condition
+
+column ::= "col_1 " | "col_2 "
+
+table ::= "table_1 " | "table_2 "
+
+condition ::= column "= " number
+
+number ::= "1 " | "2 "
+""",
+        },
+    },
+    "structural_tag": {
+        "messages": [
+            {
+                "role": "user",
+                "content": """
+You have access to the following function to retrieve the weather in a city:
+
+{
+    "name": "get_weather",
+    "parameters": {
+        "city": {
+            "param_type": "string",
+            "description": "The city to get the weather for",
+            "required": True
+        }
+    }
+}
+
+If you choose to call a function ONLY reply in the following format:
+<{start_tag}={function_name}>{parameters}{end_tag}
+where
+
+start_tag => `<function`
+parameters => a JSON dict with the function argument name as key and function
+              argument value as value.
+end_tag => `</function>`
+
+Here is an example,
+<function=example_function_name>{"example_name": "example_value"}</function>
+
+Reminder:
+- Function calls MUST follow the specified format
+- Required parameters MUST be specified
+- Only call one function at a time
+- Put the entire function call reply on one line
+- Always add your sources when using search results to answer the user query
+
+You are a helpful assistant.
+
+Given the previous instructions, what is the weather in New York City, Boston,
+and San Francisco?""",
+            },
+        ],
+        "response_format": {
+            "type": "structural_tag",
+            "structures": [
+                {
+                    "begin": "<function=get_weather>",
+                    "schema": {
+                        "type": "object",
+                        "properties": {"city": {"type": "string"}},
+                        "required": ["city"],
+                    },
+                    "end": "</function>",
+                }
+            ],
+            "triggers": ["<function="],
+        },
+    },
+}
+
+
+async def cli():
+    parser = argparse.ArgumentParser(
+        description="Run OpenAI Chat Completion with various structured outputs capabilities",
+    )
+    _ = parser.add_argument(
+        "--constraint",
+        type=str,
+        nargs="+",
+        choices=[*list(PARAMS), "*"],
+        default=["*"],
+        help="Specify which constraint(s) to run.",
+    )
+    _ = parser.add_argument(
+        "--stream",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Enable streaming output",
+    )
+    _ = parser.add_argument(
+        "--reasoning",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Enable printing of reasoning traces if available.",
+    )
+    args = parser.parse_args()
+
+    base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1")
+    client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")
+    constraints = list(PARAMS) if "*" in args.constraint else list(set(args.constraint))
+    model = (await client.models.list()).data[0].id
+
+    if args.stream:
+        results = await asyncio.gather(
+            *[
+                client.chat.completions.create(
+                    model=model,
+                    max_tokens=1024,
+                    stream=True,
+                    **PARAMS[name],
+                )
+                for name in constraints
+            ]
+        )
+        for constraint, stream in zip(constraints, results):
+            await print_stream_response(stream, constraint, args)
+    else:
+        results = await asyncio.gather(
+            *[
+                client.chat.completions.create(
+                    model=model,
+                    max_tokens=1024,
+                    stream=False,
+                    **PARAMS[name],
+                )
+                for name in constraints
+            ]
+        )
+        for constraint, response in zip(constraints, results):
+            print(f"\n\n{constraint}:")
+            message = response.choices[0].message
+            if args.reasoning and hasattr(message, "reasoning_content"):
+                print(f"  Reasoning: {message.reasoning_content or ''}")
+            print(f"  Content: {message.content!r}")
+
+
+def main():
+    asyncio.run(cli())
+
+
+if __name__ == "__main__":
+    main()