mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-16 06:37:03 +08:00
[Chore] Cleanup guided namespace, move to structured outputs config (#22772)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
05b044e698
commit
29283e8976
@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#Obsolete currently
|
||||
##ignore certain Entrypoints/llm tests
|
||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||
#fi
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
|
||||
@ -108,8 +108,7 @@ steps:
|
||||
- tests/entrypoints/offline_mode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
|
||||
2
.github/mergify.yml
vendored
2
.github/mergify.yml
vendored
@ -171,7 +171,7 @@ pull_request_rules:
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
||||
- files~=^tests/v1/structured_output/
|
||||
- files=tests/v1/entrypoints/llm/test_guided_generate.py
|
||||
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||
- files~=^vllm/v1/structured_output/
|
||||
actions:
|
||||
label:
|
||||
|
||||
@ -696,11 +696,11 @@ def evaluate(ret, args):
|
||||
return re.match(args.regex, actual) is not None
|
||||
|
||||
def _eval_correctness(expected, actual):
|
||||
if args.structure_type == "guided_json":
|
||||
if args.structure_type == "json":
|
||||
return _eval_correctness_json(expected, actual)
|
||||
elif args.structure_type == "guided_regex":
|
||||
elif args.structure_type == "regex":
|
||||
return _eval_correctness_regex(expected, actual)
|
||||
elif args.structure_type == "guided_choice":
|
||||
elif args.structure_type == "choice":
|
||||
return _eval_correctness_choice(expected, actual)
|
||||
else:
|
||||
return None
|
||||
@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
|
||||
)
|
||||
|
||||
if args.dataset == "grammar":
|
||||
args.structure_type = "guided_grammar"
|
||||
args.structure_type = "grammar"
|
||||
elif args.dataset == "regex":
|
||||
args.structure_type = "guided_regex"
|
||||
args.structure_type = "regex"
|
||||
elif args.dataset == "choice":
|
||||
args.structure_type = "guided_choice"
|
||||
args.structure_type = "choice"
|
||||
else:
|
||||
args.structure_type = "guided_json"
|
||||
args.structure_type = "json"
|
||||
|
||||
if args.no_structured_output:
|
||||
args.structured_output_ratio = 0
|
||||
if args.save_results:
|
||||
result_file_name = f"{args.structured_output_ratio}guided"
|
||||
result_file_name = f"{args.structured_output_ratio}so"
|
||||
result_file_name += f"_{backend}"
|
||||
result_file_name += f"_{args.request_rate}qps"
|
||||
result_file_name += f"_{args.model.split('/')[-1]}"
|
||||
|
||||
@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes.
|
||||
- [vllm.config.LoRAConfig][]
|
||||
- [vllm.config.MultiModalConfig][]
|
||||
- [vllm.config.PoolerConfig][]
|
||||
- [vllm.config.DecodingConfig][]
|
||||
- [vllm.config.StructuredOutputsConfig][]
|
||||
- [vllm.config.ObservabilityConfig][]
|
||||
- [vllm.config.KVTransferConfig][]
|
||||
- [vllm.config.CompilationConfig][]
|
||||
|
||||
@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models:
|
||||
|
||||
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|
||||
|--------------|-------------|------------------|-------------|
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
|
||||
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
|
||||
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
|
||||
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
|
||||
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
|
||||
|
||||
!!! note
|
||||
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
|
||||
|
||||
@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla
|
||||
|
||||
The following parameters are supported, which must be added as extra parameters:
|
||||
|
||||
- `guided_choice`: the output will be exactly one of the choices.
|
||||
- `guided_regex`: the output will follow the regex pattern.
|
||||
- `guided_json`: the output will follow the JSON schema.
|
||||
- `guided_grammar`: the output will follow the context free grammar.
|
||||
- `choice`: the output will be exactly one of the choices.
|
||||
- `regex`: the output will follow the regex pattern.
|
||||
- `json`: the output will follow the JSON schema.
|
||||
- `grammar`: the output will follow the context free grammar.
|
||||
- `structural_tag`: the output will follow a JSON schema within a set of specified tags in the generated text.
|
||||
|
||||
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.
|
||||
|
||||
Structured outputs are supported by default in the OpenAI-Compatible Server. You
|
||||
may choose to specify the backend to use by setting the
|
||||
`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
|
||||
`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`,
|
||||
which will try to choose an appropriate backend based on the details of the
|
||||
request. You may also choose a specific backend, along with
|
||||
some options. A full set of options is available in the `vllm serve --help`
|
||||
text.
|
||||
|
||||
Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
|
||||
Now let's see an example for each of the cases, starting with `choice`, as it's the easiest one:
|
||||
|
||||
??? code
|
||||
|
||||
@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic
|
||||
messages=[
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
extra_body={"guided_choice": ["positive", "negative"]},
|
||||
extra_body={"structured_outputs": {"choice": ["positive", "negative"]}},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
|
||||
The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
|
||||
|
||||
??? code
|
||||
|
||||
@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an
|
||||
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
|
||||
}
|
||||
],
|
||||
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
|
||||
extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
|
||||
For this we can use the `guided_json` parameter in two different ways:
|
||||
For this we can use the `json` parameter in two different ways:
|
||||
|
||||
- Directly using a [JSON Schema](https://json-schema.org/)
|
||||
- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
|
||||
|
||||
The next example shows how to use the `guided_json` parameter with a Pydantic model:
|
||||
The next example shows how to use the `response_format` parameter with a Pydantic model:
|
||||
|
||||
??? code
|
||||
|
||||
@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
|
||||
JSON schema and how the fields should be populated. This can improve the
|
||||
results notably in most cases.
|
||||
|
||||
Finally we have the `guided_grammar` option, which is probably the most
|
||||
Finally we have the `grammar` option, which is probably the most
|
||||
difficult to use, but it's really powerful. It allows us to define complete
|
||||
languages like SQL queries. It works by using a context free EBNF grammar.
|
||||
As an example, we can use it to define a specific format of simplified SQL queries:
|
||||
@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
|
||||
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
|
||||
}
|
||||
],
|
||||
extra_body={"guided_grammar": simplified_sql_grammar},
|
||||
extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online
|
||||
## Offline Inference
|
||||
|
||||
Offline inference allows for the same types of structured outputs.
|
||||
To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
|
||||
The main available options inside `GuidedDecodingParams` are:
|
||||
To use it, we'll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
|
||||
The main available options inside `StructuredOutputsParams` are:
|
||||
|
||||
- `json`
|
||||
- `regex`
|
||||
@ -309,12 +309,12 @@ shown below:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
from vllm.sampling_params import StructuredOutputsParams
|
||||
|
||||
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
|
||||
|
||||
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
||||
structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"])
|
||||
sampling_params = SamplingParams(structured_outputs=structured_outputs_params)
|
||||
outputs = llm.generate(
|
||||
prompts="Classify this sentiment: vLLM is wonderful!",
|
||||
sampling_params=sampling_params,
|
||||
|
||||
@ -71,7 +71,7 @@ This example demonstrates:
|
||||
* Making a request with `tool_choice="auto"`
|
||||
* Handling the structured response and executing the corresponding function
|
||||
|
||||
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
|
||||
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
|
||||
|
||||
Remember that it's the caller's responsibility to:
|
||||
|
||||
@ -83,19 +83,18 @@ For more advanced usage, including parallel tool calls and different model-speci
|
||||
|
||||
## Named Function Calling
|
||||
|
||||
vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
|
||||
enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a
|
||||
vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a
|
||||
high-quality one.
|
||||
|
||||
vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
|
||||
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.
|
||||
vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
|
||||
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend.
|
||||
|
||||
To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
|
||||
specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
|
||||
|
||||
## Required Function Calling
|
||||
|
||||
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
|
||||
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to named function calling, it also uses structured outputs, so it is enabled by default and will work with any supported model. Support for alternative decoding backends is on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
|
||||
|
||||
When `tool_choice='required'` is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
|
||||
|
||||
|
||||
@ -133,7 +133,7 @@ completion = client.chat.completions.create(
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
extra_body={
|
||||
"guided_choice": ["positive", "negative"]
|
||||
"structured_outputs": {"choice": ["positive", "negative"]}
|
||||
}
|
||||
)
|
||||
```
|
||||
@ -374,7 +374,7 @@ The following extra parameters are supported:
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
|
||||
```
|
||||
|
||||
|
||||
[](){ #translations-api }
|
||||
|
||||
### Translations API
|
||||
|
||||
@ -1,11 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file demonstrates the example usage of guided decoding
|
||||
to generate structured outputs using vLLM. It shows how to apply
|
||||
different guided decoding techniques such as Choice, Regex, JSON schema,
|
||||
and Grammar to produce structured and formatted results
|
||||
based on specific prompts.
|
||||
This file demonstrates the example usage of structured outputs
|
||||
in vLLM. It shows how to apply different constraints such as choice,
|
||||
regex, json schema, and grammar to produce structured and formatted
|
||||
results based on specific prompts.
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
@ -13,19 +12,23 @@ from enum import Enum
|
||||
from pydantic import BaseModel
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
from vllm.sampling_params import StructuredOutputsParams
|
||||
|
||||
MAX_TOKENS = 50
|
||||
|
||||
# Guided decoding by Choice (list of possible options)
|
||||
guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
|
||||
sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
|
||||
# Structured outputs by Choice (list of possible options)
|
||||
structured_outputs_params_choice = StructuredOutputsParams(
|
||||
choice=["Positive", "Negative"]
|
||||
)
|
||||
sampling_params_choice = SamplingParams(
|
||||
structured_outputs=structured_outputs_params_choice
|
||||
)
|
||||
prompt_choice = "Classify this sentiment: vLLM is wonderful!"
|
||||
|
||||
# Guided decoding by Regex
|
||||
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
|
||||
# Structured outputs by Regex
|
||||
structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n")
|
||||
sampling_params_regex = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_regex,
|
||||
structured_outputs=structured_outputs_params_regex,
|
||||
stop=["\n"],
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
@ -36,7 +39,7 @@ prompt_regex = (
|
||||
)
|
||||
|
||||
|
||||
# Guided decoding by JSON using Pydantic schema
|
||||
# Structured outputs by JSON using Pydantic schema
|
||||
class CarType(str, Enum):
|
||||
sedan = "sedan"
|
||||
suv = "SUV"
|
||||
@ -51,17 +54,16 @@ class CarDescription(BaseModel):
|
||||
|
||||
|
||||
json_schema = CarDescription.model_json_schema()
|
||||
guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
|
||||
structured_outputs_params_json = StructuredOutputsParams(json=json_schema)
|
||||
sampling_params_json = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_json,
|
||||
max_tokens=MAX_TOKENS,
|
||||
structured_outputs=structured_outputs_params_json, max_tokens=MAX_TOKENS
|
||||
)
|
||||
prompt_json = (
|
||||
"Generate a JSON with the brand, model and car_type of"
|
||||
"Generate a JSON with the brand, model and car_type of "
|
||||
"the most iconic car from the 90's"
|
||||
)
|
||||
|
||||
# Guided decoding by Grammar
|
||||
# Structured outputs by Grammar
|
||||
simplified_sql_grammar = """
|
||||
root ::= select_statement
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
@ -70,13 +72,15 @@ table ::= "table_1 " | "table_2 "
|
||||
condition ::= column "= " number
|
||||
number ::= "1 " | "2 "
|
||||
"""
|
||||
guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
|
||||
structured_outputs_params_grammar = StructuredOutputsParams(
|
||||
grammar=simplified_sql_grammar
|
||||
)
|
||||
sampling_params_grammar = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_grammar,
|
||||
structured_outputs=structured_outputs_params_grammar,
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
prompt_grammar = (
|
||||
"Generate an SQL query to show the 'username' and 'email'from the 'users' table."
|
||||
"Generate an SQL query to show the 'username' and 'email' from the 'users' table."
|
||||
)
|
||||
|
||||
|
||||
@ -93,16 +97,16 @@ def main():
|
||||
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
|
||||
|
||||
choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
|
||||
format_output("Guided decoding by Choice", choice_output)
|
||||
format_output("Structured outputs by Choice", choice_output)
|
||||
|
||||
regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
|
||||
format_output("Guided decoding by Regex", regex_output)
|
||||
format_output("Structured outputs by Regex", regex_output)
|
||||
|
||||
json_output = generate_output(prompt_json, sampling_params_json, llm)
|
||||
format_output("Guided decoding by JSON", json_output)
|
||||
format_output("Structured outputs by JSON", json_output)
|
||||
|
||||
grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
|
||||
format_output("Guided decoding by Grammar", grammar_output)
|
||||
format_output("Structured outputs by Grammar", grammar_output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -6,7 +6,7 @@ without any specific flags:
|
||||
|
||||
```bash
|
||||
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
|
||||
--guided-decoding-backend outlines
|
||||
--structured-outputs-config.backend outlines
|
||||
```
|
||||
|
||||
This example demonstrates how to generate chat completions
|
||||
|
||||
@ -86,7 +86,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
|
||||
"content": "Classify this sentiment: vLLM is wonderful!",
|
||||
}
|
||||
],
|
||||
"extra_body": {"guided_choice": ["positive", "negative"]},
|
||||
"extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
|
||||
},
|
||||
"regex": {
|
||||
"messages": [
|
||||
@ -96,7 +96,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
|
||||
}
|
||||
],
|
||||
"extra_body": {
|
||||
"guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
|
||||
"structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
|
||||
},
|
||||
},
|
||||
"json": {
|
||||
@ -122,7 +122,8 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
|
||||
}
|
||||
],
|
||||
"extra_body": {
|
||||
"guided_grammar": """
|
||||
"structured_outputs": {
|
||||
"grammar": """
|
||||
root ::= select_statement
|
||||
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
@ -135,6 +136,7 @@ condition ::= column "= " number
|
||||
|
||||
number ::= "1 " | "2 "
|
||||
""",
|
||||
}
|
||||
},
|
||||
},
|
||||
"structural_tag": {
|
||||
|
||||
@ -184,7 +184,7 @@ def sample_enum_json_schema():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_guided_choice():
|
||||
def sample_structured_outputs_choices():
|
||||
return [
|
||||
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
|
||||
"Ruby", "Swift", "Kotlin"
|
||||
|
||||
@ -1,82 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import sys
|
||||
from contextlib import nullcontext
|
||||
|
||||
from vllm_test_utils import BlameResult, blame
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
|
||||
def run_normal():
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM without guided decoding as a baseline.
|
||||
llm = LLM(model="distilbert/distilgpt2",
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.3)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
# Destroy the LLM object and free up the GPU memory.
|
||||
del llm
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
def run_xgrammar(sample_regex):
|
||||
# Create an LLM with guided decoding enabled.
|
||||
llm = LLM(model="distilbert/distilgpt2",
|
||||
enforce_eager=True,
|
||||
guided_decoding_backend="xgrammar",
|
||||
gpu_memory_utilization=0.3)
|
||||
prompt = f"Give an example IPv4 address with this regex: {sample_regex}"
|
||||
guided_decoding = GuidedDecodingParams(regex=sample_regex)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=guided_decoding)
|
||||
outputs = llm.generate(
|
||||
prompts=[prompt] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True,
|
||||
)
|
||||
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
def test_lazy_outlines(sample_regex):
|
||||
"""If users don't use guided decoding, outlines should not be imported.
|
||||
"""
|
||||
# make sure outlines is not imported
|
||||
module_name = "outlines"
|
||||
# In CI, we only check finally if the module is imported.
|
||||
# If it is indeed imported, we can rerun the test with `use_blame=True`,
|
||||
# which will trace every function call to find the first import location,
|
||||
# and help find the root cause.
|
||||
# We don't run it in CI by default because it is slow.
|
||||
use_blame = False
|
||||
context = blame(
|
||||
lambda: module_name in sys.modules) if use_blame else nullcontext()
|
||||
with context as result:
|
||||
run_normal()
|
||||
run_xgrammar(sample_regex)
|
||||
if use_blame:
|
||||
assert isinstance(result, BlameResult)
|
||||
print(f"the first import location is:\n{result.trace_stack}")
|
||||
assert module_name not in sys.modules, (
|
||||
f"Module {module_name} is imported. To see the first"
|
||||
f" import location, run the test with `use_blame=True`.")
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# imports for guided decoding tests
|
||||
# imports for structured outputs tests
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
@ -480,10 +480,11 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
sample_guided_choice, is_v1_server: bool):
|
||||
async def test_structured_outputs_choice_chat(
|
||||
client: openai.AsyncOpenAI, sample_structured_outputs_choices,
|
||||
is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
pytest.skip("Structured outputs is only supported in v1 engine")
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -498,9 +499,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.7,
|
||||
extra_body=dict(guided_choice=sample_guided_choice))
|
||||
extra_body=dict(
|
||||
structured_outputs={"choice": sample_structured_outputs_choices}))
|
||||
choice1 = chat_completion.choices[0].message.content
|
||||
assert choice1 in sample_guided_choice
|
||||
assert choice1 in sample_structured_outputs_choices
|
||||
|
||||
messages.append({"role": "assistant", "content": choice1})
|
||||
messages.append({
|
||||
@ -512,17 +514,19 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.7,
|
||||
extra_body=dict(guided_choice=sample_guided_choice))
|
||||
extra_body=dict(
|
||||
structured_outputs={"choice": sample_structured_outputs_choices}))
|
||||
choice2 = chat_completion.choices[0].message.content
|
||||
assert choice2 in sample_guided_choice
|
||||
assert choice2 in sample_structured_outputs_choices
|
||||
assert choice1 != choice2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
is_v1_server: bool):
|
||||
async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
|
||||
sample_json_schema,
|
||||
is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
pytest.skip("Structured outputs is only supported in v1 engine")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
@ -538,7 +542,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
extra_body=dict(guided_json=sample_json_schema))
|
||||
extra_body=dict(structured_outputs={"json": sample_json_schema}))
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None
|
||||
json1 = json.loads(message.content)
|
||||
@ -555,7 +559,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
extra_body=dict(guided_json=sample_json_schema))
|
||||
extra_body=dict(structured_outputs={"json": sample_json_schema}))
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None
|
||||
json2 = json.loads(message.content)
|
||||
@ -565,10 +569,10 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
|
||||
is_v1_server: bool):
|
||||
async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI,
|
||||
sample_regex, is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
pytest.skip("Structured outputs is only supported in v1 engine")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
@ -583,7 +587,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=20,
|
||||
extra_body=dict(guided_regex=sample_regex))
|
||||
extra_body=dict(structured_outputs={"regex": sample_regex}))
|
||||
ip1 = chat_completion.choices[0].message.content
|
||||
assert ip1 is not None
|
||||
assert re.fullmatch(sample_regex, ip1) is not None
|
||||
@ -594,7 +598,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=20,
|
||||
extra_body=dict(guided_regex=sample_regex))
|
||||
extra_body=dict(structured_outputs={"regex": sample_regex}))
|
||||
ip2 = chat_completion.choices[0].message.content
|
||||
assert ip2 is not None
|
||||
assert re.fullmatch(sample_regex, ip2) is not None
|
||||
@ -602,7 +606,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
|
||||
async def test_structured_outputs_type_error(client: openai.AsyncOpenAI):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -614,17 +618,19 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
|
||||
}]
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
_ = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
extra_body=dict(guided_regex={
|
||||
1: "Python",
|
||||
2: "C++"
|
||||
}))
|
||||
_ = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
extra_body=dict(
|
||||
structured_outputs={"regex": {
|
||||
1: "Python",
|
||||
2: "C++"
|
||||
}}))
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
sample_guided_choice):
|
||||
async def test_structured_outputs_choice_chat_logprobs(
|
||||
client: openai.AsyncOpenAI, sample_structured_outputs_choices):
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
@ -641,7 +647,8 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
max_completion_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=5,
|
||||
extra_body=dict(guided_choice=sample_guided_choice))
|
||||
extra_body=dict(
|
||||
structured_outputs={"choice": sample_structured_outputs_choices}))
|
||||
|
||||
assert chat_completion.choices[0].logprobs is not None
|
||||
assert chat_completion.choices[0].logprobs.content is not None
|
||||
@ -663,10 +670,23 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
f"Give an example JSON for an employee profile that "
|
||||
f"fits this schema: {sample_json_schema}"
|
||||
"content": ("Give an example JSON for an employee "
|
||||
"profile using the specified tool.")
|
||||
}]
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}]
|
||||
tool_choice = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name"
|
||||
}
|
||||
}
|
||||
|
||||
# non-streaming
|
||||
|
||||
@ -674,20 +694,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name"
|
||||
}
|
||||
},
|
||||
tools=tools,
|
||||
tool_choice=tool_choice,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert len(message.content) == 0
|
||||
@ -705,25 +713,12 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
|
||||
# streaming
|
||||
|
||||
stream = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name"
|
||||
}
|
||||
},
|
||||
stream=True)
|
||||
stream = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=tools,
|
||||
tool_choice=tool_choice,
|
||||
stream=True)
|
||||
|
||||
output = []
|
||||
finish_reason_count = 0
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# imports for guided decoding tests
|
||||
# imports for structured outputs tests
|
||||
import json
|
||||
import os
|
||||
from typing import Optional
|
||||
@ -23,8 +23,6 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
# technically these adapters use a different base model,
|
||||
# but we're not testing generation quality here
|
||||
|
||||
GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_server_args(zephyr_lora_files):
|
||||
@ -595,12 +593,13 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_json_completion(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema, is_v1_server: bool):
|
||||
async def test_structured_outputs_json_completion(
|
||||
client: openai.AsyncOpenAI,
|
||||
sample_json_schema,
|
||||
is_v1_server: bool,
|
||||
):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
pytest.skip("structured outputs is only supported in v1 engine")
|
||||
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
@ -609,8 +608,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
|
||||
n=3,
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
extra_body=dict(guided_json=sample_json_schema,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(structured_outputs=dict(json=sample_json_schema)))
|
||||
|
||||
assert completion.id is not None
|
||||
assert len(completion.choices) == 3
|
||||
@ -620,12 +618,13 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_regex, is_v1_server: bool):
|
||||
async def test_structured_outputs_regex_completion(
|
||||
client: openai.AsyncOpenAI,
|
||||
sample_regex,
|
||||
is_v1_server: bool,
|
||||
):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
pytest.skip("structured outputs is only supported in v1 engine")
|
||||
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
@ -633,8 +632,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
|
||||
n=3,
|
||||
temperature=1.0,
|
||||
max_tokens=20,
|
||||
extra_body=dict(guided_regex=sample_regex,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(structured_outputs=dict(regex=sample_regex)))
|
||||
|
||||
assert completion.id is not None
|
||||
assert len(completion.choices) == 3
|
||||
@ -644,13 +642,13 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_guided_choice,
|
||||
is_v1_server: bool):
|
||||
async def test_structured_outputs_choice_completion(
|
||||
client: openai.AsyncOpenAI,
|
||||
sample_structured_outputs_choices,
|
||||
is_v1_server: bool,
|
||||
):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
pytest.skip("structured outputs is only supported in v1 engine")
|
||||
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
@ -658,20 +656,21 @@ async def test_guided_choice_completion(client: openai.AsyncOpenAI,
|
||||
n=2,
|
||||
temperature=1.0,
|
||||
max_tokens=10,
|
||||
extra_body=dict(guided_choice=sample_guided_choice,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(structured_outputs=dict(
|
||||
choice=sample_structured_outputs_choices)))
|
||||
|
||||
assert completion.id is not None
|
||||
assert len(completion.choices) == 2
|
||||
for i in range(2):
|
||||
assert completion.choices[i].text in sample_guided_choice
|
||||
assert completion.choices[i].text in sample_structured_outputs_choices
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_grammar(client: openai.AsyncOpenAI,
|
||||
sample_sql_statements, is_v1_server: bool):
|
||||
async def test_structured_outputs_grammar(client: openai.AsyncOpenAI,
|
||||
sample_sql_statements,
|
||||
is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided grammar is only supported in v1 engine")
|
||||
pytest.skip("grammar is only supported in v1 engine")
|
||||
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
@ -679,7 +678,8 @@ async def test_guided_grammar(client: openai.AsyncOpenAI,
|
||||
"table_1 where it is equals to 1"),
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
extra_body=dict(guided_grammar=sample_sql_statements))
|
||||
extra_body=dict(
|
||||
structured_outputs=dict(grammar=sample_sql_statements), ))
|
||||
|
||||
content = completion.choices[0].text
|
||||
|
||||
@ -730,27 +730,26 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema, sample_regex,
|
||||
is_v1_server: bool):
|
||||
async def test_structured_outputs_type_error(client: openai.AsyncOpenAI,
|
||||
sample_json_schema, sample_regex,
|
||||
is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
pytest.skip("structured outputs is only supported in v1 engine")
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
_ = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt="Give an example JSON that fits this schema: 42",
|
||||
extra_body=dict(guided_json=42,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(structured_outputs=dict(json=42)))
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
_ = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt="Give an example string that fits this regex",
|
||||
extra_body=dict(guided_regex=sample_regex,
|
||||
guided_json=sample_json_schema))
|
||||
extra_body=dict(structured_outputs=dict(
|
||||
regex=sample_regex,
|
||||
json=sample_json_schema,
|
||||
)))
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -142,7 +142,7 @@ def server(): # noqa: F811
|
||||
"--dtype",
|
||||
"half",
|
||||
"--enable-auto-tool-choice",
|
||||
"--guided-decoding-backend",
|
||||
"--structured-outputs-config.backend",
|
||||
"xgrammar",
|
||||
"--tool-call-parser",
|
||||
"hermes",
|
||||
@ -225,7 +225,7 @@ def k2_server(): # noqa: F811
|
||||
"--dtype",
|
||||
"half",
|
||||
"--enable-auto-tool-choice",
|
||||
"--guided-decoding-backend",
|
||||
"--structured-outputs-config.backend",
|
||||
"xgrammar",
|
||||
"--tool-call-parser",
|
||||
"hermes",
|
||||
|
||||
@ -102,12 +102,14 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
|
||||
if "custom" in tool_call:
|
||||
return False
|
||||
|
||||
# Sometimes guided_grammar is generated to be empty
|
||||
# Sometimes structured_outputs.grammar is generated to be empty
|
||||
# Causing a server error in EBNF grammar parsing
|
||||
# https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
|
||||
guided_grammar = case.body.get("guided_grammar")
|
||||
structured_outputs = case.body.get("structured_outputs", {})
|
||||
grammar = structured_outputs.get("grammar") if isinstance(
|
||||
structured_outputs, dict) else None
|
||||
|
||||
if guided_grammar == '':
|
||||
if grammar == '':
|
||||
# Allow None (will be handled as no grammar)
|
||||
# But skip empty strings
|
||||
return False
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
import io
|
||||
|
||||
# imports for guided decoding tests
|
||||
# imports for structured outputs tests
|
||||
import openai
|
||||
import pybase64
|
||||
import pytest
|
||||
|
||||
@ -333,7 +333,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}],
|
||||
guided_decoding_backend="outlines",
|
||||
)
|
||||
|
||||
with suppress(Exception):
|
||||
@ -378,7 +377,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}],
|
||||
guided_decoding_backend="outlines",
|
||||
)
|
||||
|
||||
with suppress(Exception):
|
||||
@ -433,7 +431,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}],
|
||||
guided_decoding_backend="outlines",
|
||||
)
|
||||
|
||||
with suppress(Exception):
|
||||
@ -489,7 +486,6 @@ async def test_serving_chat_could_load_correct_generation_config():
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}],
|
||||
guided_decoding_backend="outlines",
|
||||
)
|
||||
|
||||
with suppress(Exception):
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# imports for guided decoding tests
|
||||
# imports for structured outputs tests
|
||||
import io
|
||||
import json
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import io
|
||||
# imports for guided decoding tests
|
||||
# imports for structured outputs tests
|
||||
import json
|
||||
|
||||
import httpx
|
||||
|
||||
@ -1,84 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for the SamplingParams class.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen1.5-7B"
|
||||
|
||||
|
||||
def test_max_tokens_none():
|
||||
"""max_tokens=None should be allowed"""
|
||||
SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def model_config():
|
||||
return ModelConfig(
|
||||
MODEL_NAME,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_max_tokens():
|
||||
return 4096
|
||||
|
||||
|
||||
def test_sampling_params_from_request_with_no_guided_decoding_backend(
|
||||
model_config, default_max_tokens):
|
||||
# guided_decoding_backend is not present at request level
|
||||
request = ChatCompletionRequest.model_validate({
|
||||
'messages': [{
|
||||
'role': 'user',
|
||||
'content': 'Hello'
|
||||
}],
|
||||
'model':
|
||||
MODEL_NAME,
|
||||
'response_format': {
|
||||
'type': 'json_object',
|
||||
},
|
||||
})
|
||||
|
||||
sampling_params = request.to_sampling_params(
|
||||
default_max_tokens,
|
||||
model_config.logits_processor_pattern,
|
||||
)
|
||||
# we do not expect any backend to be present and the default
|
||||
# guided_decoding_backend at engine level will be used.
|
||||
assert sampling_params.guided_decoding.backend is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("request_level_guided_decoding_backend,expected",
|
||||
[("xgrammar", "xgrammar"), ("guidance", "guidance"),
|
||||
("outlines", "outlines")])
|
||||
def test_sampling_params_from_request_with_guided_decoding_backend(
|
||||
request_level_guided_decoding_backend: str, expected: str,
|
||||
model_config, default_max_tokens):
|
||||
|
||||
request = ChatCompletionRequest.model_validate({
|
||||
'messages': [{
|
||||
'role': 'user',
|
||||
'content': 'Hello'
|
||||
}],
|
||||
'model':
|
||||
MODEL_NAME,
|
||||
'response_format': {
|
||||
'type': 'json_object',
|
||||
},
|
||||
'guided_decoding_backend':
|
||||
request_level_guided_decoding_backend,
|
||||
})
|
||||
|
||||
sampling_params = request.to_sampling_params(
|
||||
default_max_tokens,
|
||||
model_config.logits_processor_pattern,
|
||||
)
|
||||
# backend correctly identified in resulting sampling_params
|
||||
assert sampling_params.guided_decoding.backend == expected
|
||||
@ -68,7 +68,7 @@ EXAMPLE_TOOLS = [
|
||||
def _compile_and_check(tools: list[ChatCompletionToolsParam], sample_output,
|
||||
should_match: bool):
|
||||
self = MagicMock(tool_choice="required", tools=tools)
|
||||
schema = ChatCompletionRequest._get_guided_json_from_tool(self)
|
||||
schema = ChatCompletionRequest._get_json_schema_from_tool(self)
|
||||
assert isinstance(schema, dict)
|
||||
|
||||
# use build_regex_from_schema used in JSONLogitsProcessor to create Guide
|
||||
@ -218,7 +218,7 @@ VALID_TOOLS = [t[0] for t in VALID_TOOL_OUTPUTS]
|
||||
}
|
||||
}, {}], False),
|
||||
])
|
||||
def test_guided_json(sample_output, should_match):
|
||||
def test_structured_outputs_json(sample_output, should_match):
|
||||
_compile_and_check(tools=TypeAdapter(
|
||||
list[ChatCompletionToolsParam]).validate_python(EXAMPLE_TOOLS),
|
||||
sample_output=sample_output,
|
||||
@ -273,8 +273,9 @@ def update_parameters_empty_dict(
|
||||
@pytest.mark.parametrize(
|
||||
"update_parameters",
|
||||
[update_parameters_none, update_parameters_empty_dict])
|
||||
def test_guided_json_without_parameters(sample_output, should_match,
|
||||
update_parameters):
|
||||
def test_structured_outputs_json_without_parameters(sample_output,
|
||||
should_match,
|
||||
update_parameters):
|
||||
updated_tools = [deepcopy(EXAMPLE_TOOLS[0])]
|
||||
tools = TypeAdapter(
|
||||
list[ChatCompletionToolsParam]).validate_python(updated_tools)
|
||||
@ -334,4 +335,4 @@ def test_streaming_output_valid(output, empty_params, delta_len):
|
||||
combined_messages += message.tool_calls[0].function.arguments
|
||||
combined_messages += "}]"
|
||||
assert json.loads(combined_messages) == output
|
||||
assert json.dumps(json.loads(combined_messages)) == output_json
|
||||
assert json.dumps(json.loads(combined_messages)) == output_json
|
||||
|
||||
@ -10,7 +10,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
|
||||
MultiModalKwargsItem, PlaceholderRange)
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
|
||||
from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
@ -1796,11 +1796,11 @@ def test_schedule_skip_tokenizer_init():
|
||||
|
||||
def test_schedule_skip_tokenizer_init_structured_output_request():
|
||||
scheduler = create_scheduler(skip_tokenizer_init=True)
|
||||
guided_params = GuidedDecodingParams(regex="[0-9]+")
|
||||
structured_outputs_params = StructuredOutputsParams(regex="[0-9]+")
|
||||
sampling_params = SamplingParams(
|
||||
ignore_eos=False,
|
||||
max_tokens=16,
|
||||
guided_decoding=guided_params,
|
||||
structured_outputs=structured_outputs_params,
|
||||
)
|
||||
request = Request(
|
||||
request_id="0",
|
||||
|
||||
@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Optional
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
|
||||
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -97,7 +97,7 @@ def _get_test_sampling_params(
|
||||
top_p=0.95,
|
||||
n=n,
|
||||
seed=seed,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
regex="[0-9]+") if structured_outputs else None,
|
||||
) for n in n_list
|
||||
], n_list
|
||||
|
||||
@ -151,7 +151,7 @@ def sample_definition_json_schema():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_guided_choice():
|
||||
def sample_structured_outputs_choices():
|
||||
return [
|
||||
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
|
||||
"Ruby", "Swift", "Kotlin"
|
||||
|
||||
@ -15,12 +15,13 @@ import torch
|
||||
from pydantic import BaseModel
|
||||
|
||||
from tests.reasoning.utils import run_reasoning_extraction
|
||||
from vllm.config import StructuredOutputsConfig
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import TokenizerMode
|
||||
@ -90,7 +91,7 @@ def _load_json(s: str, backend: str) -> str:
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, guided_decoding_backend, tokenizer_mode, speculative_config",
|
||||
"model_name, backend, tokenizer_mode, speculative_config",
|
||||
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE)
|
||||
def test_structured_output(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
@ -99,8 +100,8 @@ def test_structured_output(
|
||||
sample_sql_ebnf: str,
|
||||
sample_sql_lark: str,
|
||||
sample_regex: str,
|
||||
sample_guided_choice: str,
|
||||
guided_decoding_backend: str,
|
||||
sample_structured_outputs_choices: str,
|
||||
backend: str,
|
||||
tokenizer_mode: str,
|
||||
model_name: str,
|
||||
speculative_config: dict[str, Any],
|
||||
@ -115,16 +116,15 @@ def test_structured_output(
|
||||
enforce_eager = bool(not current_platform.is_tpu())
|
||||
# Use a single LLM instance for several scenarios to
|
||||
# speed up the test suite.
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
guided_decoding_disable_any_whitespace=(guided_decoding_backend
|
||||
in {"xgrammar", "guidance"}),
|
||||
seed=120,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
speculative_config=speculative_config)
|
||||
llm = LLM(model=model_name,
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=1024,
|
||||
structured_outputs_config=dict(backend=backend,
|
||||
disable_any_whitespace=backend
|
||||
in {"xgrammar", "guidance"}),
|
||||
seed=120,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
speculative_config=speculative_config)
|
||||
|
||||
#
|
||||
# Test 1: Generate JSON output based on a provided schema
|
||||
@ -132,7 +132,7 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=4096,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
structured_outputs=StructuredOutputsParams(json=sample_json_schema))
|
||||
|
||||
prompt = ("Give an example JSON for an employee profile that fits this "
|
||||
"schema. Make the response as short as possible. Schema: "
|
||||
@ -152,7 +152,7 @@ def test_structured_output(
|
||||
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text is not None
|
||||
if guided_decoding_backend != 'lm-format-enforcer':
|
||||
if backend != 'lm-format-enforcer':
|
||||
assert "\n" not in generated_text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
output_json = json.loads(generated_text)
|
||||
@ -161,12 +161,12 @@ def test_structured_output(
|
||||
#
|
||||
# Test 2: Generate JSON object without a schema
|
||||
#
|
||||
if guided_decoding_backend != "outlines":
|
||||
if backend != "outlines":
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=4096,
|
||||
n=2,
|
||||
guided_decoding=GuidedDecodingParams(json_object=True))
|
||||
structured_outputs=StructuredOutputsParams(json_object=True))
|
||||
|
||||
outputs = llm.generate(prompts=(
|
||||
"Generate a JSON object with curly braces for a person with "
|
||||
@ -195,8 +195,9 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=4096,
|
||||
guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
|
||||
if guided_decoding_backend.startswith("xgrammar"):
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
json=unsupported_json_schema))
|
||||
if backend.startswith("xgrammar"):
|
||||
with pytest.raises(ValueError,
|
||||
match="The provided JSON schema contains features "
|
||||
"not supported by xgrammar."):
|
||||
@ -230,7 +231,7 @@ def test_structured_output(
|
||||
parsed_json = json.loads(generated_text)
|
||||
assert isinstance(parsed_json, dict)
|
||||
|
||||
if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]:
|
||||
if backend not in ["outlines", "lm-format-enforcer"]:
|
||||
#
|
||||
# Test 4: Generate SQL statement using EBNF grammar
|
||||
#
|
||||
@ -238,7 +239,8 @@ def test_structured_output(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf))
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
grammar=sample_sql_ebnf))
|
||||
outputs = llm.generate(
|
||||
("Generate a sql statement that selects col_1 from "
|
||||
"table_1 where it is equal to 1. Make the response as short as "
|
||||
@ -271,7 +273,8 @@ def test_structured_output(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark))
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
grammar=sample_sql_lark))
|
||||
outputs = llm.generate(
|
||||
("Generate a sql statement that selects col_1 from "
|
||||
"table_1 where it is equal to 1. Make the response as short as "
|
||||
@ -309,7 +312,8 @@ def test_structured_output(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar="not a grammar"))
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
grammar="not a grammar"))
|
||||
with pytest.raises(ValueError, match="Failed to convert the grammar "):
|
||||
llm.generate(
|
||||
("Generate a sql statement that selects col_1 from "
|
||||
@ -325,7 +329,7 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(regex=sample_regex))
|
||||
structured_outputs=StructuredOutputsParams(regex=sample_regex))
|
||||
|
||||
prompt = (f"Give an example IPv4 address with this regex: {sample_regex}. "
|
||||
f"Make the response as short as possible.")
|
||||
@ -352,7 +356,8 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
choice=sample_structured_outputs_choices))
|
||||
|
||||
outputs = llm.generate(
|
||||
("The best language for type-safe systems programming is "
|
||||
@ -368,7 +373,7 @@ def test_structured_output(
|
||||
generated_text = output.outputs[0].text
|
||||
print(generated_text)
|
||||
assert generated_text is not None
|
||||
assert generated_text in sample_guided_choice
|
||||
assert generated_text in sample_structured_outputs_choices
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
#
|
||||
@ -378,7 +383,7 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=json_schema))
|
||||
structured_outputs=StructuredOutputsParams(json=json_schema))
|
||||
|
||||
outputs = llm.generate(
|
||||
("Generate a JSON with the brand, model and car_type of the most "
|
||||
@ -422,7 +427,7 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=4096,
|
||||
guided_decoding=GuidedDecodingParams(json=json_schema))
|
||||
structured_outputs=StructuredOutputsParams(json=json_schema))
|
||||
|
||||
outputs = llm.generate(
|
||||
("Generate a description of a frog using 50 characters. "
|
||||
@ -444,7 +449,7 @@ def test_structured_output(
|
||||
output_json = json.loads(generated_text)
|
||||
jsonschema.validate(instance=output_json, schema=json_schema)
|
||||
|
||||
if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]:
|
||||
if backend not in ["outlines", "lm-format-enforcer"]:
|
||||
#
|
||||
# Test 11: Generate structured output using structural_tag format
|
||||
#
|
||||
@ -470,7 +475,7 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=4096,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
structural_tag=json.dumps(structural_tag_config)))
|
||||
|
||||
prompt = """
|
||||
@ -547,7 +552,7 @@ Make the response as short as possible.
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501
|
||||
"model_name, backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501
|
||||
[
|
||||
("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto",
|
||||
"deepseek_r1", NGRAM_SPEC_CONFIG),
|
||||
@ -556,7 +561,7 @@ Make the response as short as possible.
|
||||
)
|
||||
def test_structured_output_with_reasoning_matrices(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
guided_decoding_backend: str,
|
||||
backend: str,
|
||||
tokenizer_mode: TokenizerMode,
|
||||
reasoning_parser: str,
|
||||
model_name: str,
|
||||
@ -576,10 +581,11 @@ def test_structured_output_with_reasoning_matrices(
|
||||
enforce_eager=bool(not current_platform.is_tpu()),
|
||||
max_model_len=1024,
|
||||
max_num_seqs=16,
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
guided_decoding_disable_any_whitespace=True,
|
||||
structured_outputs_config=dict(backend=backend,
|
||||
disable_any_whitespace=backend
|
||||
in {"xgrammar", "guidance"},
|
||||
reasoning_parser=reasoning_parser),
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
reasoning_parser=reasoning_parser,
|
||||
speculative_config=speculative_config,
|
||||
)
|
||||
tokenizer = llm.get_tokenizer()
|
||||
@ -603,7 +609,7 @@ def test_structured_output_with_reasoning_matrices(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.1,
|
||||
max_tokens=8192,
|
||||
guided_decoding=GuidedDecodingParams(json=reasoning_schema),
|
||||
structured_outputs=StructuredOutputsParams(json=reasoning_schema),
|
||||
)
|
||||
outputs = llm.generate(
|
||||
[reasoning_prompt],
|
||||
@ -640,13 +646,14 @@ def test_structured_output_auto_mode(
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend="auto",
|
||||
structured_outputs_config=dict(backend="auto"),
|
||||
tokenizer_mode=tokenizer_mode)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
json=unsupported_json_schema))
|
||||
|
||||
prompts = (
|
||||
"Give an example JSON object for a grade "
|
||||
@ -681,9 +688,10 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend="guidance",
|
||||
guided_decoding_disable_any_whitespace=True,
|
||||
guided_decoding_disable_additional_properties=True)
|
||||
structured_outputs_config=dict(
|
||||
backend="guidance",
|
||||
disable_any_whitespace=True,
|
||||
disable_additional_properties=True))
|
||||
|
||||
schema = {
|
||||
'type': 'object',
|
||||
@ -709,14 +717,15 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
|
||||
"<|im_end|>\n<|im_start|>assistant\n")
|
||||
|
||||
def generate_with_backend(backend):
|
||||
guided_params = GuidedDecodingParams(
|
||||
structured_outputs_params = StructuredOutputsParams(
|
||||
json=schema,
|
||||
backend=backend,
|
||||
disable_any_whitespace=True,
|
||||
disable_additional_properties=True)
|
||||
sampling_params = SamplingParams(temperature=0,
|
||||
max_tokens=256,
|
||||
guided_decoding=guided_params)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=256,
|
||||
structured_outputs=structured_outputs_params)
|
||||
|
||||
outputs = llm.generate(prompt, sampling_params=sampling_params)
|
||||
assert outputs is not None
|
||||
@ -736,12 +745,11 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
|
||||
assert "a6" not in generated
|
||||
|
||||
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["guidance", "xgrammar", "outlines"])
|
||||
def test_structured_output_batched_with_non_guided_requests(
|
||||
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
|
||||
def test_structured_output_batched_with_non_structured_outputs_requests(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
sample_json_schema: dict[str, Any],
|
||||
guided_decoding_backend: str,
|
||||
backend: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
@ -753,24 +761,25 @@ def test_structured_output_batched_with_non_guided_requests(
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
guided_decoding_disable_any_whitespace=(guided_decoding_backend
|
||||
in {"xgrammar", "guidance"}),
|
||||
structured_outputs_config=StructuredOutputsConfig(
|
||||
backend=backend,
|
||||
disable_any_whitespace=backend in {"xgrammar", "guidance"},
|
||||
),
|
||||
)
|
||||
|
||||
guided_prompt = (
|
||||
structured_outputs_prompt = (
|
||||
"Give an example JSON for an employee profile that fits this "
|
||||
"schema. Make the response as short as possible. Schema: "
|
||||
f"{sample_json_schema}")
|
||||
|
||||
non_guided_prompt = "The diameter of the Earth in kilometers is "
|
||||
non_structured_outputs_prompt = "The diameter of the Earth in kilometers is "
|
||||
|
||||
prompts = [guided_prompt, non_guided_prompt]
|
||||
prompts = [structured_outputs_prompt, non_structured_outputs_prompt]
|
||||
sampling_params = [
|
||||
SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=400,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema)),
|
||||
SamplingParams(temperature=1.0,
|
||||
max_tokens=400,
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
json=sample_json_schema)),
|
||||
# No max tokens, temp=0 to assert on contents
|
||||
SamplingParams(
|
||||
seed=42,
|
||||
@ -801,16 +810,16 @@ def test_structured_output_batched_with_non_guided_requests(
|
||||
print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}")
|
||||
|
||||
if index == 0:
|
||||
# First prompt is guided, expect valid JSON
|
||||
# First prompt is structured outputs, expect valid JSON
|
||||
assert "\n" not in generated_text
|
||||
output_json = json.loads(generated_text)
|
||||
jsonschema.validate(instance=output_json,
|
||||
schema=sample_json_schema)
|
||||
else:
|
||||
# Second prompt is not guided, expect valid output
|
||||
# Second prompt is not structured outputs, expect valid output
|
||||
# Cannot assert on exact output, but we can expect it to be factual
|
||||
assert "12,742" in generated_text
|
||||
|
||||
# non-guided requests should not return a valid JSON here
|
||||
# non-structured outputs requests should not return a valid JSON here
|
||||
with pytest.raises(ValueError):
|
||||
output_json = json.loads(generated_text)
|
||||
|
||||
@ -77,7 +77,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={"guided_json": invalid_json_schema},
|
||||
extra_body={"structured_outputs": {
|
||||
"json": invalid_json_schema
|
||||
}},
|
||||
)
|
||||
|
||||
|
||||
@ -99,7 +101,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={
|
||||
"guided_regex": r"[.*",
|
||||
"structured_outputs": {
|
||||
"regex": r"[.*"
|
||||
},
|
||||
"stop": ["\n"]
|
||||
},
|
||||
)
|
||||
@ -134,5 +138,9 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={"guided_grammar": invalid_simplified_sql_grammar},
|
||||
extra_body={
|
||||
"structured_outputs": {
|
||||
"grammar": invalid_simplified_sql_grammar
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
@ -627,7 +627,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
|
||||
await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
extra_body={"guided_json": invalid_json_schema},
|
||||
extra_body={"structured_outputs": {
|
||||
"json": invalid_json_schema
|
||||
}},
|
||||
)
|
||||
|
||||
|
||||
@ -646,7 +648,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
extra_body={
|
||||
"guided_regex": r"[.*",
|
||||
"structured_outputs": {
|
||||
"regex": r"[.*"
|
||||
},
|
||||
"stop": ["\n"]
|
||||
},
|
||||
)
|
||||
@ -678,7 +682,11 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
|
||||
await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
extra_body={"guided_grammar": invalid_simplified_sql_grammar},
|
||||
extra_body={
|
||||
"structured_outputs": {
|
||||
"grammar": invalid_simplified_sql_grammar
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -2277,34 +2277,34 @@ def get_served_model_name(model: str,
|
||||
return served_model_name
|
||||
|
||||
|
||||
GuidedDecodingBackend = Literal["auto", "xgrammar", "guidance", "outlines",
|
||||
"lm-format-enforcer"]
|
||||
StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines",
|
||||
"lm-format-enforcer"]
|
||||
|
||||
|
||||
@config
|
||||
@dataclass
|
||||
class DecodingConfig:
|
||||
"""Dataclass which contains the decoding strategy of the engine."""
|
||||
class StructuredOutputsConfig:
|
||||
"""Dataclass which contains structured outputs config for the engine."""
|
||||
|
||||
backend: GuidedDecodingBackend = "auto"
|
||||
"""Which engine will be used for guided decoding (JSON schema / regex etc)
|
||||
by default. With "auto", we will make opinionated choices based on request
|
||||
contents and what the backend libraries currently support, so the behavior
|
||||
is subject to change in each release."""
|
||||
backend: StructuredOutputsBackend = "auto"
|
||||
"""Which engine will be used for structured outputs (e.g. JSON schema,
|
||||
regex, etc) by default. With "auto", we will make opinionated choices
|
||||
based on request contents and what the backend libraries currently support,
|
||||
so the behavior is subject to change in each release."""
|
||||
|
||||
disable_fallback: bool = False
|
||||
"""If `True`, vLLM will not fallback to a different backend on error."""
|
||||
|
||||
disable_any_whitespace: bool = False
|
||||
"""If `True`, the model will not generate any whitespace during guided
|
||||
decoding. This is only supported for xgrammar and guidance backends."""
|
||||
"""If `True`, the model will not generate any whitespace during structured
|
||||
outputs. This is only supported for xgrammar and guidance backends."""
|
||||
|
||||
disable_additional_properties: bool = False
|
||||
"""If `True`, the `guidance` backend will not use `additionalProperties`
|
||||
in the JSON schema. This is only supported for the `guidance` backend and
|
||||
is used to better align its behaviour with `outlines` and `xgrammar`."""
|
||||
|
||||
reasoning_backend: str = ""
|
||||
reasoning_parser: str = ""
|
||||
"""Select the reasoning parser depending on the model that you're using.
|
||||
This is used to parse the reasoning content into OpenAI API format."""
|
||||
|
||||
@ -2451,8 +2451,9 @@ class VllmConfig:
|
||||
"""LoRA configuration."""
|
||||
speculative_config: Optional[SpeculativeConfig] = None
|
||||
"""Speculative decoding configuration."""
|
||||
decoding_config: DecodingConfig = field(default_factory=DecodingConfig)
|
||||
"""Decoding configuration."""
|
||||
structured_outputs_config: StructuredOutputsConfig = field(
|
||||
default_factory=StructuredOutputsConfig)
|
||||
"""Structured outputs configuration."""
|
||||
observability_config: Optional[ObservabilityConfig] = None
|
||||
"""Observability configuration."""
|
||||
quant_config: Optional[QuantizationConfig] = None
|
||||
@ -2543,8 +2544,8 @@ class VllmConfig:
|
||||
vllm_factors.append(self.speculative_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.decoding_config:
|
||||
vllm_factors.append(self.decoding_config.compute_hash())
|
||||
if self.structured_outputs_config:
|
||||
vllm_factors.append(self.structured_outputs_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.observability_config:
|
||||
@ -3063,7 +3064,7 @@ class VllmConfig:
|
||||
f"enforce_eager={self.model_config.enforce_eager}, "
|
||||
f"kv_cache_dtype={self.cache_config.cache_dtype}, "
|
||||
f"device_config={self.device_config.device}, "
|
||||
f"decoding_config={self.decoding_config!r}, "
|
||||
f"structured_outputs_config={self.structured_outputs_config!r}, "
|
||||
f"observability_config={self.observability_config!r}, "
|
||||
f"seed={self.model_config.seed}, "
|
||||
f"served_model_name={self.model_config.served_model_name}, "
|
||||
|
||||
@ -22,17 +22,16 @@ from typing_extensions import TypeIs, deprecated
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
|
||||
ConfigType, ConvertOption, DecodingConfig,
|
||||
DetailedTraceModules, Device, DeviceConfig,
|
||||
DistributedExecutorBackend, EPLBConfig,
|
||||
GuidedDecodingBackend, HfOverrides, KVEventsConfig,
|
||||
ConfigType, ConvertOption, DetailedTraceModules,
|
||||
Device, DeviceConfig, DistributedExecutorBackend,
|
||||
EPLBConfig, HfOverrides, KVEventsConfig,
|
||||
KVTransferConfig, LoadConfig, LogprobsMode,
|
||||
LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,
|
||||
ModelDType, ModelImpl, ObservabilityConfig,
|
||||
ParallelConfig, PoolerConfig, PrefixCachingHashAlgo,
|
||||
RunnerOption, SchedulerConfig, SchedulerPolicy,
|
||||
SpeculativeConfig, TaskOption, TokenizerMode,
|
||||
VllmConfig, get_attr_docs)
|
||||
SpeculativeConfig, StructuredOutputsConfig,
|
||||
TaskOption, TokenizerMode, VllmConfig, get_attr_docs)
|
||||
from vllm.config.multimodal import MMCacheType, MultiModalConfig
|
||||
from vllm.config.parallel import ExpertPlacementStrategy
|
||||
from vllm.config.utils import get_field
|
||||
@ -418,12 +417,15 @@ class EngineArgs:
|
||||
disable_hybrid_kv_cache_manager: bool = (
|
||||
SchedulerConfig.disable_hybrid_kv_cache_manager)
|
||||
|
||||
guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend
|
||||
guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback
|
||||
guided_decoding_disable_any_whitespace: bool = \
|
||||
DecodingConfig.disable_any_whitespace
|
||||
guided_decoding_disable_additional_properties: bool = \
|
||||
DecodingConfig.disable_additional_properties
|
||||
structured_outputs_config: StructuredOutputsConfig = get_field(
|
||||
VllmConfig, "structured_outputs_config")
|
||||
reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
|
||||
# Deprecated guided decoding fields
|
||||
guided_decoding_backend: Optional[str] = None
|
||||
guided_decoding_disable_fallback: Optional[bool] = None
|
||||
guided_decoding_disable_any_whitespace: Optional[bool] = None
|
||||
guided_decoding_disable_additional_properties: Optional[bool] = None
|
||||
|
||||
logits_processor_pattern: Optional[
|
||||
str] = ModelConfig.logits_processor_pattern
|
||||
|
||||
@ -462,7 +464,6 @@ class EngineArgs:
|
||||
|
||||
additional_config: dict[str, Any] = \
|
||||
get_field(VllmConfig, "additional_config")
|
||||
reasoning_parser: str = DecodingConfig.reasoning_backend
|
||||
|
||||
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
|
||||
pt_load_map_location: str = LoadConfig.pt_load_map_location
|
||||
@ -618,28 +619,29 @@ class EngineArgs:
|
||||
load_group.add_argument('--pt-load-map-location',
|
||||
**load_kwargs["pt_load_map_location"])
|
||||
|
||||
# Guided decoding arguments
|
||||
guided_decoding_kwargs = get_kwargs(DecodingConfig)
|
||||
guided_decoding_group = parser.add_argument_group(
|
||||
title="DecodingConfig",
|
||||
description=DecodingConfig.__doc__,
|
||||
# Structured outputs arguments
|
||||
structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig)
|
||||
structured_outputs_group = parser.add_argument_group(
|
||||
title="StructuredOutputsConfig",
|
||||
description=StructuredOutputsConfig.__doc__,
|
||||
)
|
||||
guided_decoding_group.add_argument("--guided-decoding-backend",
|
||||
**guided_decoding_kwargs["backend"])
|
||||
guided_decoding_group.add_argument(
|
||||
"--guided-decoding-disable-fallback",
|
||||
**guided_decoding_kwargs["disable_fallback"])
|
||||
guided_decoding_group.add_argument(
|
||||
"--guided-decoding-disable-any-whitespace",
|
||||
**guided_decoding_kwargs["disable_any_whitespace"])
|
||||
guided_decoding_group.add_argument(
|
||||
"--guided-decoding-disable-additional-properties",
|
||||
**guided_decoding_kwargs["disable_additional_properties"])
|
||||
guided_decoding_group.add_argument(
|
||||
structured_outputs_group.add_argument(
|
||||
"--reasoning-parser",
|
||||
# This choice is a special case because it's not static
|
||||
choices=list(ReasoningParserManager.reasoning_parsers),
|
||||
**guided_decoding_kwargs["reasoning_backend"])
|
||||
**structured_outputs_kwargs["reasoning_parser"])
|
||||
# Deprecated guided decoding arguments
|
||||
for arg, type in [
|
||||
("--guided-decoding-backend", str),
|
||||
("--guided-decoding-disable-fallback", bool),
|
||||
("--guided-decoding-disable-any-whitespace", bool),
|
||||
("--guided-decoding-disable-additional-properties", bool),
|
||||
]:
|
||||
structured_outputs_group.add_argument(
|
||||
arg,
|
||||
type=type,
|
||||
help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."),
|
||||
deprecated=True)
|
||||
|
||||
# Parallel arguments
|
||||
parallel_kwargs = get_kwargs(ParallelConfig)
|
||||
@ -934,6 +936,8 @@ class EngineArgs:
|
||||
**vllm_kwargs["compilation_config"])
|
||||
vllm_group.add_argument("--additional-config",
|
||||
**vllm_kwargs["additional_config"])
|
||||
vllm_group.add_argument('--structured-outputs-config',
|
||||
**vllm_kwargs["structured_outputs_config"])
|
||||
|
||||
# Other arguments
|
||||
parser.add_argument('--disable-log-stats',
|
||||
@ -1421,14 +1425,25 @@ class EngineArgs:
|
||||
|
||||
load_config = self.create_load_config()
|
||||
|
||||
decoding_config = DecodingConfig(
|
||||
backend=self.guided_decoding_backend,
|
||||
disable_fallback=self.guided_decoding_disable_fallback,
|
||||
disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
|
||||
disable_additional_properties=\
|
||||
self.guided_decoding_disable_additional_properties,
|
||||
reasoning_backend=self.reasoning_parser
|
||||
)
|
||||
# Pass reasoning_parser into StructuredOutputsConfig
|
||||
if self.reasoning_parser:
|
||||
self.structured_outputs_config.reasoning_parser = \
|
||||
self.reasoning_parser
|
||||
|
||||
# Forward the deprecated CLI args to the StructuredOutputsConfig
|
||||
so_config = self.structured_outputs_config
|
||||
if self.guided_decoding_backend is not None:
|
||||
so_config.guided_decoding_backend = \
|
||||
self.guided_decoding_backend
|
||||
if self.guided_decoding_disable_fallback is not None:
|
||||
so_config.guided_decoding_disable_fallback = \
|
||||
self.guided_decoding_disable_fallback
|
||||
if self.guided_decoding_disable_any_whitespace is not None:
|
||||
so_config.guided_decoding_disable_any_whitespace = \
|
||||
self.guided_decoding_disable_any_whitespace
|
||||
if self.guided_decoding_disable_additional_properties is not None:
|
||||
so_config.guided_decoding_disable_additional_properties = \
|
||||
self.guided_decoding_disable_additional_properties
|
||||
|
||||
observability_config = ObservabilityConfig(
|
||||
show_hidden_metrics_for_version=(
|
||||
@ -1446,7 +1461,7 @@ class EngineArgs:
|
||||
lora_config=lora_config,
|
||||
speculative_config=speculative_config,
|
||||
load_config=load_config,
|
||||
decoding_config=decoding_config,
|
||||
structured_outputs_config=self.structured_outputs_config,
|
||||
observability_config=observability_config,
|
||||
compilation_config=self.compilation_config,
|
||||
kv_transfer_config=self.kv_transfer_config,
|
||||
|
||||
@ -10,9 +10,8 @@ from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
|
||||
from weakref import ReferenceType
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig,
|
||||
from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
|
||||
SchedulerConfig, VllmConfig)
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.core.scheduler import SchedulerOutputs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_timeout import asyncio_timeout
|
||||
@ -955,10 +954,6 @@ class AsyncLLMEngine(EngineClient):
|
||||
"""Get the parallel configuration of the vLLM engine."""
|
||||
return self.engine.get_parallel_config()
|
||||
|
||||
async def get_decoding_config(self) -> DecodingConfig:
|
||||
"""Get the decoding configuration of the vLLM engine."""
|
||||
return self.engine.get_decoding_config()
|
||||
|
||||
async def get_scheduler_config(self) -> SchedulerConfig:
|
||||
"""Get the scheduling configuration of the vLLM engine."""
|
||||
return self.engine.get_scheduler_config()
|
||||
|
||||
@ -16,9 +16,8 @@ import torch
|
||||
from typing_extensions import TypeVar
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
|
||||
from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig,
|
||||
ParallelConfig, SchedulerConfig, VllmConfig)
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.metrics_types import StatLoggerBase, Stats
|
||||
@ -213,8 +212,7 @@ class LLMEngine:
|
||||
self.device_config = vllm_config.device_config
|
||||
self.speculative_config = vllm_config.speculative_config # noqa
|
||||
self.load_config = vllm_config.load_config
|
||||
self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa
|
||||
)
|
||||
self.structured_outputs_config = vllm_config.structured_outputs_config
|
||||
self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa
|
||||
)
|
||||
|
||||
@ -364,10 +362,9 @@ class LLMEngine:
|
||||
self.observability_config.otlp_traces_endpoint)
|
||||
|
||||
# Initialize reasoning parser if reasoning backend is set.
|
||||
if self.decoding_config.reasoning_backend and \
|
||||
self.tokenizer:
|
||||
if self.structured_outputs_config.reasoning_parser and self.tokenizer:
|
||||
reasoner_class = ReasoningParserManager.get_reasoning_parser(
|
||||
self.decoding_config.reasoning_backend)
|
||||
self.structured_outputs_config.reasoning_parser)
|
||||
self.reasoner: ReasoningParser = reasoner_class(
|
||||
self.tokenizer.get_lora_tokenizer())
|
||||
|
||||
@ -381,7 +378,8 @@ class LLMEngine:
|
||||
self.seq_counter,
|
||||
stop_checker=StopChecker(
|
||||
self.scheduler_config.max_model_len,
|
||||
self.reasoner if self.decoding_config.reasoning_backend
|
||||
self.reasoner
|
||||
if self.structured_outputs_config.reasoning_parser
|
||||
and self.tokenizer else None,
|
||||
),
|
||||
))
|
||||
@ -772,10 +770,6 @@ class LLMEngine:
|
||||
"""Gets the parallel configuration."""
|
||||
return self.parallel_config
|
||||
|
||||
def get_decoding_config(self) -> DecodingConfig:
|
||||
"""Gets the decoding configuration."""
|
||||
return self.decoding_config
|
||||
|
||||
def get_scheduler_config(self) -> SchedulerConfig:
|
||||
"""Gets the scheduler configuration."""
|
||||
return self.scheduler_config
|
||||
|
||||
@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
|
||||
from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union
|
||||
|
||||
from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
|
||||
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.core.scheduler import SchedulerOutputs
|
||||
from vllm.inputs.data import PromptType, TokensPrompt
|
||||
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
|
||||
@ -248,11 +248,6 @@ class EngineClient(ABC):
|
||||
"""Get the model configuration of the vLLM engine."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
async def get_decoding_config(self) -> DecodingConfig:
|
||||
"""Get the decoding configuration of the vLLM engine."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
async def get_input_preprocessor(self) -> InputPreprocessor:
|
||||
"""Get the input processor of the vLLM engine."""
|
||||
|
||||
@ -15,8 +15,8 @@ import vllm.envs as envs
|
||||
from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
|
||||
BeamSearchSequence,
|
||||
create_sort_beams_key_function)
|
||||
from vllm.config import (CompilationConfig, ModelDType, TokenizerMode,
|
||||
is_init_field)
|
||||
from vllm.config import (CompilationConfig, ModelDType,
|
||||
StructuredOutputsConfig, TokenizerMode, is_init_field)
|
||||
from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
|
||||
PoolerConfig, RunnerOption)
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
@ -192,6 +192,8 @@ class LLM:
|
||||
hf_overrides: Optional[HfOverrides] = None,
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||
override_pooler_config: Optional[PoolerConfig] = None,
|
||||
structured_outputs_config: Optional[Union[dict[
|
||||
str, Any], StructuredOutputsConfig]] = None,
|
||||
kv_cache_memory_bytes: Optional[int] = None,
|
||||
compilation_config: Optional[Union[int, dict[str, Any],
|
||||
CompilationConfig]] = None,
|
||||
@ -236,14 +238,30 @@ class LLM:
|
||||
compilation_config_instance = CompilationConfig(
|
||||
level=compilation_config)
|
||||
elif isinstance(compilation_config, dict):
|
||||
predicate = lambda x: is_init_field(CompilationConfig, x[0])
|
||||
compilation_config_instance = CompilationConfig(
|
||||
**dict(filter(predicate, compilation_config.items())))
|
||||
**{
|
||||
k: v
|
||||
for k, v in compilation_config.items()
|
||||
if is_init_field(CompilationConfig, k)
|
||||
})
|
||||
else:
|
||||
compilation_config_instance = compilation_config
|
||||
else:
|
||||
compilation_config_instance = CompilationConfig()
|
||||
|
||||
if structured_outputs_config is not None:
|
||||
if isinstance(structured_outputs_config, dict):
|
||||
structured_outputs_instance = StructuredOutputsConfig(
|
||||
**{
|
||||
k: v
|
||||
for k, v in structured_outputs_config.items()
|
||||
if is_init_field(StructuredOutputsConfig, k)
|
||||
})
|
||||
else:
|
||||
structured_outputs_instance = structured_outputs_config
|
||||
else:
|
||||
structured_outputs_instance = StructuredOutputsConfig()
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model,
|
||||
runner=runner,
|
||||
@ -271,6 +289,7 @@ class LLM:
|
||||
hf_overrides=hf_overrides,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
override_pooler_config=override_pooler_config,
|
||||
structured_outputs_config=structured_outputs_instance,
|
||||
compilation_config=compilation_config_instance,
|
||||
logits_processors=logits_processors,
|
||||
**kwargs,
|
||||
|
||||
@ -1678,7 +1678,7 @@ async def init_app_state(
|
||||
enable_auto_tools=args.enable_auto_tool_choice,
|
||||
tool_parser=args.tool_call_parser,
|
||||
tool_server=tool_server,
|
||||
reasoning_parser=args.reasoning_parser,
|
||||
reasoning_parser=args.structured_outputs_config.reasoning_parser,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
enable_force_include_usage=args.enable_force_include_usage,
|
||||
enable_log_outputs=args.enable_log_outputs,
|
||||
@ -1697,7 +1697,7 @@ async def init_app_state(
|
||||
exclude_tools_when_tool_choice_none=args.
|
||||
exclude_tools_when_tool_choice_none,
|
||||
tool_parser=args.tool_call_parser,
|
||||
reasoning_parser=args.reasoning_parser,
|
||||
reasoning_parser=args.structured_outputs_config.reasoning_parser,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
enable_force_include_usage=args.enable_force_include_usage,
|
||||
enable_log_outputs=args.enable_log_outputs,
|
||||
@ -1800,10 +1800,10 @@ def validate_api_server_args(args):
|
||||
f"(chose from {{ {','.join(valid_tool_parses)} }})")
|
||||
|
||||
valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
|
||||
if args.reasoning_parser \
|
||||
and args.reasoning_parser not in valid_reasoning_parses:
|
||||
if ((reasoning_parser := args.structured_outputs_config.reasoning_parser)
|
||||
and reasoning_parser not in valid_reasoning_parses):
|
||||
raise KeyError(
|
||||
f"invalid reasoning parser: {args.reasoning_parser} "
|
||||
f"invalid reasoning parser: {reasoning_parser} "
|
||||
f"(chose from {{ {','.join(valid_reasoning_parses)} }})")
|
||||
|
||||
|
||||
|
||||
@ -54,8 +54,8 @@ from vllm.entrypoints.score_utils import (ScoreContentPartParam,
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
|
||||
RequestOutputKind, SamplingParams)
|
||||
from vllm.sampling_params import (BeamSearchParams, RequestOutputKind,
|
||||
SamplingParams, StructuredOutputsParams)
|
||||
from vllm.utils import random_uuid, resolve_obj_by_qualname
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -373,11 +373,12 @@ class ResponsesRequest(OpenAIBaseModel):
|
||||
stop_token_ids = default_sampling_params.get("stop_token_ids")
|
||||
|
||||
# Structured output
|
||||
guided_decoding = None
|
||||
structured_outputs = None
|
||||
if self.text is not None and self.text.format is not None:
|
||||
response_format = self.text.format
|
||||
if response_format.type == "json_schema":
|
||||
guided_decoding = GuidedDecodingParams.from_optional(
|
||||
if (response_format.type == "json_schema"
|
||||
and response_format.schema_ is not None):
|
||||
structured_outputs = StructuredOutputsParams(
|
||||
json=response_format.schema_)
|
||||
elif response_format.type == "json_object":
|
||||
raise NotImplementedError("json_object is not supported")
|
||||
@ -392,7 +393,7 @@ class ResponsesRequest(OpenAIBaseModel):
|
||||
stop_token_ids=stop_token_ids,
|
||||
output_kind=(RequestOutputKind.DELTA
|
||||
if self.stream else RequestOutputKind.FINAL_ONLY),
|
||||
guided_decoding=guided_decoding,
|
||||
structured_outputs=structured_outputs,
|
||||
)
|
||||
|
||||
def is_include_output_logprobs(self) -> bool:
|
||||
@ -547,42 +548,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
default=None,
|
||||
description=("Additional kwargs to pass to the HF processor."),
|
||||
)
|
||||
guided_json: Optional[Union[str, dict, BaseModel]] = Field(
|
||||
structured_outputs: Optional[StructuredOutputsParams] = Field(
|
||||
default=None,
|
||||
description=("If specified, the output will follow the JSON schema."),
|
||||
)
|
||||
guided_regex: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, the output will follow the regex pattern."),
|
||||
)
|
||||
guided_choice: Optional[list[str]] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, the output will be exactly one of the choices."),
|
||||
)
|
||||
guided_grammar: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, the output will follow the context free grammar."),
|
||||
)
|
||||
structural_tag: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, the output will follow the structural tag schema."),
|
||||
)
|
||||
guided_decoding_backend: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, will override the default guided decoding backend "
|
||||
"of the server for this specific request. If set, must be either "
|
||||
"'outlines' / 'lm-format-enforcer'"),
|
||||
)
|
||||
guided_whitespace_pattern: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, will override the default whitespace pattern "
|
||||
"for guided json decoding."),
|
||||
description="Additional kwargs for structured outputs",
|
||||
)
|
||||
priority: int = Field(
|
||||
default=0,
|
||||
@ -701,31 +669,33 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
if prompt_logprobs is None and self.echo:
|
||||
prompt_logprobs = self.top_logprobs
|
||||
|
||||
guided_json_object = None
|
||||
if self.response_format is not None:
|
||||
if self.response_format.type == "json_object":
|
||||
guided_json_object = True
|
||||
elif self.response_format.type == "json_schema":
|
||||
json_schema = self.response_format.json_schema
|
||||
assert json_schema is not None
|
||||
self.guided_json = json_schema.json_schema
|
||||
elif self.response_format.type == "structural_tag":
|
||||
structural_tag = self.response_format
|
||||
assert structural_tag is not None and isinstance(
|
||||
structural_tag, StructuralTagResponseFormat)
|
||||
s_tag_obj = structural_tag.model_dump(by_alias=True)
|
||||
self.structural_tag = json.dumps(s_tag_obj)
|
||||
response_format = self.response_format
|
||||
json_schema_from_tool = self._get_json_schema_from_tool()
|
||||
if response_format is not None or json_schema_from_tool is not None:
|
||||
# If structured outputs wasn't already enabled,
|
||||
# we must enable it for these features to work
|
||||
if self.structured_outputs is None:
|
||||
self.structured_outputs = StructuredOutputsParams()
|
||||
|
||||
guided_decoding = GuidedDecodingParams.from_optional(
|
||||
json=self._get_guided_json_from_tool() or self.guided_json,
|
||||
regex=self.guided_regex,
|
||||
choice=self.guided_choice,
|
||||
grammar=self.guided_grammar,
|
||||
json_object=guided_json_object,
|
||||
backend=self.guided_decoding_backend,
|
||||
whitespace_pattern=self.guided_whitespace_pattern,
|
||||
structural_tag=self.structural_tag,
|
||||
)
|
||||
# Set structured output params for response format
|
||||
if response_format is not None:
|
||||
if response_format.type == "json_object":
|
||||
self.structured_outputs.json_object = True
|
||||
elif response_format.type == "json_schema":
|
||||
json_schema = response_format.json_schema
|
||||
assert json_schema is not None
|
||||
self.structured_outputs.json = json_schema.json_schema
|
||||
elif response_format.type == "structural_tag":
|
||||
structural_tag = response_format
|
||||
assert structural_tag is not None and isinstance(
|
||||
structural_tag, StructuralTagResponseFormat)
|
||||
s_tag_obj = structural_tag.model_dump(by_alias=True)
|
||||
self.structured_outputs.structural_tag = json.dumps(
|
||||
s_tag_obj)
|
||||
|
||||
# Set structured output params for tool calling
|
||||
if json_schema_from_tool is not None:
|
||||
self.structured_outputs.json = json_schema_from_tool
|
||||
|
||||
extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
|
||||
if self.kv_transfer_params:
|
||||
@ -757,15 +727,14 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
output_kind=RequestOutputKind.DELTA if self.stream \
|
||||
else RequestOutputKind.FINAL_ONLY,
|
||||
guided_decoding=guided_decoding,
|
||||
structured_outputs=self.structured_outputs,
|
||||
logit_bias=self.logit_bias,
|
||||
bad_words= self.bad_words,
|
||||
bad_words=self.bad_words,
|
||||
allowed_token_ids=self.allowed_token_ids,
|
||||
extra_args=extra_args or None,
|
||||
)
|
||||
|
||||
def _get_guided_json_from_tool(
|
||||
self) -> Optional[Union[str, dict, BaseModel]]:
|
||||
def _get_json_schema_from_tool(self) -> Optional[Union[str, dict]]:
|
||||
# user has chosen to not use any tool
|
||||
if self.tool_choice == "none" or self.tools is None:
|
||||
return None
|
||||
@ -875,28 +844,31 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_guided_decoding_count(cls, data):
|
||||
def check_structured_outputs_count(cls, data):
|
||||
if isinstance(data, ValueError):
|
||||
raise data
|
||||
|
||||
guide_count = sum([
|
||||
"guided_json" in data and data["guided_json"] is not None,
|
||||
"guided_regex" in data and data["guided_regex"] is not None,
|
||||
"guided_choice" in data and data["guided_choice"] is not None
|
||||
])
|
||||
# you can only use one kind of guided decoding
|
||||
if guide_count > 1:
|
||||
if "structured_outputs" not in data:
|
||||
return data
|
||||
|
||||
structured_outputs_kwargs = data['structured_outputs']
|
||||
count = sum(
|
||||
structured_outputs_kwargs.get(k) is not None
|
||||
for k in ("json", "regex", "choice"))
|
||||
# you can only use one kind of constraints for structured outputs
|
||||
if count > 1:
|
||||
raise ValueError(
|
||||
"You can only use one kind of guided decoding "
|
||||
"('guided_json', 'guided_regex' or 'guided_choice').")
|
||||
# you can only either use guided decoding or tools, not both
|
||||
if guide_count > 1 and data.get("tool_choice", "none") not in (
|
||||
"You can only use one kind of constraints for structured "
|
||||
"outputs ('json', 'regex' or 'choice').")
|
||||
# you can only either use structured outputs or tools, not both
|
||||
if count > 1 and data.get("tool_choice", "none") not in (
|
||||
"none",
|
||||
"auto",
|
||||
"required",
|
||||
):
|
||||
raise ValueError(
|
||||
"You can only either use guided decoding or tools, not both.")
|
||||
"You can only either use constraints for structured outputs "
|
||||
"or tools, not both.")
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@ -1049,37 +1021,9 @@ class CompletionRequest(OpenAIBaseModel):
|
||||
", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
|
||||
),
|
||||
)
|
||||
guided_json: Optional[Union[str, dict, BaseModel]] = Field(
|
||||
structured_outputs: Optional[StructuredOutputsParams] = Field(
|
||||
default=None,
|
||||
description="If specified, the output will follow the JSON schema.",
|
||||
)
|
||||
guided_regex: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, the output will follow the regex pattern."),
|
||||
)
|
||||
guided_choice: Optional[list[str]] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, the output will be exactly one of the choices."),
|
||||
)
|
||||
guided_grammar: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, the output will follow the context free grammar."),
|
||||
)
|
||||
guided_decoding_backend: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, will override the default guided decoding backend "
|
||||
"of the server for this specific request. If set, must be one of "
|
||||
"'outlines' / 'lm-format-enforcer'"),
|
||||
)
|
||||
guided_whitespace_pattern: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, will override the default whitespace pattern "
|
||||
"for guided json decoding."),
|
||||
description="Additional kwargs for structured outputs",
|
||||
)
|
||||
priority: int = Field(
|
||||
default=0,
|
||||
@ -1210,20 +1154,10 @@ class CompletionRequest(OpenAIBaseModel):
|
||||
|
||||
echo_without_generation = self.echo and self.max_tokens == 0
|
||||
|
||||
guided_json_object = None
|
||||
if (self.response_format is not None
|
||||
if (self.structured_outputs is not None
|
||||
and self.response_format is not None
|
||||
and self.response_format.type == "json_object"):
|
||||
guided_json_object = True
|
||||
|
||||
guided_decoding = GuidedDecodingParams.from_optional(
|
||||
json=self.guided_json,
|
||||
regex=self.guided_regex,
|
||||
choice=self.guided_choice,
|
||||
grammar=self.guided_grammar,
|
||||
json_object=guided_json_object,
|
||||
backend=self.guided_decoding_backend,
|
||||
whitespace_pattern=self.guided_whitespace_pattern,
|
||||
)
|
||||
self.structured_outputs.json_object = True
|
||||
|
||||
extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
|
||||
if self.kv_transfer_params:
|
||||
@ -1255,7 +1189,7 @@ class CompletionRequest(OpenAIBaseModel):
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
output_kind=RequestOutputKind.DELTA if self.stream \
|
||||
else RequestOutputKind.FINAL_ONLY,
|
||||
guided_decoding=guided_decoding,
|
||||
structured_outputs=self.structured_outputs,
|
||||
logit_bias=self.logit_bias,
|
||||
allowed_token_ids=self.allowed_token_ids,
|
||||
extra_args=extra_args or None,
|
||||
@ -1263,16 +1197,18 @@ class CompletionRequest(OpenAIBaseModel):
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_guided_decoding_count(cls, data):
|
||||
guide_count = sum([
|
||||
"guided_json" in data and data["guided_json"] is not None,
|
||||
"guided_regex" in data and data["guided_regex"] is not None,
|
||||
"guided_choice" in data and data["guided_choice"] is not None
|
||||
])
|
||||
if guide_count > 1:
|
||||
def check_structured_outputs_count(cls, data):
|
||||
if "structured_outputs" not in data:
|
||||
return data
|
||||
|
||||
structured_outputs_kwargs = data['structured_outputs']
|
||||
count = sum(
|
||||
structured_outputs_kwargs.get(k) is not None
|
||||
for k in ("json", "regex", "choice"))
|
||||
if count > 1:
|
||||
raise ValueError(
|
||||
"You can only use one kind of guided decoding "
|
||||
"('guided_json', 'guided_regex' or 'guided_choice').")
|
||||
"You can only use one kind of constraints for structured "
|
||||
"outputs ('json', 'regex' or 'choice').")
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
|
||||
@ -993,7 +993,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
# check to make sure we haven't "forgotten" to stream
|
||||
# any tokens that were generated but previously
|
||||
# matched by partial json parsing
|
||||
# only happens if we are NOT using guided decoding
|
||||
# only happens if we are NOT using structured outputs
|
||||
auto_tools_called = False
|
||||
if tool_parser:
|
||||
auto_tools_called = len(
|
||||
|
||||
@ -262,9 +262,9 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
|
||||
|
||||
@staticmethod
|
||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
||||
decoding_config = vllm_config.decoding_config
|
||||
if decoding_config.reasoning_backend == "":
|
||||
decoding_config.reasoning_backend = "openai_gptoss"
|
||||
structured_outputs_config = vllm_config.structured_outputs_config
|
||||
if structured_outputs_config.reasoning_parser == "":
|
||||
structured_outputs_config.reasoning_parser = "openai_gptoss"
|
||||
|
||||
# Increase the max capture size from 512 to 1024 for performance.
|
||||
# NOTE(woosuk): This will increase the number of CUDA graphs
|
||||
|
||||
@ -2,13 +2,13 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Sampling parameters for text generation."""
|
||||
import copy
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import field
|
||||
from enum import Enum, IntEnum
|
||||
from functools import cached_property
|
||||
from typing import Annotated, Any, Optional, Union
|
||||
|
||||
import msgspec
|
||||
from pydantic import BaseModel
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logits_process import LogitsProcessor
|
||||
@ -28,60 +28,35 @@ class SamplingType(IntEnum):
|
||||
|
||||
# maybe make msgspec?
|
||||
@dataclass
|
||||
class GuidedDecodingParams:
|
||||
"""One of these fields will be used to build a logit processor."""
|
||||
class StructuredOutputsParams:
|
||||
# One of these fields will be used to build a logit processor.
|
||||
json: Optional[Union[str, dict]] = None
|
||||
regex: Optional[str] = None
|
||||
choice: Optional[list[str]] = None
|
||||
grammar: Optional[str] = None
|
||||
json_object: Optional[bool] = None
|
||||
"""These are other options that can be set"""
|
||||
backend: Optional[str] = None
|
||||
backend_was_auto: bool = False
|
||||
# These are other options that can be set.
|
||||
disable_fallback: bool = False
|
||||
disable_any_whitespace: bool = False
|
||||
disable_additional_properties: bool = False
|
||||
whitespace_pattern: Optional[str] = None
|
||||
structural_tag: Optional[str] = None
|
||||
|
||||
@staticmethod
|
||||
def from_optional(
|
||||
json: Optional[Union[dict, BaseModel, str]] = None,
|
||||
regex: Optional[str] = None,
|
||||
choice: Optional[list[str]] = None,
|
||||
grammar: Optional[str] = None,
|
||||
json_object: Optional[bool] = None,
|
||||
backend: Optional[str] = None,
|
||||
whitespace_pattern: Optional[str] = None,
|
||||
structural_tag: Optional[str] = None,
|
||||
) -> Optional["GuidedDecodingParams"]:
|
||||
if all(arg is None for arg in (json, regex, choice, grammar,
|
||||
json_object, structural_tag)):
|
||||
return None
|
||||
# Extract json schemas from pydantic models
|
||||
if isinstance(json, (BaseModel, type(BaseModel))):
|
||||
json = json.model_json_schema()
|
||||
return GuidedDecodingParams(
|
||||
json=json,
|
||||
regex=regex,
|
||||
choice=choice,
|
||||
grammar=grammar,
|
||||
json_object=json_object,
|
||||
backend=backend,
|
||||
whitespace_pattern=whitespace_pattern,
|
||||
structural_tag=structural_tag,
|
||||
)
|
||||
_backend: Optional[str] = field(default=None, init=False)
|
||||
"""CAUTION: Should only be set by Processor._validate_structured_output"""
|
||||
_backend_was_auto: bool = field(default=False, init=False)
|
||||
"""CAUTION: Should only be set by Processor._validate_structured_output"""
|
||||
|
||||
def __post_init__(self):
|
||||
"""Validate that some fields are mutually exclusive."""
|
||||
guide_count = sum([
|
||||
count = sum([
|
||||
self.json is not None, self.regex is not None, self.choice
|
||||
is not None, self.grammar is not None, self.json_object is not None
|
||||
])
|
||||
if guide_count > 1:
|
||||
if count > 1:
|
||||
raise ValueError(
|
||||
"You can only use one kind of guided decoding but multiple are "
|
||||
f"specified: {self.__dict__}")
|
||||
"You can only use one kind of structured outputs constraint "
|
||||
f"but multiple are specified: {self.__dict__}")
|
||||
|
||||
|
||||
class RequestOutputKind(Enum):
|
||||
@ -196,9 +171,8 @@ class SamplingParams(
|
||||
_all_stop_token_ids: set[int] = msgspec.field(default_factory=set)
|
||||
|
||||
# Fields used to construct logits processors
|
||||
guided_decoding: Optional[GuidedDecodingParams] = None
|
||||
"""If provided, the engine will construct a guided decoding logits
|
||||
processor from these parameters."""
|
||||
structured_outputs: Optional[StructuredOutputsParams] = None
|
||||
"""Parameters for configuring structured outputs."""
|
||||
logit_bias: Optional[dict[int, float]] = None
|
||||
"""If provided, the engine will construct a logits processor that applies
|
||||
these logit biases."""
|
||||
@ -246,7 +220,7 @@ class SamplingParams(
|
||||
msgspec.Meta(
|
||||
ge=-1)]] = None,
|
||||
output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
|
||||
guided_decoding: Optional[GuidedDecodingParams] = None,
|
||||
structured_outputs: Optional[StructuredOutputsParams] = None,
|
||||
logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None,
|
||||
allowed_token_ids: Optional[list[int]] = None,
|
||||
extra_args: Optional[dict[str, Any]] = None,
|
||||
@ -288,7 +262,7 @@ class SamplingParams(
|
||||
logits_processors=logits_processors,
|
||||
truncate_prompt_tokens=truncate_prompt_tokens,
|
||||
output_kind=output_kind,
|
||||
guided_decoding=guided_decoding,
|
||||
structured_outputs=structured_outputs,
|
||||
logit_bias=logit_bias,
|
||||
allowed_token_ids=allowed_token_ids,
|
||||
extra_args=extra_args,
|
||||
@ -559,7 +533,7 @@ class SamplingParams(
|
||||
"spaces_between_special_tokens="
|
||||
f"{self.spaces_between_special_tokens}, "
|
||||
f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
|
||||
f"guided_decoding={self.guided_decoding}, "
|
||||
f"structured_outputs={self.structured_outputs}, "
|
||||
f"extra_args={self.extra_args})")
|
||||
|
||||
|
||||
|
||||
@ -274,7 +274,7 @@ class MistralTokenizer(TokenizerBase):
|
||||
return tokenizer_file
|
||||
|
||||
# the following attributes are set to fit vLLM's design and are used
|
||||
# by the guided structured output backends.
|
||||
# by the structured output backends.
|
||||
@property
|
||||
def all_special_tokens_extended(self) -> list[str]:
|
||||
from mistral_common.tokens.tokenizers.base import SpecialTokens
|
||||
@ -463,9 +463,6 @@ class MistralTokenizer(TokenizerBase):
|
||||
|
||||
return decoded
|
||||
|
||||
# WARN: Outlines logits processors can overwrite this method.
|
||||
# See: guided_decoding/outlines_logits_processors.py::_adapt_tokenizer
|
||||
# for more.
|
||||
def decode(self,
|
||||
ids: Union[list[int], int],
|
||||
skip_special_tokens: bool = True) -> str:
|
||||
|
||||
@ -588,9 +588,6 @@ class AsyncLLM(EngineClient):
|
||||
async def get_model_config(self) -> ModelConfig:
|
||||
return self.model_config
|
||||
|
||||
async def get_decoding_config(self):
|
||||
raise ValueError("Not Supported on V1 yet.")
|
||||
|
||||
async def get_input_preprocessor(self) -> InputPreprocessor:
|
||||
return self.processor.input_preprocessor
|
||||
|
||||
|
||||
@ -45,7 +45,7 @@ class Processor:
|
||||
self.model_config = vllm_config.model_config
|
||||
self.cache_config = vllm_config.cache_config
|
||||
self.lora_config = vllm_config.lora_config
|
||||
self.decoding_config = vllm_config.decoding_config
|
||||
self.structured_outputs_config = vllm_config.structured_outputs_config
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
self.generation_config_fields = (
|
||||
@ -219,58 +219,57 @@ class Processor:
|
||||
"[lora_path]` to use the LoRA tokenizer.")
|
||||
|
||||
def _validate_structured_output(self, params: SamplingParams) -> None:
|
||||
if not params.guided_decoding or not self.decoding_config:
|
||||
if not params.structured_outputs or not self.structured_outputs_config:
|
||||
return
|
||||
|
||||
if self.model_config.skip_tokenizer_init and params.guided_decoding:
|
||||
if self.model_config.skip_tokenizer_init and params.structured_outputs:
|
||||
raise ValueError(
|
||||
"Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501
|
||||
)
|
||||
|
||||
engine_level_backend = self.decoding_config.backend
|
||||
if params.guided_decoding.backend:
|
||||
# Request-level backend selection is not supported in V1.
|
||||
backend = self.structured_outputs_config.backend
|
||||
if _backend := params.structured_outputs._backend:
|
||||
# Request-level backend selection is not supported.
|
||||
# The values may differ if `params` is reused and was set
|
||||
# to a specific backend based on `auto` behavior in a previous
|
||||
# request. We remember that it was set as a result of `auto`
|
||||
# using the `_auto` option set on the backend in the params.
|
||||
if (params.guided_decoding.backend != engine_level_backend
|
||||
and not (engine_level_backend == "auto"
|
||||
and params.guided_decoding.backend_was_auto)):
|
||||
# using the `_backend_was_auto` field set in the params.
|
||||
if (backend != _backend
|
||||
and not (backend == "auto"
|
||||
and params.structured_outputs._backend_was_auto)):
|
||||
raise ValueError(
|
||||
"Request-level structured output backend selection is no "
|
||||
"longer supported. The request specified "
|
||||
f"'{params.guided_decoding.backend}', but vLLM was "
|
||||
f"initialised with '{engine_level_backend}'. This error "
|
||||
"can be resolved by removing backend selection from the "
|
||||
"request.")
|
||||
"Request-level structured output backend selection is not "
|
||||
f"supported. The request specified '{_backend}', but vLLM "
|
||||
f"was initialised with '{backend}'. This error can be "
|
||||
"resolved by removing '_backend' from the request.")
|
||||
else:
|
||||
params.guided_decoding.backend = engine_level_backend
|
||||
params.structured_outputs._backend = backend
|
||||
|
||||
# Request content validation
|
||||
if (isinstance(params.guided_decoding.choice, list)
|
||||
and not params.guided_decoding.choice):
|
||||
if (isinstance(params.structured_outputs.choice, list)
|
||||
and not params.structured_outputs.choice):
|
||||
# It is invalid for choice to be an empty list
|
||||
raise ValueError(f"Choice '{params.guided_decoding.choice}' "
|
||||
"cannot be an empty list")
|
||||
raise ValueError(
|
||||
f"Choice '{params.structured_outputs.choice}' cannot be an empty list" # noqa: E501
|
||||
)
|
||||
|
||||
if engine_level_backend.startswith("xgrammar"):
|
||||
if backend.startswith("xgrammar"):
|
||||
# xgrammar with no fallback
|
||||
validate_xgrammar_grammar(params)
|
||||
elif engine_level_backend.startswith("guidance"):
|
||||
elif backend.startswith("guidance"):
|
||||
# TODO: ideally we would have the LLTokenizer here as Lark syntax
|
||||
# allows <|special_token|> and similar, see
|
||||
# https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
|
||||
# Without tokenizer these are disallowed in grammars.
|
||||
validate_guidance_grammar(params, tokenizer=None)
|
||||
elif engine_level_backend == "outlines":
|
||||
elif backend == "outlines":
|
||||
# outlines backend
|
||||
validate_structured_output_request_outlines(params)
|
||||
elif engine_level_backend == "lm-format-enforcer":
|
||||
elif backend == "lm-format-enforcer":
|
||||
# lm format enforcer backend
|
||||
validate_structured_output_request_lm_format_enforcer(params)
|
||||
else:
|
||||
# NOTE: engine_level_backend must be "auto" here, because we have
|
||||
# NOTE: backend must be "auto" here, because we have
|
||||
# checked supported_backends above.
|
||||
# In this mode, we set opinionated defaults based on what we think
|
||||
# will satisfy the most use cases without having to worry about
|
||||
@ -278,15 +277,15 @@ class Processor:
|
||||
# other setting where a specific backend was specified.
|
||||
try:
|
||||
validate_xgrammar_grammar(params)
|
||||
params.guided_decoding.backend = "xgrammar"
|
||||
params.structured_outputs._backend = "xgrammar"
|
||||
except ValueError:
|
||||
# The request either failed validation
|
||||
# or includes some jsonschema feature(s) that
|
||||
# are not supported in xgrammar. Fall back to guidance.
|
||||
validate_guidance_grammar(params, tokenizer=None)
|
||||
params.guided_decoding.backend = "guidance"
|
||||
params.structured_outputs._backend = "guidance"
|
||||
# Remember that this backend was set automatically
|
||||
params.guided_decoding.backend_was_auto = True
|
||||
params.structured_outputs._backend_was_auto = True
|
||||
|
||||
def _maybe_build_mm_uuids(
|
||||
self,
|
||||
|
||||
@ -67,7 +67,7 @@ class Request:
|
||||
# Generative models.
|
||||
assert sampling_params.max_tokens is not None
|
||||
self.max_tokens = sampling_params.max_tokens
|
||||
if sampling_params.guided_decoding is not None:
|
||||
if sampling_params.structured_outputs is not None:
|
||||
self.status = RequestStatus.WAITING_FOR_FSM
|
||||
self.use_structured_output = True
|
||||
|
||||
|
||||
@ -61,11 +61,11 @@ class StructuredOutputManager:
|
||||
self.executor = ThreadPoolExecutor(max_workers=max_workers)
|
||||
self.tokenizer = init_tokenizer_from_configs(
|
||||
model_config=self.vllm_config.model_config)
|
||||
reasoning_backend = \
|
||||
self.vllm_config.decoding_config.reasoning_backend
|
||||
if reasoning_backend:
|
||||
reasoning_parser = \
|
||||
self.vllm_config.structured_outputs_config.reasoning_parser
|
||||
if reasoning_parser:
|
||||
reasoner_cls = ReasoningParserManager.get_reasoning_parser(
|
||||
reasoning_backend)
|
||||
reasoning_parser)
|
||||
self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
|
||||
|
||||
def grammar_init(self, request: Request) -> None:
|
||||
@ -74,15 +74,16 @@ class StructuredOutputManager:
|
||||
|
||||
if TYPE_CHECKING:
|
||||
assert request.sampling_params is not None and \
|
||||
request.sampling_params.guided_decoding is not None
|
||||
request.sampling_params.structured_outputs is not None
|
||||
|
||||
# Initialize the backend the first time it is needed.
|
||||
#
|
||||
# NOTE: We only support a single backend. We do NOT support different
|
||||
# backends on a per-request basis in V1 (for now, anyway...).
|
||||
# _backend is set in Processor._validate_structured_output
|
||||
if self.backend is None:
|
||||
assert request.sampling_params is not None
|
||||
backend = request.sampling_params.guided_decoding.backend
|
||||
backend = request.sampling_params.structured_outputs._backend
|
||||
vocab_size = self.vllm_config.model_config.get_vocab_size()
|
||||
if backend == "xgrammar":
|
||||
self.backend = XgrammarBackend(
|
||||
|
||||
@ -60,9 +60,9 @@ class GuidanceBackend(StructuredOutputBackend):
|
||||
|
||||
def __post_init__(self):
|
||||
self.disable_any_whitespace = \
|
||||
self.vllm_config.decoding_config.disable_any_whitespace
|
||||
self.vllm_config.structured_outputs_config.disable_any_whitespace
|
||||
self.disable_additional_properties = \
|
||||
self.vllm_config.decoding_config.disable_additional_properties
|
||||
self.vllm_config.structured_outputs_config.disable_additional_properties
|
||||
|
||||
self.ll_tokenizer = llguidance_hf.from_tokenizer(
|
||||
self.tokenizer, self.vocab_size)
|
||||
|
||||
@ -138,30 +138,30 @@ class LMFormatEnforcerBackend(StructuredOutputBackend):
|
||||
|
||||
def validate_structured_output_request_lm_format_enforcer(
|
||||
params: SamplingParams):
|
||||
if params.guided_decoding is None:
|
||||
if params.structured_outputs is None:
|
||||
return
|
||||
|
||||
gd_params = params.guided_decoding
|
||||
so_params = params.structured_outputs
|
||||
|
||||
if gd_params.regex:
|
||||
if so_params.regex:
|
||||
return
|
||||
elif gd_params.json:
|
||||
if isinstance(gd_params.json, str):
|
||||
elif so_params.json:
|
||||
if isinstance(so_params.json, str):
|
||||
try:
|
||||
# make sure schema is valid json
|
||||
json.loads(gd_params.json)
|
||||
json.loads(so_params.json)
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError("Invalid JSON grammar specification.") from e
|
||||
else:
|
||||
try:
|
||||
json.dumps(gd_params.json)
|
||||
json.dumps(so_params.json)
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f"Error serializing guided decoding jsonschema: {e}"
|
||||
f"Error serializing structured outputs jsonschema: {e}"
|
||||
) from e
|
||||
return
|
||||
elif gd_params.choice:
|
||||
elif so_params.choice:
|
||||
return
|
||||
elif gd_params.grammar:
|
||||
raise ValueError("LM Format Enforcer guided decoding backend "
|
||||
elif so_params.grammar:
|
||||
raise ValueError("LM Format Enforcer structured outputs backend "
|
||||
"does not support grammar specifications")
|
||||
|
||||
@ -158,36 +158,36 @@ class OutlinesGrammar(StructuredOutputGrammar):
|
||||
|
||||
|
||||
def validate_structured_output_request_outlines(params: SamplingParams):
|
||||
if params.guided_decoding is None:
|
||||
if params.structured_outputs is None:
|
||||
return
|
||||
|
||||
gd_params = params.guided_decoding
|
||||
so_params = params.structured_outputs
|
||||
|
||||
if gd_params.regex:
|
||||
validate_regex_is_buildable(gd_params.regex)
|
||||
elif gd_params.json:
|
||||
if isinstance(gd_params.json, str):
|
||||
if so_params.regex:
|
||||
validate_regex_is_buildable(so_params.regex)
|
||||
elif so_params.json:
|
||||
if isinstance(so_params.json, str):
|
||||
try:
|
||||
# make sure schema is valid json
|
||||
json.loads(gd_params.json)
|
||||
schema = gd_params.json
|
||||
json.loads(so_params.json)
|
||||
schema = so_params.json
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError("Invalid JSON grammar specification.") from e
|
||||
else:
|
||||
try:
|
||||
schema = json.dumps(gd_params.json)
|
||||
schema = json.dumps(so_params.json)
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f"Error serializing guided decoding jsonschema: {e}"
|
||||
f"Error serializing structured outputs jsonschema: {e}"
|
||||
) from e
|
||||
pattern = json_schema.build_regex_from_schema(schema)
|
||||
validate_regex_is_buildable(pattern)
|
||||
elif gd_params.choice:
|
||||
choices = [regex_escape(str(choice)) for choice in gd_params.choice]
|
||||
elif so_params.choice:
|
||||
choices = [regex_escape(str(choice)) for choice in so_params.choice]
|
||||
regex = "(" + "|".join(choices) + ")"
|
||||
validate_regex_is_buildable(regex)
|
||||
elif gd_params.grammar:
|
||||
raise ValueError("Outlines guided decoding backend "
|
||||
elif so_params.grammar:
|
||||
raise ValueError("Outlines structured outputs backend "
|
||||
"does not support grammar specifications")
|
||||
|
||||
|
||||
@ -306,7 +306,7 @@ def validate_regex_is_buildable(pattern: str) -> None:
|
||||
_check_unsupported(parsed)
|
||||
except ValueError as e:
|
||||
raise ValueError(
|
||||
f"Regex uses unsupported feature for guided decoding: {e}. "
|
||||
f"Regex uses unsupported feature for structured outputs: {e}. "
|
||||
"Only basic matching constructs are supported—lookarounds, "
|
||||
"backreferences, and unicode boundaries are not.") from e
|
||||
|
||||
@ -315,6 +315,6 @@ def validate_regex_is_buildable(pattern: str) -> None:
|
||||
"Regex does not have a anchored universal start state"
|
||||
"This means that the Regex uses anchors (^) or look-arounds "
|
||||
"in a way which requires context before any token is matched."
|
||||
"Guided decoding needs regexes that can match without needing "
|
||||
"structured outputs needs regexes that can match without needing "
|
||||
"that context. Try rewriting the pattern without using these "
|
||||
f"constructs. Pattern:\n{pattern}")
|
||||
|
||||
@ -34,7 +34,7 @@ class XgrammarBackend(StructuredOutputBackend):
|
||||
|
||||
def __post_init__(self):
|
||||
self.disable_any_whitespace = \
|
||||
self.vllm_config.decoding_config.disable_any_whitespace
|
||||
self.vllm_config.structured_outputs_config.disable_any_whitespace
|
||||
|
||||
if isinstance(self.tokenizer, MistralTokenizer):
|
||||
# NOTE: ideally, xgrammar should handle this accordingly.
|
||||
@ -248,37 +248,37 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
|
||||
|
||||
Raises ValueError if the request is not supported.
|
||||
"""
|
||||
if sampling_params.guided_decoding is None:
|
||||
if sampling_params.structured_outputs is None:
|
||||
return
|
||||
|
||||
gd_params = sampling_params.guided_decoding
|
||||
so_params = sampling_params.structured_outputs
|
||||
|
||||
if gd_params.regex:
|
||||
if so_params.regex:
|
||||
try:
|
||||
xgr.Grammar.from_regex(gd_params.regex)
|
||||
xgr.Grammar.from_regex(so_params.regex)
|
||||
except Exception as err:
|
||||
raise ValueError("Failed to transform regex into a grammar: "
|
||||
f"{err}") from err
|
||||
|
||||
if gd_params.choice:
|
||||
choice_grammar = choice_as_grammar(gd_params.choice)
|
||||
if so_params.choice:
|
||||
choice_grammar = choice_as_grammar(so_params.choice)
|
||||
try:
|
||||
xgr.Grammar.from_ebnf(choice_grammar)
|
||||
except Exception as err:
|
||||
raise ValueError("Failed to transform choices into a grammar: "
|
||||
"{err}") from err
|
||||
gd_params.choice = None
|
||||
gd_params.grammar = choice_grammar
|
||||
so_params.choice = None
|
||||
so_params.grammar = choice_grammar
|
||||
return
|
||||
|
||||
if gd_params.json:
|
||||
if isinstance(gd_params.json, str):
|
||||
if so_params.json:
|
||||
if isinstance(so_params.json, str):
|
||||
try:
|
||||
schema = json.loads(gd_params.json)
|
||||
schema = json.loads(so_params.json)
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError("Invalid JSON grammar specification.") from e
|
||||
else:
|
||||
schema = gd_params.json
|
||||
schema = so_params.json
|
||||
|
||||
try:
|
||||
xgr.Grammar.from_json_schema(schema)
|
||||
@ -291,11 +291,11 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
|
||||
"supported by xgrammar.")
|
||||
return
|
||||
|
||||
if gd_params.grammar:
|
||||
if grammar_is_likely_lark(gd_params.grammar):
|
||||
if so_params.grammar:
|
||||
if grammar_is_likely_lark(so_params.grammar):
|
||||
# xgrammar supports EBNF grammars only
|
||||
try:
|
||||
gd_params.grammar = convert_lark_to_ebnf(gd_params.grammar)
|
||||
so_params.grammar = convert_lark_to_ebnf(so_params.grammar)
|
||||
except ValueError as e:
|
||||
raise ValueError(
|
||||
"Failed to convert the grammar from Lark to EBNF. ") from e
|
||||
@ -303,14 +303,14 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
|
||||
# Test parsing EBNF grammar, possibly already converted from Lark
|
||||
try:
|
||||
# parse the grammar, but we aren't compiling it.
|
||||
xgr.Grammar.from_ebnf(gd_params.grammar)
|
||||
xgr.Grammar.from_ebnf(so_params.grammar)
|
||||
except Exception as e:
|
||||
raise ValueError("Invalid grammar specification.") from e
|
||||
return
|
||||
|
||||
if gd_params.structural_tag:
|
||||
if so_params.structural_tag:
|
||||
try:
|
||||
s_tag = json.loads(gd_params.structural_tag)
|
||||
s_tag = json.loads(so_params.structural_tag)
|
||||
tags = [
|
||||
xgr.StructuralTagItem(
|
||||
begin=s["begin"],
|
||||
|
||||
@ -60,7 +60,7 @@ class StructuredOutputRequest:
|
||||
|
||||
def get_structured_output_key(
|
||||
sampling_params: SamplingParams) -> StructuredOutputKey:
|
||||
params = sampling_params.guided_decoding
|
||||
params = sampling_params.structured_outputs
|
||||
assert params is not None, "params can't be None."
|
||||
if params.json is not None:
|
||||
if not isinstance(params.json, str):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user