[Feature][Frontend]: Deprecate --enable-reasoning (#17452)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
This commit is contained in:
parent f5a3c655b2
commit 98060b001d
@@ -21,11 +21,10 @@ vLLM currently supports the following reasoning models:

 ## Quickstart

-To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
+To use reasoning models, you need to specify the `--reasoning-parser` flag when starting the server. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.

 ```bash
-vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
 ```

 Next, make a request to the model that should return the reasoning content in the response.
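For reference, the request described in this doc hunk can be made with the official `openai` Python client. A minimal sketch, assuming a local server started with the command above on the default port (the snippet itself is not part of the diff):

```python
# Minimal sketch: query the server started above and read the parsed
# reasoning. Assumes the default local endpoint; the dummy API key is
# ignored by vLLM.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
)

message = response.choices[0].message
# The reasoning parser splits the output into these two fields.
print("reasoning_content:", message.reasoning_content)
print("content:", message.content)
```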
@@ -140,8 +139,7 @@ Remember to check whether the `reasoning_content` exists in the response before
 The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now.

 ```bash
-VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
 ```

 Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine.
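As a hedged illustration of the structured-output path described above, the sketch below sends a `guided_json` request against the v0 server from this hunk; the schema and prompt are invented for the example:

```python
# Sketch: structured output with a reasoning model on the v0 engine.
# The reasoning section stays free-form; only the final answer is
# constrained by the (illustrative) JSON schema passed via extra_body.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
    extra_body={"guided_json": schema},
)

print(response.choices[0].message.reasoning_content)  # free-form reasoning
print(response.choices[0].message.content)            # JSON per schema
```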
@@ -316,9 +314,8 @@ class DeepSeekReasoner(Reasoner):

 The structured output engine like `xgrammar` will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case.

-Finally, you can enable reasoning for the model by using the `--enable-reasoning` and `--reasoning-parser` flags.
+Finally, you can enable reasoning for the model by using the `--reasoning-parser` flag.

 ```bash
-vllm serve <model_tag> \
-    --enable-reasoning --reasoning-parser example
+vllm serve <model_tag> --reasoning-parser example
 ```
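To make the `end_token_id` check concrete, here is a hypothetical reasoner sketch; the class, field names, and token ids are illustrative and only mirror the `DeepSeekReasoner` interface referenced in the hunk above, not vLLM's actual code:

```python
# Hypothetical sketch of an end_token_id-based reasoner. It illustrates
# how a structured-output engine can tell whether the reasoning section
# has finished before it starts enforcing a grammar.
from dataclasses import dataclass


@dataclass
class ExampleReasoner:
    start_token_id: int  # e.g. the id of a "<think>" token (illustrative)
    end_token_id: int    # e.g. the id of a "</think>" token (illustrative)

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # Once the end token appears, grammar enforcement may begin.
        return self.end_token_id in input_ids


reasoner = ExampleReasoner(start_token_id=100, end_token_id=101)
assert not reasoner.is_reasoning_end([100, 5, 6])
assert reasoner.is_reasoning_end([100, 5, 6, 101, 7])
```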
@@ -9,7 +9,7 @@ parser:

 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```

 This example demonstrates how to generate chat completions from reasoning models
@@ -9,7 +9,7 @@ the reasoning parser and tool calling enabled.

 ```bash
 vllm serve Qwen/QwQ-32B \
-    --enable-reasoning --reasoning-parser deepseek_r1 \
+    --reasoning-parser deepseek_r1 \
     --enable-auto-tool-choice --tool-call-parser hermes
 ```
@@ -8,7 +8,7 @@ with the reasoning parser:

 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```

 This example demonstrates how to generate chat completions from reasoning models
@@ -8,7 +8,7 @@ parser:

 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```

 Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
@@ -13,9 +13,9 @@ MODEL_NAME = "Qwen/QwQ-32B"
 @pytest.fixture(scope="module")
 def server():  # noqa: F811
     args = [
-        "--max-model-len", "8192", "--enforce-eager", "--enable-reasoning",
-        "--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice",
-        "--tool-call-parser", "hermes"
+        "--max-model-len", "8192", "--enforce-eager", "--reasoning-parser",
+        "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser",
+        "hermes"
     ]

     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -122,31 +122,23 @@ def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
     """Ensure validation fails if reasoning is enabled with auto tool choice"""
     args = serve_parser.parse_args(args=[
         "--enable-auto-tool-choice",
         "--enable-reasoning",
         "--reasoning-parser",
         "deepseek_r1",
     ])
     with pytest.raises(TypeError):
         validate_parsed_serve_args(args)


-def test_enable_reasoning_passes_with_reasoning_parser(serve_parser):
+def test_passes_with_reasoning_parser(serve_parser):
     """Ensure validation passes if reasoning is enabled
     with a reasoning parser"""
     args = serve_parser.parse_args(args=[
-        "--enable-reasoning",
         "--reasoning-parser",
         "deepseek_r1",
     ])
     validate_parsed_serve_args(args)


-def test_enable_reasoning_fails_without_reasoning_parser(serve_parser):
-    """Ensure validation fails if reasoning is enabled
-    without a reasoning parser"""
-    args = serve_parser.parse_args(args=["--enable-reasoning"])
-    with pytest.raises(TypeError):
-        validate_parsed_serve_args(args)
-

 def test_chat_template_validation_for_happy_paths(serve_parser):
     """Ensure validation passes if the chat template exists"""
     args = serve_parser.parse_args(
@@ -3225,10 +3225,9 @@ class DecodingConfig:
     in the JSON schema. This is only supported for the `guidance` backend and
     is used to better align its behaviour with `outlines` and `xgrammar`."""

-    reasoning_backend: Optional[str] = None
+    reasoning_backend: str = ""
     """Select the reasoning parser depending on the model that you're using.
-    This is used to parse the reasoning content into OpenAI API format.
-    Required for `--enable-reasoning`."""
+    This is used to parse the reasoning content into OpenAI API format."""

     def compute_hash(self) -> str:
         """
@@ -365,8 +365,9 @@ class EngineArgs:
     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales

     additional_config: Optional[Dict[str, Any]] = None
-    enable_reasoning: Optional[bool] = None
-    reasoning_parser: Optional[str] = DecodingConfig.reasoning_backend
+    enable_reasoning: Optional[bool] = None  # DEPRECATED
+    reasoning_parser: str = DecodingConfig.reasoning_backend

     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load

     def __post_init__(self):
@@ -798,8 +799,15 @@ class EngineArgs:
             "--enable-reasoning",
             action="store_true",
             default=False,
-            help="Whether to enable reasoning_content for the model. "
-            "If enabled, the model will be able to generate reasoning content."
+            help=
+            "[DEPRECATED] " \
+            "The --enable-reasoning flag is deprecated as of v0.8.6. "
+            "Use --reasoning-parser to specify " \
+            "the reasoning parser backend instead. "
+            "This flag (--enable-reasoning) will be " \
+            "removed in v0.10.0. "
+            "When --reasoning-parser is specified, " \
+            "reasoning mode is automatically enabled."
         )

         return parser
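The new help text only documents the migration path; a server built on these args could also surface it at runtime. A sketch of that pattern, with a hypothetical helper name that is not part of this commit:

```python
# Hypothetical helper (not in this commit) showing how the announced
# deprecation could be surfaced at runtime rather than only in --help.
import warnings
from typing import Optional


def warn_if_enable_reasoning(enable_reasoning: Optional[bool]) -> None:
    if enable_reasoning:
        warnings.warn(
            "--enable-reasoning is deprecated as of v0.8.6 and will be "
            "removed in v0.10.0; passing --reasoning-parser alone now "
            "enables reasoning mode.",
            DeprecationWarning,
            stacklevel=2,
        )
```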
@@ -1088,7 +1096,6 @@ class EngineArgs:
             disable_additional_properties=\
             self.guided_decoding_disable_additional_properties,
-            reasoning_backend=self.reasoning_parser
-            if self.enable_reasoning else None,
+            reasoning_backend=self.reasoning_parser,
         )

         observability_config = ObservabilityConfig(
@@ -2096,7 +2096,7 @@ class LLMEngine:
         guided_decoding.backend = guided_decoding.backend or \
             self.decoding_config.backend

-        if self.decoding_config.reasoning_backend is not None:
+        if self.decoding_config.reasoning_backend:
             logger.debug("Building with reasoning backend %s",
                          self.decoding_config.reasoning_backend)
@@ -967,7 +967,6 @@ async def init_app_state(
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
         enable_auto_tools=args.enable_auto_tool_choice,
         tool_parser=args.tool_call_parser,
-        enable_reasoning=args.enable_reasoning,
         reasoning_parser=args.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
     ) if model_config.runner_type == "generate" else None
@@ -1053,7 +1052,7 @@ async def run_server(args, **uvicorn_kwargs) -> None:
                 f"(chose from {{ {','.join(valid_tool_parses)} }})")

     valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
-    if args.enable_reasoning \
+    if args.reasoning_parser \
             and args.reasoning_parser not in valid_reasoning_parses:
         raise KeyError(
             f"invalid reasoning parser: {args.reasoning_parser} "
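The validation above keys off the registry of parser names. A hedged sketch of reproducing that check standalone; the manager attribute is taken from the hunk, but the import path is an assumption about this vLLM version:

```python
# Sketch: reproduce the server's validation of --reasoning-parser names.
from vllm.reasoning import ReasoningParserManager  # import path assumed

valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
requested = "deepseek_r1"
if requested not in valid_reasoning_parses:
    raise KeyError(f"invalid reasoning parser: {requested} "
                   f"(chose from {{ {','.join(valid_reasoning_parses)} }})")
print(sorted(valid_reasoning_parses))  # e.g. lists all registered parsers
```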
@@ -284,11 +284,6 @@ def validate_parsed_serve_args(args: argparse.Namespace):
         raise TypeError("Error: --enable-auto-tool-choice requires "
                         "--tool-call-parser")

-    # Enable reasoning needs a reasoning parser to be valid
-    if args.enable_reasoning and not args.reasoning_parser:
-        raise TypeError("Error: --enable-reasoning requires "
-                        "--reasoning-parser")
-

 def create_parser_for_docs() -> FlexibleArgumentParser:
     parser_for_docs = FlexibleArgumentParser(
@@ -58,8 +58,7 @@ class OpenAIServingChat(OpenAIServing):
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
         return_tokens_as_token_ids: bool = False,
-        enable_reasoning: bool = False,
-        reasoning_parser: Optional[str] = None,
+        reasoning_parser: str = "",
         enable_auto_tools: bool = False,
         tool_parser: Optional[str] = None,
         enable_prompt_tokens_details: bool = False,
@@ -82,18 +81,17 @@ class OpenAIServingChat(OpenAIServing):
                 " the parallel_tool_calls client option is preset for "
                 "compatibility reasons, it will be ignored.")

-        self.enable_reasoning: bool = enable_reasoning
         self.reasoning_parser: Optional[Callable[[AnyTokenizer],
                                                  ReasoningParser]] = None
-        if self.enable_reasoning:
+        if reasoning_parser:
             try:
                 self.reasoning_parser = (
                     ReasoningParserManager.get_reasoning_parser(
                         reasoning_parser))
+                assert self.reasoning_parser is not None
             except Exception as e:
-                raise TypeError("Error: --enable-reasoning requires "
-                                f"reasoning_parser:'{reasoning_parser}' "
-                                "which has not been registered") from e
+                raise TypeError(
+                    f"{reasoning_parser=} has not been registered") from e
         self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
         if self.enable_auto_tools:
             try:
@@ -423,15 +421,12 @@ class OpenAIServingChat(OpenAIServing):
             not tool_choice_function_name
             and self._should_stream_with_auto_tool_parsing(request))

-        should_stream_with_reasoning_parsing = (
-            self._should_stream_with_reasoning_parsing(request))
-
         all_previous_token_ids: Optional[list[list[int]]]
         function_name_returned: Optional[list[bool]] = None

         # Only one of these will be used, thus previous_texts and
         # all_previous_token_ids will not be used twice in the same iteration.
-        if tool_choice_auto or should_stream_with_reasoning_parsing:
+        if tool_choice_auto or self.reasoning_parser:
             # These are only required in "auto" tool choice case
             previous_texts = [""] * num_choices
             all_previous_token_ids = [[]] * num_choices
@@ -446,12 +441,7 @@ class OpenAIServingChat(OpenAIServing):
             previous_texts, all_previous_token_ids = None, None

         try:
-            # There is no need to check if the reasoning_parser is None
-            # because the should_stream_with_reasoning_parsing check
-            # already ensures that the reasoning_parser is not None.
-            # but the pre-commit hook requires it.
-            if should_stream_with_reasoning_parsing and \
-                    self.reasoning_parser is not None:
+            if self.reasoning_parser:
                 reasoning_parser = self.reasoning_parser(tokenizer)
         except RuntimeError as e:
             logger.exception("Error in reasoning parser creation.")
@@ -459,7 +449,6 @@ class OpenAIServingChat(OpenAIServing):
             yield f"data: {data}\n\n"
             yield "data: [DONE]\n\n"
             return
-
         # Prepare the tool parser if it's needed
         try:
             if tool_choice_auto and self.tool_parser:
@@ -592,7 +581,7 @@ class OpenAIServingChat(OpenAIServing):
                 delta_message: Optional[DeltaMessage]

                 # just update previous_texts and previous_token_ids
-                if tool_choice_auto or should_stream_with_reasoning_parsing:
+                if tool_choice_auto or self.reasoning_parser:
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]
@@ -603,7 +592,7 @@ class OpenAIServingChat(OpenAIServing):

                 # handle streaming deltas for tools with named tool_choice
                 if tool_choice_function_name:
-                    if (self.enable_reasoning
+                    if (self.reasoning_parser
                             and not reasoning_parser.is_reasoning_end(
                                 previous_token_ids)):
                         assert reasoning_parser is not None
@@ -630,7 +619,7 @@ class OpenAIServingChat(OpenAIServing):
                         current_text = ""
                     else:
                         # Just to add remaining `content`
-                        if self.enable_reasoning:
+                        if self.reasoning_parser:
                             delta_text = previous_text + delta_text
                             current_text = ""
@@ -660,7 +649,7 @@ class OpenAIServingChat(OpenAIServing):

                 # handle streaming deltas for tools with "auto" tool choice
                 # and reasoning parser
-                elif tool_choice_auto and self.enable_reasoning:
+                elif tool_choice_auto and self.reasoning_parser:
                     assert tool_parser is not None
                     assert reasoning_parser is not None
                     assert added_content_delta_arr is not None
@@ -728,8 +717,7 @@ class OpenAIServingChat(OpenAIServing):
                             delta_token_ids=output.token_ids,
                             request=request))
                 # when only reasoning
-                elif self.enable_reasoning:
-                    assert reasoning_parser is not None
+                elif self.reasoning_parser:
                     delta_message = (reasoning_parser.
                                      extract_reasoning_content_streaming(
                                          previous_text,
@@ -744,7 +732,7 @@ class OpenAIServingChat(OpenAIServing):
                     delta_message = DeltaMessage(content=delta_text)

                 # update the previous values for the next iteration
-                if tool_choice_auto or should_stream_with_reasoning_parsing:
+                if tool_choice_auto or self.reasoning_parser:
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_texts[i] = current_text
@@ -931,17 +919,9 @@ class OpenAIServingChat(OpenAIServing):
             )
         else:
             logprobs = None

-        should_stream_with_reasoning_parsing = (
-            self._should_stream_with_reasoning_parsing(request))
-
         # In the OpenAI API the finish_reason is "tools_called"
         # if the tool choice is auto and the model produced a tool
         # call. The same is not true for named function calls
         auto_tools_called = False

-        if should_stream_with_reasoning_parsing and \
-                self.reasoning_parser is not None:
+        if self.reasoning_parser:
             try:
                 reasoning_parser = self.reasoning_parser(tokenizer)
             except RuntimeError as e:
@@ -1176,17 +1156,6 @@ class OpenAIServingChat(OpenAIServing):
         return (request.tools and self.tool_parser and self.enable_auto_tools
                 and request.tool_choice in ['auto', None])

-    def _should_stream_with_reasoning_parsing(self,
-                                              request: ChatCompletionRequest):
-        """
-        Utility function to check if streamed tokens should go through the
-        reasoning parser that was configured.
-
-        We only want to do this IF reasoning is enabled and a reasoning
-        parser is configured.
-        """
-        return self.enable_reasoning and self.reasoning_parser is not None
-
     def _should_check_for_unstreamed_tool_arg_tokens(
         self,
         delta_message: Optional[DeltaMessage],
@@ -103,7 +103,7 @@ async def get_guided_decoding_logits_processor(
         reasoning_backend: str | None = None) -> LogitsProcessor | None:

     reasoner = None
-    if reasoning_backend is not None:
+    if reasoning_backend:
         reasoner_class = ReasoningParserManager.get_reasoning_parser(
             reasoning_backend)
         reasoner = reasoner_class(tokenizer)
@@ -146,7 +146,7 @@ def get_local_guided_decoding_logits_processor(
     guided_params = maybe_backend_fallback(guided_params)

     reasoner = None
-    if reasoning_backend is not None:
+    if reasoning_backend:
         reasoner_class = ReasoningParserManager.get_reasoning_parser(
             reasoning_backend)
         reasoner = reasoner_class(tokenizer)
@@ -61,7 +61,7 @@ class BaseLogitsProcessor:
         """Use the FSM to bias the logits before sampling the next token."""

         # Skip the structured logits processing if reasoning is not finished.
-        # reasoner is not None only when `--enable-reasoning` is set.
+        # reasoner is not None only when `--reasoning-parser` is set.
         if self._reasoner is not None:
             if not self._reasoner.is_reasoning_end(input_ids):
                 return scores
@@ -346,7 +346,7 @@ class XGrammarLogitsProcessor:
                  scores: torch.Tensor) -> torch.Tensor:

         # Skip the structured logits processing if reasoning is not finished.
-        # reasoner is not None only when `--enable-reasoning` is set.
+        # reasoner is not None only when `--reasoning-parser` is set.
         if self.reasoner is not None and \
             not self.reasoner.is_reasoning_end(
                 input_ids):
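Both logits-processor hunks above share one control-flow idea: grammar enforcement is bypassed until the reasoner reports that reasoning has ended. A condensed, self-contained sketch of that pattern (names simplified from the hunks; the grammar mask is stubbed):

```python
# Condensed sketch of the reasoning gate used by both logits processors.
# While reasoning is unfinished, scores pass through untouched; after the
# end token, a (stubbed) grammar mask would constrain decoding.
from typing import Any, Optional

import torch


class GatedLogitsProcessor:

    def __init__(self, reasoner: Optional[Any] = None):
        # reasoner is not None only when `--reasoning-parser` is set.
        self.reasoner = reasoner

    def __call__(self, input_ids: list[int],
                 scores: torch.Tensor) -> torch.Tensor:
        if self.reasoner is not None and \
                not self.reasoner.is_reasoning_end(input_ids):
            return scores  # free-form reasoning: no structured constraint
        return self._apply_grammar_mask(scores)

    def _apply_grammar_mask(self, scores: torch.Tensor) -> torch.Tensor:
        # Stand-in for the FSM / grammar biasing done by the real backends.
        return scores
```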