[Feature][Frontend]: Deprecate --enable-reasoning (#17452)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Chauncey 2025-05-01 21:46:16 +08:00 committed by GitHub
parent f5a3c655b2
commit 98060b001d
16 changed files with 49 additions and 91 deletions

View File

@@ -21,11 +21,10 @@ vLLM currently supports the following reasoning models:

## Quickstart

-To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
+To use reasoning models, you need to specify the `--reasoning-parser` flag when starting the server. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.

```bash
-vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
```

Next, make a request to the model that should return the reasoning content in the response.
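For reference, such a request might look like the following. This is a minimal sketch rather than part of the diff; it assumes the server launched above listens on the default `http://localhost:8000/v1` and that the OpenAI Python client is installed.

```python
from openai import OpenAI

# Assumption: vLLM is serving the model locally on the default port,
# launched with the command shown above.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
)

message = response.choices[0].message
# With a reasoning parser configured, vLLM returns the extracted
# reasoning alongside the final answer.
print("reasoning_content:", message.reasoning_content)
print("content:", message.content)
```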
@@ -140,8 +139,7 @@ Remember to check whether the `reasoning_content` exists in the response before
The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now.

```bash
-VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
```

Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine.
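As an illustration (a sketch, not part of this diff), a structured-output request against that server can pass a JSON schema through vLLM's `guided_json` request extension; the reasoning section streams out unconstrained and only the final answer is forced to match the schema. The schema below is hypothetical.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Hypothetical schema, for illustration only.
answer_schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
    # `guided_json` is a vLLM-specific extension carried in extra_body.
    extra_body={"guided_json": answer_schema},
)

message = response.choices[0].message
print("reasoning_content:", message.reasoning_content)  # free-form reasoning
print("content:", message.content)  # JSON conforming to answer_schema
```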
@@ -316,9 +314,8 @@ class DeepSeekReasoner(Reasoner):

The structured output engine like `xgrammar` will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case.

-Finally, you can enable reasoning for the model by using the `--enable-reasoning` and `--reasoning-parser` flags.
+Finally, you can enable reasoning for the model by using the `--reasoning-parser` flag.

```bash
-vllm serve <model_tag> \
-    --enable-reasoning --reasoning-parser example
+vllm serve <model_tag> --reasoning-parser example
```
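For orientation, a parser registered under the `example` name used above could be sketched as follows. `ReasoningParserManager` and `is_reasoning_end` mirror names that appear elsewhere in this diff, but the import path, the `register_module` decorator, the `</think>` end tag, and the constructor details are assumptions that may differ across vLLM versions.

```python
# Sketch only: import path and base-class details vary between versions.
from vllm.reasoning import ReasoningParser, ReasoningParserManager


@ReasoningParserManager.register_module("example")
class ExampleReasoningParser(ReasoningParser):

    def __init__(self, tokenizer):
        super().__init__(tokenizer)
        # Assumed sentinel token marking the end of the reasoning section.
        self.end_token_id = tokenizer.get_vocab().get("</think>")

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # Structured output engines call this to decide when to start
        # constraining tokens; reasoning text stays unconstrained.
        return self.end_token_id in input_ids

    # extract_reasoning_content / extract_reasoning_content_streaming
    # are omitted here for brevity.
```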

View File

@@ -9,7 +9,7 @@ parser:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
```

This example demonstrates how to generate chat completions from reasoning models

View File

@@ -9,7 +9,7 @@ the reasoning parser and tool calling enabled.

```bash
vllm serve Qwen/QwQ-32B \
-    --enable-reasoning --reasoning-parser deepseek_r1 \
+    --reasoning-parser deepseek_r1 \
    --enable-auto-tool-choice --tool-call-parser hermes
```
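A matching client call (illustrative only; the tool definition below is made up) would then receive both `reasoning_content` and `tool_calls` in the response message:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Hypothetical tool definition for illustration.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="Qwen/QwQ-32B",
    messages=[{"role": "user", "content": "What's the weather in Berlin?"}],
    tools=tools,
    tool_choice="auto",
)

message = response.choices[0].message
print("reasoning_content:", message.reasoning_content)
print("tool_calls:", message.tool_calls)
```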

View File

@@ -8,7 +8,7 @@ with the reasoning parser:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
```

This example demonstrates how to generate chat completions from reasoning models

View File

@@ -8,7 +8,7 @@ parser:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
```

Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the

View File

@@ -13,9 +13,9 @@ MODEL_NAME = "Qwen/QwQ-32B"

@pytest.fixture(scope="module")
def server():  # noqa: F811
    args = [
-        "--max-model-len", "8192", "--enforce-eager", "--enable-reasoning",
-        "--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice",
-        "--tool-call-parser", "hermes"
+        "--max-model-len", "8192", "--enforce-eager", "--reasoning-parser",
+        "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser",
+        "hermes"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:

View File

@@ -122,31 +122,23 @@ def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
    """Ensure validation fails if reasoning is enabled with auto tool choice"""
    args = serve_parser.parse_args(args=[
        "--enable-auto-tool-choice",
-        "--enable-reasoning",
        "--reasoning-parser",
        "deepseek_r1",
    ])
    with pytest.raises(TypeError):
        validate_parsed_serve_args(args)


-def test_enable_reasoning_passes_with_reasoning_parser(serve_parser):
+def test_passes_with_reasoning_parser(serve_parser):
    """Ensure validation passes if reasoning is enabled
    with a reasoning parser"""
    args = serve_parser.parse_args(args=[
-        "--enable-reasoning",
        "--reasoning-parser",
        "deepseek_r1",
    ])
    validate_parsed_serve_args(args)


-def test_enable_reasoning_fails_without_reasoning_parser(serve_parser):
-    """Ensure validation fails if reasoning is enabled
-    without a reasoning parser"""
-    args = serve_parser.parse_args(args=["--enable-reasoning"])
-    with pytest.raises(TypeError):
-        validate_parsed_serve_args(args)


def test_chat_template_validation_for_happy_paths(serve_parser):
    """Ensure validation passes if the chat template exists"""
    args = serve_parser.parse_args(

View File

@@ -3225,10 +3225,9 @@ class DecodingConfig:
    in the JSON schema. This is only supported for the `guidance` backend and
    is used to better align its behaviour with `outlines` and `xgrammar`."""

-    reasoning_backend: Optional[str] = None
+    reasoning_backend: str = ""
    """Select the reasoning parser depending on the model that you're using.
-    This is used to parse the reasoning content into OpenAI API format.
-    Required for `--enable-reasoning`."""
+    This is used to parse the reasoning content into OpenAI API format."""

    def compute_hash(self) -> str:
        """

View File

@@ -365,8 +365,9 @@ class EngineArgs:
    calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
    additional_config: Optional[Dict[str, Any]] = None
-    enable_reasoning: Optional[bool] = None
-    reasoning_parser: Optional[str] = DecodingConfig.reasoning_backend
+    enable_reasoning: Optional[bool] = None  # DEPRECATED
+    reasoning_parser: str = DecodingConfig.reasoning_backend
    use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load

    def __post_init__(self):
@@ -798,8 +799,15 @@ class EngineArgs:
            "--enable-reasoning",
            action="store_true",
            default=False,
-            help="Whether to enable reasoning_content for the model. "
-            "If enabled, the model will be able to generate reasoning content."
+            help=
+            "[DEPRECATED] " \
+            "The --enable-reasoning flag is deprecated as of v0.8.6. "
+            "Use --reasoning-parser to specify " \
+            "the reasoning parser backend instead. "
+            "This flag (--enable-reasoning) will be " \
+            "removed in v0.10.0. "
+            "When --reasoning-parser is specified, " \
+            "reasoning mode is automatically enabled."
        )

        return parser
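The new help text above spells out the migration. As a general pattern (a sketch under stated assumptions, not this file's exact code), a deprecated boolean flag like this can stay parseable while warning and deferring to the replacement option:

```python
import argparse
import warnings

parser = argparse.ArgumentParser()
# The flag is kept so existing launch scripts still parse, but it no
# longer gates behavior: --reasoning-parser alone enables reasoning.
parser.add_argument(
    "--enable-reasoning", action="store_true", default=False,
    help="[DEPRECATED] Use --reasoning-parser instead.")
parser.add_argument("--reasoning-parser", type=str, default="")

args = parser.parse_args(
    ["--enable-reasoning", "--reasoning-parser", "deepseek_r1"])
if args.enable_reasoning:
    warnings.warn(
        "--enable-reasoning is deprecated; --reasoning-parser alone "
        "enables reasoning.", DeprecationWarning)

reasoning_enabled = bool(args.reasoning_parser)  # the new source of truth
print(reasoning_enabled)  # True
```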
@@ -1088,7 +1096,6 @@ class EngineArgs:
            disable_additional_properties=\
                self.guided_decoding_disable_additional_properties,
            reasoning_backend=self.reasoning_parser
-            if self.enable_reasoning else None,
        )

        observability_config = ObservabilityConfig(

View File

@@ -2096,7 +2096,7 @@ class LLMEngine:
            guided_decoding.backend = guided_decoding.backend or \
                self.decoding_config.backend

-            if self.decoding_config.reasoning_backend is not None:
+            if self.decoding_config.reasoning_backend:
                logger.debug("Building with reasoning backend %s",
                             self.decoding_config.reasoning_backend)

View File

@@ -967,7 +967,6 @@ async def init_app_state(
        return_tokens_as_token_ids=args.return_tokens_as_token_ids,
        enable_auto_tools=args.enable_auto_tool_choice,
        tool_parser=args.tool_call_parser,
-        enable_reasoning=args.enable_reasoning,
        reasoning_parser=args.reasoning_parser,
        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
    ) if model_config.runner_type == "generate" else None
@@ -1053,7 +1052,7 @@ async def run_server(args, **uvicorn_kwargs) -> None:
            f"(chose from {{ {','.join(valid_tool_parses)} }})")

    valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
-    if args.enable_reasoning \
+    if args.reasoning_parser \
            and args.reasoning_parser not in valid_reasoning_parses:
        raise KeyError(
            f"invalid reasoning parser: {args.reasoning_parser} "

View File

@@ -284,11 +284,6 @@ def validate_parsed_serve_args(args: argparse.Namespace):
        raise TypeError("Error: --enable-auto-tool-choice requires "
                        "--tool-call-parser")

-    # Enable reasoning needs a reasoning parser to be valid
-    if args.enable_reasoning and not args.reasoning_parser:
-        raise TypeError("Error: --enable-reasoning requires "
-                        "--reasoning-parser")


def create_parser_for_docs() -> FlexibleArgumentParser:
    parser_for_docs = FlexibleArgumentParser(

View File

@@ -58,8 +58,7 @@ class OpenAIServingChat(OpenAIServing):
        chat_template: Optional[str],
        chat_template_content_format: ChatTemplateContentFormatOption,
        return_tokens_as_token_ids: bool = False,
-        enable_reasoning: bool = False,
-        reasoning_parser: Optional[str] = None,
+        reasoning_parser: str = "",
        enable_auto_tools: bool = False,
        tool_parser: Optional[str] = None,
        enable_prompt_tokens_details: bool = False,
@@ -82,18 +81,17 @@ class OpenAIServingChat(OpenAIServing):
                " the parallel_tool_calls client option is preset for "
                "compatibility reasons, it will be ignored.")

-        self.enable_reasoning: bool = enable_reasoning
        self.reasoning_parser: Optional[Callable[[AnyTokenizer],
                                                 ReasoningParser]] = None
-        if self.enable_reasoning:
+        if reasoning_parser:
            try:
                self.reasoning_parser = (
                    ReasoningParserManager.get_reasoning_parser(
                        reasoning_parser))
+                assert self.reasoning_parser is not None
            except Exception as e:
-                raise TypeError("Error: --enable-reasoning requires "
-                                f"reasoning_parser:'{reasoning_parser}' "
-                                "which has not been registered") from e
+                raise TypeError(
+                    f"{reasoning_parser=} has not been registered") from e

        self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
        if self.enable_auto_tools:
            try:
@@ -423,15 +421,12 @@ class OpenAIServingChat(OpenAIServing):
            not tool_choice_function_name
            and self._should_stream_with_auto_tool_parsing(request))

-        should_stream_with_reasoning_parsing = (
-            self._should_stream_with_reasoning_parsing(request))

        all_previous_token_ids: Optional[list[list[int]]]
        function_name_returned: Optional[list[bool]] = None

        # Only one of these will be used, thus previous_texts and
        # all_previous_token_ids will not be used twice in the same iteration.
-        if tool_choice_auto or should_stream_with_reasoning_parsing:
+        if tool_choice_auto or self.reasoning_parser:
            # These are only required in "auto" tool choice case
            previous_texts = [""] * num_choices
            all_previous_token_ids = [[]] * num_choices
@@ -446,12 +441,7 @@ class OpenAIServingChat(OpenAIServing):
            previous_texts, all_previous_token_ids = None, None

        try:
-            # There is no need to check if the reasoning_parser is None
-            # because the should_stream_with_reasoning_parsing check
-            # already ensures that the reasoning_parser is not None.
-            # but the pre-commit hook requires it.
-            if should_stream_with_reasoning_parsing and \
-                    self.reasoning_parser is not None:
+            if self.reasoning_parser:
                reasoning_parser = self.reasoning_parser(tokenizer)
        except RuntimeError as e:
            logger.exception("Error in reasoning parser creation.")
@@ -459,7 +449,6 @@ class OpenAIServingChat(OpenAIServing):
            yield f"data: {data}\n\n"
            yield "data: [DONE]\n\n"
            return

        # Prepare the tool parser if it's needed
        try:
            if tool_choice_auto and self.tool_parser:
@@ -592,7 +581,7 @@ class OpenAIServingChat(OpenAIServing):
                delta_message: Optional[DeltaMessage]

                # just update previous_texts and previous_token_ids
-                if tool_choice_auto or should_stream_with_reasoning_parsing:
+                if tool_choice_auto or self.reasoning_parser:
                    assert previous_texts is not None
                    assert all_previous_token_ids is not None
                    previous_text = previous_texts[i]
@@ -603,7 +592,7 @@ class OpenAIServingChat(OpenAIServing):

                # handle streaming deltas for tools with named tool_choice
                if tool_choice_function_name:
-                    if (self.enable_reasoning
+                    if (self.reasoning_parser
                            and not reasoning_parser.is_reasoning_end(
                                previous_token_ids)):
                        assert reasoning_parser is not None
@@ -630,7 +619,7 @@ class OpenAIServingChat(OpenAIServing):
                            current_text = ""
                    else:
                        # Just to add remaining `content`
-                        if self.enable_reasoning:
+                        if self.reasoning_parser:
                            delta_text = previous_text + delta_text
                            current_text = ""
@@ -660,7 +649,7 @@ class OpenAIServingChat(OpenAIServing):

                # handle streaming deltas for tools with "auto" tool choice
                # and reasoning parser
-                elif tool_choice_auto and self.enable_reasoning:
+                elif tool_choice_auto and self.reasoning_parser:
                    assert tool_parser is not None
                    assert reasoning_parser is not None
                    assert added_content_delta_arr is not None
@@ -728,8 +717,7 @@ class OpenAIServingChat(OpenAIServing):
                            delta_token_ids=output.token_ids,
                            request=request))

                # when only reasoning
-                elif self.enable_reasoning:
-                    assert reasoning_parser is not None
+                elif self.reasoning_parser:
                    delta_message = (reasoning_parser.
                                     extract_reasoning_content_streaming(
                                         previous_text,
@@ -744,7 +732,7 @@ class OpenAIServingChat(OpenAIServing):
                    delta_message = DeltaMessage(content=delta_text)

                # update the previous values for the next iteration
-                if tool_choice_auto or should_stream_with_reasoning_parsing:
+                if tool_choice_auto or self.reasoning_parser:
                    assert previous_texts is not None
                    assert all_previous_token_ids is not None
                    previous_texts[i] = current_text
@@ -931,17 +919,9 @@ class OpenAIServingChat(OpenAIServing):
                )
            else:
                logprobs = None

-            should_stream_with_reasoning_parsing = (
-                self._should_stream_with_reasoning_parsing(request))

            # In the OpenAI API the finish_reason is "tools_called"
            # if the tool choice is auto and the model produced a tool
            # call. The same is not true for named function calls
            auto_tools_called = False

-            if should_stream_with_reasoning_parsing and \
-                    self.reasoning_parser is not None:
+            if self.reasoning_parser:
                try:
                    reasoning_parser = self.reasoning_parser(tokenizer)
                except RuntimeError as e:
@@ -1176,17 +1156,6 @@ class OpenAIServingChat(OpenAIServing):
        return (request.tools and self.tool_parser and self.enable_auto_tools
                and request.tool_choice in ['auto', None])

-    def _should_stream_with_reasoning_parsing(self,
-                                              request: ChatCompletionRequest):
-        """
-        Utility function to check if streamed tokens should go through the
-        reasoning parser that was configured.
-
-        We only want to do this IF reasoning is enabled and a reasoning
-        parser is configured.
-        """
-        return self.enable_reasoning and self.reasoning_parser is not None

    def _should_check_for_unstreamed_tool_arg_tokens(
        self,
        delta_message: Optional[DeltaMessage],

View File

@@ -103,7 +103,7 @@ async def get_guided_decoding_logits_processor(
        reasoning_backend: str | None = None) -> LogitsProcessor | None:

    reasoner = None
-    if reasoning_backend is not None:
+    if reasoning_backend:
        reasoner_class = ReasoningParserManager.get_reasoning_parser(
            reasoning_backend)
        reasoner = reasoner_class(tokenizer)
@@ -146,7 +146,7 @@ def get_local_guided_decoding_logits_processor(
    guided_params = maybe_backend_fallback(guided_params)

    reasoner = None
-    if reasoning_backend is not None:
+    if reasoning_backend:
        reasoner_class = ReasoningParserManager.get_reasoning_parser(
            reasoning_backend)
        reasoner = reasoner_class(tokenizer)

View File

@@ -61,7 +61,7 @@ class BaseLogitsProcessor:
        """Use the FSM to bias the logits before sampling the next token."""

        # Skip the structured logits processing if reasoning is not finished.
-        # reasoner is not None only when `--enable-reasoning` is set.
+        # reasoner is not None only when `--reasoning-parser` is set.
        if self._reasoner is not None:
            if not self._reasoner.is_reasoning_end(input_ids):
                return scores

View File

@@ -346,7 +346,7 @@ class XGrammarLogitsProcessor:
                 scores: torch.Tensor) -> torch.Tensor:

        # Skip the structured logits processing if reasoning is not finished.
-        # reasoner is not None only when `--enable-reasoning` is set.
+        # reasoner is not None only when `--reasoning-parser` is set.
        if self.reasoner is not None and \
                not self.reasoner.is_reasoning_end(
                    input_ids):