[Feature][Frontend]: Deprecate --enable-reasoning (#17452)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
This commit is contained in:
parent
f5a3c655b2
commit
98060b001d
@@ -21,11 +21,10 @@ vLLM currently supports the following reasoning models:
 
 ## Quickstart
 
-To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
+To use reasoning models, you need to specify the `--reasoning-parser` flag when starting the server. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
 
 ```bash
-vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
 ```
 
 Next, make a request to the model that should return the reasoning content in the response.
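For orientation, a minimal client call against the server started above might look like the following sketch. The host, port, and prompt are illustrative assumptions, not part of this commit; `reasoning_content` is the extra message field the reasoning parser populates.

```python
from openai import OpenAI

# Assumed default host/port for a local `vllm serve` instance.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
)

message = response.choices[0].message
print("reasoning_content:", message.reasoning_content)
print("content:", message.content)
```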
@@ -140,8 +139,7 @@ Remember to check whether the `reasoning_content` exists in the response before
 The reasoning content is also available in the structured output. A structured output engine like `xgrammar` will use the reasoning content to generate structured output. This is currently supported only in the v0 engine.
 
 ```bash
-VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
 ```
 
 Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine.
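A sketch of that structured-output path from the client side follows; the schema and prompt are made up, and `guided_json` is passed through the OpenAI client's `extra_body`. Reasoning is generated unconstrained first, and only the final content is forced to match the schema.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Illustrative schema; any JSON schema accepted by the backend works here.
json_schema = {
    "type": "object",
    "properties": {"answer": {"type": "number"}},
    "required": ["answer"],
}

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "What is 2 + 2?"}],
    extra_body={"guided_json": json_schema},
)

message = response.choices[0].message
print(message.reasoning_content)  # free-form reasoning
print(message.content)            # JSON constrained by the schema
```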
@@ -316,9 +314,8 @@ class DeepSeekReasoner(Reasoner):
 
 The structured output engine like `xgrammar` will use `end_token_id` to check whether the reasoning content is present in the model output, and will skip the structured output if that is the case.
 
-Finally, you can enable reasoning for the model by using the `--enable-reasoning` and `--reasoning-parser` flags.
+Finally, you can enable reasoning for the model by using the `--reasoning-parser` flag.
 
 ```bash
-vllm serve <model_tag> \
-    --enable-reasoning --reasoning-parser example
+vllm serve <model_tag> --reasoning-parser example
 ```
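To make the `end_token_id` mechanic concrete, here is a minimal, self-contained sketch of the interface a reasoner exposes to the structured output engine; the class and token ids are illustrative, not the vLLM implementation.

```python
from dataclasses import dataclass


@dataclass
class SimpleReasoner:
    """Illustrative stand-in for a registered reasoning parser."""
    start_token_id: int  # e.g. the token id of "<think>"
    end_token_id: int    # e.g. the token id of "</think>"

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # Structured output stays disabled until the end-of-reasoning
        # token has been generated.
        return self.end_token_id in input_ids


reasoner = SimpleReasoner(start_token_id=100, end_token_id=101)
assert not reasoner.is_reasoning_end([100, 5, 6])
assert reasoner.is_reasoning_end([100, 5, 6, 101, 7])
```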
@@ -9,7 +9,7 @@ parser:
 
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```
 
 This example demonstrates how to generate chat completions from reasoning models
@@ -9,7 +9,7 @@ the reasoning parser and tool calling enabled.
 
 ```bash
 vllm serve Qwen/QwQ-32B \
-    --enable-reasoning --reasoning-parser deepseek_r1 \
+    --reasoning-parser deepseek_r1 \
     --enable-auto-tool-choice --tool-call-parser hermes
 ```
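A corresponding client call for this tool-calling setup could look like the sketch below; the tool definition and prompt are invented for illustration. With both parsers configured, the returned message carries `reasoning_content` from the reasoning parser and `tool_calls` from the tool parser.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# A toy tool; name and schema are illustrative only.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="Qwen/QwQ-32B",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    tool_choice="auto",
)

message = response.choices[0].message
print(message.reasoning_content)  # from --reasoning-parser deepseek_r1
print(message.tool_calls)         # from --tool-call-parser hermes
```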
@@ -8,7 +8,7 @@ with the reasoning parser:
 
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```
 
 This example demonstrates how to generate chat completions from reasoning models
@@ -8,7 +8,7 @@ parser:
 
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```
 
 Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
@@ -13,9 +13,9 @@ MODEL_NAME = "Qwen/QwQ-32B"
 @pytest.fixture(scope="module")
 def server():  # noqa: F811
     args = [
-        "--max-model-len", "8192", "--enforce-eager", "--enable-reasoning",
-        "--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice",
-        "--tool-call-parser", "hermes"
+        "--max-model-len", "8192", "--enforce-eager", "--reasoning-parser",
+        "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser",
+        "hermes"
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -122,31 +122,23 @@ def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
     """Ensure validation fails if reasoning is enabled with auto tool choice"""
     args = serve_parser.parse_args(args=[
         "--enable-auto-tool-choice",
-        "--enable-reasoning",
+        "--reasoning-parser",
+        "deepseek_r1",
     ])
     with pytest.raises(TypeError):
         validate_parsed_serve_args(args)
 
 
-def test_enable_reasoning_passes_with_reasoning_parser(serve_parser):
+def test_passes_with_reasoning_parser(serve_parser):
     """Ensure validation passes if reasoning is enabled
     with a reasoning parser"""
     args = serve_parser.parse_args(args=[
-        "--enable-reasoning",
         "--reasoning-parser",
         "deepseek_r1",
     ])
     validate_parsed_serve_args(args)
 
 
-def test_enable_reasoning_fails_without_reasoning_parser(serve_parser):
-    """Ensure validation fails if reasoning is enabled
-    without a reasoning parser"""
-    args = serve_parser.parse_args(args=["--enable-reasoning"])
-    with pytest.raises(TypeError):
-        validate_parsed_serve_args(args)
-
-
 def test_chat_template_validation_for_happy_paths(serve_parser):
     """Ensure validation passes if the chat template exists"""
     args = serve_parser.parse_args(
@@ -3225,10 +3225,9 @@ class DecodingConfig:
     in the JSON schema. This is only supported for the `guidance` backend and
     is used to better align its behaviour with `outlines` and `xgrammar`."""
 
-    reasoning_backend: Optional[str] = None
+    reasoning_backend: str = ""
     """Select the reasoning parser depending on the model that you're using.
-    This is used to parse the reasoning content into OpenAI API format.
-    Required for `--enable-reasoning`."""
+    This is used to parse the reasoning content into OpenAI API format."""
 
     def compute_hash(self) -> str:
         """
@@ -365,8 +365,9 @@ class EngineArgs:
     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
 
     additional_config: Optional[Dict[str, Any]] = None
-    enable_reasoning: Optional[bool] = None
-    reasoning_parser: Optional[str] = DecodingConfig.reasoning_backend
+    enable_reasoning: Optional[bool] = None  # DEPRECATED
+    reasoning_parser: str = DecodingConfig.reasoning_backend
 
     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
 
     def __post_init__(self):
@@ -798,8 +799,15 @@ class EngineArgs:
             "--enable-reasoning",
             action="store_true",
             default=False,
-            help="Whether to enable reasoning_content for the model. "
-            "If enabled, the model will be able to generate reasoning content."
+            help=
+            "[DEPRECATED] "
+            "The --enable-reasoning flag is deprecated as of v0.8.6. "
+            "Use --reasoning-parser to specify "
+            "the reasoning parser backend instead. "
+            "This flag (--enable-reasoning) will be "
+            "removed in v0.10.0. "
+            "When --reasoning-parser is specified, "
+            "reasoning mode is automatically enabled."
         )
 
         return parser
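The hunk above keeps the flag parseable while steering users to the replacement. A generic, self-contained sketch of that deprecation pattern (not the vLLM code itself) is:

```python
import argparse
import warnings

parser = argparse.ArgumentParser()
parser.add_argument("--enable-reasoning", action="store_true", default=False,
                    help="[DEPRECATED] Use --reasoning-parser instead.")
parser.add_argument("--reasoning-parser", type=str, default="")

args = parser.parse_args(["--enable-reasoning",
                          "--reasoning-parser", "deepseek_r1"])
if args.enable_reasoning:
    # Accept the old flag for now, but warn; --reasoning-parser alone
    # is what actually enables reasoning.
    warnings.warn("--enable-reasoning is deprecated and will be removed; "
                  "--reasoning-parser implies it.", DeprecationWarning)
```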
@@ -1088,7 +1096,6 @@ class EngineArgs:
             disable_additional_properties=\
                 self.guided_decoding_disable_additional_properties,
             reasoning_backend=self.reasoning_parser
-            if self.enable_reasoning else None,
         )
 
         observability_config = ObservabilityConfig(
@@ -2096,7 +2096,7 @@ class LLMEngine:
             guided_decoding.backend = guided_decoding.backend or \
                 self.decoding_config.backend
 
-        if self.decoding_config.reasoning_backend is not None:
+        if self.decoding_config.reasoning_backend:
             logger.debug("Building with reasoning backend %s",
                          self.decoding_config.reasoning_backend)
 
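This truthiness check works because the default changed from `None` to `""` in `DecodingConfig` above: both "not configured" values are falsy, so a single `if self.decoding_config.reasoning_backend:` covers them. A short demonstration:

```python
for backend in (None, "", "deepseek_r1"):
    # Mirrors the new-style check `if reasoning_backend:`.
    print(f"{backend!r:>14} -> reasoning "
          f"{'enabled' if backend else 'disabled'}")
```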
@@ -967,7 +967,6 @@ async def init_app_state(
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
         enable_auto_tools=args.enable_auto_tool_choice,
         tool_parser=args.tool_call_parser,
-        enable_reasoning=args.enable_reasoning,
         reasoning_parser=args.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
     ) if model_config.runner_type == "generate" else None
@@ -1053,7 +1052,7 @@ async def run_server(args, **uvicorn_kwargs) -> None:
                         f"(chose from {{ {','.join(valid_tool_parses)} }})")
 
     valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
-    if args.enable_reasoning \
+    if args.reasoning_parser \
             and args.reasoning_parser not in valid_reasoning_parses:
         raise KeyError(
             f"invalid reasoning parser: {args.reasoning_parser} "
@@ -284,11 +284,6 @@ def validate_parsed_serve_args(args: argparse.Namespace):
         raise TypeError("Error: --enable-auto-tool-choice requires "
                         "--tool-call-parser")
 
-    # Enable reasoning needs a reasoning parser to be valid
-    if args.enable_reasoning and not args.reasoning_parser:
-        raise TypeError("Error: --enable-reasoning requires "
-                        "--reasoning-parser")
-
 
 def create_parser_for_docs() -> FlexibleArgumentParser:
     parser_for_docs = FlexibleArgumentParser(
@@ -58,8 +58,7 @@ class OpenAIServingChat(OpenAIServing):
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
         return_tokens_as_token_ids: bool = False,
-        enable_reasoning: bool = False,
-        reasoning_parser: Optional[str] = None,
+        reasoning_parser: str = "",
         enable_auto_tools: bool = False,
         tool_parser: Optional[str] = None,
         enable_prompt_tokens_details: bool = False,
@@ -82,18 +81,17 @@ class OpenAIServingChat(OpenAIServing):
                 " the parallel_tool_calls client option is preset for "
                 "compatibility reasons, it will be ignored.")
 
-        self.enable_reasoning: bool = enable_reasoning
         self.reasoning_parser: Optional[Callable[[AnyTokenizer],
                                                  ReasoningParser]] = None
-        if self.enable_reasoning:
+        if reasoning_parser:
             try:
                 self.reasoning_parser = (
                     ReasoningParserManager.get_reasoning_parser(
                         reasoning_parser))
+                assert self.reasoning_parser is not None
             except Exception as e:
-                raise TypeError("Error: --enable-reasoning requires "
-                                f"reasoning_parser:'{reasoning_parser}' "
-                                "which has not been registered") from e
+                raise TypeError(
+                    f"{reasoning_parser=} has not been registered") from e
         self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
         if self.enable_auto_tools:
             try:
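Note the two-stage pattern preserved here: `self.reasoning_parser` stores a factory (a callable taking a tokenizer), and a per-request instance is created later via `reasoning_parser = self.reasoning_parser(tokenizer)`. A minimal illustration with made-up names:

```python
from typing import Callable, Optional


class DummyReasoningParser:
    """Stand-in for a registered parser class."""

    def __init__(self, tokenizer: str):
        self.tokenizer = tokenizer


REGISTRY: dict[str, type] = {"deepseek_r1": DummyReasoningParser}

name = "deepseek_r1"
factory: Optional[Callable] = REGISTRY.get(name)
if factory is None:
    raise TypeError(f"reasoning_parser={name!r} has not been registered")

# Instantiated once per request, with that request's tokenizer.
parser = factory("tokenizer-for-this-request")
```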
@@ -423,15 +421,12 @@ class OpenAIServingChat(OpenAIServing):
             not tool_choice_function_name
             and self._should_stream_with_auto_tool_parsing(request))
 
-        should_stream_with_reasoning_parsing = (
-            self._should_stream_with_reasoning_parsing(request))
-
         all_previous_token_ids: Optional[list[list[int]]]
         function_name_returned: Optional[list[bool]] = None
 
         # Only one of these will be used, thus previous_texts and
         # all_previous_token_ids will not be used twice in the same iteration.
-        if tool_choice_auto or should_stream_with_reasoning_parsing:
+        if tool_choice_auto or self.reasoning_parser:
             # These are only required in "auto" tool choice case
             previous_texts = [""] * num_choices
             all_previous_token_ids = [[]] * num_choices
@@ -446,12 +441,7 @@ class OpenAIServingChat(OpenAIServing):
             previous_texts, all_previous_token_ids = None, None
 
         try:
-            # There is no need to check if the reasoning_parser is None
-            # because the should_stream_with_reasoning_parsing check
-            # already ensures that the reasoning_parser is not None.
-            # but the pre-commit hook requires it.
-            if should_stream_with_reasoning_parsing and \
-                    self.reasoning_parser is not None:
+            if self.reasoning_parser:
                 reasoning_parser = self.reasoning_parser(tokenizer)
         except RuntimeError as e:
             logger.exception("Error in reasoning parser creation.")
@@ -459,7 +449,6 @@ class OpenAIServingChat(OpenAIServing):
             yield f"data: {data}\n\n"
             yield "data: [DONE]\n\n"
             return
-
         # Prepare the tool parser if it's needed
         try:
             if tool_choice_auto and self.tool_parser:
@@ -592,7 +581,7 @@ class OpenAIServingChat(OpenAIServing):
                 delta_message: Optional[DeltaMessage]
 
                 # just update previous_texts and previous_token_ids
-                if tool_choice_auto or should_stream_with_reasoning_parsing:
+                if tool_choice_auto or self.reasoning_parser:
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]
@@ -603,7 +592,7 @@ class OpenAIServingChat(OpenAIServing):
 
                 # handle streaming deltas for tools with named tool_choice
                 if tool_choice_function_name:
-                    if (self.enable_reasoning
+                    if (self.reasoning_parser
                             and not reasoning_parser.is_reasoning_end(
                                 previous_token_ids)):
                         assert reasoning_parser is not None
@@ -630,7 +619,7 @@ class OpenAIServingChat(OpenAIServing):
                         current_text = ""
                     else:
                         # Just to add remaining `content`
-                        if self.enable_reasoning:
+                        if self.reasoning_parser:
                             delta_text = previous_text + delta_text
                             current_text = ""
 
@@ -660,7 +649,7 @@ class OpenAIServingChat(OpenAIServing):
 
                 # handle streaming deltas for tools with "auto" tool choice
                 # and reasoning parser
-                elif tool_choice_auto and self.enable_reasoning:
+                elif tool_choice_auto and self.reasoning_parser:
                     assert tool_parser is not None
                     assert reasoning_parser is not None
                     assert added_content_delta_arr is not None
@@ -728,8 +717,7 @@ class OpenAIServingChat(OpenAIServing):
                             delta_token_ids=output.token_ids,
                             request=request))
                 # when only reasoning
-                elif self.enable_reasoning:
-                    assert reasoning_parser is not None
+                elif self.reasoning_parser:
                     delta_message = (reasoning_parser.
                                      extract_reasoning_content_streaming(
                                          previous_text,
@@ -744,7 +732,7 @@ class OpenAIServingChat(OpenAIServing):
                     delta_message = DeltaMessage(content=delta_text)
 
                 # update the previous values for the next iteration
-                if tool_choice_auto or should_stream_with_reasoning_parsing:
+                if tool_choice_auto or self.reasoning_parser:
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_texts[i] = current_text
@@ -931,17 +919,9 @@ class OpenAIServingChat(OpenAIServing):
             )
         else:
             logprobs = None
 
-        should_stream_with_reasoning_parsing = (
-            self._should_stream_with_reasoning_parsing(request))
-
-        # In the OpenAI API the finish_reason is "tools_called"
-        # if the tool choice is auto and the model produced a tool
-        # call. The same is not true for named function calls
         auto_tools_called = False
 
-        if should_stream_with_reasoning_parsing and \
-                self.reasoning_parser is not None:
+        if self.reasoning_parser:
             try:
                 reasoning_parser = self.reasoning_parser(tokenizer)
             except RuntimeError as e:
@@ -1176,17 +1156,6 @@ class OpenAIServingChat(OpenAIServing):
         return (request.tools and self.tool_parser and self.enable_auto_tools
                 and request.tool_choice in ['auto', None])
 
-    def _should_stream_with_reasoning_parsing(self,
-                                              request: ChatCompletionRequest):
-        """
-        Utility function to check if streamed tokens should go through the
-        reasoning parser that was configured.
-
-        We only want to do this IF reasoning is enabled and a reasoning
-        parser is configured.
-        """
-        return self.enable_reasoning and self.reasoning_parser is not None
-
     def _should_check_for_unstreamed_tool_arg_tokens(
         self,
         delta_message: Optional[DeltaMessage],
@@ -103,7 +103,7 @@ async def get_guided_decoding_logits_processor(
         reasoning_backend: str | None = None) -> LogitsProcessor | None:
 
     reasoner = None
-    if reasoning_backend is not None:
+    if reasoning_backend:
        reasoner_class = ReasoningParserManager.get_reasoning_parser(
            reasoning_backend)
        reasoner = reasoner_class(tokenizer)
@@ -146,7 +146,7 @@ def get_local_guided_decoding_logits_processor(
     guided_params = maybe_backend_fallback(guided_params)
 
     reasoner = None
-    if reasoning_backend is not None:
+    if reasoning_backend:
        reasoner_class = ReasoningParserManager.get_reasoning_parser(
            reasoning_backend)
        reasoner = reasoner_class(tokenizer)
@@ -61,7 +61,7 @@ class BaseLogitsProcessor:
         """Use the FSM to bias the logits before sampling the next token."""
 
         # Skip the structured logits processing if reasoning is not finished.
-        # reasoner is not None only when `--enable-reasoning` is set.
+        # reasoner is not None only when `--reasoning-parser` is set.
         if self._reasoner is not None:
             if not self._reasoner.is_reasoning_end(input_ids):
                 return scores
@@ -346,7 +346,7 @@ class XGrammarLogitsProcessor:
                      scores: torch.Tensor) -> torch.Tensor:
 
         # Skip the structured logits processing if reasoning is not finished.
-        # reasoner is not None only when `--enable-reasoning` is set.
+        # reasoner is not None only when `--reasoning-parser` is set.
         if self.reasoner is not None and \
                 not self.reasoner.is_reasoning_end(
                     input_ids):
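Both hunks share the same guard: while reasoning has not ended, the grammar mask is skipped and the logits pass through untouched. A self-contained sketch of that control flow (the reasoner interface follows the illustrative `SimpleReasoner` from earlier; the actual masking step is elided):

```python
import torch


class SimpleReasoner:
    def __init__(self, end_token_id: int):
        self.end_token_id = end_token_id

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        return self.end_token_id in input_ids


def bias_logits(reasoner, input_ids: list[int],
                scores: torch.Tensor) -> torch.Tensor:
    if reasoner is not None and not reasoner.is_reasoning_end(input_ids):
        return scores  # still reasoning: leave the distribution untouched
    # ...a real processor would apply the FSM/grammar mask to `scores` here...
    return scores


scores = torch.zeros(8)
print(bias_logits(SimpleReasoner(end_token_id=7), [1, 2, 3], scores))
```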