commit d9ab1ad9d1 (parent 608bb14462)

reasoning_content -> reasoning (#27752)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
@@ -2,7 +2,10 @@
 vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.

-Reasoning models return an additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
+Reasoning models return an additional `reasoning` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
+
+!!! warning
+    `reasoning` used to be called `reasoning_content`. For now, `reasoning_content` will continue to work. However, we encourage you to migrate to `reasoning` in case `reasoning_content` is removed in the future.

 ## Supported Models
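For clients that must work on both sides of this rename during the deprecation window, here is a minimal forward-compatible sketch (assuming an OpenAI-compatible vLLM server on localhost; the fallback chain itself is illustrative, not part of the documented API):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
model = client.models.list().data[0].id

response = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
)
message = response.choices[0].message

# Prefer the new `reasoning` field and fall back to the deprecated
# `reasoning_content`; both may be absent on non-reasoning models,
# hence getattr with a None default.
reasoning = getattr(message, "reasoning", None) or getattr(
    message, "reasoning_content", None
)
print("reasoning:", reasoning)
print("content:", message.content)
```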
@@ -61,18 +64,18 @@ Next, make a request to the model that should return the reasoning content in th
     # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
     response = client.chat.completions.create(model=model, messages=messages)

-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content

-    print("reasoning_content:", reasoning_content)
+    print("reasoning:", reasoning)
     print("content:", content)
     ```

-The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
+The `reasoning` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.

 ## Streaming chat completions

-Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
+Streaming chat completions are also supported for reasoning models. The `reasoning` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).

 ??? console "Json"
@@ -88,7 +91,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
             "index": 0,
             "delta": {
                 "role": "assistant",
-                "reasoning_content": "is",
+                "reasoning": "is",
             },
             "logprobs": null,
             "finish_reason": null
@@ -97,7 +100,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
     }
     ```

-OpenAI Python client library does not officially support `reasoning_content` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning_content` attribute is present in the response. For example:
+The OpenAI Python client library does not officially support the `reasoning` attribute for streaming output, but the client does support extra attributes in the response. You can use `hasattr` to check whether the `reasoning` attribute is present. For example:

 ??? code
@@ -127,22 +130,22 @@ OpenAI Python client library does not officially support `reasoning_content` att
     )

    print("client: Start streaming chat completions...")
-    printed_reasoning_content = False
+    printed_reasoning = False
     printed_content = False

     for chunk in stream:
-        # Safely extract reasoning_content and content from delta,
+        # Safely extract reasoning and content from delta,
         # defaulting to None if attributes don't exist or are empty strings
-        reasoning_content = (
-            getattr(chunk.choices[0].delta, "reasoning_content", None) or None
+        reasoning = (
+            getattr(chunk.choices[0].delta, "reasoning", None) or None
         )
         content = getattr(chunk.choices[0].delta, "content", None) or None

-        if reasoning_content is not None:
-            if not printed_reasoning_content:
-                printed_reasoning_content = True
-                print("reasoning_content:", end="", flush=True)
-            print(reasoning_content, end="", flush=True)
+        if reasoning is not None:
+            if not printed_reasoning:
+                printed_reasoning = True
+                print("reasoning:", end="", flush=True)
+            print(reasoning, end="", flush=True)
         elif content is not None:
             if not printed_content:
                 printed_content = True
@@ -151,11 +154,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
                 print(content, end="", flush=True)
     ```

-Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
+Remember to check whether `reasoning` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
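For the non-streaming case, the same guard is a one-liner (a sketch; `response` is assumed to come from a request like the one shown earlier):

```python
message = response.choices[0].message
# getattr guards against servers or models that return no reasoning at all.
if getattr(message, "reasoning", None):
    print("reasoning:", message.reasoning)
```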
 ## Tool Calling

-The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
+The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning`.

 ??? code

@@ -192,7 +195,7 @@ The reasoning content is also available when both tool calling and the reasoning
     print(response)
     tool_call = response.choices[0].message.tool_calls[0].function

-    print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
+    print(f"reasoning: {response.choices[0].message.reasoning}")
     print(f"Function called: {tool_call.name}")
     print(f"Arguments: {tool_call.arguments}")
     ```
@@ -223,7 +226,7 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso
     def __init__(self, tokenizer: AnyTokenizer):
         super().__init__(tokenizer)

-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -240,7 +243,7 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso
             previously been parsed and extracted (see constructor)
         """

-    def extract_reasoning_content(
+    def extract_reasoning(
         self,
         model_output: str,
         request: ChatCompletionRequest | ResponsesRequest,
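A minimal sketch of a custom parser under the renamed interface (the `example` registry name, the `</my:think>` end tag, and the fixed-tag parsing are illustrative assumptions, not vLLM's actual DeepSeek-R1 implementation; the streaming and token-level hooks are omitted for brevity):

```python
from vllm.reasoning import ReasoningParser, ReasoningParserManager


@ReasoningParserManager.register_module("example")
class ExampleReasoningParser(ReasoningParser):
    """Splits model output on a single assumed end tag."""

    END_TAG = "</my:think>"  # hypothetical model-specific end marker

    def extract_reasoning(self, model_output, request):
        # No end tag seen: treat the whole output as reasoning.
        if self.END_TAG not in model_output:
            return model_output, None
        reasoning, _, content = model_output.partition(self.END_TAG)
        return reasoning, content or None
```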
@@ -204,7 +204,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th
             }
         },
     )
-    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+    print("reasoning: ", completion.choices[0].message.reasoning)
     print("content: ", completion.choices[0].message.content)
     ```
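A sketch of the combination this hunk comes from, constraining only the final answer while the reasoning stays free-form (the prompt and the use of the standard `response_format` parameter are illustrative; the exact schema mechanism in the surrounding example may differ):

```python
completion = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "user",
            "content": "Classify the sentiment of 'vLLM is wonderful!' "
            "and answer as a JSON object with a single 'sentiment' key.",
        }
    ],
    # Structured outputs constrain `content` only; `reasoning` is unconstrained.
    response_format={"type": "json_object"},
)
print("reasoning: ", completion.choices[0].message.reasoning)
print("content: ", completion.choices[0].message.content)
```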
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example demonstrates how to use tool calling with reasoning models
-like QwQ-32B. The reasoning_content will not be parsed by the tool
+like QwQ-32B. The reasoning will not be parsed by the tool
 calling process; only the final output will be parsed.

 To run this example, you need to start the vLLM server with both
@@ -78,7 +78,7 @@ messages = [


 def extract_reasoning_and_calls(chunks: list):
-    reasoning_content = ""
+    reasoning = ""
     tool_call_idx = -1
     arguments = []
     function_names = []
@@ -97,9 +97,9 @@ def extract_reasoning_and_calls(chunks: list):
             if tool_call.function.arguments:
                 arguments[tool_call_idx] += tool_call.function.arguments
         else:
-            if hasattr(chunk.choices[0].delta, "reasoning_content"):
-                reasoning_content += chunk.choices[0].delta.reasoning_content
-    return reasoning_content, arguments, function_names
+            if hasattr(chunk.choices[0].delta, "reasoning"):
+                reasoning += chunk.choices[0].delta.reasoning
+    return reasoning, arguments, function_names


 def main():
@@ -115,7 +115,7 @@ def main():
     tool_calls = client.chat.completions.create(
         messages=messages, model=model, tools=tools
     )
-    print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+    print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
     print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}")
     print(
         f"function arguments: "
@@ -129,9 +129,9 @@ def main():

     chunks = list(tool_calls_stream)

-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)

-    print(f"reasoning_content: {reasoning_content}")
+    print(f"reasoning: {reasoning}")
     print(f"function name: {function_names[0]}")
     print(f"function arguments: {arguments[0]}")
@@ -144,7 +144,7 @@ def main():
     )

     tool_call = tool_calls.choices[0].message.tool_calls[0].function
-    print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+    print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
     print(f"function name: {tool_call.name}")
     print(f"function arguments: {tool_call.arguments}")
     print("----------Stream Generate With Named Function Calling--------------")
@@ -159,8 +159,8 @@ def main():

     chunks = list(tool_calls_stream)

-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
-    print(f"reasoning_content: {reasoning_content}")
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
+    print(f"reasoning: {reasoning}")
     print(f"function name: {function_names[0]}")
     print(f"function arguments: {arguments[0]}")
     print("\n\n")
@@ -38,10 +38,10 @@ def main():
     # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
     response = client.chat.completions.create(model=model, messages=messages)

-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content

-    print("reasoning_content for Round 1:", reasoning_content)
+    print("reasoning for Round 1:", reasoning)
     print("content for Round 1:", content)

     # Round 2
@@ -54,10 +54,10 @@ def main():
     )
     response = client.chat.completions.create(model=model, messages=messages)

-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content

-    print("reasoning_content for Round 2:", reasoning_content)
+    print("reasoning for Round 2:", reasoning)
     print("content for Round 2:", content)
@@ -20,7 +20,7 @@ in real-time as they are generated by the model. This is useful for scenarios
 where you want to display chat completions to the user as they are generated
 by the model.

-Remember to check content and reasoning_content exist in `ChatCompletionChunk`,
+Remember to check content and reasoning exist in `ChatCompletionChunk`,
 content may not exist leading to errors if you try to access it.
 """

@@ -47,22 +47,20 @@ def main():
     stream = client.chat.completions.create(model=model, messages=messages, stream=True)

     print("client: Start streaming chat completions...")
-    printed_reasoning_content = False
+    printed_reasoning = False
     printed_content = False

     for chunk in stream:
-        # Safely extract reasoning_content and content from delta,
+        # Safely extract reasoning and content from delta,
         # defaulting to None if attributes don't exist or are empty strings
-        reasoning_content = (
-            getattr(chunk.choices[0].delta, "reasoning_content", None) or None
-        )
+        reasoning = getattr(chunk.choices[0].delta, "reasoning", None) or None
         content = getattr(chunk.choices[0].delta, "content", None) or None

-        if reasoning_content is not None:
-            if not printed_reasoning_content:
-                printed_reasoning_content = True
-                print("reasoning_content:", end="", flush=True)
-            print(reasoning_content, end="", flush=True)
+        if reasoning is not None:
+            if not printed_reasoning:
+                printed_reasoning = True
+                print("reasoning:", end="", flush=True)
+            print(reasoning, end="", flush=True)
         elif content is not None:
             if not printed_content:
                 printed_content = True
@@ -159,8 +159,8 @@ def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None
     for chunk in response:
         delta = chunk.choices[0].delta
         # Stream reasoning first
-        if reason and hasattr(delta, "reasoning_content") and live_think:
-            rc = delta.reasoning_content
+        if reason and hasattr(delta, "reasoning") and live_think:
+            rc = delta.reasoning
             if rc:
                 think_text += rc
                 live_think.markdown(think_text + "▌")
@@ -262,8 +262,8 @@ def server_supports_reasoning():
         messages=[{"role": "user", "content": "Hi"}],
         stream=False,
     )
-    return hasattr(resp.choices[0].message, "reasoning_content") and bool(
-        resp.choices[0].message.reasoning_content
+    return hasattr(resp.choices[0].message, "reasoning") and bool(
+        resp.choices[0].message.reasoning
     )
@@ -33,7 +33,7 @@ async def print_stream_response(
     async for chunk in stream_response:
         delta = chunk.choices[0].delta

-        reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None)
+        reasoning_chunk_text: str | None = getattr(delta, "reasoning", None)
         content_chunk_text = delta.content

         if args.reasoning:
@@ -255,8 +255,8 @@ async def cli():
     for constraint, response in zip(constraints, results):
         print(f"\n\n{constraint}:")
         message = response.choices[0].message
-        if args.reasoning and hasattr(message, "reasoning_content"):
-            print(f" Reasoning: {message.reasoning_content or ''}")
+        if args.reasoning and hasattr(message, "reasoning"):
+            print(f" Reasoning: {message.reasoning or ''}")
         print(f" Content: {message.content!r}")
@@ -80,7 +80,7 @@ FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""


 def extract_reasoning_and_calls(chunks: list):
-    reasoning_content = ""
+    reasoning = ""
     tool_call_idx = -1
     arguments = []
     function_names = []
@@ -99,9 +99,9 @@ def extract_reasoning_and_calls(chunks: list):
             if tool_call.function.arguments:
                 arguments[tool_call_idx] += tool_call.function.arguments
         else:
-            if hasattr(chunk.choices[0].delta, "reasoning_content"):
-                reasoning_content += chunk.choices[0].delta.reasoning_content
-    return reasoning_content, arguments, function_names
+            if hasattr(chunk.choices[0].delta, "reasoning"):
+                reasoning += chunk.choices[0].delta.reasoning
+    return reasoning, arguments, function_names


 # test streaming
@@ -119,8 +119,8 @@ async def test_chat_streaming_of_tool_and_reasoning(client: openai.AsyncOpenAI):
     async for chunk in stream:
         chunks.append(chunk)

-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
-    assert len(reasoning_content) > 0
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
+    assert len(reasoning) > 0
     assert len(function_names) > 0 and function_names[0] == FUNC_NAME
     assert len(arguments) > 0 and arguments[0] == FUNC_ARGS

@@ -136,6 +136,6 @@ async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
         stream=False,
     )

-    assert len(tool_calls.choices[0].message.reasoning_content) > 0
+    assert len(tool_calls.choices[0].message.reasoning) > 0
     assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
     assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS
@@ -180,8 +180,8 @@ async def test_function_tool_use(
         extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
     )
     if enable_thinking:
-        assert chat_completion.choices[0].message.reasoning_content is not None
-        assert chat_completion.choices[0].message.reasoning_content != ""
+        assert chat_completion.choices[0].message.reasoning is not None
+        assert chat_completion.choices[0].message.reasoning != ""
         assert chat_completion.choices[0].message.tool_calls is not None
         assert len(chat_completion.choices[0].message.tool_calls) > 0
     else:
@@ -200,9 +200,9 @@ async def test_function_tool_use(
     async for chunk in output_stream:
         if chunk.choices:
             if enable_thinking and getattr(
-                chunk.choices[0].delta, "reasoning_content", None
+                chunk.choices[0].delta, "reasoning", None
             ):
-                reasoning.append(chunk.choices[0].delta.reasoning_content)
+                reasoning.append(chunk.choices[0].delta.reasoning)
             if chunk.choices[0].delta.tool_calls:
                 output.extend(chunk.choices[0].delta.tool_calls)
@@ -232,9 +232,9 @@ def test_reasoning_parser():
     assert isinstance(line_dict, dict)
     assert line_dict["error"] is None

-    # Check that reasoning_content is present and not empty
-    reasoning_content = line_dict["response"]["body"]["choices"][0]["message"][
-        "reasoning_content"
+    # Check that reasoning is present and not empty
+    reasoning = line_dict["response"]["body"]["choices"][0]["message"][
+        "reasoning"
     ]
-    assert reasoning_content is not None
-    assert len(reasoning_content) > 0
+    assert reasoning is not None
+    assert len(reasoning) > 0
@@ -151,57 +151,57 @@ class TestBaseThinkingReasoningParserMethods:
 class TestBaseThinkingReasoningParserExtraction:
     """Test reasoning content extraction methods."""

-    def test_extract_reasoning_content_with_both_tokens(self, test_tokenizer):
+    def test_extract_reasoning_with_both_tokens(self, test_tokenizer):
         """Test extraction when both start and end tokens are present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")

         model_output = "<test:think>This is reasoning</test:think>This is content"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)

         assert reasoning == "This is reasoning"
         assert content == "This is content"

-    def test_extract_reasoning_content_only_end_token(self, test_tokenizer):
+    def test_extract_reasoning_only_end_token(self, test_tokenizer):
         """Test extraction when only end token is present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")

         model_output = "This is reasoning</test:think>This is content"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)

         assert reasoning == "This is reasoning"
         assert content == "This is content"

-    def test_extract_reasoning_content_no_end_token(self, test_tokenizer):
+    def test_extract_reasoning_no_end_token(self, test_tokenizer):
         """Test extraction when no end token is present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")

         model_output = "This is just content"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)

         assert reasoning == "This is just content"
         assert content is None

-    def test_extract_reasoning_content_empty_output(self, test_tokenizer):
+    def test_extract_reasoning_empty_output(self, test_tokenizer):
         """Test extraction with empty output."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")

         model_output = ""
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)

         assert reasoning == ""
         assert content is None

-    def test_extract_reasoning_content_only_tokens(self, test_tokenizer):
+    def test_extract_reasoning_only_tokens(self, test_tokenizer):
         """Test extraction with only tokens and no content."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")

         model_output = "<test:think></test:think>"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)

         assert reasoning == ""
         assert content is None
@@ -21,97 +21,97 @@ def deepseek_r1_qwen_tokenizer():

 SIMPLE_REASONING = {
     "output": "This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING = {
     "output": "This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES = {
     "output": "This\nThat</think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING = {
     "output": "</think>This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING = {
     "output": "</think>This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 REASONING_WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": "<think>This\nThat</think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
     "output": "</think>This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_WITH_THINK = {
     "output": "</think>This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 THINK_NO_END = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY = {
     "output": "",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY_STREAMING = {
     "output": "",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
     "is_reasoning_end": False,
 }
 NEW_LINE = {
     "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -121,7 +121,7 @@ NEW_LINE = {
 # or not.
 NEW_LINE_STREAMING = {
     "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
-    "reasoning_content": "\nThis is a reasoning section",
+    "reasoning": "\nThis is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -269,7 +269,7 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )

-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]

     # Test is_reasoning_end
@@ -44,14 +44,14 @@ def test_identity_reasoning_parser_basic(tokenizer):
     # Test extract_content_ids returns all input_ids
     assert parser.extract_content_ids(input_ids) == input_ids

-    # Test extract_reasoning_content returns (None, model_output)
+    # Test extract_reasoning returns (None, model_output)
     request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
-    reasoning, content = parser.extract_reasoning_content(input_text, request)
+    reasoning, content = parser.extract_reasoning(input_text, request)
     assert reasoning is None
     assert content == input_text

-    # Test extract_reasoning_content_streaming returns DeltaMessage or None
-    result = parser.extract_reasoning_content_streaming(
+    # Test extract_reasoning_streaming returns DeltaMessage or None
+    result = parser.extract_reasoning_streaming(
         previous_text="",
         current_text="Hello world",
         delta_text="Hello world",
@@ -63,7 +63,7 @@ def test_identity_reasoning_parser_basic(tokenizer):
     assert result.content == "Hello world"

     # If delta_text is empty, should return None
-    result_none = parser.extract_reasoning_content_streaming(
+    result_none = parser.extract_reasoning_streaming(
         previous_text="Hello world",
         current_text="Hello world",
         delta_text="",
@@ -20,36 +20,36 @@ def ernie45_tokenizer():
 # With </think>, non-stream
 WITH_THINK = {
     "output": "abc</think>def",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": "def",
 }
 # With </think>, stream
 WITH_THINK_STREAM = {
     "output": "abc</think>def",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": "def",
 }
-# without </think>, all is reasoning_content
+# without </think>, all is reasoning
 WITHOUT_THINK = {
     "output": "abc",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": None,
 }
-# without </think>, all is reasoning_content
+# without </think>, all is reasoning
 WITHOUT_THINK_STREAM = {
     "output": "abc",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": None,
 }

 COMPLETE_REASONING = {
     "output": "abc</think>",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": None,
 }
 MULTILINE_REASONING = {
     "output": "abc\nABC</think>def\nDEF",
-    "reasoning_content": "abc\nABC",
+    "reasoning": "abc\nABC",
     "content": "def\nDEF",
 }

@@ -120,5 +120,5 @@ def test_reasoning(

     print()

-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
@@ -21,54 +21,54 @@ def glm45_tokenizer():

 WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }

 WITH_THINK_STREAM = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }

 WITHOUT_THINK = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": False,
 }

 WITHOUT_THINK_STREAM = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": False,
 }

 COMPLETE_REASONING = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTILINE_REASONING = {
     "output": "<think>This is a reasoning\nsection</think>This is the rest\nThat",
-    "reasoning_content": "This is a reasoning\nsection",
+    "reasoning": "This is a reasoning\nsection",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 ONLY_OPEN_TAG = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "<think>This is a reasoning section",
     "is_reasoning_end": False,
 }

 ONLY_OPEN_TAG_STREAM = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
@@ -184,7 +184,7 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )

-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]

     output_ids = glm45_tokenizer.convert_tokens_to_ids(output)
@@ -12,37 +12,37 @@ START_RESPONSE = "Here is my response:"

 SIMPLE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 NO_REASONING = {
     "output": "This is content",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is content",
 }
 MULTIPLE_LINES = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }

@@ -141,7 +141,7 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )

-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
@@ -155,7 +155,7 @@ STREAMING_1 = {
     "previous_text": None,
     "current_text": "Here",
     "delta_text": "Here",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # When we fail, we should give what was previously being silenced first
@@ -163,7 +163,7 @@ STREAMING_2 = {
     "previous_text": "Here is my thought",
     "current_text": "Here is my thought failure",
     "delta_text": " failure",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "Here is my thought failure",
 }
 # But then after the first one, we should only add the delta text to content
@@ -171,7 +171,7 @@ STREAMING_3 = {
     "previous_text": "Here wrong",
     "current_text": " words",
     "delta_text": " Here wrong words",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": " words",
 }
 # But then after the first one, we should only add the delta text to content
@@ -179,7 +179,7 @@ STREAMING_4 = {
     "previous_text": "Here is my thought",
     "current_text": "Here is my thought process:",
     "delta_text": " process:",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # Reasoning started successfully; parse reasoning content
@@ -187,7 +187,7 @@ STREAMING_5 = {
     "previous_text": "Here is my thought process:",
     "current_text": "Here is my thought process: foo",
     "delta_text": " foo",
-    "reasoning_content": " foo",
+    "reasoning": " foo",
     "content": None,
 }
 # Response special sequence has started, but not finished.
@@ -195,7 +195,7 @@ STREAMING_6 = {
     "previous_text": "Here is my thought process: foo",
     "current_text": "Here is my thought process: foo Here is",
     "delta_text": " Here is",
-    "reasoning_content": " ",
+    "reasoning": " ",
     "content": None,
 }
 # Response special sequence started, but was broken; the reasoning
@@ -204,7 +204,7 @@ STREAMING_7 = {
     "previous_text": "Here is my thought process: foo Here is",
     "current_text": "Here is my thought process: foo Here is Here",
     "delta_text": " Here",
-    "reasoning_content": "Here is ",
+    "reasoning": "Here is ",
     "content": None,
 }
 # Response special sequence is ongoing
@@ -212,7 +212,7 @@ STREAMING_8 = {
     "previous_text": "Here is my thought process: foo Here is my response:",
     "current_text": "Here is my thought process: foo Here is my response: bar",
     "delta_text": " bar",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": " bar",
 }
 # The delta text has everything; we should be able to correctly parse both
@@ -220,7 +220,7 @@ STREAMING_9 = {
     "previous_text": None,
     "current_text": "Here is my thought process: foo Here is my response: bar",
     "delta_text": "Here is my thought process: foo Here is my response: bar",
-    "reasoning_content": " foo ",
+    "reasoning": " foo ",
     "content": " bar",
 }
 ## The Response is ongoing, and the delta mixes reasoning content / content
@@ -228,7 +228,7 @@ STREAMING_10 = {
     "previous_text": "Here is my thought process: foo",
     "current_text": "Here is my thought process: foo bar Here is my response: baz",
     "delta_text": " bar Here is my response: baz",
-    "reasoning_content": " bar ",
+    "reasoning": " bar ",
     "content": " baz",
 }
 # The delta text starts a new substring that might be a response special seq
@@ -236,7 +236,7 @@ STREAMING_11 = {
     "previous_text": "Here is my thought process: This is a reasoning section ",
     "current_text": "Here is my thought process: This is a reasoning section Here",
     "delta_text": "Here",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # The delta text is finishing the response special seq
@@ -244,14 +244,14 @@ STREAMING_12 = {
     "previous_text": "Here is my thought process: foo Here is my response",
     "current_text": "Here is my thought process: foo Here is my response:",
     "delta_text": ":",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 STREAMING_13 = {
     "previous_text": "Here is my thought process: foo Here",
     "current_text": "Here is my thought process: foo Here was",
     "delta_text": " was",
-    "reasoning_content": "Here was",
+    "reasoning": "Here was",
     "content": None,
 }

@@ -326,7 +326,7 @@ def test_streaming_subcases(param_dict):
         tokenizer
     )

-    response = parser.extract_reasoning_content_streaming(
+    response = parser.extract_reasoning_streaming(
         previous_text=param_dict["previous_text"],
         current_text=param_dict["current_text"],
         delta_text=param_dict["delta_text"],
@@ -336,9 +336,9 @@ def test_streaming_subcases(param_dict):
     )
     # Streaming currently expects at least one of reasoning content / content,
     # so the response should return None in that case.
-    if param_dict["reasoning_content"] is None and param_dict["content"] is None:
+    if param_dict["reasoning"] is None and param_dict["content"] is None:
         assert response is None
     else:
         assert isinstance(response, DeltaMessage)
-        assert param_dict["reasoning_content"] == response.reasoning_content
+        assert param_dict["reasoning"] == response.reasoning
         assert param_dict["content"] == response.content
@@ -14,49 +14,49 @@ END_RESPONSE = "\n</answer>"

 NO_REASONING_QUICK_THROUGHT = {
     "output": f"{START_REASONING}{START_RESPONSE}This is the rest{END_RESPONSE}",  # noqa: E501
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }

 SIMPLE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest{END_RESPONSE}",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }

 COMPLETE_REASONING_WITH_SYMBOL = {
     "output": f"{START_REASONING}This is a reasoning section!{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section!",
+    "reasoning": "This is a reasoning section!",
     "content": None,
 }
 NO_REASONING = {
     "output": "This is content",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is content",
 }
 MULTIPLE_LINES = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }

@@ -164,5 +164,5 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )

-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
@@ -20,97 +20,97 @@ def mistral_tokenizer():

 SIMPLE_REASONING = {
     "output": "This is a reasoning section[/THINK]This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING = {
     "output": "This is a reasoning section[/THINK]",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES = {
     "output": "This\nThat[/THINK]This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 REASONING_WITH_THINK = {
     "output": "[THINK]This is a reasoning section[/THINK]This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": "[THINK]This is a reasoning section[/THINK]",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": "[THINK]This\nThat[/THINK]This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_WITH_THINK = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 THINK_NO_END = {
     "output": "[THINK]This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY = {
     "output": "",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY_STREAMING = {
     "output": "",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
     "is_reasoning_end": False,
 }
 NEW_LINE = {
     "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -120,7 +120,7 @@ NEW_LINE = {
 # or not.
 NEW_LINE_STREAMING = {
     "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
-    "reasoning_content": "\nThis is a reasoning section",
+    "reasoning": "\nThis is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -307,7 +307,7 @@ def test_mistral_reasoning(
         parser, output_tokens, streaming=streaming
     )

-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]

     # Test is_reasoning_end
@@ -13,43 +13,43 @@ END_REASONING = "</think>"

 NO_REASONING = {
     "output": f"{START_REASONING}{END_REASONING}No thoughts, head empty!",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "No thoughts, head empty!",
 }

 NO_REASONING_WITH_NEWLINE = {
     "output": f"{START_REASONING}\n{END_REASONING}\n\nNo thoughts, head empty!",
-    "reasoning_content": "\n",
+    "reasoning": "\n",
     "content": "\n\nNo thoughts, head empty!",
 }

 SIMPLE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{END_REASONING}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }

 SIMPLE_REASONING_WITH_NEWLINE = {
     "output": f"{START_REASONING} Look!\n\nI'm thinking...{END_REASONING}\nThis is the rest",  # noqa: E501
-    "reasoning_content": " Look!\n\nI'm thinking...",
+    "reasoning": " Look!\n\nI'm thinking...",
     "content": "\nThis is the rest",
 }

 SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES = {
     "output": f"{START_REASONING}\nLook!\nI'm thinking...\n\n{END_REASONING}\n\n\nThis is the rest",  # noqa: E501
-    "reasoning_content": "\nLook!\nI'm thinking...\n\n",
+    "reasoning": "\nLook!\nI'm thinking...\n\n",
     "content": "\n\n\nThis is the rest",
 }

 NO_REASONING_ONLY_END_THINK = {
     "output": f"{END_REASONING}\n\nNo thoughts, head empty!",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "\n\nNo thoughts, head empty!",
 }

 REASONING_ONLY_END_THINK = {
     "output": f"The user is asking me not to think.{END_REASONING}No thoughts!",
-    "reasoning_content": "The user is asking me not to think.",
+    "reasoning": "The user is asking me not to think.",
     "content": "No thoughts!",
 }

@@ -148,5 +148,5 @@ def test_reasoning(
         reasoning_parser=parser, model_output=model_output, streaming=streaming
     )

-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
@@ -22,47 +22,47 @@ def qwen3_tokenizer():
 # With <think></think>, non-stream
 WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 # With <think></think>, stream
 WITH_THINK_STREAM = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 # Without <think></think>, non-stream
 WITHOUT_THINK = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }
 # Without <think></think>, stream
 WITHOUT_THINK_STREAM = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }

 COMPLETE_REASONING = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTILINE_REASONING = {
     "output": "<think>This is a reasoning\nsection</think>This is the rest\nThat",
-    "reasoning_content": "This is a reasoning\nsection",
+    "reasoning": "This is a reasoning\nsection",
     "content": "This is the rest\nThat",
 }
 ONLY_OPEN_TAG = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "<think>This is a reasoning section",
 }

 ONLY_OPEN_TAG_STREAM = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }

@@ -138,5 +138,5 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )

-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
@@ -28,49 +28,49 @@ def seedoss_tokenizer():

 SIMPLE_REASONING: dict[str, Any] = {
     "output": "This is a reasoning section</seed:think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING: dict[str, Any] = {
     "output": "This is a reasoning section</seed:think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT: dict[str, Any] = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING: dict[str, Any] = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES: dict[str, Any] = {
     "output": "This\nThat</seed:think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 WITH_START_TOKEN: dict[str, Any] = {
     "output": ("<seed:think>This is a reasoning section</seed:think>This is the rest"),
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 ONLY_END_TOKEN: dict[str, Any] = {
     "output": "Some reasoning</seed:think>This is the rest",
-    "reasoning_content": "Some reasoning",
+    "reasoning": "Some reasoning",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 NO_TOKENS: dict[str, Any] = {
     "output": "This is just content without any reasoning tokens",
-    "reasoning_content": "This is just content without any reasoning tokens",
+    "reasoning": "This is just content without any reasoning tokens",
     "content": None,
     "is_reasoning_end": False,
 }
@@ -95,7 +95,7 @@ def test_simple_reasoning(seedoss_tokenizer, streaming):
         parser, [cast(str, SIMPLE_REASONING["output"])], streaming=streaming
     )

-    assert reasoning == SIMPLE_REASONING["reasoning_content"]
+    assert reasoning == SIMPLE_REASONING["reasoning"]
     assert content == SIMPLE_REASONING["content"]


@@ -109,7 +109,7 @@ def test_complete_reasoning(seedoss_tokenizer, streaming):
         parser, [cast(str, COMPLETE_REASONING["output"])], streaming=streaming
     )

-    assert reasoning == COMPLETE_REASONING["reasoning_content"]
+    assert reasoning == COMPLETE_REASONING["reasoning"]
     assert content == COMPLETE_REASONING["content"]


@@ -123,7 +123,7 @@ def test_no_content(seedoss_tokenizer, streaming):
         parser, [cast(str, NO_CONTENT["output"])], streaming=streaming
     )

-    assert reasoning == NO_CONTENT["reasoning_content"]
+    assert reasoning == NO_CONTENT["reasoning"]
     assert content == NO_CONTENT["content"]


@@ -137,7 +137,7 @@ def test_multiple_lines(seedoss_tokenizer, streaming):
         parser, [cast(str, MULTIPLE_LINES["output"])], streaming=streaming
     )

-    assert reasoning == MULTIPLE_LINES["reasoning_content"]
+    assert reasoning == MULTIPLE_LINES["reasoning"]
     assert content == MULTIPLE_LINES["content"]


@@ -151,7 +151,7 @@ def test_with_start_token(seedoss_tokenizer, streaming):
         parser, [cast(str, WITH_START_TOKEN["output"])], streaming=streaming
     )

-    assert reasoning == WITH_START_TOKEN["reasoning_content"]
+    assert reasoning == WITH_START_TOKEN["reasoning"]
     assert content == WITH_START_TOKEN["content"]


@@ -168,7 +168,7 @@ def test_only_end_token(seedoss_tokenizer, streaming):
         parser, [cast(str, ONLY_END_TOKEN["output"])], streaming=streaming
     )

-    assert reasoning == ONLY_END_TOKEN["reasoning_content"]
+    assert reasoning == ONLY_END_TOKEN["reasoning"]
     assert content == ONLY_END_TOKEN["content"]


@@ -182,7 +182,7 @@ def test_no_tokens(seedoss_tokenizer, streaming):
         parser, [cast(str, NO_TOKENS["output"])], streaming=streaming
     )

-    assert reasoning == NO_TOKENS["reasoning_content"]
+    assert reasoning == NO_TOKENS["reasoning"]
     assert content == NO_TOKENS["content"]
@@ -9,25 +9,28 @@ from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
 class StreamingReasoningReconstructor:
     def __init__(self):
-        self.reasoning_content = None
+        self.reasoning = None
         self.other_content = None
 
     def append_delta(self, delta: DeltaMessage):
         # content and the reasoning content should not be present
         # at the same time
-        assert delta.content is None or delta.reasoning_content is None, (
+        assert delta.content is None or delta.reasoning is None, (
             "Both content and reasoning content are present in the delta message"
         )
+        assert delta.reasoning == delta.reasoning_content, (
+            "reasoning_content should be present for backwards compatibility"
+        )
         if delta.content is not None:
             if self.other_content is None:
                 self.other_content = delta.content
             else:
                 self.other_content += delta.content
         else:
-            if self.reasoning_content is None:
-                self.reasoning_content = delta.reasoning_content
+            if self.reasoning is None:
+                self.reasoning = delta.reasoning
             else:
-                self.reasoning_content += delta.reasoning_content
+                self.reasoning += delta.reasoning
 
 
 def run_reasoning_extraction(
@@ -43,7 +46,7 @@ def run_reasoning_extraction(
             request,
         )
         return (
-            reconstructor.reasoning_content,
+            reconstructor.reasoning,
             reconstructor.other_content or None,
         )
     else:
@@ -69,7 +72,7 @@ def run_reasoning_extraction_mistral(
             request,
         )
        return (
-            reconstructor.reasoning_content,
+            reconstructor.reasoning,
            reconstructor.other_content or None,
        )
    else:
@@ -88,7 +91,7 @@ def run_reasoning_extraction_nonstreaming(
    request: ChatCompletionRequest | None = None,
 ) -> tuple[str | None, str | None]:
    request = request or ChatCompletionRequest(messages=[], model="test-model")
-    return reasoning_parser.extract_reasoning_content(
+    return reasoning_parser.extract_reasoning(
        model_output="".join(model_output), request=request
    )
 
@@ -110,7 +113,7 @@ def run_reasoning_extraction_streaming(
        ]
        current_text = previous_text + delta
        current_tokens = previous_tokens + token_delta
-        delta_message = reasoning_parser.extract_reasoning_content_streaming(
+        delta_message = reasoning_parser.extract_reasoning_streaming(
            previous_text,
            current_text,
            delta,
@@ -142,7 +145,7 @@ def run_reasoning_extraction_streaming_mistral(
        delta = reasoning_parser.model_tokenizer.convert_ids_to_tokens([model_delta])[0]
        current_text = previous_text + delta
        current_tokens = previous_tokens + token_delta
-        delta_message = reasoning_parser.extract_reasoning_content_streaming(
+        delta_message = reasoning_parser.extract_reasoning_streaming(
            previous_text,
            current_text,
            delta,
@@ -102,7 +102,7 @@ def test_prepare_apply_chat_template_tools_and_messages(
     assert actual_request == expected_mistral_output
 
 
-# Tool use with list content and reasoning_content
+# Tool use with list content and reasoning
 @pytest.mark.parametrize(
     "openai_request,expected_mistral_output",
     [
@@ -115,7 +115,7 @@ def test_prepare_apply_chat_template_tools_and_messages(
             },
             {
                 "role": "assistant",
-                "reasoning_content": None,
+                "reasoning": None,
                 "content": None,
                 "tool_calls": [
                     {
@@ -337,7 +337,7 @@ def test_extract_tool_calls_streaming_incremental(
         if (
             delta_message.role is None
             and delta_message.content is None
-            and delta_message.reasoning_content is None
+            and delta_message.reasoning is None
             and len(delta_message.tool_calls) == 0
         ):
             continue
@@ -674,10 +674,10 @@ def test_structured_output_with_reasoning_matrices(
     assert output is not None and isinstance(output, RequestOutput)
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    reasoning_content, content = run_reasoning_extraction(reasoner, [generated_text])
-    print(f"Prompt: {prompt!r}\nReasoning: {reasoning_content!r}\nContent: {content!r}")
+    reasoning, content = run_reasoning_extraction(reasoner, [generated_text])
+    print(f"Prompt: {prompt!r}\nReasoning: {reasoning!r}\nContent: {content!r}")
 
-    assert content is not None and reasoning_content is not None
+    assert content is not None and reasoning is not None
     output_json = json.loads(content)
     jsonschema.validate(instance=output_json, schema=reasoning_schema)
@@ -521,15 +521,15 @@ def parse_chat_output(
     is_tool_call = False  # TODO: update this when tool call is supported
     if len(output_msgs) == 0:
         # The generation has stopped during reasoning.
-        reasoning_content = parser.current_content
+        reasoning = parser.current_content
         final_content = None
     elif len(output_msgs) == 1:
         # The generation has stopped during final message.
-        reasoning_content = output_msgs[0].content[0].text
+        reasoning = output_msgs[0].content[0].text
         final_content = parser.current_content
     else:
         reasoning_msg = output_msgs[:-1]
         final_msg = output_msgs[-1]
-        reasoning_content = "\n".join([msg.content[0].text for msg in reasoning_msg])
+        reasoning = "\n".join([msg.content[0].text for msg in reasoning_msg])
         final_content = final_msg.content[0].text
-    return reasoning_content, final_content, is_tool_call
+    return reasoning, final_content, is_tool_call
@@ -2102,7 +2102,15 @@ class ChatMessage(OpenAIBaseModel):
     tool_calls: list[ToolCall] = Field(default_factory=list)
 
     # vLLM-specific fields that are not in OpenAI spec
+    reasoning: str | None = None
     reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
+
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
 
 
 class ChatCompletionLogProb(OpenAIBaseModel):
@@ -2156,9 +2164,17 @@ class ChatCompletionResponse(OpenAIBaseModel):
 class DeltaMessage(OpenAIBaseModel):
     role: str | None = None
     content: str | None = None
+    reasoning: str | None = None
     reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
     tool_calls: list[DeltaToolCall] = Field(default_factory=list)
 
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
+
 
 class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
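Note: the `model_validator` hooks added in the two hunks above are what keep the deprecated field readable after the rename. A minimal sketch of the resulting behavior, assuming pydantic v2 semantics; the standalone model below is a hypothetical stand-in, not vLLM's actual class:

```python
# Hypothetical stand-in for the DeltaMessage defined above; only the
# validator logic mirrors this diff.
from pydantic import BaseModel, model_validator


class DeltaMessage(BaseModel):
    content: str | None = None
    reasoning: str | None = None
    reasoning_content: str | None = None  # deprecated mirror of `reasoning`

    @model_validator(mode="after")
    def handle_deprecated_reasoning_content(self):
        # After validation, copy the new field into the old one so existing
        # consumers of `reasoning_content` keep working unchanged.
        self.reasoning_content = self.reasoning
        return self


delta = DeltaMessage(reasoning="is")
assert delta.reasoning_content == "is"  # old field mirrors the new one
```

One consequence of the copy-after-validation approach: a value passed only as `reasoning_content` is overwritten by `reasoning` (here `None`), which is consistent with `reasoning` being the source of truth.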
@@ -759,9 +759,7 @@ class OpenAIServingChat(OpenAIServing):
                     delta_message = DeltaMessage(content=delta_text)
                 elif cur_channel == "analysis":
                     if request.include_reasoning:
-                        delta_message = DeltaMessage(
-                            reasoning_content=delta_text
-                        )
+                        delta_message = DeltaMessage(reasoning=delta_text)
                     else:
                         delta_message = None
                 elif (
@@ -823,7 +821,7 @@ class OpenAIServingChat(OpenAIServing):
                 ):
                     assert reasoning_parser is not None
                     delta_message = (
-                        reasoning_parser.extract_reasoning_content_streaming(
+                        reasoning_parser.extract_reasoning_streaming(
                             previous_text,
                             current_text,
                             delta_text,
@@ -836,7 +834,7 @@ class OpenAIServingChat(OpenAIServing):
                     # or think end id in prompt_token_ids
                     # i.e {"enable_thinking": False},
                     # set reasoning status to end.
-                    # Only keep 'content', remove 'reasoning_content'.
+                    # Only keep 'content', remove 'reasoning'.
                     if reasoning_parser.is_reasoning_end(
                         as_list(output.token_ids)
                     ) or (
@@ -899,7 +897,7 @@ class OpenAIServingChat(OpenAIServing):
 
                 if self.reasoning_parser and not reasoning_end_arr[i]:
                     delta_message = (
-                        reasoning_parser.extract_reasoning_content_streaming(
+                        reasoning_parser.extract_reasoning_streaming(
                             previous_text,
                             current_text,
                             delta_text,
@@ -948,7 +946,7 @@ class OpenAIServingChat(OpenAIServing):
                     output_token_ids = as_list(output.token_ids)
                     if not reasoning_end_arr[i]:
                         delta_message = (
-                            reasoning_parser.extract_reasoning_content_streaming(
+                            reasoning_parser.extract_reasoning_streaming(
                                 previous_text,
                                 current_text,
                                 delta_text,
@@ -961,7 +959,7 @@ class OpenAIServingChat(OpenAIServing):
                         # i.e {"enable_thinking": False},
                         # set reasoning status to end.
                         # Remove the text and token ids related
-                        # to 'reasoning_content'.
+                        # to 'reasoning'.
                         if (
                             res.prompt_token_ids
                             and reasoning_parser.is_reasoning_end(
@@ -978,7 +976,7 @@ class OpenAIServingChat(OpenAIServing):
                         # When encountering think end id in delta_token_ids,
                         # set reasoning status to end.
                         # Remove the text and token ids related
-                        # to 'reasoning_content'.
+                        # to 'reasoning'.
                         if reasoning_parser.is_reasoning_end(output_token_ids):
                             reasoning_end_arr[i] = True
                             current_token_ids = (
@@ -1033,15 +1031,13 @@ class OpenAIServingChat(OpenAIServing):
 
                 # when only reasoning
                 elif self.reasoning_parser:
-                    delta_message = (
-                        reasoning_parser.extract_reasoning_content_streaming(
-                            previous_text,
-                            current_text,
-                            delta_text,
-                            previous_token_ids,
-                            current_token_ids,
-                            output.token_ids,
-                        )
+                    delta_message = reasoning_parser.extract_reasoning_streaming(
+                        previous_text,
+                        current_text,
+                        delta_text,
+                        previous_token_ids,
+                        current_token_ids,
+                        output.token_ids,
                     )
                 # handle streaming just a content delta
                 else:
@@ -1334,9 +1330,9 @@ class OpenAIServingChat(OpenAIServing):
             logprobs = None
 
             if self.use_harmony:
-                reasoning_content, content, _ = parse_chat_output(token_ids)
+                reasoning, content, _ = parse_chat_output(token_ids)
                 if not request.include_reasoning:
-                    reasoning_content = None
+                    reasoning = None
 
                 if self.tool_parser is not None:
                     tool_parser = self.tool_parser(tokenizer)
@@ -1349,14 +1345,14 @@ class OpenAIServingChat(OpenAIServing):
                         content = tool_call_info.content
                     message = ChatMessage(
                         role=role,
-                        reasoning_content=reasoning_content,
+                        reasoning=reasoning,
                         content=content,
                         tool_calls=tool_call_info.tool_calls,
                     )
                 else:
                     message = ChatMessage(
                         role=role,
-                        reasoning_content=reasoning_content,
+                        reasoning=reasoning,
                         content=content,
                     )
 
@@ -1390,13 +1386,13 @@ class OpenAIServingChat(OpenAIServing):
                 return self.create_error_response(str(e))
             # If the reasoning parser is enabled,
             # tool calls are extracted exclusively from the content.
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(
+            reasoning, content = reasoning_parser.extract_reasoning(
                 output.text, request=request
            )
            if not request.include_reasoning:
-                reasoning_content = None
+                reasoning = None
        else:
-            reasoning_content = None
+            reasoning = None
            content = output.text
 
        auto_tools_called = False
@@ -1416,9 +1412,7 @@ class OpenAIServingChat(OpenAIServing):
            not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
            and request.tool_choice != "required"
        ):
-            message = ChatMessage(
-                role=role, reasoning_content=reasoning_content, content=content
-            )
+            message = ChatMessage(role=role, reasoning=reasoning, content=content)
 
        # if the request uses tools and specified a tool choice
        elif (
@@ -1428,7 +1422,7 @@ class OpenAIServingChat(OpenAIServing):
            assert tool_calls is not None and len(tool_calls) > 0
            message = ChatMessage(
                role=role,
-                reasoning_content=reasoning_content,
+                reasoning=reasoning,
                content="",
                tool_calls=[tool_call_class(function=tc) for tc in tool_calls],
            )
@@ -1452,15 +1446,13 @@ class OpenAIServingChat(OpenAIServing):
                role=role,
                content="",
                tool_calls=tool_call_class_items,
-                reasoning_content=reasoning_content,
+                reasoning=reasoning,
            )
 
        # if the request doesn't use tool choice
        # OR specifies to not use a tool
        elif not request.tool_choice or request.tool_choice == "none":
-            message = ChatMessage(
-                role=role, reasoning_content=reasoning_content, content=content
-            )
+            message = ChatMessage(role=role, reasoning=reasoning, content=content)
 
        # handle when there are tools and tool choice is auto
        elif (
@@ -1476,7 +1468,7 @@ class OpenAIServingChat(OpenAIServing):
            if tool_calls:
                message = ChatMessage(
                    role=role,
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                    content=content,
                    tool_calls=[
                        ToolCall(
@@ -1498,7 +1490,7 @@ class OpenAIServingChat(OpenAIServing):
                ret_content = content
                message = ChatMessage(
                    role=role,
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                    content=ret_content,
                )
 
@@ -1509,9 +1501,7 @@ class OpenAIServingChat(OpenAIServing):
                " if tools should be extracted. Returning a standard chat "
                "completion."
            )
-            message = ChatMessage(
-                role=role, reasoning_content=reasoning_content, content=content
-            )
+            message = ChatMessage(role=role, reasoning=reasoning, content=content)
        # In OpenAI's API, when a tool is called, the finish_reason is:
        # "tool_calls" for "auto" or "required" tool calls,
        # and "stop" for named tool calls.
@@ -778,11 +778,11 @@ class OpenAIServingResponses(OpenAIServing):
                 logger.exception("Error in reasoning parser creation.")
                 raise e
 
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(
+            reasoning, content = reasoning_parser.extract_reasoning(
                 final_output.text, request=request
             )
         else:
-            reasoning_content = None
+            reasoning = None
             content = final_output.text
 
         # Log complete response if output logging is enabled
@@ -790,8 +790,8 @@ class OpenAIServingResponses(OpenAIServing):
             output_text = ""
             if content:
                 output_text = content
-            elif reasoning_content:
-                output_text = f"[reasoning: {reasoning_content}]"
+            elif reasoning:
+                output_text = f"[reasoning: {reasoning}]"
 
             if output_text:
                 self.request_logger.log_outputs(
@@ -805,15 +805,13 @@ class OpenAIServingResponses(OpenAIServing):
 
         reasoning_item = None
         message_item = None
-        if reasoning_content:
+        if reasoning:
             reasoning_item = ResponseReasoningItem(
                 id=f"rs_{random_uuid()}",
                 summary=[],
                 type="reasoning",
                 content=[
-                    ResponseReasoningTextContent(
-                        text=reasoning_content, type="reasoning_text"
-                    )
+                    ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
                 ],
                 status=None,  # NOTE: Only the last output item has status.
             )
@@ -1208,15 +1206,13 @@ class OpenAIServingResponses(OpenAIServing):
             if ctx.last_output.outputs:
                 output = ctx.last_output.outputs[0]
                 if reasoning_parser:
-                    delta_message = (
-                        reasoning_parser.extract_reasoning_content_streaming(
-                            previous_text=previous_text,
-                            current_text=previous_text + output.text,
-                            delta_text=output.text,
-                            previous_token_ids=previous_token_ids,
-                            current_token_ids=previous_token_ids + output.token_ids,
-                            delta_token_ids=output.token_ids,
-                        )
+                    delta_message = reasoning_parser.extract_reasoning_streaming(
+                        previous_text=previous_text,
+                        current_text=previous_text + output.text,
+                        delta_text=output.text,
+                        previous_token_ids=previous_token_ids,
+                        current_token_ids=previous_token_ids + output.token_ids,
+                        delta_token_ids=output.token_ids,
+                    )
                 else:
                     delta_message = DeltaMessage(
@@ -1228,7 +1224,7 @@ class OpenAIServingResponses(OpenAIServing):
                     continue
                 if not first_delta_sent:
                     current_item_id = str(uuid.uuid4())
-                    if delta_message.reasoning_content:
+                    if delta_message.reasoning:
                         yield _increment_sequence_number_and_return(
                             ResponseOutputItemAddedEvent(
                                 type="response.output_item.added",
@@ -1280,15 +1276,15 @@ class OpenAIServingResponses(OpenAIServing):
                 # same as content or reasoning content
                 if (
                     previous_delta_messages
-                    and previous_delta_messages[-1].reasoning_content is not None
+                    and previous_delta_messages[-1].reasoning is not None
                     and delta_message.content is not None
                 ):
                     # from reasoning to normal content, send done
                     # event for reasoning
                     reason_content = "".join(
-                        pm.reasoning_content
+                        pm.reasoning
                        for pm in previous_delta_messages
-                        if pm.reasoning_content is not None
+                        if pm.reasoning is not None
                    )
                    yield _increment_sequence_number_and_return(
                        ResponseReasoningTextDoneEvent(
@@ -1356,7 +1352,7 @@ class OpenAIServingResponses(OpenAIServing):
                # reset previous delta messages
                previous_delta_messages = []
 
-            if delta_message.reasoning_content is not None:
+            if delta_message.reasoning is not None:
                yield _increment_sequence_number_and_return(
                    ResponseReasoningTextDeltaEvent(
                        type="response.reasoning_text.delta",
@@ -1364,7 +1360,7 @@ class OpenAIServingResponses(OpenAIServing):
                        content_index=current_content_index,
                        output_index=current_output_index,
                        item_id=current_item_id,
-                        delta=delta_message.reasoning_content,
+                        delta=delta_message.reasoning,
                    )
                )
            elif delta_message.content is not None:
@@ -1392,11 +1388,11 @@ class OpenAIServingResponses(OpenAIServing):
 
            previous_delta_messages.append(delta_message)
        if previous_delta_messages:
-            if previous_delta_messages[-1].reasoning_content is not None:
+            if previous_delta_messages[-1].reasoning is not None:
                reason_content = "".join(
-                    pm.reasoning_content
+                    pm.reasoning
                    for pm in previous_delta_messages
-                    if pm.reasoning_content is not None
+                    if pm.reasoning is not None
                )
                yield _increment_sequence_number_and_return(
                    ResponseReasoningTextDoneEvent(
@@ -279,7 +279,7 @@ class StreamingXMLToolCallParser:
         final_delta = DeltaMessage(
             role=None,
             content=None,
-            reasoning_content=None,
+            reasoning=None,
             tool_calls=[
                 DeltaToolCall(
                     index=self.tool_call_index - 1,
@@ -76,7 +76,7 @@ class ReasoningParser:
         """
 
     @abstractmethod
-    def extract_reasoning_content(
+    def extract_reasoning(
         self,
         model_output: str,
         request: ChatCompletionRequest | ResponsesRequest,
@@ -100,7 +100,7 @@ class ReasoningParser:
         """
 
     @abstractmethod
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
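Note: callers migrate to the renamed interface one-for-one (`extract_reasoning_content` to `extract_reasoning`, `extract_reasoning_content_streaming` to `extract_reasoning_streaming`). A hedged sketch of the non-streaming call path after this change; the parser key, model name, and sample string below are illustrative, and only the `extract_reasoning()` call itself reflects this diff:

```python
# Offline use of a reasoning parser through the renamed API. Assumes the
# "deepseek_r1" parser registered in vLLM and a tokenizer that carries the
# <think>/</think> tokens; both are stand-ins for whatever a deployment uses.
from transformers import AutoTokenizer

from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.reasoning import ReasoningParserManager

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
parser = ReasoningParserManager.get_reasoning_parser("deepseek_r1")(tokenizer)

request = ChatCompletionRequest(messages=[], model="test-model")
# Previously extract_reasoning_content(); the (reasoning, content) return
# shape is unchanged by the rename.
reasoning, content = parser.extract_reasoning(
    "The user asks for 2 + 2.</think>The answer is 4.", request=request
)
print(reasoning)  # "The user asks for 2 + 2."
print(content)    # "The answer is 4."
```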
@@ -76,7 +76,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
         else:
             return input_ids[input_ids.index(self.end_token_id) + 1 :]
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -103,11 +103,10 @@ class BaseThinkingReasoningParser(ReasoningParser):
                 # start token in previous, end token in delta,
                 # extract reasoning content
                 end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[:end_index]
+                reasoning = delta_text[:end_index]
                 content = delta_text[end_index + len(self.end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
-                    content=content if content else None,
+                    reasoning=reasoning, content=content if content else None
                 )
             elif self.end_token_id in previous_token_ids:
                 # start token in previous, end token in previous,
@@ -116,30 +115,27 @@ class BaseThinkingReasoningParser(ReasoningParser):
             else:
                 # start token in previous, no end token in previous or delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         elif self.start_token_id in delta_token_ids:
             if self.end_token_id in delta_token_ids:
                 # start token in delta, end token in delta,
                 # extract reasoning content
                 start_index = delta_text.find(self.start_token)
                 end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[
-                    start_index + len(self.start_token) : end_index
-                ]
+                reasoning = delta_text[start_index + len(self.start_token) : end_index]
                 content = delta_text[end_index + len(self.end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
-                    content=content if content else None,
+                    reasoning=reasoning, content=content if content else None
                 )
             else:
                 # start token in delta, no end token in delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         else:
             # not find thinking start token
             return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
     ) -> tuple[str | None, str | None]:
         """
@@ -160,7 +156,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
         if self.end_token not in model_output:
             return model_output, None
         else:
-            reasoning_content, _, content = model_output.partition(self.end_token)
+            reasoning, _, content = model_output.partition(self.end_token)
             # If generation stops right after end-of-think, return null content
             final_content = content or None
-            return reasoning_content, final_content
+            return reasoning, final_content
@@ -25,7 +25,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
         """The token that ends reasoning content."""
         return "</think>"
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -34,7 +34,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
     ) -> DeltaMessage | None:
-        ret = super().extract_reasoning_content_streaming(
+        ret = super().extract_reasoning_streaming(
             previous_text,
             current_text,
             delta_text,
@@ -51,10 +51,10 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
                 # end token in delta with more tokens,
                 # extract reasoning content and content
                 end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[:end_index]
+                reasoning = delta_text[:end_index]
                 content = delta_text[end_index + len(self.end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                     content=content if content else None,
                 )
             elif self.end_token_id in previous_token_ids:
@@ -62,6 +62,6 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
                 return DeltaMessage(content=delta_text)
             else:
                 # no end token in previous or delta, reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
 
         return ret
@@ -38,12 +38,12 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         return self._parser.extract_content_ids(input_ids)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
-        return self._parser.extract_reasoning_content(model_output, request)
+        return self._parser.extract_reasoning(model_output, request)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -52,7 +52,7 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
     ) -> DeltaMessage | None:
-        return self._parser.extract_reasoning_content_streaming(
+        return self._parser.extract_reasoning_streaming(
             previous_text,
             current_text,
             delta_text,
@@ -57,7 +57,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
                 "tokens in the tokenizer!"
             )
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -73,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
         The Ernie45 thinking model ouput format is
             abc\n</think>\n\n<response>\ndef\n</response>\n
         or abc\n</think>\ndef
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'def' goes to content
         """
         # Skip single special tokens
@@ -94,7 +94,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
             # </think> in delta with more tokens,
             # extract reasoning content and content
             think_end_index = delta_text.find(self.end_token)
-            reasoning_content = delta_text[:think_end_index]
+            reasoning = delta_text[:think_end_index]
             content = delta_text[think_end_index + len(self.end_token) :]
             content = content.lstrip("\n")
             response_start_idx = content.find(self.response_start_token)
@@ -104,7 +104,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
             if response_end_idx != -1:
                 content = content[:response_end_idx]
             return DeltaMessage(
-                reasoning_content=reasoning_content,
+                reasoning=reasoning,
                 content=content if content else None,
             )
         elif self.end_token_id in previous_token_ids:
@@ -138,9 +138,9 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
             return DeltaMessage(content=content if content else None)
         else:
             # no </think> in previous or delta, reasoning content continues
-            return DeltaMessage(reasoning_content=delta_text)
+            return DeltaMessage(reasoning=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
         """
@@ -148,14 +148,12 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
         The Ernie45 thinking model ouput format is
             abc\n</think>\n\n\n<response>\ndef\n</response>\n
         or abc\n</think>\ndef
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'def' goes to content
         Returns:
             tuple[Optional[str], Optional[str]]: reasoning content and content
         """
-        reasoning_content, content = super().extract_reasoning_content(
-            model_output, request
-        )
+        reasoning, content = super().extract_reasoning(model_output, request)
         if content:
             start_idx = content.find(self.response_start_token)
             end_idx = content.rfind(self.response_end_token)
@@ -164,4 +162,4 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
             content = content[start_idx + len(self.response_start_token) : end_idx]
         final_content = content or None
 
-        return reasoning_content, final_content
+        return reasoning, final_content
@@ -70,7 +70,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
         else:
             return input_ids[input_ids.index(self.think_end_token_id) + 1 :]
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -84,7 +84,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
         Handles streaming output where previous + delta = current.
         Uses token IDs for faster processing.
         For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
         """
         # Skip single special tokens
@@ -98,10 +98,10 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
             # <think> in previous, </think> in delta,
             # extract reasoning content
             end_index = delta_text.find(self.think_end_token)
-            reasoning_content = delta_text[:end_index]
+            reasoning = delta_text[:end_index]
             content = delta_text[end_index + len(self.think_end_token) :]
             return DeltaMessage(
-                reasoning_content=reasoning_content,
+                reasoning=reasoning,
                 content=content if content else None,
             )
         elif self.think_end_token_id in previous_token_ids:
@@ -111,36 +111,36 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
             else:
                 # <think> in previous, no </think> in previous or delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         elif self.think_start_token_id in delta_token_ids:
             if self.think_end_token_id in delta_token_ids:
                 # <think> in delta, </think> in delta, extract reasoning content
                 start_index = delta_text.find(self.think_start_token)
                 end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[
+                reasoning = delta_text[
                     start_index + len(self.think_start_token) : end_index
                 ]
                 content = delta_text[end_index + len(self.think_end_token) :]
                 return DeltaMessage(
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                     content=content if content else None,
                 )
             else:
                 # <think> in delta, no </think> in delta,
                 # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
         else:
             # thinking is disabled, just content
             return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
 
         For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
 
         Returns:
@@ -165,7 +165,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
             return None, model_output
 
         # Extract reasoning content from the model output.
-        reasoning_content, _, content = model_output.partition(self.think_end_token)
+        reasoning, _, content = model_output.partition(self.think_end_token)
 
         final_content = content or None
-        return reasoning_content, final_content
+        return reasoning, final_content
@@ -104,7 +104,7 @@ class GptOssReasoningParser(ReasoningParser):
             return []
         return self.model_tokenizer.encode(content)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -131,9 +131,9 @@ class GptOssReasoningParser(ReasoningParser):
             content_delta = cur_content
         if reasoning_delta is None and content_delta is None:
             return None
-        return DeltaMessage(reasoning_content=reasoning_delta, content=content_delta)
+        return DeltaMessage(reasoning=reasoning_delta, content=content_delta)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self,
         model_output: str,
         request: ChatCompletionRequest,
@@ -49,7 +49,7 @@ class GraniteReasoningParser(ReasoningParser):
             len(think_start) for think_start in self.valid_think_starts
         )
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
@@ -67,12 +67,12 @@ class GraniteReasoningParser(ReasoningParser):
         re_match = self.reasoning_regex.findall(model_output)
         if not re_match:
             return None, model_output
-        reasoning_content, response_content = re_match[0]
+        reasoning, response_content = re_match[0]
         if not response_content:
-            return reasoning_content, None
-        return reasoning_content, response_content
+            return reasoning, None
+        return reasoning, response_content
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -107,12 +107,10 @@ class GraniteReasoningParser(ReasoningParser):
         Union[DeltaMessage, None]
             DeltaMessage with either reasoning content or content, or None.
         """
-        reasoning_content, resp_seq_len, content = self._get_content_sections(
-            current_text
-        )
+        reasoning, resp_seq_len, content = self._get_content_sections(current_text)
         # Either we haven't finished the start of the reasoning sequence,
         # or the model is generating something unexpected.
-        if not reasoning_content:
+        if not reasoning:
             delta_message = self._get_delta_message_with_no_reasoning_bounds(
                 current_text, delta_text
             )
@@ -120,16 +118,16 @@ class GraniteReasoningParser(ReasoningParser):
         # the start of response sequence.
         elif not content:
             delta_message = self._get_delta_message_with_no_response_bounds(
-                current_text, reasoning_content, delta_text
+                current_text, reasoning, delta_text
             )
         # We've finished both the start of reasoning and start of response seq.
         else:
             # This should never happen since we matched on the response
             assert resp_seq_len is not None
             delta_message = self._get_delta_message_with_both_bounds(
-                delta_text, reasoning_content, content, current_text, resp_seq_len
+                delta_text, reasoning, content, current_text, resp_seq_len
             )
-        if not delta_message.content and not delta_message.reasoning_content:
+        if not delta_message.content and not delta_message.reasoning:
             return None
         return delta_message
 
@@ -185,20 +183,20 @@ class GraniteReasoningParser(ReasoningParser):
         # message and append everything to content in the future.
         if was_substr and not is_substr:
             return DeltaMessage(
-                reasoning_content=None,
+                reasoning=None,
                 content=current_text,
             )
         if is_substr:
             # Might still be in the special token sequence; return nothing
-            return DeltaMessage(reasoning_content=None, content=None)
+            return DeltaMessage(reasoning=None, content=None)
         # Otherwise the sequence has already been broken and we already
         # corrected; just return the delta text as normal content.
-        return DeltaMessage(reasoning_content=None, content=delta_text)
+        return DeltaMessage(reasoning=None, content=delta_text)
 
     def _get_delta_message_with_no_response_bounds(
         self,
         current_text: str,
-        reasoning_content: str,
+        reasoning: str,
         delta_text: str,
     ) -> DeltaMessage:
         """Parse the delta message when the current text has both reasoning
@@ -208,7 +206,7 @@ class GraniteReasoningParser(ReasoningParser):
 
         Args:
             current_text (str): The full previous + delta text.
-            reasoning_content (str): reasoning content from current_text.
+            reasoning (str): reasoning content from current_text.
             delta_text (str): Text to consider and parse content from.
 
         Returns:
@@ -222,12 +220,12 @@ class GraniteReasoningParser(ReasoningParser):
             current_text.endswith(response_start)
             for response_start in self.valid_response_starts
         )
-        if reasoning_content is None or ends_with_start_response_seq:
-            return DeltaMessage(reasoning_content=None, content=None)
+        if reasoning is None or ends_with_start_response_seq:
+            return DeltaMessage(reasoning=None, content=None)
 
         # Consider previous / current text only within context of the reasoning
-        previous_text = reasoning_content[: -len(delta_text)]
-        current_text = reasoning_content
+        previous_text = reasoning[: -len(delta_text)]
+        current_text = reasoning
 
         # We need to be careful about adding unfinished response sequences;
         # Find the place at which we MIGHT be starting a response sequence
@@ -253,32 +251,30 @@ class GraniteReasoningParser(ReasoningParser):
 
         # Delta only contains potential continued response sequence text.
         if delta_continues_substr:
-            return DeltaMessage(reasoning_content=None, content=None)
+            return DeltaMessage(reasoning=None, content=None)
 
         if not prev_was_substr:
             # Delta may be starting a new response seq but has other text too.
             if delta_new_substr:
-                return DeltaMessage(
-                    reasoning_content=delta_text[:delta_idx], content=None
-                )
+                return DeltaMessage(reasoning=delta_text[:delta_idx], content=None)
             # Normal case for most reasoning text (no potential special seqs).
-            return DeltaMessage(reasoning_content=delta_text, content=None)
+            return DeltaMessage(reasoning=delta_text, content=None)
         # The substring that previously seemed to be a potential response
         # seq wasn't one; we need to add the content to the delta message,
         # and also slice off the potential response sequence
         elif delta_new_substr:
-            reasoning_content = previous_text[prev_idx:] + delta_text[:delta_idx]
-            return DeltaMessage(reasoning_content=reasoning_content, content=None)
+            reasoning = previous_text[prev_idx:] + delta_text[:delta_idx]
+            return DeltaMessage(reasoning=reasoning, content=None)
         # No new substring yet, and we broke our old one; take the whole delta
         return DeltaMessage(
-            reasoning_content=previous_text[prev_idx:] + delta_text,
+            reasoning=previous_text[prev_idx:] + delta_text,
            content=None,
        )
 
    def _get_delta_message_with_both_bounds(
        self,
        delta_text: str,
-        reasoning_content: str,
+        reasoning: str,
        response_content: str,
        current_text: str,
        response_seq_len: int,
@@ -288,7 +284,7 @@ class GraniteReasoningParser(ReasoningParser):
 
        Args:
            delta_text: Text to consider and parse content from.
-            reasoning_content: reasoning content from current_text.
+            reasoning: reasoning content from current_text.
            response_content: response content from current_text.
            current_text: The full previous + delta text.
            response_seq_len: Len of the complete response sequence used.
@@ -301,20 +297,20 @@ class GraniteReasoningParser(ReasoningParser):
        reasoning_end_idx = len(delta_text) - (len(response_content) + response_seq_len)
 
        if reasoning_end_idx < 0:
-            delta_reasoning_content = None
+            delta_reasoning = None
        else:
            # Get the starting offset
-            start_reasoning_content_idx = (
-                len(reasoning_content) + response_seq_len + len(response_content) - 1
+            start_reasoning_idx = (
+                len(reasoning) + response_seq_len + len(response_content) - 1
            )
            delta_offset = len(current_text) - len(delta_text)
-            start_offset = start_reasoning_content_idx - delta_offset
+            start_offset = start_reasoning_idx - delta_offset
            if start_offset < 0:
                start_offset = 0
-            delta_reasoning_content = delta_text[start_offset:reasoning_end_idx]
+            delta_reasoning = delta_text[start_offset:reasoning_end_idx]
 
        return DeltaMessage(
-            reasoning_content=delta_reasoning_content,
+            reasoning=delta_reasoning,
            content=delta_content,
        )
 
@@ -333,7 +329,7 @@ class GraniteReasoningParser(ReasoningParser):
        (if there is one) and the non-reasoning content.
        """
        current_chunk_start = 0
-        start_reasoning_content = None
+        start_reasoning = None
        parsed_content = False
        delimiter_idxs = [
            idx
@@ -344,10 +340,10 @@ class GraniteReasoningParser(ReasoningParser):
        for current_chunk_end in delimiter_idxs:
            current_chunk = current_text[current_chunk_start:current_chunk_end]
            # Check to see if the start of reasoning seq if complete
-            if start_reasoning_content is None:
+            if start_reasoning is None:
                for think_start in self.valid_think_starts:
                    if current_chunk == think_start[:-1]:
-                        start_reasoning_content = current_chunk_end + 1
+                        start_reasoning = current_chunk_end + 1
                        current_chunk_start = current_chunk_end + 1
                        break
@@ -357,13 +353,11 @@ class GraniteReasoningParser(ReasoningParser):
                    if current_chunk[-len(response_start) + 1 :] == response_start[:-1]:
                        # Mark end of reasoning and start response content
                        # after the start of response sequence.
-                        end_reasoning_content = current_chunk_end - len(response_start)
-                        reasoning_content = current_text[
-                            start_reasoning_content:end_reasoning_content
-                        ]
+                        end_reasoning = current_chunk_end - len(response_start)
+                        reasoning = current_text[start_reasoning:end_reasoning]
                        response_content = current_text[current_chunk_end + 1 :]
-                        return reasoning_content, len(response_start), response_content
+                        return reasoning, len(response_start), response_content
 
-        if start_reasoning_content and not parsed_content:
-            return current_text[start_reasoning_content:], None, None
+        if start_reasoning and not parsed_content:
+            return current_text[start_reasoning:], None, None
        return None, None, None
@@ -86,7 +86,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
         # this id is not part of content, so just return [] here.
         return []
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
@@ -104,27 +104,27 @@ class HunyuanA13BReasoningParser(ReasoningParser):
 
         re_match = self.full_match_reasoning_regex.findall(model_output)
         if re_match:
-            reasoning_content, response_content = re_match[0]
-            if len(reasoning_content) == 0:
-                reasoning_content = None
+            reasoning, response_content = re_match[0]
+            if len(reasoning) == 0:
+                reasoning = None
             if len(response_content) == 0:
                 response_content = None
-            return reasoning_content, response_content
+            return reasoning, response_content
 
         fallback_regex = self.half_match_reasoning_regex
         fallback_match = fallback_regex.findall(model_output)
         if fallback_match:
-            reasoning_content, response_content = fallback_match[0]
+            reasoning, response_content = fallback_match[0]
 
             if response_content.endswith(self.response_end_expr):
                 response_content = response_content[: -len(self.response_end_expr)]
 
-            if len(reasoning_content) == 0:
-                reasoning_content = None
+            if len(reasoning) == 0:
+                reasoning = None
             if len(response_content) == 0:
                 response_content = None
 
-            return reasoning_content, response_content
+            return reasoning, response_content
 
         return None, model_output
 
@@ -140,7 +140,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
             sub_idx += 1
         return sub_idx == len(subsequence)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -223,19 +223,15 @@ class HunyuanA13BReasoningParser(ReasoningParser):
 
                 # Return content based on current state
                 if self.current_state == "think":
-                    return DeltaMessage(
-                        reasoning_content=buffered_content, content=None
-                    )
+                    return DeltaMessage(reasoning=buffered_content, content=None)
                 else:
-                    return DeltaMessage(
-                        reasoning_content=None, content=buffered_content
-                    )
+                    return DeltaMessage(reasoning=None, content=buffered_content)
             else:
                 # No buffered content, send normally
                 if self.current_state == "think":
-                    return DeltaMessage(reasoning_content=delta_text, content=None)
+                    return DeltaMessage(reasoning=delta_text, content=None)
                 else:
-                    return DeltaMessage(reasoning_content=None, content=delta_text)
+                    return DeltaMessage(reasoning=None, content=delta_text)
 
         # If no content to send in this delta
         return None
@@ -36,7 +36,7 @@ class IdentityReasoningParser(ReasoningParser):
         # Identity: return all tokens as content
         return input_ids
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -50,9 +50,9 @@ class IdentityReasoningParser(ReasoningParser):
             return DeltaMessage(content=delta_text)
         return None
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
-        # No reasoning separation: return None for reasoning_content,
+        # No reasoning separation: return None for reasoning,
         # and full model_output as content
         return None, model_output
@@ -48,7 +48,7 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         return input_ids
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -61,7 +61,7 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
             delta_text = "<think>" + delta_text
         return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
     ) -> tuple[str | None, str | None]:
         return None, "<think>" + model_output
@@ -115,7 +115,7 @@ class Olmo3ReasoningBuffer:
             if end_think_idx > 0:
                 # this covers the case there's content before
                 # the end of the reasoning block
-                return DeltaMessage(reasoning_content=pretext)
+                return DeltaMessage(reasoning=pretext)
 
         if self.state == Olmo3ReasoningState.REASONING:
             # we are inside reasoning block, return and empty
@@ -124,7 +124,7 @@ class Olmo3ReasoningBuffer:
                 text_buffer,
                 self.buffer,
             ) = self.buffer, ""
-            return DeltaMessage(reasoning_content=text_buffer)
+            return DeltaMessage(reasoning=text_buffer)
 
         if self.state == Olmo3ReasoningState.CONTENT:
             # we are outside reasoning block, return and empty
@@ -250,7 +250,7 @@ class Olmo3ReasoningParser(ReasoningParser):
         # this id is not part of content, so just return [] here.
         return []
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self,
         model_output: str,
         request: ChatCompletionRequest | ResponsesRequest,
@@ -271,14 +271,14 @@ class Olmo3ReasoningParser(ReasoningParser):
 
         re_match = self.reasoning_regex.match(model_output)
         if re_match:
-            reasoning_content = re_match.group("reasoning") or None
+            reasoning = re_match.group("reasoning") or None
             content = re_match.group("content") or None
-            return reasoning_content, content
+            return reasoning, content
 
         # no reasoning content
         return None, model_output
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -27,7 +27,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
         """The token that ends reasoning content."""
         return "</think>"
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
     ) -> tuple[str | None, str | None]:
         """
@@ -37,7 +37,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
         to be present, unlike other models that work with just the end token.
 
         For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
 
         Returns:
@@ -61,7 +61,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
             return None, model_output
 
         # Extract reasoning content from the model output.
-        reasoning_content, _, content = model_output.partition(self.end_token)
+        reasoning, _, content = model_output.partition(self.end_token)
 
         final_content = content or None
-        return reasoning_content, final_content
+        return reasoning, final_content
@@ -40,7 +40,7 @@ class Step3ReasoningParser(ReasoningParser):
                 "token in the tokenizer!"
             )
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
         self,
         previous_text: str,
         current_text: str,
@@ -54,7 +54,7 @@ class Step3ReasoningParser(ReasoningParser):
         Handles streaming output where previous + delta = current.
         Uses token IDs for faster processing.
         For text "abc</think>xyz":
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
         - 'xyz' goes to content
         """
         # Skip single special token
@@ -64,10 +64,10 @@ class Step3ReasoningParser(ReasoningParser):
         if self.think_end_token_id in delta_token_ids:
             # </think> in delta, extract reasoning content and remaining content
             end_index = delta_text.find(self.think_end_token)
-            reasoning_content = delta_text[:end_index]
+            reasoning = delta_text[:end_index]
             content = delta_text[end_index + len(self.think_end_token) :]
             return DeltaMessage(
-                reasoning_content=reasoning_content,
+                reasoning=reasoning,
                 content=content if content else None,
             )
         elif self.think_end_token_id in previous_token_ids:
@@ -75,9 +75,9 @@ class Step3ReasoningParser(ReasoningParser):
             return DeltaMessage(content=delta_text)
         else:
             # No </think> seen yet, everything is reasoning
-            return DeltaMessage(reasoning_content=delta_text)
+            return DeltaMessage(reasoning=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
         self, model_output: str, request: ChatCompletionRequest
     ) -> tuple[str | None, str | None]:
         # Check if the model output contains the </think> token
@@ -87,7 +87,7 @@ class Step3ReasoningParser(ReasoningParser):
         else:
             # Find the first occurrence of </think>
             end_index = model_output.find(self.think_end_token)
-            reasoning_content = model_output[:end_index]
+            reasoning = model_output[:end_index]
 
             # Content after </think> token
             content = model_output[end_index + len(self.think_end_token) :]
@@ -95,7 +95,7 @@ class Step3ReasoningParser(ReasoningParser):
             if len(content) == 0:
                 content = None
 
-            return reasoning_content, content
+            return reasoning, content
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids
@@ -30,18 +30,18 @@
         {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
     {%- elif message.role == "assistant" %}
         {%- set content = message.content %}
-        {%- set reasoning_content = '' %}
-        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
-            {%- set reasoning_content = message.reasoning_content %}
+        {%- set reasoning = '' %}
+        {%- if message.reasoning is defined and message.reasoning is not none %}
+            {%- set reasoning = message.reasoning %}
         {%- else %}
             {%- if '</think>' in message.content %}
                 {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
-                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set reasoning = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
             {%- endif %}
         {%- endif %}
         {%- if loop.index0 > ns.last_query_index %}
-            {%- if loop.last or (not loop.last and reasoning_content) %}
-                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- if loop.last or (not loop.last and reasoning) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
             {%- else %}
                 {{- '<|im_start|>' + message.role + '\n' + content }}
             {%- endif %}
@@ -121,8 +121,8 @@ def _prepare_apply_chat_template_tools_and_messages(
     #
     # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
     for message in messages:
-        # Remove reasoning_content as unsupported by Mistral
-        _ = message.pop("reasoning_content", None)  # type: ignore
+        # Remove reasoning as unsupported by Mistral
+        _ = message.pop("reasoning", None)  # type: ignore
 
         # The Mistral client, in comparison to the OpenAI client, requires the
         # "parameters" dict and the "description" string to be present