diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index 28520a80ed36..5f26c7cf182b 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -2,7 +2,10 @@
 
 vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
 
-Reasoning models return an additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
+Reasoning models return an additional `reasoning` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
+
+!!! warning
+    `reasoning` used to be called `reasoning_content`. For now, `reasoning_content` will continue to work. However, we encourage you to migrate to `reasoning`, as `reasoning_content` may be removed in the future.
 
 ## Supported Models
@@ -61,18 +64,18 @@ Next, make a request to the model that should return the reasoning content in th
     # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
     response = client.chat.completions.create(model=model, messages=messages)
 
-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content
 
-    print("reasoning_content:", reasoning_content)
+    print("reasoning:", reasoning)
     print("content:", content)
     ```
 
-The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
+The `reasoning` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
 
 ## Streaming chat completions
 
-Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
+Streaming chat completions are also supported for reasoning models. The `reasoning` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
 
 ??? console "Json"
@@ -88,7 +91,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
             "index": 0,
             "delta": {
                 "role": "assistant",
-                "reasoning_content": "is",
+                "reasoning": "is",
             },
             "logprobs": null,
             "finish_reason": null
@@ -97,7 +100,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
     }
     ```
 
-OpenAI Python client library does not officially support `reasoning_content` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning_content` attribute is present in the response. For example:
+The OpenAI Python client library does not officially support the `reasoning` attribute for streaming output, but the client does support extra attributes in the response. You can use `hasattr` to check whether the `reasoning` attribute is present in the response. For example:
 
 ??? code
 
@@ -127,22 +130,22 @@ OpenAI Python client library does not officially support `reasoning_content` att
     )
     print("client: Start streaming chat completions...")
 
-    printed_reasoning_content = False
+    printed_reasoning = False
    printed_content = False
 
     for chunk in stream:
-        # Safely extract reasoning_content and content from delta,
+        # Safely extract reasoning and content from delta,
         # defaulting to None if attributes don't exist or are empty strings
-        reasoning_content = (
-            getattr(chunk.choices[0].delta, "reasoning_content", None) or None
+        reasoning = (
+            getattr(chunk.choices[0].delta, "reasoning", None) or None
         )
         content = getattr(chunk.choices[0].delta, "content", None) or None
 
-        if reasoning_content is not None:
-            if not printed_reasoning_content:
-                printed_reasoning_content = True
-                print("reasoning_content:", end="", flush=True)
-            print(reasoning_content, end="", flush=True)
+        if reasoning is not None:
+            if not printed_reasoning:
+                printed_reasoning = True
+                print("reasoning:", end="", flush=True)
+            print(reasoning, end="", flush=True)
         elif content is not None:
             if not printed_content:
                 printed_content = True
@@ -151,11 +154,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
             print(content, end="", flush=True)
     ```
 
-Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
+Remember to check whether the `reasoning` field exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
 
 ## Tool Calling
 
-The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
+The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning` field.
 
 ??? code
@@ -192,7 +195,7 @@ The reasoning content is also available when both tool calling and the reasoning
     print(response)
     tool_call = response.choices[0].message.tool_calls[0].function
 
-    print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
+    print(f"reasoning: {response.choices[0].message.reasoning}")
     print(f"Function called: {tool_call.name}")
     print(f"Arguments: {tool_call.arguments}")
     ```
@@ -223,7 +226,7 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso
         def __init__(self, tokenizer: AnyTokenizer):
             super().__init__(tokenizer)
 
-        def extract_reasoning_content_streaming(
+        def extract_reasoning_streaming(
             self,
             previous_text: str,
             current_text: str,
@@ -240,7 +243,7 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso
             previously been parsed and extracted (see constructor)
             """
 
-        def extract_reasoning_content(
+        def extract_reasoning(
             self,
             model_output: str,
             request: ChatCompletionRequest | ResponsesRequest,
diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md
index 9e1da37ca962..e38627c70788 100644
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -204,7 +204,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th
             }
         },
     )
-    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+    print("reasoning: ", completion.choices[0].message.reasoning)
     print("content: ", completion.choices[0].message.content)
     ```
 
diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
index 4006d07f73b0..1dfc3084646d 100644
--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example demonstrates how to use tool calling with reasoning models
-like QwQ-32B. The reasoning_content will not be parsed by the tool
+like QwQ-32B. The reasoning will not be parsed by the tool
 calling process; only the final output will be parsed.
 
 To run this example, you need to start the vLLM server with both
@@ -78,7 +78,7 @@ messages = [
 
 
 def extract_reasoning_and_calls(chunks: list):
-    reasoning_content = ""
+    reasoning = ""
     tool_call_idx = -1
     arguments = []
     function_names = []
@@ -97,9 +97,9 @@ def extract_reasoning_and_calls(chunks: list):
                 if tool_call.function.arguments:
                     arguments[tool_call_idx] += tool_call.function.arguments
         else:
-            if hasattr(chunk.choices[0].delta, "reasoning_content"):
-                reasoning_content += chunk.choices[0].delta.reasoning_content
-    return reasoning_content, arguments, function_names
+            if hasattr(chunk.choices[0].delta, "reasoning"):
+                reasoning += chunk.choices[0].delta.reasoning
+    return reasoning, arguments, function_names
 
 
 def main():
@@ -115,7 +115,7 @@ def main():
     tool_calls = client.chat.completions.create(
         messages=messages, model=model, tools=tools
     )
-    print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+    print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
     print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}")
     print(
         f"function arguments: "
@@ -129,9 +129,9 @@ def main():
 
     chunks = list(tool_calls_stream)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
 
-    print(f"reasoning_content: {reasoning_content}")
+    print(f"reasoning: {reasoning}")
     print(f"function name: {function_names[0]}")
     print(f"function arguments: {arguments[0]}")
@@ -144,7 +144,7 @@ def main():
     )
 
     tool_call = tool_calls.choices[0].message.tool_calls[0].function
-    print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+    print(f"reasoning: {tool_calls.choices[0].message.reasoning}")
     print(f"function name: {tool_call.name}")
     print(f"function arguments: {tool_call.arguments}")
     print("----------Stream Generate With Named Function Calling--------------")
@@ -159,8 +159,8 @@ def main():
 
     chunks = list(tool_calls_stream)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
-    print(f"reasoning_content: {reasoning_content}")
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
+    print(f"reasoning: {reasoning}")
     print(f"function name: {function_names[0]}")
     print(f"function arguments: {arguments[0]}")
     print("\n\n")
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index 932dbeb2e7a2..87043897b058 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -38,10 +38,10 @@ def main():
     # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
     response = client.chat.completions.create(model=model, messages=messages)
 
-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content
 
-    print("reasoning_content for Round 1:", reasoning_content)
+    print("reasoning for Round 1:", reasoning)
     print("content for Round 1:", content)
 
     # Round 2
@@ -54,10 +54,10 @@ def main():
     )
     response = client.chat.completions.create(model=model, messages=messages)
 
-    reasoning_content = response.choices[0].message.reasoning_content
+    reasoning = response.choices[0].message.reasoning
     content = response.choices[0].message.content
 
-    print("reasoning_content for Round 2:", reasoning_content)
+    print("reasoning for Round 2:", reasoning)
     print("content for Round 2:", content)
 
 
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
index 7d1ea3771459..8e262701b720 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -20,7 +20,7 @@ in real-time as they are generated by the model.
 This is useful for scenarios where you want to display chat completions to
 the user as they are generated by the model.
 
-Remember to check content and reasoning_content exist in `ChatCompletionChunk`,
+Remember to check that content and reasoning exist in `ChatCompletionChunk`;
 content may not exist leading to errors if you try to access it.
 """
 
@@ -47,22 +47,20 @@ def main():
     stream = client.chat.completions.create(model=model, messages=messages, stream=True)
 
     print("client: Start streaming chat completions...")
-    printed_reasoning_content = False
+    printed_reasoning = False
     printed_content = False
 
     for chunk in stream:
-        # Safely extract reasoning_content and content from delta,
+        # Safely extract reasoning and content from delta,
         # defaulting to None if attributes don't exist or are empty strings
-        reasoning_content = (
-            getattr(chunk.choices[0].delta, "reasoning_content", None) or None
-        )
+        reasoning = getattr(chunk.choices[0].delta, "reasoning", None) or None
         content = getattr(chunk.choices[0].delta, "content", None) or None
 
-        if reasoning_content is not None:
-            if not printed_reasoning_content:
-                printed_reasoning_content = True
-                print("reasoning_content:", end="", flush=True)
-            print(reasoning_content, end="", flush=True)
+        if reasoning is not None:
+            if not printed_reasoning:
+                printed_reasoning = True
+                print("reasoning:", end="", flush=True)
+            print(reasoning, end="", flush=True)
         elif content is not None:
             if not printed_content:
                 printed_content = True
diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py
index 64c8a9178280..d60dbf4d753a 100644
--- a/examples/online_serving/streamlit_openai_chatbot_webserver.py
+++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py
@@ -159,8 +159,8 @@ def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None
     for chunk in response:
         delta = chunk.choices[0].delta
         # Stream reasoning first
-        if reason and hasattr(delta, "reasoning_content") and live_think:
-            rc = delta.reasoning_content
+        if reason and hasattr(delta, "reasoning") and live_think:
+            rc = delta.reasoning
             if rc:
                 think_text += rc
                 live_think.markdown(think_text + "▌")
@@ -262,8 +262,8 @@ def server_supports_reasoning():
         messages=[{"role": "user", "content": "Hi"}],
         stream=False,
     )
-    return hasattr(resp.choices[0].message, "reasoning_content") and bool(
-        resp.choices[0].message.reasoning_content
+    return hasattr(resp.choices[0].message, "reasoning") and bool(
+        resp.choices[0].message.reasoning
     )
diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py
index 02853a95469a..ff473d044e32 100644
--- a/examples/online_serving/structured_outputs/structured_outputs.py
+++ b/examples/online_serving/structured_outputs/structured_outputs.py
@@ -33,7 +33,7 @@ async def print_stream_response(
     async for chunk in stream_response:
         delta = chunk.choices[0].delta
 
-        reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None)
+        reasoning_chunk_text: str | None = getattr(delta, "reasoning", None)
         content_chunk_text = delta.content
 
         if args.reasoning:
@@ -255,8 +255,8 @@ async def cli():
     for constraint, response in zip(constraints, results):
         print(f"\n\n{constraint}:")
         message = response.choices[0].message
-        if args.reasoning and hasattr(message, "reasoning_content"):
-            print(f"  Reasoning: {message.reasoning_content or ''}")
+        if args.reasoning and hasattr(message, "reasoning"):
+            print(f"  Reasoning: {message.reasoning or ''}")
         print(f"  Content: {message.content!r}")
diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
index e452b578ba22..7b3092b56303 100644
--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@@ -80,7 +80,7 @@ FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
 
 
 def extract_reasoning_and_calls(chunks: list):
-    reasoning_content = ""
+    reasoning = ""
     tool_call_idx = -1
     arguments = []
     function_names = []
@@ -99,9 +99,9 @@ def extract_reasoning_and_calls(chunks: list):
                 if tool_call.function.arguments:
                     arguments[tool_call_idx] += tool_call.function.arguments
         else:
-            if hasattr(chunk.choices[0].delta, "reasoning_content"):
-                reasoning_content += chunk.choices[0].delta.reasoning_content
-    return reasoning_content, arguments, function_names
+            if hasattr(chunk.choices[0].delta, "reasoning"):
+                reasoning += chunk.choices[0].delta.reasoning
+    return reasoning, arguments, function_names
 
 
 # test streaming
@@ -119,8 +119,8 @@ async def test_chat_streaming_of_tool_and_reasoning(client: openai.AsyncOpenAI):
     async for chunk in stream:
         chunks.append(chunk)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
-    assert len(reasoning_content) > 0
+    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
+    assert len(reasoning) > 0
     assert len(function_names) > 0 and function_names[0] == FUNC_NAME
     assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
@@ -136,6 +136,6 @@ async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
         stream=False,
     )
 
-    assert len(tool_calls.choices[0].message.reasoning_content) > 0
+    assert len(tool_calls.choices[0].message.reasoning) > 0
     assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
     assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS
diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index 6d8db361a57d..53369f074eca 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -180,8 +180,8 @@ async def test_function_tool_use(
         extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
     )
     if enable_thinking:
-        assert chat_completion.choices[0].message.reasoning_content is not None
-        assert chat_completion.choices[0].message.reasoning_content != ""
+        assert chat_completion.choices[0].message.reasoning is not None
+        assert chat_completion.choices[0].message.reasoning != ""
         assert chat_completion.choices[0].message.tool_calls is not None
         assert len(chat_completion.choices[0].message.tool_calls) > 0
     else:
@@ -200,9 +200,9 @@ async def test_function_tool_use(
         async for chunk in output_stream:
             if chunk.choices:
                 if enable_thinking and getattr(
-                    chunk.choices[0].delta, "reasoning_content", None
+                    chunk.choices[0].delta, "reasoning", None
                 ):
-                    reasoning.append(chunk.choices[0].delta.reasoning_content)
+                    reasoning.append(chunk.choices[0].delta.reasoning)
                 if chunk.choices[0].delta.tool_calls:
                     output.extend(chunk.choices[0].delta.tool_calls)
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 2f678a0535cc..f951b57fe726 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -232,9 +232,9 @@ def test_reasoning_parser():
         assert isinstance(line_dict, dict)
         assert line_dict["error"] is None
 
-        # Check that reasoning_content is present and not empty
-        reasoning_content = line_dict["response"]["body"]["choices"][0]["message"][
-            "reasoning_content"
+        # Check that reasoning is present and not empty
+        reasoning = line_dict["response"]["body"]["choices"][0]["message"][
+            "reasoning"
         ]
-        assert reasoning_content is not None
-        assert len(reasoning_content) > 0
+        assert reasoning is not None
+        assert len(reasoning) > 0
diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py
index ddda50fe770a..d31b1c7d169b 100644
--- a/tests/reasoning/test_base_thinking_reasoning_parser.py
+++ b/tests/reasoning/test_base_thinking_reasoning_parser.py
@@ -151,57 +151,57 @@ class TestBaseThinkingReasoningParserMethods:
 class TestBaseThinkingReasoningParserExtraction:
     """Test reasoning content extraction methods."""
 
-    def test_extract_reasoning_content_with_both_tokens(self, test_tokenizer):
+    def test_extract_reasoning_with_both_tokens(self, test_tokenizer):
         """Test extraction when both start and end tokens are present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = "<think>This is reasoning</think>This is content"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == "This is reasoning"
         assert content == "This is content"
 
-    def test_extract_reasoning_content_only_end_token(self, test_tokenizer):
+    def test_extract_reasoning_only_end_token(self, test_tokenizer):
         """Test extraction when only end token is present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = "This is reasoning</think>This is content"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == "This is reasoning"
         assert content == "This is content"
 
-    def test_extract_reasoning_content_no_end_token(self, test_tokenizer):
+    def test_extract_reasoning_no_end_token(self, test_tokenizer):
         """Test extraction when no end token is present."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = "This is just content"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == "This is just content"
         assert content is None
 
-    def test_extract_reasoning_content_empty_output(self, test_tokenizer):
+    def test_extract_reasoning_empty_output(self, test_tokenizer):
         """Test extraction with empty output."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = ""
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == ""
         assert content is None
 
-    def test_extract_reasoning_content_only_tokens(self, test_tokenizer):
+    def test_extract_reasoning_only_tokens(self, test_tokenizer):
         """Test extraction with only tokens and no content."""
         parser = TestThinkingReasoningParser(test_tokenizer)
         request = ChatCompletionRequest(messages=[], model="test-model")
 
         model_output = "<think></think>"
-        reasoning, content = parser.extract_reasoning_content(model_output, request)
+        reasoning, content = parser.extract_reasoning(model_output, request)
 
         assert reasoning == ""
         assert content is None
diff --git a/tests/reasoning/test_deepseekr1_reasoning_parser.py b/tests/reasoning/test_deepseekr1_reasoning_parser.py
index 946d01c123c5..91f0c93653d3 100644
--- a/tests/reasoning/test_deepseekr1_reasoning_parser.py
+++ b/tests/reasoning/test_deepseekr1_reasoning_parser.py
@@ -21,97 +21,97 @@ def deepseek_r1_qwen_tokenizer():
 
 SIMPLE_REASONING = {
     "output": "This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING = {
     "output": "This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES = {
     "output": "This\nThat</think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING = {
     "output": "</think>This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING = {
     "output": "</think>This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 REASONING_WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": "<think>This\nThat</think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
     "output": "</think>This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_WITH_THINK = {
     "output": "</think>This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 THINK_NO_END = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY = {
     "output": "",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY_STREAMING = {
     "output": "",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
     "is_reasoning_end": False,
 }
 NEW_LINE = {
     "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -121,7 +121,7 @@
 # or not.
 NEW_LINE_STREAMING = {
     "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
-    "reasoning_content": "\nThis is a reasoning section",
+    "reasoning": "\nThis is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -269,7 +269,7 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
     # Test is_reasoning_end
diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py
index e1ff7462b1fa..6e8f0e8dcc9b 100644
--- a/tests/reasoning/test_deepseekv3_reasoning_parser.py
+++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py
@@ -44,14 +44,14 @@ def test_identity_reasoning_parser_basic(tokenizer):
     # Test extract_content_ids returns all input_ids
     assert parser.extract_content_ids(input_ids) == input_ids
 
-    # Test extract_reasoning_content returns (None, model_output)
+    # Test extract_reasoning returns (None, model_output)
     request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
-    reasoning, content = parser.extract_reasoning_content(input_text, request)
+    reasoning, content = parser.extract_reasoning(input_text, request)
     assert reasoning is None
     assert content == input_text
 
-    # Test extract_reasoning_content_streaming returns DeltaMessage or None
-    result = parser.extract_reasoning_content_streaming(
+    # Test extract_reasoning_streaming returns DeltaMessage or None
+    result = parser.extract_reasoning_streaming(
         previous_text="",
         current_text="Hello world",
         delta_text="Hello world",
@@ -63,7 +63,7 @@ def test_identity_reasoning_parser_basic(tokenizer):
     assert result.content == "Hello world"
 
     # If delta_text is empty, should return None
-    result_none = parser.extract_reasoning_content_streaming(
+    result_none = parser.extract_reasoning_streaming(
         previous_text="Hello world",
         current_text="Hello world",
         delta_text="",
diff --git a/tests/reasoning/test_ernie45_reasoning_parser.py b/tests/reasoning/test_ernie45_reasoning_parser.py
index 344478013e6b..dbf5507ae68b 100644
--- a/tests/reasoning/test_ernie45_reasoning_parser.py
+++ b/tests/reasoning/test_ernie45_reasoning_parser.py
@@ -20,36 +20,36 @@ def ernie45_tokenizer():
 
 # 带 </think>,非stream
 WITH_THINK = {
     "output": "abc</think>def",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": "def",
 }
 # 带 </think>,stream
 WITH_THINK_STREAM = {
     "output": "abc</think>def",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": "def",
 }
-# without </think>, all is reasoning_content
+# without </think>, all is reasoning
 WITHOUT_THINK = {
     "output": "abc",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": None,
 }
-# without </think>, all is reasoning_content
+# without </think>, all is reasoning
 WITHOUT_THINK_STREAM = {
     "output": "abc",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": None,
 }
 COMPLETE_REASONING = {
     "output": "abc</think>",
-    "reasoning_content": "abc",
+    "reasoning": "abc",
     "content": None,
 }
 MULTILINE_REASONING = {
     "output": "abc\nABC</think>def\nDEF",
-    "reasoning_content": "abc\nABC",
+    "reasoning": "abc\nABC",
     "content": "def\nDEF",
 }
@@ -120,5 +120,5 @@ def test_reasoning(
 
     print()
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
diff --git a/tests/reasoning/test_glm4_moe_reasoning_parser.py b/tests/reasoning/test_glm4_moe_reasoning_parser.py
index 0a8595a00fcb..6f7827e5b827 100644
--- a/tests/reasoning/test_glm4_moe_reasoning_parser.py
+++ b/tests/reasoning/test_glm4_moe_reasoning_parser.py
@@ -21,54 +21,54 @@ def glm45_tokenizer():
 
 WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 WITH_THINK_STREAM = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 WITHOUT_THINK = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": False,
 }
 WITHOUT_THINK_STREAM = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": False,
 }
 COMPLETE_REASONING = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTILINE_REASONING = {
     "output": "<think>This is a reasoning\nsection</think>This is the rest\nThat",
-    "reasoning_content": "This is a reasoning\nsection",
+    "reasoning": "This is a reasoning\nsection",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 ONLY_OPEN_TAG = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is a reasoning section",
     "is_reasoning_end": False,
 }
 ONLY_OPEN_TAG_STREAM = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
@@ -184,7 +184,7 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
     output_ids = glm45_tokenizer.convert_tokens_to_ids(output)
diff --git a/tests/reasoning/test_granite_reasoning_parser.py b/tests/reasoning/test_granite_reasoning_parser.py
index de1663408d72..14aad3ad0818 100644
--- a/tests/reasoning/test_granite_reasoning_parser.py
+++ b/tests/reasoning/test_granite_reasoning_parser.py
@@ -12,37 +12,37 @@ START_RESPONSE = "Here is my response:"
 
 SIMPLE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 NO_REASONING = {
     "output": "This is content",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is content",
 }
 MULTIPLE_LINES = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
@@ -141,7 +141,7 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
 
@@ -155,7 +155,7 @@ STREAMING_1 = {
     "previous_text": None,
     "current_text": "Here",
     "delta_text": "Here",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # When we fail, we should give what was previously being silenced first
@@ -163,7 +163,7 @@ STREAMING_2 = {
     "previous_text": "Here is my thought",
     "current_text": "Here is my thought failure",
     "delta_text": " failure",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "Here is my thought failure",
 }
 # But then after the first one, we should only add the delta text to content
@@ -171,7 +171,7 @@ STREAMING_3 = {
     "previous_text": "Here wrong",
     "current_text": " words",
     "delta_text": " Here wrong words",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": " words",
 }
 # But then after the first one, we should only add the delta text to content
@@ -179,7 +179,7 @@ STREAMING_4 = {
     "previous_text": "Here is my thought",
     "current_text": "Here is my thought process:",
     "delta_text": " process:",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # Reasoning started successfully; parse reasoning content
@@ -187,7 +187,7 @@ STREAMING_5 = {
     "previous_text": "Here is my thought process:",
     "current_text": "Here is my thought process: foo",
     "delta_text": " foo",
-    "reasoning_content": " foo",
+    "reasoning": " foo",
     "content": None,
 }
 # Response special sequence has started, but not finished.
@@ -195,7 +195,7 @@ STREAMING_6 = {
     "previous_text": "Here is my thought process: foo",
     "current_text": "Here is my thought process: foo Here is",
     "delta_text": " Here is",
-    "reasoning_content": " ",
+    "reasoning": " ",
     "content": None,
 }
 # Response special sequence started, but was broken; the reasoning
@@ -204,7 +204,7 @@ STREAMING_7 = {
     "previous_text": "Here is my thought process: foo Here is",
     "current_text": "Here is my thought process: foo Here is Here",
     "delta_text": " Here",
-    "reasoning_content": "Here is ",
+    "reasoning": "Here is ",
     "content": None,
 }
 # Response special sequence is ongoing
@@ -212,7 +212,7 @@ STREAMING_8 = {
     "previous_text": "Here is my thought process: foo Here is my response:",
     "current_text": "Here is my thought process: foo Here is my response: bar",
     "delta_text": " bar",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": " bar",
 }
 # The delta text has everything; we should be able to correctly parse both
@@ -220,7 +220,7 @@ STREAMING_9 = {
     "previous_text": None,
     "current_text": "Here is my thought process: foo Here is my response: bar",
     "delta_text": "Here is my thought process: foo Here is my response: bar",
-    "reasoning_content": " foo ",
+    "reasoning": " foo ",
     "content": " bar",
 }
 ## The Response is ongoing, and the delta mixes reasoning content / content
@@ -228,7 +228,7 @@ STREAMING_10 = {
     "previous_text": "Here is my thought process: foo",
     "current_text": "Here is my thought process: foo bar Here is my response: baz",
     "delta_text": " bar Here is my response: baz",
-    "reasoning_content": " bar ",
+    "reasoning": " bar ",
     "content": " baz",
 }
 # The delta text starts a new substring that might be a response special seq
@@ -236,7 +236,7 @@ STREAMING_11 = {
     "previous_text": "Here is my thought process: This is a reasoning section ",
     "current_text": "Here is my thought process: This is a reasoning section Here",
     "delta_text": "Here",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 # The delta text is finishing the response special seq
@@ -244,14 +244,14 @@ STREAMING_12 = {
     "previous_text": "Here is my thought process: foo Here is my response",
     "current_text": "Here is my thought process: foo Here is my response:",
     "delta_text": ":",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
 }
 STREAMING_13 = {
     "previous_text": "Here is my thought process: foo Here",
     "current_text": "Here is my thought process: foo Here was",
     "delta_text": " was",
-    "reasoning_content": "Here was",
+    "reasoning": "Here was",
     "content": None,
 }
@@ -326,7 +326,7 @@ def test_streaming_subcases(param_dict):
         tokenizer
     )
 
-    response = parser.extract_reasoning_content_streaming(
+    response = parser.extract_reasoning_streaming(
         previous_text=param_dict["previous_text"],
         current_text=param_dict["current_text"],
         delta_text=param_dict["delta_text"],
@@ -336,9 +336,9 @@ def test_streaming_subcases(param_dict):
     )
     # Streaming currently expects at least one of reasoning content / content,
     # so the response should return None in that case.
-    if param_dict["reasoning_content"] is None and param_dict["content"] is None:
+    if param_dict["reasoning"] is None and param_dict["content"] is None:
         assert response is None
     else:
         assert isinstance(response, DeltaMessage)
-        assert param_dict["reasoning_content"] == response.reasoning_content
+        assert param_dict["reasoning"] == response.reasoning
         assert param_dict["content"] == response.content
diff --git a/tests/reasoning/test_hunyuan_reasoning_parser.py b/tests/reasoning/test_hunyuan_reasoning_parser.py
index b7e3ea73ccde..32e753d2abb7 100644
--- a/tests/reasoning/test_hunyuan_reasoning_parser.py
+++ b/tests/reasoning/test_hunyuan_reasoning_parser.py
@@ -14,49 +14,49 @@ END_RESPONSE = "\n</answer>"
 
 NO_REASONING_QUICK_THROUGHT = {
     "output": f"{START_REASONING}{START_RESPONSE}This is the rest{END_RESPONSE}",  # noqa: E501
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }
 SIMPLE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest{END_RESPONSE}",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 COMPLETE_REASONING_WITH_SYMBOL = {
     "output": f"{START_REASONING}This is a reasoning section!{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section!",
+    "reasoning": "This is a reasoning section!",
     "content": None,
 }
 NO_REASONING = {
     "output": "This is content",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is content",
 }
 MULTIPLE_LINES = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
 REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
 }
@@ -164,5 +164,5 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
diff --git a/tests/reasoning/test_mistral_reasoning_parser.py b/tests/reasoning/test_mistral_reasoning_parser.py
index ff7f94b40ee1..5163c863863a 100644
--- a/tests/reasoning/test_mistral_reasoning_parser.py
+++ b/tests/reasoning/test_mistral_reasoning_parser.py
@@ -20,97 +20,97 @@ def mistral_tokenizer():
 
 SIMPLE_REASONING = {
     "output": "This is a reasoning section[/THINK]This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING = {
     "output": "This is a reasoning section[/THINK]",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES = {
     "output": "This\nThat[/THINK]This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 REASONING_WITH_THINK = {
     "output": "[THINK]This is a reasoning section[/THINK]This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING_WITH_THINK = {
     "output": "[THINK]This is a reasoning section[/THINK]",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 MULTIPLE_LINES_WITH_THINK = {
     "output": "[THINK]This\nThat[/THINK]This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 SHORTEST_REASONING_WITH_THINK = {
     "output": "[/THINK]This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 THINK_NO_END = {
     "output": "[THINK]This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY = {
     "output": "",
-    "reasoning_content": "",
+    "reasoning": "",
     "content": None,
     "is_reasoning_end": False,
 }
 EMPTY_STREAMING = {
     "output": "",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": None,
     "is_reasoning_end": False,
 }
 NEW_LINE = {
     "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -120,7 +120,7 @@
 # or not.
 NEW_LINE_STREAMING = {
     "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
-    "reasoning_content": "\nThis is a reasoning section",
+    "reasoning": "\nThis is a reasoning section",
     "content": "\nThis is the rest",
     "is_reasoning_end": True,
 }
@@ -307,7 +307,7 @@ def test_mistral_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
 
     # Test is_reasoning_end
diff --git a/tests/reasoning/test_olmo3_reasoning_parser.py b/tests/reasoning/test_olmo3_reasoning_parser.py
index 4a2eca994610..bc0e72e2a456 100644
--- a/tests/reasoning/test_olmo3_reasoning_parser.py
+++ b/tests/reasoning/test_olmo3_reasoning_parser.py
@@ -13,43 +13,43 @@ END_REASONING = "</think>"
 
 NO_REASONING = {
     "output": f"{START_REASONING}{END_REASONING}No thoughts, head empty!",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "No thoughts, head empty!",
 }
 NO_REASONING_WITH_NEWLINE = {
     "output": f"{START_REASONING}\n{END_REASONING}\n\nNo thoughts, head empty!",
-    "reasoning_content": "\n",
+    "reasoning": "\n",
     "content": "\n\nNo thoughts, head empty!",
 }
 SIMPLE_REASONING = {
     "output": f"{START_REASONING}This is a reasoning section{END_REASONING}This is the rest",  # noqa: E501
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 SIMPLE_REASONING_WITH_NEWLINE = {
     "output": f"{START_REASONING} Look!\n\nI'm thinking...{END_REASONING}\nThis is the rest",  # noqa: E501
-    "reasoning_content": " Look!\n\nI'm thinking...",
+    "reasoning": " Look!\n\nI'm thinking...",
     "content": "\nThis is the rest",
 }
 SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES = {
     "output": f"{START_REASONING}\nLook!\nI'm thinking...\n\n{END_REASONING}\n\n\nThis is the rest",  # noqa: E501
-    "reasoning_content": "\nLook!\nI'm thinking...\n\n",
+    "reasoning": "\nLook!\nI'm thinking...\n\n",
     "content": "\n\n\nThis is the rest",
 }
 NO_REASONING_ONLY_END_THINK = {
     "output": f"{END_REASONING}\n\nNo thoughts, head empty!",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "\n\nNo thoughts, head empty!",
 }
 REASONING_ONLY_END_THINK = {
     "output": f"The user is asking me not to think.{END_REASONING}No thoughts!",
-    "reasoning_content": "The user is asking me not to think.",
+    "reasoning": "The user is asking me not to think.",
     "content": "No thoughts!",
 }
@@ -148,5 +148,5 @@ def test_reasoning(
         reasoning_parser=parser, model_output=model_output, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py
index c06e40d72de2..92a8b6ab3761 100644
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -22,47 +22,47 @@ def qwen3_tokenizer():
 
 # 带 <think></think>,非stream
 WITH_THINK = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 # 带 <think></think>,stream
 WITH_THINK_STREAM = {
     "output": "<think>This is a reasoning section</think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
 }
 # 不带 <think></think>,非stream
 WITHOUT_THINK = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }
 # 不带 <think></think>,stream
 WITHOUT_THINK_STREAM = {
     "output": "This is the rest",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
     "output": "<think>This is a reasoning section</think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
 MULTILINE_REASONING = {
     "output": "<think>This is a reasoning\nsection</think>This is the rest\nThat",
-    "reasoning_content": "This is a reasoning\nsection",
+    "reasoning": "This is a reasoning\nsection",
     "content": "This is the rest\nThat",
 }
 ONLY_OPEN_TAG = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": None,
+    "reasoning": None,
     "content": "This is a reasoning section",
 }
 ONLY_OPEN_TAG_STREAM = {
     "output": "<think>This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
 }
@@ -138,5 +138,5 @@ def test_reasoning(
         parser, output_tokens, streaming=streaming
     )
 
-    assert reasoning == param_dict["reasoning_content"]
+    assert reasoning == param_dict["reasoning"]
     assert content == param_dict["content"]
diff --git a/tests/reasoning/test_seedoss_reasoning_parser.py b/tests/reasoning/test_seedoss_reasoning_parser.py
index b356b8545f41..33d56d32965a 100644
--- a/tests/reasoning/test_seedoss_reasoning_parser.py
+++ b/tests/reasoning/test_seedoss_reasoning_parser.py
@@ -28,49 +28,49 @@ def seedoss_tokenizer():
 
 SIMPLE_REASONING: dict[str, Any] = {
     "output": "This is a reasoning section</seed:think>This is the rest",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 COMPLETE_REASONING: dict[str, Any] = {
     "output": "This is a reasoning section</seed:think>",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": True,
 }
 NO_CONTENT: dict[str, Any] = {
     "output": "This is content",
-    "reasoning_content": "This is content",
+    "reasoning": "This is content",
     "content": None,
     "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING: dict[str, Any] = {
     "output": "This is a reasoning section",
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": None,
     "is_reasoning_end": False,
 }
 MULTIPLE_LINES: dict[str, Any] = {
     "output": "This\nThat</seed:think>This is the rest\nThat",
-    "reasoning_content": "This\nThat",
+    "reasoning": "This\nThat",
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
 WITH_START_TOKEN: dict[str, Any] = {
     "output": ("<seed:think>This is a reasoning section</seed:think>This is the rest"),
-    "reasoning_content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 ONLY_END_TOKEN: dict[str, Any] = {
     "output": "Some reasoning</seed:think>This is the rest",
-    "reasoning_content": "Some reasoning",
+    "reasoning": "Some reasoning",
     "content": "This is the rest",
     "is_reasoning_end": True,
 }
 NO_TOKENS: dict[str, Any] = {
     "output": "This is just content without any reasoning tokens",
-    "reasoning_content": "This is just content without any reasoning tokens",
+    "reasoning": "This is just content without any reasoning tokens",
     "content": None,
     "is_reasoning_end": False,
 }
@@ -95,7 +95,7 @@ def test_simple_reasoning(seedoss_tokenizer, streaming):
         parser, [cast(str, SIMPLE_REASONING["output"])], streaming=streaming
     )
 
-    assert reasoning == SIMPLE_REASONING["reasoning_content"]
+    assert reasoning == SIMPLE_REASONING["reasoning"]
     assert content == SIMPLE_REASONING["content"]
 
 
@@ -109,7 +109,7 @@ def test_complete_reasoning(seedoss_tokenizer, streaming):
         parser, [cast(str, COMPLETE_REASONING["output"])], streaming=streaming
     )
 
-    assert reasoning == COMPLETE_REASONING["reasoning_content"]
+    assert reasoning == COMPLETE_REASONING["reasoning"]
     assert content == COMPLETE_REASONING["content"]
 
 
@@ -123,7 +123,7 @@ def test_no_content(seedoss_tokenizer, streaming):
         parser, [cast(str, NO_CONTENT["output"])], streaming=streaming
     )
 
-    assert reasoning == NO_CONTENT["reasoning_content"]
+    assert reasoning == NO_CONTENT["reasoning"]
     assert content == NO_CONTENT["content"]
 
 
@@ -137,7 +137,7 @@ def test_multiple_lines(seedoss_tokenizer, streaming):
         parser, [cast(str, MULTIPLE_LINES["output"])], streaming=streaming
     )
 
-    assert reasoning == MULTIPLE_LINES["reasoning_content"]
+    assert reasoning == MULTIPLE_LINES["reasoning"]
     assert content == MULTIPLE_LINES["content"]
 
 
@@ -151,7 +151,7 @@ def test_with_start_token(seedoss_tokenizer, streaming):
         parser, [cast(str, WITH_START_TOKEN["output"])], streaming=streaming
    )
 
-    assert reasoning == WITH_START_TOKEN["reasoning_content"]
+    assert reasoning == WITH_START_TOKEN["reasoning"]
     assert content == WITH_START_TOKEN["content"]
 
 
@@ -168,7 +168,7 @@ def test_only_end_token(seedoss_tokenizer, streaming):
         parser, [cast(str, ONLY_END_TOKEN["output"])], streaming=streaming
    )
 
-    assert reasoning == ONLY_END_TOKEN["reasoning_content"]
+    assert reasoning == ONLY_END_TOKEN["reasoning"]
     assert content == ONLY_END_TOKEN["content"]
 
 
@@ -182,7 +182,7 @@ def test_no_tokens(seedoss_tokenizer, streaming):
         parser, [cast(str, NO_TOKENS["output"])], streaming=streaming
     )
 
-    assert reasoning == NO_TOKENS["reasoning_content"]
+    assert reasoning == NO_TOKENS["reasoning"]
     assert content == NO_TOKENS["content"]
diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py
index ccd4ff8dd263..bd0b230a847c 100644
--- a/tests/reasoning/utils.py
+++ b/tests/reasoning/utils.py
@@ -9,25 +9,28 @@ from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
 class StreamingReasoningReconstructor:
     def __init__(self):
-        self.reasoning_content = None
+        self.reasoning = None
         self.other_content = None
 
     def append_delta(self, delta: DeltaMessage):
         # content and the reasoning content should not be present
         # at the same time
-        assert delta.content is None or delta.reasoning_content is None, (
+        assert delta.content is None or delta.reasoning is None, (
             "Both content and reasoning content are present in the delta message"
         )
+        assert delta.reasoning == delta.reasoning_content, (
+            "reasoning_content should mirror reasoning for backwards compatibility"
+        )
         if delta.content is not None:
             if self.other_content is None:
                 self.other_content = delta.content
             else:
                 self.other_content += delta.content
         else:
-            if self.reasoning_content is None:
-                self.reasoning_content = delta.reasoning_content
+            if self.reasoning is None:
+                self.reasoning = delta.reasoning
             else:
-                self.reasoning_content += delta.reasoning_content
+                self.reasoning += delta.reasoning
 
 
 def run_reasoning_extraction(
@@ -43,7 +46,7 @@ def run_reasoning_extraction(
             request,
         )
         return (
-            reconstructor.reasoning_content,
+            reconstructor.reasoning,
             reconstructor.other_content or None,
         )
     else:
@@ -69,7 +72,7 @@ def run_reasoning_extraction_mistral(
             request,
         )
         return (
-            reconstructor.reasoning_content,
+            reconstructor.reasoning,
             reconstructor.other_content or None,
         )
     else:
@@ -88,7 +91,7 @@ def run_reasoning_extraction_nonstreaming(
     request: ChatCompletionRequest | None = None,
 ) -> tuple[str | None, str | None]:
     request = request or ChatCompletionRequest(messages=[], model="test-model")
-    return reasoning_parser.extract_reasoning_content(
+    return reasoning_parser.extract_reasoning(
         model_output="".join(model_output), request=request
     )
@@ -110,7 +113,7 @@ def run_reasoning_extraction_streaming(
         ]
         current_text = previous_text + delta
         current_tokens = previous_tokens + token_delta
-        delta_message = reasoning_parser.extract_reasoning_content_streaming(
+        delta_message = reasoning_parser.extract_reasoning_streaming(
             previous_text,
             current_text,
             delta,
@@ -142,7 +145,7 @@ def run_reasoning_extraction_streaming_mistral(
         delta = reasoning_parser.model_tokenizer.convert_ids_to_tokens([model_delta])[0]
         current_text = previous_text + delta
         current_tokens = previous_tokens + token_delta
-        delta_message = reasoning_parser.extract_reasoning_content_streaming(
+        delta_message = reasoning_parser.extract_reasoning_streaming(
             previous_text,
             current_text,
             delta,
diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py
index 926ad2503398..1ada8ee187c3 100644
--- a/tests/tokenization/test_mistral_tokenizer.py
+++ b/tests/tokenization/test_mistral_tokenizer.py
@@ -102,7 +102,7 @@ def test_prepare_apply_chat_template_tools_and_messages(
     assert actual_request == expected_mistral_output
 
 
-# Tool use with list content and reasoning_content
+# Tool use with list content and reasoning
 @pytest.mark.parametrize(
     "openai_request,expected_mistral_output",
     [
@@ -115,7 +115,7 @@ def test_prepare_apply_chat_template_tools_and_messages(
                 },
                 {
                     "role": "assistant",
-                    "reasoning_content": None,
+                    "reasoning": None,
                     "content": None,
                     "tool_calls": [
                         {
diff --git a/tests/tool_use/test_ernie45_moe_tool_parser.py b/tests/tool_use/test_ernie45_moe_tool_parser.py
index fb5af6e13a96..36a07bb561d9 100644
--- a/tests/tool_use/test_ernie45_moe_tool_parser.py
+++ b/tests/tool_use/test_ernie45_moe_tool_parser.py
@@ -337,7 +337,7 @@ def test_extract_tool_calls_streaming_incremental(
         if (
             delta_message.role is None
             and delta_message.content is None
-            and delta_message.reasoning_content is None
+            and delta_message.reasoning is None
             and len(delta_message.tool_calls) == 0
         ):
             continue
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index 676423f2ca91..4cd26e7b41d3 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -674,10 +674,10 @@ def test_structured_output_with_reasoning_matrices(
     assert output is not None and isinstance(output, RequestOutput)
     prompt = output.prompt
 
     generated_text = output.outputs[0].text
-    reasoning_content, content = run_reasoning_extraction(reasoner, [generated_text])
-    print(f"Prompt: {prompt!r}\nReasoning: {reasoning_content!r}\nContent: {content!r}")
+    reasoning, content = run_reasoning_extraction(reasoner, [generated_text])
+    print(f"Prompt: {prompt!r}\nReasoning: {reasoning!r}\nContent: {content!r}")
 
-    assert content is not None and reasoning_content is not None
+    assert content is not None and reasoning is not None
 
     output_json = json.loads(content)
     jsonschema.validate(instance=output_json, schema=reasoning_schema)
diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py
index 7958d0317739..47a252348c10 100644
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -521,15 +521,15 @@ def parse_chat_output(
parse_chat_output( is_tool_call = False # TODO: update this when tool call is supported if len(output_msgs) == 0: # The generation has stopped during reasoning. - reasoning_content = parser.current_content + reasoning = parser.current_content final_content = None elif len(output_msgs) == 1: # The generation has stopped during final message. - reasoning_content = output_msgs[0].content[0].text + reasoning = output_msgs[0].content[0].text final_content = parser.current_content else: reasoning_msg = output_msgs[:-1] final_msg = output_msgs[-1] - reasoning_content = "\n".join([msg.content[0].text for msg in reasoning_msg]) + reasoning = "\n".join([msg.content[0].text for msg in reasoning_msg]) final_content = final_msg.content[0].text - return reasoning_content, final_content, is_tool_call + return reasoning, final_content, is_tool_call diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index cf80c4fccbad..69e757d4764d 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2102,7 +2102,15 @@ class ChatMessage(OpenAIBaseModel): tool_calls: list[ToolCall] = Field(default_factory=list) # vLLM-specific fields that are not in OpenAI spec + reasoning: str | None = None reasoning_content: str | None = None + """Deprecated: use `reasoning` instead.""" + + @model_validator(mode="after") + def handle_deprecated_reasoning_content(self): + """Copy reasoning to reasoning_content for backward compatibility.""" + self.reasoning_content = self.reasoning + return self class ChatCompletionLogProb(OpenAIBaseModel): @@ -2156,9 +2164,17 @@ class ChatCompletionResponse(OpenAIBaseModel): class DeltaMessage(OpenAIBaseModel): role: str | None = None content: str | None = None + reasoning: str | None = None reasoning_content: str | None = None + """Deprecated: use `reasoning` instead.""" tool_calls: list[DeltaToolCall] = Field(default_factory=list) + @model_validator(mode="after") + def handle_deprecated_reasoning_content(self): + """Copy reasoning to reasoning_content for backward compatibility.""" + self.reasoning_content = self.reasoning + return self + class ChatCompletionResponseStreamChoice(OpenAIBaseModel): index: int diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 888aa4eb6fa8..59e1c8d53179 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -759,9 +759,7 @@ class OpenAIServingChat(OpenAIServing): delta_message = DeltaMessage(content=delta_text) elif cur_channel == "analysis": if request.include_reasoning: - delta_message = DeltaMessage( - reasoning_content=delta_text - ) + delta_message = DeltaMessage(reasoning=delta_text) else: delta_message = None elif ( @@ -823,7 +821,7 @@ class OpenAIServingChat(OpenAIServing): ): assert reasoning_parser is not None delta_message = ( - reasoning_parser.extract_reasoning_content_streaming( + reasoning_parser.extract_reasoning_streaming( previous_text, current_text, delta_text, @@ -836,7 +834,7 @@ class OpenAIServingChat(OpenAIServing): # or think end id in prompt_token_ids # i.e {"enable_thinking": False}, # set reasoning status to end. - # Only keep 'content', remove 'reasoning_content'. + # Only keep 'content', remove 'reasoning'. 
                        if reasoning_parser.is_reasoning_end(
                            as_list(output.token_ids)
                        ) or (
@@ -899,7 +897,7 @@ class OpenAIServingChat(OpenAIServing):
 
                if self.reasoning_parser and not reasoning_end_arr[i]:
                    delta_message = (
-                        reasoning_parser.extract_reasoning_content_streaming(
+                        reasoning_parser.extract_reasoning_streaming(
                            previous_text,
                            current_text,
                            delta_text,
@@ -948,7 +946,7 @@ class OpenAIServingChat(OpenAIServing):
                    output_token_ids = as_list(output.token_ids)
                    if not reasoning_end_arr[i]:
                        delta_message = (
-                            reasoning_parser.extract_reasoning_content_streaming(
+                            reasoning_parser.extract_reasoning_streaming(
                                previous_text,
                                current_text,
                                delta_text,
@@ -961,7 +959,7 @@ class OpenAIServingChat(OpenAIServing):
                        # i.e {"enable_thinking": False},
                        # set reasoning status to end.
                        # Remove the text and token ids related
-                        # to 'reasoning_content'.
+                        # to 'reasoning'.
                        if (
                            res.prompt_token_ids
                            and reasoning_parser.is_reasoning_end(
@@ -978,7 +976,7 @@ class OpenAIServingChat(OpenAIServing):
                        # When encountering think end id in delta_token_ids,
                        # set reasoning status to end.
                        # Remove the text and token ids related
-                        # to 'reasoning_content'.
+                        # to 'reasoning'.
                        if reasoning_parser.is_reasoning_end(output_token_ids):
                            reasoning_end_arr[i] = True
                            current_token_ids = (
@@ -1033,15 +1031,13 @@ class OpenAIServingChat(OpenAIServing):
 
                # when only reasoning
                elif self.reasoning_parser:
-                    delta_message = (
-                        reasoning_parser.extract_reasoning_content_streaming(
-                            previous_text,
-                            current_text,
-                            delta_text,
-                            previous_token_ids,
-                            current_token_ids,
-                            output.token_ids,
-                        )
+                    delta_message = reasoning_parser.extract_reasoning_streaming(
+                        previous_text,
+                        current_text,
+                        delta_text,
+                        previous_token_ids,
+                        current_token_ids,
+                        output.token_ids,
                    )
                # handle streaming just a content delta
                else:
@@ -1334,9 +1330,9 @@ class OpenAIServingChat(OpenAIServing):
            logprobs = None
 
        if self.use_harmony:
-            reasoning_content, content, _ = parse_chat_output(token_ids)
+            reasoning, content, _ = parse_chat_output(token_ids)
            if not request.include_reasoning:
-                reasoning_content = None
+                reasoning = None
            if self.tool_parser is not None:
                tool_parser = self.tool_parser(tokenizer)
@@ -1349,14 +1345,14 @@ class OpenAIServingChat(OpenAIServing):
                    content = tool_call_info.content
                message = ChatMessage(
                    role=role,
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                    content=content,
                    tool_calls=tool_call_info.tool_calls,
                )
            else:
                message = ChatMessage(
                    role=role,
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                    content=content,
                )
@@ -1390,13 +1386,13 @@ class OpenAIServingChat(OpenAIServing):
                return self.create_error_response(str(e))
 
            # If the reasoning parser is enabled,
            # tool calls are extracted exclusively from the content.
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(
+            reasoning, content = reasoning_parser.extract_reasoning(
                output.text, request=request
            )
            if not request.include_reasoning:
-                reasoning_content = None
+                reasoning = None
        else:
-            reasoning_content = None
+            reasoning = None
            content = output.text
 
        auto_tools_called = False
@@ -1416,9 +1412,7 @@ class OpenAIServingChat(OpenAIServing):
            not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
            and request.tool_choice != "required"
        ):
-            message = ChatMessage(
-                role=role, reasoning_content=reasoning_content, content=content
-            )
+            message = ChatMessage(role=role, reasoning=reasoning, content=content)
 
        # if the request uses tools and specified a tool choice
        elif (
@@ -1428,7 +1422,7 @@ class OpenAIServingChat(OpenAIServing):
            assert tool_calls is not None and len(tool_calls) > 0
            message = ChatMessage(
                role=role,
-                reasoning_content=reasoning_content,
+                reasoning=reasoning,
                content="",
                tool_calls=[tool_call_class(function=tc) for tc in tool_calls],
            )
@@ -1452,15 +1446,13 @@ class OpenAIServingChat(OpenAIServing):
                role=role,
                content="",
                tool_calls=tool_call_class_items,
-                reasoning_content=reasoning_content,
+                reasoning=reasoning,
            )
 
        # if the request doesn't use tool choice
        # OR specifies to not use a tool
        elif not request.tool_choice or request.tool_choice == "none":
-            message = ChatMessage(
-                role=role, reasoning_content=reasoning_content, content=content
-            )
+            message = ChatMessage(role=role, reasoning=reasoning, content=content)
 
        # handle when there are tools and tool choice is auto
        elif (
@@ -1476,7 +1468,7 @@ class OpenAIServingChat(OpenAIServing):
            if tool_calls:
                message = ChatMessage(
                    role=role,
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                    content=content,
                    tool_calls=[
                        ToolCall(
@@ -1498,7 +1490,7 @@ class OpenAIServingChat(OpenAIServing):
                    ret_content = content
                message = ChatMessage(
                    role=role,
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                    content=ret_content,
                )
@@ -1509,9 +1501,7 @@ class OpenAIServingChat(OpenAIServing):
                " if tools should be extracted. Returning a standard chat "
                "completion."
            )
-            message = ChatMessage(
-                role=role, reasoning_content=reasoning_content, content=content
-            )
+            message = ChatMessage(role=role, reasoning=reasoning, content=content)
 
        # In OpenAI's API, when a tool is called, the finish_reason is:
        # "tool_calls" for "auto" or "required" tool calls,
        # and "stop" for named tool calls.
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index b6fef7d2fafd..9b79e50c3208 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -778,11 +778,11 @@ class OpenAIServingResponses(OpenAIServing):
                logger.exception("Error in reasoning parser creation.")
                raise e
 
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(
+            reasoning, content = reasoning_parser.extract_reasoning(
                final_output.text, request=request
            )
        else:
-            reasoning_content = None
+            reasoning = None
            content = final_output.text
 
        # Log complete response if output logging is enabled
@@ -790,8 +790,8 @@ class OpenAIServingResponses(OpenAIServing):
            output_text = ""
            if content:
                output_text = content
-            elif reasoning_content:
-                output_text = f"[reasoning: {reasoning_content}]"
+            elif reasoning:
+                output_text = f"[reasoning: {reasoning}]"
 
            if output_text:
                self.request_logger.log_outputs(
@@ -805,15 +805,13 @@ class OpenAIServingResponses(OpenAIServing):
        reasoning_item = None
        message_item = None
-        if reasoning_content:
+        if reasoning:
            reasoning_item = ResponseReasoningItem(
                id=f"rs_{random_uuid()}",
                summary=[],
                type="reasoning",
                content=[
-                    ResponseReasoningTextContent(
-                        text=reasoning_content, type="reasoning_text"
-                    )
+                    ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
                ],
                status=None,  # NOTE: Only the last output item has status.
            )
@@ -1208,15 +1206,13 @@ class OpenAIServingResponses(OpenAIServing):
            if ctx.last_output.outputs:
                output = ctx.last_output.outputs[0]
                if reasoning_parser:
-                    delta_message = (
-                        reasoning_parser.extract_reasoning_content_streaming(
-                            previous_text=previous_text,
-                            current_text=previous_text + output.text,
-                            delta_text=output.text,
-                            previous_token_ids=previous_token_ids,
-                            current_token_ids=previous_token_ids + output.token_ids,
-                            delta_token_ids=output.token_ids,
-                        )
+                    delta_message = reasoning_parser.extract_reasoning_streaming(
+                        previous_text=previous_text,
+                        current_text=previous_text + output.text,
+                        delta_text=output.text,
+                        previous_token_ids=previous_token_ids,
+                        current_token_ids=previous_token_ids + output.token_ids,
+                        delta_token_ids=output.token_ids,
                    )
                else:
                    delta_message = DeltaMessage(
@@ -1228,7 +1224,7 @@ class OpenAIServingResponses(OpenAIServing):
                    continue
                if not first_delta_sent:
                    current_item_id = str(uuid.uuid4())
-                    if delta_message.reasoning_content:
+                    if delta_message.reasoning:
                        yield _increment_sequence_number_and_return(
                            ResponseOutputItemAddedEvent(
                                type="response.output_item.added",
@@ -1280,15 +1276,15 @@ class OpenAIServingResponses(OpenAIServing):
                    # same as content or reasoning content
                    if (
                        previous_delta_messages
-                        and previous_delta_messages[-1].reasoning_content is not None
+                        and previous_delta_messages[-1].reasoning is not None
                        and delta_message.content is not None
                    ):
                        # from reasoning to normal content, send done
                        # event for reasoning
                        reason_content = "".join(
-                            pm.reasoning_content
+                            pm.reasoning
                            for pm in previous_delta_messages
-                            if pm.reasoning_content is not None
+                            if pm.reasoning is not None
                        )
                        yield _increment_sequence_number_and_return(
                            ResponseReasoningTextDoneEvent(
@@ -1356,7 +1352,7 @@ class OpenAIServingResponses(OpenAIServing):
                    # reset previous delta messages
                    previous_delta_messages = []
 
-                if delta_message.reasoning_content is not None:
+                if delta_message.reasoning is not None:
                    yield _increment_sequence_number_and_return(
                        ResponseReasoningTextDeltaEvent(
                            type="response.reasoning_text.delta",
@@ -1364,7 +1360,7 @@ class OpenAIServingResponses(OpenAIServing):
                            content_index=current_content_index,
                            output_index=current_output_index,
                            item_id=current_item_id,
-                            delta=delta_message.reasoning_content,
+                            delta=delta_message.reasoning,
                        )
                    )
                elif delta_message.content is not None:
@@ -1392,11 +1388,11 @@ class OpenAIServingResponses(OpenAIServing):
                previous_delta_messages.append(delta_message)
 
        if previous_delta_messages:
-            if previous_delta_messages[-1].reasoning_content is not None:
+            if previous_delta_messages[-1].reasoning is not None:
                reason_content = "".join(
-                    pm.reasoning_content
+                    pm.reasoning
                    for pm in previous_delta_messages
-                    if pm.reasoning_content is not None
+                    if pm.reasoning is not None
                )
                yield _increment_sequence_number_and_return(
                    ResponseReasoningTextDoneEvent(
diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
index cf2fa30d0154..432c419db189 100644
--- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
@@ -279,7 +279,7 @@ class StreamingXMLToolCallParser:
            final_delta = DeltaMessage(
                role=None,
                content=None,
-                reasoning_content=None,
+                reasoning=None,
                tool_calls=[
                    DeltaToolCall(
                        index=self.tool_call_index - 1,
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index 63ff450053ea..d26e4ffc9c16 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -76,7 +76,7 @@ class ReasoningParser:
        """
 
    @abstractmethod
-    def extract_reasoning_content(
+    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
@@ -100,7 +100,7 @@ class ReasoningParser:
        """
 
    @abstractmethod
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index 5fb3c8d368a8..026894773272 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -76,7 +76,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
        else:
            return input_ids[input_ids.index(self.end_token_id) + 1 :]
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -103,11 +103,10 @@ class BaseThinkingReasoningParser(ReasoningParser):
                # start token in previous, end token in delta,
                # extract reasoning content
                end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[:end_index]
+                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                return DeltaMessage(
-                    reasoning_content=reasoning_content,
-                    content=content if content else None,
+                    reasoning=reasoning, content=content if content else None
                )
            elif self.end_token_id in previous_token_ids:
                # start token in previous, end token in previous,
@@ -116,30 +115,27 @@ class BaseThinkingReasoningParser(ReasoningParser):
            else:
                # start token in previous, no end token in previous or delta,
                # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
        elif self.start_token_id in delta_token_ids:
            if self.end_token_id in delta_token_ids:
                # start token in delta, end token in delta,
                # extract reasoning content
                start_index = delta_text.find(self.start_token)
                end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[
-                    start_index + len(self.start_token) : end_index
-                ]
+                reasoning = delta_text[start_index + len(self.start_token) : end_index]
                content = delta_text[end_index + len(self.end_token) :]
                return DeltaMessage(
-                    reasoning_content=reasoning_content,
-                    content=content if content else None,
+                    reasoning=reasoning, content=content if content else None
                )
            else:
                # start token in delta, no end token in delta,
                # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
        else:
            # thinking start token not found
            return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
@@ -160,7 +156,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
        if self.end_token not in model_output:
            return model_output, None
        else:
-            reasoning_content, _, content = model_output.partition(self.end_token)
+            reasoning, _, content = model_output.partition(self.end_token)
            # If generation stops right after end-of-think, return null content
            final_content = content or None
-            return reasoning_content, final_content
+            return reasoning, final_content
diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py
index ad4e0fe6c9ce..a91c8ceeb625 100644
--- a/vllm/reasoning/deepseek_r1_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py
@@ -25,7 +25,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
        """The token that ends reasoning content."""
        return "</think>"
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -34,7 +34,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
-        ret = super().extract_reasoning_content_streaming(
+        ret = super().extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
@@ -51,10 +51,10 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
                # end token in delta with more tokens,
                # extract reasoning content and content
                end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[:end_index]
+                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                return DeltaMessage(
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                    content=content if content else None,
                )
            elif self.end_token_id in previous_token_ids:
@@ -62,6 +62,6 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
                return DeltaMessage(content=delta_text)
            else:
                # no end token in previous or delta, reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
 
        return ret
diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
index 81f6e1f32eb3..afdf73262aca 100644
--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -38,12 +38,12 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        return self._parser.extract_content_ids(input_ids)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
-        return self._parser.extract_reasoning_content(model_output, request)
+        return self._parser.extract_reasoning(model_output, request)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -52,7 +52,7 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
-        return self._parser.extract_reasoning_content_streaming(
+        return self._parser.extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
diff --git a/vllm/reasoning/ernie45_reasoning_parser.py b/vllm/reasoning/ernie45_reasoning_parser.py
index 11dc1d10f73e..3cdbf14858ec 100644
--- a/vllm/reasoning/ernie45_reasoning_parser.py
+++ b/vllm/reasoning/ernie45_reasoning_parser.py
@@ -57,7 +57,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
                "tokens in the tokenizer!"
            )
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -73,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
        The Ernie45 thinking model output format is
        abc\n</think>\n\n<response>\ndef\n</response>\n
        or abc\n</think>\ndef
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
        - 'def' goes to content
        """
        # Skip single special tokens
@@ -94,7 +94,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
                # </think> in delta with more tokens,
                # extract reasoning content and content
                think_end_index = delta_text.find(self.end_token)
-                reasoning_content = delta_text[:think_end_index]
+                reasoning = delta_text[:think_end_index]
                content = delta_text[think_end_index + len(self.end_token) :]
                content = content.lstrip("\n")
                response_start_idx = content.find(self.response_start_token)
@@ -104,7 +104,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
                if response_end_idx != -1:
                    content = content[:response_end_idx]
                return DeltaMessage(
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                    content=content if content else None,
                )
            elif self.end_token_id in previous_token_ids:
@@ -138,9 +138,9 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
                return DeltaMessage(content=content if content else None)
        else:
            # no </think> in previous or delta, reasoning content continues
-            return DeltaMessage(reasoning_content=delta_text)
+            return DeltaMessage(reasoning=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """
@@ -148,14 +148,12 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
        The Ernie45 thinking model output format is
        abc\n</think>\n\n<response>\ndef\n</response>\n
        or abc\n</think>\ndef
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
        - 'def' goes to content
 
        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content
        """
-        reasoning_content, content = super().extract_reasoning_content(
-            model_output, request
-        )
+        reasoning, content = super().extract_reasoning(model_output, request)
        if content:
            start_idx = content.find(self.response_start_token)
            end_idx = content.rfind(self.response_end_token)
@@ -164,4 +162,4 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
                content = content[start_idx + len(self.response_start_token) : end_idx]
 
        final_content = content or None
-        return reasoning_content, final_content
+        return reasoning, final_content
diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py
index d43fa7700799..1871adcd4321 100644
--- a/vllm/reasoning/glm4_moe_reasoning_parser.py
+++ b/vllm/reasoning/glm4_moe_reasoning_parser.py
@@ -70,7 +70,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
        else:
            return input_ids[input_ids.index(self.think_end_token_id) + 1 :]
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -84,7 +84,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
        For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
        - 'xyz' goes to content
        """
        # Skip single special tokens
@@ -98,10 +98,10 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
                # <think> in previous, </think> in delta,
                # extract reasoning content
                end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[:end_index]
+                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.think_end_token) :]
                return DeltaMessage(
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                    content=content if content else None,
                )
            elif self.think_end_token_id in previous_token_ids:
@@ -111,36 +111,36 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
            else:
                # <think> in previous, no </think> in previous or delta,
                # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
        elif self.think_start_token_id in delta_token_ids:
            if self.think_end_token_id in delta_token_ids:
                # <think> in delta, </think> in delta, extract reasoning content
                start_index = delta_text.find(self.think_start_token)
                end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[
+                reasoning = delta_text[
                    start_index + len(self.think_start_token) : end_index
                ]
                content = delta_text[end_index + len(self.think_end_token) :]
                return DeltaMessage(
-                    reasoning_content=reasoning_content,
+                    reasoning=reasoning,
                    content=content if content else None,
                )
            else:
                # <think> in delta, no </think> in delta,
                # reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+                return DeltaMessage(reasoning=delta_text)
        else:
            # thinking is disabled, just content
            return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the model output.
 
        For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
        - 'xyz' goes to content
 
        Returns:
@@ -165,7 +165,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
            return None, model_output
 
        # Extract reasoning content from the model output.
-        reasoning_content, _, content = model_output.partition(self.think_end_token)
+        reasoning, _, content = model_output.partition(self.think_end_token)
 
        final_content = content or None
-        return reasoning_content, final_content
+        return reasoning, final_content
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index e720f5228d0f..0c1b54d0bd35 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -104,7 +104,7 @@ class GptOssReasoningParser(ReasoningParser):
            return []
        return self.model_tokenizer.encode(content)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -131,9 +131,9 @@ class GptOssReasoningParser(ReasoningParser):
            content_delta = cur_content
        if reasoning_delta is None and content_delta is None:
            return None
-        return DeltaMessage(reasoning_content=reasoning_delta, content=content_delta)
+        return DeltaMessage(reasoning=reasoning_delta, content=content_delta)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest,
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index eae6c2f5c7b3..484045d66a3c 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -49,7 +49,7 @@ class GraniteReasoningParser(ReasoningParser):
            len(think_start) for think_start in self.valid_think_starts
        )
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Extract the reasoning content & content sections, respectively.
@@ -67,12 +67,12 @@ class GraniteReasoningParser(ReasoningParser):
        re_match = self.reasoning_regex.findall(model_output)
        if not re_match:
            return None, model_output
-        reasoning_content, response_content = re_match[0]
+        reasoning, response_content = re_match[0]
        if not response_content:
-            return reasoning_content, None
-        return reasoning_content, response_content
+            return reasoning, None
+        return reasoning, response_content
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -107,12 +107,10 @@ class GraniteReasoningParser(ReasoningParser):
            Union[DeltaMessage, None]
                DeltaMessage with either reasoning content or content, or None.
        """
-        reasoning_content, resp_seq_len, content = self._get_content_sections(
-            current_text
-        )
+        reasoning, resp_seq_len, content = self._get_content_sections(current_text)
        # Either we haven't finished the start of the reasoning sequence,
        # or the model is generating something unexpected.
-        if not reasoning_content:
+        if not reasoning:
            delta_message = self._get_delta_message_with_no_reasoning_bounds(
                current_text, delta_text
            )
        # We have the start of reasoning content, but have not yet gotten to
        # the start of response sequence.
        elif not content:
            delta_message = self._get_delta_message_with_no_response_bounds(
-                current_text, reasoning_content, delta_text
+                current_text, reasoning, delta_text
            )
        # We've finished both the start of reasoning and start of response seq.
        else:
            # This should never happen since we matched on the response
            assert resp_seq_len is not None
            delta_message = self._get_delta_message_with_both_bounds(
-                delta_text, reasoning_content, content, current_text, resp_seq_len
+                delta_text, reasoning, content, current_text, resp_seq_len
            )
-        if not delta_message.content and not delta_message.reasoning_content:
+        if not delta_message.content and not delta_message.reasoning:
            return None
        return delta_message
@@ -185,20 +183,20 @@ class GraniteReasoningParser(ReasoningParser):
        # message and append everything to content in the future.
        if was_substr and not is_substr:
            return DeltaMessage(
-                reasoning_content=None,
+                reasoning=None,
                content=current_text,
            )
        if is_substr:
            # Might still be in the special token sequence; return nothing
-            return DeltaMessage(reasoning_content=None, content=None)
+            return DeltaMessage(reasoning=None, content=None)
        # Otherwise the sequence has already been broken and we already
        # corrected; just return the delta text as normal content.
-        return DeltaMessage(reasoning_content=None, content=delta_text)
+        return DeltaMessage(reasoning=None, content=delta_text)
 
    def _get_delta_message_with_no_response_bounds(
        self,
        current_text: str,
-        reasoning_content: str,
+        reasoning: str,
        delta_text: str,
    ) -> DeltaMessage:
        """Parse the delta message when the current text has both reasoning
@@ -208,7 +206,7 @@ class GraniteReasoningParser(ReasoningParser):
        Args:
            current_text (str): The full previous + delta text.
-            reasoning_content (str): reasoning content from current_text.
+            reasoning (str): reasoning content from current_text.
            delta_text (str): Text to consider and parse content from.
 
        Returns:
@@ -222,12 +220,12 @@ class GraniteReasoningParser(ReasoningParser):
            current_text.endswith(response_start)
            for response_start in self.valid_response_starts
        )
-        if reasoning_content is None or ends_with_start_response_seq:
-            return DeltaMessage(reasoning_content=None, content=None)
+        if reasoning is None or ends_with_start_response_seq:
+            return DeltaMessage(reasoning=None, content=None)
 
        # Consider previous / current text only within context of the reasoning
-        previous_text = reasoning_content[: -len(delta_text)]
-        current_text = reasoning_content
+        previous_text = reasoning[: -len(delta_text)]
+        current_text = reasoning
 
        # We need to be careful about adding unfinished response sequences;
        # Find the place at which we MIGHT be starting a response sequence
@@ -253,32 +251,30 @@ class GraniteReasoningParser(ReasoningParser):
        # Delta only contains potential continued response sequence text.
        if delta_continues_substr:
-            return DeltaMessage(reasoning_content=None, content=None)
+            return DeltaMessage(reasoning=None, content=None)
 
        if not prev_was_substr:
            # Delta may be starting a new response seq but has other text too.
            if delta_new_substr:
-                return DeltaMessage(
-                    reasoning_content=delta_text[:delta_idx], content=None
-                )
+                return DeltaMessage(reasoning=delta_text[:delta_idx], content=None)
            # Normal case for most reasoning text (no potential special seqs).
-            return DeltaMessage(reasoning_content=delta_text, content=None)
+            return DeltaMessage(reasoning=delta_text, content=None)
        # The substring that previously seemed to be a potential response
        # seq wasn't one; we need to add the content to the delta message,
        # and also slice off the potential response sequence
        elif delta_new_substr:
-            reasoning_content = previous_text[prev_idx:] + delta_text[:delta_idx]
-            return DeltaMessage(reasoning_content=reasoning_content, content=None)
+            reasoning = previous_text[prev_idx:] + delta_text[:delta_idx]
+            return DeltaMessage(reasoning=reasoning, content=None)
        # No new substring yet, and we broke our old one; take the whole delta
        return DeltaMessage(
-            reasoning_content=previous_text[prev_idx:] + delta_text,
+            reasoning=previous_text[prev_idx:] + delta_text,
            content=None,
        )
 
    def _get_delta_message_with_both_bounds(
        self,
        delta_text: str,
-        reasoning_content: str,
+        reasoning: str,
        response_content: str,
        current_text: str,
        response_seq_len: int,
@@ -288,7 +284,7 @@ class GraniteReasoningParser(ReasoningParser):
        Args:
            delta_text: Text to consider and parse content from.
-            reasoning_content: reasoning content from current_text.
+            reasoning: reasoning content from current_text.
            response_content: response content from current_text.
            current_text: The full previous + delta text.
            response_seq_len: Len of the complete response sequence used.
@@ -301,20 +297,20 @@ class GraniteReasoningParser(ReasoningParser):
        reasoning_end_idx = len(delta_text) - (len(response_content) + response_seq_len)
 
        if reasoning_end_idx < 0:
-            delta_reasoning_content = None
+            delta_reasoning = None
        else:
            # Get the starting offset
-            start_reasoning_content_idx = (
-                len(reasoning_content) + response_seq_len + len(response_content) - 1
+            start_reasoning_idx = (
+                len(reasoning) + response_seq_len + len(response_content) - 1
            )
            delta_offset = len(current_text) - len(delta_text)
-            start_offset = start_reasoning_content_idx - delta_offset
+            start_offset = start_reasoning_idx - delta_offset
            if start_offset < 0:
                start_offset = 0
-            delta_reasoning_content = delta_text[start_offset:reasoning_end_idx]
+            delta_reasoning = delta_text[start_offset:reasoning_end_idx]
 
        return DeltaMessage(
-            reasoning_content=delta_reasoning_content,
+            reasoning=delta_reasoning,
            content=delta_content,
        )
@@ -333,7 +329,7 @@ class GraniteReasoningParser(ReasoningParser):
            (if there is one) and the non-reasoning content.
        """
        current_chunk_start = 0
-        start_reasoning_content = None
+        start_reasoning = None
        parsed_content = False
        delimiter_idxs = [
            idx
@@ -344,10 +340,10 @@ class GraniteReasoningParser(ReasoningParser):
        for current_chunk_end in delimiter_idxs:
            current_chunk = current_text[current_chunk_start:current_chunk_end]
            # Check to see if the start of reasoning seq is complete
-            if start_reasoning_content is None:
+            if start_reasoning is None:
                for think_start in self.valid_think_starts:
                    if current_chunk == think_start[:-1]:
-                        start_reasoning_content = current_chunk_end + 1
+                        start_reasoning = current_chunk_end + 1
                        current_chunk_start = current_chunk_end + 1
                        break
@@ -357,13 +353,11 @@ class GraniteReasoningParser(ReasoningParser):
                    if current_chunk[-len(response_start) + 1 :] == response_start[:-1]:
                        # Mark end of reasoning and start response content
                        # after the start of response sequence.
-                        end_reasoning_content = current_chunk_end - len(response_start)
-                        reasoning_content = current_text[
-                            start_reasoning_content:end_reasoning_content
-                        ]
+                        end_reasoning = current_chunk_end - len(response_start)
+                        reasoning = current_text[start_reasoning:end_reasoning]
                        response_content = current_text[current_chunk_end + 1 :]
-                        return reasoning_content, len(response_start), response_content
+                        return reasoning, len(response_start), response_content
 
-        if start_reasoning_content and not parsed_content:
-            return current_text[start_reasoning_content:], None, None
+        if start_reasoning and not parsed_content:
+            return current_text[start_reasoning:], None, None
        return None, None, None
diff --git a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
index 1a82068c2694..f297454f57ec 100644
--- a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
+++ b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
@@ -86,7 +86,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
        # this id is not part of content, so just return [] here.
        return []
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Extract the reasoning content & content sections, respectively.
@@ -104,27 +104,27 @@ class HunyuanA13BReasoningParser(ReasoningParser):
        re_match = self.full_match_reasoning_regex.findall(model_output)
        if re_match:
-            reasoning_content, response_content = re_match[0]
-            if len(reasoning_content) == 0:
-                reasoning_content = None
+            reasoning, response_content = re_match[0]
+            if len(reasoning) == 0:
+                reasoning = None
            if len(response_content) == 0:
                response_content = None
-            return reasoning_content, response_content
+            return reasoning, response_content
 
        fallback_regex = self.half_match_reasoning_regex
        fallback_match = fallback_regex.findall(model_output)
        if fallback_match:
-            reasoning_content, response_content = fallback_match[0]
+            reasoning, response_content = fallback_match[0]
 
            if response_content.endswith(self.response_end_expr):
                response_content = response_content[: -len(self.response_end_expr)]
 
-            if len(reasoning_content) == 0:
-                reasoning_content = None
+            if len(reasoning) == 0:
+                reasoning = None
            if len(response_content) == 0:
                response_content = None
 
-            return reasoning_content, response_content
+            return reasoning, response_content
 
        return None, model_output
@@ -140,7 +140,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
            sub_idx += 1
        return sub_idx == len(subsequence)
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -223,19 +223,15 @@ class HunyuanA13BReasoningParser(ReasoningParser):
                # Return content based on current state
                if self.current_state == "think":
-                    return DeltaMessage(
-                        reasoning_content=buffered_content, content=None
-                    )
+                    return DeltaMessage(reasoning=buffered_content, content=None)
                else:
-                    return DeltaMessage(
-                        reasoning_content=None, content=buffered_content
-                    )
+                    return DeltaMessage(reasoning=None, content=buffered_content)
            else:
                # No buffered content, send normally
                if self.current_state == "think":
-                    return DeltaMessage(reasoning_content=delta_text, content=None)
+                    return DeltaMessage(reasoning=delta_text, content=None)
                else:
-                    return DeltaMessage(reasoning_content=None, content=delta_text)
+                    return DeltaMessage(reasoning=None, content=delta_text)
 
        # If no content to send in this delta
        return None
diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py
index f1d17a71be33..e92f8add0391 100644
--- a/vllm/reasoning/identity_reasoning_parser.py
+++ b/vllm/reasoning/identity_reasoning_parser.py
@@ -36,7 +36,7 @@ class IdentityReasoningParser(ReasoningParser):
        # Identity: return all tokens as content
        return input_ids
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -50,9 +50,9 @@ class IdentityReasoningParser(ReasoningParser):
            return DeltaMessage(content=delta_text)
        return None
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
-        # No reasoning separation: return None for reasoning_content,
+        # No reasoning separation: return None for reasoning,
        # and full model_output as content
        return None, model_output
diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py
index 440b2b6e2fc2..30f5f2f88caf 100644
--- a/vllm/reasoning/minimax_m2_reasoning_parser.py
+++ b/vllm/reasoning/minimax_m2_reasoning_parser.py
@@ -48,7 +48,7 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        return input_ids
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -61,7 +61,7 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
            delta_text = "<think>" + delta_text
        return DeltaMessage(content=delta_text)
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        return None, "<think>" + model_output
diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py
index 91512a2e34c7..7149f8c4123b 100644
--- a/vllm/reasoning/olmo3_reasoning_parser.py
+++ b/vllm/reasoning/olmo3_reasoning_parser.py
@@ -115,7 +115,7 @@ class Olmo3ReasoningBuffer:
        if end_think_idx > 0:
            # this covers the case there's content before
            # the end of the reasoning block
-            return DeltaMessage(reasoning_content=pretext)
+            return DeltaMessage(reasoning=pretext)
 
        if self.state == Olmo3ReasoningState.REASONING:
            # we are inside reasoning block, return and empty
@@ -124,7 +124,7 @@ class Olmo3ReasoningBuffer:
            # the buffer
            (
                text_buffer,
                self.buffer,
            ) = self.buffer, ""
-            return DeltaMessage(reasoning_content=text_buffer)
+            return DeltaMessage(reasoning=text_buffer)
 
        if self.state == Olmo3ReasoningState.CONTENT:
            # we are outside reasoning block, return and empty
@@ -250,7 +250,7 @@ class Olmo3ReasoningParser(ReasoningParser):
        # this id is not part of content, so just return [] here.
        return []
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
@@ -271,14 +271,14 @@ class Olmo3ReasoningParser(ReasoningParser):
        re_match = self.reasoning_regex.match(model_output)
        if re_match:
-            reasoning_content = re_match.group("reasoning") or None
+            reasoning = re_match.group("reasoning") or None
            content = re_match.group("content") or None
-            return reasoning_content, content
+            return reasoning, content
 
        # no reasoning content
        return None, model_output
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index 254f0e2e066b..ef7762bf0af5 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -27,7 +27,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
        """The token that ends reasoning content."""
        return "</think>"
 
-    def extract_reasoning_content(
+    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
@@ -37,7 +37,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
        to be present, unlike other models that work with just the end token.
 
        For text <think>abc</think>xyz:
-        - 'abc' goes to reasoning_content
+        - 'abc' goes to reasoning
        - 'xyz' goes to content
 
        Returns:
@@ -61,7 +61,7 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
            return None, model_output
 
        # Extract reasoning content from the model output.
-        reasoning_content, _, content = model_output.partition(self.end_token)
+        reasoning, _, content = model_output.partition(self.end_token)
 
        final_content = content or None
-        return reasoning_content, final_content
+        return reasoning, final_content
diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py
index 202da057b028..f635758a92c0 100644
--- a/vllm/reasoning/step3_reasoning_parser.py
+++ b/vllm/reasoning/step3_reasoning_parser.py
@@ -40,7 +40,7 @@ class Step3ReasoningParser(ReasoningParser):
                "token in the tokenizer!"
            )
 
-    def extract_reasoning_content_streaming(
+    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
@@ -54,7 +54,7 @@ class Step3ReasoningParser(ReasoningParser):
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
For text "abcxyz": - - 'abc' goes to reasoning_content + - 'abc' goes to reasoning - 'xyz' goes to content """ # Skip single special token @@ -64,10 +64,10 @@ class Step3ReasoningParser(ReasoningParser): if self.think_end_token_id in delta_token_ids: # in delta, extract reasoning content and remaining content end_index = delta_text.find(self.think_end_token) - reasoning_content = delta_text[:end_index] + reasoning = delta_text[:end_index] content = delta_text[end_index + len(self.think_end_token) :] return DeltaMessage( - reasoning_content=reasoning_content, + reasoning=reasoning, content=content if content else None, ) elif self.think_end_token_id in previous_token_ids: @@ -75,9 +75,9 @@ class Step3ReasoningParser(ReasoningParser): return DeltaMessage(content=delta_text) else: # No seen yet, everything is reasoning - return DeltaMessage(reasoning_content=delta_text) + return DeltaMessage(reasoning=delta_text) - def extract_reasoning_content( + def extract_reasoning( self, model_output: str, request: ChatCompletionRequest ) -> tuple[str | None, str | None]: # Check if the model output contains the token @@ -87,7 +87,7 @@ class Step3ReasoningParser(ReasoningParser): else: # Find the first occurrence of end_index = model_output.find(self.think_end_token) - reasoning_content = model_output[:end_index] + reasoning = model_output[:end_index] # Content after token content = model_output[end_index + len(self.think_end_token) :] @@ -95,7 +95,7 @@ class Step3ReasoningParser(ReasoningParser): if len(content) == 0: content = None - return reasoning_content, content + return reasoning, content def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.think_end_token_id in input_ids diff --git a/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja index 661ebd1cf5c1..c73ae96f0c1d 100644 --- a/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +++ b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja @@ -30,18 +30,18 @@ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content %} - {%- set reasoning_content = '' %} - {%- if message.reasoning_content is defined and message.reasoning_content is not none %} - {%- set reasoning_content = message.reasoning_content %} + {%- set reasoning = '' %} + {%- if message.reasoning is defined and message.reasoning is not none %} + {%- set reasoning = message.reasoning %} {%- else %} {%- if '' in message.content %} {%- set content = message.content.split('')[-1].lstrip('\n') %} - {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set reasoning = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} {%- endif %} {%- endif %} {%- if loop.index0 > ns.last_query_index %} - {%- if loop.last or (not loop.last and reasoning_content) %} - {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- if loop.last or (not loop.last and reasoning) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning.strip('\n') + '\n\n\n' + content.lstrip('\n') }} {%- else %} {{- '<|im_start|>' + message.role + '\n' + content }} {%- endif %} diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 34433484fc14..39198a1f3d81 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ 
b/vllm/transformers_utils/tokenizers/mistral.py @@ -121,8 +121,8 @@ def _prepare_apply_chat_template_tools_and_messages( # # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80 for message in messages: - # Remove reasoning_content as unsupported by Mistral - _ = message.pop("reasoning_content", None) # type: ignore + # Remove reasoning as unsupported by Mistral + _ = message.pop("reasoning", None) # type: ignore # The Mistral client, in comparison to the OpenAI client, requires the # "parameters" dict and the "description" string to be present