From 8f8fda261a620234fdeea338f44093d5d8072879 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Thu, 11 Dec 2025 23:59:53 -0500 Subject: [PATCH] [Bugfix] Multiple fixes for gpt-oss Chat Completion prompting (#28729) Signed-off-by: Ben Browning Co-authored-by: Chauncey --- .../openai/parser/test_harmony_utils.py | 807 +++++++++++++++--- tests/entrypoints/openai/test_serving_chat.py | 646 +++++++++++++- tests/entrypoints/openai/utils.py | 190 +++++ .../openai/parser/harmony_utils.py | 225 ++++- vllm/entrypoints/openai/serving_chat.py | 13 +- .../openai/tool_parsers/openai_tool_parser.py | 7 +- 6 files changed, 1749 insertions(+), 139 deletions(-) create mode 100644 tests/entrypoints/openai/utils.py diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py index a3fd80938de6a..1d34fc51ad563 100644 --- a/tests/entrypoints/openai/parser/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -1,21 +1,37 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem from openai.types.responses.response_output_item import McpCall from openai_harmony import Author, Message, Role, TextContent +from tests.entrypoints.openai.utils import verify_harmony_messages from vllm.entrypoints.openai.parser.harmony_utils import ( + auto_drop_analysis_messages, + get_encoding, has_custom_tools, + parse_chat_input_to_harmony_message, + parse_chat_output, parse_input_to_harmony_message, parse_output_message, ) -class TestParseInputToHarmonyMessage: - """Tests for parse_input_to_harmony_message function.""" +class TestCommonParseInputToHarmonyMessage: + """ + Tests for scenarios that are common to both Chat Completion + parse_chat_input_to_harmony_message and Responsees API + parse_input_to_harmony_message functions. 
+ """ - def test_assistant_message_with_tool_calls(self): + @pytest.fixture( + params=[parse_chat_input_to_harmony_message, parse_input_to_harmony_message] + ) + def parse_function(self, request): + return request.param + + def test_assistant_message_with_tool_calls(self, parse_function): """Test parsing assistant message with tool calls.""" chat_msg = { "role": "assistant", @@ -35,7 +51,7 @@ class TestParseInputToHarmonyMessage: ], } - messages = parse_input_to_harmony_message(chat_msg) + messages = parse_function(chat_msg) assert len(messages) == 2 @@ -53,7 +69,7 @@ class TestParseInputToHarmonyMessage: assert messages[1].recipient == "functions.search_web" assert messages[1].content_type == "json" - def test_assistant_message_with_empty_tool_call_arguments(self): + def test_assistant_message_with_empty_tool_call_arguments(self, parse_function): """Test parsing assistant message with tool call having None arguments.""" chat_msg = { "role": "assistant", @@ -67,12 +83,152 @@ class TestParseInputToHarmonyMessage: ], } - messages = parse_input_to_harmony_message(chat_msg) + messages = parse_function(chat_msg) assert len(messages) == 1 assert messages[0].content[0].text == "" assert messages[0].recipient == "functions.get_current_time" + def test_system_message(self, parse_function): + """Test parsing system message.""" + chat_msg = { + "role": "system", + "content": "You are a helpful assistant", + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + # System messages are converted using Message.from_dict + # which should preserve the role + assert messages[0].author.role == Role.SYSTEM + + def test_developer_message(self, parse_function): + """Test parsing developer message.""" + chat_msg = { + "role": "developer", + "content": "Use concise language", + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.DEVELOPER + + def test_user_message_with_string_content(self, parse_function): + """Test parsing user message with string content.""" + chat_msg = { + "role": "user", + "content": "What's the weather in San Francisco?", + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.USER + assert messages[0].content[0].text == "What's the weather in San Francisco?" + + def test_user_message_with_array_content(self, parse_function): + """Test parsing user message with array content.""" + chat_msg = { + "role": "user", + "content": [ + {"text": "What's in this image? "}, + {"text": "Please describe it."}, + ], + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.USER + assert len(messages[0].content) == 2 + assert messages[0].content[0].text == "What's in this image? " + assert messages[0].content[1].text == "Please describe it." + + def test_assistant_message_with_string_content(self, parse_function): + """Test parsing assistant message with string content (no tool calls).""" + chat_msg = { + "role": "assistant", + "content": "Hello! How can I help you today?", + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.ASSISTANT + assert messages[0].content[0].text == "Hello! How can I help you today?" 
+ + def test_pydantic_model_input(self, parse_function): + """Test parsing Pydantic model input (has model_dump method).""" + + class MockPydanticModel: + def model_dump(self, exclude_none=True): + return { + "role": "user", + "content": "Test message", + } + + chat_msg = MockPydanticModel() + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.USER + assert messages[0].content[0].text == "Test message" + + def test_tool_call_with_missing_function_fields(self, parse_function): + """Test parsing tool call with missing name or arguments.""" + chat_msg = { + "role": "assistant", + "tool_calls": [ + { + "function": {} # Missing both name and arguments + } + ], + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].recipient == "functions." + assert messages[0].content[0].text == "" + + def test_array_content_with_missing_text(self, parse_function): + """Test parsing array content where text field is missing.""" + chat_msg = { + "role": "user", + "content": [ + {}, # Missing text field + {"text": "actual text"}, + ], + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert len(messages[0].content) == 2 + assert messages[0].content[0].text == "" + assert messages[0].content[1].text == "actual text" + + +class TestParseInputToHarmonyMessage: + """ + Tests for scenarios that are specific to the Responses API + parse_input_to_harmony_message function. + """ + + def test_message_with_empty_content(self): + """Test parsing message with empty string content.""" + chat_msg = { + "role": "user", + "content": "", + } + + messages = parse_input_to_harmony_message(chat_msg) + + assert len(messages) == 1 + assert messages[0].content[0].text == "" + def test_tool_message_with_string_content(self): """Test parsing tool message with string content.""" chat_msg = { @@ -111,6 +267,7 @@ class TestParseInputToHarmonyMessage: assert len(messages) == 1 assert messages[0].author.role == Role.TOOL + assert messages[0].author.name == "functions.search_results" assert messages[0].content[0].text == "Result 1: Result 2: Result 3" def test_tool_message_with_empty_content(self): @@ -124,140 +281,564 @@ class TestParseInputToHarmonyMessage: messages = parse_input_to_harmony_message(chat_msg) assert len(messages) == 1 + assert messages[0].author.role == Role.TOOL + assert messages[0].author.name == "functions.empty_tool" assert messages[0].content[0].text == "" - def test_system_message(self): - """Test parsing system message.""" - chat_msg = { - "role": "system", - "content": "You are a helpful assistant", - } - messages = parse_input_to_harmony_message(chat_msg) +class TestParseChatInputToHarmonyMessage: + """ + Tests for scenarios that are specific to the Chat Completion API + parse_chat_input_to_harmony_message function. 
+ """ - assert len(messages) == 1 - # System messages are converted using Message.from_dict - # which should preserve the role - assert messages[0].author.role == Role.SYSTEM - - def test_developer_message(self): - """Test parsing developer message.""" - chat_msg = { - "role": "developer", - "content": "Use concise language", - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.DEVELOPER - - def test_user_message_with_string_content(self): - """Test parsing user message with string content.""" - chat_msg = { - "role": "user", - "content": "What's the weather in San Francisco?", - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.USER - assert messages[0].content[0].text == "What's the weather in San Francisco?" - - def test_user_message_with_array_content(self): - """Test parsing user message with array content.""" - chat_msg = { - "role": "user", - "content": [ - {"text": "What's in this image? "}, - {"text": "Please describe it."}, - ], - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.USER - assert len(messages[0].content) == 2 - assert messages[0].content[0].text == "What's in this image? " - assert messages[0].content[1].text == "Please describe it." - - def test_assistant_message_with_string_content(self): - """Test parsing assistant message with string content (no tool calls).""" - chat_msg = { - "role": "assistant", - "content": "Hello! How can I help you today?", - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.ASSISTANT - assert messages[0].content[0].text == "Hello! How can I help you today?" 
- - def test_pydantic_model_input(self): - """Test parsing Pydantic model input (has model_dump method).""" - - class MockPydanticModel: - def model_dump(self, exclude_none=True): - return { - "role": "user", - "content": "Test message", - } - - chat_msg = MockPydanticModel() - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.USER - assert messages[0].content[0].text == "Test message" - - def test_message_with_empty_content(self): - """Test parsing message with empty string content.""" + def test_user_message_with_empty_content(self): chat_msg = { "role": "user", "content": "", } - messages = parse_input_to_harmony_message(chat_msg) + messages = parse_chat_input_to_harmony_message(chat_msg) - assert len(messages) == 1 - assert messages[0].content[0].text == "" + verify_harmony_messages( + messages, + [ + { + "role": "user", + "content": "", + }, + ], + ) - def test_tool_call_with_missing_function_fields(self): - """Test parsing tool call with missing name or arguments.""" + def test_user_message_with_none_content(self): + chat_msg = { + "role": "user", + "content": None, + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "user", + "content": "", + }, + ], + ) + + def test_assistant_message_with_empty_content(self): + chat_msg = { + "role": "assistant", + "content": "", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + assert len(messages) == 0 + + def test_assistant_message_with_none_content(self): + chat_msg = { + "role": "assistant", + "content": None, + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + assert len(messages) == 0 + + def test_assistant_message_with_content_but_empty_reasoning(self): + chat_msg = { + "role": "assistant", + "content": "The answer is 4.", + "reasoning": "", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "final", + "content": "The answer is 4.", + }, + ], + ) + + def test_assistant_message_with_reasoning_but_empty_content(self): + chat_msg = { + "role": "assistant", + "reasoning": "I'm thinking about the user's question.", + "content": "", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "analysis", + "content": "I'm thinking about the user's question.", + }, + ], + ) + + def test_assistant_message_with_reasoning_but_none_content(self): + chat_msg = { + "role": "assistant", + "reasoning": "I'm thinking about the user's question.", + "content": None, + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "analysis", + "content": "I'm thinking about the user's question.", + }, + ], + ) + + def test_assistant_message_with_tool_calls_but_no_content(self): chat_msg = { "role": "assistant", "tool_calls": [ { - "function": {} # Missing both name and arguments + "function": { + "name": "get_weather", + "arguments": '{"location": "San Francisco"}', + } } ], } - messages = parse_input_to_harmony_message(chat_msg) + messages = parse_chat_input_to_harmony_message(chat_msg) - assert len(messages) == 1 - assert messages[0].recipient == "functions." 
- assert messages[0].content[0].text == "" + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": '{"location": "San Francisco"}', + "content_type": "json", + }, + ], + ) - def test_array_content_with_missing_text(self): - """Test parsing array content where text field is missing.""" + def test_assistant_message_with_tool_calls_and_content(self): chat_msg = { - "role": "user", + "role": "assistant", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": '{"location": "San Francisco"}', + } + } + ], + "content": "I'll call the tool.", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "commentary", + "content": "I'll call the tool.", + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": '{"location": "San Francisco"}', + "content_type": "json", + }, + ], + ) + + def test_assistant_message_with_tool_calls_and_reasoning(self): + chat_msg = { + "role": "assistant", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": '{"location": "San Francisco"}', + } + } + ], + "reasoning": "I should use the get_weather tool.", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "analysis", + "content": "I should use the get_weather tool.", + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": '{"location": "San Francisco"}', + "content_type": "json", + }, + ], + ) + + def test_assistant_message_with_tool_calls_and_reasoning_and_content(self): + chat_msg = { + "role": "assistant", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": '{"location": "San Francisco"}', + } + } + ], + "reasoning": "I should use the get_weather tool.", + "content": "I'll call the tool.", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "commentary", + "content": "I'll call the tool.", + }, + { + "role": "assistant", + "channel": "analysis", + "content": "I should use the get_weather tool.", + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": '{"location": "San Francisco"}', + "content_type": "json", + }, + ], + ) + + def test_tool_message_with_string_content(self): + tool_id_names = { + "call_123": "get_weather", + } + chat_msg = { + "role": "tool", + "tool_call_id": "call_123", + "content": "The weather in San Francisco is sunny, 72°F", + } + + messages = parse_chat_input_to_harmony_message( + chat_msg, tool_id_names=tool_id_names + ) + + verify_harmony_messages( + messages, + [ + { + "role": "tool", + "name": "functions.get_weather", + "content": "The weather in San Francisco is sunny, 72°F", + "channel": "commentary", + }, + ], + ) + + def test_tool_message_with_array_content(self): + tool_id_names = { + "call_123": "search_results", + } + chat_msg = { + "role": "tool", + "tool_call_id": "call_123", "content": [ - {}, # Missing text field - {"text": "actual text"}, + {"type": "text", "text": "Result 1: "}, + {"type": "text", "text": "Result 2: "}, + { + "type": "image", + "url": "http://example.com/img.png", + }, # Should be ignored + {"type": "text", "text": "Result 3"}, ], } - messages = 
parse_input_to_harmony_message(chat_msg) + messages = parse_chat_input_to_harmony_message( + chat_msg, tool_id_names=tool_id_names + ) - assert len(messages) == 1 - assert len(messages[0].content) == 2 - assert messages[0].content[0].text == "" - assert messages[0].content[1].text == "actual text" + verify_harmony_messages( + messages, + [ + { + "role": "tool", + "name": "functions.search_results", + "content": "Result 1: Result 2: Result 3", + "channel": "commentary", + }, + ], + ) + + def test_tool_message_with_empty_content(self): + tool_id_names = { + "call_123": "empty_tool", + } + chat_msg = { + "role": "tool", + "tool_call_id": "call_123", + "content": "", + } + + messages = parse_chat_input_to_harmony_message( + chat_msg, tool_id_names=tool_id_names + ) + + verify_harmony_messages( + messages, + [ + { + "role": "tool", + "name": "functions.empty_tool", + "content": "", + "channel": "commentary", + }, + ], + ) + + def test_tool_message_with_none_content(self): + tool_id_names = { + "call_123": "empty_tool", + } + chat_msg = { + "role": "tool", + "tool_call_id": "call_123", + "content": None, + } + + messages = parse_chat_input_to_harmony_message( + chat_msg, tool_id_names=tool_id_names + ) + + verify_harmony_messages( + messages, + [ + { + "role": "tool", + "name": "functions.empty_tool", + "content": "", + "channel": "commentary", + }, + ], + ) + + +class TestAutoDropAnalysisMessages: + def test_no_analysis_messages(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + assert cleaned_messages == messages + + def test_only_analysis_message(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking about the user's question." + ).with_channel("analysis"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + assert cleaned_messages == messages + + def test_multiple_analysis_messages_without_final_message(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking about the user's question." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking more." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking even more." + ).with_channel("analysis"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + assert cleaned_messages == messages + + def test_only_final_message(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + assert cleaned_messages == messages + + def test_drops_one_analysis_messages_before_final_message(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking about the user's question." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + Message.from_role_and_content( + Role.ASSISTANT, "I should think harder." + ).with_channel("analysis"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + # Should have dropped the first analysis message + assert cleaned_messages == messages[1:] + + def test_drops_all_analysis_messages_before_final_message(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking about the user's question." 
+ ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking more." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking even more." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + Message.from_role_and_content( + Role.ASSISTANT, "I should think harder." + ).with_channel("analysis"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + # Should have dropped the first 3 analysis messages + assert cleaned_messages == messages[3:] + + def test_multiple_analysis_messages_with_multiple_final_messages(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking about the user's question." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking more." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking even more." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + Message.from_role_and_content( + Role.ASSISTANT, "I should think harder." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 5." + ).with_channel("final"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + # Should have dropped all those analysis messages + assert len(cleaned_messages) == 2 + assert cleaned_messages[0].content[0].text == "The answer is 4." + assert cleaned_messages[1].content[0].text == "The answer is 5." + + def test_drops_non_assistant_analysis_messages(self) -> None: + messages = [ + Message.from_role_and_content( + Role.TOOL, "The tool thinks we should think harder." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + # Should have dropped the analysis message + assert cleaned_messages == messages[1:] + + +class TestParseChatOutput: + def test_parse_chat_output_interrupted_first_message(self) -> None: + harmony_str = "<|channel|>final<|message|>I'm in the middle of answering" + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning is None + assert final_content == "I'm in the middle of answering" + + def test_parse_chat_output_interrupted_reasoning_first_message(self) -> None: + harmony_str = "<|channel|>analysis<|message|>I'm in the middle of thinking" + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning == "I'm in the middle of thinking" + assert final_content is None + + def test_parse_chat_output_complete_reasoning_interrupted_content(self) -> None: + harmony_str = ( + "<|channel|>analysis<|message|>I'm thinking.<|end|>" + "<|start|>assistant<|channel|>final" + "<|message|>I'm in the middle of answering" + ) + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning == "I'm thinking." 
+ assert final_content == "I'm in the middle of answering" + + def test_parse_chat_output_complete_content(self) -> None: + harmony_str = "<|channel|>final<|message|>The answer is 4.<|end|>" + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning is None + assert final_content == "The answer is 4." + + def test_parse_chat_output_complete_commentary(self) -> None: + harmony_str = ( + "<|channel|>commentary<|message|>I need to call some tools.<|end|>" + ) + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning is None + assert final_content == "I need to call some tools." + + def test_parse_chat_output_complete_reasoning(self) -> None: + harmony_str = ( + "<|channel|>analysis<|message|>I've thought hard about this.<|end|>" + ) + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning == "I've thought hard about this." + assert final_content is None + + def test_parse_chat_output_complete_reasoning_and_content(self) -> None: + harmony_str = ( + "<|channel|>analysis<|message|>I've thought hard about this.<|end|>" + "<|start|>assistant<|channel|>final<|message|>The answer is 4.<|end|>" + ) + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning == "I've thought hard about this." + assert final_content == "The answer is 4." class TestParseOutputMessage: diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 9ea65f9fa6e7a..5a9293f1b9ae5 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -11,13 +11,25 @@ import pytest_asyncio from openai import OpenAI from vllm.config.multimodal import MultiModalConfig -from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.parser.harmony_utils import get_encoding +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + RequestResponseMetadata, +) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels +from vllm.entrypoints.openai.tool_parsers import ToolParserManager +from vllm.outputs import CompletionOutput, RequestOutput from vllm.tokenizers import get_tokenizer from vllm.v1.engine.async_llm import AsyncLLM from ...utils import RemoteOpenAIServer +from .utils import ( + accumulate_streaming_response, + verify_chat_response, + verify_harmony_messages, +) GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b" @@ -728,3 +740,635 @@ async def test_serving_chat_data_parallel_rank_extraction(): # Verify that data_parallel_rank defaults to None assert "data_parallel_rank" in mock_engine.generate.call_args.kwargs assert mock_engine.generate.call_args.kwargs["data_parallel_rank"] is None + + +class TestServingChatWithHarmony: + """ + These tests ensure Chat Completion requests are being properly converted into + Harmony messages and Harmony response messages back into Chat Completion responses. + These tests are not exhaustive, but each one was created to cover a specific case + that we got wrong but is now fixed. 
+ + Any changes to the tests and their expectations may result in changes to the + accuracy of model prompting and responses generated. It is suggested to run + an evaluation or benchmarking suite (such as bfcl multi_turn) to understand + any impact of changes in how we prompt Harmony models. + """ + + @pytest.fixture(params=[False, True], ids=["non_streaming", "streaming"]) + def stream(self, request) -> bool: + """Parameterize tests to run in both non-streaming and streaming modes.""" + return request.param + + @pytest.fixture() + def mock_engine(self) -> AsyncLLM: + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + return mock_engine + + @pytest.fixture() + def serving_chat(self, mock_engine) -> OpenAIServingChat: + chat = _build_serving_chat(mock_engine) + chat.use_harmony = True + chat.tool_parser = ToolParserManager.get_tool_parser("openai") + return chat + + def mock_request_output_from_req_and_token_ids( + self, req: ChatCompletionRequest, token_ids: list[int], finished: bool = False + ) -> RequestOutput: + # Our tests don't use most fields, so just get the token ids correct + completion_output = CompletionOutput( + index=0, + text="", + token_ids=token_ids, + cumulative_logprob=0.0, + logprobs=None, + ) + return RequestOutput( + request_id=req.request_id, + prompt=[], + prompt_token_ids=[], + prompt_logprobs=None, + outputs=[completion_output], + finished=finished, + ) + + @pytest.fixture + def weather_tools(self) -> list[dict[str, Any]]: + return [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"}, + }, + "required": ["location"], + }, + }, + }, + ] + + @pytest.fixture + def weather_messages_start(self) -> list[dict[str, Any]]: + return [ + { + "role": "user", + "content": "What's the weather like in Paris today?", + }, + ] + + async def generate_response_from_harmony_str( + self, + serving_chat: OpenAIServingChat, + req: ChatCompletionRequest, + harmony_str: str, + stream: bool = False, + ) -> ChatCompletionResponse: + harmony_token_ids = get_encoding().encode(harmony_str, allowed_special="all") + + async def result_generator(): + if stream: + for token_id in harmony_token_ids: + yield self.mock_request_output_from_req_and_token_ids( + req, [token_id] + ) + yield self.mock_request_output_from_req_and_token_ids( + req, [], finished=True + ) + else: + yield self.mock_request_output_from_req_and_token_ids( + req, harmony_token_ids, finished=True + ) + + generator_func = ( + serving_chat.chat_completion_stream_generator + if stream + else serving_chat.chat_completion_full_generator + ) + + result = generator_func( + request=req, + result_generator=result_generator(), + request_id=req.request_id, + model_name=req.model, + conversation=[], + tokenizer=get_tokenizer(req.model), + request_metadata=RequestResponseMetadata( + request_id=req.request_id, + model_name=req.model, + ), + ) + + if stream: + return await accumulate_streaming_response(result) + return await result + + @pytest.mark.asyncio + async def test_simple_chat(self, serving_chat, stream): + messages = [{"role": "user", "content": "what is 1+1?"}] + + # Test the Harmony messages for the first turn's input + req = 
ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user", "content": messages[0]["content"]}, + ], + ) + + # Test the Chat Completion response for the first turn's output + reasoning_str = "We need to think really hard about this." + final_str = "The answer is 2." + response_str = ( + f"<|channel|>analysis<|message|>{reasoning_str}<|end|>" + f"<|start|>assistant<|channel|>final<|message|>{final_str}<|end|>" + ) + response = await self.generate_response_from_harmony_str( + serving_chat, req, response_str, stream=stream + ) + verify_chat_response(response, content=final_str, reasoning=reasoning_str) + + # Add the output messages from the first turn as input to the second turn + for choice in response.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Test the Harmony messages for the second turn's input + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2) + verify_harmony_messages( + input_messages_2, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + # The analysis message should be dropped on subsequent inputs because + # of the subsequent assistant message to the final channel. + {"role": "assistant", "channel": "final", "content": final_str}, + ], + ) + + @pytest.mark.asyncio + async def test_tool_call_response_with_content( + self, serving_chat, stream, weather_tools, weather_messages_start + ): + tools = weather_tools + messages = list(weather_messages_start) + + # Test the Harmony messages for the first turn's input + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer", "tool_definitions": ["get_weather"]}, + {"role": "user", "content": messages[0]["content"]}, + ], + ) + + # Test the Chat Completion response for the first turn's output + commentary_str = "We'll call get_weather." 
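+        # The raw Harmony encoding below: a user-visible preamble on the
+        # commentary channel, then the tool call as an assistant commentary
+        # message addressed to "functions.get_weather" with JSON-constrained
+        # arguments, terminated by <|call|> instead of <|end|>.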
+ tool_args_str = '{"location": "Paris"}' + response_str = ( + f"<|channel|>commentary<|message|>{commentary_str}<|end|>" + "<|start|>assistant to=functions.get_weather<|channel|>commentary" + f"<|constrain|>json<|message|>{tool_args_str}<|call|>" + ) + response = await self.generate_response_from_harmony_str( + serving_chat, req, response_str, stream=stream + ) + verify_chat_response( + response, + content=commentary_str, + tool_calls=[("get_weather", tool_args_str)], + ) + + tool_call = response.choices[0].message.tool_calls[0] + + # Add the output messages from the first turn as input to the second turn + for choice in response.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Add our tool output message + messages.append( + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": "20 degrees Celsius", + }, + ) + + # Test the Harmony messages for the second turn's input + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2) + verify_harmony_messages( + input_messages_2, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + { + "role": "assistant", + "channel": "commentary", + "content": commentary_str, + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": tool_args_str, + }, + { + "role": "tool", + "author_name": "functions.get_weather", + "channel": "commentary", + "recipient": "assistant", + "content": "20 degrees Celsius", + }, + ], + ) + + @pytest.mark.asyncio + async def test_tools_and_reasoning( + self, serving_chat, stream, weather_tools, weather_messages_start + ): + tools = weather_tools + messages = list(weather_messages_start) + + # Test the Harmony messages for the first turn's input + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer", "tool_definitions": ["get_weather"]}, + {"role": "user", "content": messages[0]["content"]}, + ], + ) + + # Test the Chat Completion response for the first turn's output + reasoning_str = "I'll call get_weather." 
+ tool_args_str = '{"location": "Paris"}' + response_str = ( + f"<|channel|>analysis<|message|>{reasoning_str}<|end|>" + "<|start|>assistant to=functions.get_weather<|channel|>commentary" + f"<|constrain|>json<|message|>{tool_args_str}<|call|>" + ) + response = await self.generate_response_from_harmony_str( + serving_chat, req, response_str, stream=stream + ) + verify_chat_response( + response, + reasoning=reasoning_str, + tool_calls=[("get_weather", tool_args_str)], + ) + + tool_call = response.choices[0].message.tool_calls[0] + + # Add the output messages from the first turn as input to the second turn + for choice in response.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Add our tool output message + messages.append( + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": "20 degrees Celsius", + }, + ) + + # Test the Harmony messages for the second turn's input + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2) + verify_harmony_messages( + input_messages_2, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + { + "role": "assistant", + "channel": "analysis", + "content": reasoning_str, + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": tool_args_str, + }, + { + "role": "tool", + "author_name": "functions.get_weather", + "channel": "commentary", + "recipient": "assistant", + "content": "20 degrees Celsius", + }, + ], + ) + + @pytest.mark.asyncio + async def test_multi_turn_tools_and_reasoning( + self, serving_chat, stream, weather_tools, weather_messages_start + ): + tools = weather_tools + messages = list(weather_messages_start) + + # Test the Harmony messages for the first turn's input + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer", "tool_definitions": ["get_weather"]}, + {"role": "user", "content": messages[0]["content"]}, + ], + ) + + # Test the Chat Completion response for the first turn's output + reasoning_str = "I'll call get_weather." 
+ paris_tool_args_str = '{"location": "Paris"}' + response_str = ( + f"<|channel|>analysis<|message|>{reasoning_str}<|end|>" + "<|start|>assistant to=functions.get_weather<|channel|>commentary" + f"<|constrain|>json<|message|>{paris_tool_args_str}<|call|>" + ) + response = await self.generate_response_from_harmony_str( + serving_chat, req, response_str, stream=stream + ) + verify_chat_response( + response, + reasoning=reasoning_str, + tool_calls=[("get_weather", paris_tool_args_str)], + ) + + tool_call = response.choices[0].message.tool_calls[0] + + # Add the output messages from the first turn as input to the second turn + for choice in response.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Add our tool output message + messages.append( + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": "20 degrees Celsius", + }, + ) + + # Test the Harmony messages for the second turn's input + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2) + verify_harmony_messages( + input_messages_2, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + { + "role": "assistant", + "channel": "analysis", + "content": reasoning_str, + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": paris_tool_args_str, + }, + { + "role": "tool", + "author_name": "functions.get_weather", + "channel": "commentary", + "recipient": "assistant", + "content": "20 degrees Celsius", + }, + ], + ) + + # Test the Chat Completion response for the second turn's output + paris_weather_str = "The weather in Paris today is 20 degrees Celsius." + response_str = f"<|channel|>final<|message|>{paris_weather_str}<|end|>" + response_2 = await self.generate_response_from_harmony_str( + serving_chat, req_2, response_str, stream=stream + ) + verify_chat_response(response_2, content=paris_weather_str) + + # Add the output messages from the second turn as input to the third turn + for choice in response_2.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Add a new user message for the third turn + messages.append( + { + "role": "user", + "content": "What's the weather like in Boston today?", + }, + ) + + # Test the Harmony messages for the third turn's input + req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_3, _, _ = serving_chat._make_request_with_harmony(req_3) + verify_harmony_messages( + input_messages_3, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": paris_tool_args_str, + }, + { + "role": "tool", + "author_name": "functions.get_weather", + "channel": "commentary", + "recipient": "assistant", + "content": "20 degrees Celsius", + }, + { + "role": "assistant", + "channel": "final", + "content": paris_weather_str, + }, + {"role": "user", "content": messages[-1]["content"]}, + ], + ) + + # Test the Chat Completion response for the third turn's output + reasoning_str = "I'll call get_weather." 
+ boston_tool_args_str = '{"location": "Boston"}' + response_str = ( + f"<|channel|>analysis<|message|>{reasoning_str}<|end|>" + "<|start|>assistant to=functions.get_weather<|channel|>commentary" + f"<|constrain|>json<|message|>{boston_tool_args_str}<|call|>" + ) + response_3 = await self.generate_response_from_harmony_str( + serving_chat, req, response_str, stream=stream + ) + verify_chat_response( + response_3, + reasoning=reasoning_str, + tool_calls=[("get_weather", boston_tool_args_str)], + ) + + tool_call = response_3.choices[0].message.tool_calls[0] + + # Add the output messages from the third turn as input to the fourth turn + for choice in response_3.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Add our tool output message + messages.append( + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": "10 degrees Celsius", + }, + ) + + # Test the Harmony messages for the fourth turn's input + req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_4, _, _ = serving_chat._make_request_with_harmony(req_4) + verify_harmony_messages( + input_messages_4, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + {"role": "assistant"}, + {"role": "tool"}, + { + "role": "assistant", + "channel": "final", + }, + {"role": "user"}, + { + "role": "assistant", + "channel": "analysis", + "content": reasoning_str, + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": boston_tool_args_str, + }, + { + "role": "tool", + "author_name": "functions.get_weather", + "channel": "commentary", + "recipient": "assistant", + "content": "10 degrees Celsius", + }, + ], + ) + + @pytest.mark.asyncio + async def test_non_tool_reasoning(self, serving_chat): + messages: list[dict[str, Any]] = [ + { + "role": "user", + "content": "What's 2+2?", + }, + { + "role": "assistant", + "reasoning": "Adding 2 and 2 is easy. The result is 4.", + "content": "4", + }, + ] + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user", "content": messages[0]["content"]}, + # The reasoning that would have resulted in an analysis message is + # dropped because of a later assistant message to the final channel. + { + "role": "assistant", + "channel": "final", + "content": messages[1]["content"], + }, + ], + ) + + @pytest.mark.asyncio + async def test_non_tool_reasoning_empty_content(self, serving_chat): + messages: list[dict[str, Any]] = [ + { + "role": "user", + "content": "What's 2+2?", + }, + { + "role": "assistant", + "reasoning": "Adding 2 and 2 is easy. The result is 4.", + "content": "", + }, + ] + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user", "content": messages[0]["content"]}, + { + "role": "assistant", + "channel": "analysis", + "content": messages[1]["reasoning"], + }, + ], + ) + + @pytest.mark.asyncio + async def test_non_tool_reasoning_empty_content_list(self, serving_chat): + messages: list[dict[str, Any]] = [ + { + "role": "user", + "content": "What's 2+2?", + }, + { + "role": "assistant", + "reasoning": "Adding 2 and 2 is easy. 
The result is 4.", + "content": [], + }, + ] + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user", "content": messages[0]["content"]}, + { + "role": "assistant", + "channel": "analysis", + "content": messages[1]["reasoning"], + }, + ], + ) diff --git a/tests/entrypoints/openai/utils.py b/tests/entrypoints/openai/utils.py new file mode 100644 index 0000000000000..501f6dcc91543 --- /dev/null +++ b/tests/entrypoints/openai/utils.py @@ -0,0 +1,190 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json +from collections.abc import AsyncGenerator +from typing import Any + +from vllm.entrypoints.openai.protocol import ( + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatCompletionStreamResponse, + ChatMessage, + UsageInfo, +) + + +async def accumulate_streaming_response( + stream_generator: AsyncGenerator[str, None], +) -> ChatCompletionResponse: + """ + Accumulate streaming SSE chunks into a complete ChatCompletionResponse. + + This helper parses the SSE format and builds up the complete response + by combining all the delta chunks. + """ + accumulated_content = "" + accumulated_reasoning = None + accumulated_tool_calls: list[dict[str, Any]] = [] + role = None + finish_reason = None + response_id = None + created = None + model = None + index = 0 + + async for chunk_str in stream_generator: + # Skip empty lines and [DONE] marker + if not chunk_str.strip() or chunk_str.strip() == "data: [DONE]": + continue + + # Parse SSE format: "data: {json}\n\n" + if chunk_str.startswith("data: "): + json_str = chunk_str[6:].strip() + try: + chunk_data = json.loads(json_str) + # print(f"DEBUG: Parsed chunk_data: {chunk_data}") + chunk = ChatCompletionStreamResponse(**chunk_data) + + # Store metadata from first chunk + if response_id is None: + response_id = chunk.id + created = chunk.created + model = chunk.model + + # Process each choice in the chunk + for choice in chunk.choices: + if choice.delta.role: + role = choice.delta.role + if choice.delta.content: + accumulated_content += choice.delta.content + if choice.delta.reasoning: + if accumulated_reasoning is None: + accumulated_reasoning = "" + accumulated_reasoning += choice.delta.reasoning + if choice.delta.tool_calls: + # Accumulate tool calls + for tool_call_delta in choice.delta.tool_calls: + # Find or create the tool call at this index + while len(accumulated_tool_calls) <= tool_call_delta.index: + accumulated_tool_calls.append( + { + "id": None, + "type": "function", + "function": {"name": "", "arguments": ""}, + } + ) + + if tool_call_delta.id: + accumulated_tool_calls[tool_call_delta.index]["id"] = ( + tool_call_delta.id + ) + if tool_call_delta.function: + if tool_call_delta.function.name: + accumulated_tool_calls[tool_call_delta.index][ + "function" + ]["name"] += tool_call_delta.function.name + if tool_call_delta.function.arguments: + accumulated_tool_calls[tool_call_delta.index][ + "function" + ]["arguments"] += tool_call_delta.function.arguments + + if choice.finish_reason: + finish_reason = choice.finish_reason + if choice.index is not None: + index = choice.index + + except json.JSONDecodeError: + continue + + # Build the final message + message_kwargs = { + "role": role or "assistant", + "content": accumulated_content if accumulated_content else None, + 
"reasoning": accumulated_reasoning, + } + + # Only include tool_calls if there are any + if accumulated_tool_calls: + message_kwargs["tool_calls"] = [ + {"id": tc["id"], "type": tc["type"], "function": tc["function"]} + for tc in accumulated_tool_calls + ] + + message = ChatMessage(**message_kwargs) + + # Build the final response + choice = ChatCompletionResponseChoice( + index=index, + message=message, + finish_reason=finish_reason or "stop", + ) + + # Create usage info (with dummy values for tests) + usage = UsageInfo( + prompt_tokens=0, + completion_tokens=0, + total_tokens=0, + ) + + response = ChatCompletionResponse( + id=response_id or "chatcmpl-test", + object="chat.completion", + created=created or 0, + model=model or "test-model", + choices=[choice], + usage=usage, + ) + + return response + + +def verify_harmony_messages( + messages: list[Any], expected_messages: list[dict[str, Any]] +): + assert len(messages) == len(expected_messages) + for msg, expected in zip(messages, expected_messages): + if "role" in expected: + assert msg.author.role == expected["role"] + if "author_name" in expected: + assert msg.author.name == expected["author_name"] + if "channel" in expected: + assert msg.channel == expected["channel"] + if "recipient" in expected: + assert msg.recipient == expected["recipient"] + if "content" in expected: + assert msg.content[0].text == expected["content"] + if "content_type" in expected: + assert msg.content_type == expected["content_type"] + if "tool_definitions" in expected: + # Check that the tool definitions match the expected list of tool names + actual_tools = [t.name for t in msg.content[0].tools["functions"].tools] + assert actual_tools == expected["tool_definitions"] + + +def verify_chat_response( + response: ChatCompletionResponse, + content: str | None = None, + reasoning: str | None = None, + tool_calls: list[tuple[str, str]] | None = None, +): + assert len(response.choices) == 1 + message = response.choices[0].message + + if content is not None: + assert message.content == content + else: + assert not message.content + + if reasoning is not None: + assert message.reasoning == reasoning + else: + assert not message.reasoning + + if tool_calls: + assert message.tool_calls is not None + assert len(message.tool_calls) == len(tool_calls) + for tc, (expected_name, expected_args) in zip(message.tool_calls, tool_calls): + assert tc.function.name == expected_name + assert tc.function.arguments == expected_args + else: + assert not message.tool_calls diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index 2260e9604c3ed..376d97a03964e 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -232,7 +232,177 @@ def parse_response_input( return msg +def parse_chat_inputs_to_harmony_messages(chat_msgs: list) -> list[Message]: + """ + Parse a list of messages from request.messages in the Chat Completion API to + Harmony messages. 
+ """ + msgs: list[Message] = [] + tool_id_names: dict[str, str] = {} + + # Collect tool id to name mappings for tool response recipient values + for chat_msg in chat_msgs: + for tool_call in chat_msg.get("tool_calls", []): + tool_id_names[tool_call.get("id")] = tool_call.get("function", {}).get( + "name" + ) + + for chat_msg in chat_msgs: + msgs.extend(parse_chat_input_to_harmony_message(chat_msg, tool_id_names)) + + msgs = auto_drop_analysis_messages(msgs) + return msgs + + +def auto_drop_analysis_messages(msgs: list[Message]) -> list[Message]: + """ + Harmony models expect the analysis messages (representing raw chain of thought) to + be dropped after an assistant message to the final channel is produced from the + reasoning of those messages. + + The openai-harmony library does this if the very last assistant message is to the + final channel, but it does not handle the case where we're in longer multi-turn + conversations and the client gave us reasoning content from previous turns of + the conversation with multiple assistant messages to the final channel in the + conversation. + + So, we find the index of the last assistant message to the final channel and drop + all analysis messages that precede it, leaving only the analysis messages that + are relevant to the current part of the conversation. + """ + last_assistant_final_index = -1 + for i in range(len(msgs) - 1, -1, -1): + msg = msgs[i] + if msg.author.role == "assistant" and msg.channel == "final": + last_assistant_final_index = i + break + + cleaned_msgs: list[Message] = [] + for i, msg in enumerate(msgs): + if i < last_assistant_final_index and msg.channel == "analysis": + continue + cleaned_msgs.append(msg) + + return cleaned_msgs + + +def flatten_chat_text_content(content: str | list | None) -> str | None: + """ + Extract the text parts from a chat message content field and flatten them + into a single string. + """ + if isinstance(content, list): + return "".join( + item.get("text", "") + for item in content + if isinstance(item, dict) and item.get("type") == "text" + ) + return content + + +def parse_chat_input_to_harmony_message( + chat_msg, tool_id_names: dict[str, str] | None = None +) -> list[Message]: + """ + Parse a message from request.messages in the Chat Completion API to + Harmony messages. 
+ """ + tool_id_names = tool_id_names or {} + + if not isinstance(chat_msg, dict): + # Handle Pydantic models + chat_msg = chat_msg.model_dump(exclude_none=True) + + role = chat_msg.get("role") + msgs: list[Message] = [] + + # Assistant message with tool calls + tool_calls = chat_msg.get("tool_calls", []) + + if role == "assistant" and tool_calls: + content = flatten_chat_text_content(chat_msg.get("content")) + if content: + commentary_msg = Message.from_role_and_content(Role.ASSISTANT, content) + commentary_msg = commentary_msg.with_channel("commentary") + msgs.append(commentary_msg) + + reasoning_content = chat_msg.get("reasoning") or chat_msg.get( + "reasoning_content" + ) + if reasoning_content: + analysis_msg = Message.from_role_and_content( + Role.ASSISTANT, reasoning_content + ) + analysis_msg = analysis_msg.with_channel("analysis") + msgs.append(analysis_msg) + + for call in tool_calls: + func = call.get("function", {}) + name = func.get("name", "") + arguments = func.get("arguments", "") or "" + msg = Message.from_role_and_content(Role.ASSISTANT, arguments) + msg = msg.with_channel("commentary") + msg = msg.with_recipient(f"functions.{name}") + # Officially, this should be `<|constrain|>json` but there is not clear + # evidence that improves accuracy over `json` and some anecdotes to the + # contrary. Further testing of the different content_types is needed. + msg = msg.with_content_type("json") + msgs.append(msg) + return msgs + + # Tool role message (tool output) + if role == "tool": + tool_call_id = chat_msg.get("tool_call_id", "") + name = tool_id_names.get(tool_call_id, "") + content = chat_msg.get("content", "") or "" + content = flatten_chat_text_content(content) + + msg = ( + Message.from_author_and_content( + Author.new(Role.TOOL, f"functions.{name}"), content + ) + .with_channel("commentary") + .with_recipient("assistant") + ) + return [msg] + + # Non-tool reasoning content + reasoning_content = chat_msg.get("reasoning") or chat_msg.get("reasoning_content") + if role == "assistant" and reasoning_content: + analysis_msg = Message.from_role_and_content(Role.ASSISTANT, reasoning_content) + analysis_msg = analysis_msg.with_channel("analysis") + msgs.append(analysis_msg) + + # Default: user/assistant/system messages with content + content = chat_msg.get("content") or "" + if content is None: + content = "" + if isinstance(content, str): + contents = [TextContent(text=content)] + else: + # TODO: Support refusal. + contents = [TextContent(text=c.get("text", "")) for c in content] + + # Only add assistant messages if they have content, as reasoning or tool calling + # assistant messages were already added above. + if role == "assistant" and contents and contents[0].text: + msg = Message.from_role_and_contents(role, contents) + # Send non-tool assistant messages to the final channel + msg = msg.with_channel("final") + msgs.append(msg) + # For user/system/developer messages, add them directly even if no content. + elif role != "assistant": + msg = Message.from_role_and_contents(role, contents) + msgs.append(msg) + + return msgs + + def parse_input_to_harmony_message(chat_msg) -> list[Message]: + """ + Parse a message from request.previous_input_messages in the Responsees API to + Harmony messages. 
+ """ if not isinstance(chat_msg, dict): # Handle Pydantic models chat_msg = chat_msg.model_dump(exclude_none=True) @@ -258,14 +428,7 @@ def parse_input_to_harmony_message(chat_msg) -> list[Message]: if role == "tool": name = chat_msg.get("name", "") content = chat_msg.get("content", "") or "" - if isinstance(content, list): - # Handle array format for tool message content - # by concatenating all text parts. - content = "".join( - item.get("text", "") - for item in content - if isinstance(item, dict) and item.get("type") == "text" - ) + content = flatten_chat_text_content(content) msg = Message.from_author_and_content( Author.new(Role.TOOL, f"functions.{name}"), content @@ -623,20 +786,40 @@ def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser: def parse_chat_output( token_ids: Sequence[int], ) -> tuple[str | None, str | None, bool]: + """ + Parse the output of a Harmony chat completion into reasoning and final content. + Note that when the `openai` tool parser is used, serving_chat only uses this + for the reasoning content and gets the final content from the tool call parser. + + When the `openai` tool parser is not enabled, or when `GptOssReasoningParser` is + in use,this needs to return the final content without any tool calls parsed. + + Empty reasoning or final content is returned as None instead of an empty string. + """ parser = parse_output_into_messages(token_ids) output_msgs = parser.messages is_tool_call = False # TODO: update this when tool call is supported - if len(output_msgs) == 0: - # The generation has stopped during reasoning. - reasoning = parser.current_content - final_content = None - elif len(output_msgs) == 1: - # The generation has stopped during final message. - reasoning = output_msgs[0].content[0].text - final_content = parser.current_content - else: - reasoning_msg = output_msgs[:-1] - final_msg = output_msgs[-1] - reasoning = "\n".join([msg.content[0].text for msg in reasoning_msg]) - final_content = final_msg.content[0].text + + # Get completed messages from the parser + reasoning_texts = [ + msg.content[0].text for msg in output_msgs if msg.channel == "analysis" + ] + final_texts = [ + msg.content[0].text for msg in output_msgs if msg.channel != "analysis" + ] + + # Extract partial messages from the parser + if parser.current_channel == "analysis" and parser.current_content: + reasoning_texts.append(parser.current_content) + elif parser.current_channel != "analysis" and parser.current_content: + final_texts.append(parser.current_content) + + # Flatten multiple messages into a single string + reasoning: str | None = "\n".join(reasoning_texts) + final_content: str | None = "\n".join(final_texts) + + # Return None instead of empty string since existing callers check for None + reasoning = reasoning or None + final_content = final_content or None + return reasoning, final_content, is_tool_call diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 2560a5b2cdf41..d94fa7dd91937 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -27,8 +27,8 @@ from vllm.entrypoints.openai.parser.harmony_utils import ( get_stop_tokens_for_assistant_actions, get_streamable_parser_for_assistant, get_system_message, + parse_chat_inputs_to_harmony_messages, parse_chat_output, - parse_input_to_harmony_message, render_for_completion, ) from vllm.entrypoints.openai.protocol import ( @@ -822,6 +822,9 @@ class OpenAIServingChat(OpenAIServing): if delta_message is not 
None: harmony_tools_streamed[i] = True + elif cur_channel == "commentary": + # Tool call preambles meant to be shown to the user + delta_message = DeltaMessage(content=delta_text) else: delta_message = None # handle streaming deltas for tools with named tool_choice @@ -1770,6 +1773,11 @@ class OpenAIServingChat(OpenAIServing): ): messages: list[OpenAIMessage] = [] + # because of issues with pydantic we need to potentially + # re-serialize the tool_calls field of the request + # for more info: see comment in `maybe_serialize_tool_calls` + maybe_serialize_tool_calls(request) + # Add system message. # NOTE: In Chat Completion API, browsing is enabled by default # if the model supports it. TODO: Support browsing. @@ -1788,8 +1796,7 @@ class OpenAIServingChat(OpenAIServing): messages.append(dev_msg) # Add user message. - for chat_msg in request.messages: - messages.extend(parse_input_to_harmony_message(chat_msg)) + messages.extend(parse_chat_inputs_to_harmony_messages(request.messages)) # Render prompt token ids. prompt_token_ids = render_for_completion(messages) diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py index 387e87f208e66..a3cf793ed3a6d 100644 --- a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py @@ -43,6 +43,7 @@ class OpenAIToolParser(ToolParser): parser = parse_output_into_messages(token_ids) tool_calls = [] final_content = None + commentary_content = None if len(parser.messages) > 0: for msg in parser.messages: @@ -75,11 +76,15 @@ class OpenAIToolParser(ToolParser): ) elif msg.channel == "final": final_content = msg_text + elif msg.channel == "commentary" and not msg.recipient: + commentary_content = msg_text return ExtractedToolCallInformation( tools_called=len(tool_calls) > 0, tool_calls=tool_calls, - content=final_content, + # prefer final content over commentary content if both are present + # commentary content is tool call preambles meant to be shown to the user + content=final_content or commentary_content, ) def extract_tool_calls_streaming(