From 444f0e3f339caba85f84c6628e1df50605b241a0 Mon Sep 17 00:00:00 2001 From: daniel-salib Date: Sun, 7 Dec 2025 18:02:52 -0800 Subject: [PATCH] [Frontend] Add MCP type support infrastructure to Responses API (#30054) Signed-off-by: Daniel Salib --- .../openai/parser/test_harmony_utils.py | 185 +++++++++++++++++- .../openai/parser/harmony_utils.py | 163 +++++++++++---- vllm/entrypoints/openai/protocol.py | 8 + 3 files changed, 309 insertions(+), 47 deletions(-) diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py index ae6f558f22b0..a3fd80938de6 100644 --- a/tests/entrypoints/openai/parser/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem +from openai.types.responses.response_output_item import McpCall from openai_harmony import Author, Message, Role, TextContent from vllm.entrypoints.openai.parser.harmony_utils import ( @@ -400,17 +401,19 @@ class TestParseOutputMessage: assert output_items[0].arguments == '{"location": "San Francisco"}' assert output_items[1].arguments == '{"location": "New York"}' - def test_commentary_with_unknown_recipient_raises_error(self): - """Test that commentary with unknown recipient raises ValueError.""" - message = Message.from_role_and_content(Role.ASSISTANT, "some content") + def test_commentary_with_unknown_recipient_creates_mcp_call(self): + """Test that commentary with unknown recipient creates MCP call.""" + message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}') message = message.with_channel("commentary") - message = message.with_recipient("unknown_recipient") + message = message.with_recipient("custom_tool") - try: - parse_output_message(message) - raise AssertionError("Expected ValueError to be raised") - except ValueError as e: - assert "Unknown recipient: unknown_recipient" in str(e) + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], McpCall) + assert output_items[0].type == "mcp_call" + assert output_items[0].name == "custom_tool" + assert output_items[0].server_label == "custom_tool" def test_analysis_channel_creates_reasoning(self): """Test that analysis channel creates reasoning items.""" @@ -451,3 +454,167 @@ def test_has_custom_tools() -> None: assert has_custom_tools( {"web_search_preview", "code_interpreter", "container", "others"} ) + + +def test_parse_mcp_call_basic() -> None: + """Test that MCP calls are parsed with correct type and server_label.""" + message = Message.from_role_and_content(Role.ASSISTANT, '{"path": "/tmp"}') + message = message.with_recipient("filesystem") + message = message.with_channel("commentary") + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], McpCall) + assert output_items[0].type == "mcp_call" + assert output_items[0].name == "filesystem" + assert output_items[0].server_label == "filesystem" + assert output_items[0].arguments == '{"path": "/tmp"}' + assert output_items[0].status == "completed" + + +def test_parse_mcp_call_dotted_recipient() -> None: + """Test that dotted recipients extract the tool name correctly.""" + message = Message.from_role_and_content(Role.ASSISTANT, '{"cmd": "ls"}') + message = message.with_recipient("repo_browser.list") + message = message.with_channel("commentary") + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], McpCall) + assert output_items[0].name == "list" + assert output_items[0].server_label == "repo_browser" + + +def test_mcp_vs_function_call() -> None: + """Test that function calls are not parsed as MCP calls.""" + func_message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}') + func_message = func_message.with_recipient("functions.my_tool") + func_message = func_message.with_channel("commentary") + + func_items = parse_output_message(func_message) + + assert len(func_items) == 1 + assert not isinstance(func_items[0], McpCall) + assert func_items[0].type == "function_call" + + +def test_mcp_vs_builtin_tools() -> None: + """Test that built-in tools (python, container) are not parsed as MCP calls.""" + # Test python (built-in tool) - should be reasoning, not MCP + python_message = Message.from_role_and_content(Role.ASSISTANT, "print('hello')") + python_message = python_message.with_recipient("python") + python_message = python_message.with_channel("commentary") + + python_items = parse_output_message(python_message) + + assert len(python_items) == 1 + assert not isinstance(python_items[0], McpCall) + assert python_items[0].type == "reasoning" + + +def test_parse_remaining_state_commentary_channel() -> None: + """Test parse_remaining_state with commentary channel and various recipients.""" + from unittest.mock import Mock + + from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state + + # Test 1: functions.* recipient → should return function tool call + parser_func = Mock() + parser_func.current_content = '{"arg": "value"}' + parser_func.current_role = Role.ASSISTANT + parser_func.current_channel = "commentary" + parser_func.current_recipient = "functions.my_tool" + + func_items = parse_remaining_state(parser_func) + + assert len(func_items) == 1 + assert not isinstance(func_items[0], McpCall) + assert func_items[0].type == "function_call" + assert func_items[0].name == "my_tool" + assert func_items[0].status == "in_progress" + + # Test 2: MCP tool (not builtin) → should return MCP call + parser_mcp = Mock() + parser_mcp.current_content = '{"path": "/tmp"}' + parser_mcp.current_role = Role.ASSISTANT + parser_mcp.current_channel = "commentary" + parser_mcp.current_recipient = "filesystem" + + mcp_items = parse_remaining_state(parser_mcp) + + assert len(mcp_items) == 1 + assert isinstance(mcp_items[0], McpCall) + assert mcp_items[0].type == "mcp_call" + assert mcp_items[0].name == "filesystem" + assert mcp_items[0].server_label == "filesystem" + assert mcp_items[0].status == "in_progress" + + # Test 3: Built-in tool (python) + # should NOT return MCP call, falls through to reasoning + parser_builtin = Mock() + parser_builtin.current_content = "print('hello')" + parser_builtin.current_role = Role.ASSISTANT + parser_builtin.current_channel = "commentary" + parser_builtin.current_recipient = "python" + + builtin_items = parse_remaining_state(parser_builtin) + + # Should fall through to reasoning logic + assert len(builtin_items) == 1 + assert not isinstance(builtin_items[0], McpCall) + assert builtin_items[0].type == "reasoning" + + +def test_parse_remaining_state_analysis_channel() -> None: + """Test parse_remaining_state with analysis channel and various recipients.""" + from unittest.mock import Mock + + from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state + + # Test 1: functions.* recipient → should return function tool call + parser_func = Mock() + parser_func.current_content = '{"arg": "value"}' + parser_func.current_role = Role.ASSISTANT + parser_func.current_channel = "analysis" + parser_func.current_recipient = "functions.my_tool" + + func_items = parse_remaining_state(parser_func) + + assert len(func_items) == 1 + assert not isinstance(func_items[0], McpCall) + assert func_items[0].type == "function_call" + assert func_items[0].name == "my_tool" + assert func_items[0].status == "in_progress" + + # Test 2: MCP tool (not builtin) → should return MCP call + parser_mcp = Mock() + parser_mcp.current_content = '{"query": "test"}' + parser_mcp.current_role = Role.ASSISTANT + parser_mcp.current_channel = "analysis" + parser_mcp.current_recipient = "database" + + mcp_items = parse_remaining_state(parser_mcp) + + assert len(mcp_items) == 1 + assert isinstance(mcp_items[0], McpCall) + assert mcp_items[0].type == "mcp_call" + assert mcp_items[0].name == "database" + assert mcp_items[0].server_label == "database" + assert mcp_items[0].status == "in_progress" + + # Test 3: Built-in tool (container) + # should NOT return MCP call, falls through to reasoning + parser_builtin = Mock() + parser_builtin.current_content = "docker run" + parser_builtin.current_role = Role.ASSISTANT + parser_builtin.current_channel = "analysis" + parser_builtin.current_recipient = "container" + + builtin_items = parse_remaining_state(parser_builtin) + + # Should fall through to reasoning logic + assert len(builtin_items) == 1 + assert not isinstance(builtin_items[0], McpCall) + assert builtin_items[0].type == "reasoning" diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index 7da0914ce3d3..2260e9604c3e 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -19,6 +19,7 @@ from openai.types.responses.response_function_web_search import ( ActionSearch, ResponseFunctionWebSearch, ) +from openai.types.responses.response_output_item import McpCall from openai.types.responses.response_reasoning_item import ( Content as ResponseReasoningTextContent, ) @@ -155,11 +156,7 @@ def get_developer_message( "web_search_preview", "code_interpreter", "container", - "mcp", ): - # These are built-in tools that are added to the system message. - # Adding in MCP for now until we support MCP tools executed - # server side pass elif tool.type == "function": @@ -427,6 +424,44 @@ def _parse_final_message(message: Message) -> ResponseOutputItem: ) +def _parse_mcp_recipient(recipient: str) -> tuple[str, str]: + """ + Parse MCP recipient into (server_label, tool_name). + + For dotted recipients like "repo_browser.list": + - server_label: "repo_browser" (namespace/server) + - tool_name: "list" (specific tool) + + For simple recipients like "filesystem": + - server_label: "filesystem" + - tool_name: "filesystem" + """ + if "." in recipient: + server_label = recipient.split(".")[0] + tool_name = recipient.split(".")[-1] + else: + server_label = recipient + tool_name = recipient + return server_label, tool_name + + +def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]: + """Parse MCP calls into MCP call items.""" + server_label, tool_name = _parse_mcp_recipient(recipient) + output_items = [] + for content in message.content: + response_item = McpCall( + arguments=content.text, + type="mcp_call", + name=tool_name, + server_label=server_label, + id=f"mcp_{random_uuid()}", + status="completed", + ) + output_items.append(response_item) + return output_items + + def parse_output_message(message: Message) -> list[ResponseOutputItem]: """ Parse a Harmony message into a list of output response items. @@ -440,33 +475,34 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: output_items: list[ResponseOutputItem] = [] recipient = message.recipient - # Browser tool calls - if recipient is not None and recipient.startswith("browser."): - output_items.append(_parse_browser_tool_call(message, recipient)) + if recipient is not None: + # Browser tool calls + if recipient.startswith("browser."): + output_items.append(_parse_browser_tool_call(message, recipient)) - # Analysis channel (reasoning/chain-of-thought) + # Function calls (should only happen on commentary channel) + elif message.channel == "commentary" and recipient.startswith("functions."): + output_items.extend(_parse_function_call(message, recipient)) + + # Built-in tools are treated as reasoning + elif recipient.startswith(("python", "browser", "container")): + # Built-in tool recipients (python/browser/container) + # generate reasoning output + output_items.extend(_parse_reasoning_content(message)) + + # All other recipients are MCP calls + else: + output_items.extend(_parse_mcp_call(message, recipient)) + + # No recipient - handle based on channel for non-tool messages elif message.channel == "analysis": output_items.extend(_parse_reasoning_content(message)) - # Commentary channel elif message.channel == "commentary": - # Function calls - if recipient is not None and recipient.startswith("functions."): - output_items.extend(_parse_function_call(message, recipient)) + # Per Harmony format, commentary channel can contain preambles to calling + # multiple functions - explanatory text with no recipient + output_items.extend(_parse_reasoning_content(message)) - # Built-in tools on commentary channel are treated as reasoning for now - elif ( - recipient is None # Preambles: explanatory text before tool calls - or recipient.startswith(("python", "browser", "container")) - ): - # Per Harmony format, commentary channel can contain preambles to calling - # multiple functions - explanatory text with no recipient. Built-in tool - # recipients (python/browser/container) also generate reasoning output. - output_items.extend(_parse_reasoning_content(message)) - else: - raise ValueError(f"Unknown recipient: {recipient}") - - # Final output message elif message.channel == "final": output_items.append(_parse_final_message(message)) @@ -485,20 +521,70 @@ def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]: if current_recipient is not None and current_recipient.startswith("browser."): return [] - if parser.current_channel == "analysis": - reasoning_item = ResponseReasoningItem( - id=f"rs_{random_uuid()}", - summary=[], - type="reasoning", - content=[ - ResponseReasoningTextContent( - text=parser.current_content, type="reasoning_text" + if current_recipient and parser.current_channel in ("commentary", "analysis"): + if current_recipient.startswith("functions."): + rid = random_uuid() + return [ + ResponseFunctionToolCall( + arguments=parser.current_content, + call_id=f"call_{rid}", + type="function_call", + name=current_recipient.split(".")[-1], + id=f"fc_{rid}", + status="in_progress", ) - ], - status=None, - ) - return [reasoning_item] - elif parser.current_channel == "final": + ] + # Built-in tools (python, browser, container) should be treated as reasoning + elif not ( + current_recipient.startswith("python") + or current_recipient.startswith("browser") + or current_recipient.startswith("container") + ): + # All other recipients are MCP calls + rid = random_uuid() + server_label, tool_name = _parse_mcp_recipient(current_recipient) + return [ + McpCall( + arguments=parser.current_content, + type="mcp_call", + name=tool_name, + server_label=server_label, + id=f"mcp_{rid}", + status="in_progress", + ) + ] + + if parser.current_channel == "commentary": + return [ + ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=parser.current_content, type="reasoning_text" + ) + ], + status=None, + ) + ] + + if parser.current_channel == "analysis": + return [ + ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=parser.current_content, type="reasoning_text" + ) + ], + status=None, + ) + ] + + if parser.current_channel == "final": output_text = ResponseOutputText( text=parser.current_content, annotations=[], # TODO @@ -515,6 +601,7 @@ def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]: type="message", ) return [text_item] + return [] diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 2d34a6a0cd5a..aeff6bded7f0 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -25,6 +25,10 @@ from openai.types.responses import ( ResponseContentPartDoneEvent, ResponseFunctionToolCall, ResponseInputItemParam, + ResponseMcpCallArgumentsDeltaEvent, + ResponseMcpCallArgumentsDoneEvent, + ResponseMcpCallCompletedEvent, + ResponseMcpCallInProgressEvent, ResponseOutputItem, ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent, @@ -1790,6 +1794,10 @@ StreamingResponsesResponse: TypeAlias = ( | ResponseCodeInterpreterCallCodeDoneEvent | ResponseCodeInterpreterCallInterpretingEvent | ResponseCodeInterpreterCallCompletedEvent + | ResponseMcpCallArgumentsDeltaEvent + | ResponseMcpCallArgumentsDoneEvent + | ResponseMcpCallInProgressEvent + | ResponseMcpCallCompletedEvent )