[responsesAPI][6] Fix multi turn MCP tokenization (#30230)

Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
Andrew Xia 2025-12-09 18:13:13 -08:00 committed by GitHub
parent abe93bce59
commit c3487aca34
5 changed files with 110 additions and 13 deletions

View File

@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
 from openai.types.responses.response_function_tool_call_output_item import (
     ResponseFunctionToolCallOutputItem,
 )
@@ -14,7 +15,8 @@ from openai.types.responses.response_reasoning_item import (
 )
 
 from vllm.entrypoints.responses_utils import (
-    construct_chat_message_with_tool_call,
+    _construct_single_message_from_response_item,
+    construct_chat_messages_with_tool_call,
     convert_tool_responses_to_completions_format,
 )
@@ -42,7 +44,43 @@ class TestResponsesUtils:
         assert result == {"type": "function", "function": input_tool}
 
-    def test_construct_chat_message_with_tool_call(self):
+    def test_construct_chat_messages_with_tool_call(self):
+        """Test construction of chat messages with tool calls."""
+        reasoning_item = ResponseReasoningItem(
+            id="lol",
+            summary=[],
+            type="reasoning",
+            content=[
+                Content(
+                    text="Leroy Jenkins",
+                    type="reasoning_text",
+                )
+            ],
+            encrypted_content=None,
+            status=None,
+        )
+        mcp_tool_item = ResponseFunctionToolCall(
+            id="mcp_123",
+            call_id="call_123",
+            type="function_call",
+            status="completed",
+            name="python",
+            arguments='{"code": "123+456"}',
+        )
+        input_items = [reasoning_item, mcp_tool_item]
+        messages = construct_chat_messages_with_tool_call(input_items)
+        assert len(messages) == 1
+        message = messages[0]
+        assert message["role"] == "assistant"
+        assert message["reasoning"] == "Leroy Jenkins"
+        assert message["tool_calls"][0]["id"] == "call_123"
+        assert message["tool_calls"][0]["function"]["name"] == "python"
+        assert (
+            message["tool_calls"][0]["function"]["arguments"] == '{"code": "123+456"}'
+        )
+
+    def test_construct_single_message_from_response_item(self):
         item = ResponseReasoningItem(
             id="lol",
             summary=[],
@@ -56,7 +94,7 @@ class TestResponsesUtils:
             encrypted_content=None,
             status=None,
         )
-        formatted_item = construct_chat_message_with_tool_call(item)
+        formatted_item = _construct_single_message_from_response_item(item)
         assert formatted_item["role"] == "assistant"
         assert formatted_item["reasoning"] == "Leroy Jenkins"
@@ -74,7 +112,7 @@ class TestResponsesUtils:
             status=None,
         )
-        formatted_item = construct_chat_message_with_tool_call(item)
+        formatted_item = _construct_single_message_from_response_item(item)
         assert formatted_item["role"] == "assistant"
         assert (
             formatted_item["reasoning"]
@@ -88,7 +126,7 @@ class TestResponsesUtils:
             output="1234",
             status="completed",
         )
-        formatted_item = construct_chat_message_with_tool_call(tool_call_output)
+        formatted_item = _construct_single_message_from_response_item(tool_call_output)
         assert formatted_item["role"] == "tool"
         assert formatted_item["content"] == "1234"
         assert formatted_item["tool_call_id"] == "temp"
@@ -102,7 +140,7 @@ class TestResponsesUtils:
             status=None,
        )
         with pytest.raises(ValueError):
-            construct_chat_message_with_tool_call(item)
+            _construct_single_message_from_response_item(item)
 
         output_item = ResponseOutputMessage(
             id="msg_bf585bbbe3d500e0",
@@ -119,6 +157,6 @@ class TestResponsesUtils:
             type="message",
         )
-        formatted_item = construct_chat_message_with_tool_call(output_item)
+        formatted_item = _construct_single_message_from_response_item(output_item)
         assert formatted_item["role"] == "assistant"
         assert formatted_item["content"] == "dongyi"

View File

@@ -8,3 +8,5 @@ Shared constants for vLLM entrypoints.
 # These constants help mitigate header abuse attacks
 H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304  # 4 MB
 H11_MAX_HEADER_COUNT_DEFAULT = 256
+
+MCP_PREFIX = "mcp_"

View File

@@ -19,6 +19,7 @@ from vllm import envs
 from vllm.entrypoints.chat_utils import (
     ChatTemplateContentFormatOption,
 )
+from vllm.entrypoints.constants import MCP_PREFIX
 from vllm.entrypoints.openai.parser.harmony_utils import (
     get_encoding,
     get_streamable_parser_for_assistant,
@@ -303,7 +304,7 @@ class ParsableContext(ConversationContext):
         result_str = result.content[0].text
         message = ResponseFunctionToolCallOutputItem(
-            id=f"fco_{random_uuid()}",
+            id=f"mcpo_{random_uuid()}",
             type="function_call_output",
             call_id=f"call_{random_uuid()}",
             output=result_str,
@@ -385,6 +386,9 @@ class ParsableContext(ConversationContext):
         if not self.parser.response_messages:
             return []
         last_msg = self.parser.response_messages[-1]
+        # change the id so this is treated as an mcp_ function call
+        last_msg.id = f"{MCP_PREFIX}{random_uuid()}"
+        self.parser.response_messages[-1] = last_msg
         if last_msg.name == "code_interpreter":
             return await self.call_python_tool(self._tool_sessions["python"], last_msg)
         elif last_msg.name == "web_search_preview":
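
The id rewrite above is what lets the input path fold this tool call back into its reasoning message on the next turn. A toy illustration, with SimpleNamespace standing in for the parser's typed message object and would_merge as a hypothetical condensation of the guards added in responses_utils below:

from types import SimpleNamespace

MCP_PREFIX = "mcp_"

def would_merge(item, messages):
    # Mirrors the key guards in _maybe_combine_reasoning_and_tool_call.
    if not item.id.startswith(MCP_PREFIX):
        return False
    if not messages:
        return False
    last = messages[-1]
    return last.get("role") == "assistant" and last.get("reasoning") is not None

call = SimpleNamespace(id="fc_abc123")  # id as the parser originally minted it
history = [{"role": "assistant", "reasoning": "..."}]
assert not would_merge(call, history)   # skipped: id lacks the MCP prefix

call.id = f"{MCP_PREFIX}abc123"         # the rewrite this hunk performs
assert would_merge(call, history)       # now folded into the reasoning message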

View File

@@ -1339,6 +1339,7 @@ class OpenAIServing:
         )
         engine_prompt = engine_prompts[0]
         request_prompt = request_prompts[0]
+        prompt_text, _, _ = self._get_prompt_components(request_prompt)
 
         # Update the sampling params.
         sampling_params.max_tokens = self.max_model_len - len(

View File

@@ -22,6 +22,7 @@ from openai.types.responses.response_reasoning_item import ResponseReasoningItem
 from openai.types.responses.tool import Tool
 
 from vllm import envs
+from vllm.entrypoints.constants import MCP_PREFIX
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionMessageParam,
     ResponseInputOutputItem,
@@ -44,13 +45,13 @@ def make_response_output_items_from_parsable_context(
         )
         if isinstance(output_messages[-1], ResponseFunctionToolCall):
             mcp_message = McpCall(
-                id=f"mcp_{random_uuid()}",
+                id=f"{MCP_PREFIX}{random_uuid()}",
                 arguments=output_messages[-1].arguments,
                 name=output_messages[-1].name,
                 server_label=output_messages[
                     -1
                 ].name,  # TODO: store the server label
-                type="mcp_call",
+                type=f"{MCP_PREFIX}call",
                 status="completed",
                 output=message.output,
                 # TODO: support error output
@@ -98,12 +99,63 @@ def construct_input_messages(
     if isinstance(request_input, str):
         messages.append({"role": "user", "content": request_input})
     else:
-        for item in request_input:
-            messages.append(construct_chat_message_with_tool_call(item))
+        input_messages = construct_chat_messages_with_tool_call(request_input)
+        messages.extend(input_messages)
     return messages
 
 
-def construct_chat_message_with_tool_call(
+def _maybe_combine_reasoning_and_tool_call(
+    item: ResponseInputOutputItem, messages: list[ChatCompletionMessageParam]
+) -> ChatCompletionMessageParam | None:
+    """Many models treat MCP calls and reasoning as a single message.
+    This function checks whether the last message is a reasoning message
+    and the current item is a tool call."""
+    if not (
+        isinstance(item, ResponseFunctionToolCall) and item.id.startswith(MCP_PREFIX)
+    ):
+        return None
+    if len(messages) == 0:
+        return None
+    last_message = messages[-1]
+    if not (
+        last_message.get("role") == "assistant"
+        and last_message.get("reasoning") is not None
+    ):
+        return None
+    last_message["tool_calls"] = [
+        ChatCompletionMessageToolCallParam(
+            id=item.call_id,
+            function=FunctionCallTool(
+                name=item.name,
+                arguments=item.arguments,
+            ),
+            type="function",
+        )
+    ]
+    return last_message
+
+
+def construct_chat_messages_with_tool_call(
+    input_messages: list[ResponseInputOutputItem],
+) -> list[ChatCompletionMessageParam]:
+    """Wraps _construct_single_message_from_response_item, because some chat
+    messages are built from multiple response items: for example, a reasoning
+    item and an MCP tool call are two response items but one chat message.
+    """
+    messages: list[ChatCompletionMessageParam] = []
+    for item in input_messages:
+        maybe_combined_message = _maybe_combine_reasoning_and_tool_call(item, messages)
+        if maybe_combined_message is not None:
+            messages[-1] = maybe_combined_message
+        else:
+            messages.append(_construct_single_message_from_response_item(item))
+    return messages
+
+
+def _construct_single_message_from_response_item(
     item: ResponseInputOutputItem,
 ) -> ChatCompletionMessageParam:
     if isinstance(item, ResponseFunctionToolCall):
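
Taken together, the new helpers collapse a reasoning item followed by an mcp_-prefixed tool call into a single assistant chat message, so re-tokenizing the conversation on later turns matches what the model originally emitted. A condensed before/after sketch, mirroring the new test, with plain dicts standing in for the typed response items and ChatCompletionMessageParam:

# Two response items in, one assistant chat message out.
input_items = [
    {"type": "reasoning", "content": "Leroy Jenkins"},
    {"type": "function_call", "id": "mcp_123", "call_id": "call_123",
     "name": "python", "arguments": '{"code": "123+456"}'},
]

# After construct_chat_messages_with_tool_call, the reasoning item and the
# mcp_-prefixed tool call have been merged into one message:
merged = {
    "role": "assistant",
    "reasoning": "Leroy Jenkins",
    "tool_calls": [{
        "id": "call_123",
        "type": "function",
        "function": {"name": "python", "arguments": '{"code": "123+456"}'},
    }],
}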