[responsesAPI][6] Fix multi turn MCP tokenization (#30230)

Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>

parent abe93bce59
commit c3487aca34
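In short: before this change, every response item was mapped one-to-one onto a chat message, so on a follow-up turn a reasoning item and the MCP tool call it produced were re-tokenized as two separate assistant messages. The new `_maybe_combine_reasoning_and_tool_call` helper folds the call into the preceding reasoning message instead. A minimal standalone sketch of that behavior, using plain dicts rather than vLLM's typed `ChatCompletionMessageParam` (names and shapes simplified, not the actual vLLM API):

```python
# Minimal sketch of the merge this commit introduces, using plain dicts
# rather than vLLM's typed ChatCompletionMessageParam (illustrative only;
# MCP_PREFIX mirrors the new constant added in the diff below).
MCP_PREFIX = "mcp_"

def maybe_combine_reasoning_and_tool_call(item: dict, messages: list) -> dict | None:
    """Return the merged last message if `item` is an MCP tool call that
    directly follows an assistant reasoning message, else None."""
    if not item.get("id", "").startswith(MCP_PREFIX):
        return None
    if not messages:
        return None
    last = messages[-1]
    if last.get("role") != "assistant" or last.get("reasoning") is None:
        return None
    # Attach the call to the reasoning message instead of appending a new one.
    last["tool_calls"] = [{
        "id": item["call_id"],
        "type": "function",
        "function": {"name": item["name"], "arguments": item["arguments"]},
    }]
    return last

# Two response items collapse into a single assistant chat message:
messages = [{"role": "assistant", "reasoning": "Leroy Jenkins"}]
call = {"id": "mcp_123", "call_id": "call_123",
        "name": "python", "arguments": '{"code": "123+456"}'}
assert maybe_combine_reasoning_and_tool_call(call, messages) is not None
assert len(messages) == 1 and messages[0]["tool_calls"][0]["id"] == "call_123"
```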
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
 from openai.types.responses.response_function_tool_call_output_item import (
     ResponseFunctionToolCallOutputItem,
 )
@@ -14,7 +15,8 @@ from openai.types.responses.response_reasoning_item import (
 )
 
 from vllm.entrypoints.responses_utils import (
-    construct_chat_message_with_tool_call,
+    _construct_single_message_from_response_item,
+    construct_chat_messages_with_tool_call,
     convert_tool_responses_to_completions_format,
 )
 
@@ -42,7 +44,43 @@ class TestResponsesUtils:
 
         assert result == {"type": "function", "function": input_tool}
 
-    def test_construct_chat_message_with_tool_call(self):
+    def test_construct_chat_messages_with_tool_call(self):
+        """Test construction of chat messages with tool calls."""
+        reasoning_item = ResponseReasoningItem(
+            id="lol",
+            summary=[],
+            type="reasoning",
+            content=[
+                Content(
+                    text="Leroy Jenkins",
+                    type="reasoning_text",
+                )
+            ],
+            encrypted_content=None,
+            status=None,
+        )
+        mcp_tool_item = ResponseFunctionToolCall(
+            id="mcp_123",
+            call_id="call_123",
+            type="function_call",
+            status="completed",
+            name="python",
+            arguments='{"code": "123+456"}',
+        )
+        input_items = [reasoning_item, mcp_tool_item]
+        messages = construct_chat_messages_with_tool_call(input_items)
+
+        assert len(messages) == 1
+        message = messages[0]
+        assert message["role"] == "assistant"
+        assert message["reasoning"] == "Leroy Jenkins"
+        assert message["tool_calls"][0]["id"] == "call_123"
+        assert message["tool_calls"][0]["function"]["name"] == "python"
+        assert (
+            message["tool_calls"][0]["function"]["arguments"] == '{"code": "123+456"}'
+        )
+
+    def test_construct_single_message_from_response_item(self):
         item = ResponseReasoningItem(
             id="lol",
             summary=[],
@@ -56,7 +94,7 @@ class TestResponsesUtils:
             encrypted_content=None,
             status=None,
         )
-        formatted_item = construct_chat_message_with_tool_call(item)
+        formatted_item = _construct_single_message_from_response_item(item)
        assert formatted_item["role"] == "assistant"
         assert formatted_item["reasoning"] == "Leroy Jenkins"
 
@@ -74,7 +112,7 @@ class TestResponsesUtils:
             status=None,
         )
 
-        formatted_item = construct_chat_message_with_tool_call(item)
+        formatted_item = _construct_single_message_from_response_item(item)
         assert formatted_item["role"] == "assistant"
         assert (
             formatted_item["reasoning"]
@@ -88,7 +126,7 @@ class TestResponsesUtils:
             output="1234",
             status="completed",
         )
-        formatted_item = construct_chat_message_with_tool_call(tool_call_output)
+        formatted_item = _construct_single_message_from_response_item(tool_call_output)
         assert formatted_item["role"] == "tool"
         assert formatted_item["content"] == "1234"
         assert formatted_item["tool_call_id"] == "temp"
@@ -102,7 +140,7 @@ class TestResponsesUtils:
             status=None,
         )
         with pytest.raises(ValueError):
-            construct_chat_message_with_tool_call(item)
+            _construct_single_message_from_response_item(item)
 
         output_item = ResponseOutputMessage(
             id="msg_bf585bbbe3d500e0",
@@ -119,6 +157,6 @@ class TestResponsesUtils:
             type="message",
         )
 
-        formatted_item = construct_chat_message_with_tool_call(output_item)
+        formatted_item = _construct_single_message_from_response_item(output_item)
         assert formatted_item["role"] == "assistant"
         assert formatted_item["content"] == "dongyi"
@@ -8,3 +8,5 @@ Shared constants for vLLM entrypoints.
 # These constants help mitigate header abuse attacks
 H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304  # 4 MB
 H11_MAX_HEADER_COUNT_DEFAULT = 256
+
+MCP_PREFIX = "mcp_"
@@ -19,6 +19,7 @@ from vllm import envs
 from vllm.entrypoints.chat_utils import (
     ChatTemplateContentFormatOption,
 )
+from vllm.entrypoints.constants import MCP_PREFIX
 from vllm.entrypoints.openai.parser.harmony_utils import (
     get_encoding,
     get_streamable_parser_for_assistant,
@@ -303,7 +304,7 @@ class ParsableContext(ConversationContext):
         result_str = result.content[0].text
 
         message = ResponseFunctionToolCallOutputItem(
-            id=f"fco_{random_uuid()}",
+            id=f"mcpo_{random_uuid()}",
             type="function_call_output",
             call_id=f"call_{random_uuid()}",
             output=result_str,
@@ -385,6 +386,9 @@ class ParsableContext(ConversationContext):
         if not self.parser.response_messages:
             return []
         last_msg = self.parser.response_messages[-1]
+        # tag this as an mcp_ function call so later turns can recognize it
+        last_msg.id = f"{MCP_PREFIX}{random_uuid()}"
+        self.parser.response_messages[-1] = last_msg
         if last_msg.name == "code_interpreter":
             return await self.call_python_tool(self._tool_sessions["python"], last_msg)
         elif last_msg.name == "web_search_preview":
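The `ParsableContext` changes above are the producer side of the contract: the id of a built-in tool call is rewritten to carry the `mcp_` prefix, which is exactly what `_maybe_combine_reasoning_and_tool_call` keys on when the conversation is fed back in on the next turn. A tiny sketch of that round trip (stdlib `uuid` standing in for vLLM's `random_uuid` helper):

```python
# Illustrative round trip for the mcp_ id tagging.
from uuid import uuid4

MCP_PREFIX = "mcp_"

def random_uuid() -> str:  # stand-in for vLLM's helper
    return uuid4().hex

# Turn N: the context tags the tool-call message before running the MCP tool.
tagged_id = f"{MCP_PREFIX}{random_uuid()}"

# Turn N+1: the input-message builder recognizes the tag, so the call is
# merged into the preceding reasoning message instead of becoming a
# second assistant message.
assert tagged_id.startswith(MCP_PREFIX)
```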
@@ -1339,6 +1339,7 @@ class OpenAIServing:
         )
         engine_prompt = engine_prompts[0]
         request_prompt = request_prompts[0]
+        prompt_text, _, _ = self._get_prompt_components(request_prompt)
 
         # Update the sampling params.
         sampling_params.max_tokens = self.max_model_len - len(
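The `OpenAIServing` hunk extracts `prompt_text` so the token-budget update that follows can account for the actual multi-turn prompt. The continuation of the `len(` call sits outside the hunk, but the arithmetic is presumably of this shape (a sketch, assuming the subtracted length is the tokenized prompt):

```python
# Hypothetical numbers; the point is that max_tokens must shrink as the
# multi-turn prompt grows, or generation could overrun the context window.
max_model_len = 8192
prompt_token_ids = [0] * 1500  # stand-in for the tokenized multi-turn prompt

max_tokens = max_model_len - len(prompt_token_ids)
assert max_tokens == 6692
```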
@@ -22,6 +22,7 @@ from openai.types.responses.response_reasoning_item import ResponseReasoningItem
 from openai.types.responses.tool import Tool
 
 from vllm import envs
+from vllm.entrypoints.constants import MCP_PREFIX
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionMessageParam,
     ResponseInputOutputItem,
@@ -44,13 +45,13 @@ def make_response_output_items_from_parsable_context(
         )
         if isinstance(output_messages[-1], ResponseFunctionToolCall):
             mcp_message = McpCall(
-                id=f"mcp_{random_uuid()}",
+                id=f"{MCP_PREFIX}{random_uuid()}",
                 arguments=output_messages[-1].arguments,
                 name=output_messages[-1].name,
                 server_label=output_messages[
                     -1
                 ].name,  # TODO: store the server label
-                type="mcp_call",
+                type=f"{MCP_PREFIX}call",
                 status="completed",
                 output=message.output,
                 # TODO: support error output
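Note that `f"{MCP_PREFIX}call"` still evaluates to the literal `"mcp_call"`, so the output type on the wire is unchanged; the rewrite only derives the string from the shared constant:

```python
MCP_PREFIX = "mcp_"
assert f"{MCP_PREFIX}call" == "mcp_call"
```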
@@ -98,12 +99,63 @@ def construct_input_messages(
     if isinstance(request_input, str):
         messages.append({"role": "user", "content": request_input})
     else:
-        for item in request_input:
-            messages.append(construct_chat_message_with_tool_call(item))
+        input_messages = construct_chat_messages_with_tool_call(request_input)
+        messages.extend(input_messages)
     return messages
 
 
-def construct_chat_message_with_tool_call(
+def _maybe_combine_reasoning_and_tool_call(
+    item: ResponseInputOutputItem, messages: list[ChatCompletionMessageParam]
+) -> ChatCompletionMessageParam | None:
+    """Many models treat MCP calls and reasoning as a single message.
+    This function checks whether the last message is a reasoning message
+    and the current item is an MCP tool call, and merges them if so."""
+    if not (
+        isinstance(item, ResponseFunctionToolCall) and item.id.startswith(MCP_PREFIX)
+    ):
+        return None
+    if len(messages) == 0:
+        return None
+    last_message = messages[-1]
+    if not (
+        last_message.get("role") == "assistant"
+        and last_message.get("reasoning") is not None
+    ):
+        return None
+
+    last_message["tool_calls"] = [
+        ChatCompletionMessageToolCallParam(
+            id=item.call_id,
+            function=FunctionCallTool(
+                name=item.name,
+                arguments=item.arguments,
+            ),
+            type="function",
+        )
+    ]
+    return last_message
+
+
+def construct_chat_messages_with_tool_call(
+    input_messages: list[ResponseInputOutputItem],
+) -> list[ChatCompletionMessageParam]:
+    """Wraps _construct_single_message_from_response_item, because some
+    chat messages are built from multiple response items: for example,
+    a reasoning item and an MCP tool call are two response items
+    but one chat message.
+    """
+    messages: list[ChatCompletionMessageParam] = []
+    for item in input_messages:
+        maybe_combined_message = _maybe_combine_reasoning_and_tool_call(item, messages)
+        if maybe_combined_message is not None:
+            messages[-1] = maybe_combined_message
+        else:
+            messages.append(_construct_single_message_from_response_item(item))
+
+    return messages
+
+
+def _construct_single_message_from_response_item(
     item: ResponseInputOutputItem,
 ) -> ChatCompletionMessageParam:
     if isinstance(item, ResponseFunctionToolCall):
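Putting the pieces together: for a reasoning item, its MCP tool call, and the tool's output, the converted chat messages should come out shaped roughly like this (plain dicts standing in for the typed params, with values borrowed from the new test; illustrative, not the exact vLLM objects):

```python
# Expected chat-message shape for one MCP turn after this fix (illustrative).
expected_messages = [
    {
        "role": "assistant",
        "reasoning": "Leroy Jenkins",  # from the reasoning item
        "tool_calls": [  # the MCP function call, merged into the same message
            {
                "id": "call_123",
                "type": "function",
                "function": {"name": "python", "arguments": '{"code": "123+456"}'},
            }
        ],
    },
    {
        "role": "tool",  # from the function_call_output item
        "content": "1234",
        "tool_call_id": "call_123",
    },
]
# Three response items, two chat messages: the reasoning item and the tool
# call no longer double-count during re-tokenization.
assert len(expected_messages) == 2
```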