[responsesAPI] parse reasoning item input (#28248)
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

parent e9056056fb
commit 742e9ff6b3
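In short, this change teaches the Responses API frontend to accept `reasoning` items in the request `input` and to convert them back into assistant chat messages before the prompt is built. A minimal request sketch, assuming a locally served reasoning model as in the example file below; the `id` value and texts are placeholders, not values produced by the change itself:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="empty")

# A reasoning item produced by a previous response can now be passed back in
# `input`; the server parses it instead of rejecting the request.
response = client.responses.create(
    model="Qwen/Qwen3-8B",
    input=[
        {"type": "message", "role": "user", "content": "Hello."},
        {
            "type": "reasoning",
            "id": "rs_example",  # placeholder id for illustration
            "summary": [],
            "content": [
                {"type": "reasoning_text", "text": "We need to respond: greeting."}
            ],
        },
    ],
)
print(response.output_text)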
examples/online_serving/openai_responses_client.py (new file, 44 lines)
@@ -0,0 +1,44 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server.
Reasoning models can be used through the Responses API as seen here
https://platform.openai.com/docs/api-reference/responses
For example:
vllm serve Qwen/Qwen3-8B --reasoning-parser qwen3

"""

from openai import OpenAI

input_messages = [{"role": "user", "content": "What model are you?"}]


def main():
    base_url = "http://localhost:8000/v1"
    client = OpenAI(base_url=base_url, api_key="empty")
    model = "Qwen/Qwen3-8B"  # get_first_model(client)
    response = client.responses.create(
        model=model,
        input=input_messages,
    )

    for message in response.output:
        if message.type == "reasoning":
            # append reasoning message
            input_messages.append(message)

    response_2 = client.responses.create(
        model=model,
        input=input_messages,
    )
    print(response_2.output_text)
    # I am Qwen, a large language model developed by Alibaba Cloud.
    # I am designed to assist with a wide range of tasks, including
    # answering questions, creating content, coding, and engaging in
    # conversations. I can help with various topics and provide
    # information or support in multiple languages. How can I assist you today?


if __name__ == "__main__":
    main()
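For reference, the reasoning item the loop above appends back into `input_messages` is an openai `ResponseReasoningItem`. A hand-built equivalent, as a rough sketch only (the `id` and text are placeholders; in the real flow the item comes straight from `response.output`):

from openai.types.responses.response_reasoning_item import Content, ResponseReasoningItem

reasoning_item = ResponseReasoningItem(
    id="rs_example",  # real ids are assigned by the server; this one is made up
    type="reasoning",
    summary=[],
    content=[
        Content(
            type="reasoning_text",
            text="The user asks which model I am; answer directly.",
        )
    ],
    encrypted_content=None,
    status=None,
)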
tests/entrypoints/openai/test_response_api_simple.py (new file, 71 lines)
@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import pytest
import pytest_asyncio
from openai import OpenAI

from ...utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen3-8B"


@pytest.fixture(scope="module")
def server():
    args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
    env_dict = dict(
        VLLM_ENABLE_RESPONSES_API_STORE="1",
        # uncomment for tool calling
        # PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
    )

    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
        input="What is 13 * 24?",
    )
    assert response is not None
    print("response: ", response)
    assert response.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            {
                "type": "reasoning",
                "id": "lol",
                "content": [
                    {
                        "type": "reasoning_text",
                        "text": "We need to respond: greeting.",
                    }
                ],
                "summary": [],
            },
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    # make sure we get a reasoning and text output
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "message"
    assert type(response.output[1].content[0].text) is str
@@ -35,7 +35,7 @@ GET_WEATHER_SCHEMA = {
 
 @pytest.fixture(scope="module")
 def server():
-    args = ["--enforce-eager", "--tool-server", "demo"]
+    args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
     env_dict = dict(
         VLLM_ENABLE_RESPONSES_API_STORE="1",
         PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
@@ -550,6 +550,31 @@ def call_function(name, args):
     raise ValueError(f"Unknown function: {name}")
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_reasoning_item(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input=[
+            {"type": "message", "content": "Hello.", "role": "user"},
+            {
+                "type": "reasoning",
+                "id": "lol",
+                "content": [
+                    {
+                        "type": "reasoning_text",
+                        "text": "We need to respond: greeting.",
+                    }
+                ],
+                "summary": [],
+            },
+        ],
+        temperature=0.0,
+    )
+    assert response is not None
+    assert response.status == "completed"
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling(client: OpenAI, model_name: str):
@@ -1,7 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
+from openai.types.responses.response_reasoning_item import (
+    Content,
+    ResponseReasoningItem,
+    Summary,
+)
+
 from vllm.entrypoints.responses_utils import (
+    construct_chat_message_with_tool_call,
     convert_tool_responses_to_completions_format,
 )
 
@@ -28,3 +36,53 @@ class TestResponsesUtils:
         result = convert_tool_responses_to_completions_format(input_tool)
 
         assert result == {"type": "function", "function": input_tool}
+
+    def test_construct_chat_message_with_tool_call(self):
+        item = ResponseReasoningItem(
+            id="lol",
+            summary=[],
+            type="reasoning",
+            content=[
+                Content(
+                    text="Leroy Jenkins",
+                    type="reasoning_text",
+                )
+            ],
+            encrypted_content=None,
+            status=None,
+        )
+        formatted_item = construct_chat_message_with_tool_call(item)
+        assert formatted_item["role"] == "assistant"
+        assert formatted_item["reasoning"] == "Leroy Jenkins"
+
+        item = ResponseReasoningItem(
+            id="lol",
+            summary=[
+                Summary(
+                    text='Hmm, the user has just started with a simple "Hello,"',
+                    type="summary_text",
+                )
+            ],
+            type="reasoning",
+            content=None,
+            encrypted_content=None,
+            status=None,
+        )
+
+        formatted_item = construct_chat_message_with_tool_call(item)
+        assert formatted_item["role"] == "assistant"
+        assert (
+            formatted_item["reasoning"]
+            == 'Hmm, the user has just started with a simple "Hello,"'
+        )
+
+        item = ResponseReasoningItem(
+            id="lol",
+            summary=[],
+            type="reasoning",
+            content=None,
+            encrypted_content="TOP_SECRET_MESSAGE",
+            status=None,
+        )
+        with pytest.raises(ValueError):
+            construct_chat_message_with_tool_call(item)
@@ -10,6 +10,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
     Function as FunctionCallTool,
 )
 from openai.types.responses import ResponseFunctionToolCall
+from openai.types.responses.response_reasoning_item import ResponseReasoningItem
 from openai.types.responses.tool import Tool
 
 from vllm import envs
@@ -37,6 +38,18 @@ def construct_chat_message_with_tool_call(
                 )
             ],
         )
+    elif isinstance(item, ResponseReasoningItem):
+        reasoning_content = ""
+        if item.encrypted_content:
+            raise ValueError("Encrypted content is not supported.")
+        if len(item.summary) == 1:
+            reasoning_content = item.summary[0].text
+        elif item.content and len(item.content) == 1:
+            reasoning_content = item.content[0].text
+        return {
+            "role": "assistant",
+            "reasoning": reasoning_content,
+        }
     elif item.get("type") == "function_call_output":
         # Append the function call output as a tool message.
         return ChatCompletionToolMessageParam(