[Feature][Responses API] Stream Function Call - harmony (#24317)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-07-19 19:47:19 +08:00 · 2025-10-14 23:31:43 +08:00 · 2025-10-14 23:31:43 +08:00 · df850c4912
commit df850c4912
parent 720394de43
2 changed files with 213 additions and 70 deletions
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@ -16,6 +16,22 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "openai/gpt-oss-20b"
 GET_WEATHER_SCHEMA = {
    "type": "function",
    "name": "get_weather",
    "description": "Get current temperature for provided coordinates in celsius.",  # noqa
    "parameters": {
        "type": "object",
        "properties": {
            "latitude": {"type": "number"},
            "longitude": {"type": "number"},
        },
        "required": ["latitude", "longitude"],
        "additionalProperties": False,
    },
    "strict": True,
 }
@pytest.fixture(scope="module")
 def server():
@ -305,6 +321,54 @@ async def test_streaming_types(client: OpenAI, model_name: str):
        assert len(stack_of_event_types) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling_with_streaming_types(client: OpenAI, model_name: str):
    # this links the "done" type with the "start" type
    # so every "done" type should have a corresponding "start" type
    # and every open block should be closed by the end of the stream
    pairs_of_event_types = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.output_text.done": "response.output_text.delta",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
        "response.function_call_arguments.done": "response.function_call_arguments.delta",  # noqa
    }
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    stream_response = await client.responses.create(
        model=model_name,
        input=input_list,
        tools=tools,
        stream=True,
    )
    stack_of_event_types = []
    async for event in stream_response:
        if event.type == "response.created":
            stack_of_event_types.append(event.type)
        elif event.type == "response.completed":
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            stack_of_event_types.pop()
        if event.type.endswith("added"):
            stack_of_event_types.append(event.type)
        elif event.type.endswith("delta"):
            if stack_of_event_types[-1] == event.type:
                continue
            stack_of_event_types.append(event.type)
        elif event.type.endswith("done"):
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            stack_of_event_types.pop()
    assert len(stack_of_event_types) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("background", [True, False])
@ -483,23 +547,7 @@ def call_function(name, args):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling(client: OpenAI, model_name: str):
-    tools = [
+    tools = [GET_WEATHER_SCHEMA]
        {
            "type": "function",
            "name": "get_weather",
            "description": "Get current temperature for provided coordinates in celsius.",  # noqa
            "parameters": {
                "type": "object",
                "properties": {
                    "latitude": {"type": "number"},
                    "longitude": {"type": "number"},
                },
                "required": ["latitude", "longitude"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]
    response = await client.responses.create(
        model=model_name,
@ -565,21 +613,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
            },
            "strict": True,
        },
-        {
+        GET_WEATHER_SCHEMA,
            "type": "function",
            "name": "get_weather",
            "description": "Get current temperature for provided coordinates in celsius.",  # noqa
            "parameters": {
                "type": "object",
                "properties": {
                    "latitude": {"type": "number"},
                    "longitude": {"type": "number"},
                },
                "required": ["latitude", "longitude"],
                "additionalProperties": False,
            },
            "strict": True,
        },
    ]
    response = await client.responses.create(
@ -643,23 +677,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling_required(client: OpenAI, model_name: str):
-    tools = [
+    tools = [GET_WEATHER_SCHEMA]
        {
            "type": "function",
            "name": "get_weather",
            "description": "Get current temperature for provided coordinates in celsius.",  # noqa
            "parameters": {
                "type": "object",
                "properties": {
                    "latitude": {"type": "number"},
                    "longitude": {"type": "number"},
                },
                "required": ["latitude", "longitude"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]
    with pytest.raises(BadRequestError):
        await client.responses.create(
@ -689,23 +707,7 @@ async def test_system_message_with_tools(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling_full_history(client: OpenAI, model_name: str):
-    tools = [
+    tools = [GET_WEATHER_SCHEMA]
        {
            "type": "function",
            "name": "get_weather",
            "description": "Get current temperature for provided coordinates in celsius.",  # noqa
            "parameters": {
                "type": "object",
                "properties": {
                    "latitude": {"type": "number"},
                    "longitude": {"type": "number"},
                },
                "required": ["latitude", "longitude"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]
    input_messages = [
        {"role": "user", "content": "What's the weather like in Paris today?"}
@ -745,6 +747,74 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
    assert response_2.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling_with_stream(client: OpenAI, model_name: str):
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    stream_response = await client.responses.create(
        model=model_name,
        input=input_list,
        tools=tools,
        stream=True,
    )
    assert stream_response is not None
    final_tool_calls = {}
    final_tool_calls_named = {}
    async for event in stream_response:
        if event.type == "response.output_item.added":
            if event.item.type != "function_call":
                continue
            final_tool_calls[event.output_index] = event.item
            final_tool_calls_named[event.item.name] = event.item
        elif event.type == "response.function_call_arguments.delta":
            index = event.output_index
            tool_call = final_tool_calls[index]
            if tool_call:
                tool_call.arguments += event.delta
                final_tool_calls_named[tool_call.name] = tool_call
        elif event.type == "response.function_call_arguments.done":
            assert event.arguments == final_tool_calls_named[event.name].arguments
    for tool_call in final_tool_calls.values():
        if (
            tool_call
            and tool_call.type == "function_call"
            and tool_call.name == "get_weather"
        ):
            args = json.loads(tool_call.arguments)
            result = call_function(tool_call.name, args)
            input_list += [tool_call]
            break
    assert result is not None
    response = await client.responses.create(
        model=model_name,
        input=input_list
        + [
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        stream=True,
    )
    assert response is not None
    async for event in response:
        # check that no function call events in the stream
        assert event.type != "response.function_call_arguments.delta"
        assert event.type != "response.function_call_arguments.done"
        # check that the response contains output text
        if event.type == "response.completed":
            assert len(event.response.output) > 0
            assert event.response.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@ -23,6 +23,8 @@ from openai.types.responses import (
    ResponseCodeInterpreterToolCallParam,
    ResponseContentPartAddedEvent,
    ResponseContentPartDoneEvent,
    ResponseFunctionCallArgumentsDeltaEvent,
    ResponseFunctionCallArgumentsDoneEvent,
    ResponseFunctionToolCall,
    ResponseFunctionWebSearch,
    ResponseOutputItem,
@ -927,6 +929,11 @@ class OpenAIServingResponses(OpenAIServing):
                # to add the tool call request to prev_outputs so that the
                # parse_response_input can find the tool call request when
                # parsing the tool call output.
                if (
                    isinstance(response_msg, dict)
                    and response_msg.get("type") == "function_call"
                ):
                    response_msg = ResponseFunctionToolCall.model_validate(response_msg)
                if isinstance(response_msg, ResponseFunctionToolCall):
                    prev_outputs.append(response_msg)
        return messages
@ -1398,19 +1405,48 @@ class OpenAIServingResponses(OpenAIServing):
        current_output_index = 0
        current_item_id: str = ""
        sent_output_item_added = False
-
+        is_first_function_call_delta = False
        async for ctx in result_generator:
            assert isinstance(ctx, StreamingHarmonyContext)
            if ctx.is_expecting_start():
                current_output_index += 1
                sent_output_item_added = False
-
+                is_first_function_call_delta = False
                if len(ctx.parser.messages) > 0:
                    previous_item = ctx.parser.messages[-1]
                    if previous_item.recipient is not None:
-                        # Deal with tool call here
+                        # Deal with tool call
-                        pass
+                        if previous_item.recipient.startswith("functions."):
                            function_name = previous_item.recipient[len("functions.") :]
                            yield _increment_sequence_number_and_return(
                                ResponseFunctionCallArgumentsDoneEvent(
                                    type="response.function_call_arguments.done",
                                    arguments=previous_item.content[0].text,
                                    name=function_name,
                                    item_id=current_item_id,
                                    output_index=current_output_index,
                                    sequence_number=-1,
                                )
                            )
                            function_call_item = ResponseFunctionToolCall(
                                type="function_call",
                                arguments=previous_item.content[0].text,
                                name=function_name,
                                item_id=current_item_id,
                                output_index=current_output_index,
                                sequence_number=-1,
                                call_id=f"fc_{random_uuid()}",
                                status="completed",
                            )
                            yield _increment_sequence_number_and_return(
                                ResponseOutputItemDoneEvent(
                                    type="response.output_item.done",
                                    sequence_number=-1,
                                    output_index=current_output_index,
                                    item=function_call_item,
                                )
                            )
                    elif previous_item.channel == "analysis":
                        content = ResponseReasoningTextContent(
                            text=previous_item.content[0].text,
@ -1766,6 +1802,43 @@ class OpenAIServingResponses(OpenAIServing):
                            ),
                        )
                    )
            # developer tools will be triggered on the commentary channel
            # and recipient starts with "functions.TOOL_NAME"
            if (
                ctx.parser.current_channel == "commentary"
                and ctx.parser.current_recipient
                and ctx.parser.current_recipient.startswith("functions.")
            ):
                if is_first_function_call_delta is False:
                    is_first_function_call_delta = True
                    fc_name = ctx.parser.current_recipient[len("functions.") :]
                    tool_call_item = ResponseFunctionToolCall(
                        name=fc_name,
                        type="function_call",
                        id=current_item_id,
                        call_id=f"call_{random_uuid()}",
                        arguments="",
                        status="in_progress",
                    )
                    current_item_id = f"fc_{random_uuid()}"
                    yield _increment_sequence_number_and_return(
                        ResponseOutputItemAddedEvent(
                            type="response.output_item.added",
                            sequence_number=-1,
                            output_index=current_output_index,
                            item=tool_call_item,
                        )
                    )
                else:
                    yield _increment_sequence_number_and_return(
                        ResponseFunctionCallArgumentsDeltaEvent(
                            item_id=current_item_id,
                            delta=ctx.parser.last_content_delta,
                            output_index=current_output_index,
                            sequence_number=-1,
                            type="response.function_call_arguments.delta",
                        )
                    )
    async def responses_stream_generator(
        self,