Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-15 12:16:13 +08:00)
[Frontend] OpenAI Responses API supports Tool/Function calling - non-harmony (#26874)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
This commit is contained in: parent 981cadb35c, commit 59a50afa08
@@ -0,0 +1,83 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled.
Reasoning models can be used through the Responses API as seen here
https://platform.openai.com/docs/api-reference/responses
For example:
vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
    --structured-outputs-config.backend xgrammar \
    --enable-auto-tool-choice --tool-call-parser hermes
"""

import json

from openai import OpenAI
from utils import get_first_model


def get_weather(latitude: float, longitude: float) -> str:
    """
    Mock function to simulate getting weather data.
    In a real application, this would call an external weather API.
    """
    return f"Current temperature at ({latitude}, {longitude}) is 20°C."


tools = [
    {
        "type": "function",
        "name": "get_weather",
        "description": "Get current temperature for provided coordinates in celsius.",
        "parameters": {
            "type": "object",
            "properties": {
                "latitude": {"type": "number"},
                "longitude": {"type": "number"},
            },
            "required": ["latitude", "longitude"],
            "additionalProperties": False,
        },
        "strict": True,
    }
]

input_messages = [
    {"role": "user", "content": "What's the weather like in Paris today?"}
]


def main():
    base_url = "http://0.0.0.0:8000/v1"
    client = OpenAI(base_url=base_url, api_key="empty")
    model = get_first_model(client)
    response = client.responses.create(
        model=model, input=input_messages, tools=tools, tool_choice="required"
    )

    for out in response.output:
        if out.type == "function_call":
            print("Function call:", out.name, out.arguments)
            tool_call = out
            args = json.loads(tool_call.arguments)
            result = get_weather(args["latitude"], args["longitude"])

            input_messages.append(tool_call)  # append model's function call message
            input_messages.append(
                {  # append result message
                    "type": "function_call_output",
                    "call_id": tool_call.call_id,
                    "output": str(result),
                }
            )
            response_2 = client.responses.create(
                model=model,
                input=input_messages,
                tools=tools,
            )
            print(response_2.output_text)


if __name__ == "__main__":
    main()
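Editor's note (not part of the commit): the items the example's loop iterates over are the typed output objects returned by the openai client. A minimal sketch of a function_call item and of the same json.loads parsing step the example performs, using invented ids and coordinates:

import json

from openai.types.responses import ResponseFunctionToolCall

# Invented values for illustration; real call_id values are generated by the server.
item = ResponseFunctionToolCall(
    type="function_call",
    call_id="call_example",
    name="get_weather",
    arguments='{"latitude": 48.8566, "longitude": 2.3522}',
)
assert item.type == "function_call"
args = json.loads(item.arguments)  # same parsing step as in the example above
print(item.name, args["latitude"], args["longitude"])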
@@ -15,8 +15,13 @@ def default_server_args():
         "--max-model-len",
         "8192",
         "--enforce-eager",  # For faster startup.
+        "--enable-auto-tool-choice",
+        "--structured-outputs-config.backend",
+        "xgrammar",
+        "--tool-call-parser",
+        "hermes",
         "--reasoning-parser",
-        "deepseek_r1",
+        "qwen3",
     ]


@@ -0,0 +1,198 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import openai  # use the official client for correctness check
import pytest

MODEL_NAME = "Qwen/Qwen3-1.7B"
tools = [
    {
        "type": "function",
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "The city to find the weather for, e.g. 'Vienna'",
                    "default": "Vienna",
                },
                "country": {
                    "type": "string",
                    "description": "The country that the city is in, e.g. 'Austria'",
                },
                "unit": {
                    "type": "string",
                    "description": "The unit to fetch the temperature in",
                    "enum": ["celsius", "fahrenheit"],
                },
                "options": {
                    "$ref": "#/$defs/WeatherOptions",
                    "description": "Optional parameters for weather query",
                },
            },
            "required": ["country", "unit"],
            "$defs": {
                "WeatherOptions": {
                    "title": "WeatherOptions",
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "default": "celsius",
                            "description": "Temperature unit",
                            "title": "Temperature Unit",
                        },
                        "include_forecast": {
                            "type": "boolean",
                            "default": False,
                            "description": "Whether to include a 24-hour forecast",
                            "title": "Include Forecast",
                        },
                        "language": {
                            "type": "string",
                            "default": "zh-CN",
                            "description": "Language of the response",
                            "title": "Language",
                            "enum": ["zh-CN", "en-US", "ja-JP"],
                        },
                    },
                },
            },
        },
    },
    {
        "type": "function",
        "name": "get_forecast",
        "description": "Get the weather forecast for a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "The city to get the forecast for, e.g. 'Vienna'",
                    "default": "Vienna",
                },
                "country": {
                    "type": "string",
                    "description": "The country that the city is in, e.g. 'Austria'",
                },
                "days": {
                    "type": "integer",
                    "description": "Number of days to get the forecast for (1-7)",
                },
                "unit": {
                    "type": "string",
                    "description": "The unit to fetch the temperature in",
                    "enum": ["celsius", "fahrenheit"],
                },
            },
            "required": ["country", "days", "unit"],
        },
    },
]


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("tool_choice", ["auto", "required"])
async def test_function_tool_use(
    client: openai.AsyncOpenAI, model_name: str, tool_choice: str
):
    prompt = [
        {
            "role": "user",
            "content": "Can you tell me what the current weather is in Berlin and the "
            "forecast for the next 5 days, in fahrenheit?",
        },
    ]
    response = await client.responses.create(
        model=model_name,
        input=prompt,
        tools=tools,
        tool_choice=tool_choice,
    )

    assert len(response.output) >= 1
    tool_call = None
    reasoning = None
    for out in response.output:
        if out.type == "function_call":
            tool_call = out
        if out.type == "reasoning":
            reasoning = out
    assert tool_call is not None
    assert tool_call.type == "function_call"
    assert json.loads(tool_call.arguments) is not None
    assert reasoning is not None
    assert reasoning.type == "reasoning"


@pytest.mark.asyncio
async def test_named_tool_use(client: openai.AsyncOpenAI):
    def get_weather(latitude: float, longitude: float) -> str:
        """
        Mock function to simulate getting weather data.
        In a real application, this would call an external weather API.
        """
        return f"Current temperature at ({latitude}, {longitude}) is 20°C."

    tools = [
        {
            "type": "function",
            "name": "get_weather",
            "description": (
                "Get current temperature for provided coordinates in celsius."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "latitude": {"type": "number"},
                    "longitude": {"type": "number"},
                },
                "required": ["latitude", "longitude"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]

    input_messages = [
        {"role": "user", "content": "What's the weather like in Paris today?"}
    ]

    response = await client.responses.create(
        model=MODEL_NAME,
        input=input_messages,
        tools=tools,
        tool_choice={"type": "function", "name": "get_weather"},
    )
    assert len(response.output) >= 1
    for out in response.output:
        if out.type == "function_call":
            tool_call = out
    assert tool_call is not None
    assert tool_call.type == "function_call"
    assert tool_call.name == "get_weather"
    args = json.loads(tool_call.arguments)
    assert args["latitude"] is not None
    assert args["longitude"] is not None
    # call the tool
    result = get_weather(args["latitude"], args["longitude"])
    input_messages.append(tool_call)  # append model's function call message
    input_messages.append(
        {  # append result message
            "type": "function_call_output",
            "call_id": tool_call.call_id,
            "output": str(result),
        }
    )
    # create a new response with the tool call result
    response_2 = await client.responses.create(model=MODEL_NAME, input=input_messages)
    # check the output
    assert len(response_2.output_text) > 0
@@ -1098,13 +1098,13 @@ class OpenAIServing:
             )

         if should_parse_tools:
-            if not isinstance(request, ChatCompletionRequest):
-                msg = "Tool usage is only supported for Chat Completions API"
-                raise NotImplementedError(msg)
-
-            request = tool_parser(tokenizer).adjust_request(  # type: ignore
-                request=request
-            )
+            if not isinstance(request, ChatCompletionRequest | ResponsesRequest):
+                msg = (
+                    "Tool usage is only supported for Chat Completions API "
+                    "or Responses API requests."
+                )
+                raise NotImplementedError(msg)
+            request = tool_parser(tokenizer).adjust_request(request=request)  # type: ignore

         if tokenizer is None:
             assert isinstance(request_prompt, str), (
@@ -94,6 +94,7 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.responses_utils import construct_chat_message_with_tool_call
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger

@@ -196,16 +197,12 @@ class OpenAIServingResponses(OpenAIServing):
         self.default_sampling_params["stop_token_ids"].extend(
             get_stop_tokens_for_assistant_actions()
         )
-
+        self.enable_auto_tools = enable_auto_tools
         # set up tool use
-        self.enable_auto_tools: bool = enable_auto_tools
-        if self.enable_auto_tools:
-            logger.info(
-                '"auto" tool choice has been enabled please note that while'
-                " the parallel_tool_calls client option is preset for "
-                "compatibility reasons, it will be ignored."
-            )
-
+        self.tool_parser = self._get_tool_parser(
+            tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
+        )
+        self.exclude_tools_when_tool_choice_none = False
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.

@@ -511,16 +508,20 @@ class OpenAIServingResponses(OpenAIServing):
         prev_response: ResponsesResponse | None,
         tokenizer: AnyTokenizer,
     ):
-        if len(request.tools) > 0:
-            raise NotImplementedError(
-                "Tool use is not supported in Responses API without Harmony"
-            )
+        if request.tools is None or (
+            request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
+        ):
+            tool_dicts = None
+        else:
+            tool_dicts = [tool.model_dump() for tool in request.tools]
         # Construct the input messages.
         messages = self._construct_input_messages(request, prev_response)
         _, request_prompts, engine_prompts = await self._preprocess_chat(
             request,
             tokenizer,
             messages,
+            tool_dicts=tool_dicts,
+            tool_parser=self.tool_parser,
             chat_template=self.chat_template,
             chat_template_content_format=self.chat_template_content_format,
         )

@@ -802,7 +803,8 @@ class OpenAIServingResponses(OpenAIServing):
             delta=False,
         )

-        output = []
+        reasoning_item = None
+        message_item = None
         if reasoning_content:
             reasoning_item = ResponseReasoningItem(
                 id=f"rs_{random_uuid()}",

@@ -815,7 +817,13 @@ class OpenAIServingResponses(OpenAIServing):
                 ],
                 status=None,  # NOTE: Only the last output item has status.
             )
-            output.append(reasoning_item)
+        tool_calls, content = self._parse_tool_calls_from_content(
+            request=request,
+            tokenizer=tokenizer,
+            content=content,
+            enable_auto_tools=self.enable_auto_tools,
+            tool_parser_cls=self.tool_parser,
+        )
         if content:
             output_text = ResponseOutputText(
                 text=content,

@@ -832,15 +840,33 @@ class OpenAIServingResponses(OpenAIServing):
                     else None
                 ),
             )
-            message = ResponseOutputMessage(
+            message_item = ResponseOutputMessage(
                 id=f"msg_{random_uuid()}",
                 content=[output_text],
                 role="assistant",
                 status="completed",
                 type="message",
             )
-            output.append(message)
-        return output
+        outputs = []
+
+        if reasoning_item:
+            outputs.append(reasoning_item)
+        if message_item:
+            outputs.append(message_item)
+        if tool_calls:
+            tool_call_items = [
+                ResponseFunctionToolCall(
+                    id=f"fc_{random_uuid()}",
+                    call_id=f"call_{random_uuid()}",
+                    type="function_call",
+                    status="completed",
+                    name=tool_call.name,
+                    arguments=tool_call.arguments,
+                )
+                for tool_call in tool_calls
+            ]
+            outputs.extend(tool_call_items)
+        return outputs

     def _make_response_output_items_with_harmony(
         self,

@@ -893,7 +919,8 @@ class OpenAIServingResponses(OpenAIServing):
         if isinstance(request.input, str):
             messages.append({"role": "user", "content": request.input})
         else:
-            messages.extend(request.input)  # type: ignore
+            for item in request.input:
+                messages.append(construct_chat_message_with_tool_call(item))
         return messages

     def _construct_harmony_system_input_message(
@@ -6,10 +6,16 @@ import os
 from collections.abc import Callable, Sequence
 from functools import cached_property

+from openai.types.responses.response_format_text_json_schema_config import (
+    ResponseFormatTextJSONSchemaConfig,
+)
+
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     DeltaMessage,
     ExtractedToolCallInformation,
+    ResponsesRequest,
+    ResponseTextConfig,
 )
 from vllm.entrypoints.openai.tool_parsers.utils import get_json_schema_from_tools
 from vllm.logger import init_logger

@@ -56,11 +62,21 @@ class ToolParser:
         )
         # Set structured output params for tool calling
         if json_schema_from_tool is not None:
-            if request.structured_outputs is None:
+            if isinstance(request, ChatCompletionRequest):
                 request.structured_outputs = StructuredOutputsParams()
-            # tool_choice: "Forced Function" or "required" will override
-            # structured output json settings to make tool calling work correctly
-            request.structured_outputs.json = json_schema_from_tool
+                # tool_choice: "Forced Function" or "required" will override
+                # structured output json settings to make tool calling work correctly
+                request.structured_outputs.json = json_schema_from_tool
+            if isinstance(request, ResponsesRequest):
+                request.text = ResponseTextConfig()
+                request.text.format = ResponseFormatTextJSONSchemaConfig(
+                    name="tool_calling_response",
+                    schema=json_schema_from_tool,
+                    type="json_schema",
+                    description="Response format for tool calling",
+                    strict=True,
+                )

         return request

     def extract_tool_calls(
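Editor's note (not part of the commit): for a ResponsesRequest, the adjusted request enforces the tool-call format through the Responses text format rather than through structured_outputs. A minimal sketch of the object that branch builds, with a toy placeholder schema standing in for whatever get_json_schema_from_tools returns (the real schema is derived from the request's tools and tool_choice):

from openai.types.responses.response_format_text_json_schema_config import (
    ResponseFormatTextJSONSchemaConfig,
)

# Toy placeholder schema; the real one comes from get_json_schema_from_tools().
toy_schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "parameters": {"type": "object"}},
    "required": ["name", "parameters"],
}
fmt = ResponseFormatTextJSONSchemaConfig(
    name="tool_calling_response",
    schema=toy_schema,
    type="json_schema",
    description="Response format for tool calling",
    strict=True,
)
print(fmt.model_dump(by_alias=True))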
vllm/entrypoints/responses_utils.py (new file, 45 lines)
@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from openai.types.chat import (
    ChatCompletionAssistantMessageParam,
    ChatCompletionMessageToolCallParam,
    ChatCompletionToolMessageParam,
)
from openai.types.chat.chat_completion_message_tool_call_param import (
    Function as FunctionCallTool,
)
from openai.types.responses import ResponseFunctionToolCall

from vllm.entrypoints.openai.protocol import (
    ChatCompletionMessageParam,
    ResponseInputOutputItem,
)


def construct_chat_message_with_tool_call(
    item: ResponseInputOutputItem,
) -> ChatCompletionMessageParam:
    if isinstance(item, ResponseFunctionToolCall):
        # Append the function call as a tool call.
        return ChatCompletionAssistantMessageParam(
            role="assistant",
            tool_calls=[
                ChatCompletionMessageToolCallParam(
                    id=item.call_id,
                    function=FunctionCallTool(
                        name=item.name,
                        arguments=item.arguments,
                    ),
                    type="function",
                )
            ],
        )
    elif item.get("type") == "function_call_output":
        # Append the function call output as a tool message.
        return ChatCompletionToolMessageParam(
            role="tool",
            content=item.get("output"),
            tool_call_id=item.get("call_id"),
        )
    return item  # type: ignore
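Editor's note (not part of the commit): a hypothetical usage sketch of how this helper rewrites Responses API input items into chat-style messages before chat templating; the ids and output text below are invented:

from openai.types.responses import ResponseFunctionToolCall

from vllm.entrypoints.responses_utils import construct_chat_message_with_tool_call

# A previously returned function call becomes an assistant message with tool_calls.
call = ResponseFunctionToolCall(
    type="function_call",
    call_id="call_example",
    name="get_weather",
    arguments='{"latitude": 48.8566, "longitude": 2.3522}',
)
print(construct_chat_message_with_tool_call(call))

# The matching tool result item becomes a role="tool" message keyed by call_id.
result_item = {
    "type": "function_call_output",
    "call_id": "call_example",
    "output": "Current temperature at (48.8566, 2.3522) is 20°C.",
}
print(construct_chat_message_with_tool_call(result_item))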