[Frontend] OpenAI Responses API supports Tool/Function calling - non-harmony (#26874)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
This commit is contained in:
Chauncey 2025-11-06 18:40:03 +08:00 committed by GitHub
parent 981cadb35c
commit 59a50afa08
12 changed files with 404 additions and 30 deletions

View File

@@ -0,0 +1,83 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled.
Reasoning models can be used through the Responses API as seen here
https://platform.openai.com/docs/api-reference/responses
For example:
vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
--structured-outputs-config.backend xgrammar \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import json
from openai import OpenAI
from utils import get_first_model
def get_weather(latitude: float, longitude: float) -> str:
"""
Mock function to simulate getting weather data.
In a real application, this would call an external weather API.
"""
return f"Current temperature at ({latitude}, {longitude}) is 20°C."
tools = [
{
"type": "function",
"name": "get_weather",
"description": "Get current temperature for provided coordinates in celsius.",
"parameters": {
"type": "object",
"properties": {
"latitude": {"type": "number"},
"longitude": {"type": "number"},
},
"required": ["latitude", "longitude"],
"additionalProperties": False,
},
"strict": True,
}
]
input_messages = [
{"role": "user", "content": "What's the weather like in Paris today?"}
]
def main():
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model, input=input_messages, tools=tools, tool_choice="required"
)
for out in response.output:
if out.type == "function_call":
print("Function call:", out.name, out.arguments)
tool_call = out
args = json.loads(tool_call.arguments)
result = get_weather(args["latitude"], args["longitude"])
input_messages.append(tool_call) # append model's function call message
input_messages.append(
{ # append result message
"type": "function_call_output",
"call_id": tool_call.call_id,
"output": str(result),
}
)
response_2 = client.responses.create(
model=model,
input=input_messages,
tools=tools,
)
print(response_2.output_text)
if __name__ == "__main__":
main()
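One natural extension of this example: with tool_choice "auto", or a prompt that touches several tools, a single response may contain more than one function_call item. Below is a minimal sketch (not part of the committed example) that dispatches each call through a name-to-callable map before asking for the final text answer; it reuses json, get_weather, and the client/model/tools defined above, and the TOOL_REGISTRY name is an illustrative helper, not vLLM API.

# Illustrative only: dispatch every function_call item via a registry, then
# feed the tool outputs back for a second, text-only turn.
TOOL_REGISTRY = {"get_weather": get_weather}  # hypothetical helper map

def run_tool_turn(client, model, messages, tools):
    response = client.responses.create(model=model, input=messages, tools=tools)
    for out in response.output:
        if out.type != "function_call":
            continue
        result = TOOL_REGISTRY[out.name](**json.loads(out.arguments))
        messages.append(out)  # echo the model's tool call back to it
        messages.append(
            {
                "type": "function_call_output",
                "call_id": out.call_id,
                "output": str(result),
            }
        )
    # Second turn: the model now sees the tool outputs and can answer in text.
    return client.responses.create(model=model, input=messages, tools=tools)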

View File

@@ -15,8 +15,13 @@ def default_server_args():
"--max-model-len",
"8192",
"--enforce-eager", # For faster startup.
"--enable-auto-tool-choice",
"--structured-outputs-config.backend",
"xgrammar",
"--tool-call-parser",
"hermes",
"--reasoning-parser",
"deepseek_r1",
"qwen3",
]

View File

@@ -0,0 +1,198 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai # use the official client for correctness check
import pytest
MODEL_NAME = "Qwen/Qwen3-1.7B"
tools = [
{
"type": "function",
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. 'Austria'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
"options": {
"$ref": "#/$defs/WeatherOptions",
"description": "Optional parameters for weather query",
},
},
"required": ["country", "unit"],
"$defs": {
"WeatherOptions": {
"title": "WeatherOptions",
"type": "object",
"additionalProperties": False,
"properties": {
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"default": "celsius",
"description": "Temperature unit",
"title": "Temperature Unit",
},
"include_forecast": {
"type": "boolean",
"default": False,
"description": "Whether to include a 24-hour forecast",
"title": "Include Forecast",
},
"language": {
"type": "string",
"default": "zh-CN",
"description": "Language of the response",
"title": "Language",
"enum": ["zh-CN", "en-US", "ja-JP"],
},
},
},
},
},
},
{
"type": "function",
"name": "get_forecast",
"description": "Get the weather forecast for a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the forecast for, e.g. 'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. 'Austria'",
},
"days": {
"type": "integer",
"description": "Number of days to get the forecast for (1-7)",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["country", "days", "unit"],
},
},
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("tool_choice", ["auto", "required"])
async def test_function_tool_use(
client: openai.AsyncOpenAI, model_name: str, tool_choice: str
):
prompt = [
{
"role": "user",
"content": "Can you tell me what the current weather is in Berlin and the "
"forecast for the next 5 days, in fahrenheit?",
},
]
response = await client.responses.create(
model=model_name,
input=prompt,
tools=tools,
tool_choice=tool_choice,
)
assert len(response.output) >= 1
tool_call = None
reasoning = None
for out in response.output:
if out.type == "function_call":
tool_call = out
if out.type == "reasoning":
reasoning = out
assert tool_call is not None
assert tool_call.type == "function_call"
assert json.loads(tool_call.arguments) is not None
assert reasoning is not None
assert reasoning.type == "reasoning"
@pytest.mark.asyncio
async def test_named_tool_use(client: openai.AsyncOpenAI):
def get_weather(latitude: float, longitude: float) -> str:
"""
Mock function to simulate getting weather data.
In a real application, this would call an external weather API.
"""
return f"Current temperature at ({latitude}, {longitude}) is 20°C."
tools = [
{
"type": "function",
"name": "get_weather",
"description": (
"Get current temperature for provided coordinates in celsius."
),
"parameters": {
"type": "object",
"properties": {
"latitude": {"type": "number"},
"longitude": {"type": "number"},
},
"required": ["latitude", "longitude"],
"additionalProperties": False,
},
"strict": True,
}
]
input_messages = [
{"role": "user", "content": "What's the weather like in Paris today?"}
]
response = await client.responses.create(
model=MODEL_NAME,
input=input_messages,
tools=tools,
tool_choice={"type": "function", "name": "get_weather"},
)
assert len(response.output) >= 1
for out in response.output:
if out.type == "function_call":
tool_call = out
assert tool_call is not None
assert tool_call.type == "function_call"
assert tool_call.name == "get_weather"
args = json.loads(tool_call.arguments)
assert args["latitude"] is not None
assert args["longitude"] is not None
# call the tool
result = get_weather(args["latitude"], args["longitude"])
input_messages.append(tool_call) # append model's function call message
input_messages.append(
{ # append result message
"type": "function_call_output",
"call_id": tool_call.call_id,
"output": str(result),
}
)
# create a new response with the tool call result
response_2 = await client.responses.create(model=MODEL_NAME, input=input_messages)
# check the output
assert len(response_2.output_text) > 0

View File

@@ -1098,13 +1098,13 @@ class OpenAIServing:
)
if should_parse_tools:
if not isinstance(request, ChatCompletionRequest):
msg = "Tool usage is only supported for Chat Completions API"
if not isinstance(request, ChatCompletionRequest | ResponsesRequest):
msg = (
"Tool usage is only supported for Chat Completions API "
"or Responses API requests."
)
raise NotImplementedError(msg)
request = tool_parser(tokenizer).adjust_request( # type: ignore
request=request
)
request = tool_parser(tokenizer).adjust_request(request=request) # type: ignore
if tokenizer is None:
assert isinstance(request_prompt, str), (
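The widened check above relies on isinstance accepting a PEP 604 union type (Python 3.10+), so a single test now covers both request classes. A self-contained illustration of the pattern with stand-in classes (these are not the vLLM request classes):

# isinstance with an X | Y union behaves like isinstance(obj, (X, Y)).
class FakeChatRequest: ...
class FakeResponsesRequest: ...

def tools_supported(request: object) -> bool:
    return isinstance(request, FakeChatRequest | FakeResponsesRequest)

assert tools_supported(FakeChatRequest())
assert tools_supported(FakeResponsesRequest())
assert not tools_supported("something else")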

View File

@@ -94,6 +94,7 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.responses_utils import construct_chat_message_with_tool_call
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.logger import init_logger
@@ -196,16 +197,12 @@ class OpenAIServingResponses(OpenAIServing):
self.default_sampling_params["stop_token_ids"].extend(
get_stop_tokens_for_assistant_actions()
)
self.enable_auto_tools = enable_auto_tools
# set up tool use
self.enable_auto_tools: bool = enable_auto_tools
if self.enable_auto_tools:
logger.info(
'"auto" tool choice has been enabled; please note that while'
" the parallel_tool_calls client option is preset for "
"compatibility reasons, it will be ignored."
)
self.tool_parser = self._get_tool_parser(
tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
)
self.exclude_tools_when_tool_choice_none = False
# HACK(woosuk): This is a hack. We should use a better store.
# FIXME: If enable_store=True, this may cause a memory leak since we
# never remove responses from the store.
@@ -511,16 +508,20 @@ class OpenAIServingResponses(OpenAIServing):
prev_response: ResponsesResponse | None,
tokenizer: AnyTokenizer,
):
if len(request.tools) > 0:
raise NotImplementedError(
"Tool use is not supported in Responses API without Harmony"
)
if request.tools is None or (
request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
):
tool_dicts = None
else:
tool_dicts = [tool.model_dump() for tool in request.tools]
# Construct the input messages.
messages = self._construct_input_messages(request, prev_response)
_, request_prompts, engine_prompts = await self._preprocess_chat(
request,
tokenizer,
messages,
tool_dicts=tool_dicts,
tool_parser=self.tool_parser,
chat_template=self.chat_template,
chat_template_content_format=self.chat_template_content_format,
)
@@ -802,7 +803,8 @@ class OpenAIServingResponses(OpenAIServing):
delta=False,
)
output = []
reasoning_item = None
message_item = None
if reasoning_content:
reasoning_item = ResponseReasoningItem(
id=f"rs_{random_uuid()}",
@@ -815,7 +817,13 @@ class OpenAIServingResponses(OpenAIServing):
],
status=None, # NOTE: Only the last output item has status.
)
output.append(reasoning_item)
tool_calls, content = self._parse_tool_calls_from_content(
request=request,
tokenizer=tokenizer,
content=content,
enable_auto_tools=self.enable_auto_tools,
tool_parser_cls=self.tool_parser,
)
if content:
output_text = ResponseOutputText(
text=content,
@@ -832,15 +840,33 @@ class OpenAIServingResponses(OpenAIServing):
else None
),
)
message = ResponseOutputMessage(
message_item = ResponseOutputMessage(
id=f"msg_{random_uuid()}",
content=[output_text],
role="assistant",
status="completed",
type="message",
)
output.append(message)
return output
outputs = []
if reasoning_item:
outputs.append(reasoning_item)
if message_item:
outputs.append(message_item)
if tool_calls:
tool_call_items = [
ResponseFunctionToolCall(
id=f"fc_{random_uuid()}",
call_id=f"call_{random_uuid()}",
type="function_call",
status="completed",
name=tool_call.name,
arguments=tool_call.arguments,
)
for tool_call in tool_calls
]
outputs.extend(tool_call_items)
return outputs
def _make_response_output_items_with_harmony(
self,
@@ -893,7 +919,8 @@ class OpenAIServingResponses(OpenAIServing):
if isinstance(request.input, str):
messages.append({"role": "user", "content": request.input})
else:
messages.extend(request.input) # type: ignore
for item in request.input:
messages.append(construct_chat_message_with_tool_call(item))
return messages
def _construct_harmony_system_input_message(
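With these changes a non-harmony, non-streaming response can carry up to three kinds of output items: an optional reasoning item, an optional assistant message, and one function_call item per parsed tool call. A rough, dict-level sketch of what a client then sees in response.output; the ids and values are made up and optional fields are omitted:

# Illustrative shape only; the server builds typed openai objects, not dicts.
example_output = [
    {"type": "reasoning", "id": "rs_..."},  # present only if reasoning was parsed
    {
        "type": "message",
        "id": "msg_...",
        "role": "assistant",
        "status": "completed",
        "content": [{"type": "output_text", "text": "Let me check the weather."}],
    },
    {
        "type": "function_call",
        "id": "fc_...",
        "call_id": "call_...",
        "name": "get_weather",
        "arguments": '{"latitude": 48.85, "longitude": 2.35}',
        "status": "completed",
    },
]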

View File

@@ -6,10 +6,16 @@ import os
from collections.abc import Callable, Sequence
from functools import cached_property
from openai.types.responses.response_format_text_json_schema_config import (
ResponseFormatTextJSONSchemaConfig,
)
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ExtractedToolCallInformation,
ResponsesRequest,
ResponseTextConfig,
)
from vllm.entrypoints.openai.tool_parsers.utils import get_json_schema_from_tools
from vllm.logger import init_logger
@@ -56,11 +62,21 @@ class ToolParser:
)
# Set structured output params for tool calling
if json_schema_from_tool is not None:
if request.structured_outputs is None:
if isinstance(request, ChatCompletionRequest):
request.structured_outputs = StructuredOutputsParams()
# tool_choice: "Forced Function" or "required" will override
# structured output json settings to make tool calling work correctly
request.structured_outputs.json = json_schema_from_tool
# tool_choice: "Forced Function" or "required" will override
# structured output json settings to make tool calling work correctly
request.structured_outputs.json = json_schema_from_tool
if isinstance(request, ResponsesRequest):
request.text = ResponseTextConfig()
request.text.format = ResponseFormatTextJSONSchemaConfig(
name="tool_calling_response",
schema=json_schema_from_tool,
type="json_schema",
description="Response format for tool calling",
strict=True,
)
return request
def extract_tool_calls(
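For a ResponsesRequest the tool schema is enforced through the Responses text.format field rather than structured_outputs.json. A minimal sketch of the format object the ResponsesRequest branch above builds, with a toy schema standing in for whatever get_json_schema_from_tools derives from the request's tools:

from openai.types.responses.response_format_text_json_schema_config import (
    ResponseFormatTextJSONSchemaConfig,
)

# Toy schema; the real one constrains decoding to a valid tool-call payload.
toy_schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "arguments": {"type": "string"}},
    "required": ["name", "arguments"],
}

fmt = ResponseFormatTextJSONSchemaConfig(
    name="tool_calling_response",
    schema=toy_schema,
    type="json_schema",
    description="Response format for tool calling",
    strict=True,
)
print(fmt.model_dump(by_alias=True))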

View File

@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai.types.chat import (
ChatCompletionAssistantMessageParam,
ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam,
)
from openai.types.chat.chat_completion_message_tool_call_param import (
Function as FunctionCallTool,
)
from openai.types.responses import ResponseFunctionToolCall
from vllm.entrypoints.openai.protocol import (
ChatCompletionMessageParam,
ResponseInputOutputItem,
)
def construct_chat_message_with_tool_call(
item: ResponseInputOutputItem,
) -> ChatCompletionMessageParam:
if isinstance(item, ResponseFunctionToolCall):
# Append the function call as a tool call.
return ChatCompletionAssistantMessageParam(
role="assistant",
tool_calls=[
ChatCompletionMessageToolCallParam(
id=item.call_id,
function=FunctionCallTool(
name=item.name,
arguments=item.arguments,
),
type="function",
)
],
)
elif item.get("type") == "function_call_output":
# Append the function call output as a tool message.
return ChatCompletionToolMessageParam(
role="tool",
content=item.get("output"),
tool_call_id=item.get("call_id"),
)
return item # type: ignore
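A minimal sketch of how this helper would translate one Responses-API tool-call turn into Chat Completions-style messages; the call id and arguments below are made up:

from openai.types.responses import ResponseFunctionToolCall

call = ResponseFunctionToolCall(
    type="function_call",
    call_id="call_123",
    name="get_weather",
    arguments='{"latitude": 48.85, "longitude": 2.35}',
)
assistant_msg = construct_chat_message_with_tool_call(call)
# -> {"role": "assistant", "tool_calls": [{"id": "call_123", "type": "function",
#     "function": {"name": "get_weather", "arguments": "..."}}]}

tool_msg = construct_chat_message_with_tool_call(
    {"type": "function_call_output", "call_id": "call_123", "output": "20°C"}
)
# -> {"role": "tool", "tool_call_id": "call_123", "content": "20°C"}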