[Frontend] OpenAI Responses API supports Tool/Function calling - non-harmony (#26874)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
This commit is contained in:
Chauncey 2025-11-06 18:40:03 +08:00 committed by GitHub
parent 981cadb35c
commit 59a50afa08
12 changed files with 404 additions and 30 deletions

View File

@@ -0,0 +1,83 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled.
Reasoning models can be used through the Responses API as seen here
https://platform.openai.com/docs/api-reference/responses
For example:
vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
--structured-outputs-config.backend xgrammar \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import json
from openai import OpenAI
from utils import get_first_model
def get_weather(latitude: float, longitude: float) -> str:
"""
Mock function to simulate getting weather data.
In a real application, this would call an external weather API.
"""
return f"Current temperature at ({latitude}, {longitude}) is 20°C."
tools = [
{
"type": "function",
"name": "get_weather",
"description": "Get current temperature for provided coordinates in celsius.",
"parameters": {
"type": "object",
"properties": {
"latitude": {"type": "number"},
"longitude": {"type": "number"},
},
"required": ["latitude", "longitude"],
"additionalProperties": False,
},
"strict": True,
}
]
input_messages = [
{"role": "user", "content": "What's the weather like in Paris today?"}
]
def main():
base_url = "http://0.0.0.0:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = get_first_model(client)
response = client.responses.create(
model=model, input=input_messages, tools=tools, tool_choice="required"
)
for out in response.output:
if out.type == "function_call":
print("Function call:", out.name, out.arguments)
tool_call = out
args = json.loads(tool_call.arguments)
result = get_weather(args["latitude"], args["longitude"])
input_messages.append(tool_call) # append model's function call message
input_messages.append(
{ # append result message
"type": "function_call_output",
"call_id": tool_call.call_id,
"output": str(result),
}
)
response_2 = client.responses.create(
model=model,
input=input_messages,
tools=tools,
)
print(response_2.output_text)
if __name__ == "__main__":
main()
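One natural extension of this example: with tool_choice "auto", or a prompt that touches several tools, a single response may contain more than one function_call item. Below is a minimal sketch (not part of the committed example) that dispatches each call through a name-to-callable map before asking for the final text answer; it reuses json, get_weather, and the client/model/tools defined above, and the TOOL_REGISTRY name is an illustrative helper, not vLLM API.

# Illustrative only: dispatch every function_call item via a registry, then
# feed the tool outputs back for a second, text-only turn.
TOOL_REGISTRY = {"get_weather": get_weather}  # hypothetical helper map

def run_tool_turn(client, model, messages, tools):
    response = client.responses.create(model=model, input=messages, tools=tools)
    for out in response.output:
        if out.type != "function_call":
            continue
        result = TOOL_REGISTRY[out.name](**json.loads(out.arguments))
        messages.append(out)  # echo the model's tool call back to it
        messages.append(
            {
                "type": "function_call_output",
                "call_id": out.call_id,
                "output": str(result),
            }
        )
    # Second turn: the model now sees the tool outputs and can answer in text.
    return client.responses.create(model=model, input=messages, tools=tools)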

View File

@@ -15,8 +15,13 @@ def default_server_args():
"--max-model-len",
"8192",
"--enforce-eager", # For faster startup.
"--enable-auto-tool-choice",
"--structured-outputs-config.backend",
"xgrammar",
"--tool-call-parser",
"hermes",
"--reasoning-parser",
"deepseek_r1",
"qwen3",
]

View File

@@ -0,0 +1,198 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai # use the official client for correctness check
import pytest
MODEL_NAME = "Qwen/Qwen3-1.7B"
tools = [
{
"type": "function",
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. 'Austria'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
"options": {
"$ref": "#/$defs/WeatherOptions",
"description": "Optional parameters for weather query",
},
},
"required": ["country", "unit"],
"$defs": {
"WeatherOptions": {
"title": "WeatherOptions",
"type": "object",
"additionalProperties": False,
"properties": {
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"default": "celsius",
"description": "Temperature unit",
"title": "Temperature Unit",
},
"include_forecast": {
"type": "boolean",
"default": False,
"description": "Whether to include a 24-hour forecast",
"title": "Include Forecast",
},
"language": {
"type": "string",
"default": "zh-CN",
"description": "Language of the response",
"title": "Language",
"enum": ["zh-CN", "en-US", "ja-JP"],
},
},
},
},
},
},
{
"type": "function",
"name": "get_forecast",
"description": "Get the weather forecast for a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the forecast for, e.g. 'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. 'Austria'",
},
"days": {
"type": "integer",
"description": "Number of days to get the forecast for (1-7)",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["country", "days", "unit"],
},
},
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("tool_choice", ["auto", "required"])
async def test_function_tool_use(
client: openai.AsyncOpenAI, model_name: str, tool_choice: str
):
prompt = [
{
"role": "user",
"content": "Can you tell me what the current weather is in Berlin and the "
"forecast for the next 5 days, in fahrenheit?",
},
]
response = await client.responses.create(
model=model_name,
input=prompt,
tools=tools,
tool_choice=tool_choice,
)
assert len(response.output) >= 1
tool_call = None
reasoning = None
for out in response.output:
if out.type == "function_call":
tool_call = out
if out.type == "reasoning":
reasoning = out
assert tool_call is not None
assert tool_call.type == "function_call"
assert json.loads(tool_call.arguments) is not None
assert reasoning is not None
assert reasoning.type == "reasoning"
@pytest.mark.asyncio
async def test_named_tool_use(client: openai.AsyncOpenAI):
def get_weather(latitude: float, longitude: float) -> str:
"""
Mock function to simulate getting weather data.
In a real application, this would call an external weather API.
"""
return f"Current temperature at ({latitude}, {longitude}) is 20°C."
tools = [
{
"type": "function",
"name": "get_weather",
"description": (
"Get current temperature for provided coordinates in celsius."
),
"parameters": {
"type": "object",
"properties": {
"latitude": {"type": "number"},
"longitude": {"type": "number"},
},
"required": ["latitude", "longitude"],
"additionalProperties": False,
},
"strict": True,
}
]
input_messages = [
{"role": "user", "content": "What's the weather like in Paris today?"}
]
response = await client.responses.create(
model=MODEL_NAME,
input=input_messages,
tools=tools,
tool_choice={"type": "function", "name": "get_weather"},
)
assert len(response.output) >= 1
for out in response.output:
if out.type == "function_call":
tool_call = out
assert tool_call is not None
assert tool_call.type == "function_call"
assert tool_call.name == "get_weather"
args = json.loads(tool_call.arguments)
assert args["latitude"] is not None
assert args["longitude"] is not None
# call the tool
result = get_weather(args["latitude"], args["longitude"])
input_messages.append(tool_call) # append model's function call message
input_messages.append(
{ # append result message
"type": "function_call_output",
"call_id": tool_call.call_id,
"output": str(result),
}
)
# create a new response with the tool call result
response_2 = await client.responses.create(model=MODEL_NAME, input=input_messages)
# check the output
assert len(response_2.output_text) > 0

View File

@@ -1098,13 +1098,13 @@ class OpenAIServing:
)
if should_parse_tools:
if not isinstance(request, ChatCompletionRequest):
msg = "Tool usage is only supported for Chat Completions API"
if not isinstance(request, ChatCompletionRequest | ResponsesRequest):
msg = (
"Tool usage is only supported for Chat Completions API "
"or Responses API requests."
)
raise NotImplementedError(msg)
request = tool_parser(tokenizer).adjust_request( # type: ignore
request=request
)
request = tool_parser(tokenizer).adjust_request(request=request) # type: ignore
if tokenizer is None:
assert isinstance(request_prompt, str), (
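The widened check above relies on isinstance accepting a PEP 604 union type (Python 3.10+), so a single test now covers both request classes. A self-contained illustration of the pattern with stand-in classes (these are not the vLLM request classes):

# isinstance with an X | Y union behaves like isinstance(obj, (X, Y)).
class FakeChatRequest: ...
class FakeResponsesRequest: ...

def tools_supported(request: object) -> bool:
    return isinstance(request, FakeChatRequest | FakeResponsesRequest)

assert tools_supported(FakeChatRequest())
assert tools_supported(FakeResponsesRequest())
assert not tools_supported("something else")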

View File

@@ -94,6 +94,7 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.responses_utils import construct_chat_message_with_tool_call
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.logger import init_logger
@@ -196,16 +197,12 @@ class OpenAIServingResponses(OpenAIServing):
self.default_sampling_params["stop_token_ids"].extend(
get_stop_tokens_for_assistant_actions()
)
self.enable_auto_tools = enable_auto_tools
# set up tool use
self.enable_auto_tools: bool = enable_auto_tools
if self.enable_auto_tools:
logger.info(
'"auto" tool choice has been enabled; please note that while'
" the parallel_tool_calls client option is preset for "
"compatibility reasons, it will be ignored."
)
self.tool_parser = self._get_tool_parser(
tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
)
self.exclude_tools_when_tool_choice_none = False
# HACK(woosuk): This is a hack. We should use a better store.
# FIXME: If enable_store=True, this may cause a memory leak since we
# never remove responses from the store.
@@ -511,16 +508,20 @@ class OpenAIServingResponses(OpenAIServing):
prev_response: ResponsesResponse | None,
tokenizer: AnyTokenizer,
):
if len(request.tools) > 0:
raise NotImplementedError(
"Tool use is not supported in Responses API without Harmony"
)
if request.tools is None or (
request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
):
tool_dicts = None
else:
tool_dicts = [tool.model_dump() for tool in request.tools]
# Construct the input messages.
messages = self._construct_input_messages(request, prev_response)
_, request_prompts, engine_prompts = await self._preprocess_chat(
request,
tokenizer,
messages,
tool_dicts=tool_dicts,
tool_parser=self.tool_parser,
chat_template=self.chat_template,
chat_template_content_format=self.chat_template_content_format,
)
@@ -802,7 +803,8 @@ class OpenAIServingResponses(OpenAIServing):
delta=False,
)
output = []
reasoning_item = None
message_item = None
if reasoning_content:
reasoning_item = ResponseReasoningItem(
id=f"rs_{random_uuid()}",
@@ -815,7 +817,13 @@ class OpenAIServingResponses(OpenAIServing):
],
status=None, # NOTE: Only the last output item has status.
)
output.append(reasoning_item)
tool_calls, content = self._parse_tool_calls_from_content(
request=request,
tokenizer=tokenizer,
content=content,
enable_auto_tools=self.enable_auto_tools,
tool_parser_cls=self.tool_parser,
)
if content:
output_text = ResponseOutputText(
text=content,
@@ -832,15 +840,33 @@ class OpenAIServingResponses(OpenAIServing):
else None
),
)
message = ResponseOutputMessage(
message_item = ResponseOutputMessage(
id=f"msg_{random_uuid()}",
content=[output_text],
role="assistant",
status="completed",
type="message",
)
output.append(message)
return output
outputs = []
if reasoning_item:
outputs.append(reasoning_item)
if message_item:
outputs.append(message_item)
if tool_calls:
tool_call_items = [
ResponseFunctionToolCall(
id=f"fc_{random_uuid()}",
call_id=f"call_{random_uuid()}",
type="function_call",
status="completed",
name=tool_call.name,
arguments=tool_call.arguments,
)
for tool_call in tool_calls
]
outputs.extend(tool_call_items)
return outputs
def _make_response_output_items_with_harmony(
self,
@@ -893,7 +919,8 @@ class OpenAIServingResponses(OpenAIServing):
if isinstance(request.input, str):
messages.append({"role": "user", "content": request.input})
else:
messages.extend(request.input) # type: ignore
for item in request.input:
messages.append(construct_chat_message_with_tool_call(item))
return messages
def _construct_harmony_system_input_message(
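With these changes a non-harmony, non-streaming response can carry up to three kinds of output items: an optional reasoning item, an optional assistant message, and one function_call item per parsed tool call. A rough, dict-level sketch of what a client then sees in response.output; the ids and values are made up and optional fields are omitted:

# Illustrative shape only; the server builds typed openai objects, not dicts.
example_output = [
    {"type": "reasoning", "id": "rs_..."},  # present only if reasoning was parsed
    {
        "type": "message",
        "id": "msg_...",
        "role": "assistant",
        "status": "completed",
        "content": [{"type": "output_text", "text": "Let me check the weather."}],
    },
    {
        "type": "function_call",
        "id": "fc_...",
        "call_id": "call_...",
        "name": "get_weather",
        "arguments": '{"latitude": 48.85, "longitude": 2.35}',
        "status": "completed",
    },
]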

View File

@@ -6,10 +6,16 @@ import os
from collections.abc import Callable, Sequence
from functools import cached_property
from openai.types.responses.response_format_text_json_schema_config import (
ResponseFormatTextJSONSchemaConfig,
)
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ExtractedToolCallInformation,
ResponsesRequest,
ResponseTextConfig,
)
from vllm.entrypoints.openai.tool_parsers.utils import get_json_schema_from_tools
from vllm.logger import init_logger
@@ -56,11 +62,21 @@ class ToolParser:
)
# Set structured output params for tool calling
if json_schema_from_tool is not None:
if request.structured_outputs is None:
if isinstance(request, ChatCompletionRequest):
request.structured_outputs = StructuredOutputsParams()
# tool_choice: "Forced Function" or "required" will override
# structured output json settings to make tool calling work correctly
request.structured_outputs.json = json_schema_from_tool
# tool_choice: "Forced Function" or "required" will override
# structured output json settings to make tool calling work correctly
request.structured_outputs.json = json_schema_from_tool
if isinstance(request, ResponsesRequest):
request.text = ResponseTextConfig()
request.text.format = ResponseFormatTextJSONSchemaConfig(
name="tool_calling_response",
schema=json_schema_from_tool,
type="json_schema",
description="Response format for tool calling",
strict=True,
)
return request
def extract_tool_calls(
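For a ResponsesRequest the tool schema is enforced through the Responses text.format field rather than structured_outputs.json. A minimal sketch of the format object the ResponsesRequest branch above builds, with a toy schema standing in for whatever get_json_schema_from_tools derives from the request's tools:

from openai.types.responses.response_format_text_json_schema_config import (
    ResponseFormatTextJSONSchemaConfig,
)

# Toy schema; the real one constrains decoding to a valid tool-call payload.
toy_schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "arguments": {"type": "string"}},
    "required": ["name", "arguments"],
}

fmt = ResponseFormatTextJSONSchemaConfig(
    name="tool_calling_response",
    schema=toy_schema,
    type="json_schema",
    description="Response format for tool calling",
    strict=True,
)
print(fmt.model_dump(by_alias=True))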

View File

@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai.types.chat import (
ChatCompletionAssistantMessageParam,
ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam,
)
from openai.types.chat.chat_completion_message_tool_call_param import (
Function as FunctionCallTool,
)
from openai.types.responses import ResponseFunctionToolCall
from vllm.entrypoints.openai.protocol import (
ChatCompletionMessageParam,
ResponseInputOutputItem,
)
def construct_chat_message_with_tool_call(
item: ResponseInputOutputItem,
) -> ChatCompletionMessageParam:
if isinstance(item, ResponseFunctionToolCall):
# Append the function call as a tool call.
return ChatCompletionAssistantMessageParam(
role="assistant",
tool_calls=[
ChatCompletionMessageToolCallParam(
id=item.call_id,
function=FunctionCallTool(
name=item.name,
arguments=item.arguments,
),
type="function",
)
],
)
elif item.get("type") == "function_call_output":
# Append the function call output as a tool message.
return ChatCompletionToolMessageParam(
role="tool",
content=item.get("output"),
tool_call_id=item.get("call_id"),
)
return item # type: ignore
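A minimal sketch of how this helper would translate one Responses-API tool-call turn into Chat Completions-style messages; the call id and arguments below are made up:

from openai.types.responses import ResponseFunctionToolCall

call = ResponseFunctionToolCall(
    type="function_call",
    call_id="call_123",
    name="get_weather",
    arguments='{"latitude": 48.85, "longitude": 2.35}',
)
assistant_msg = construct_chat_message_with_tool_call(call)
# -> {"role": "assistant", "tool_calls": [{"id": "call_123", "type": "function",
#     "function": {"name": "get_weather", "arguments": "..."}}]}

tool_msg = construct_chat_message_with_tool_call(
    {"type": "function_call_output", "call_id": "call_123", "output": "20°C"}
)
# -> {"role": "tool", "tool_call_id": "call_123", "content": "20°C"}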