diff --git a/examples/online_serving/openai_responses_client_with_tools.py b/examples/online_serving/openai_responses_client_with_tools.py
new file mode 100644
index 0000000000000..276010197b5ab
--- /dev/null
+++ b/examples/online_serving/openai_responses_client_with_tools.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Set up this example by starting a vLLM OpenAI-compatible server with tool call
+options enabled.
+Reasoning models can be used through the Responses API as seen here:
+https://platform.openai.com/docs/api-reference/responses
+For example:
+vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
+    --structured-outputs-config.backend xgrammar \
+    --enable-auto-tool-choice --tool-call-parser hermes
+"""
+
+import json
+
+from openai import OpenAI
+from utils import get_first_model
+
+
+def get_weather(latitude: float, longitude: float) -> str:
+    """
+    Mock function to simulate getting weather data.
+    In a real application, this would call an external weather API.
+    """
+    return f"Current temperature at ({latitude}, {longitude}) is 20°C."
+
+
+tools = [
+    {
+        "type": "function",
+        "name": "get_weather",
+        "description": "Get current temperature for provided coordinates in celsius.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "latitude": {"type": "number"},
+                "longitude": {"type": "number"},
+            },
+            "required": ["latitude", "longitude"],
+            "additionalProperties": False,
+        },
+        "strict": True,
+    }
+]
+
+input_messages = [
+    {"role": "user", "content": "What's the weather like in Paris today?"}
+]
+
+
+def main():
+    base_url = "http://0.0.0.0:8000/v1"
+    client = OpenAI(base_url=base_url, api_key="empty")
+    model = get_first_model(client)
+    response = client.responses.create(
+        model=model, input=input_messages, tools=tools, tool_choice="required"
+    )
+
+    for out in response.output:
+        if out.type == "function_call":
+            print("Function call:", out.name, out.arguments)
+            tool_call = out
+            args = json.loads(tool_call.arguments)
+            result = get_weather(args["latitude"], args["longitude"])
+
+            input_messages.append(tool_call)  # append model's function call message
+            input_messages.append(
+                {  # append result message
+                    "type": "function_call_output",
+                    "call_id": tool_call.call_id,
+                    "output": str(result),
+                }
+            )
+            response_2 = client.responses.create(
+                model=model,
+                input=input_messages,
+                tools=tools,
+            )
+            print(response_2.output_text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/v1/entrypoints/openai/responses/__init__.py b/tests/v1/entrypoints/openai/serving_responses/__init__.py
similarity index 100%
rename from tests/v1/entrypoints/openai/responses/__init__.py
rename to tests/v1/entrypoints/openai/serving_responses/__init__.py
diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/serving_responses/conftest.py
similarity index 84%
rename from tests/v1/entrypoints/openai/responses/conftest.py
rename to tests/v1/entrypoints/openai/serving_responses/conftest.py
index 032ed42f43d1b..8081e5fa1d837 100644
--- a/tests/v1/entrypoints/openai/responses/conftest.py
+++ b/tests/v1/entrypoints/openai/serving_responses/conftest.py
@@ -15,8 +15,13 @@ def default_server_args():
         "--max-model-len",
         "8192",
         "--enforce-eager",  # For faster startup.
+ "--enable-auto-tool-choice", + "--structured-outputs-config.backend", + "xgrammar", + "--tool-call-parser", + "hermes", "--reasoning-parser", - "deepseek_r1", + "qwen3", ] diff --git a/tests/v1/entrypoints/openai/responses/test_basic.py b/tests/v1/entrypoints/openai/serving_responses/test_basic.py similarity index 100% rename from tests/v1/entrypoints/openai/responses/test_basic.py rename to tests/v1/entrypoints/openai/serving_responses/test_basic.py diff --git a/tests/v1/entrypoints/openai/serving_responses/test_function_call.py b/tests/v1/entrypoints/openai/serving_responses/test_function_call.py new file mode 100644 index 0000000000000..cf57956a9dea7 --- /dev/null +++ b/tests/v1/entrypoints/openai/serving_responses/test_function_call.py @@ -0,0 +1,198 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import openai # use the official client for correctness check +import pytest + +MODEL_NAME = "Qwen/Qwen3-1.7B" +tools = [ + { + "type": "function", + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to find the weather for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": "string", + "description": "The country that the city is in, e.g. 'Austria'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + "options": { + "$ref": "#/$defs/WeatherOptions", + "description": "Optional parameters for weather query", + }, + }, + "required": ["country", "unit"], + "$defs": { + "WeatherOptions": { + "title": "WeatherOptions", + "type": "object", + "additionalProperties": False, + "properties": { + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "default": "celsius", + "description": "Temperature unit", + "title": "Temperature Unit", + }, + "include_forecast": { + "type": "boolean", + "default": False, + "description": "Whether to include a 24-hour forecast", + "title": "Include Forecast", + }, + "language": { + "type": "string", + "default": "zh-CN", + "description": "Language of the response", + "title": "Language", + "enum": ["zh-CN", "en-US", "ja-JP"], + }, + }, + }, + }, + }, + }, + { + "type": "function", + "name": "get_forecast", + "description": "Get the weather forecast for a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to get the forecast for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": "string", + "description": "The country that the city is in, e.g. 
'Austria'", + }, + "days": { + "type": "integer", + "description": "Number of days to get the forecast for (1-7)", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["country", "days", "unit"], + }, + }, +] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("tool_choice", ["auto", "required"]) +async def test_function_tool_use( + client: openai.AsyncOpenAI, model_name: str, tool_choice: str +): + prompt = [ + { + "role": "user", + "content": "Can you tell me what the current weather is in Berlin and the " + "forecast for the next 5 days, in fahrenheit?", + }, + ] + response = await client.responses.create( + model=model_name, + input=prompt, + tools=tools, + tool_choice=tool_choice, + ) + + assert len(response.output) >= 1 + tool_call = None + reasoning = None + for out in response.output: + if out.type == "function_call": + tool_call = out + if out.type == "reasoning": + reasoning = out + assert tool_call is not None + assert tool_call.type == "function_call" + assert json.loads(tool_call.arguments) is not None + assert reasoning is not None + assert reasoning.type == "reasoning" + + +@pytest.mark.asyncio +async def test_named_tool_use(client: openai.AsyncOpenAI): + def get_weather(latitude: float, longitude: float) -> str: + """ + Mock function to simulate getting weather data. + In a real application, this would call an external weather API. + """ + return f"Current temperature at ({latitude}, {longitude}) is 20°C." + + tools = [ + { + "type": "function", + "name": "get_weather", + "description": ( + "Get current temperature for provided coordinates in celsius." + ), + "parameters": { + "type": "object", + "properties": { + "latitude": {"type": "number"}, + "longitude": {"type": "number"}, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, + } + ] + + input_messages = [ + {"role": "user", "content": "What's the weather like in Paris today?"} + ] + + response = await client.responses.create( + model=MODEL_NAME, + input=input_messages, + tools=tools, + tool_choice={"type": "function", "name": "get_weather"}, + ) + assert len(response.output) >= 1 + for out in response.output: + if out.type == "function_call": + tool_call = out + assert tool_call is not None + assert tool_call.type == "function_call" + assert tool_call.name == "get_weather" + args = json.loads(tool_call.arguments) + assert args["latitude"] is not None + assert args["longitude"] is not None + # call the tool + result = get_weather(args["latitude"], args["longitude"]) + input_messages.append(tool_call) # append model's function call message + input_messages.append( + { # append result message + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + } + ) + # create a new response with the tool call result + response_2 = await client.responses.create(model=MODEL_NAME, input=input_messages) + # check the output + assert len(response_2.output_text) > 0 diff --git a/tests/v1/entrypoints/openai/responses/test_image.py b/tests/v1/entrypoints/openai/serving_responses/test_image.py similarity index 100% rename from tests/v1/entrypoints/openai/responses/test_image.py rename to tests/v1/entrypoints/openai/serving_responses/test_image.py diff --git a/tests/v1/entrypoints/openai/responses/test_stateful.py b/tests/v1/entrypoints/openai/serving_responses/test_stateful.py similarity index 100% 
rename from tests/v1/entrypoints/openai/responses/test_stateful.py
rename to tests/v1/entrypoints/openai/serving_responses/test_stateful.py
diff --git a/tests/v1/entrypoints/openai/responses/test_structured_output.py b/tests/v1/entrypoints/openai/serving_responses/test_structured_output.py
similarity index 100%
rename from tests/v1/entrypoints/openai/responses/test_structured_output.py
rename to tests/v1/entrypoints/openai/serving_responses/test_structured_output.py
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index bafc0e2c372f7..8ce4ff574699d 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1098,13 +1098,13 @@ class OpenAIServing:
             )
 
         if should_parse_tools:
-            if not isinstance(request, ChatCompletionRequest):
-                msg = "Tool usage is only supported for Chat Completions API"
+            if not isinstance(request, ChatCompletionRequest | ResponsesRequest):
+                msg = (
+                    "Tool usage is only supported for Chat Completions API "
+                    "or Responses API requests."
+                )
                 raise NotImplementedError(msg)
-
-            request = tool_parser(tokenizer).adjust_request(  # type: ignore
-                request=request
-            )
+            request = tool_parser(tokenizer).adjust_request(request=request)  # type: ignore
 
         if tokenizer is None:
             assert isinstance(request_prompt, str), (
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index dacf61dee111f..b6fef7d2fafd2 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -94,6 +94,7 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.responses_utils import construct_chat_message_with_tool_call
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
@@ -196,16 +197,12 @@ class OpenAIServingResponses(OpenAIServing):
             self.default_sampling_params["stop_token_ids"].extend(
                 get_stop_tokens_for_assistant_actions()
             )
-
+        self.enable_auto_tools = enable_auto_tools
         # set up tool use
-        self.enable_auto_tools: bool = enable_auto_tools
-        if self.enable_auto_tools:
-            logger.info(
-                '"auto" tool choice has been enabled please note that while'
-                " the parallel_tool_calls client option is preset for "
-                "compatibility reasons, it will be ignored."
-            )
-
+        self.tool_parser = self._get_tool_parser(
+            tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
+        )
+        self.exclude_tools_when_tool_choice_none = False
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
@@ -511,16 +508,20 @@ class OpenAIServingResponses(OpenAIServing):
         prev_response: ResponsesResponse | None,
         tokenizer: AnyTokenizer,
     ):
-        if len(request.tools) > 0:
-            raise NotImplementedError(
-                "Tool use is not supported in Responses API without Harmony"
-            )
+        if request.tools is None or (
+            request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
+        ):
+            tool_dicts = None
+        else:
+            tool_dicts = [tool.model_dump() for tool in request.tools]
         # Construct the input messages.
         messages = self._construct_input_messages(request, prev_response)
         _, request_prompts, engine_prompts = await self._preprocess_chat(
             request,
             tokenizer,
             messages,
+            tool_dicts=tool_dicts,
+            tool_parser=self.tool_parser,
             chat_template=self.chat_template,
             chat_template_content_format=self.chat_template_content_format,
         )
@@ -802,7 +803,8 @@ class OpenAIServingResponses(OpenAIServing):
             delta=False,
         )
 
-        output = []
+        reasoning_item = None
+        message_item = None
         if reasoning_content:
             reasoning_item = ResponseReasoningItem(
                 id=f"rs_{random_uuid()}",
@@ -815,7 +817,13 @@
                 ],
                 status=None,  # NOTE: Only the last output item has status.
             )
-            output.append(reasoning_item)
+        tool_calls, content = self._parse_tool_calls_from_content(
+            request=request,
+            tokenizer=tokenizer,
+            content=content,
+            enable_auto_tools=self.enable_auto_tools,
+            tool_parser_cls=self.tool_parser,
+        )
         if content:
             output_text = ResponseOutputText(
                 text=content,
@@ -832,15 +840,33 @@
                     else None
                 ),
             )
-            message = ResponseOutputMessage(
+            message_item = ResponseOutputMessage(
                 id=f"msg_{random_uuid()}",
                 content=[output_text],
                 role="assistant",
                 status="completed",
                 type="message",
             )
-            output.append(message)
-        return output
+        outputs = []
+
+        if reasoning_item:
+            outputs.append(reasoning_item)
+        if message_item:
+            outputs.append(message_item)
+        if tool_calls:
+            tool_call_items = [
+                ResponseFunctionToolCall(
+                    id=f"fc_{random_uuid()}",
+                    call_id=f"call_{random_uuid()}",
+                    type="function_call",
+                    status="completed",
+                    name=tool_call.name,
+                    arguments=tool_call.arguments,
+                )
+                for tool_call in tool_calls
+            ]
+            outputs.extend(tool_call_items)
+        return outputs
 
     def _make_response_output_items_with_harmony(
         self,
@@ -893,7 +919,8 @@ class OpenAIServingResponses(OpenAIServing):
         if isinstance(request.input, str):
             messages.append({"role": "user", "content": request.input})
         else:
-            messages.extend(request.input)  # type: ignore
+            for item in request.input:
+                messages.append(construct_chat_message_with_tool_call(item))
         return messages
 
     def _construct_harmony_system_input_message(
diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index 950139c69c29a..e99e405f5de65 100644
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -6,10 +6,16 @@ import os
 from collections.abc import Callable, Sequence
 from functools import cached_property
 
+from openai.types.responses.response_format_text_json_schema_config import (
+    ResponseFormatTextJSONSchemaConfig,
+)
+
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     DeltaMessage,
     ExtractedToolCallInformation,
+    ResponsesRequest,
+    ResponseTextConfig,
 )
 from vllm.entrypoints.openai.tool_parsers.utils import get_json_schema_from_tools
 from vllm.logger import init_logger
@@ -56,11 +62,21 @@ class ToolParser:
         )
         # Set structured output params for tool calling
         if json_schema_from_tool is not None:
-            if request.structured_outputs is None:
+            if isinstance(request, ChatCompletionRequest):
                 request.structured_outputs = StructuredOutputsParams()
-            # tool_choice: "Forced Function" or "required" will override
-            # structured output json settings to make tool calling work correctly
-            request.structured_outputs.json = json_schema_from_tool
+                # tool_choice: "Forced Function" or "required" will override
+                # structured output json settings to make tool calling work correctly
+                request.structured_outputs.json = json_schema_from_tool
+            if isinstance(request, ResponsesRequest):
+                request.text = ResponseTextConfig()
+                request.text.format = ResponseFormatTextJSONSchemaConfig(
+                    name="tool_calling_response",
+                    schema=json_schema_from_tool,
+                    type="json_schema",
+                    description="Response format for tool calling",
+                    strict=True,
+                )
+
         return request
 
     def extract_tool_calls(
diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py
new file mode 100644
index 0000000000000..6eb7c0b70a670
--- /dev/null
+++ b/vllm/entrypoints/responses_utils.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from openai.types.chat import (
+    ChatCompletionAssistantMessageParam,
+    ChatCompletionMessageToolCallParam,
+    ChatCompletionToolMessageParam,
+)
+from openai.types.chat.chat_completion_message_tool_call_param import (
+    Function as FunctionCallTool,
+)
+from openai.types.responses import ResponseFunctionToolCall
+
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionMessageParam,
+    ResponseInputOutputItem,
+)
+
+
+def construct_chat_message_with_tool_call(
+    item: ResponseInputOutputItem,
+) -> ChatCompletionMessageParam:
+    if isinstance(item, ResponseFunctionToolCall):
+        # Append the function call as a tool call.
+        return ChatCompletionAssistantMessageParam(
+            role="assistant",
+            tool_calls=[
+                ChatCompletionMessageToolCallParam(
+                    id=item.call_id,
+                    function=FunctionCallTool(
+                        name=item.name,
+                        arguments=item.arguments,
+                    ),
+                    type="function",
+                )
+            ],
+        )
+    elif item.get("type") == "function_call_output":
+        # Append the function call output as a tool message.
+        return ChatCompletionToolMessageParam(
+            role="tool",
+            content=item.get("output"),
+            tool_call_id=item.get("call_id"),
+        )
+    return item  # type: ignore
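
For readers tracing the new Responses-API tool-call flow end to end, here is a minimal sketch (not part of the patch) of what the new construct_chat_message_with_tool_call helper returns for the two item shapes it handles; the ids, coordinates, and output string below are made-up illustration values, not taken from the diff.

# Illustrative only: example ids and arguments are invented.
from openai.types.responses import ResponseFunctionToolCall

from vllm.entrypoints.responses_utils import construct_chat_message_with_tool_call

# A prior function call emitted by the model, as it appears in request.input.
call = ResponseFunctionToolCall(
    id="fc_example",
    call_id="call_example",
    type="function_call",
    name="get_weather",
    arguments='{"latitude": 48.85, "longitude": 2.35}',
)
# The tool result the client appended for that call.
result = {
    "type": "function_call_output",
    "call_id": "call_example",
    "output": "Current temperature at (48.85, 2.35) is 20°C.",
}

# The call is converted to an assistant message carrying tool_calls, and the
# result to a role="tool" message keyed by the same call_id; these are the chat
# messages that _construct_input_messages now hands to the chat template.
print(construct_chat_message_with_tool_call(call))
print(construct_chat_message_with_tool_call(result))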