diff --git a/examples/online_serving/openai_responses_client.py b/examples/online_serving/openai_responses_client.py
new file mode 100644
index 000000000000..b4eb24671507
--- /dev/null
+++ b/examples/online_serving/openai_responses_client.py
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Set up this example by starting a vLLM OpenAI-compatible server.
+Reasoning models can be used through the Responses API as seen here
+https://platform.openai.com/docs/api-reference/responses
+For example:
+vllm serve Qwen/Qwen3-8B --reasoning-parser qwen3
+
+"""
+
+from openai import OpenAI
+
+input_messages = [{"role": "user", "content": "What model are you?"}]
+
+
+def main():
+    base_url = "http://localhost:8000/v1"
+    client = OpenAI(base_url=base_url, api_key="empty")
+    model = "Qwen/Qwen3-8B"  # get_first_model(client)
+    response = client.responses.create(
+        model=model,
+        input=input_messages,
+    )
+
+    for message in response.output:
+        if message.type == "reasoning":
+            # append reasoning message
+            input_messages.append(message)
+
+    response_2 = client.responses.create(
+        model=model,
+        input=input_messages,
+    )
+    print(response_2.output_text)
+    # I am Qwen, a large language model developed by Alibaba Cloud.
+    # I am designed to assist with a wide range of tasks, including
+    # answering questions, creating content, coding, and engaging in
+    # conversations. I can help with various topics and provide
+    # information or support in multiple languages. How can I assist you today?
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/entrypoints/openai/test_response_api_simple.py b/tests/entrypoints/openai/test_response_api_simple.py
new file mode 100644
index 000000000000..425b8199a0fd
--- /dev/null
+++ b/tests/entrypoints/openai/test_response_api_simple.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+import pytest_asyncio
+from openai import OpenAI
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen3-8B"
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
+    env_dict = dict(
+        VLLM_ENABLE_RESPONSES_API_STORE="1",
+        # uncomment for tool calling
+        # PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
+    )
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_basic(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is 13 * 24?",
+    )
+    assert response is not None
+    print("response: ", response)
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_reasoning_item(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input=[
+            {"type": "message", "content": "Hello.", "role": "user"},
+            {
+                "type": "reasoning",
+                "id": "lol",
+                "content": [
+                    {
+                        "type": "reasoning_text",
+                        "text": "We need to respond: greeting.",
+                    }
+                ],
+                "summary": [],
+            },
+        ],
+        temperature=0.0,
+    )
+    assert response is not None
+    assert response.status == "completed"
+    # make sure we get a reasoning and text output
+    assert response.output[0].type == "reasoning"
+    assert response.output[1].type == "message"
+    assert type(response.output[1].content[0].text) is str
diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index dea8d2d28f61..6251e1776c30 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -35,7 +35,7 @@ GET_WEATHER_SCHEMA = {
 
 @pytest.fixture(scope="module")
 def server():
-    args = ["--enforce-eager", "--tool-server", "demo"]
+    args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
     env_dict = dict(
         VLLM_ENABLE_RESPONSES_API_STORE="1",
         PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
@@ -550,6 +550,31 @@ def call_function(name, args):
     raise ValueError(f"Unknown function: {name}")
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_reasoning_item(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input=[
+            {"type": "message", "content": "Hello.", "role": "user"},
+            {
+                "type": "reasoning",
+                "id": "lol",
+                "content": [
+                    {
+                        "type": "reasoning_text",
+                        "text": "We need to respond: greeting.",
+                    }
+                ],
+                "summary": [],
+            },
+        ],
+        temperature=0.0,
+    )
+    assert response is not None
+    assert response.status == "completed"
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling(client: OpenAI, model_name: str):
diff --git a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py
index 48bf06088bc0..91c818374e3f 100644
--- a/tests/entrypoints/test_responses_utils.py
+++ b/tests/entrypoints/test_responses_utils.py
@@ -1,7 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
+from openai.types.responses.response_reasoning_item import (
+    Content,
+    ResponseReasoningItem,
+    Summary,
+)
+
 from vllm.entrypoints.responses_utils import (
+    construct_chat_message_with_tool_call,
     convert_tool_responses_to_completions_format,
 )
 
@@ -28,3 +36,53 @@ class TestResponsesUtils:
         result = convert_tool_responses_to_completions_format(input_tool)
 
         assert result == {"type": "function", "function": input_tool}
+
+    def test_construct_chat_message_with_tool_call(self):
+        item = ResponseReasoningItem(
+            id="lol",
+            summary=[],
+            type="reasoning",
+            content=[
+                Content(
+                    text="Leroy Jenkins",
+                    type="reasoning_text",
+                )
+            ],
+            encrypted_content=None,
+            status=None,
+        )
+        formatted_item = construct_chat_message_with_tool_call(item)
+        assert formatted_item["role"] == "assistant"
+        assert formatted_item["reasoning"] == "Leroy Jenkins"
+
+        item = ResponseReasoningItem(
+            id="lol",
+            summary=[
+                Summary(
+                    text='Hmm, the user has just started with a simple "Hello,"',
+                    type="summary_text",
+                )
+            ],
+            type="reasoning",
+            content=None,
+            encrypted_content=None,
+            status=None,
+        )
+
+        formatted_item = construct_chat_message_with_tool_call(item)
+        assert formatted_item["role"] == "assistant"
+        assert (
+            formatted_item["reasoning"]
+            == 'Hmm, the user has just started with a simple "Hello,"'
+        )
+
+        item = ResponseReasoningItem(
+            id="lol",
+            summary=[],
+            type="reasoning",
+            content=None,
+            encrypted_content="TOP_SECRET_MESSAGE",
+            status=None,
+        )
+        with pytest.raises(ValueError):
+            construct_chat_message_with_tool_call(item)
diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py
index d966f58804b6..912e8a690573 100644
--- a/vllm/entrypoints/responses_utils.py
+++ b/vllm/entrypoints/responses_utils.py
@@ -10,6 +10,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
     Function as FunctionCallTool,
 )
 from openai.types.responses import ResponseFunctionToolCall
+from openai.types.responses.response_reasoning_item import ResponseReasoningItem
 from openai.types.responses.tool import Tool
 
 from vllm import envs
@@ -37,6 +38,18 @@ def construct_chat_message_with_tool_call(
             )
         ],
     )
+    elif isinstance(item, ResponseReasoningItem):
+        reasoning_content = ""
+        if item.encrypted_content:
+            raise ValueError("Encrypted content is not supported.")
+        if len(item.summary) == 1:
+            reasoning_content = item.summary[0].text
+        elif item.content and len(item.content) == 1:
+            reasoning_content = item.content[0].text
+        return {
+            "role": "assistant",
+            "reasoning": reasoning_content,
+        }
 elif item.get("type") == "function_call_output":
         # Append the function call output as a tool message.
         return ChatCompletionToolMessageParam(