diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 6292306e7cdb..e817f07ef594 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -1,15 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
 import logging
 from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Union
 
-from openai_harmony import Message, Role, StreamState
+from openai_harmony import Author, Message, Role, StreamState, TextContent
 
 from vllm.entrypoints.harmony_utils import (
     get_encoding, get_streamable_parser_for_assistant, render_for_completion)
 from vllm.entrypoints.tool import Tool
 from vllm.outputs import RequestOutput
 
+if TYPE_CHECKING:
+    from mcp.client import ClientSession
+
 logger = logging.getLogger(__name__)
@@ -71,6 +76,7 @@ class HarmonyContext(ConversationContext):
     def append_output(self, output) -> None:
         if isinstance(output, RequestOutput):
             output_token_ids = output.outputs[0].token_ids
+            self.parser = get_streamable_parser_for_assistant()
             for token_id in output_token_ids:
                 self.parser.process(token_id)
             output_msgs = self.parser.messages
@@ -106,19 +112,41 @@ class HarmonyContext(ConversationContext):
     def render_for_completion(self) -> list[int]:
         return render_for_completion(self.messages)
 
-    async def call_search_tool(
-        self,
-        tool_session: Tool,
-        last_msg: Message,
-    ) -> list[Message]:
-        return await tool_session.get_result(self)
+    async def call_search_tool(self, tool_session: Union["ClientSession",
+                                                         Tool],
+                               last_msg: Message) -> list[Message]:
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        tool_name = last_msg.recipient.split(".")[1]
+        args = json.loads(last_msg.content[0].text)
+        result = await tool_session.call_tool(tool_name, args)
+        result_str = result.content[0].text
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name=last_msg.recipient)
+        return [
+            Message(author=author, content=[content], recipient=Role.ASSISTANT)
+        ]
 
-    async def call_python_tool(
-        self,
-        tool_session: Tool,
-        last_msg: Message,
-    ) -> list[Message]:
-        return await tool_session.get_result(self)
+    async def call_python_tool(self, tool_session: Union["ClientSession",
+                                                         Tool],
+                               last_msg: Message) -> list[Message]:
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        param = {
+            "code": last_msg.content[0].text,
+        }
+        result = await tool_session.call_tool("python", param)
+        result_str = result.content[0].text
+
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name="python")
+
+        return [
+            Message(author=author,
+                    content=[content],
+                    channel=last_msg.channel,
+                    recipient=Role.ASSISTANT)
+        ]
 
 
 class StreamingHarmonyContext(HarmonyContext):
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 3b9f4b544e45..543701ed144e 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -19,8 +19,8 @@ from openai.types.chat.chat_completion_message import (
 # yapf: enable
 from openai.types.responses import (ResponseFunctionToolCall,
                                     ResponseInputItemParam, ResponseOutputItem,
-                                    ResponsePrompt, ResponseStatus,
-                                    ResponseTextConfig)
+                                    ResponsePrompt, ResponseReasoningItem,
+                                    ResponseStatus, ResponseTextConfig)
 from openai.types.responses.response import ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning
@@ -239,6 +239,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
 
 ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
+                                           ResponseReasoningItem,
                                            ResponseFunctionToolCall]
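The new call_search_tool/call_python_tool overloads above dispatch on the session type: a built-in Tool keeps the old get_result path, while an MCP ClientSession derives the tool name from the Harmony recipient (e.g. "browser.search"), JSON-decodes the arguments from the message text, and wraps the result back into a TOOL-role Harmony message. A minimal sketch of that dispatch shape follows; FakeSession, _TextBlock, and _ToolResult are illustrative stand-ins, not the real mcp API.

# Sketch only: FakeSession stands in for an MCP ClientSession; the result
# shape (.content[0].text) mirrors what the diff above reads back.
import asyncio
import json
from dataclasses import dataclass, field

@dataclass
class _TextBlock:
    text: str

@dataclass
class _ToolResult:
    content: list = field(default_factory=list)

class FakeSession:
    async def call_tool(self, name: str, args: dict) -> _ToolResult:
        # A real session would forward this call to the MCP tool server.
        return _ToolResult(content=[_TextBlock(text=f"{name}: {args}")])

async def demo() -> None:
    recipient = "browser.search"         # Harmony recipient on the last message
    payload = '{"query": "vllm mcp"}'    # JSON args carried in the message text
    tool_name = recipient.split(".")[1]  # -> "search"
    result = await FakeSession().call_tool(tool_name, json.loads(payload))
    print(result.content[0].text)        # -> search: {'query': 'vllm mcp'}

asyncio.run(demo())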
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 089f50a1e6a3..86c16df40e69 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -16,8 +16,7 @@ from fastapi import Request
 from openai import BaseModel
 # yapf conflicts with isort for this block
 # yapf: disable
-from openai.types.responses import (ResponseContentPartDoneEvent,
-                                    ResponseCreatedEvent,
+from openai.types.responses import (ResponseCreatedEvent,
                                     ResponseFunctionToolCall,
                                     ResponseInProgressEvent,
                                     ResponseOutputItem,
@@ -54,7 +53,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
 # yapf: enable
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.tool_server import ToolServer
+from vllm.entrypoints.tool_server import MCPToolServer, ToolServer
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.outputs import CompletionOutput
@@ -238,6 +237,15 @@ class OpenAIServingResponses(OpenAIServing):
         if raw_request:
             raw_request.state.request_metadata = request_metadata
 
+        if self.tool_server is not None and isinstance(
+                self.tool_server, MCPToolServer
+        ) and (request.background or request.stream) and request.tools and any(
+                tool.type in ["web_search_preview", "code_interpreter"]
+                for tool in request.tools):
+            return self.create_error_response(
+                "MCP tool server is not supported in background mode and "
+                "streaming mode")
+
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[ConversationContext, None]] = []
@@ -844,9 +852,13 @@ class OpenAIServingResponses(OpenAIServing):
                             type="reasoning",
                             content=[
                                 ResponseReasoningTextContent(
-                                    text=previous_item.content[0].text),
+                                    text=previous_item.content[0].text,
+                                    type="reasoning_text",
+                                ),
                             ],
                             status="completed",
+                            id=current_item_id,
+                            summary=[],
                         )
                         yield _send_event(
                             ResponseReasoningTextDoneEvent(
@@ -857,15 +869,6 @@ class OpenAIServingResponses(OpenAIServing):
                                 content_index=current_content_index,
                                 text=previous_item.content[0].text,
                             ))
-                        yield _send_event(
-                            ResponseContentPartDoneEvent(
-                                type="response.content_part.done",
-                                item_id=current_item_id,
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                content_index=current_content_index,
-                                part=reasoning_item,
-                            ))
                         yield _send_event(
                             ResponseOutputItemDoneEvent(
                                 type="response.output_item.done",
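The guard added early in the request handler above refuses web_search_preview and code_interpreter tools whenever the configured tool server is an MCPToolServer and the request asks for background or streaming mode. The predicate in isolation, for reference; Req and ToolSpec are hypothetical stand-ins for the real request models, kept only to show which combinations are rejected.

# Stand-alone version of the rejection predicate; Req and ToolSpec are
# hypothetical stand-ins, not vLLM's request/tool types.
from dataclasses import dataclass

@dataclass
class ToolSpec:
    type: str

@dataclass
class Req:
    background: bool
    stream: bool
    tools: list

def must_reject(req: Req, server_is_mcp: bool) -> bool:
    return (server_is_mcp and (req.background or req.stream)
            and bool(req.tools)
            and any(t.type in ("web_search_preview", "code_interpreter")
                    for t in req.tools))

print(must_reject(Req(True, False, [ToolSpec("code_interpreter")]), True))   # True
print(must_reject(Req(False, False, [ToolSpec("code_interpreter")]), True))  # False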
diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py
index 01ee77414f13..723cff91d44c 100644
--- a/vllm/entrypoints/tool.py
+++ b/vllm/entrypoints/tool.py
@@ -2,7 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional
+
+from openai_harmony import Message
 
 from vllm.logger import init_logger
@@ -70,7 +72,16 @@ class HarmonyPythonTool(Tool):
                 "gpt_oss is not installed, code interpreter is disabled")
             return
 
-        self.python_tool = PythonTool()
+        # NOTE (Chen): as of gpt-oss 0.0.2, there is a bug in _make_response
+        # and we do the following monkey patch to fix it.
+        class PatchedGptOssPythonTool(PythonTool):
+
+            def _make_response(self,
+                               output: str,
+                               channel: Optional[str] = None) -> Message:
+                return super()._make_response(output)
+
+        self.python_tool = PatchedGptOssPythonTool()
         logger.info_once("Code interpreter tool initialized")
 
     async def get_result(self, context: "ConversationContext") -> Any:
diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py
index 352704b2b374..2f28595f27c6 100644
--- a/vllm/entrypoints/tool_server.py
+++ b/vllm/entrypoints/tool_server.py
@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
 from contextlib import AbstractAsyncContextManager, asynccontextmanager
 from typing import TYPE_CHECKING, Any, Optional
 
-from openai_harmony import ToolNamespaceConfig
+from openai_harmony import ToolDescription, ToolNamespaceConfig
 
 from vllm.entrypoints.tool import HarmonyBrowserTool, HarmonyPythonTool, Tool
 from vllm.logger import init_logger
@@ -105,7 +105,6 @@ class MCPToolServer(ToolServer):
         self.harmony_tool_descriptions = {}
 
     async def add_tool_server(self, server_url: str):
-        from mcp.types import ToolDescription
         tool_urls = server_url.split(",")
         self.harmony_tool_descriptions = {}
         self.urls: dict[str, str] = {}
@@ -133,6 +132,8 @@ class MCPToolServer(ToolServer):
             logger.warning(
                 "Tool %s already exists. Ignoring duplicate tool server %s",
                 tool_from_mcp.name, url)
+        logger.info("MCPToolServer initialized with tools: %s",
+                    list(self.harmony_tool_descriptions.keys()))
 
     def has_tool(self, tool_name: str):
        return tool_name in self.harmony_tool_descriptions
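PatchedGptOssPythonTool above is a small compatibility shim: the subclass widens _make_response to accept the channel keyword that newer callers pass, then delegates to the base implementation, which ignores it. The same pattern in isolation, as a sketch; Base and Patched are generic stand-ins, not the gpt_oss classes.

# Generic keyword-compat shim; Base/Patched are illustrative stand-ins,
# not the gpt_oss PythonTool classes.
from typing import Optional

class Base:
    def _make_response(self, output: str) -> str:
        return f"response:{output}"

class Patched(Base):
    def _make_response(self, output: str,
                       channel: Optional[str] = None) -> str:
        # Accept (and drop) the extra keyword so newer callers don't crash.
        return super()._make_response(output)

print(Patched()._make_response("42", channel="analysis"))  # -> response:42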