diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index a658d97cc8c5..74c8093f4967 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -29,6 +29,7 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
 from openai.types.responses import ResponseInputImageParam
+from openai_harmony import Message as OpenAIHarmonyMessage
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable
@@ -207,7 +208,8 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
 
 
 ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam,
-                                   CustomChatCompletionMessageParam]
+                                   CustomChatCompletionMessageParam,
+                                   OpenAIHarmonyMessage]
 
 
 # TODO: Make fields ReadOnly once mypy supports it
diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py
index ecda35c9807e..ee08d62b5747 100644
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -2,14 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
 from collections.abc import Iterable, Sequence
-from typing import Literal, Optional
+from typing import Literal, Optional, Union
 
+from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
 from openai.types.responses.tool import Tool
-from openai_harmony import (Conversation, DeveloperContent,
+from openai_harmony import (Author, Conversation, DeveloperContent,
                             HarmonyEncodingName, Message, ReasoningEffort,
                             Role, StreamableParser, SystemContent, TextContent,
                             ToolDescription, load_harmony_encoding)
 
+from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem,
+                                              ResponseReasoningItem)
+
 REASONING_EFFORT = {
     "high": ReasoningEffort.HIGH,
     "medium": ReasoningEffort.MEDIUM,
@@ -85,6 +89,58 @@ def get_user_message(content: str) -> Message:
     return Message.from_role_and_content(Role.USER, content)
 
 
+def parse_response_input(
+    response_msg: ResponseInputOutputItem,
+    prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]]
+) -> Message:
+    if not isinstance(response_msg, dict):
+        response_msg = response_msg.model_dump()
+    if "type" not in response_msg or response_msg["type"] == "message":
+        role = response_msg["role"]
+        content = response_msg["content"]
+        if role == "system":
+            # User is trying to set a system message. Change it to:
+            # <|start|>developer<|message|># Instructions
+            # {instructions}<|end|>
+            role = "developer"
+            text_prefix = "Instructions:\n"
+        else:
+            text_prefix = ""
+        if isinstance(content, str):
+            msg = Message.from_role_and_content(role, text_prefix + content)
+        else:
+            contents = [
+                TextContent(text=text_prefix + c["text"]) for c in content
+            ]
+            msg = Message.from_role_and_contents(role, contents)
+    elif response_msg["type"] == "function_call_output":
+        call_id = response_msg["call_id"]
+        call_response: Optional[ResponseFunctionToolCall] = None
+        for prev_response in reversed(prev_responses):
+            if isinstance(prev_response, ResponseFunctionToolCall
+                          ) and prev_response.call_id == call_id:
+                call_response = prev_response
+                break
+        if call_response is None:
+            raise ValueError(f"No call message found for {call_id}")
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{call_response.name}"),
+            response_msg["output"])
+    elif response_msg["type"] == "reasoning":
+        content = response_msg["content"]
+        assert len(content) == 1
+        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
+    elif response_msg["type"] == "function_call":
+        msg = Message.from_role_and_content(Role.ASSISTANT,
+                                            response_msg["arguments"])
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(f"functions.{response_msg['name']}")
+        msg = msg.with_content_type("json")
+    else:
+        raise ValueError(f"Unknown input type: {response_msg['type']}")
+    return msg
+
+
 def parse_chat_input(chat_msg) -> Message:
     role = chat_msg["role"]
     content = chat_msg["content"]
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 57aa42720756..421927d61bba 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -17,7 +17,8 @@ from openai.types.chat.chat_completion_audio import (
 from openai.types.chat.chat_completion_message import (
     Annotation as OpenAIAnnotation)
 # yapf: enable
-from openai.types.responses import (ResponseInputParam, ResponseOutputItem,
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseInputItemParam, ResponseOutputItem,
                                     ResponseOutputMessage, ResponsePrompt,
                                     ResponseStatus, ResponseTextConfig)
 from openai.types.responses.response import ToolChoice
@@ -234,6 +235,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
     return None
 
 
+ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
+                                           ResponseFunctionToolCall]
+
+
 class ResponsesRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/responses/create
@@ -248,7 +253,7 @@ class ResponsesRequest(OpenAIBaseModel):
             "reasoning.encrypted_content",
         ],
     ]] = None
-    input: Union[str, ResponseInputParam]
+    input: Union[str, list[ResponseInputOutputItem]]
     instructions: Optional[str] = None
     max_output_tokens: Optional[int] = None
     max_tool_calls: Optional[int] = None
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 4ca863fd07db..3c0b590b0cd8 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -4,12 +4,15 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
+from copy import copy
 from http import HTTPStatus
 from typing import Callable, Final, Optional, Union
 
 import jinja2
 from fastapi import Request
-from openai.types.responses import ResponseOutputMessage, ResponseOutputText
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseOutputMessage, ResponseOutputText)
+from openai_harmony import Message as OpenAIHarmonyMessage
 
 from vllm import envs
 from vllm.config import ModelConfig
@@ -17,6 +20,10 @@ from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                          ChatTemplateContentFormatOption)
 from vllm.entrypoints.context import ConversationContext, SimpleContext
+from vllm.entrypoints.harmony_utils import (
+    get_developer_message, get_stop_tokens_for_assistant_actions,
+    get_system_message, get_user_message, parse_response_input,
+    render_for_completion)
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -30,6 +37,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.tool_server import ToolServer
+from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
@@ -103,6 +111,29 @@ class OpenAIServingResponses(OpenAIServing):
                 "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
                 "cause a memory leak since we never remove responses from "
                 "the store.")
+
+        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
+        if self.use_harmony:
+            logger.warning("For gpt-oss, we ignore --enable-auto-tool-choice "
+                           "and always enable tool use.")
+            # OpenAI models have two EOS-like tokens: <|return|> and <|call|>.
+            # We need to add them to the stop token ids.
+            if "stop_token_ids" not in self.default_sampling_params:
+                self.default_sampling_params["stop_token_ids"] = []
+            self.default_sampling_params["stop_token_ids"].extend(
+                get_stop_tokens_for_assistant_actions())
+
+        # set up tool use
+        self.enable_auto_tools: bool = enable_auto_tools
+        if self.enable_auto_tools:
+            logger.info(
+                "\"auto\" tool choice has been enabled. Please note that "
+                "while the parallel_tool_calls client option is preset for "
+                "compatibility reasons, it will be ignored.")
+            if not self.use_harmony:
+                raise NotImplementedError("Auto tool choice is not supported "
+                                          "yet unless using Harmony")
+
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
@@ -165,21 +196,20 @@ class OpenAIServingResponses(OpenAIServing):
                     return self._make_not_found_error(prev_response_id)
         else:
             prev_response = None
-        # Construct the input messages.
-        messages = self._construct_input_messages(request, prev_response)
 
         try:
             lora_request = self._maybe_get_adapters(request)
             model_name = self._get_model_name(request.model, lora_request)
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
 
-            _, request_prompts, engine_prompts = await self._preprocess_chat(
-                request,
-                tokenizer,
-                messages,
-                chat_template=self.chat_template,
-                chat_template_content_format=self.chat_template_content_format,
-            )
+            if self.use_harmony:
+                messages, request_prompts, engine_prompts = (
+                    self._make_request_with_harmony(request, prev_response))
+            else:
+                messages, request_prompts, engine_prompts = (
+                    await self._make_request(request, prev_response,
+                                             tokenizer))
+
         except (ValueError, TypeError, RuntimeError,
                 jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
@@ -275,6 +305,38 @@ class OpenAIServingResponses(OpenAIServing):
         except Exception as e:
             return self.create_error_response(str(e))
 
+    async def _make_request(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+        tokenizer: AnyTokenizer,
+    ):
+        # Construct the input messages.
+        messages = self._construct_input_messages(request, prev_response)
+        _, request_prompts, engine_prompts = await self._preprocess_chat(
+            request,
+            tokenizer,
+            messages,
+            chat_template=self.chat_template,
+            chat_template_content_format=self.chat_template_content_format,
+        )
+        return messages, request_prompts, engine_prompts
+
+    def _make_request_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ):
+        if request.tool_choice != "auto":
+            raise NotImplementedError(
+                "Only 'auto' tool_choice is supported in "
+                "response API with Harmony")
+        messages = self._construct_input_messages_with_harmony(
+            request, prev_response)
+        prompt_token_ids = render_for_completion(messages)
+        engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
+        return messages, [prompt_token_ids], [engine_prompt]
+
     async def responses_full_generator(
         self,
         request: ResponsesRequest,
@@ -411,6 +473,82 @@ class OpenAIServingResponses(OpenAIServing):
             messages.extend(request.input)  # type: ignore
         return messages
 
+    def _construct_input_messages_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ) -> list[OpenAIHarmonyMessage]:
+        messages: list[OpenAIHarmonyMessage] = []
+        if prev_response is None:
+            # New conversation.
+            reasoning_effort = (request.reasoning.effort
+                                if request.reasoning else None)
+            tool_types = [tool.type for tool in request.tools]
+            enable_browser = ("web_search_preview" in tool_types
+                              and self.tool_server is not None
+                              and self.tool_server.has_tool("browser"))
+            enable_code_interpreter = ("code_interpreter" in tool_types
+                                       and self.tool_server is not None
+                                       and self.tool_server.has_tool("python"))
+            sys_msg = get_system_message(
+                reasoning_effort=reasoning_effort,
+                browser_description=self.tool_server.get_tool_description(
+                    "browser")
+                if enable_browser and self.tool_server is not None else None,
+                python_description=self.tool_server.get_tool_description(
+                    "python") if enable_code_interpreter
+                and self.tool_server is not None else None,
+            )
+            messages.append(sys_msg)
+            dev_msg = get_developer_message(request.instructions,
+                                            request.tools)
+            messages.append(dev_msg)
+        else:
+            # Continue the previous conversation.
+            # FIXME(woosuk): Currently, request params like reasoning and
+            # instructions are ignored.
+            prev_msgs = self.msg_store[prev_response.id]
+            # Remove the previous chain-of-thoughts if there is a new "final"
+            # message. Note that this also removes these messages from the
+            # msg_store.
+            if len(prev_msgs) > 0:
+                last_msg = prev_msgs[-1]
+                assert isinstance(last_msg, OpenAIHarmonyMessage)
+                if last_msg.channel == "final":
+                    prev_final_msg_idx = -1
+                    for i in range(len(prev_msgs) - 2, -1, -1):
+                        prev_msg_i = prev_msgs[i]
+                        assert isinstance(prev_msg_i, OpenAIHarmonyMessage)
+                        if prev_msg_i.channel == "final":
+                            prev_final_msg_idx = i
+                            break
+                    recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1:]
+                    del prev_msgs[prev_final_msg_idx + 1:]
+                    for msg in recent_turn_msgs:
+                        assert isinstance(msg, OpenAIHarmonyMessage)
+                        if msg.channel != "analysis":
+                            prev_msgs.append(msg)
+            messages.extend(prev_msgs)
+        # Append the new input.
+        # Responses API supports simple text inputs without chat format.
+        if isinstance(request.input, str):
+            messages.append(get_user_message(request.input))
+        else:
+            if prev_response is not None:
+                prev_outputs = copy(prev_response.output)
+            else:
+                prev_outputs = []
+            for response_msg in request.input:
+                messages.append(
+                    parse_response_input(response_msg, prev_outputs))
+                # The user passes in a tool call request and its output. We
+                # need to add the tool call request to prev_outputs so that
+                # parse_response_input can find the tool call request when
+                # parsing the tool call output.
+                if isinstance(response_msg, ResponseFunctionToolCall):
+                    prev_outputs.append(response_msg)
+        return messages
+
     async def _run_background_request(
         self,
         request: ResponsesRequest,
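
Note (not part of the diff above): a minimal sketch of how the new parse_response_input helper is expected to pair a function_call_output item with its originating function_call via call_id. Only parse_response_input, ResponseFunctionToolCall, and the item shapes come from this change; the tool name, IDs, and argument values below are made-up examples.

from openai.types.responses import ResponseFunctionToolCall

from vllm.entrypoints.harmony_utils import parse_response_input

# Hypothetical earlier tool call issued by the assistant (example values).
call = ResponseFunctionToolCall(
    type="function_call",
    id="fc_1",
    call_id="call_1",
    name="get_weather",
    arguments='{"city": "Seoul"}',
    status="completed",
)

# The function_call becomes an assistant message on the "commentary" channel
# addressed to "functions.get_weather" with content type "json".
call_msg = parse_response_input(call, [])

# The matching output is attributed to the tool author; prev_responses must
# contain the earlier call so the tool name can be recovered from call_id.
output_msg = parse_response_input(
    {
        "type": "function_call_output",
        "call_id": "call_1",
        "output": "sunny",
    },
    [call],
)

This is also why _construct_input_messages_with_harmony appends each ResponseFunctionToolCall it encounters to prev_outputs while iterating over request.input: a later function_call_output in the same request can then be resolved even when it was not part of the stored previous response.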