[gpt-oss] Convert user input to harmony format (#22402)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Chen Zhang 2025-08-06 20:56:02 -07:00 committed by GitHub
parent ad6c655dde
commit f6278b6243
4 changed files with 216 additions and 15 deletions

vllm/entrypoints/chat_utils.py

@@ -29,6 +29,7 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
 from openai.types.responses import ResponseInputImageParam
+from openai_harmony import Message as OpenAIHarmonyMessage
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable

@@ -207,7 +208,8 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
 ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam,
-                                   CustomChatCompletionMessageParam]
+                                   CustomChatCompletionMessageParam,
+                                   OpenAIHarmonyMessage]

 # TODO: Make fields ReadOnly once mypy supports it
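With this change, a pre-built openai_harmony Message can travel through code paths typed as ChatCompletionMessageParam, alongside the existing dict-style messages. A minimal sketch (variable names are illustrative; whether a given caller handles the harmony form depends on the code path):

    from openai_harmony import Message as OpenAIHarmonyMessage, Role

    # Both forms are now valid members of ChatCompletionMessageParam.
    dict_msg = {"role": "user", "content": "What is 2 + 2?"}
    harmony_msg = OpenAIHarmonyMessage.from_role_and_content(Role.USER, "What is 2 + 2?")
    messages = [dict_msg, harmony_msg]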

vllm/entrypoints/harmony_utils.py

@@ -2,14 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
 from collections.abc import Iterable, Sequence
-from typing import Literal, Optional
+from typing import Literal, Optional, Union
+from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
 from openai.types.responses.tool import Tool
-from openai_harmony import (Conversation, DeveloperContent,
+from openai_harmony import (Author, Conversation, DeveloperContent,
                             HarmonyEncodingName, Message, ReasoningEffort,
                             Role, StreamableParser, SystemContent, TextContent,
                             ToolDescription, load_harmony_encoding)
+from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem,
+                                              ResponseReasoningItem)

 REASONING_EFFORT = {
     "high": ReasoningEffort.HIGH,
     "medium": ReasoningEffort.MEDIUM,
@@ -85,6 +89,58 @@ def get_user_message(content: str) -> Message:
     return Message.from_role_and_content(Role.USER, content)


+def parse_response_input(
+    response_msg: ResponseInputOutputItem,
+    prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]]
+) -> Message:
+    if not isinstance(response_msg, dict):
+        response_msg = response_msg.model_dump()
+    if "type" not in response_msg or response_msg["type"] == "message":
+        role = response_msg["role"]
+        content = response_msg["content"]
+        if role == "system":
+            # User is trying to set a system message. Change it to:
+            # <|start|>developer<|message|># Instructions
+            # {instructions}<|end|>
+            role = "developer"
+            text_prefix = "Instructions:\n"
+        else:
+            text_prefix = ""
+        if isinstance(content, str):
+            msg = Message.from_role_and_content(role, text_prefix + content)
+        else:
+            contents = [
+                TextContent(text=text_prefix + c["text"]) for c in content
+            ]
+            msg = Message.from_role_and_contents(role, contents)
+    elif response_msg["type"] == "function_call_output":
+        call_id = response_msg["call_id"]
+        call_response: Optional[ResponseFunctionToolCall] = None
+        for prev_response in reversed(prev_responses):
+            if isinstance(prev_response, ResponseFunctionToolCall
+                          ) and prev_response.call_id == call_id:
+                call_response = prev_response
+                break
+        if call_response is None:
+            raise ValueError(f"No call message found for {call_id}")
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{call_response.name}"),
+            response_msg["output"])
+    elif response_msg["type"] == "reasoning":
+        content = response_msg["content"]
+        assert len(content) == 1
+        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
+    elif response_msg["type"] == "function_call":
+        msg = Message.from_role_and_content(Role.ASSISTANT,
+                                            response_msg["arguments"])
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(f"functions.{response_msg['name']}")
+        msg = msg.with_content_type("json")
+    else:
+        raise ValueError(f"Unknown input type: {response_msg['type']}")
+    return msg
+
+
 def parse_chat_input(chat_msg) -> Message:
     role = chat_msg["role"]
     content = chat_msg["content"]
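For illustration, a small sketch of how parse_response_input handles a tool-call round trip. The item values (call ids, tool name, output) are made up for the example; only the shapes follow the Responses API types used above:

    from openai.types.responses import ResponseFunctionToolCall

    call = ResponseFunctionToolCall(
        type="function_call",
        id="fc_1",
        call_id="call_1",
        name="get_weather",
        arguments='{"city": "Berlin"}',
        status="completed",
    )
    # The assistant's earlier tool call becomes an assistant message on the
    # "commentary" channel with recipient "functions.get_weather".
    call_msg = parse_response_input(call, [])
    # The tool output is matched to the call via call_id and attributed to
    # the tool author "functions.get_weather".
    output_msg = parse_response_input(
        {"type": "function_call_output", "call_id": "call_1", "output": "20C"},
        [call],
    )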

vllm/entrypoints/openai/protocol.py

@@ -17,7 +17,8 @@ from openai.types.chat.chat_completion_audio import (
 from openai.types.chat.chat_completion_message import (
     Annotation as OpenAIAnnotation)
 # yapf: enable
-from openai.types.responses import (ResponseInputParam, ResponseOutputItem,
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseInputItemParam, ResponseOutputItem,
                                     ResponseOutputMessage, ResponsePrompt,
                                     ResponseStatus, ResponseTextConfig)
 from openai.types.responses.response import ToolChoice

@@ -234,6 +235,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
     return None


+ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
+                                           ResponseFunctionToolCall]


 class ResponsesRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/responses/create

@@ -248,7 +253,7 @@ class ResponsesRequest(OpenAIBaseModel):
             "reasoning.encrypted_content",
         ],
     ]] = None
-    input: Union[str, ResponseInputParam]
+    input: Union[str, list[ResponseInputOutputItem]]
     instructions: Optional[str] = None
     max_output_tokens: Optional[int] = None
     max_tool_calls: Optional[int] = None
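A rough sketch of what the widened input field accepts after this change. The values are illustrative; the point is that a raw function tool call and its output can now appear in the list alongside ordinary message items:

    # `input` may still be a plain string...
    simple_input = "What's the weather in Berlin?"

    # ...or a list of Responses API items (ResponseInputOutputItem), including
    # a previous function tool call and the corresponding tool output.
    structured_input = [
        {"type": "message", "role": "user",
         "content": "What's the weather in Berlin?"},
        {"type": "function_call", "call_id": "call_1", "name": "get_weather",
         "arguments": '{"city": "Berlin"}'},
        {"type": "function_call_output", "call_id": "call_1", "output": "20C"},
    ]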

vllm/entrypoints/openai/serving_responses.py

@@ -4,12 +4,15 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
+from copy import copy
 from http import HTTPStatus
 from typing import Callable, Final, Optional, Union

 import jinja2
 from fastapi import Request
-from openai.types.responses import ResponseOutputMessage, ResponseOutputText
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseOutputMessage, ResponseOutputText)
+from openai_harmony import Message as OpenAIHarmonyMessage

 from vllm import envs
 from vllm.config import ModelConfig

@@ -17,6 +20,10 @@ from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                          ChatTemplateContentFormatOption)
 from vllm.entrypoints.context import ConversationContext, SimpleContext
+from vllm.entrypoints.harmony_utils import (
+    get_developer_message, get_stop_tokens_for_assistant_actions,
+    get_system_message, get_user_message, parse_response_input,
+    render_for_completion)
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
 # yapf: disable

@@ -30,6 +37,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.tool_server import ToolServer
+from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
@@ -103,6 +111,29 @@ class OpenAIServingResponses(OpenAIServing):
                 "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
                 "cause a memory leak since we never remove responses from "
                 "the store.")

+        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
+        if self.use_harmony:
+            logger.warning("For gpt-oss, we ignore --enable-auto-tool-choice "
+                           "and always enable tool use.")
+            # OpenAI models have two EOS-like tokens: <|return|> and <|call|>.
+            # We need to add them to the stop token ids.
+            if "stop_token_ids" not in self.default_sampling_params:
+                self.default_sampling_params["stop_token_ids"] = []
+            self.default_sampling_params["stop_token_ids"].extend(
+                get_stop_tokens_for_assistant_actions())
+
+        # set up tool use
+        self.enable_auto_tools: bool = enable_auto_tools
+        if self.enable_auto_tools:
+            logger.info(
+                "\"auto\" tool choice has been enabled please note that while"
+                " the parallel_tool_calls client option is preset for "
+                "compatibility reasons, it will be ignored.")
+            if not self.use_harmony:
+                raise NotImplementedError("Auto tool choice is not supported "
+                                          "yet unless using Harmony")

         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
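The stop token ids appended here come from the harmony encoding. A minimal sketch of what get_stop_tokens_for_assistant_actions is expected to return, assuming the harmony_utils helper wraps openai_harmony's encoding as the imports above suggest (the helper's body is not shown in this diff):

    from openai_harmony import HarmonyEncodingName, load_harmony_encoding

    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    # Token ids at which the assistant stops acting, e.g. <|return|> and <|call|>.
    stop_token_ids = encoding.stop_tokens_for_assistant_actions()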
@@ -165,21 +196,20 @@ class OpenAIServingResponses(OpenAIServing):
                 return self._make_not_found_error(prev_response_id)
         else:
             prev_response = None
-        # Construct the input messages.
-        messages = self._construct_input_messages(request, prev_response)

         try:
             lora_request = self._maybe_get_adapters(request)
             model_name = self._get_model_name(request.model, lora_request)
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
-            _, request_prompts, engine_prompts = await self._preprocess_chat(
-                request,
-                tokenizer,
-                messages,
-                chat_template=self.chat_template,
-                chat_template_content_format=self.chat_template_content_format,
-            )
+            if self.use_harmony:
+                messages, request_prompts, engine_prompts = (
+                    self._make_request_with_harmony(request, prev_response))
+            else:
+                messages, request_prompts, engine_prompts = (
+                    await self._make_request(request, prev_response,
+                                             tokenizer))
         except (ValueError, TypeError, RuntimeError,
                 jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
@@ -275,6 +305,38 @@ class OpenAIServingResponses(OpenAIServing):
         except Exception as e:
             return self.create_error_response(str(e))

+    async def _make_request(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+        tokenizer: AnyTokenizer,
+    ):
+        # Construct the input messages.
+        messages = self._construct_input_messages(request, prev_response)
+        _, request_prompts, engine_prompts = await self._preprocess_chat(
+            request,
+            tokenizer,
+            messages,
+            chat_template=self.chat_template,
+            chat_template_content_format=self.chat_template_content_format,
+        )
+        return messages, request_prompts, engine_prompts
+
+    def _make_request_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ):
+        if request.tool_choice != "auto":
+            raise NotImplementedError(
+                "Only 'auto' tool_choice is supported in "
+                "response API with Harmony")
+        messages = self._construct_input_messages_with_harmony(
+            request, prev_response)
+        prompt_token_ids = render_for_completion(messages)
+        engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
+        return messages, [prompt_token_ids], [engine_prompt]
+
     async def responses_full_generator(
         self,
         request: ResponsesRequest,
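The harmony path above skips the chat template entirely: messages are rendered straight into prompt token ids and handed to the engine as a tokens prompt. A rough sketch of the rendering step, assuming render_for_completion wraps openai_harmony's conversation rendering as the harmony_utils import suggests (the helper's internals are not shown in this diff):

    from openai_harmony import (Conversation, HarmonyEncodingName, Message,
                                Role, load_harmony_encoding)

    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    messages = [Message.from_role_and_content(Role.USER, "Hello!")]
    # Render the conversation up to the point where the assistant should start
    # generating; the result is a list of prompt token ids.
    convo = Conversation.from_messages(messages)
    prompt_token_ids = encoding.render_conversation_for_completion(
        convo, Role.ASSISTANT)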
@@ -411,6 +473,82 @@ class OpenAIServingResponses(OpenAIServing):
             messages.extend(request.input)  # type: ignore
         return messages

+    def _construct_input_messages_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ) -> list[OpenAIHarmonyMessage]:
+        messages: list[OpenAIHarmonyMessage] = []
+        if prev_response is None:
+            # New conversation.
+            reasoning_effort = (request.reasoning.effort
+                                if request.reasoning else None)
+            tool_types = [tool.type for tool in request.tools]
+            enable_browser = ("web_search_preview" in tool_types
+                              and self.tool_server is not None
+                              and self.tool_server.has_tool("browser"))
+            enable_code_interpreter = ("code_interpreter" in tool_types
+                                       and self.tool_server is not None
+                                       and self.tool_server.has_tool("python"))
+            sys_msg = get_system_message(
+                reasoning_effort=reasoning_effort,
+                browser_description=self.tool_server.get_tool_description(
+                    "browser")
+                if enable_browser and self.tool_server is not None else None,
+                python_description=self.tool_server.get_tool_description(
+                    "python") if enable_code_interpreter
+                and self.tool_server is not None else None,
+            )
+            messages.append(sys_msg)
+            dev_msg = get_developer_message(request.instructions,
+                                            request.tools)
+            messages.append(dev_msg)
+        else:
+            # Continue the previous conversation.
+            # FIXME(woosuk): Currently, request params like reasoning and
+            # instructions are ignored.
+            prev_msgs = self.msg_store[prev_response.id]
+            # Remove the previous chain-of-thoughts if there is a new "final"
+            # message. Note that this also removes these messages from the
+            # msg_store.
+            if len(prev_msgs) > 0:
+                last_msg = prev_msgs[-1]
+                assert isinstance(last_msg, OpenAIHarmonyMessage)
+                if last_msg.channel == "final":
+                    prev_final_msg_idx = -1
+                    for i in range(len(prev_msgs) - 2, -1, -1):
+                        prev_msg_i = prev_msgs[i]
+                        assert isinstance(prev_msg_i, OpenAIHarmonyMessage)
+                        if prev_msg_i.channel == "final":
+                            prev_final_msg_idx = i
+                            break
+                    recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1:]
+                    del prev_msgs[prev_final_msg_idx + 1:]
+                    for msg in recent_turn_msgs:
+                        assert isinstance(msg, OpenAIHarmonyMessage)
+                        if msg.channel != "analysis":
+                            prev_msgs.append(msg)
+            messages.extend(prev_msgs)
+        # Append the new input.
+        # Responses API supports simple text inputs without chat format.
+        if isinstance(request.input, str):
+            messages.append(get_user_message(request.input))
+        else:
+            if prev_response is not None:
+                prev_outputs = copy(prev_response.output)
+            else:
+                prev_outputs = []
+            for response_msg in request.input:
+                messages.append(
+                    parse_response_input(response_msg, prev_outputs))
+                # The user passes in a tool call request and its output. We
+                # need to add the tool call request to prev_outputs so that
+                # parse_response_input can find the tool call request when
+                # parsing the tool call output.
+                if isinstance(response_msg, ResponseFunctionToolCall):
+                    prev_outputs.append(response_msg)
+        return messages
+
     async def _run_background_request(
         self,
         request: ResponsesRequest,
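The chain-of-thought pruning above boils down to: once a previous turn has produced a "final" message, drop that turn's "analysis" (reasoning) messages before rendering the next prompt. A simplified restatement of that rule on illustrative messages, not the exact code path in _construct_input_messages_with_harmony:

    from openai_harmony import Message, Role

    # Stored messages from a finished previous turn. The turn ended with a
    # "final" message, so its "analysis" chain-of-thought is dropped before
    # the next turn is rendered.
    prev_msgs = [
        Message.from_role_and_content(Role.USER, "What is 2 + 2?"),
        Message.from_role_and_content(Role.ASSISTANT,
                                      "Adding 2 and 2.").with_channel("analysis"),
        Message.from_role_and_content(Role.ASSISTANT, "4").with_channel("final"),
    ]
    kept = [m for m in prev_msgs if m.channel != "analysis"]
    # kept == [user message, final assistant message]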