[gpt-oss] Generate ResponseOutputItem from Harmony Message (#22410)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Authored by Chen Zhang on 2025-08-07 08:33:25 -07:00; committed by GitHub
parent 4da8bf20d0
commit 4815b00f54
GPG Key ID: B5690EEEBB952194
4 changed files with 290 additions and 80 deletions

View File

@@ -17,7 +17,7 @@ async def test_simple_input(client: openai.AsyncOpenAI):
    # Whether the output contains the reasoning.
    assert outputs[0].type == "reasoning"
    assert outputs[0].text != ""
    assert outputs[0].content[0].text != ""


@pytest.mark.asyncio
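The updated assertion reflects the new shape of reasoning output items: the reasoning text now lives in a content list of reasoning_text parts rather than in a top-level text field. A minimal sketch of that shape, built with the openai response types imported later in this diff (the id and text values are illustrative only):

```python
from openai.types.responses import ResponseReasoningItem
from openai.types.responses.response_reasoning_item import Content

# Reasoning text is carried as a list of "reasoning_text" content parts.
item = ResponseReasoningItem(
    id="rs_example",
    summary=[],
    type="reasoning",
    content=[Content(text="Thinking step by step...", type="reasoning_text")],
)
assert item.content[0].text != ""
```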

View File

@@ -1,18 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
import json
from collections.abc import Iterable, Sequence
from typing import Literal, Optional, Union

from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
from openai.types.responses import (ResponseFunctionToolCall,
                                    ResponseOutputItem, ResponseOutputMessage,
                                    ResponseOutputText, ResponseReasoningItem)
from openai.types.responses.response_function_web_search import (
    ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch)
from openai.types.responses.response_reasoning_item import (
    Content as ResponseReasoningTextContent)
from openai.types.responses.tool import Tool
from openai_harmony import (Author, Conversation, DeveloperContent,
                            HarmonyEncodingName, Message, ReasoningEffort,
                            Role, StreamableParser, SystemContent, TextContent,
                            ToolDescription, load_harmony_encoding)

from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem,
                                              ResponseReasoningItem)
from vllm.entrypoints.openai.protocol import ResponseInputOutputItem
from vllm.utils import random_uuid

REASONING_EFFORT = {
    "high": ReasoningEffort.HIGH,
@@ -160,6 +167,146 @@ def render_for_completion(messages: list[Message]) -> list[int]:
    return token_ids


def parse_output_message(message: Message) -> list[ResponseOutputItem]:
    """
    Parse a Harmony message into a list of output response items.
    """
    if message.author.role != "assistant":
        # This is a message from a tool to the assistant (e.g., search result).
        # Don't include it in the final output for now. This aligns with
        # OpenAI's behavior on models like o4-mini.
        return []
    output_items: list[ResponseOutputItem] = []
    recipient = message.recipient
    if recipient is not None and recipient.startswith("browser."):
        if len(message.content) != 1:
            raise ValueError("Invalid number of contents in browser message")
        content = message.content[0]
        browser_call = json.loads(content.text)
        # TODO: translate to url properly!
        if recipient == "browser.search":
            action = ActionSearch(
                query=f"cursor:{browser_call.get('query', '')}", type="search")
        elif recipient == "browser.open":
            action = ActionOpenPage(
                url=f"cursor:{browser_call.get('url', '')}", type="open_page")
        elif recipient == "browser.find":
            action = ActionFind(pattern=browser_call["pattern"],
                                url=f"cursor:{browser_call.get('url', '')}",
                                type="find")
        else:
            raise ValueError(f"Unknown browser action: {recipient}")
        web_search_item = ResponseFunctionWebSearch(
            id=f"ws_{random_uuid()}",
            action=action,
            status="completed",
            type="web_search_call",
        )
        output_items.append(web_search_item)
    elif message.channel == "analysis":
        for content in message.content:
            reasoning_item = ResponseReasoningItem(
                id=f"rs_{random_uuid()}",
                summary=[],
                type="reasoning",
                content=[
                    ResponseReasoningTextContent(text=content.text,
                                                 type="reasoning_text")
                ],
                status=None,
            )
            output_items.append(reasoning_item)
    elif message.channel == "commentary":
        if message.recipient.startswith("functions."):
            function_name = message.recipient.split(".")[-1]
            for content in message.content:
                random_id = random_uuid()
                response_item = ResponseFunctionToolCall(
                    arguments=content.text,
                    call_id=f"call_{random_id}",
                    type="function_call",
                    name=function_name,
                    id=f"ft_{random_id}",
                )
                output_items.append(response_item)
        elif message.recipient.startswith(
                "python") or message.recipient.startswith("browser"):
            for content in message.content:
                reasoning_item = ResponseReasoningItem(
                    id=f"rs_{random_uuid()}",
                    summary=[],
                    type="reasoning",
                    text=content.text,
                    status=None,
                )
                output_items.append(reasoning_item)
        else:
            raise ValueError(f"Unknown recipient: {message.recipient}")
    elif message.channel == "final":
        contents = []
        for content in message.content:
            output_text = ResponseOutputText(
                text=content.text,
                annotations=[],  # TODO
                type="output_text",
                logprobs=None,  # TODO
            )
            contents.append(output_text)
        text_item = ResponseOutputMessage(
            id=f"msg_{random_uuid()}",
            content=contents,
            role=message.author.role,
            status="completed",
            type="message",
        )
        output_items.append(text_item)
    else:
        raise ValueError(f"Unknown channel: {message.channel}")
    return output_items
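Not part of this diff: a small usage sketch of parse_output_message, assuming openai_harmony's Message.from_role_and_content and with_channel helpers behave as the imports above suggest. A final-channel assistant message maps to a single output message item; browser.* recipients become web_search_call items and analysis-channel messages become reasoning items, per the branches above.

```python
from openai_harmony import Message, Role

from vllm.entrypoints.harmony_utils import parse_output_message

# A final-channel assistant message becomes one ResponseOutputMessage.
msg = Message.from_role_and_content(
    Role.ASSISTANT, "The answer is 42.").with_channel("final")
items = parse_output_message(msg)
assert items[0].type == "message"
assert items[0].content[0].text == "The answer is 42."
```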
def parse_remaining_state(
        parser: StreamableParser) -> list[ResponseOutputItem]:
    if not parser.current_content:
        return []
    if parser.current_role != Role.ASSISTANT:
        return []
    current_recipient = parser.current_recipient
    if (current_recipient is not None
            and current_recipient.startswith("browser.")):
        return []

    if parser.current_channel == "analysis":
        reasoning_item = ResponseReasoningItem(
            id=f"rs_{random_uuid()}",
            summary=[],
            type="reasoning",
            content=[
                ResponseReasoningTextContent(text=parser.current_content,
                                             type="reasoning_text")
            ],
            status=None,
        )
        return [reasoning_item]
    elif parser.current_channel == "final":
        output_text = ResponseOutputText(
            text=parser.current_content,
            annotations=[],  # TODO
            type="output_text",
            logprobs=None,  # TODO
        )
        text_item = ResponseOutputMessage(
            id=f"msg_{random_uuid()}",
            content=[output_text],
            role="assistant",
            status="completed",
            type="message",
        )
        return [text_item]
    return []


def get_stop_tokens_for_assistant_actions() -> list[int]:
    return get_encoding().stop_tokens_for_assistant_actions()
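One plausible consumer of get_stop_tokens_for_assistant_actions, sketched here as an assumption rather than taken from this diff: pass the Harmony stop tokens to vLLM's SamplingParams so decoding halts whenever the assistant hands control to a tool.

```python
from vllm import SamplingParams

from vllm.entrypoints.harmony_utils import (
    get_stop_tokens_for_assistant_actions)

# Stop generation at Harmony's assistant-action boundaries
# (the max_tokens value is illustrative only).
sampling_params = SamplingParams(
    max_tokens=1024,
    stop_token_ids=get_stop_tokens_for_assistant_actions(),
)
```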

View File

@@ -19,8 +19,8 @@ from openai.types.chat.chat_completion_message import (
# yapf: enable
from openai.types.responses import (ResponseFunctionToolCall,
                                    ResponseInputItemParam, ResponseOutputItem,
                                    ResponseOutputMessage, ResponsePrompt,
                                    ResponseStatus, ResponseTextConfig)
                                    ResponsePrompt, ResponseStatus,
                                    ResponseTextConfig)
from openai.types.responses.response import ToolChoice
from openai.types.responses.tool import Tool
from openai.types.shared import Metadata, Reasoning
@@ -1729,13 +1729,20 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
    usage: Optional[UsageInfo] = Field(default=None)


class ResponseReasoningItem(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"rs_{random_uuid()}")
    text: str
    summary: list = Field(default_factory=list)
    type: Literal["reasoning"] = "reasoning"
    encrypted_content: Optional[str] = None
    status: Optional[Literal["in_progress", "completed", "incomplete"]]


class InputTokensDetails(OpenAIBaseModel):
    cached_tokens: int


class OutputTokensDetails(OpenAIBaseModel):
    reasoning_tokens: int


class ResponseUsage(OpenAIBaseModel):
    input_tokens: int
    input_tokens_details: InputTokensDetails
    output_tokens: int
    output_tokens_details: OutputTokensDetails
    total_tokens: int


class ResponsesResponse(OpenAIBaseModel):
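A minimal sketch of the nested usage report defined by the three models in the hunk above; the numbers are illustrative only, and the same construction appears in serving_responses.py later in this diff.

```python
from vllm.entrypoints.openai.protocol import (InputTokensDetails,
                                              OutputTokensDetails,
                                              ResponseUsage)

# Token counts nest into input/output detail objects.
usage = ResponseUsage(
    input_tokens=128,
    input_tokens_details=InputTokensDetails(cached_tokens=64),
    output_tokens=256,
    output_tokens_details=OutputTokensDetails(reasoning_tokens=200),
    total_tokens=128 + 256,
)
```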
@@ -1747,7 +1754,7 @@ class ResponsesResponse(OpenAIBaseModel):
    metadata: Optional[Metadata] = None
    model: str
    object: Literal["response"] = "response"
    output: list[Union[ResponseOutputMessage, ResponseReasoningItem]]
    output: list[ResponseOutputItem]
    parallel_tool_calls: bool
    temperature: float
    tool_choice: ToolChoice
@@ -1764,7 +1771,7 @@ class ResponsesResponse(OpenAIBaseModel):
    text: Optional[ResponseTextConfig] = None
    top_logprobs: int
    truncation: Literal["auto", "disabled"]
    usage: Optional[UsageInfo] = None
    usage: Optional[ResponseUsage] = None
    user: Optional[str] = None

    @classmethod
@@ -1776,7 +1783,7 @@ class ResponsesResponse(OpenAIBaseModel):
        created_time: int,
        output: list[ResponseOutputItem],
        status: ResponseStatus,
        usage: Optional[UsageInfo] = None,
        usage: Optional[ResponseUsage] = None,
    ) -> "ResponsesResponse":
        return cls(
            id=request.request_id,

View File

@@ -6,12 +6,15 @@ import time
from collections.abc import AsyncGenerator, AsyncIterator
from copy import copy
from http import HTTPStatus
from typing import Callable, Final, Optional, Union
from typing import Any, Callable, Final, Optional, Union

import jinja2
from fastapi import Request
from openai.types.responses import (ResponseFunctionToolCall,
                                    ResponseOutputMessage, ResponseOutputText)
                                    ResponseOutputItem, ResponseOutputMessage,
                                    ResponseOutputText, ResponseReasoningItem)
from openai.types.responses.response_reasoning_item import (
    Content as ResponseReasoningTextContent)
from openai_harmony import Message as OpenAIHarmonyMessage

from vllm import envs
@@ -19,26 +22,28 @@ from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                         ChatTemplateContentFormatOption)
from vllm.entrypoints.context import ConversationContext, SimpleContext
from vllm.entrypoints.context import (ConversationContext, HarmonyContext,
                                      SimpleContext, StreamingHarmonyContext)
from vllm.entrypoints.harmony_utils import (
    get_developer_message, get_stop_tokens_for_assistant_actions,
    get_system_message, get_user_message, parse_response_input,
    render_for_completion)
    get_system_message, get_user_message, parse_output_message,
    parse_remaining_state, parse_response_input, render_for_completion)
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                              PromptTokenUsageInfo,
                                              InputTokensDetails,
                                              OutputTokensDetails,
                                              RequestResponseMetadata,
                                              ResponseReasoningItem,
                                              ResponsesRequest,
                                              ResponsesResponse, UsageInfo)
                                              ResponsesResponse, ResponseUsage)
# yapf: enable
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.logger import init_logger
from vllm.outputs import CompletionOutput
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -222,6 +227,7 @@ class OpenAIServingResponses(OpenAIServing):
        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[ConversationContext, None]] = []
        try:
            tool_sessions: dict[str, Any] = {}
            for i, engine_prompt in enumerate(engine_prompts):
                default_max_tokens = self.max_model_len - len(
                    engine_prompt["prompt_token_ids"])
@@ -231,7 +237,15 @@ class OpenAIServingResponses(OpenAIServing):
                trace_headers = (None if raw_request is None else await
                                 self._get_trace_headers(raw_request.headers))

                context = SimpleContext()
                context: ConversationContext
                if self.use_harmony:
                    if request.stream:
                        context = StreamingHarmonyContext(
                            messages, tool_sessions)
                    else:
                        context = HarmonyContext(messages, tool_sessions)
                else:
                    context = SimpleContext()
                generator = self._generate_with_builtin_tools(
                    request_id=request.request_id,
                    request_prompt=request_prompts[i],
@@ -274,6 +288,7 @@ class OpenAIServingResponses(OpenAIServing):
                request,
                sampling_params,
                result_generator,
                context,
                model_name,
                tokenizer,
                request_metadata,
@@ -297,6 +312,7 @@ class OpenAIServingResponses(OpenAIServing):
                request,
                sampling_params,
                result_generator,
                context,
                model_name,
                tokenizer,
                request_metadata,
@@ -344,6 +360,7 @@ class OpenAIServingResponses(OpenAIServing):
        request: ResponsesRequest,
        sampling_params: SamplingParams,
        result_generator: AsyncIterator[ConversationContext],
        context: ConversationContext,
        model_name: str,
        tokenizer: AnyTokenizer,
        request_metadata: RequestResponseMetadata,
@@ -352,9 +369,8 @@ class OpenAIServingResponses(OpenAIServing):
        if created_time is None:
            created_time = int(time.time())

        context: Optional[ConversationContext] = None
        try:
            async for context in result_generator:
            async for _ in result_generator:
                pass
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
@@ -362,64 +378,40 @@ class OpenAIServingResponses(OpenAIServing):
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        assert context is not None
        assert isinstance(context, SimpleContext)
        final_res = context.last_output
        assert final_res is not None
        assert len(final_res.outputs) == 1
        final_output = final_res.outputs[0]

        if self.reasoning_parser:
            try:
                reasoning_parser = self.reasoning_parser(tokenizer)
            except RuntimeError as e:
                logger.exception("Error in reasoning parser creation.")
                return self.create_error_response(str(e))
            reasoning_content, content = (
                reasoning_parser.extract_reasoning_content(final_output.text,
                                                           request=request))
        if self.use_harmony:
            assert isinstance(context, HarmonyContext)
            output = self._make_response_output_items_with_harmony(context)
            # TODO: these are all 0 for now!
            num_prompt_tokens = context.num_prompt_tokens
            num_generated_tokens = context.num_output_tokens
            num_cached_tokens = context.num_cached_tokens
            num_reasoning_tokens = context.num_reasoning_tokens
        else:
            reasoning_content = None
            content = final_output.text
            assert isinstance(context, SimpleContext)
            final_res = context.last_output
            assert final_res is not None
            assert len(final_res.outputs) == 1
            final_output = final_res.outputs[0]
        output = []
        if reasoning_content:
            reasoning_item = ResponseReasoningItem(
                text=reasoning_content,
                status=None,  # NOTE: Only the last output item has status.
            )
            output.append(reasoning_item)
        if content:
            output_text = ResponseOutputText(
                text=content,
                annotations=[],  # TODO
                type="output_text",
                logprobs=None,  # TODO
            )
            message = ResponseOutputMessage(
                id=f"msg_{random_uuid()}",
                content=[output_text],
                role="assistant",
                status="completed",
                type="message",
            )
            output.append(message)
            output = self._make_response_output_items(request, final_output,
                                                      tokenizer)

        # Calculate usage.
        assert final_res.prompt_token_ids is not None
        num_prompt_tokens = len(final_res.prompt_token_ids)
        num_generated_tokens = len(final_output.token_ids)
        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            # Calculate usage.
            assert final_res.prompt_token_ids is not None
            num_prompt_tokens = len(final_res.prompt_token_ids)
            num_generated_tokens = len(final_output.token_ids)
            num_cached_tokens = final_res.num_cached_tokens
            num_reasoning_tokens = 0

        usage = ResponseUsage(
            input_tokens=num_prompt_tokens,
            output_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
            input_tokens_details=InputTokensDetails(
                cached_tokens=num_cached_tokens),
            output_tokens_details=OutputTokensDetails(
                reasoning_tokens=num_reasoning_tokens),
        )
        if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=final_res.num_cached_tokens)
        request_metadata.final_usage_info = usage

        response = ResponsesResponse.from_request(
            request,
            sampling_params,
@@ -457,6 +449,70 @@ class OpenAIServingResponses(OpenAIServing):
            self.response_store[response.id] = response
        return response

    def _make_response_output_items(
        self,
        request: ResponsesRequest,
        final_output: CompletionOutput,
        tokenizer: AnyTokenizer,
    ) -> list[ResponseOutputItem]:
        if self.reasoning_parser:
            try:
                reasoning_parser = self.reasoning_parser(tokenizer)
            except RuntimeError as e:
                logger.exception("Error in reasoning parser creation.")
                raise e
            reasoning_content, content = (
                reasoning_parser.extract_reasoning_content(final_output.text,
                                                           request=request))
        else:
            reasoning_content = None
            content = final_output.text

        output = []
        if reasoning_content:
            reasoning_item = ResponseReasoningItem(
                id=f"rs_{random_uuid()}",
                summary=[],
                type="reasoning",
                content=[
                    ResponseReasoningTextContent(text=reasoning_content,
                                                 type="reasoning_text")
                ],
                status=None,  # NOTE: Only the last output item has status.
            )
            output.append(reasoning_item)
        if content:
            output_text = ResponseOutputText(
                text=content,
                annotations=[],  # TODO
                type="output_text",
                logprobs=None,  # TODO
            )
            message = ResponseOutputMessage(
                id=f"msg_{random_uuid()}",
                content=[output_text],
                role="assistant",
                status="completed",
                type="message",
            )
            output.append(message)
        return output

    def _make_response_output_items_with_harmony(
        self,
        context: HarmonyContext,
    ) -> list[ResponseOutputItem]:
        output_items = []
        num_init_messages = context.num_init_messages
        for msg in context.messages[num_init_messages:]:
            output_items.extend(parse_output_message(msg))
        # Handle the generation stopped in the middle (if any).
        last_items = parse_remaining_state(context.parser)
        if last_items:
            output_items.extend(last_items)
        return output_items

    def _construct_input_messages(
        self,
        request: ResponsesRequest,