[gpt-oss] Generate ResponseOutputItem from Harmony Message (#22410)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Chen Zhang 2025-08-07 08:33:25 -07:00 committed by GitHub
parent 4da8bf20d0
commit 4815b00f54
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 290 additions and 80 deletions

View File

@@ -17,7 +17,7 @@ async def test_simple_input(client: openai.AsyncOpenAI):
    # Whether the output contains the reasoning.
    assert outputs[0].type == "reasoning"
-    assert outputs[0].text != ""
+    assert outputs[0].content[0].text != ""

@pytest.mark.asyncio
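
The updated assertion reflects the new output shape: a reasoning item now carries its text under content[0].text instead of a top-level text field. A minimal client-side sketch (illustrative only, not part of the diff; the base URL and model name are placeholders for a locally served gpt-oss model):

import asyncio

import openai

# Placeholder endpoint and model name for a local vLLM server.
client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                            api_key="EMPTY")


async def show_reasoning() -> None:
    response = await client.responses.create(model="gpt-oss",
                                             input="What is 13 * 24?")
    reasoning = response.output[0]
    assert reasoning.type == "reasoning"
    # New shape: the reasoning text lives in the item's content list.
    print(reasoning.content[0].text)


asyncio.run(show_reasoning())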

View File

@@ -1,18 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
+import json
from collections.abc import Iterable, Sequence
from typing import Literal, Optional, Union

-from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseOutputItem, ResponseOutputMessage,
+                                    ResponseOutputText, ResponseReasoningItem)
+from openai.types.responses.response_function_web_search import (
+    ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent)
from openai.types.responses.tool import Tool
from openai_harmony import (Author, Conversation, DeveloperContent,
                            HarmonyEncodingName, Message, ReasoningEffort,
                            Role, StreamableParser, SystemContent, TextContent,
                            ToolDescription, load_harmony_encoding)

-from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem,
-                                              ResponseReasoningItem)
+from vllm.entrypoints.openai.protocol import ResponseInputOutputItem
+from vllm.utils import random_uuid

REASONING_EFFORT = {
    "high": ReasoningEffort.HIGH,
@@ -160,6 +167,146 @@ def render_for_completion(messages: list[Message]) -> list[int]:
    return token_ids


def parse_output_message(message: Message) -> list[ResponseOutputItem]:
    """
    Parse a Harmony message into a list of output response items.
    """
    if message.author.role != "assistant":
        # This is a message from a tool to the assistant (e.g., search result).
        # Don't include it in the final output for now. This aligns with
        # OpenAI's behavior on models like o4-mini.
        return []

    output_items: list[ResponseOutputItem] = []
    recipient = message.recipient
    if recipient is not None and recipient.startswith("browser."):
        if len(message.content) != 1:
            raise ValueError("Invalid number of contents in browser message")
        content = message.content[0]
        browser_call = json.loads(content.text)
        # TODO: translate to url properly!
        if recipient == "browser.search":
            action = ActionSearch(
                query=f"cursor:{browser_call.get('query', '')}", type="search")
        elif recipient == "browser.open":
            action = ActionOpenPage(
                url=f"cursor:{browser_call.get('url', '')}", type="open_page")
        elif recipient == "browser.find":
            action = ActionFind(pattern=browser_call["pattern"],
                                url=f"cursor:{browser_call.get('url', '')}",
                                type="find")
        else:
            raise ValueError(f"Unknown browser action: {recipient}")
        web_search_item = ResponseFunctionWebSearch(
            id=f"ws_{random_uuid()}",
            action=action,
            status="completed",
            type="web_search_call",
        )
        output_items.append(web_search_item)
    elif message.channel == "analysis":
        for content in message.content:
            reasoning_item = ResponseReasoningItem(
                id=f"rs_{random_uuid()}",
                summary=[],
                type="reasoning",
                content=[
                    ResponseReasoningTextContent(text=content.text,
                                                 type="reasoning_text")
                ],
                status=None,
            )
            output_items.append(reasoning_item)
    elif message.channel == "commentary":
        if message.recipient.startswith("functions."):
            function_name = message.recipient.split(".")[-1]
            for content in message.content:
                random_id = random_uuid()
                response_item = ResponseFunctionToolCall(
                    arguments=content.text,
                    call_id=f"call_{random_id}",
                    type="function_call",
                    name=function_name,
                    id=f"ft_{random_id}",
                )
                output_items.append(response_item)
        elif message.recipient.startswith(
                "python") or message.recipient.startswith("browser"):
            for content in message.content:
                reasoning_item = ResponseReasoningItem(
                    id=f"rs_{random_uuid()}",
                    summary=[],
                    type="reasoning",
                    text=content.text,
                    status=None,
                )
                output_items.append(reasoning_item)
        else:
            raise ValueError(f"Unknown recipient: {message.recipient}")
    elif message.channel == "final":
        contents = []
        for content in message.content:
            output_text = ResponseOutputText(
                text=content.text,
                annotations=[],  # TODO
                type="output_text",
                logprobs=None,  # TODO
            )
            contents.append(output_text)
        text_item = ResponseOutputMessage(
            id=f"msg_{random_uuid()}",
            content=contents,
            role=message.author.role,
            status="completed",
            type="message",
        )
        output_items.append(text_item)
    else:
        raise ValueError(f"Unknown channel: {message.channel}")
    return output_items


def parse_remaining_state(
        parser: StreamableParser) -> list[ResponseOutputItem]:
    if not parser.current_content:
        return []
    if parser.current_role != Role.ASSISTANT:
        return []
    current_recipient = parser.current_recipient
    if (current_recipient is not None
            and current_recipient.startswith("browser.")):
        return []

    if parser.current_channel == "analysis":
        reasoning_item = ResponseReasoningItem(
            id=f"rs_{random_uuid()}",
            summary=[],
            type="reasoning",
            content=[
                ResponseReasoningTextContent(text=parser.current_content,
                                             type="reasoning_text")
            ],
            status=None,
        )
        return [reasoning_item]
    elif parser.current_channel == "final":
        output_text = ResponseOutputText(
            text=parser.current_content,
            annotations=[],  # TODO
            type="output_text",
            logprobs=None,  # TODO
        )
        text_item = ResponseOutputMessage(
            id=f"msg_{random_uuid()}",
            content=[output_text],
            role="assistant",
            status="completed",
            type="message",
        )
        return [text_item]
    return []


def get_stop_tokens_for_assistant_actions() -> list[int]:
    return get_encoding().stop_tokens_for_assistant_actions()
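
For orientation, a small sketch of how parse_output_message maps Harmony channels onto Responses API items. It is not part of the diff and assumes the openai_harmony Message helpers (from_role_and_content, with_channel) behave as in the library's documented examples and that assistant roles compare equal to the string "assistant", as the role check above expects.

from openai_harmony import Message, Role

from vllm.entrypoints.harmony_utils import parse_output_message

# An "analysis" channel message should become a reasoning item whose text
# is stored as a reasoning_text content entry.
analysis_msg = Message.from_role_and_content(
    Role.ASSISTANT, "The user wants the capital of France.").with_channel(
        "analysis")
print(parse_output_message(analysis_msg))  # expected: one "reasoning" item

# A "final" channel message should become a ResponseOutputMessage wrapping
# a ResponseOutputText entry.
final_msg = Message.from_role_and_content(
    Role.ASSISTANT, "The capital of France is Paris.").with_channel("final")
print(parse_output_message(final_msg))  # expected: one "message" item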

View File

@@ -19,8 +19,8 @@ from openai.types.chat.chat_completion_message import (
# yapf: enable
from openai.types.responses import (ResponseFunctionToolCall,
                                    ResponseInputItemParam, ResponseOutputItem,
-                                    ResponseOutputMessage, ResponsePrompt,
-                                    ResponseStatus, ResponseTextConfig)
+                                    ResponsePrompt, ResponseStatus,
+                                    ResponseTextConfig)
from openai.types.responses.response import ToolChoice
from openai.types.responses.tool import Tool
from openai.types.shared import Metadata, Reasoning
@@ -1729,13 +1729,20 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
    usage: Optional[UsageInfo] = Field(default=None)


-class ResponseReasoningItem(OpenAIBaseModel):
-    id: str = Field(default_factory=lambda: f"rs_{random_uuid()}")
-    text: str
-    summary: list = Field(default_factory=list)
-    type: Literal["reasoning"] = "reasoning"
-    encrypted_content: Optional[str] = None
-    status: Optional[Literal["in_progress", "completed", "incomplete"]]
+class InputTokensDetails(OpenAIBaseModel):
+    cached_tokens: int
+
+
+class OutputTokensDetails(OpenAIBaseModel):
+    reasoning_tokens: int
+
+
+class ResponseUsage(OpenAIBaseModel):
+    input_tokens: int
+    input_tokens_details: InputTokensDetails
+    output_tokens: int
+    output_tokens_details: OutputTokensDetails
+    total_tokens: int


class ResponsesResponse(OpenAIBaseModel):
@@ -1747,7 +1754,7 @@ class ResponsesResponse(OpenAIBaseModel):
    metadata: Optional[Metadata] = None
    model: str
    object: Literal["response"] = "response"
-    output: list[Union[ResponseOutputMessage, ResponseReasoningItem]]
+    output: list[ResponseOutputItem]
    parallel_tool_calls: bool
    temperature: float
    tool_choice: ToolChoice
@@ -1764,7 +1771,7 @@
    text: Optional[ResponseTextConfig] = None
    top_logprobs: int
    truncation: Literal["auto", "disabled"]
-    usage: Optional[UsageInfo] = None
+    usage: Optional[ResponseUsage] = None
    user: Optional[str] = None

    @classmethod
@@ -1776,7 +1783,7 @@ class ResponsesResponse(OpenAIBaseModel):
        created_time: int,
        output: list[ResponseOutputItem],
        status: ResponseStatus,
-        usage: Optional[UsageInfo] = None,
+        usage: Optional[ResponseUsage] = None,
    ) -> "ResponsesResponse":
        return cls(
            id=request.request_id,
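
The flat UsageInfo is replaced by a nested ResponseUsage that matches the Responses API: cached prompt tokens move under input_tokens_details and reasoning tokens get their own bucket under output_tokens_details. A small sketch using the classes added above (the token counts are made up for illustration):

from vllm.entrypoints.openai.protocol import (InputTokensDetails,
                                              OutputTokensDetails,
                                              ResponseUsage)

usage = ResponseUsage(
    input_tokens=128,
    output_tokens=64,
    total_tokens=128 + 64,
    # Previously reported via PromptTokenUsageInfo on UsageInfo.
    input_tokens_details=InputTokensDetails(cached_tokens=32),
    output_tokens_details=OutputTokensDetails(reasoning_tokens=16),
)
print(usage.model_dump_json())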

View File

@@ -6,12 +6,15 @@ import time
from collections.abc import AsyncGenerator, AsyncIterator
from copy import copy
from http import HTTPStatus
-from typing import Callable, Final, Optional, Union
+from typing import Any, Callable, Final, Optional, Union

import jinja2
from fastapi import Request
from openai.types.responses import (ResponseFunctionToolCall,
-                                    ResponseOutputMessage, ResponseOutputText)
+                                    ResponseOutputItem, ResponseOutputMessage,
+                                    ResponseOutputText, ResponseReasoningItem)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent)
from openai_harmony import Message as OpenAIHarmonyMessage

from vllm import envs
@@ -19,26 +22,28 @@ from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                         ChatTemplateContentFormatOption)
-from vllm.entrypoints.context import ConversationContext, SimpleContext
+from vllm.entrypoints.context import (ConversationContext, HarmonyContext,
+                                      SimpleContext, StreamingHarmonyContext)
from vllm.entrypoints.harmony_utils import (
    get_developer_message, get_stop_tokens_for_assistant_actions,
-    get_system_message, get_user_message, parse_response_input,
-    render_for_completion)
+    get_system_message, get_user_message, parse_output_message,
+    parse_remaining_state, parse_response_input, render_for_completion)
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import (ErrorResponse,
-                                              PromptTokenUsageInfo,
+                                              InputTokensDetails,
+                                              OutputTokensDetails,
                                               RequestResponseMetadata,
-                                              ResponseReasoningItem,
                                               ResponsesRequest,
-                                              ResponsesResponse, UsageInfo)
+                                              ResponsesResponse, ResponseUsage)
# yapf: enable
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.logger import init_logger
+from vllm.outputs import CompletionOutput
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -222,6 +227,7 @@ class OpenAIServingResponses(OpenAIServing):
        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[ConversationContext, None]] = []
        try:
+            tool_sessions: dict[str, Any] = {}
            for i, engine_prompt in enumerate(engine_prompts):
                default_max_tokens = self.max_model_len - len(
                    engine_prompt["prompt_token_ids"])
@@ -231,7 +237,15 @@ class OpenAIServingResponses(OpenAIServing):
                trace_headers = (None if raw_request is None else await
                                 self._get_trace_headers(raw_request.headers))

-                context = SimpleContext()
+                context: ConversationContext
+                if self.use_harmony:
+                    if request.stream:
+                        context = StreamingHarmonyContext(
+                            messages, tool_sessions)
+                    else:
+                        context = HarmonyContext(messages, tool_sessions)
+                else:
+                    context = SimpleContext()
                generator = self._generate_with_builtin_tools(
                    request_id=request.request_id,
                    request_prompt=request_prompts[i],
@@ -274,6 +288,7 @@
                request,
                sampling_params,
                result_generator,
+                context,
                model_name,
                tokenizer,
                request_metadata,
@@ -297,6 +312,7 @@
                request,
                sampling_params,
                result_generator,
+                context,
                model_name,
                tokenizer,
                request_metadata,
@@ -344,6 +360,7 @@
        request: ResponsesRequest,
        sampling_params: SamplingParams,
        result_generator: AsyncIterator[ConversationContext],
+        context: ConversationContext,
        model_name: str,
        tokenizer: AnyTokenizer,
        request_metadata: RequestResponseMetadata,
@@ -352,9 +369,8 @@ class OpenAIServingResponses(OpenAIServing):
        if created_time is None:
            created_time = int(time.time())

-        context: Optional[ConversationContext] = None
        try:
-            async for context in result_generator:
+            async for _ in result_generator:
                pass
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
@@ -362,64 +378,40 @@
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

-        assert context is not None
-        assert isinstance(context, SimpleContext)
-        final_res = context.last_output
-        assert final_res is not None
-        assert len(final_res.outputs) == 1
-        final_output = final_res.outputs[0]
-
-        if self.reasoning_parser:
-            try:
-                reasoning_parser = self.reasoning_parser(tokenizer)
-            except RuntimeError as e:
-                logger.exception("Error in reasoning parser creation.")
-                return self.create_error_response(str(e))
-            reasoning_content, content = (
-                reasoning_parser.extract_reasoning_content(final_output.text,
-                                                           request=request))
-        else:
-            reasoning_content = None
-            content = final_output.text
-
-        output = []
-        if reasoning_content:
-            reasoning_item = ResponseReasoningItem(
-                text=reasoning_content,
-                status=None,  # NOTE: Only the last output item has status.
-            )
-            output.append(reasoning_item)
-        if content:
-            output_text = ResponseOutputText(
-                text=content,
-                annotations=[],  # TODO
-                type="output_text",
-                logprobs=None,  # TODO
-            )
-            message = ResponseOutputMessage(
-                id=f"msg_{random_uuid()}",
-                content=[output_text],
-                role="assistant",
-                status="completed",
-                type="message",
-            )
-            output.append(message)
-
-        # Calculate usage.
-        assert final_res.prompt_token_ids is not None
-        num_prompt_tokens = len(final_res.prompt_token_ids)
-        num_generated_tokens = len(final_output.token_ids)
-        usage = UsageInfo(
-            prompt_tokens=num_prompt_tokens,
-            completion_tokens=num_generated_tokens,
-            total_tokens=num_prompt_tokens + num_generated_tokens,
-        )
-        if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
-            usage.prompt_tokens_details = PromptTokenUsageInfo(
-                cached_tokens=final_res.num_cached_tokens)
+        if self.use_harmony:
+            assert isinstance(context, HarmonyContext)
+            output = self._make_response_output_items_with_harmony(context)
+            # TODO: these are all 0 for now!
+            num_prompt_tokens = context.num_prompt_tokens
+            num_generated_tokens = context.num_output_tokens
+            num_cached_tokens = context.num_cached_tokens
+            num_reasoning_tokens = context.num_reasoning_tokens
+        else:
+            assert isinstance(context, SimpleContext)
+            final_res = context.last_output
+            assert final_res is not None
+            assert len(final_res.outputs) == 1
+            final_output = final_res.outputs[0]
+            output = self._make_response_output_items(request, final_output,
+                                                      tokenizer)
+
+            # Calculate usage.
+            assert final_res.prompt_token_ids is not None
+            num_prompt_tokens = len(final_res.prompt_token_ids)
+            num_generated_tokens = len(final_output.token_ids)
+            num_cached_tokens = final_res.num_cached_tokens
+            num_reasoning_tokens = 0
+
+        usage = ResponseUsage(
+            input_tokens=num_prompt_tokens,
+            output_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+            input_tokens_details=InputTokensDetails(
+                cached_tokens=num_cached_tokens),
+            output_tokens_details=OutputTokensDetails(
+                reasoning_tokens=num_reasoning_tokens),
+        )
        request_metadata.final_usage_info = usage

        response = ResponsesResponse.from_request(
            request,
            sampling_params,
@@ -457,6 +449,70 @@
            self.response_store[response.id] = response
        return response

    def _make_response_output_items(
        self,
        request: ResponsesRequest,
        final_output: CompletionOutput,
        tokenizer: AnyTokenizer,
    ) -> list[ResponseOutputItem]:
        if self.reasoning_parser:
            try:
                reasoning_parser = self.reasoning_parser(tokenizer)
            except RuntimeError as e:
                logger.exception("Error in reasoning parser creation.")
                raise e
            reasoning_content, content = (
                reasoning_parser.extract_reasoning_content(final_output.text,
                                                           request=request))
        else:
            reasoning_content = None
            content = final_output.text

        output = []
        if reasoning_content:
            reasoning_item = ResponseReasoningItem(
                id=f"rs_{random_uuid()}",
                summary=[],
                type="reasoning",
                content=[
                    ResponseReasoningTextContent(text=reasoning_content,
                                                 type="reasoning_text")
                ],
                status=None,  # NOTE: Only the last output item has status.
            )
            output.append(reasoning_item)
        if content:
            output_text = ResponseOutputText(
                text=content,
                annotations=[],  # TODO
                type="output_text",
                logprobs=None,  # TODO
            )
            message = ResponseOutputMessage(
                id=f"msg_{random_uuid()}",
                content=[output_text],
                role="assistant",
                status="completed",
                type="message",
            )
            output.append(message)
        return output

    def _make_response_output_items_with_harmony(
        self,
        context: HarmonyContext,
    ) -> list[ResponseOutputItem]:
        output_items = []
        num_init_messages = context.num_init_messages
        for msg in context.messages[num_init_messages:]:
            output_items.extend(parse_output_message(msg))
        # Handle the generation stopped in the middle (if any).
        last_items = parse_remaining_state(context.parser)
        if last_items:
            output_items.extend(last_items)
        return output_items
    def _construct_input_messages(
        self,
        request: ResponsesRequest,
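
To summarize the Harmony output path added above: _make_response_output_items_with_harmony walks the messages generated after the initial system/developer/input messages, converts each one with parse_output_message, and then recovers any partially generated message from the streaming parser via parse_remaining_state. A rough standalone sketch of that flow (assemble_output_items is a hypothetical helper, not part of the diff):

from openai_harmony import Message, StreamableParser

from vllm.entrypoints.harmony_utils import (parse_output_message,
                                            parse_remaining_state)


def assemble_output_items(generated_messages: list[Message],
                          parser: StreamableParser) -> list:
    items = []
    # Each fully parsed assistant message yields zero or more output items
    # (reasoning, function calls, web-search calls, or final messages).
    for msg in generated_messages:
        items.extend(parse_output_message(msg))
    # If generation stopped mid-message (stop token, length limit), recover
    # the partial content still buffered in the streaming parser.
    items.extend(parse_remaining_state(parser))
    return items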