[gpt-oss] Generate ResponseOutputItem from Harmony Message (#22410)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
parent 4da8bf20d0
commit 4815b00f54
@@ -17,7 +17,7 @@ async def test_simple_input(client: openai.AsyncOpenAI):

     # Whether the output contains the reasoning.
     assert outputs[0].type == "reasoning"
-    assert outputs[0].text != ""
+    assert outputs[0].content[0].text != ""


 @pytest.mark.asyncio
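Note (illustrative, not part of this commit): the assertion changes because reasoning output items now carry their text inside a content list rather than a top-level text field. A minimal sketch of that shape, using the openai SDK types this PR imports; the id and text strings are made up:

    from openai.types.responses import ResponseReasoningItem
    from openai.types.responses.response_reasoning_item import Content

    # A reasoning item wraps its text in a list of reasoning_text contents.
    item = ResponseReasoningItem(
        id="rs_example",  # illustrative id
        summary=[],
        type="reasoning",
        content=[Content(text="thinking...", type="reasoning_text")],
    )
    assert item.type == "reasoning"
    assert item.content[0].text == "thinking..."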
@@ -1,18 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
+import json
 from collections.abc import Iterable, Sequence
 from typing import Literal, Optional, Union

-from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseOutputItem, ResponseOutputMessage,
+                                    ResponseOutputText, ResponseReasoningItem)
+from openai.types.responses.response_function_web_search import (
+    ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent)
 from openai.types.responses.tool import Tool
 from openai_harmony import (Author, Conversation, DeveloperContent,
                             HarmonyEncodingName, Message, ReasoningEffort,
                             Role, StreamableParser, SystemContent, TextContent,
                             ToolDescription, load_harmony_encoding)

-from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem,
-                                              ResponseReasoningItem)
+from vllm.entrypoints.openai.protocol import ResponseInputOutputItem
+from vllm.utils import random_uuid

 REASONING_EFFORT = {
     "high": ReasoningEffort.HIGH,
@@ -160,6 +167,146 @@ def render_for_completion(messages: list[Message]) -> list[int]:
     return token_ids


+def parse_output_message(message: Message) -> list[ResponseOutputItem]:
+    """
+    Parse a Harmony message into a list of output response items.
+    """
+    if message.author.role != "assistant":
+        # This is a message from a tool to the assistant (e.g., search result).
+        # Don't include it in the final output for now. This aligns with
+        # OpenAI's behavior on models like o4-mini.
+        return []
+
+    output_items: list[ResponseOutputItem] = []
+    recipient = message.recipient
+    if recipient is not None and recipient.startswith("browser."):
+        if len(message.content) != 1:
+            raise ValueError("Invalid number of contents in browser message")
+        content = message.content[0]
+        browser_call = json.loads(content.text)
+        # TODO: translate to url properly!
+        if recipient == "browser.search":
+            action = ActionSearch(
+                query=f"cursor:{browser_call.get('query', '')}", type="search")
+        elif recipient == "browser.open":
+            action = ActionOpenPage(
+                url=f"cursor:{browser_call.get('url', '')}", type="open_page")
+        elif recipient == "browser.find":
+            action = ActionFind(pattern=browser_call["pattern"],
+                                url=f"cursor:{browser_call.get('url', '')}",
+                                type="find")
+        else:
+            raise ValueError(f"Unknown browser action: {recipient}")
+        web_search_item = ResponseFunctionWebSearch(
+            id=f"ws_{random_uuid()}",
+            action=action,
+            status="completed",
+            type="web_search_call",
+        )
+        output_items.append(web_search_item)
+    elif message.channel == "analysis":
+        for content in message.content:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                summary=[],
+                type="reasoning",
+                content=[
+                    ResponseReasoningTextContent(text=content.text,
+                                                 type="reasoning_text")
+                ],
+                status=None,
+            )
+            output_items.append(reasoning_item)
+    elif message.channel == "commentary":
+        if message.recipient.startswith("functions."):
+            function_name = message.recipient.split(".")[-1]
+            for content in message.content:
+                random_id = random_uuid()
+                response_item = ResponseFunctionToolCall(
+                    arguments=content.text,
+                    call_id=f"call_{random_id}",
+                    type="function_call",
+                    name=function_name,
+                    id=f"ft_{random_id}",
+                )
+                output_items.append(response_item)
+        elif message.recipient.startswith(
+                "python") or message.recipient.startswith("browser"):
+            for content in message.content:
+                reasoning_item = ResponseReasoningItem(
+                    id=f"rs_{random_uuid()}",
+                    summary=[],
+                    type="reasoning",
+                    text=content.text,
+                    status=None,
+                )
+                output_items.append(reasoning_item)
+        else:
+            raise ValueError(f"Unknown recipient: {message.recipient}")
+    elif message.channel == "final":
+        contents = []
+        for content in message.content:
+            output_text = ResponseOutputText(
+                text=content.text,
+                annotations=[],  # TODO
+                type="output_text",
+                logprobs=None,  # TODO
+            )
+            contents.append(output_text)
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=contents,
+            role=message.author.role,
+            status="completed",
+            type="message",
+        )
+        output_items.append(text_item)
+    else:
+        raise ValueError(f"Unknown channel: {message.channel}")
+    return output_items
+
+
+def parse_remaining_state(
+        parser: StreamableParser) -> list[ResponseOutputItem]:
+    if not parser.current_content:
+        return []
+    if parser.current_role != Role.ASSISTANT:
+        return []
+    current_recipient = parser.current_recipient
+    if (current_recipient is not None
+            and current_recipient.startswith("browser.")):
+        return []
+
+    if parser.current_channel == "analysis":
+        reasoning_item = ResponseReasoningItem(
+            id=f"rs_{random_uuid()}",
+            summary=[],
+            type="reasoning",
+            content=[
+                ResponseReasoningTextContent(text=parser.current_content,
+                                             type="reasoning_text")
+            ],
+            status=None,
+        )
+        return [reasoning_item]
+    elif parser.current_channel == "final":
+        output_text = ResponseOutputText(
+            text=parser.current_content,
+            annotations=[],  # TODO
+            type="output_text",
+            logprobs=None,  # TODO
+        )
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=[output_text],
+            role="assistant",
+            status="completed",
+            type="message",
+        )
+        return [text_item]
+    return []
+
+
 def get_stop_tokens_for_assistant_actions() -> list[int]:
     return get_encoding().stop_tokens_for_assistant_actions()
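Note (illustrative, not part of this commit): a minimal sketch of calling the new parse_output_message helper directly, assuming openai_harmony's Message constructor helpers behave as in the rest of vLLM's harmony integration; the message text is made up:

    from openai_harmony import Message, Role

    from vllm.entrypoints.harmony_utils import parse_output_message

    # An assistant message on the "final" channel becomes a single
    # ResponseOutputMessage wrapping a ResponseOutputText part.
    msg = Message.from_role_and_content(Role.ASSISTANT,
                                        "Hello!").with_channel("final")
    items = parse_output_message(msg)
    assert items[0].type == "message"
    assert items[0].content[0].text == "Hello!"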
@@ -19,8 +19,8 @@ from openai.types.chat.chat_completion_message import (
 # yapf: enable
 from openai.types.responses import (ResponseFunctionToolCall,
                                     ResponseInputItemParam, ResponseOutputItem,
-                                    ResponseOutputMessage, ResponsePrompt,
-                                    ResponseStatus, ResponseTextConfig)
+                                    ResponsePrompt, ResponseStatus,
+                                    ResponseTextConfig)
 from openai.types.responses.response import ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning
@@ -1729,13 +1729,20 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
     usage: Optional[UsageInfo] = Field(default=None)


-class ResponseReasoningItem(OpenAIBaseModel):
-    id: str = Field(default_factory=lambda: f"rs_{random_uuid()}")
-    text: str
-    summary: list = Field(default_factory=list)
-    type: Literal["reasoning"] = "reasoning"
-    encrypted_content: Optional[str] = None
-    status: Optional[Literal["in_progress", "completed", "incomplete"]]
+class InputTokensDetails(OpenAIBaseModel):
+    cached_tokens: int
+
+
+class OutputTokensDetails(OpenAIBaseModel):
+    reasoning_tokens: int
+
+
+class ResponseUsage(OpenAIBaseModel):
+    input_tokens: int
+    input_tokens_details: InputTokensDetails
+    output_tokens: int
+    output_tokens_details: OutputTokensDetails
+    total_tokens: int


 class ResponsesResponse(OpenAIBaseModel):
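Note (illustrative, not part of this commit): the new usage models nest cached and reasoning token counts, mirroring the OpenAI Responses usage object. A short sketch with made-up numbers, importing the classes defined above:

    from vllm.entrypoints.openai.protocol import (InputTokensDetails,
                                                  OutputTokensDetails,
                                                  ResponseUsage)

    usage = ResponseUsage(
        input_tokens=12,
        output_tokens=34,
        total_tokens=46,
        input_tokens_details=InputTokensDetails(cached_tokens=0),
        output_tokens_details=OutputTokensDetails(reasoning_tokens=8),
    )
    assert usage.total_tokens == usage.input_tokens + usage.output_tokens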
@@ -1747,7 +1754,7 @@ class ResponsesResponse(OpenAIBaseModel):
     metadata: Optional[Metadata] = None
     model: str
     object: Literal["response"] = "response"
-    output: list[Union[ResponseOutputMessage, ResponseReasoningItem]]
+    output: list[ResponseOutputItem]
     parallel_tool_calls: bool
     temperature: float
     tool_choice: ToolChoice
@@ -1764,7 +1771,7 @@ class ResponsesResponse(OpenAIBaseModel):
     text: Optional[ResponseTextConfig] = None
     top_logprobs: int
     truncation: Literal["auto", "disabled"]
-    usage: Optional[UsageInfo] = None
+    usage: Optional[ResponseUsage] = None
     user: Optional[str] = None

     @classmethod
@@ -1776,7 +1783,7 @@ class ResponsesResponse(OpenAIBaseModel):
         created_time: int,
         output: list[ResponseOutputItem],
         status: ResponseStatus,
-        usage: Optional[UsageInfo] = None,
+        usage: Optional[ResponseUsage] = None,
     ) -> "ResponsesResponse":
         return cls(
             id=request.request_id,
@@ -6,12 +6,15 @@ import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from copy import copy
 from http import HTTPStatus
-from typing import Callable, Final, Optional, Union
+from typing import Any, Callable, Final, Optional, Union

 import jinja2
 from fastapi import Request
 from openai.types.responses import (ResponseFunctionToolCall,
-                                    ResponseOutputMessage, ResponseOutputText)
+                                    ResponseOutputItem, ResponseOutputMessage,
+                                    ResponseOutputText, ResponseReasoningItem)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent)
 from openai_harmony import Message as OpenAIHarmonyMessage

 from vllm import envs
@@ -19,26 +22,28 @@ from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                          ChatTemplateContentFormatOption)
-from vllm.entrypoints.context import ConversationContext, SimpleContext
+from vllm.entrypoints.context import (ConversationContext, HarmonyContext,
+                                      SimpleContext, StreamingHarmonyContext)
 from vllm.entrypoints.harmony_utils import (
     get_developer_message, get_stop_tokens_for_assistant_actions,
-    get_system_message, get_user_message, parse_response_input,
-    render_for_completion)
+    get_system_message, get_user_message, parse_output_message,
+    parse_remaining_state, parse_response_input, render_for_completion)
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
-                                              PromptTokenUsageInfo,
+                                              InputTokensDetails,
+                                              OutputTokensDetails,
                                               RequestResponseMetadata,
-                                              ResponseReasoningItem,
                                               ResponsesRequest,
-                                              ResponsesResponse, UsageInfo)
+                                              ResponsesResponse, ResponseUsage)
 # yapf: enable
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
+from vllm.outputs import CompletionOutput
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -222,6 +227,7 @@ class OpenAIServingResponses(OpenAIServing):
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[ConversationContext, None]] = []
         try:
+            tool_sessions: dict[str, Any] = {}
             for i, engine_prompt in enumerate(engine_prompts):
                 default_max_tokens = self.max_model_len - len(
                     engine_prompt["prompt_token_ids"])
@@ -231,7 +237,15 @@ class OpenAIServingResponses(OpenAIServing):
                 trace_headers = (None if raw_request is None else await
                                  self._get_trace_headers(raw_request.headers))

-                context = SimpleContext()
+                context: ConversationContext
+                if self.use_harmony:
+                    if request.stream:
+                        context = StreamingHarmonyContext(
+                            messages, tool_sessions)
+                    else:
+                        context = HarmonyContext(messages, tool_sessions)
+                else:
+                    context = SimpleContext()
                 generator = self._generate_with_builtin_tools(
                     request_id=request.request_id,
                     request_prompt=request_prompts[i],
@@ -274,6 +288,7 @@ class OpenAIServingResponses(OpenAIServing):
                 request,
                 sampling_params,
                 result_generator,
+                context,
                 model_name,
                 tokenizer,
                 request_metadata,
@@ -297,6 +312,7 @@ class OpenAIServingResponses(OpenAIServing):
                 request,
                 sampling_params,
                 result_generator,
+                context,
                 model_name,
                 tokenizer,
                 request_metadata,
@@ -344,6 +360,7 @@ class OpenAIServingResponses(OpenAIServing):
         request: ResponsesRequest,
         sampling_params: SamplingParams,
         result_generator: AsyncIterator[ConversationContext],
+        context: ConversationContext,
         model_name: str,
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
@@ -352,9 +369,8 @@ class OpenAIServingResponses(OpenAIServing):
         if created_time is None:
             created_time = int(time.time())

-        context: Optional[ConversationContext] = None
         try:
-            async for context in result_generator:
+            async for _ in result_generator:
                 pass
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
@@ -362,64 +378,40 @@ class OpenAIServingResponses(OpenAIServing):
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))

-        assert context is not None
-        assert isinstance(context, SimpleContext)
-        final_res = context.last_output
-        assert final_res is not None
-        assert len(final_res.outputs) == 1
-        final_output = final_res.outputs[0]
-
-        if self.reasoning_parser:
-            try:
-                reasoning_parser = self.reasoning_parser(tokenizer)
-            except RuntimeError as e:
-                logger.exception("Error in reasoning parser creation.")
-                return self.create_error_response(str(e))
-
-            reasoning_content, content = (
-                reasoning_parser.extract_reasoning_content(final_output.text,
-                                                           request=request))
-        else:
-            reasoning_content = None
-            content = final_output.text
-
-        output = []
-        if reasoning_content:
-            reasoning_item = ResponseReasoningItem(
-                text=reasoning_content,
-                status=None,  # NOTE: Only the last output item has status.
-            )
-            output.append(reasoning_item)
-        if content:
-            output_text = ResponseOutputText(
-                text=content,
-                annotations=[],  # TODO
-                type="output_text",
-                logprobs=None,  # TODO
-            )
-            message = ResponseOutputMessage(
-                id=f"msg_{random_uuid()}",
-                content=[output_text],
-                role="assistant",
-                status="completed",
-                type="message",
-            )
-            output.append(message)
-
-        # Calculate usage.
-        assert final_res.prompt_token_ids is not None
-        num_prompt_tokens = len(final_res.prompt_token_ids)
-        num_generated_tokens = len(final_output.token_ids)
-        usage = UsageInfo(
-            prompt_tokens=num_prompt_tokens,
-            completion_tokens=num_generated_tokens,
-            total_tokens=num_prompt_tokens + num_generated_tokens,
-        )
-        if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
-            usage.prompt_tokens_details = PromptTokenUsageInfo(
-                cached_tokens=final_res.num_cached_tokens)
-        request_metadata.final_usage_info = usage
+        if self.use_harmony:
+            assert isinstance(context, HarmonyContext)
+            output = self._make_response_output_items_with_harmony(context)
+            # TODO: these are all 0 for now!
+            num_prompt_tokens = context.num_prompt_tokens
+            num_generated_tokens = context.num_output_tokens
+            num_cached_tokens = context.num_cached_tokens
+            num_reasoning_tokens = context.num_reasoning_tokens
+        else:
+            assert isinstance(context, SimpleContext)
+            final_res = context.last_output
+            assert final_res is not None
+            assert len(final_res.outputs) == 1
+            final_output = final_res.outputs[0]
+
+            output = self._make_response_output_items(request, final_output,
+                                                      tokenizer)
+
+            # Calculate usage.
+            assert final_res.prompt_token_ids is not None
+            num_prompt_tokens = len(final_res.prompt_token_ids)
+            num_generated_tokens = len(final_output.token_ids)
+            num_cached_tokens = final_res.num_cached_tokens
+            num_reasoning_tokens = 0
+
+        usage = ResponseUsage(
+            input_tokens=num_prompt_tokens,
+            output_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+            input_tokens_details=InputTokensDetails(
+                cached_tokens=num_cached_tokens),
+            output_tokens_details=OutputTokensDetails(
+                reasoning_tokens=num_reasoning_tokens),
+        )

         response = ResponsesResponse.from_request(
             request,
             sampling_params,
@@ -457,6 +449,70 @@ class OpenAIServingResponses(OpenAIServing):
             self.response_store[response.id] = response
         return response

+    def _make_response_output_items(
+        self,
+        request: ResponsesRequest,
+        final_output: CompletionOutput,
+        tokenizer: AnyTokenizer,
+    ) -> list[ResponseOutputItem]:
+        if self.reasoning_parser:
+            try:
+                reasoning_parser = self.reasoning_parser(tokenizer)
+            except RuntimeError as e:
+                logger.exception("Error in reasoning parser creation.")
+                raise e
+
+            reasoning_content, content = (
+                reasoning_parser.extract_reasoning_content(final_output.text,
+                                                           request=request))
+        else:
+            reasoning_content = None
+            content = final_output.text
+
+        output = []
+        if reasoning_content:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                summary=[],
+                type="reasoning",
+                content=[
+                    ResponseReasoningTextContent(text=reasoning_content,
+                                                 type="reasoning_text")
+                ],
+                status=None,  # NOTE: Only the last output item has status.
+            )
+            output.append(reasoning_item)
+        if content:
+            output_text = ResponseOutputText(
+                text=content,
+                annotations=[],  # TODO
+                type="output_text",
+                logprobs=None,  # TODO
+            )
+            message = ResponseOutputMessage(
+                id=f"msg_{random_uuid()}",
+                content=[output_text],
+                role="assistant",
+                status="completed",
+                type="message",
+            )
+            output.append(message)
+        return output
+
+    def _make_response_output_items_with_harmony(
+        self,
+        context: HarmonyContext,
+    ) -> list[ResponseOutputItem]:
+        output_items = []
+        num_init_messages = context.num_init_messages
+        for msg in context.messages[num_init_messages:]:
+            output_items.extend(parse_output_message(msg))
+        # Handle the generation stopped in the middle (if any).
+        last_items = parse_remaining_state(context.parser)
+        if last_items:
+            output_items.extend(last_items)
+        return output_items
+
     def _construct_input_messages(
         self,
         request: ResponsesRequest,
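Note (illustrative, not part of this commit): a hedged end-to-end sketch of querying the Responses API of a locally running vLLM server that serves a harmony-style (gpt-oss) model. The base URL, API key, and prompt are placeholders, and the call shape follows the updated test at the top of this diff:

    import asyncio

    import openai

    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")


    async def main() -> None:
        # The server fills in its default model, as in the updated test.
        response = await client.responses.create(input="What is 13 * 24?")
        outputs = response.output
        if outputs and outputs[0].type == "reasoning":
            # Reasoning text now lives under content[0].text.
            print(outputs[0].content[0].text)
        if response.usage is not None:
            # Usage now reports cached and reasoning token details.
            print(response.usage.output_tokens_details.reasoning_tokens)


    asyncio.run(main())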