[gpt-oss] Convert user input to harmony format (#22402)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Parent: ad6c655dde
Commit: f6278b6243
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -29,6 +29,7 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
 from openai.types.responses import ResponseInputImageParam
+from openai_harmony import Message as OpenAIHarmonyMessage
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable
@@ -207,7 +208,8 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):


 ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam,
-                                   CustomChatCompletionMessageParam]
+                                   CustomChatCompletionMessageParam,
+                                   OpenAIHarmonyMessage]


 # TODO: Make fields ReadOnly once mypy supports it
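Note: with the union change above, a raw openai_harmony Message is now a valid member of ChatCompletionMessageParam. A minimal sketch (the message text is an example; only the constructor already used in this commit is called):

from openai_harmony import Message, Role

# A Harmony message can now sit alongside the usual chat-completion dicts.
harmony_msg = Message.from_role_and_content(Role.USER, "What is 1 + 1?")
messages = [harmony_msg]  # accepted wherever ChatCompletionMessageParam is expected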
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -2,14 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
 from collections.abc import Iterable, Sequence
-from typing import Literal, Optional
+from typing import Literal, Optional, Union

+from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
 from openai.types.responses.tool import Tool
-from openai_harmony import (Conversation, DeveloperContent,
+from openai_harmony import (Author, Conversation, DeveloperContent,
                             HarmonyEncodingName, Message, ReasoningEffort,
                             Role, StreamableParser, SystemContent, TextContent,
                             ToolDescription, load_harmony_encoding)

+from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem,
+                                              ResponseReasoningItem)
+
 REASONING_EFFORT = {
     "high": ReasoningEffort.HIGH,
     "medium": ReasoningEffort.MEDIUM,
@@ -85,6 +89,58 @@ def get_user_message(content: str) -> Message:
     return Message.from_role_and_content(Role.USER, content)


+def parse_response_input(
+    response_msg: ResponseInputOutputItem,
+    prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]]
+) -> Message:
+    if not isinstance(response_msg, dict):
+        response_msg = response_msg.model_dump()
+    if "type" not in response_msg or response_msg["type"] == "message":
+        role = response_msg["role"]
+        content = response_msg["content"]
+        if role == "system":
+            # User is trying to set a system message. Change it to:
+            # <|start|>developer<|message|># Instructions
+            # {instructions}<|end|>
+            role = "developer"
+            text_prefix = "Instructions:\n"
+        else:
+            text_prefix = ""
+        if isinstance(content, str):
+            msg = Message.from_role_and_content(role, text_prefix + content)
+        else:
+            contents = [
+                TextContent(text=text_prefix + c["text"]) for c in content
+            ]
+            msg = Message.from_role_and_contents(role, contents)
+    elif response_msg["type"] == "function_call_output":
+        call_id = response_msg["call_id"]
+        call_response: Optional[ResponseFunctionToolCall] = None
+        for prev_response in reversed(prev_responses):
+            if isinstance(prev_response, ResponseFunctionToolCall
+                          ) and prev_response.call_id == call_id:
+                call_response = prev_response
+                break
+        if call_response is None:
+            raise ValueError(f"No call message found for {call_id}")
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{call_response.name}"),
+            response_msg["output"])
+    elif response_msg["type"] == "reasoning":
+        content = response_msg["content"]
+        assert len(content) == 1
+        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
+    elif response_msg["type"] == "function_call":
+        msg = Message.from_role_and_content(Role.ASSISTANT,
+                                            response_msg["arguments"])
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(f"functions.{response_msg['name']}")
+        msg = msg.with_content_type("json")
+    else:
+        raise ValueError(f"Unknown input type: {response_msg['type']}")
+    return msg
+
+
 def parse_chat_input(chat_msg) -> Message:
     role = chat_msg["role"]
     content = chat_msg["content"]
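A hypothetical illustration of the new parse_response_input helper: a Responses-API "function_call_output" item is matched by call_id against the earlier function_call and converted into a Harmony tool message addressed as functions.<name>. The call_id, tool name, and payloads below are made-up example values.

from openai.types.responses import ResponseFunctionToolCall
from vllm.entrypoints.harmony_utils import parse_response_input

# The earlier tool call whose result is now being fed back.
prev_outputs = [
    ResponseFunctionToolCall(
        type="function_call",
        call_id="call_1",
        name="get_weather",
        arguments='{"city": "Seoul"}',
    )
]
# The user-supplied tool result referencing that call_id.
tool_output = {
    "type": "function_call_output",
    "call_id": "call_1",
    "output": '{"temperature_c": 21}',
}
msg = parse_response_input(tool_output, prev_outputs)
# msg is a Harmony message authored by the tool "functions.get_weather".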
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -17,7 +17,8 @@ from openai.types.chat.chat_completion_audio import (
 from openai.types.chat.chat_completion_message import (
     Annotation as OpenAIAnnotation)
 # yapf: enable
-from openai.types.responses import (ResponseInputParam, ResponseOutputItem,
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseInputItemParam, ResponseOutputItem,
                                     ResponseOutputMessage, ResponsePrompt,
                                     ResponseStatus, ResponseTextConfig)
 from openai.types.responses.response import ToolChoice
@@ -234,6 +235,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
     return None


+ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
+                                           ResponseFunctionToolCall]
+
+
 class ResponsesRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/responses/create
@@ -248,7 +253,7 @@ class ResponsesRequest(OpenAIBaseModel):
            "reasoning.encrypted_content",
        ],
    ]] = None
-    input: Union[str, ResponseInputParam]
+    input: Union[str, list[ResponseInputOutputItem]]
     instructions: Optional[str] = None
     max_output_tokens: Optional[int] = None
     max_tool_calls: Optional[int] = None
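For reference, hypothetical request bodies allowed by the widened input field: it may still be a plain string, or a list of Responses-API items that now includes prior function_call entries and their function_call_output results. The model name and values are examples, not taken from the commit.

simple_request = {
    "model": "gpt-oss-120b",
    "input": "What is the capital of France?",
}

tool_round_trip_request = {
    "model": "gpt-oss-120b",
    "input": [
        {"type": "message", "role": "user",
         "content": "What's the weather in Seoul?"},
        {"type": "function_call", "call_id": "call_1",
         "name": "get_weather", "arguments": '{"city": "Seoul"}'},
        {"type": "function_call_output", "call_id": "call_1",
         "output": '{"temperature_c": 21}'},
    ],
}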
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -4,12 +4,15 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
+from copy import copy
 from http import HTTPStatus
 from typing import Callable, Final, Optional, Union

 import jinja2
 from fastapi import Request
-from openai.types.responses import ResponseOutputMessage, ResponseOutputText
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseOutputMessage, ResponseOutputText)
+from openai_harmony import Message as OpenAIHarmonyMessage

 from vllm import envs
 from vllm.config import ModelConfig
@@ -17,6 +20,10 @@ from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                          ChatTemplateContentFormatOption)
 from vllm.entrypoints.context import ConversationContext, SimpleContext
+from vllm.entrypoints.harmony_utils import (
+    get_developer_message, get_stop_tokens_for_assistant_actions,
+    get_system_message, get_user_message, parse_response_input,
+    render_for_completion)
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -30,6 +37,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.tool_server import ToolServer
+from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
@@ -103,6 +111,29 @@ class OpenAIServingResponses(OpenAIServing):
                 "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
                 "cause a memory leak since we never remove responses from "
                 "the store.")
+
+        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
+        if self.use_harmony:
+            logger.warning("For gpt-oss, we ignore --enable-auto-tool-choice "
+                           "and always enable tool use.")
+            # OpenAI models have two EOS-like tokens: <|return|> and <|call|>.
+            # We need to add them to the stop token ids.
+            if "stop_token_ids" not in self.default_sampling_params:
+                self.default_sampling_params["stop_token_ids"] = []
+            self.default_sampling_params["stop_token_ids"].extend(
+                get_stop_tokens_for_assistant_actions())
+
+        # set up tool use
+        self.enable_auto_tools: bool = enable_auto_tools
+        if self.enable_auto_tools:
+            logger.info(
+                "\"auto\" tool choice has been enabled please note that while"
+                " the parallel_tool_calls client option is preset for "
+                "compatibility reasons, it will be ignored.")
+            if not self.use_harmony:
+                raise NotImplementedError("Auto tool choice is not supported "
+                                          "yet unless using Harmony")
+
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
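A small sketch of the stop-token handling added above: gpt-oss ends assistant turns with <|return|> or <|call|>, so their ids are appended to stop_token_ids. This assumes the helper returns the token ids for those two tokens, as its use in this diff suggests.

from vllm import SamplingParams
from vllm.entrypoints.harmony_utils import get_stop_tokens_for_assistant_actions

# Stop generation at <|return|> / <|call|> in addition to the normal EOS.
sampling_params = SamplingParams(
    max_tokens=256,
    stop_token_ids=get_stop_tokens_for_assistant_actions(),
)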
@@ -165,21 +196,20 @@ class OpenAIServingResponses(OpenAIServing):
                 return self._make_not_found_error(prev_response_id)
         else:
             prev_response = None
-        # Construct the input messages.
-        messages = self._construct_input_messages(request, prev_response)
+
         try:
             lora_request = self._maybe_get_adapters(request)
             model_name = self._get_model_name(request.model, lora_request)
             tokenizer = await self.engine_client.get_tokenizer(lora_request)

-            _, request_prompts, engine_prompts = await self._preprocess_chat(
-                request,
-                tokenizer,
-                messages,
-                chat_template=self.chat_template,
-                chat_template_content_format=self.chat_template_content_format,
-            )
+            if self.use_harmony:
+                messages, request_prompts, engine_prompts = (
+                    self._make_request_with_harmony(request, prev_response))
+            else:
+                messages, request_prompts, engine_prompts = (
+                    await self._make_request(request, prev_response,
+                                             tokenizer))

         except (ValueError, TypeError, RuntimeError,
                 jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
@@ -275,6 +305,38 @@ class OpenAIServingResponses(OpenAIServing):
         except Exception as e:
             return self.create_error_response(str(e))

+    async def _make_request(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+        tokenizer: AnyTokenizer,
+    ):
+        # Construct the input messages.
+        messages = self._construct_input_messages(request, prev_response)
+        _, request_prompts, engine_prompts = await self._preprocess_chat(
+            request,
+            tokenizer,
+            messages,
+            chat_template=self.chat_template,
+            chat_template_content_format=self.chat_template_content_format,
+        )
+        return messages, request_prompts, engine_prompts
+
+    def _make_request_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ):
+        if request.tool_choice != "auto":
+            raise NotImplementedError(
+                "Only 'auto' tool_choice is supported in "
+                "response API with Harmony")
+        messages = self._construct_input_messages_with_harmony(
+            request, prev_response)
+        prompt_token_ids = render_for_completion(messages)
+        engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
+        return messages, [prompt_token_ids], [engine_prompt]
+
     async def responses_full_generator(
         self,
         request: ResponsesRequest,
@@ -411,6 +473,82 @@ class OpenAIServingResponses(OpenAIServing):
             messages.extend(request.input)  # type: ignore
         return messages

+    def _construct_input_messages_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ) -> list[OpenAIHarmonyMessage]:
+        messages: list[OpenAIHarmonyMessage] = []
+        if prev_response is None:
+            # New conversation.
+            reasoning_effort = (request.reasoning.effort
+                                if request.reasoning else None)
+            tool_types = [tool.type for tool in request.tools]
+            enable_browser = ("web_search_preview" in tool_types
+                              and self.tool_server is not None
+                              and self.tool_server.has_tool("browser"))
+            enable_code_interpreter = ("code_interpreter" in tool_types
+                                       and self.tool_server is not None
+                                       and self.tool_server.has_tool("python"))
+            sys_msg = get_system_message(
+                reasoning_effort=reasoning_effort,
+                browser_description=self.tool_server.get_tool_description(
+                    "browser")
+                if enable_browser and self.tool_server is not None else None,
+                python_description=self.tool_server.get_tool_description(
+                    "python") if enable_code_interpreter
+                and self.tool_server is not None else None,
+            )
+            messages.append(sys_msg)
+            dev_msg = get_developer_message(request.instructions,
+                                            request.tools)
+            messages.append(dev_msg)
+        else:
+            # Continue the previous conversation.
+            # FIXME(woosuk): Currently, request params like reasoning and
+            # instructions are ignored.
+            prev_msgs = self.msg_store[prev_response.id]
+            # Remove the previous chain-of-thoughts if there is a new "final"
+            # message. Note that this also removes these messages from the
+            # msg_store.
+            if len(prev_msgs) > 0:
+                last_msg = prev_msgs[-1]
+                assert isinstance(last_msg, OpenAIHarmonyMessage)
+                if last_msg.channel == "final":
+                    prev_final_msg_idx = -1
+                    for i in range(len(prev_msgs) - 2, -1, -1):
+                        prev_msg_i = prev_msgs[i]
+                        assert isinstance(prev_msg_i, OpenAIHarmonyMessage)
+                        if prev_msg_i.channel == "final":
+                            prev_final_msg_idx = i
+                            break
+                    recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1:]
+                    del prev_msgs[prev_final_msg_idx + 1:]
+                    for msg in recent_turn_msgs:
+                        assert isinstance(msg, OpenAIHarmonyMessage)
+                        if msg.channel != "analysis":
+                            prev_msgs.append(msg)
+            messages.extend(prev_msgs)
+        # Append the new input.
+        # Reponses API supports simple text inputs without chat format.
+        if isinstance(request.input, str):
+            messages.append(get_user_message(request.input))
+        else:
+            if prev_response is not None:
+                prev_outputs = copy(prev_response.output)
+            else:
+                prev_outputs = []
+            for response_msg in request.input:
+                messages.append(
+                    parse_response_input(response_msg, prev_outputs))
+                # User passes in a a tool call request and its output. We need
+                # to add the tool call request to prev_outputs so that the
+                # parse_response_input can find the tool call request when
+                # parsing the tool call output.
+                if isinstance(response_msg, ResponseFunctionToolCall):
+                    prev_outputs.append(response_msg)
+        return messages
+
     async def _run_background_request(
         self,
         request: ResponsesRequest,
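Putting the pieces together, a rough sketch of the Harmony request path introduced in this commit: build the system, developer, and user messages, render them to token ids, and wrap them as a tokens prompt for the engine. Helper names and keyword arguments are taken from this diff; the effort value and user text are examples, and the helpers' handling of None/empty defaults is an assumption.

from vllm.entrypoints.harmony_utils import (get_developer_message,
                                            get_system_message,
                                            get_user_message,
                                            render_for_completion)
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt

messages = [
    # No browser or python tools enabled in this sketch.
    get_system_message(reasoning_effort="low",
                       browser_description=None,
                       python_description=None),
    get_developer_message(None, []),  # no extra instructions, no tools
    get_user_message("What is 1 + 1?"),
]
prompt_token_ids = render_for_completion(messages)
engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
# engine_prompt is then passed to the engine like any other tokenized prompt.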