[gpt-oss][1/N] EZ: refactor serving_responses for modularity (#26948)

Signed-off-by: Andrew Xia <axia@meta.com>
Author: Andrew Xia (committed by GitHub)
Date: 2025-10-16 11:44:06 -07:00
Parent: aa255ff55a
Commit: e6ba2000ae

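In short: a behavior-preserving extract-method refactor. Two early-return validations move out of create_responses into a new _validate_create_responses_input helper, and the harmony system-message construction moves into _construct_harmony_system_input_message. A minimal standalone sketch of the validate-then-dispatch shape the commit adopts (Request and Server below are illustrative stand-ins, not the vLLM types):

    from dataclasses import dataclass
    from http import HTTPStatus


    @dataclass
    class Request:
        store: bool = True        # store=True is the client-side default
        background: bool = False


    class Server:
        enable_store = False

        def _validate_input(self, request: Request) -> dict | None:
            # All early-return checks live in one place; None means "valid".
            if request.store and not self.enable_store and request.background:
                return {
                    "err_type": "invalid_request_error",
                    "status_code": HTTPStatus.BAD_REQUEST,
                }
            return None

        def create(self, request: Request) -> dict:
            error = self._validate_input(request)
            if error is not None:
                return error
            return {"status": "ok"}


    server = Server()
    print(server.create(Request(background=True)))   # error payload
    print(server.create(Request(background=False)))  # {'status': 'ok'}

The helper returns an error payload or None, so each caller shrinks to a two-line guard.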

@@ -227,6 +227,29 @@ class OpenAIServingResponses(OpenAIServing):
         )
         return None
 
+    def _validate_create_responses_input(
+        self, request: ResponsesRequest
+    ) -> ErrorResponse | None:
+        if self.use_harmony and request.is_include_output_logprobs():
+            return self.create_error_response(
+                err_type="invalid_request_error",
+                message="logprobs are not supported with gpt-oss models",
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+        if request.store and not self.enable_store and request.background:
+            return self.create_error_response(
+                err_type="invalid_request_error",
+                message=(
+                    "This vLLM engine does not support `store=True` and "
+                    "therefore does not support the background mode. To "
+                    "enable these features, set the environment variable "
+                    "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
+                    "the vLLM server."
+                ),
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+        return None
+
     async def create_responses(
         self,
         request: ResponsesRequest,
@@ -240,6 +263,9 @@ class OpenAIServingResponses(OpenAIServing):
         if error_check_ret is not None:
             logger.error("Error with model %s", error_check_ret)
             return error_check_ret
+        maybe_validation_error = self._validate_create_responses_input(request)
+        if maybe_validation_error is not None:
+            return maybe_validation_error
 
         # If the engine is dead, raise the engine's DEAD_ERROR.
         # This is required for the streaming case, where we return a
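One payoff of the extraction: the checks can now be exercised without starting an engine. A hedged sketch of such a test (not part of this commit; it assumes the class is importable from vllm.entrypoints.openai.serving_responses and that calling the helper unbound on a mock is acceptable):

    from unittest.mock import MagicMock

    from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses


    def test_background_mode_requires_store_support():
        # Mock stands in for a serving instance with store disabled.
        serving = MagicMock()
        serving.use_harmony = False
        serving.enable_store = False
        serving.create_error_response.return_value = "error-response"

        request = MagicMock(store=True, background=True)
        result = OpenAIServingResponses._validate_create_responses_input(
            serving, request
        )
        assert result == "error-response"
        serving.create_error_response.assert_called_once()

        # The same settings without background mode pass validation.
        ok_request = MagicMock(store=True, background=False)
        assert (
            OpenAIServingResponses._validate_create_responses_input(
                serving, ok_request
            )
            is None
        )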
@@ -248,18 +274,6 @@ class OpenAIServingResponses(OpenAIServing):
             raise self.engine_client.dead_error
 
         if request.store and not self.enable_store:
-            if request.background:
-                return self.create_error_response(
-                    err_type="invalid_request_error",
-                    message=(
-                        "This vLLM engine does not support `store=True` and "
-                        "therefore does not support the background mode. To "
-                        "enable these features, set the environment variable "
-                        "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
-                        "the vLLM server."
-                    ),
-                    status_code=HTTPStatus.BAD_REQUEST,
-                )
             # Disable the store option.
             # NOTE(woosuk): Although returning an error is possible, we opted
             # to implicitly disable store and process the request anyway, as
@@ -267,12 +281,6 @@ class OpenAIServingResponses(OpenAIServing):
             # (i.e., their request's `store=True` just because it's the default
             # value).
             request.store = False
-        if self.use_harmony and request.is_include_output_logprobs():
-            return self.create_error_response(
-                err_type="invalid_request_error",
-                message="logprobs are not supported with gpt-oss models",
-                status_code=HTTPStatus.BAD_REQUEST,
-            )
 
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
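Note the ordering the refactor preserves: the hoisted background check runs before this branch, so it still observes the client's original store=True rather than the downgraded value. A tiny sketch of why the silent downgrade is safe for typical clients (helper name is illustrative):

    def effective_store(requested_store: bool, server_supports_store: bool) -> bool:
        # store=True is the client-side default, so most users never asked
        # for persistence explicitly; quietly drop it when unsupported.
        return requested_store and server_supports_store


    assert effective_store(True, False) is False   # silently downgraded
    assert effective_store(False, False) is False  # nothing to downgrade
    assert effective_store(True, True) is True     # honored when supported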
@@ -849,6 +857,47 @@ class OpenAIServingResponses(OpenAIServing):
             messages.extend(request.input)  # type: ignore
         return messages
 
+    def _construct_harmony_system_input_message(
+        self, request: ResponsesRequest, with_custom_tools: bool, tool_types: list[str]
+    ) -> OpenAIHarmonyMessage:
+        reasoning_effort = request.reasoning.effort if request.reasoning else None
+        enable_browser = (
+            "web_search_preview" in tool_types
+            and self.tool_server is not None
+            and self.tool_server.has_tool("browser")
+        )
+        enable_code_interpreter = (
+            "code_interpreter" in tool_types
+            and self.tool_server is not None
+            and self.tool_server.has_tool("python")
+        )
+        enable_container = (
+            "container" in tool_types
+            and self.tool_server is not None
+            and self.tool_server.has_tool("container")
+        )
+        sys_msg = get_system_message(
+            reasoning_effort=reasoning_effort,
+            browser_description=(
+                self.tool_server.get_tool_description("browser")
+                if enable_browser and self.tool_server is not None
+                else None
+            ),
+            python_description=(
+                self.tool_server.get_tool_description("python")
+                if enable_code_interpreter and self.tool_server is not None
+                else None
+            ),
+            container_description=(
+                self.tool_server.get_tool_description("container")
+                if enable_container and self.tool_server is not None
+                else None
+            ),
+            instructions=request.instructions,
+            with_custom_tools=with_custom_tools,
+        )
+        return sys_msg
+
     def _construct_input_messages_with_harmony(
         self,
         request: ResponsesRequest,
@@ -857,9 +906,7 @@ class OpenAIServingResponses(OpenAIServing):
         messages: list[OpenAIHarmonyMessage] = []
         if prev_response is None:
             # New conversation.
-            reasoning_effort = request.reasoning.effort if request.reasoning else None
-
             tool_types = [tool.type for tool in request.tools]
             # Allow the MCP Tool type to enable built in tools if the
             # server_label is allowlisted in
             # envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS
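A standalone sketch of the allowlist gating the comment above describes (the Tool dataclass is an illustrative stand-in; in vLLM the allowlist comes from envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS):

    from dataclasses import dataclass


    @dataclass
    class Tool:
        type: str
        server_label: str = ""


    def extra_tool_types(tools: list[Tool], allowlist: set[str]) -> list[str]:
        # An MCP tool contributes its server_label as an extra tool type
        # only when that label appears in the allowlist.
        return [
            t.server_label
            for t in tools
            if t.type == "mcp" and t.server_label in allowlist
        ]


    print(extra_tool_types(
        [Tool("mcp", "browser"), Tool("mcp", "internal")], {"browser"}
    ))
    # ['browser']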
@@ -870,41 +917,10 @@ class OpenAIServingResponses(OpenAIServing):
                     and tool.server_label in envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS
                 ):
                     tool_types.append(tool.server_label)
-            enable_browser = (
-                "web_search_preview" in tool_types
-                and self.tool_server is not None
-                and self.tool_server.has_tool("browser")
-            )
-            enable_code_interpreter = (
-                "code_interpreter" in tool_types
-                and self.tool_server is not None
-                and self.tool_server.has_tool("python")
-            )
-            enable_container = (
-                "container" in tool_types
-                and self.tool_server is not None
-                and self.tool_server.has_tool("container")
-            )
+
             with_custom_tools = has_custom_tools(tool_types)
-            sys_msg = get_system_message(
-                reasoning_effort=reasoning_effort,
-                browser_description=(
-                    self.tool_server.get_tool_description("browser")
-                    if enable_browser and self.tool_server is not None
-                    else None
-                ),
-                python_description=(
-                    self.tool_server.get_tool_description("python")
-                    if enable_code_interpreter and self.tool_server is not None
-                    else None
-                ),
-                container_description=(
-                    self.tool_server.get_tool_description("container")
-                    if enable_container and self.tool_server is not None
-                    else None
-                ),
-                instructions=request.instructions,
-                with_custom_tools=with_custom_tools,
+            sys_msg = self._construct_harmony_system_input_message(
+                request, with_custom_tools, tool_types
             )
             messages.append(sys_msg)
             if with_custom_tools:
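The extracted builder repeats one gating pattern three times: a built-in tool's description is forwarded to get_system_message only when the request names the tool and the tool server actually exposes it. A condensed standalone sketch of that pattern (the ToolServer class and registry contents here are illustrative):

    class ToolServer:
        # Illustrative stand-in for the tool server used above.
        def __init__(self, tools: dict[str, str]):
            self._tools = tools

        def has_tool(self, name: str) -> bool:
            return name in self._tools

        def get_tool_description(self, name: str) -> str | None:
            return self._tools.get(name)


    # (requested tool type, server-side tool name) pairs, as in the diff.
    TOOL_GATES = [
        ("web_search_preview", "browser"),
        ("code_interpreter", "python"),
        ("container", "container"),
    ]


    def gated_descriptions(
        tool_server: ToolServer | None, tool_types: list[str]
    ) -> dict[str, str | None]:
        # Forward a description only when the request asks for the tool
        # AND the server exposes it; otherwise None flows downstream.
        return {
            name: (
                tool_server.get_tool_description(name)
                if requested in tool_types
                and tool_server is not None
                and tool_server.has_tool(name)
                else None
            )
            for requested, name in TOOL_GATES
        }


    server = ToolServer({"browser": "Web search", "python": "Code runner"})
    print(gated_descriptions(server, ["web_search_preview"]))
    # {'browser': 'Web search', 'python': None, 'container': None}

Collapsing the three flags into a data table avoids the repeated None checks; the diff keeps them as explicit booleans, which stays closer to the original code.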