Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-24 06:25:01 +08:00)
[responsesAPI][8] input/output messages for ResponsesParser (#30158)
Signed-off-by: Andrew Xia <axia@fb.com>
Signed-off-by: Andrew Xia <axia@meta.com>
Co-authored-by: Andrew Xia <axia@fb.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
parent e94384bbad
commit 0d0c929f23
@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
         model=model_name,
         input="What is 13 * 24? Use python to calculate the result.",
         tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
+        extra_body={"enable_response_messages": True},
         temperature=0.0,
     )
@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
     # make sure the correct math is in the final output
     assert response.output[3].type == "message"
     assert "312" in response.output[3].content[0].text
+
+    # test raw input_messages / output_messages
+    assert len(response.input_messages) == 1
+    assert len(response.output_messages) == 3
+    assert "312" in response.output_messages[2]["message"]
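The new assertions rely on the raw-message bookkeeping this commit adds to ParsableContext (see the hunk below). A minimal sketch of the shape those fields appear to take, inferred from this diff rather than from a published schema; the placeholder strings and token ids are illustrative only:

# Illustrative only: shape inferred from the assertions above and from
# ParsableContext.append_output() below; not an exact schema.
example_input_messages = [
    {"message": "<rendered prompt for turn 1>", "tokens": [1, 2, 3]},
]
example_output_messages = [
    {"message": "<turn 1 generation, e.g. the tool call>", "tokens": [4, 5]},
    {"message": "<turn 2 prompt, now including the tool result>", "tokens": [6, 7]},
    {"message": "<final answer containing 312>", "tokens": [8, 9]},
]
assert len(example_input_messages) == 1
assert len(example_output_messages) == 3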
@@ -297,12 +297,40 @@ class ParsableContext(ConversationContext):
         self.chat_template = chat_template
         self.chat_template_content_format = chat_template_content_format
 
+        self.input_messages: list[ResponseRawMessageAndToken] = []
+        self.output_messages: list[ResponseRawMessageAndToken] = []
+
     def append_output(self, output: RequestOutput) -> None:
         self.num_prompt_tokens = len(output.prompt_token_ids or [])
         self.num_cached_tokens = output.num_cached_tokens or 0
         self.num_output_tokens += len(output.outputs[0].token_ids or [])
         self.parser.process(output.outputs[0])
 
+        # only store if enable_response_messages is True, save memory
+        if self.request.enable_response_messages:
+            output_prompt = output.prompt or ""
+            output_prompt_token_ids = output.prompt_token_ids or []
+            if len(self.input_messages) == 0:
+                self.input_messages.append(
+                    ResponseRawMessageAndToken(
+                        message=output_prompt,
+                        tokens=output_prompt_token_ids,
+                    )
+                )
+            else:
+                self.output_messages.append(
+                    ResponseRawMessageAndToken(
+                        message=output_prompt,
+                        tokens=output_prompt_token_ids,
+                    )
+                )
+            self.output_messages.append(
+                ResponseRawMessageAndToken(
+                    message=output.outputs[0].text,
+                    tokens=output.outputs[0].token_ids,
+                )
+            )
+
     def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None:
         self.parser.response_messages.extend(output)
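ResponseRawMessageAndToken itself is not shown in this diff; judging from the keyword arguments used in append_output() and the dict-style access in the test, it pairs the detokenized text of one turn with its token ids. A hypothetical minimal definition, as a sketch only (the real class in vLLM's protocol module may carry additional fields):

# Hypothetical sketch of ResponseRawMessageAndToken, inferred from its call
# sites above; the actual definition lives elsewhere in vLLM and may differ.
from pydantic import BaseModel


class ResponseRawMessageAndToken(BaseModel):
    # Detokenized text of one prompt or one generation turn.
    message: str
    # Token ids corresponding to `message`.
    tokens: list[int]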
@@ -3,7 +3,11 @@
 import logging
 from collections.abc import Callable
 
-from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
+from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
+from openai.types.responses.response_function_tool_call_output_item import (
+    ResponseFunctionToolCallOutputItem,
+)
+from openai.types.responses.response_output_item import McpCall
 from openai.types.responses.response_output_message import ResponseOutputMessage
 from openai.types.responses.response_output_text import ResponseOutputText
 from openai.types.responses.response_reasoning_item import (
@@ -11,6 +15,7 @@ from openai.types.responses.response_reasoning_item import (
     ResponseReasoningItem,
 )
 
+from vllm.entrypoints.constants import MCP_PREFIX
 from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
 from vllm.outputs import CompletionOutput
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
@@ -111,6 +116,37 @@ class ResponsesParser:
 
         return self
 
+    def make_response_output_items_from_parsable_context(
+        self,
+    ) -> list[ResponseOutputItem]:
+        """Given a list of sentences, construct ResponseOutput Items."""
+        response_messages = self.response_messages[self.num_init_messages :]
+        output_messages: list[ResponseOutputItem] = []
+        for message in response_messages:
+            if not isinstance(message, ResponseFunctionToolCallOutputItem):
+                output_messages.append(message)
+            else:
+                if len(output_messages) == 0:
+                    raise ValueError(
+                        "Cannot have a FunctionToolCallOutput before FunctionToolCall."
+                    )
+                if isinstance(output_messages[-1], ResponseFunctionToolCall):
+                    mcp_message = McpCall(
+                        id=f"{MCP_PREFIX}{random_uuid()}",
+                        arguments=output_messages[-1].arguments,
+                        name=output_messages[-1].name,
+                        server_label=output_messages[
+                            -1
+                        ].name,  # TODO: store the server label
+                        type="mcp_call",
+                        status="completed",
+                        output=message.output,
+                        # TODO: support error output
+                    )
+                    output_messages[-1] = mcp_message
+
+        return output_messages
+
 
 def get_responses_parser_for_simple_context(
     *,
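The new parser method folds each ResponseFunctionToolCallOutputItem into the ResponseFunctionToolCall that precedes it, replacing the pair with a single completed McpCall. The standalone sketch below illustrates that pairing logic with simplified stand-in classes; it is not the openai-python types used above, only the control flow:

# Simplified stand-ins for the Responses API item types, to illustrate the
# tool-call / tool-output folding performed by the method above.
from dataclasses import dataclass


@dataclass
class ToolCall:  # stands in for ResponseFunctionToolCall
    name: str
    arguments: str


@dataclass
class ToolCallOutput:  # stands in for ResponseFunctionToolCallOutputItem
    output: str


@dataclass
class McpCallItem:  # stands in for the McpCall the method emits
    name: str
    arguments: str
    output: str
    status: str = "completed"


def fold(messages: list) -> list:
    out: list = []
    for message in messages:
        if not isinstance(message, ToolCallOutput):
            out.append(message)
            continue
        if not out:
            raise ValueError("tool output before any tool call")
        prev = out[-1]
        if isinstance(prev, ToolCall):
            # Collapse the call and its result into one completed item.
            out[-1] = McpCallItem(prev.name, prev.arguments, message.output)
    return out


# A call followed by its output collapses into a single McpCallItem.
print(fold([ToolCall("python", '{"code": "13 * 24"}'), ToolCallOutput("312")]))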
@@ -104,7 +104,6 @@ from vllm.entrypoints.responses_utils import (
     construct_input_messages,
     construct_tool_dicts,
     extract_tool_types,
-    make_response_output_items_from_parsable_context,
 )
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.inputs.data import TokensPrompt
@@ -658,17 +657,11 @@ class OpenAIServingResponses(OpenAIServing):
             else:
                 status = "incomplete"
         elif isinstance(context, ParsableContext):
-            response_messages = context.parser.response_messages[
-                context.parser.num_init_messages :
-            ]
-            output = make_response_output_items_from_parsable_context(response_messages)
+            output = context.parser.make_response_output_items_from_parsable_context()
 
-            # TODO: context for non-gptoss models doesn't use messages
-            # so we can't get them out yet
             if request.enable_response_messages:
-                raise NotImplementedError(
-                    "enable_response_messages is currently only supported for gpt-oss"
-                )
+                input_messages = context.input_messages
+                output_messages = context.output_messages
 
         # TODO: Calculate usage.
         # assert final_res.prompt_token_ids is not None
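With the raw messages now read straight off the ParsableContext, this path no longer raises NotImplementedError for enable_response_messages. A minimal client-side sketch mirroring the test earlier in this commit; it assumes a running vLLM OpenAI-compatible server, and the base_url and model name are placeholders:

# Client-side sketch for enable_response_messages; base_url/model are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.responses.create(
    model="openai/gpt-oss-20b",
    input="What is 13 * 24? Use python to calculate the result.",
    tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
    extra_body={"enable_response_messages": True},
    temperature=0.0,
)

# Raw prompt/generation text and token ids for each turn, as stored by
# ParsableContext.append_output() above.
for raw in response.output_messages:
    print(raw["message"], len(raw["tokens"]))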
@@ -16,7 +16,6 @@ from openai.types.responses.response import ToolChoice
 from openai.types.responses.response_function_tool_call_output_item import (
     ResponseFunctionToolCallOutputItem,
 )
-from openai.types.responses.response_output_item import McpCall
 from openai.types.responses.response_output_message import ResponseOutputMessage
 from openai.types.responses.response_reasoning_item import ResponseReasoningItem
 from openai.types.responses.tool import Tool
@@ -27,38 +26,6 @@ from vllm.entrypoints.openai.protocol import (
     ChatCompletionMessageParam,
     ResponseInputOutputItem,
 )
-from vllm.utils import random_uuid
-
-
-def make_response_output_items_from_parsable_context(
-    response_messages: list[ResponseInputOutputItem],
-) -> list[ResponseOutputItem]:
-    """Given a list of sentences, construct ResponseOutput Items."""
-    output_messages: list[ResponseOutputItem] = []
-    for message in response_messages:
-        if not isinstance(message, ResponseFunctionToolCallOutputItem):
-            output_messages.append(message)
-        else:
-            if len(output_messages) == 0:
-                raise ValueError(
-                    "Cannot have a FunctionToolCallOutput before FunctionToolCall."
-                )
-            if isinstance(output_messages[-1], ResponseFunctionToolCall):
-                mcp_message = McpCall(
-                    id=f"{MCP_PREFIX}{random_uuid()}",
-                    arguments=output_messages[-1].arguments,
-                    name=output_messages[-1].name,
-                    server_label=output_messages[
-                        -1
-                    ].name,  # TODO: store the server label
-                    type=f"{MCP_PREFIX}call",
-                    status="completed",
-                    output=message.output,
-                    # TODO: support error output
-                )
-                output_messages[-1] = mcp_message
-
-    return output_messages
 
 
 def construct_input_messages(