Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-22 19:05:36 +08:00)
[Frontend] Responses API messages out, just harmony for now (#24985)
Signed-off-by: Alec Solder <alecs@fb.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent 9da51c77a9
commit 8da7b98366
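
This change adds an opt-in enable_response_messages flag to the Responses API: when set, the response also carries the raw harmony messages that went into the model (input_messages) and that the model produced (output_messages). For now this is supported only for gpt-oss models served through the harmony path, and only for non-streaming, non-background requests. Below is a minimal client-side sketch of exercising the flag, assuming a vLLM server with a gpt-oss model is already running; the base URL, API key, and model name are illustrative and not part of this commit.

# Sketch only: base_url, api_key, and model name are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.responses.create(
    model="openai/gpt-oss-20b",
    input="What is the capital of South Korea?",
    # vLLM-specific flag, passed via extra_body because it is not part of
    # the upstream OpenAI SDK request schema.
    extra_body={"enable_response_messages": True},
)

# Populated only because the flag was set on the request.
print(response.input_messages)
print(response.output_messages)
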
@@ -744,3 +744,18 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
     assert response_2 is not None
     assert response_2.status == "completed"
     assert response_2.output_text is not None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_output_messages_enabled(client: OpenAI, model_name: str,
+                                        server):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the capital of South Korea?",
+        extra_body={"enable_response_messages": True})
+
+    assert response is not None
+    assert response.status == "completed"
+    assert len(response.input_messages) > 0
+    assert len(response.output_messages) > 0
@@ -328,6 +328,13 @@ class ResponsesRequest(OpenAIBaseModel):
             "access by 3rd parties, and long enough to be "
             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
             "to 256 bit). Not supported by vLLM engine V0."))
+
+    enable_response_messages: bool = Field(
+        default=False,
+        description=(
+            "Dictates whether or not to return messages as part of the "
+            "response object. Currently only supported for non-streaming "
+            "non-background and gpt-oss only. "))
     # --8<-- [end:responses-extra-params]
 
     _DEFAULT_SAMPLING_PARAMS = {
@@ -1831,6 +1838,11 @@ class ResponsesResponse(OpenAIBaseModel):
     model: str
     object: Literal["response"] = "response"
     output: list[ResponseOutputItem]
+    # These are populated when enable_response_messages is set to True
+    # TODO: Currently an issue where content of harmony messages
+    # is not available when these are serialized. Metadata is available
+    input_messages: Optional[list[ChatCompletionMessageParam]] = None
+    output_messages: Optional[list[ChatCompletionMessageParam]] = None
     parallel_tool_calls: bool
     temperature: float
     tool_choice: ToolChoice
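
Both new response fields default to None, so responses for callers that never set the flag are unchanged; only opted-in requests get them populated. A small, hedged sketch of consuming them defensively on the client side (the attribute access mirrors the test above; per the TODO in the model, full harmony message content may not survive serialization yet, so only light inspection is assumed here):

# Sketch only: guards against the fields being absent or None.
if getattr(response, "input_messages", None):
    for message in response.input_messages:
        # Per the TODO above, serialized harmony messages may currently
        # expose only metadata rather than full content.
        print(message)
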
@@ -1860,6 +1872,8 @@ class ResponsesResponse(OpenAIBaseModel):
         output: list[ResponseOutputItem],
         status: ResponseStatus,
         usage: Optional[ResponseUsage] = None,
+        input_messages: Optional[list[ChatCompletionMessageParam]] = None,
+        output_messages: Optional[list[ChatCompletionMessageParam]] = None,
     ) -> "ResponsesResponse":
 
         incomplete_details: Optional[IncompleteDetails] = None
@@ -1868,7 +1882,6 @@ class ResponsesResponse(OpenAIBaseModel):
         # TODO: implement the other reason for incomplete_details,
         # which is content_filter
         # incomplete_details = IncompleteDetails(reason='content_filter')
-
         return cls(
             id=request.request_id,
             created_at=created_time,
@@ -1877,6 +1890,8 @@ class ResponsesResponse(OpenAIBaseModel):
             metadata=request.metadata,
             model=model_name,
             output=output,
+            input_messages=input_messages,
+            output_messages=output_messages,
             parallel_tool_calls=request.parallel_tool_calls,
             temperature=sampling_params.temperature,
             tool_choice=request.tool_choice,
@@ -475,9 +475,14 @@ class OpenAIServingResponses(OpenAIServing):
         # "completed" is implemented as the "catch-all" for now.
         status: ResponseStatus = "completed"
 
+        input_messages = None
+        output_messages = None
         if self.use_harmony:
             assert isinstance(context, HarmonyContext)
             output = self._make_response_output_items_with_harmony(context)
+            if request.enable_response_messages:
+                input_messages = context.messages[:context.num_init_messages]
+                output_messages = context.messages[context.num_init_messages:]
             num_tool_output_tokens = context.num_tool_output_tokens
             if len(output) > 0:
                 if context.finish_reason == "length":
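
The harmony branch simply partitions the messages accumulated on the context: everything before num_init_messages is what the request started with, and everything after it is what was generated in this turn. A toy illustration of that slicing with stand-in dicts (the real code slices harmony Message objects held by HarmonyContext):

# Stand-in illustration of the slicing done in the harmony branch above.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},       # initial
    {"role": "user", "content": "What is the capital of South Korea?"},  # initial
    {"role": "assistant", "content": "Seoul."},                          # generated
]
num_init_messages = 2  # how many messages the request began with

input_messages = messages[:num_init_messages]   # what went into the model
output_messages = messages[num_init_messages:]  # what the model produced

assert len(input_messages) == 2
assert len(output_messages) == 1
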
@@ -496,6 +501,12 @@ class OpenAIServingResponses(OpenAIServing):
             output = self._make_response_output_items(request, final_output,
                                                       tokenizer)
 
+            # TODO: context for non-gptoss models doesn't use messages
+            # so we can't get them out yet
+            if request.enable_response_messages:
+                raise NotImplementedError(
+                    "enable_response_messages is currently"
+                    " only supported for gpt-oss")
             # Calculate usage.
             assert final_res.prompt_token_ids is not None
             num_tool_output_tokens = 0
@@ -519,6 +530,8 @@ class OpenAIServingResponses(OpenAIServing):
         response = ResponsesResponse.from_request(
             request,
             sampling_params,
+            input_messages=input_messages,
+            output_messages=output_messages,
             model_name=model_name,
             created_time=created_time,
             output=output,