mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 08:04:58 +08:00
[gpt-oss] Add ResponseReasoningPartAddedEvent, ResponseReasoningPartDoneEvent for streaming (#24938)
Signed-off-by: Andrew Xia <axia@meta.com>
This commit is contained in:
parent
9d1c50a5ac
commit
6d8246aaff
@ -287,6 +287,57 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
|
||||
assert response3.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_types(client: OpenAI, model_name: str):
    """Stream a response and verify that streaming events pair up correctly.

    Every closing event ("done"/"completed") must match the event that
    opened its block ("added"/"created"/"delta"), and no block may remain
    open once the stream has finished.
    """
    prompts = [
        "tell me a story about a cat in 20 words",
    ]

    # Maps each closing event type to the event type that opens its block,
    # so every "done" type should have a corresponding "start" type and
    # every open block should be closed by the end of the stream.
    closer_to_opener = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.content_part.done": "response.content_part.added",
        "response.output_text.done": "response.output_text.delta",
        "response.web_search_call.done": "response.web_search_call.added",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
    }

    for prompt in prompts:
        stream = await client.responses.create(
            model=model_name,
            input=prompt,
            reasoning={"effort": "low"},
            tools=[],
            stream=True,
            background=False,
        )

        # Treat the event sequence like balanced parentheses: openers are
        # pushed, closers must match the top of the stack.
        open_blocks: list[str] = []
        async for event in stream:
            etype = event.type
            if etype == "response.created":
                open_blocks.append(etype)
            elif etype == "response.completed":
                assert open_blocks[-1] == closer_to_opener[etype]
                open_blocks.pop()
            if etype.endswith("added"):
                open_blocks.append(etype)
            elif etype.endswith("delta"):
                # Consecutive deltas of the same kind collapse into the
                # single entry already on top of the stack.
                if open_blocks[-1] != etype:
                    open_blocks.append(etype)
            elif etype.endswith("done"):
                assert open_blocks[-1] == closer_to_opener[etype]
                open_blocks.pop()

        assert not open_blocks
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("background", [True, False])
|
||||
@ -343,7 +394,10 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
|
||||
assert event.item_id == current_item_id
|
||||
|
||||
# verify content_index_id is correct
|
||||
if event.type == "response.content_part.added":
|
||||
if event.type in [
|
||||
"response.content_part.added",
|
||||
"response.reasoning_part.added"
|
||||
]:
|
||||
assert event.content_index != current_content_index
|
||||
current_content_index = event.content_index
|
||||
elif event.type in [
|
||||
|
||||
@ -31,6 +31,8 @@ from openai.types.responses import (
|
||||
ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
|
||||
ResponseStatus, ResponseWebSearchCallCompletedEvent,
|
||||
ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent)
|
||||
from openai.types.responses.response_reasoning_item import (
|
||||
Content as ResponseReasoningTextContent)
|
||||
|
||||
# Backward compatibility for OpenAI client versions
|
||||
try: # For older openai versions (< 1.100.0)
|
||||
@ -260,26 +262,6 @@ ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
|
||||
ResponseReasoningItem,
|
||||
ResponseFunctionToolCall]
|
||||
|
||||
# Union of every server-sent event type that the Responses streaming API
# can yield; used to type the SSE generator in the serving layer.
StreamingResponsesResponse: TypeAlias = Union[
    ResponseCreatedEvent,
    ResponseInProgressEvent,
    ResponseCompletedEvent,
    ResponseOutputItemAddedEvent,
    ResponseOutputItemDoneEvent,
    ResponseContentPartAddedEvent,
    ResponseContentPartDoneEvent,
    ResponseReasoningTextDeltaEvent,
    ResponseReasoningTextDoneEvent,
    ResponseCodeInterpreterCallInProgressEvent,
    ResponseCodeInterpreterCallCodeDeltaEvent,
    ResponseWebSearchCallInProgressEvent,
    ResponseWebSearchCallSearchingEvent,
    ResponseWebSearchCallCompletedEvent,
    ResponseCodeInterpreterCallCodeDoneEvent,
    ResponseCodeInterpreterCallInterpretingEvent,
    ResponseCodeInterpreterCallCompletedEvent,
]
|
||||
|
||||
|
||||
class ResponsesRequest(OpenAIBaseModel):
|
||||
# Ordered by official OpenAI API documentation
|
||||
@ -1916,6 +1898,72 @@ class ResponsesResponse(OpenAIBaseModel):
|
||||
)
|
||||
|
||||
|
||||
# TODO: this code can be removed once
# https://github.com/openai/openai-python/issues/2634 has been resolved
class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
    """Streaming event emitted when a reasoning content part is completed.

    Local stand-in for an event class that openai-python does not yet ship
    (see the TODO above); removable once the upstream issue is resolved.
    """

    content_index: int
    """The index of the content part that is done."""

    item_id: str
    """The ID of the output item that the content part was added to."""

    output_index: int
    """The index of the output item that the content part was added to."""

    part: ResponseReasoningTextContent
    """The content part that is done."""

    sequence_number: int
    """The sequence number of this event."""

    type: Literal["response.reasoning_part.done"]
    """The type of the event. Always `response.reasoning_part.done`."""
|
||||
|
||||
|
||||
# TODO: this code can be removed once
# https://github.com/openai/openai-python/issues/2634 has been resolved
class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
    """Streaming event emitted when a new reasoning content part is opened.

    Local stand-in for an event class that openai-python does not yet ship
    (see the TODO above); removable once the upstream issue is resolved.
    """

    content_index: int
    """The index of the content part that was added."""

    item_id: str
    """The ID of the output item that the content part was added to."""

    output_index: int
    """The index of the output item that the content part was added to."""

    part: ResponseReasoningTextContent
    """The content part that was added."""

    sequence_number: int
    """The sequence number of this event."""

    type: Literal["response.reasoning_part.added"]
    """The type of the event. Always `response.reasoning_part.added`."""
|
||||
|
||||
|
||||
# Union of every server-sent event type that the Responses streaming API
# can yield, including the local ResponseReasoningPart* stand-ins defined
# above; used to type the SSE generator in the serving layer.
StreamingResponsesResponse: TypeAlias = Union[
    ResponseCreatedEvent,
    ResponseInProgressEvent,
    ResponseCompletedEvent,
    ResponseOutputItemAddedEvent,
    ResponseOutputItemDoneEvent,
    ResponseContentPartAddedEvent,
    ResponseContentPartDoneEvent,
    ResponseReasoningTextDeltaEvent,
    ResponseReasoningTextDoneEvent,
    ResponseReasoningPartAddedEvent,
    ResponseReasoningPartDoneEvent,
    ResponseCodeInterpreterCallInProgressEvent,
    ResponseCodeInterpreterCallCodeDeltaEvent,
    ResponseWebSearchCallInProgressEvent,
    ResponseWebSearchCallSearchingEvent,
    ResponseWebSearchCallCompletedEvent,
    ResponseCodeInterpreterCallCodeDoneEvent,
    ResponseCodeInterpreterCallInterpretingEvent,
    ResponseCodeInterpreterCallCompletedEvent,
]
|
||||
|
||||
# Request-body types that may appear in a batch API input file entry.
BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
                              ScoreRequest, RerankRequest]
|
||||
|
||||
|
||||
@ -58,6 +58,8 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
|
||||
InputTokensDetails,
|
||||
OutputTokensDetails,
|
||||
RequestResponseMetadata,
|
||||
ResponseReasoningPartAddedEvent,
|
||||
ResponseReasoningPartDoneEvent,
|
||||
ResponsesRequest,
|
||||
ResponsesResponse, ResponseUsage,
|
||||
StreamingResponsesResponse)
|
||||
@ -1280,14 +1282,13 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
# Deal with tool call here
|
||||
pass
|
||||
elif previous_item.channel == "analysis":
|
||||
content = ResponseReasoningTextContent(
|
||||
text=previous_item.content[0].text,
|
||||
type="reasoning_text",
|
||||
)
|
||||
reasoning_item = ResponseReasoningItem(
|
||||
type="reasoning",
|
||||
content=[
|
||||
ResponseReasoningTextContent(
|
||||
text=previous_item.content[0].text,
|
||||
type="reasoning_text",
|
||||
),
|
||||
],
|
||||
content=[content],
|
||||
status="completed",
|
||||
id=current_item_id,
|
||||
summary=[],
|
||||
@ -1301,6 +1302,15 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
content_index=current_content_index,
|
||||
text=previous_item.content[0].text,
|
||||
))
|
||||
yield _increment_sequence_number_and_return(
|
||||
ResponseReasoningPartDoneEvent(
|
||||
type="response.reasoning_part.done",
|
||||
sequence_number=-1,
|
||||
item_id=current_item_id,
|
||||
output_index=current_output_index,
|
||||
content_index=current_content_index,
|
||||
part=content,
|
||||
))
|
||||
yield _increment_sequence_number_and_return(
|
||||
ResponseOutputItemDoneEvent(
|
||||
type="response.output_item.done",
|
||||
@ -1412,17 +1422,15 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
))
|
||||
current_content_index += 1
|
||||
yield _increment_sequence_number_and_return(
|
||||
ResponseContentPartAddedEvent(
|
||||
type="response.content_part.added",
|
||||
ResponseReasoningPartAddedEvent(
|
||||
type="response.reasoning_part.added",
|
||||
sequence_number=-1,
|
||||
output_index=current_output_index,
|
||||
item_id=current_item_id,
|
||||
content_index=current_content_index,
|
||||
part=ResponseOutputText(
|
||||
type="output_text",
|
||||
part=ResponseReasoningTextContent(
|
||||
text="",
|
||||
annotations=[],
|
||||
logprobs=[],
|
||||
type="reasoning_text",
|
||||
),
|
||||
))
|
||||
yield _increment_sequence_number_and_return(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user