[gpt-oss] Add ResponseReasoningPartAddedEvent, ResponseReasoningPartDoneEvent for streaming (#24938)

Signed-off-by: Andrew Xia <axia@meta.com>
2025-12-16 02:05:01 +08:00 · 2025-09-18 19:11:59 -07:00 · 2025-09-18 19:11:59 -07:00 · 6d8246aaff
commit 6d8246aaff
parent 9d1c50a5ac
3 changed files with 143 additions and 33 deletions
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@ -287,6 +287,57 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
    assert response3.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_streaming_types(client: OpenAI, model_name: str):
    prompts = [
        "tell me a story about a cat in 20 words",
    ]
    # this links the "done" type with the "start" type
    # so every "done" type should have a corresponding "start" type
    # and every open block should be closed by the end of the stream
    pairs_of_event_types = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.content_part.done": "response.content_part.added",
        "response.output_text.done": "response.output_text.delta",
        "response.web_search_call.done": "response.web_search_call.added",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
    }
    for prompt in prompts:
        response = await client.responses.create(
            model=model_name,
            input=prompt,
            reasoning={"effort": "low"},
            tools=[],
            stream=True,
            background=False,
        )
        stack_of_event_types = []
        async for event in response:
            if event.type == 'response.created':
                stack_of_event_types.append(event.type)
            elif event.type == 'response.completed':
                assert stack_of_event_types[-1] == pairs_of_event_types[
                    event.type]
                stack_of_event_types.pop()
            if event.type.endswith("added"):
                stack_of_event_types.append(event.type)
            elif event.type.endswith("delta"):
                if stack_of_event_types[-1] == event.type:
                    continue
                stack_of_event_types.append(event.type)
            elif event.type.endswith("done"):
                assert stack_of_event_types[-1] == pairs_of_event_types[
                    event.type]
                stack_of_event_types.pop()
        assert len(stack_of_event_types) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("background", [True, False])
@ -343,7 +394,10 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
                assert event.item_id == current_item_id
            # verify content_index_id is correct
-            if event.type == "response.content_part.added":
+            if event.type in [
                    "response.content_part.added",
                    "response.reasoning_part.added"
            ]:
                assert event.content_index != current_content_index
                current_content_index = event.content_index
            elif event.type in [
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@ -31,6 +31,8 @@ from openai.types.responses import (
    ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
    ResponseStatus, ResponseWebSearchCallCompletedEvent,
    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent)
 from openai.types.responses.response_reasoning_item import (
    Content as ResponseReasoningTextContent)
 # Backward compatibility for OpenAI client versions
 try:  # For older openai versions (< 1.100.0)
@ -260,26 +262,6 @@ ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
                                           ResponseReasoningItem,
                                           ResponseFunctionToolCall]
 StreamingResponsesResponse: TypeAlias = Union[
    ResponseCreatedEvent,
    ResponseInProgressEvent,
    ResponseCompletedEvent,
    ResponseOutputItemAddedEvent,
    ResponseOutputItemDoneEvent,
    ResponseContentPartAddedEvent,
    ResponseContentPartDoneEvent,
    ResponseReasoningTextDeltaEvent,
    ResponseReasoningTextDoneEvent,
    ResponseCodeInterpreterCallInProgressEvent,
    ResponseCodeInterpreterCallCodeDeltaEvent,
    ResponseWebSearchCallInProgressEvent,
    ResponseWebSearchCallSearchingEvent,
    ResponseWebSearchCallCompletedEvent,
    ResponseCodeInterpreterCallCodeDoneEvent,
    ResponseCodeInterpreterCallInterpretingEvent,
    ResponseCodeInterpreterCallCompletedEvent,
 ]
 class ResponsesRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
@ -1916,6 +1898,72 @@ class ResponsesResponse(OpenAIBaseModel):
        )
 # TODO: this code can be removed once
 # https://github.com/openai/openai-python/issues/2634 has been resolved
 class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
    content_index: int
    """The index of the content part that is done."""
    item_id: str
    """The ID of the output item that the content part was added to."""
    output_index: int
    """The index of the output item that the content part was added to."""
    part: ResponseReasoningTextContent
    """The content part that is done."""
    sequence_number: int
    """The sequence number of this event."""
    type: Literal["response.reasoning_part.done"]
    """The type of the event. Always `response.reasoning_part.done`."""
 # TODO: this code can be removed once
 # https://github.com/openai/openai-python/issues/2634 has been resolved
 class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
    content_index: int
    """The index of the content part that is done."""
    item_id: str
    """The ID of the output item that the content part was added to."""
    output_index: int
    """The index of the output item that the content part was added to."""
    part: ResponseReasoningTextContent
    """The content part that is done."""
    sequence_number: int
    """The sequence number of this event."""
    type: Literal["response.reasoning_part.added"]
    """The type of the event. Always `response.reasoning_part.added`."""
 StreamingResponsesResponse: TypeAlias = Union[
    ResponseCreatedEvent,
    ResponseInProgressEvent,
    ResponseCompletedEvent,
    ResponseOutputItemAddedEvent,
    ResponseOutputItemDoneEvent,
    ResponseContentPartAddedEvent,
    ResponseContentPartDoneEvent,
    ResponseReasoningTextDeltaEvent,
    ResponseReasoningTextDoneEvent,
    ResponseReasoningPartAddedEvent,
    ResponseReasoningPartDoneEvent,
    ResponseCodeInterpreterCallInProgressEvent,
    ResponseCodeInterpreterCallCodeDeltaEvent,
    ResponseWebSearchCallInProgressEvent,
    ResponseWebSearchCallSearchingEvent,
    ResponseWebSearchCallCompletedEvent,
    ResponseCodeInterpreterCallCodeDoneEvent,
    ResponseCodeInterpreterCallInterpretingEvent,
    ResponseCodeInterpreterCallCompletedEvent,
 ]
 BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
                              ScoreRequest, RerankRequest]
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@ -58,6 +58,8 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
                                              InputTokensDetails,
                                              OutputTokensDetails,
                                              RequestResponseMetadata,
                                              ResponseReasoningPartAddedEvent,
                                              ResponseReasoningPartDoneEvent,
                                              ResponsesRequest,
                                              ResponsesResponse, ResponseUsage,
                                              StreamingResponsesResponse)
@ -1280,14 +1282,13 @@ class OpenAIServingResponses(OpenAIServing):
                        # Deal with tool call here
                        pass
                    elif previous_item.channel == "analysis":
                        content = ResponseReasoningTextContent(
                            text=previous_item.content[0].text,
                            type="reasoning_text",
                        )
                        reasoning_item = ResponseReasoningItem(
                            type="reasoning",
-                            content=[
+                            content=[content],
                                ResponseReasoningTextContent(
                                    text=previous_item.content[0].text,
                                    type="reasoning_text",
                                ),
                            ],
                            status="completed",
                            id=current_item_id,
                            summary=[],
@ -1301,6 +1302,15 @@ class OpenAIServingResponses(OpenAIServing):
                                content_index=current_content_index,
                                text=previous_item.content[0].text,
                            ))
                        yield _increment_sequence_number_and_return(
                            ResponseReasoningPartDoneEvent(
                                type="response.reasoning_part.done",
                                sequence_number=-1,
                                item_id=current_item_id,
                                output_index=current_output_index,
                                content_index=current_content_index,
                                part=content,
                            ))
                        yield _increment_sequence_number_and_return(
                            ResponseOutputItemDoneEvent(
                                type="response.output_item.done",
@ -1412,17 +1422,15 @@ class OpenAIServingResponses(OpenAIServing):
                            ))
                        current_content_index += 1
                        yield _increment_sequence_number_and_return(
-                            ResponseContentPartAddedEvent(
+                            ResponseReasoningPartAddedEvent(
-                                type="response.content_part.added",
+                                type="response.reasoning_part.added",
                                sequence_number=-1,
                                output_index=current_output_index,
                                item_id=current_item_id,
                                content_index=current_content_index,
-                                part=ResponseOutputText(
+                                part=ResponseReasoningTextContent(
                                    type="output_text",
                                    text="",
-                                    annotations=[],
+                                    type="reasoning_text",
                                    logprobs=[],
                                ),
                            ))
                    yield _increment_sequence_number_and_return(