From 5db1870bb99f9788cc232a9473d42dbab081a16a Mon Sep 17 00:00:00 2001
From: Andrew Xia <axia@mit.edu>
Date: Tue, 30 Sep 2025 15:47:07 -0700
Subject: [PATCH] [gpt-oss] use vLLM instead of openai types for streaming
 (#25186)

Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
---
 .../openai/test_response_api_with_harmony.py  |  8 +++
 vllm/entrypoints/openai/protocol.py           | 50 ++++++++++++++-----
 vllm/entrypoints/openai/serving_responses.py  | 23 +++++----
 3 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index c28970afc731..b882a2f9326e 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -379,6 +379,14 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
             if event.type == "response.created":
                 resp_id = event.response.id
 
+            # test vllm custom types are in the response
+            if event.type in [
+                    "response.completed", "response.in_progress",
+                    "response.created"
+            ]:
+                assert 'input_messages' in event.response.model_extra
+                assert 'output_messages' in event.response.model_extra
+
             if current_event_mode != event.type:
                 current_event_mode = event.type
                 print(f"\n[{event.type}] ", end="", flush=True)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 8829fa4886f6..9d51372887c2 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -17,20 +17,32 @@ from openai.types.chat.chat_completion_audio import (
     ChatCompletionAudio as OpenAIChatCompletionAudio)
 from openai.types.chat.chat_completion_message import (
     Annotation as OpenAIAnnotation)
-# yapf: enable
 from openai.types.responses import (
     ResponseCodeInterpreterCallCodeDeltaEvent,
     ResponseCodeInterpreterCallCodeDoneEvent,
     ResponseCodeInterpreterCallCompletedEvent,
     ResponseCodeInterpreterCallInProgressEvent,
-    ResponseCodeInterpreterCallInterpretingEvent, ResponseCompletedEvent,
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
-    ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent,
-    ResponseInputItemParam, ResponseOutputItem, ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent, ResponsePrompt, ResponseReasoningItem,
-    ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
-    ResponseStatus, ResponseWebSearchCallCompletedEvent,
-    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent)
+    ResponseCodeInterpreterCallInterpretingEvent)
+from openai.types.responses import (
+    ResponseCompletedEvent as OpenAIResponseCompletedEvent)
+from openai.types.responses import (ResponseContentPartAddedEvent,
+                                    ResponseContentPartDoneEvent)
+from openai.types.responses import (
+    ResponseCreatedEvent as OpenAIResponseCreatedEvent)
+from openai.types.responses import ResponseFunctionToolCall
+from openai.types.responses import (
+    ResponseInProgressEvent as OpenAIResponseInProgressEvent)
+from openai.types.responses import (ResponseInputItemParam, ResponseOutputItem,
+                                    ResponseOutputItemAddedEvent,
+                                    ResponseOutputItemDoneEvent,
+                                    ResponsePrompt, ResponseReasoningItem,
+                                    ResponseReasoningTextDeltaEvent,
+                                    ResponseReasoningTextDoneEvent,
+                                    ResponseStatus,
+                                    ResponseWebSearchCallCompletedEvent,
+                                    ResponseWebSearchCallInProgressEvent,
+                                    ResponseWebSearchCallSearchingEvent)
+# yapf: enable
 from openai.types.responses.response_reasoning_item import (
     Content as ResponseReasoningTextContent)
 
@@ -2077,10 +2089,24 @@ class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
     """The type of the event. Always `response.reasoning_part.added`."""
 
 
+# vLLM Streaming Events
+# Note: we override the response type with the vLLM ResponsesResponse type
+class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
 StreamingResponsesResponse: TypeAlias = Union[
-    ResponseCreatedEvent,
-    ResponseInProgressEvent,
-    ResponseCompletedEvent,
+    "ResponseCreatedEvent",
+    "ResponseInProgressEvent",
+    "ResponseCompletedEvent",
     ResponseOutputItemAddedEvent,
     ResponseOutputItemDoneEvent,
     ResponseContentPartAddedEvent,
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index c70baba88d43..eaeab3360c60 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -22,16 +22,16 @@ from openai.types.responses import (
     ResponseCodeInterpreterCallCompletedEvent,
     ResponseCodeInterpreterCallInProgressEvent,
     ResponseCodeInterpreterCallInterpretingEvent,
-    ResponseCodeInterpreterToolCallParam, ResponseCompletedEvent,
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
-    ResponseCreatedEvent, ResponseFunctionToolCall, ResponseFunctionWebSearch,
-    ResponseInProgressEvent, ResponseOutputItem, ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent, ResponseOutputMessage, ResponseOutputText,
-    ResponseReasoningItem, ResponseReasoningTextDeltaEvent,
-    ResponseReasoningTextDoneEvent, ResponseStatus, ResponseTextDeltaEvent,
-    ResponseTextDoneEvent, ResponseWebSearchCallCompletedEvent,
-    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent,
-    response_function_web_search, response_text_delta_event)
+    ResponseCodeInterpreterToolCallParam, ResponseContentPartAddedEvent,
+    ResponseContentPartDoneEvent, ResponseFunctionToolCall,
+    ResponseFunctionWebSearch, ResponseOutputItem,
+    ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent,
+    ResponseOutputMessage, ResponseOutputText, ResponseReasoningItem,
+    ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
+    ResponseStatus, ResponseTextDeltaEvent, ResponseTextDoneEvent,
+    ResponseWebSearchCallCompletedEvent, ResponseWebSearchCallInProgressEvent,
+    ResponseWebSearchCallSearchingEvent, response_function_web_search,
+    response_text_delta_event)
 from openai.types.responses.response_output_text import (Logprob,
                                                          LogprobTopLogprob)
 # yapf: enable
@@ -58,6 +58,9 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
                                               InputTokensDetails,
                                               OutputTokensDetails,
                                               RequestResponseMetadata,
+                                              ResponseCompletedEvent,
+                                              ResponseCreatedEvent,
+                                              ResponseInProgressEvent,
                                               ResponseReasoningPartAddedEvent,
                                               ResponseReasoningPartDoneEvent,
                                               ResponsesRequest,