[gpt-oss] use vLLM instead of openai types for streaming (#25186)

Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
Author: Andrew Xia
Date: 2025-09-30 15:47:07 -07:00
Commit: 5db1870bb9 (parent 2ce26b9b5d), committed by GitHub

3 changed files with 59 additions and 22 deletions


@@ -379,6 +379,14 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
         if event.type == "response.created":
             resp_id = event.response.id
 
+        # test vllm custom types are in the response
+        if event.type in [
+                "response.completed", "response.in_progress",
+                "response.created"
+        ]:
+            assert 'input_messages' in event.response.model_extra
+            assert 'output_messages' in event.response.model_extra
+
         if current_event_mode != event.type:
             current_event_mode = event.type
             print(f"\n[{event.type}] ", end="", flush=True)


@@ -17,20 +17,32 @@ from openai.types.chat.chat_completion_audio import (
     ChatCompletionAudio as OpenAIChatCompletionAudio)
 from openai.types.chat.chat_completion_message import (
     Annotation as OpenAIAnnotation)
 # yapf: enable
 from openai.types.responses import (
     ResponseCodeInterpreterCallCodeDeltaEvent,
     ResponseCodeInterpreterCallCodeDoneEvent,
     ResponseCodeInterpreterCallCompletedEvent,
     ResponseCodeInterpreterCallInProgressEvent,
-    ResponseCodeInterpreterCallInterpretingEvent, ResponseCompletedEvent,
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
-    ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent,
-    ResponseInputItemParam, ResponseOutputItem, ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent, ResponsePrompt, ResponseReasoningItem,
-    ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
-    ResponseStatus, ResponseWebSearchCallCompletedEvent,
-    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent)
+    ResponseCodeInterpreterCallInterpretingEvent)
+from openai.types.responses import (
+    ResponseCompletedEvent as OpenAIResponseCompletedEvent)
+from openai.types.responses import (ResponseContentPartAddedEvent,
+                                    ResponseContentPartDoneEvent)
+from openai.types.responses import (
+    ResponseCreatedEvent as OpenAIResponseCreatedEvent)
+from openai.types.responses import ResponseFunctionToolCall
+from openai.types.responses import (
+    ResponseInProgressEvent as OpenAIResponseInProgressEvent)
+from openai.types.responses import (ResponseInputItemParam, ResponseOutputItem,
+                                    ResponseOutputItemAddedEvent,
+                                    ResponseOutputItemDoneEvent,
+                                    ResponsePrompt, ResponseReasoningItem,
+                                    ResponseReasoningTextDeltaEvent,
+                                    ResponseReasoningTextDoneEvent,
+                                    ResponseStatus,
+                                    ResponseWebSearchCallCompletedEvent,
+                                    ResponseWebSearchCallInProgressEvent,
+                                    ResponseWebSearchCallSearchingEvent)
 # yapf: enable
 from openai.types.responses.response_reasoning_item import (
     Content as ResponseReasoningTextContent)
@@ -2077,10 +2089,24 @@ class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
     """The type of the event. Always `response.reasoning_part.added`."""
 
 
+# vLLM Streaming Events
+# Note: we override the response type with the vLLM ResponsesResponse type
+class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
 StreamingResponsesResponse: TypeAlias = Union[
-    ResponseCreatedEvent,
-    ResponseInProgressEvent,
-    ResponseCompletedEvent,
+    "ResponseCreatedEvent",
+    "ResponseInProgressEvent",
+    "ResponseCompletedEvent",
     ResponseOutputItemAddedEvent,
     ResponseOutputItemDoneEvent,
     ResponseContentPartAddedEvent,
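The pattern in this hunk narrows a pydantic field in a subclass: the OpenAI base classes declare `response` as OpenAI's `Response` model, and the vLLM subclasses redeclare it as `ResponsesResponse`. mypy flags such a redeclaration as an unsafe override, hence the `type: ignore[override]`. The quoted entries in the `Union` are forward references that resolve to the vLLM subclasses defined just above rather than the same-named OpenAI imports. A minimal self-contained sketch of the mechanics (the `Base*`/`VLLM*` names are illustrative stand-ins, not vLLM's):

```python
from typing import TypeAlias, Union

from pydantic import BaseModel


class BaseResponse(BaseModel):
    id: str


class BaseCreatedEvent(BaseModel):
    type: str = "response.created"
    response: BaseResponse


class VLLMResponse(BaseResponse):
    input_messages: list[str] = []


class CreatedEvent(BaseCreatedEvent):
    # narrowing the field type; mypy treats this as an unsafe override
    response: VLLMResponse  # type: ignore[override]


# the string entry is a ForwardRef that binds to the subclass above
StreamEvent: TypeAlias = Union["CreatedEvent", BaseCreatedEvent]

event = CreatedEvent(response=VLLMResponse(id="r1", input_messages=["hi"]))
assert isinstance(event.response, VLLMResponse)
```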


@@ -22,16 +22,16 @@ from openai.types.responses import (
     ResponseCodeInterpreterCallCompletedEvent,
     ResponseCodeInterpreterCallInProgressEvent,
     ResponseCodeInterpreterCallInterpretingEvent,
-    ResponseCodeInterpreterToolCallParam, ResponseCompletedEvent,
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
-    ResponseCreatedEvent, ResponseFunctionToolCall, ResponseFunctionWebSearch,
-    ResponseInProgressEvent, ResponseOutputItem, ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent, ResponseOutputMessage, ResponseOutputText,
-    ResponseReasoningItem, ResponseReasoningTextDeltaEvent,
-    ResponseReasoningTextDoneEvent, ResponseStatus, ResponseTextDeltaEvent,
-    ResponseTextDoneEvent, ResponseWebSearchCallCompletedEvent,
-    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent,
-    response_function_web_search, response_text_delta_event)
+    ResponseCodeInterpreterToolCallParam, ResponseContentPartAddedEvent,
+    ResponseContentPartDoneEvent, ResponseFunctionToolCall,
+    ResponseFunctionWebSearch, ResponseOutputItem,
+    ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent,
+    ResponseOutputMessage, ResponseOutputText, ResponseReasoningItem,
+    ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
+    ResponseStatus, ResponseTextDeltaEvent, ResponseTextDoneEvent,
+    ResponseWebSearchCallCompletedEvent, ResponseWebSearchCallInProgressEvent,
+    ResponseWebSearchCallSearchingEvent, response_function_web_search,
+    response_text_delta_event)
 from openai.types.responses.response_output_text import (Logprob,
                                                          LogprobTopLogprob)
 # yapf: enable
@@ -58,6 +58,9 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
                                               InputTokensDetails,
                                               OutputTokensDetails,
                                               RequestResponseMetadata,
+                                              ResponseCompletedEvent,
+                                              ResponseCreatedEvent,
+                                              ResponseInProgressEvent,
                                               ResponseReasoningPartAddedEvent,
                                               ResponseReasoningPartDoneEvent,
                                               ResponsesRequest,
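On the serving side, these imports mean the streaming handler now constructs the vLLM event classes, so the JSON it emits is serialized from `ResponsesResponse` and carries the vLLM-only fields end to end. A rough sketch of why that falls out of pydantic serialization (hypothetical stand-in classes, not vLLM's actual models):

```python
from pydantic import BaseModel


class VLLMResponse(BaseModel):
    id: str
    # vLLM-specific additions; an OpenAI SDK client will surface these
    # in model_extra since they are not part of the OpenAI schema
    input_messages: list[str] = []
    output_messages: list[str] = []


class ResponseCreatedEvent(BaseModel):
    type: str = "response.created"
    response: VLLMResponse


event = ResponseCreatedEvent(response=VLLMResponse(id="resp_1"))
# the wire payload is built from the event's JSON dump, so the added
# fields ride along to the client
print(event.model_dump_json())
```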