[gpt-oss] use vLLM instead of openai types for streaming (#25186)

Signed-off-by: Andrew Xia <axia@meta.com> Signed-off-by: Andrew Xia <axia@fb.com> Co-authored-by: Andrew Xia <axia@fb.com>
2025-12-14 00:25:23 +08:00 · 2025-09-30 15:47:07 -07:00 · 2025-09-30 15:47:07 -07:00 · 5db1870bb9
commit 5db1870bb9
parent 2ce26b9b5d
3 changed files with 59 additions and 22 deletions
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@ -379,6 +379,14 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
            if event.type == "response.created":
                resp_id = event.response.id
            # test vllm custom types are in the response
            if event.type in [
                    "response.completed", "response.in_progress",
                    "response.created"
            ]:
                assert 'input_messages' in event.response.model_extra
                assert 'output_messages' in event.response.model_extra
            if current_event_mode != event.type:
                current_event_mode = event.type
                print(f"\n[{event.type}] ", end="", flush=True)
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@ -17,20 +17,32 @@ from openai.types.chat.chat_completion_audio import (
    ChatCompletionAudio as OpenAIChatCompletionAudio)
 from openai.types.chat.chat_completion_message import (
    Annotation as OpenAIAnnotation)
 # yapf: enable
 from openai.types.responses import (
    ResponseCodeInterpreterCallCodeDeltaEvent,
    ResponseCodeInterpreterCallCodeDoneEvent,
    ResponseCodeInterpreterCallCompletedEvent,
    ResponseCodeInterpreterCallInProgressEvent,
-    ResponseCodeInterpreterCallInterpretingEvent, ResponseCompletedEvent,
+    ResponseCodeInterpreterCallInterpretingEvent)
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
+from openai.types.responses import (
-    ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent,
+    ResponseCompletedEvent as OpenAIResponseCompletedEvent)
-    ResponseInputItemParam, ResponseOutputItem, ResponseOutputItemAddedEvent,
+from openai.types.responses import (ResponseContentPartAddedEvent,
-    ResponseOutputItemDoneEvent, ResponsePrompt, ResponseReasoningItem,
+                                    ResponseContentPartDoneEvent)
-    ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
+from openai.types.responses import (
-    ResponseStatus, ResponseWebSearchCallCompletedEvent,
+    ResponseCreatedEvent as OpenAIResponseCreatedEvent)
-    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent)
+from openai.types.responses import ResponseFunctionToolCall
 from openai.types.responses import (
    ResponseInProgressEvent as OpenAIResponseInProgressEvent)
 from openai.types.responses import (ResponseInputItemParam, ResponseOutputItem,
                                    ResponseOutputItemAddedEvent,
                                    ResponseOutputItemDoneEvent,
                                    ResponsePrompt, ResponseReasoningItem,
                                    ResponseReasoningTextDeltaEvent,
                                    ResponseReasoningTextDoneEvent,
                                    ResponseStatus,
                                    ResponseWebSearchCallCompletedEvent,
                                    ResponseWebSearchCallInProgressEvent,
                                    ResponseWebSearchCallSearchingEvent)
 # yapf: enable
 from openai.types.responses.response_reasoning_item import (
    Content as ResponseReasoningTextContent)
@ -2077,10 +2089,24 @@ class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
    """The type of the event. Always `response.reasoning_part.added`."""
 # vLLM Streaming Events
 # Note: we override the response type with the vLLM ResponsesResponse type
 # vLLM variant of the OpenAI SDK's `ResponseCompletedEvent` (imported in this
 # module as `OpenAIResponseCompletedEvent`). The only thing it changes is the
 # declared type of the `response` field; everything else is inherited.
 class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
    # Re-declared as vLLM's `ResponsesResponse` instead of the parent's type.
    # The `type: ignore[override]` silences the type checker's complaint about
    # redefining an inherited field with a different type.
    # NOTE(review): presumably this is what lets vLLM-only fields such as
    # `input_messages` / `output_messages` appear on streamed events -- the
    # streaming test asserts on them; confirm against `ResponsesResponse`.
    response: ResponsesResponse  # type: ignore[override]
 # vLLM variant of the OpenAI SDK's `ResponseCreatedEvent` (imported in this
 # module as `OpenAIResponseCreatedEvent`). The only thing it changes is the
 # declared type of the `response` field; everything else is inherited.
 class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
    # Re-declared as vLLM's `ResponsesResponse` instead of the parent's type.
    # The `type: ignore[override]` silences the type checker's complaint about
    # redefining an inherited field with a different type.
    # NOTE(review): presumably done so vLLM-only response fields are carried
    # on this event as well; confirm against `ResponsesResponse`.
    response: ResponsesResponse  # type: ignore[override]
 # vLLM variant of the OpenAI SDK's `ResponseInProgressEvent` (imported in this
 # module as `OpenAIResponseInProgressEvent`). The only thing it changes is the
 # declared type of the `response` field; everything else is inherited.
 class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
    # Re-declared as vLLM's `ResponsesResponse` instead of the parent's type.
    # The `type: ignore[override]` silences the type checker's complaint about
    # redefining an inherited field with a different type.
    # NOTE(review): presumably done so vLLM-only response fields are carried
    # on this event as well; confirm against `ResponsesResponse`.
    response: ResponsesResponse  # type: ignore[override]
 StreamingResponsesResponse: TypeAlias = Union[
-    ResponseCreatedEvent,
+    "ResponseCreatedEvent",
-    ResponseInProgressEvent,
+    "ResponseInProgressEvent",
-    ResponseCompletedEvent,
+    "ResponseCompletedEvent",
    ResponseOutputItemAddedEvent,
    ResponseOutputItemDoneEvent,
    ResponseContentPartAddedEvent,
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@ -22,16 +22,16 @@ from openai.types.responses import (
    ResponseCodeInterpreterCallCompletedEvent,
    ResponseCodeInterpreterCallInProgressEvent,
    ResponseCodeInterpreterCallInterpretingEvent,
-    ResponseCodeInterpreterToolCallParam, ResponseCompletedEvent,
+    ResponseCodeInterpreterToolCallParam, ResponseContentPartAddedEvent,
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
+    ResponseContentPartDoneEvent, ResponseFunctionToolCall,
-    ResponseCreatedEvent, ResponseFunctionToolCall, ResponseFunctionWebSearch,
+    ResponseFunctionWebSearch, ResponseOutputItem,
-    ResponseInProgressEvent, ResponseOutputItem, ResponseOutputItemAddedEvent,
+    ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent,
-    ResponseOutputItemDoneEvent, ResponseOutputMessage, ResponseOutputText,
+    ResponseOutputMessage, ResponseOutputText, ResponseReasoningItem,
-    ResponseReasoningItem, ResponseReasoningTextDeltaEvent,
+    ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
-    ResponseReasoningTextDoneEvent, ResponseStatus, ResponseTextDeltaEvent,
+    ResponseStatus, ResponseTextDeltaEvent, ResponseTextDoneEvent,
-    ResponseTextDoneEvent, ResponseWebSearchCallCompletedEvent,
+    ResponseWebSearchCallCompletedEvent, ResponseWebSearchCallInProgressEvent,
-    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent,
+    ResponseWebSearchCallSearchingEvent, response_function_web_search,
-    response_function_web_search, response_text_delta_event)
+    response_text_delta_event)
 from openai.types.responses.response_output_text import (Logprob,
                                                         LogprobTopLogprob)
 # yapf: enable
@ -58,6 +58,9 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
                                              InputTokensDetails,
                                              OutputTokensDetails,
                                              RequestResponseMetadata,
                                              ResponseCompletedEvent,
                                              ResponseCreatedEvent,
                                              ResponseInProgressEvent,
                                              ResponseReasoningPartAddedEvent,
                                              ResponseReasoningPartDoneEvent,
                                              ResponsesRequest,