From 5db1870bb99f9788cc232a9473d42dbab081a16a Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Tue, 30 Sep 2025 15:47:07 -0700 Subject: [PATCH] [gpt-oss] use vLLM instead of openai types for streaming (#25186) Signed-off-by: Andrew Xia Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia --- .../openai/test_response_api_with_harmony.py | 8 +++ vllm/entrypoints/openai/protocol.py | 50 ++++++++++++++----- vllm/entrypoints/openai/serving_responses.py | 23 +++++---- 3 files changed, 59 insertions(+), 22 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index c28970afc731..b882a2f9326e 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -379,6 +379,14 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool): if event.type == "response.created": resp_id = event.response.id + # test vllm custom types are in the response + if event.type in [ + "response.completed", "response.in_progress", + "response.created" + ]: + assert 'input_messages' in event.response.model_extra + assert 'output_messages' in event.response.model_extra + if current_event_mode != event.type: current_event_mode = event.type print(f"\n[{event.type}] ", end="", flush=True) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 8829fa4886f6..9d51372887c2 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -17,20 +17,32 @@ from openai.types.chat.chat_completion_audio import ( ChatCompletionAudio as OpenAIChatCompletionAudio) from openai.types.chat.chat_completion_message import ( Annotation as OpenAIAnnotation) -# yapf: enable from openai.types.responses import ( ResponseCodeInterpreterCallCodeDeltaEvent, ResponseCodeInterpreterCallCodeDoneEvent, ResponseCodeInterpreterCallCompletedEvent, ResponseCodeInterpreterCallInProgressEvent, - ResponseCodeInterpreterCallInterpretingEvent, ResponseCompletedEvent, - ResponseContentPartAddedEvent, ResponseContentPartDoneEvent, - ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent, - ResponseInputItemParam, ResponseOutputItem, ResponseOutputItemAddedEvent, - ResponseOutputItemDoneEvent, ResponsePrompt, ResponseReasoningItem, - ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent, - ResponseStatus, ResponseWebSearchCallCompletedEvent, - ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent) + ResponseCodeInterpreterCallInterpretingEvent) +from openai.types.responses import ( + ResponseCompletedEvent as OpenAIResponseCompletedEvent) +from openai.types.responses import (ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent) +from openai.types.responses import ( + ResponseCreatedEvent as OpenAIResponseCreatedEvent) +from openai.types.responses import ResponseFunctionToolCall +from openai.types.responses import ( + ResponseInProgressEvent as OpenAIResponseInProgressEvent) +from openai.types.responses import (ResponseInputItemParam, ResponseOutputItem, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponsePrompt, ResponseReasoningItem, + ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent, + ResponseStatus, + ResponseWebSearchCallCompletedEvent, + ResponseWebSearchCallInProgressEvent, + ResponseWebSearchCallSearchingEvent) +# yapf: enable from openai.types.responses.response_reasoning_item import ( Content as ResponseReasoningTextContent) @@ -2077,10 +2089,24 @@ class ResponseReasoningPartAddedEvent(OpenAIBaseModel): """The type of the event. Always `response.reasoning_part.added`.""" +# vLLM Streaming Events +# Note: we override the response type with the vLLM ResponsesResponse type +class ResponseCompletedEvent(OpenAIResponseCompletedEvent): + response: ResponsesResponse # type: ignore[override] + + +class ResponseCreatedEvent(OpenAIResponseCreatedEvent): + response: ResponsesResponse # type: ignore[override] + + +class ResponseInProgressEvent(OpenAIResponseInProgressEvent): + response: ResponsesResponse # type: ignore[override] + + StreamingResponsesResponse: TypeAlias = Union[ - ResponseCreatedEvent, - ResponseInProgressEvent, - ResponseCompletedEvent, + "ResponseCreatedEvent", + "ResponseInProgressEvent", + "ResponseCompletedEvent", ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent, ResponseContentPartAddedEvent, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index c70baba88d43..eaeab3360c60 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -22,16 +22,16 @@ from openai.types.responses import ( ResponseCodeInterpreterCallCompletedEvent, ResponseCodeInterpreterCallInProgressEvent, ResponseCodeInterpreterCallInterpretingEvent, - ResponseCodeInterpreterToolCallParam, ResponseCompletedEvent, - ResponseContentPartAddedEvent, ResponseContentPartDoneEvent, - ResponseCreatedEvent, ResponseFunctionToolCall, ResponseFunctionWebSearch, - ResponseInProgressEvent, ResponseOutputItem, ResponseOutputItemAddedEvent, - ResponseOutputItemDoneEvent, ResponseOutputMessage, ResponseOutputText, - ResponseReasoningItem, ResponseReasoningTextDeltaEvent, - ResponseReasoningTextDoneEvent, ResponseStatus, ResponseTextDeltaEvent, - ResponseTextDoneEvent, ResponseWebSearchCallCompletedEvent, - ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent, - response_function_web_search, response_text_delta_event) + ResponseCodeInterpreterToolCallParam, ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, ResponseFunctionToolCall, + ResponseFunctionWebSearch, ResponseOutputItem, + ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent, + ResponseOutputMessage, ResponseOutputText, ResponseReasoningItem, + ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent, + ResponseStatus, ResponseTextDeltaEvent, ResponseTextDoneEvent, + ResponseWebSearchCallCompletedEvent, ResponseWebSearchCallInProgressEvent, + ResponseWebSearchCallSearchingEvent, response_function_web_search, + response_text_delta_event) from openai.types.responses.response_output_text import (Logprob, LogprobTopLogprob) # yapf: enable @@ -58,6 +58,9 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse, InputTokensDetails, OutputTokensDetails, RequestResponseMetadata, + ResponseCompletedEvent, + ResponseCreatedEvent, + ResponseInProgressEvent, ResponseReasoningPartAddedEvent, ResponseReasoningPartDoneEvent, ResponsesRequest,