[gpt-oss] use vLLM instead of openai types for streaming (#25186)

Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
This commit is contained in:
Andrew Xia 2025-09-30 15:47:07 -07:00 committed by GitHub
parent 2ce26b9b5d
commit 5db1870bb9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 59 additions and 22 deletions

View File

@@ -379,6 +379,14 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
if event.type == "response.created": if event.type == "response.created":
resp_id = event.response.id resp_id = event.response.id
# test vllm custom types are in the response
if event.type in [
"response.completed", "response.in_progress",
"response.created"
]:
assert 'input_messages' in event.response.model_extra
assert 'output_messages' in event.response.model_extra
if current_event_mode != event.type: if current_event_mode != event.type:
current_event_mode = event.type current_event_mode = event.type
print(f"\n[{event.type}] ", end="", flush=True) print(f"\n[{event.type}] ", end="", flush=True)

View File

@@ -17,20 +17,32 @@ from openai.types.chat.chat_completion_audio import (
ChatCompletionAudio as OpenAIChatCompletionAudio) ChatCompletionAudio as OpenAIChatCompletionAudio)
from openai.types.chat.chat_completion_message import ( from openai.types.chat.chat_completion_message import (
Annotation as OpenAIAnnotation) Annotation as OpenAIAnnotation)
# yapf: enable
from openai.types.responses import ( from openai.types.responses import (
ResponseCodeInterpreterCallCodeDeltaEvent, ResponseCodeInterpreterCallCodeDeltaEvent,
ResponseCodeInterpreterCallCodeDoneEvent, ResponseCodeInterpreterCallCodeDoneEvent,
ResponseCodeInterpreterCallCompletedEvent, ResponseCodeInterpreterCallCompletedEvent,
ResponseCodeInterpreterCallInProgressEvent, ResponseCodeInterpreterCallInProgressEvent,
ResponseCodeInterpreterCallInterpretingEvent, ResponseCompletedEvent, ResponseCodeInterpreterCallInterpretingEvent)
ResponseContentPartAddedEvent, ResponseContentPartDoneEvent, from openai.types.responses import (
ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent, ResponseCompletedEvent as OpenAIResponseCompletedEvent)
ResponseInputItemParam, ResponseOutputItem, ResponseOutputItemAddedEvent, from openai.types.responses import (ResponseContentPartAddedEvent,
ResponseOutputItemDoneEvent, ResponsePrompt, ResponseReasoningItem, ResponseContentPartDoneEvent)
ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent, from openai.types.responses import (
ResponseStatus, ResponseWebSearchCallCompletedEvent, ResponseCreatedEvent as OpenAIResponseCreatedEvent)
ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent) from openai.types.responses import ResponseFunctionToolCall
from openai.types.responses import (
ResponseInProgressEvent as OpenAIResponseInProgressEvent)
from openai.types.responses import (ResponseInputItemParam, ResponseOutputItem,
ResponseOutputItemAddedEvent,
ResponseOutputItemDoneEvent,
ResponsePrompt, ResponseReasoningItem,
ResponseReasoningTextDeltaEvent,
ResponseReasoningTextDoneEvent,
ResponseStatus,
ResponseWebSearchCallCompletedEvent,
ResponseWebSearchCallInProgressEvent,
ResponseWebSearchCallSearchingEvent)
# yapf: enable
from openai.types.responses.response_reasoning_item import ( from openai.types.responses.response_reasoning_item import (
Content as ResponseReasoningTextContent) Content as ResponseReasoningTextContent)
@@ -2077,10 +2089,24 @@ class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
"""The type of the event. Always `response.reasoning_part.added`.""" """The type of the event. Always `response.reasoning_part.added`."""
# vLLM Streaming Events
# Note: we override the response type with the vLLM ResponsesResponse type
class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
response: ResponsesResponse # type: ignore[override]
class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
response: ResponsesResponse # type: ignore[override]
class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
response: ResponsesResponse # type: ignore[override]
StreamingResponsesResponse: TypeAlias = Union[ StreamingResponsesResponse: TypeAlias = Union[
ResponseCreatedEvent, "ResponseCreatedEvent",
ResponseInProgressEvent, "ResponseInProgressEvent",
ResponseCompletedEvent, "ResponseCompletedEvent",
ResponseOutputItemAddedEvent, ResponseOutputItemAddedEvent,
ResponseOutputItemDoneEvent, ResponseOutputItemDoneEvent,
ResponseContentPartAddedEvent, ResponseContentPartAddedEvent,

View File

@@ -22,16 +22,16 @@ from openai.types.responses import (
ResponseCodeInterpreterCallCompletedEvent, ResponseCodeInterpreterCallCompletedEvent,
ResponseCodeInterpreterCallInProgressEvent, ResponseCodeInterpreterCallInProgressEvent,
ResponseCodeInterpreterCallInterpretingEvent, ResponseCodeInterpreterCallInterpretingEvent,
ResponseCodeInterpreterToolCallParam, ResponseCompletedEvent, ResponseCodeInterpreterToolCallParam, ResponseContentPartAddedEvent,
ResponseContentPartAddedEvent, ResponseContentPartDoneEvent, ResponseContentPartDoneEvent, ResponseFunctionToolCall,
ResponseCreatedEvent, ResponseFunctionToolCall, ResponseFunctionWebSearch, ResponseFunctionWebSearch, ResponseOutputItem,
ResponseInProgressEvent, ResponseOutputItem, ResponseOutputItemAddedEvent, ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent,
ResponseOutputItemDoneEvent, ResponseOutputMessage, ResponseOutputText, ResponseOutputMessage, ResponseOutputText, ResponseReasoningItem,
ResponseReasoningItem, ResponseReasoningTextDeltaEvent, ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
ResponseReasoningTextDoneEvent, ResponseStatus, ResponseTextDeltaEvent, ResponseStatus, ResponseTextDeltaEvent, ResponseTextDoneEvent,
ResponseTextDoneEvent, ResponseWebSearchCallCompletedEvent, ResponseWebSearchCallCompletedEvent, ResponseWebSearchCallInProgressEvent,
ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent, ResponseWebSearchCallSearchingEvent, response_function_web_search,
response_function_web_search, response_text_delta_event) response_text_delta_event)
from openai.types.responses.response_output_text import (Logprob, from openai.types.responses.response_output_text import (Logprob,
LogprobTopLogprob) LogprobTopLogprob)
# yapf: enable # yapf: enable
@@ -58,6 +58,9 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
InputTokensDetails, InputTokensDetails,
OutputTokensDetails, OutputTokensDetails,
RequestResponseMetadata, RequestResponseMetadata,
ResponseCompletedEvent,
ResponseCreatedEvent,
ResponseInProgressEvent,
ResponseReasoningPartAddedEvent, ResponseReasoningPartAddedEvent,
ResponseReasoningPartDoneEvent, ResponseReasoningPartDoneEvent,
ResponsesRequest, ResponsesRequest,