[gpt-oss] use vLLM instead of openai types for streaming (#25186)

Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
Author: Andrew Xia
Date: 2025-09-30 15:47:07 -07:00
Commit: 5db1870bb9 (parent 2ce26b9b5d), committed by GitHub

3 changed files with 59 additions and 22 deletions


@@ -379,6 +379,14 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
         if event.type == "response.created":
             resp_id = event.response.id
 
+        # test vllm custom types are in the response
+        if event.type in [
+                "response.completed", "response.in_progress",
+                "response.created"
+        ]:
+            assert 'input_messages' in event.response.model_extra
+            assert 'output_messages' in event.response.model_extra
+
         if current_event_mode != event.type:
             current_event_mode = event.type
             print(f"\n[{event.type}] ", end="", flush=True)


@@ -17,20 +17,32 @@ from openai.types.chat.chat_completion_audio import (
     ChatCompletionAudio as OpenAIChatCompletionAudio)
 from openai.types.chat.chat_completion_message import (
     Annotation as OpenAIAnnotation)
 # yapf: enable
 from openai.types.responses import (
     ResponseCodeInterpreterCallCodeDeltaEvent,
     ResponseCodeInterpreterCallCodeDoneEvent,
     ResponseCodeInterpreterCallCompletedEvent,
     ResponseCodeInterpreterCallInProgressEvent,
-    ResponseCodeInterpreterCallInterpretingEvent, ResponseCompletedEvent,
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
-    ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent,
-    ResponseInputItemParam, ResponseOutputItem, ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent, ResponsePrompt, ResponseReasoningItem,
-    ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
-    ResponseStatus, ResponseWebSearchCallCompletedEvent,
-    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent)
+    ResponseCodeInterpreterCallInterpretingEvent)
+from openai.types.responses import (
+    ResponseCompletedEvent as OpenAIResponseCompletedEvent)
+from openai.types.responses import (ResponseContentPartAddedEvent,
+                                    ResponseContentPartDoneEvent)
+from openai.types.responses import (
+    ResponseCreatedEvent as OpenAIResponseCreatedEvent)
+from openai.types.responses import ResponseFunctionToolCall
+from openai.types.responses import (
+    ResponseInProgressEvent as OpenAIResponseInProgressEvent)
+from openai.types.responses import (ResponseInputItemParam, ResponseOutputItem,
+                                    ResponseOutputItemAddedEvent,
+                                    ResponseOutputItemDoneEvent,
+                                    ResponsePrompt, ResponseReasoningItem,
+                                    ResponseReasoningTextDeltaEvent,
+                                    ResponseReasoningTextDoneEvent,
+                                    ResponseStatus,
+                                    ResponseWebSearchCallCompletedEvent,
+                                    ResponseWebSearchCallInProgressEvent,
+                                    ResponseWebSearchCallSearchingEvent)
 # yapf: enable
 from openai.types.responses.response_reasoning_item import (
     Content as ResponseReasoningTextContent)
@@ -2077,10 +2089,24 @@ class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
     """The type of the event. Always `response.reasoning_part.added`."""
 
 
+# vLLM Streaming Events
+# Note: we override the response type with the vLLM ResponsesResponse type
+class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
 StreamingResponsesResponse: TypeAlias = Union[
-    ResponseCreatedEvent,
-    ResponseInProgressEvent,
-    ResponseCompletedEvent,
+    "ResponseCreatedEvent",
+    "ResponseInProgressEvent",
+    "ResponseCompletedEvent",
     ResponseOutputItemAddedEvent,
     ResponseOutputItemDoneEvent,
     ResponseContentPartAddedEvent,
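The pattern in this hunk narrows a pydantic field in a subclass: the OpenAI base classes declare `response` as OpenAI's `Response` model, and the vLLM subclasses redeclare it as `ResponsesResponse`. mypy flags such a redeclaration as an unsafe override, hence the `type: ignore[override]`. The quoted entries in the `Union` are forward references that resolve to the vLLM subclasses defined just above rather than the same-named OpenAI imports. A minimal self-contained sketch of the mechanics (the `Base*`/`VLLM*` names are illustrative stand-ins, not vLLM's):

```python
from typing import TypeAlias, Union

from pydantic import BaseModel


class BaseResponse(BaseModel):
    id: str


class BaseCreatedEvent(BaseModel):
    type: str = "response.created"
    response: BaseResponse


class VLLMResponse(BaseResponse):
    input_messages: list[str] = []


class CreatedEvent(BaseCreatedEvent):
    # narrowing the field type; mypy treats this as an unsafe override
    response: VLLMResponse  # type: ignore[override]


# the string entry is a ForwardRef that binds to the subclass above
StreamEvent: TypeAlias = Union["CreatedEvent", BaseCreatedEvent]

event = CreatedEvent(response=VLLMResponse(id="r1", input_messages=["hi"]))
assert isinstance(event.response, VLLMResponse)
```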


@@ -22,16 +22,16 @@ from openai.types.responses import (
     ResponseCodeInterpreterCallCompletedEvent,
     ResponseCodeInterpreterCallInProgressEvent,
     ResponseCodeInterpreterCallInterpretingEvent,
-    ResponseCodeInterpreterToolCallParam, ResponseCompletedEvent,
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
-    ResponseCreatedEvent, ResponseFunctionToolCall, ResponseFunctionWebSearch,
-    ResponseInProgressEvent, ResponseOutputItem, ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent, ResponseOutputMessage, ResponseOutputText,
-    ResponseReasoningItem, ResponseReasoningTextDeltaEvent,
-    ResponseReasoningTextDoneEvent, ResponseStatus, ResponseTextDeltaEvent,
-    ResponseTextDoneEvent, ResponseWebSearchCallCompletedEvent,
-    ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent,
-    response_function_web_search, response_text_delta_event)
+    ResponseCodeInterpreterToolCallParam, ResponseContentPartAddedEvent,
+    ResponseContentPartDoneEvent, ResponseFunctionToolCall,
+    ResponseFunctionWebSearch, ResponseOutputItem,
+    ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent,
+    ResponseOutputMessage, ResponseOutputText, ResponseReasoningItem,
+    ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
+    ResponseStatus, ResponseTextDeltaEvent, ResponseTextDoneEvent,
+    ResponseWebSearchCallCompletedEvent, ResponseWebSearchCallInProgressEvent,
+    ResponseWebSearchCallSearchingEvent, response_function_web_search,
+    response_text_delta_event)
 from openai.types.responses.response_output_text import (Logprob,
                                                          LogprobTopLogprob)
 # yapf: enable
@@ -58,6 +58,9 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
                                               InputTokensDetails,
                                               OutputTokensDetails,
                                               RequestResponseMetadata,
+                                              ResponseCompletedEvent,
+                                              ResponseCreatedEvent,
+                                              ResponseInProgressEvent,
                                               ResponseReasoningPartAddedEvent,
                                               ResponseReasoningPartDoneEvent,
                                               ResponsesRequest,
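On the serving side, these imports mean the streaming handler now constructs the vLLM event classes, so the JSON it emits is serialized from `ResponsesResponse` and carries the vLLM-only fields end to end. A rough sketch of why that falls out of pydantic serialization (hypothetical stand-in classes, not vLLM's actual models):

```python
from pydantic import BaseModel


class VLLMResponse(BaseModel):
    id: str
    # vLLM-specific additions; an OpenAI SDK client will surface these
    # in model_extra since they are not part of the OpenAI schema
    input_messages: list[str] = []
    output_messages: list[str] = []


class ResponseCreatedEvent(BaseModel):
    type: str = "response.created"
    response: VLLMResponse


event = ResponseCreatedEvent(response=VLLMResponse(id="resp_1"))
# the wire payload is built from the event's JSON dump, so the added
# fields ride along to the client
print(event.model_dump_json())
```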