[gpt-oss] Add IncompleteDetails to ResponsesResponse (#24561)
Signed-off-by: Andrew Xia <axia@meta.com>
This commit is contained in:
parent 94b03f88dd
commit 25aba2b6a3
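With this change, a Responses API call that stops early because it hit max_output_tokens reports status "incomplete" together with incomplete_details.reason. A minimal client-side sketch, assuming a locally running vLLM OpenAI-compatible server; the base URL and model name below are placeholders:

# Sketch: query the Responses API and inspect incomplete_details.
# Assumes a vLLM server at http://localhost:8000/v1 serving a gpt-oss model;
# the URL and model name are placeholders, not taken from this commit.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.responses.create(
    model="openai/gpt-oss-20b",  # placeholder model name
    input="What is the first paragraph of Moby Dick?",
    reasoning={"effort": "low"},
    max_output_tokens=30,  # deliberately small so the response is cut off
)

if response.status == "incomplete":
    # With this commit the reason is populated; currently only
    # "max_output_tokens" is emitted (content_filter is still a TODO).
    print(response.incomplete_details.reason)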
@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
    assert response.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    assert response is not None
    assert response.status == "incomplete"
    assert response.incomplete_details.reason == "max_output_tokens"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat(client: OpenAI, model_name: str):
@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext):
        available_tools: list[str],
    ):
        self._messages = messages
        self.finish_reason: Optional[str] = None
        self.available_tools = available_tools
        self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
        self.called_tools: set[str] = set()
@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext):
        if self.parser.current_channel in {"analysis", "commentary"}:
            self.num_reasoning_tokens += 1

    def append_output(self, output) -> None:
    def append_output(self, output: Union[RequestOutput,
                                          list[Message]]) -> None:
        if isinstance(output, RequestOutput):
            output_token_ids = output.outputs[0].token_ids
            self.parser = get_streamable_parser_for_assistant()
@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext):
            # Move current turn to previous turn for next turn's calculations
            self.previous_turn = self.current_turn.copy()
            output_msgs = self.parser.messages
            # The responses finish reason is set in the last message
            self.finish_reason = output.outputs[0].finish_reason
        else:
            # Tool output.
            output_msgs = output
@@ -157,18 +161,18 @@ class HarmonyContext(ConversationContext):

    def _update_prefill_token_usage(self, output: RequestOutput) -> None:
        """Update token usage statistics for the prefill phase of generation.

        The prefill phase processes the input prompt tokens. This method:
        1. Counts the prompt tokens for this turn
        2. Calculates tool output tokens for multi-turn conversations
        3. Updates cached token counts
        4. Tracks state for next turn calculations

        Tool output tokens are calculated as:
            current_prompt_tokens - last_turn_prompt_tokens -
            last_turn_output_tokens
        This represents tokens added between turns (typically tool responses).

        Args:
            output: The RequestOutput containing prompt token information
        """
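The tool-output-token formula in the docstring above can be checked with a small worked example; the numbers below are hypothetical, not taken from this change:

# Illustrative arithmetic for:
#   tool_output_tokens = current_prompt_tokens
#                        - last_turn_prompt_tokens
#                        - last_turn_output_tokens
last_turn_prompt_tokens = 120   # prompt fed to the model on the previous turn
last_turn_output_tokens = 40    # tokens the model generated on that turn
current_prompt_tokens = 200     # this turn's prompt (history + tool results)

tool_output_tokens = (current_prompt_tokens
                      - last_turn_prompt_tokens
                      - last_turn_output_tokens)
assert tool_output_tokens == 40  # tokens injected between turns, e.g. tool output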
@@ -214,18 +218,18 @@ class HarmonyContext(ConversationContext):

    def _update_decode_token_usage(self, output: RequestOutput) -> int:
        """Update token usage statistics for the decode phase of generation.

        The decode phase processes the generated output tokens. This method:
        1. Counts output tokens from all completion outputs
        2. Updates the total output token count
        3. Tracks tokens generated in the current turn

        In streaming mode, this is called for each token generated.
        In non-streaming mode, this is called once with all output tokens.

        Args:
            output: The RequestOutput containing generated token information

        Returns:
            int: Number of output tokens processed in this call
        """
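A rough standalone sketch of the decode-phase bookkeeping described above; the class and method names here are made up for illustration and are not part of this change:

# Hypothetical tally mirroring the docstring: in streaming mode this update
# runs once per generated token, in non-streaming mode once with all tokens.
class _UsageTally:
    def __init__(self):
        self.num_output_tokens = 0
        self.current_turn_output_tokens = 0

    def update_decode(self, outputs) -> int:
        """Count new output tokens and fold them into the running totals."""
        new_tokens = sum(len(o.token_ids) for o in outputs)
        self.num_output_tokens += new_tokens
        self.current_turn_output_tokens += new_tokens
        return new_tokens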
@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext):
    def messages(self) -> list:
        return self.parser.messages

    def append_output(self, output) -> None:
    def append_output(self, output: Union[RequestOutput,
                                          list[Message]]) -> None:
        if isinstance(output, RequestOutput):
            # append_output is called for each output token in streaming case,
            # so we only want to add the prompt tokens once for each message.
@@ -387,7 +387,9 @@ def parse_remaining_state(
        id=f"msg_{random_uuid()}",
        content=[output_text],
        role="assistant",
        status="completed",
        # if the parser still has messages (ie if the generator got cut
        # abruptly), this should be incomplete
        status="incomplete",
        type="message",
    )
    return [text_item]
@@ -30,7 +30,7 @@ except ImportError:  # For newer openai versions (>= 1.100.0)
    from openai.types.responses import (ResponseFormatTextConfig as
                                         ResponseTextConfig)

from openai.types.responses.response import ToolChoice
from openai.types.responses.response import IncompleteDetails, ToolChoice
from openai.types.responses.tool import Tool
from openai.types.shared import Metadata, Reasoning
from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
    created_at: int = Field(default_factory=lambda: int(time.time()))
    # error: Optional[ResponseError] = None
    # incomplete_details: Optional[IncompleteDetails] = None
    incomplete_details: Optional[IncompleteDetails] = None
    instructions: Optional[str] = None
    metadata: Optional[Metadata] = None
    model: str
@@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel):
        status: ResponseStatus,
        usage: Optional[ResponseUsage] = None,
    ) -> "ResponsesResponse":

        incomplete_details: Optional[IncompleteDetails] = None
        if status == 'incomplete':
            incomplete_details = IncompleteDetails(reason='max_output_tokens')
        # TODO: implement the other reason for incomplete_details,
        # which is content_filter
        # incomplete_details = IncompleteDetails(reason='content_filter')

        return cls(
            id=request.request_id,
            created_at=created_time,
            incomplete_details=incomplete_details,
            instructions=request.instructions,
            metadata=request.metadata,
            model=model_name,
@@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel):

class TokenizerInfoResponse(OpenAIBaseModel):
    """
    Response containing tokenizer configuration
    equivalent to tokenizer_config.json
    """
@@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel):
    to_language: Optional[str] = None
    """The language of the output audio we transcribe to.

    Please note that this is not currently used by supported models at this
    time, but it is a placeholder for future use, matching translation api.
    """
@@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent,
                                    ResponseReasoningItem,
                                    ResponseReasoningTextDeltaEvent,
                                    ResponseReasoningTextDoneEvent,
                                    response_text_delta_event)
                                    ResponseStatus, response_text_delta_event)
from openai.types.responses.response_output_text import (Logprob,
                                                          LogprobTopLogprob)
# yapf: enable
@@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing):
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        # NOTE: Implementation of status is still WIP, but for now
        # we guarantee that if the status is not "completed", it is accurate.
        # "completed" is implemented as the "catch-all" for now.
        status: ResponseStatus = "completed"

        if self.use_harmony:
            assert isinstance(context, HarmonyContext)
            output = self._make_response_output_items_with_harmony(context)
            num_tool_output_tokens = context.num_tool_output_tokens
            if len(output) > 0:
                if context.finish_reason == "length":
                    status = "incomplete"
                elif context.finish_reason == "abort":
                    status = "cancelled"
            else:
                status = "incomplete"
        else:
            assert isinstance(context, SimpleContext)
            final_res = context.last_output
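The status policy spelled out in the NOTE above amounts to a small mapping from the engine finish reason (and whether any output items exist) to a Responses API status; a hypothetical standalone helper restating it:

# Hypothetical helper, not part of this change: "completed" is the catch-all,
# "length" maps to "incomplete", "abort" maps to "cancelled", and an empty
# output list is also reported as "incomplete".
from typing import Optional

def infer_response_status(finish_reason: Optional[str],
                          num_output_items: int) -> str:
    if num_output_items == 0:
        return "incomplete"
    if finish_reason == "length":
        return "incomplete"
    if finish_reason == "abort":
        return "cancelled"
    return "completed"

assert infer_response_status("length", 3) == "incomplete"
assert infer_response_status("stop", 3) == "completed"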
@@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing):
            model_name=model_name,
            created_time=created_time,
            output=output,
            status="completed",
            status=status,
            usage=usage,
        )
@@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing):
        self,
        context: HarmonyContext,
    ) -> list[ResponseOutputItem]:
        output_items = []
        output_items: list[ResponseOutputItem] = []
        num_init_messages = context.num_init_messages
        for msg in context.messages[num_init_messages:]:
            output_items.extend(parse_output_message(msg))
@@ -10,19 +10,19 @@ from vllm.v1.request import Request, RequestStatus

def remove_all(lst: list, items_to_remove: set) -> list:
    """Remove all items from a list that are in the items_to_remove set.

    This method optimizes for the common case of removing a single item,
    falling back to list comprehension for multiple items.

    Args:
        lst: The list to remove items from
        items_to_remove: Set of items to remove

    Returns:
        Either the modified original list (for single item removal) or
        a new list (for multiple item removal). Callers should use the
        returned value.

    Note:
        For single item removal, this modifies the original list in-place
        and returns it. For multiple items, it creates and returns a new list.
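A sketch of the strategy this docstring describes, using list.remove for the single-item fast path and a comprehension otherwise; it assumes each item appears at most once, as with request lists, and is illustrative rather than the exact body in the repo:

# Illustrative implementation of the documented strategy (not the repo's code).
def remove_all_sketch(lst: list, items_to_remove: set) -> list:
    if len(items_to_remove) == 1:
        # Fast path: in-place removal. Assumes the item appears at most once.
        item = next(iter(items_to_remove))
        if item in lst:
            lst.remove(item)
        return lst
    # General path: build and return a new list without the removed items.
    return [x for x in lst if x not in items_to_remove]

reqs = ["req-1", "req-2", "req-3"]
assert remove_all_sketch(reqs, {"req-2"}) == ["req-1", "req-3"]
assert remove_all_sketch(reqs, {"req-1", "req-3"}) == ["req-3"][:0] + ["req-3"][:0] or True  # new list returned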
@@ -373,17 +373,17 @@ class OutputProcessor:
        1) Compute stats for logging
        2) Detokenize
        3) Create and handle RequestOutput objects:
            * If there is a queue (for usage with AsyncLLM),
              put the RequestOutput objects into the queue for
              handling by the per-request generate() tasks.

            * If there is no queue (for usage with LLMEngine),
              return a list of RequestOutput objects.

        NOTE FOR DEVELOPERS

        vLLM V1 minimizes the number of python loops over the full
        batch to ensure system overheads are minimized. This is the
        only function that should loop over EngineCoreOutputs.

        If you need to touch every element of the batch, do it from