[BugFix] Fix chat API continuous usage stats (#9357)
parent 55e081fbad
commit e9d517f276
@@ -433,18 +433,28 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
         model=model_name,
         messages=messages,
         max_tokens=10,
+        extra_body=dict(min_tokens=10),
         temperature=0.0,
         stream=True,
         stream_options={
             "include_usage": True,
-            "continuous_usage_stats": True
+            "continuous_usage_stats": True,
         },
     )
+    last_completion_tokens = 0
     async for chunk in stream:
         assert chunk.usage.prompt_tokens >= 0
-        assert chunk.usage.completion_tokens >= 0
+        assert last_completion_tokens == 0 or \
+            chunk.usage.completion_tokens > last_completion_tokens or \
+            (
+                not chunk.choices and
+                chunk.usage.completion_tokens == last_completion_tokens
+            )
         assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                             chunk.usage.completion_tokens)
+        last_completion_tokens = chunk.usage.completion_tokens
+
+    assert last_completion_tokens == 10

     # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
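For context, a minimal sketch of how a client might consume these continuous usage stats from a vLLM OpenAI-compatible server. The base URL, API key, and model name below are placeholders, not part of this patch; only the stream_options and extra_body payloads mirror the test above.

import asyncio

import openai


async def main() -> None:
    # Placeholder endpoint and model; any vLLM OpenAI-compatible server works.
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")

    stream = await client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "Say hello"}],
        max_tokens=10,
        extra_body=dict(min_tokens=10),  # vLLM-specific sampling param
        stream=True,
        stream_options={
            "include_usage": True,
            # vLLM extension: attach usage to every chunk, not just the last.
            "continuous_usage_stats": True,
        },
    )

    async for chunk in stream:
        if chunk.usage is not None:
            # completion_tokens grows as tokens stream in;
            # total_tokens == prompt_tokens + completion_tokens.
            print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens)


asyncio.run(main())

With continuous_usage_stats enabled, every chunk carries a usage block whose completion_tokens count never decreases, which is exactly what the tightened assertions above check.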
@@ -330,6 +330,14 @@ class OpenAIServingChat(OpenAIServing):
            yield "data: [DONE]\n\n"
            return

+        stream_options = request.stream_options
+        if stream_options:
+            include_usage = stream_options.include_usage
+            include_continuous_usage = include_usage and \
+                                       stream_options.continuous_usage_stats
+        else:
+            include_usage, include_continuous_usage = False, False
+
        try:
            async for res in result_generator:
                if res.prompt_token_ids is not None:
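The hunk above hoists the repeated request.stream_options checks out of the per-chunk path into two flags computed once per request. A standalone sketch of the same truth table follows; the StreamOptions dataclass is only an illustrative stand-in for the request's actual stream_options model.

from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class StreamOptions:
    # Stand-in for the request's stream_options payload.
    include_usage: bool = False
    continuous_usage_stats: bool = False


def resolve_usage_flags(
        stream_options: Optional[StreamOptions]) -> Tuple[bool, bool]:
    """Mirrors the flag computation added in this patch."""
    if stream_options:
        include_usage = stream_options.include_usage
        include_continuous_usage = (include_usage
                                    and stream_options.continuous_usage_stats)
    else:
        include_usage, include_continuous_usage = False, False
    return include_usage, include_continuous_usage


# continuous_usage_stats only takes effect when include_usage is also set.
assert resolve_usage_flags(None) == (False, False)
assert resolve_usage_flags(StreamOptions(include_usage=True)) == (True, False)
assert resolve_usage_flags(
    StreamOptions(continuous_usage_stats=True)) == (False, False)
assert resolve_usage_flags(
    StreamOptions(include_usage=True,
                  continuous_usage_stats=True)) == (True, True)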
@@ -348,7 +356,6 @@ class OpenAIServingChat(OpenAIServing):
                # NOTE num_choices defaults to 1 so this usually executes
                # once per request
                for i in range(num_choices):
-                    tool_parser = tool_parsers[i]
                    choice_data = ChatCompletionResponseStreamChoice(
                        index=i,
                        delta=DeltaMessage(
@@ -364,19 +371,12 @@ class OpenAIServingChat(OpenAIServing):
                        choices=[choice_data],
                        model=model_name)

-                    # if usage should be included
-                    if (request.stream_options
-                            and request.stream_options.include_usage):
-                        # if continuous usage stats are requested, add it
-                        if request.stream_options.continuous_usage_stats:
-                            usage = UsageInfo(
-                                prompt_tokens=num_prompt_tokens,
-                                completion_tokens=0,
-                                total_tokens=num_prompt_tokens)
-                            chunk.usage = usage
-                        # otherwise don't
-                        else:
-                            chunk.usage = None
+                    # if continuous usage stats are requested, add it
+                    if include_continuous_usage:
+                        chunk.usage = UsageInfo(
+                            prompt_tokens=num_prompt_tokens,
+                            completion_tokens=0,
+                            total_tokens=num_prompt_tokens)

                    data = chunk.model_dump_json(exclude_unset=True)
                    yield f"data: {data}\n\n"
@@ -404,17 +404,11 @@ class OpenAIServingChat(OpenAIServing):
                        created=created_time,
                        choices=[choice_data],
                        model=model_name)
-                    if (request.stream_options and
-                            request.stream_options.include_usage):
-                        if (request.stream_options.
-                                continuous_usage_stats):
-                            usage = UsageInfo(
-                                prompt_tokens=num_prompt_tokens,
-                                completion_tokens=0,
-                                total_tokens=num_prompt_tokens)
-                            chunk.usage = usage
-                        else:
-                            chunk.usage = None
+                    if include_continuous_usage:
+                        chunk.usage = UsageInfo(
+                            prompt_tokens=num_prompt_tokens,
+                            completion_tokens=0,
+                            total_tokens=num_prompt_tokens)

                    data = chunk.model_dump_json(
                        exclude_unset=True)
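Both hunks above now assign a prompt-only UsageInfo straight to chunk.usage and drop the explicit chunk.usage = None branch. A plausible reading is that, since the chunk is serialized with exclude_unset=True on the next line anyway, a usage field that is never assigned is simply omitted from the SSE payload, whereas the old explicit None assignment would have serialized it as null. A minimal sketch of that difference under pydantic v2 semantics, using an illustrative trimmed-down chunk model rather than vLLM's actual ChatCompletionStreamResponse:

from typing import Optional

from pydantic import BaseModel


class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0


class Chunk(BaseModel):
    # Trimmed-down stand-in for the real streaming chunk model.
    id: str
    usage: Optional[UsageInfo] = None


chunk = Chunk(id="chatcmpl-123")

# Field never assigned: omitted entirely under exclude_unset.
assert "usage" not in chunk.model_dump_json(exclude_unset=True)

# Explicit assignment (even to None) marks the field as set, so it serializes.
chunk.usage = None
assert '"usage":null' in chunk.model_dump_json(exclude_unset=True)

# The new code path: assign the prompt-only usage directly.
chunk.usage = UsageInfo(prompt_tokens=23, completion_tokens=0, total_tokens=23)
assert '"prompt_tokens":23' in chunk.model_dump_json(exclude_unset=True)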
@@ -494,36 +488,11 @@ class OpenAIServingChat(OpenAIServing):

                    if output.finish_reason is None:
                        # Send token-by-token response for each request.n

                        choice_data = ChatCompletionResponseStreamChoice(
                            index=i,
                            delta=delta_message,
                            logprobs=logprobs,
                            finish_reason=None)
-                        chunk = ChatCompletionStreamResponse(
-                            id=request_id,
-                            object=chunk_object_type,
-                            created=created_time,
-                            choices=[choice_data],
-                            model=model_name)
-
-                        # handle usage stats if requested & if continuous
-                        if (request.stream_options
-                                and request.stream_options.include_usage):
-                            if request.stream_options.continuous_usage_stats:
-                                completion_tokens = len(output.token_ids)
-                                usage = UsageInfo(
-                                    prompt_tokens=num_prompt_tokens,
-                                    completion_tokens=completion_tokens,
-                                    total_tokens=num_prompt_tokens +
-                                    completion_tokens,
-                                )
-                                chunk.usage = usage
-                            else:
-                                chunk.usage = None
-
-                        data = chunk.model_dump_json(exclude_unset=True)
-                        yield f"data: {data}\n\n"
-
                    # if the model is finished generating
                    else:
@@ -573,34 +542,32 @@ class OpenAIServingChat(OpenAIServing):
                            finish_reason=output.finish_reason
                            if not auto_tools_called else "tool_calls",
                            stop_reason=output.stop_reason)

+                        finish_reason_sent[i] = True
+
                        chunk = ChatCompletionStreamResponse(
                            id=request_id,
                            object=chunk_object_type,
                            created=created_time,
                            choices=[choice_data],
                            model=model_name)
-                        if (request.stream_options
-                                and request.stream_options.include_usage):
-                            if request.stream_options.continuous_usage_stats:
-                                completion_tokens = len(output.token_ids)
-                                usage = UsageInfo(
-                                    prompt_tokens=num_prompt_tokens,
-                                    completion_tokens=completion_tokens,
-                                    total_tokens=num_prompt_tokens +
-                                    completion_tokens,
-                                )
-                                chunk.usage = usage
-                            else:
-                                chunk.usage = None
+
+                        # handle usage stats if requested & if continuous
+                        if include_continuous_usage:
+                            completion_tokens = previous_num_tokens[i]
+                            chunk.usage = UsageInfo(
+                                prompt_tokens=num_prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=num_prompt_tokens + completion_tokens,
+                            )
+
                        data = chunk.model_dump_json(exclude_unset=True)
                        yield f"data: {data}\n\n"
-                        finish_reason_sent[i] = True

        # once the final token is handled, if stream_options.include_usage
        # is sent, send the usage
-        if (request.stream_options
-                and request.stream_options.include_usage):
-            completion_tokens = previous_num_tokens[i]
+        if include_usage:
+            completion_tokens = sum(previous_num_tokens)
            final_usage = UsageInfo(
                prompt_tokens=num_prompt_tokens,
                completion_tokens=completion_tokens,
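The last visible hunk also changes how the end-of-stream usage block counts tokens: completion_tokens is now sum(previous_num_tokens) across all choices rather than previous_num_tokens[i], which only reflected whichever choice index the loop ended on. A small sketch of the difference, with made-up per-choice counts:

# Hypothetical per-choice completion token counts for a request with n=3.
previous_num_tokens = [10, 7, 12]

# Old behaviour: only the last choice the loop happened to end on.
i = len(previous_num_tokens) - 1
old_completion_tokens = previous_num_tokens[i]

# New behaviour: total completion tokens across every choice.
new_completion_tokens = sum(previous_num_tokens)

assert old_completion_tokens == 12
assert new_completion_tokens == 29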