[BugFix] Fix chat API continuous usage stats (#9357)
parent 55e081fbad
commit e9d517f276
@@ -433,18 +433,28 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
         model=model_name,
         messages=messages,
         max_tokens=10,
+        extra_body=dict(min_tokens=10),
         temperature=0.0,
         stream=True,
         stream_options={
             "include_usage": True,
-            "continuous_usage_stats": True
+            "continuous_usage_stats": True,
         },
     )
+    last_completion_tokens = 0
     async for chunk in stream:
         assert chunk.usage.prompt_tokens >= 0
-        assert chunk.usage.completion_tokens >= 0
+        assert last_completion_tokens == 0 or \
+            chunk.usage.completion_tokens > last_completion_tokens or \
+            (
+                not chunk.choices and
+                chunk.usage.completion_tokens == last_completion_tokens
+            )
         assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                             chunk.usage.completion_tokens)
+        last_completion_tokens = chunk.usage.completion_tokens
+
+    assert last_completion_tokens == 10

     # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
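For context, a minimal sketch of how a client might consume these continuous usage stats from a vLLM OpenAI-compatible server. The base URL, API key, and model name below are placeholders, not part of this patch; only the stream_options and extra_body payloads mirror the test above.

import asyncio

import openai


async def main() -> None:
    # Placeholder endpoint and model; any vLLM OpenAI-compatible server works.
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")

    stream = await client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "Say hello"}],
        max_tokens=10,
        extra_body=dict(min_tokens=10),  # vLLM-specific sampling param
        stream=True,
        stream_options={
            "include_usage": True,
            # vLLM extension: attach usage to every chunk, not just the last.
            "continuous_usage_stats": True,
        },
    )

    async for chunk in stream:
        if chunk.usage is not None:
            # completion_tokens grows as tokens stream in;
            # total_tokens == prompt_tokens + completion_tokens.
            print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens)


asyncio.run(main())

With continuous_usage_stats enabled, every chunk carries a usage block whose completion_tokens count never decreases, which is exactly what the tightened assertions above check.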
@@ -330,6 +330,14 @@ class OpenAIServingChat(OpenAIServing):
            yield "data: [DONE]\n\n"
            return

+        stream_options = request.stream_options
+        if stream_options:
+            include_usage = stream_options.include_usage
+            include_continuous_usage = include_usage and \
+                                       stream_options.continuous_usage_stats
+        else:
+            include_usage, include_continuous_usage = False, False
+
        try:
            async for res in result_generator:
                if res.prompt_token_ids is not None:
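The hunk above hoists the repeated request.stream_options checks out of the per-chunk path into two flags computed once per request. A standalone sketch of the same truth table follows; the StreamOptions dataclass is only an illustrative stand-in for the request's actual stream_options model.

from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class StreamOptions:
    # Stand-in for the request's stream_options payload.
    include_usage: bool = False
    continuous_usage_stats: bool = False


def resolve_usage_flags(
        stream_options: Optional[StreamOptions]) -> Tuple[bool, bool]:
    """Mirrors the flag computation added in this patch."""
    if stream_options:
        include_usage = stream_options.include_usage
        include_continuous_usage = (include_usage
                                    and stream_options.continuous_usage_stats)
    else:
        include_usage, include_continuous_usage = False, False
    return include_usage, include_continuous_usage


# continuous_usage_stats only takes effect when include_usage is also set.
assert resolve_usage_flags(None) == (False, False)
assert resolve_usage_flags(StreamOptions(include_usage=True)) == (True, False)
assert resolve_usage_flags(
    StreamOptions(continuous_usage_stats=True)) == (False, False)
assert resolve_usage_flags(
    StreamOptions(include_usage=True,
                  continuous_usage_stats=True)) == (True, True)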
@@ -348,7 +356,6 @@ class OpenAIServingChat(OpenAIServing):
                # NOTE num_choices defaults to 1 so this usually executes
                # once per request
                for i in range(num_choices):
-                    tool_parser = tool_parsers[i]
                    choice_data = ChatCompletionResponseStreamChoice(
                        index=i,
                        delta=DeltaMessage(
@@ -364,19 +371,12 @@ class OpenAIServingChat(OpenAIServing):
                        choices=[choice_data],
                        model=model_name)

-                    # if usage should be included
-                    if (request.stream_options
-                            and request.stream_options.include_usage):
-                        # if continuous usage stats are requested, add it
-                        if request.stream_options.continuous_usage_stats:
-                            usage = UsageInfo(
-                                prompt_tokens=num_prompt_tokens,
-                                completion_tokens=0,
-                                total_tokens=num_prompt_tokens)
-                            chunk.usage = usage
-                        # otherwise don't
-                        else:
-                            chunk.usage = None
+                    # if continuous usage stats are requested, add it
+                    if include_continuous_usage:
+                        chunk.usage = UsageInfo(
+                            prompt_tokens=num_prompt_tokens,
+                            completion_tokens=0,
+                            total_tokens=num_prompt_tokens)

                    data = chunk.model_dump_json(exclude_unset=True)
                    yield f"data: {data}\n\n"
@@ -404,17 +404,11 @@ class OpenAIServingChat(OpenAIServing):
                        created=created_time,
                        choices=[choice_data],
                        model=model_name)
-                    if (request.stream_options and
-                            request.stream_options.include_usage):
-                        if (request.stream_options.
-                                continuous_usage_stats):
-                            usage = UsageInfo(
-                                prompt_tokens=num_prompt_tokens,
-                                completion_tokens=0,
-                                total_tokens=num_prompt_tokens)
-                            chunk.usage = usage
-                        else:
-                            chunk.usage = None
+                    if include_continuous_usage:
+                        chunk.usage = UsageInfo(
+                            prompt_tokens=num_prompt_tokens,
+                            completion_tokens=0,
+                            total_tokens=num_prompt_tokens)

                    data = chunk.model_dump_json(
                        exclude_unset=True)
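Both hunks above now assign a prompt-only UsageInfo straight to chunk.usage and drop the explicit chunk.usage = None branch. A plausible reading is that, since the chunk is serialized with exclude_unset=True on the next line anyway, a usage field that is never assigned is simply omitted from the SSE payload, whereas the old explicit None assignment would have serialized it as null. A minimal sketch of that difference under pydantic v2 semantics, using an illustrative trimmed-down chunk model rather than vLLM's actual ChatCompletionStreamResponse:

from typing import Optional

from pydantic import BaseModel


class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0


class Chunk(BaseModel):
    # Trimmed-down stand-in for the real streaming chunk model.
    id: str
    usage: Optional[UsageInfo] = None


chunk = Chunk(id="chatcmpl-123")

# Field never assigned: omitted entirely under exclude_unset.
assert "usage" not in chunk.model_dump_json(exclude_unset=True)

# Explicit assignment (even to None) marks the field as set, so it serializes.
chunk.usage = None
assert '"usage":null' in chunk.model_dump_json(exclude_unset=True)

# The new code path: assign the prompt-only usage directly.
chunk.usage = UsageInfo(prompt_tokens=23, completion_tokens=0, total_tokens=23)
assert '"prompt_tokens":23' in chunk.model_dump_json(exclude_unset=True)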
@@ -494,36 +488,11 @@ class OpenAIServingChat(OpenAIServing):

                    if output.finish_reason is None:
                        # Send token-by-token response for each request.n

                        choice_data = ChatCompletionResponseStreamChoice(
                            index=i,
                            delta=delta_message,
                            logprobs=logprobs,
                            finish_reason=None)
-                        chunk = ChatCompletionStreamResponse(
-                            id=request_id,
-                            object=chunk_object_type,
-                            created=created_time,
-                            choices=[choice_data],
-                            model=model_name)
-
-                        # handle usage stats if requested & if continuous
-                        if (request.stream_options
-                                and request.stream_options.include_usage):
-                            if request.stream_options.continuous_usage_stats:
-                                completion_tokens = len(output.token_ids)
-                                usage = UsageInfo(
-                                    prompt_tokens=num_prompt_tokens,
-                                    completion_tokens=completion_tokens,
-                                    total_tokens=num_prompt_tokens +
-                                    completion_tokens,
-                                )
-                                chunk.usage = usage
-                            else:
-                                chunk.usage = None
-
-                        data = chunk.model_dump_json(exclude_unset=True)
-                        yield f"data: {data}\n\n"
-
                    # if the model is finished generating
                    else:
@@ -573,34 +542,32 @@ class OpenAIServingChat(OpenAIServing):
                            finish_reason=output.finish_reason
                            if not auto_tools_called else "tool_calls",
                            stop_reason=output.stop_reason)

+                        finish_reason_sent[i] = True
+
                        chunk = ChatCompletionStreamResponse(
                            id=request_id,
                            object=chunk_object_type,
                            created=created_time,
                            choices=[choice_data],
                            model=model_name)
-                        if (request.stream_options
-                                and request.stream_options.include_usage):
-                            if request.stream_options.continuous_usage_stats:
-                                completion_tokens = len(output.token_ids)
-                                usage = UsageInfo(
-                                    prompt_tokens=num_prompt_tokens,
-                                    completion_tokens=completion_tokens,
-                                    total_tokens=num_prompt_tokens +
-                                    completion_tokens,
-                                )
-                                chunk.usage = usage
-                            else:
-                                chunk.usage = None
+
+                        # handle usage stats if requested & if continuous
+                        if include_continuous_usage:
+                            completion_tokens = previous_num_tokens[i]
+                            chunk.usage = UsageInfo(
+                                prompt_tokens=num_prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=num_prompt_tokens + completion_tokens,
+                            )
+
                        data = chunk.model_dump_json(exclude_unset=True)
                        yield f"data: {data}\n\n"
-                        finish_reason_sent[i] = True

        # once the final token is handled, if stream_options.include_usage
        # is sent, send the usage
-        if (request.stream_options
-                and request.stream_options.include_usage):
-            completion_tokens = previous_num_tokens[i]
+        if include_usage:
+            completion_tokens = sum(previous_num_tokens)
            final_usage = UsageInfo(
                prompt_tokens=num_prompt_tokens,
                completion_tokens=completion_tokens,
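The last visible hunk also changes how the end-of-stream usage block counts tokens: completion_tokens is now sum(previous_num_tokens) across all choices rather than previous_num_tokens[i], which only reflected whichever choice index the loop ended on. A small sketch of the difference, with made-up per-choice counts:

# Hypothetical per-choice completion token counts for a request with n=3.
previous_num_tokens = [10, 7, 12]

# Old behaviour: only the last choice the loop happened to end on.
i = len(previous_num_tokens) - 1
old_completion_tokens = previous_num_tokens[i]

# New behaviour: total completion tokens across every choice.
new_completion_tokens = sum(previous_num_tokens)

assert old_completion_tokens == 12
assert new_completion_tokens == 29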