Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 08:04:58 +08:00)
[Frontend] Fixes anthropic /v1/messages streaming not containing input_tokens on first chunk (#29971)
Signed-off-by: bbartels <benjamin@bartels.dev>
commit fca3f46658
parent 28097d5638
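For context: before this change, the message_start event on the Anthropic-compatible /v1/messages streaming endpoint carried no input_tokens, because the underlying OpenAI-style stream only attached usage to its final chunk. The sketch below shows how a client would observe the fix; the base URL and model name are placeholders, not part of this commit.

import asyncio

import anthropic


async def main() -> None:
    # Placeholder endpoint and model for a locally running vLLM server.
    client = anthropic.AsyncAnthropic(
        base_url="http://localhost:8000", api_key="EMPTY"
    )
    resp = await client.messages.create(
        model="my-model",
        max_tokens=64,
        messages=[{"role": "user", "content": "Say hello."}],
        stream=True,
    )
    async for chunk in resp:
        if chunk.type == "message_start":
            # After this fix, the very first event already carries prompt-side
            # usage (vLLM attaches a `usage` field to the event itself).
            print(chunk.usage)  # e.g. {"input_tokens": 12, "output_tokens": 0}
            break


asyncio.run(main())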
@@ -69,9 +69,20 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
         stream=True,
     )
 
+    first_chunk = None
+    chunk_count = 0
     async for chunk in resp:
+        chunk_count += 1
+        if first_chunk is None and chunk.type == "message_start":
+            first_chunk = chunk
         print(chunk.model_dump_json())
+
+    assert chunk_count > 0
+    assert first_chunk is not None, "message_start chunk was never observed"
+    assert first_chunk.usage is not None, "first chunk should include usage stats"
+    assert first_chunk.usage["output_tokens"] == 0
+    assert first_chunk.usage["input_tokens"] > 5
 
 
 @pytest.mark.asyncio
 async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
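A note on why the test indexes first_chunk.usage like a dict: usage is not a declared field on the SDK's message_start event, so it survives only as an untyped extra attribute. A minimal sketch of that behavior, assuming the SDK's pydantic models tolerate extra fields (which appears to be the case for the generated Anthropic SDK):

from pydantic import BaseModel, ConfigDict


class RawEvent(BaseModel):
    # Mimics an SDK model that keeps unknown fields instead of dropping them.
    model_config = ConfigDict(extra="allow")
    type: str


event = RawEvent.model_validate(
    {"type": "message_start", "usage": {"input_tokens": 12, "output_tokens": 0}}
)
# The extra field surfaces as a plain dict, hence the ["..."] indexing above.
assert event.usage["input_tokens"] == 12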
@@ -183,7 +183,9 @@ class AnthropicServingMessages(OpenAIServingChat):
 
         if anthropic_request.stream:
             req.stream = anthropic_request.stream
-            req.stream_options = StreamOptions.validate({"include_usage": True})
+            req.stream_options = StreamOptions.validate(
+                {"include_usage": True, "continuous_usage_stats": True}
+            )
 
         if anthropic_request.tool_choice is None:
             req.tool_choice = None
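The second option is the substance of the fix: include_usage alone makes the OpenAI-compatible stream attach usage only to one final chunk, too late for the Anthropic adapter, which must emit input_tokens on its first message_start event. continuous_usage_stats asks for usage on every chunk instead. A sketch of the resulting stream_options payload (field names follow vLLM's OpenAI-compatible protocol):

stream_options = {
    "include_usage": True,           # report usage in the stream at all
    "continuous_usage_stats": True,  # attach usage to every chunk, not only the last
}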
@@ -323,6 +325,12 @@ class AnthropicServingMessages(OpenAIServingChat):
                     content=[],
                     model=origin_chunk.model,
                 ),
+                usage=AnthropicUsage(
+                    input_tokens=origin_chunk.usage.prompt_tokens
+                    if origin_chunk.usage
+                    else 0,
+                    output_tokens=0,
+                ),
             )
             first_item = False
             data = chunk.model_dump_json(exclude_unset=True)
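With usage now set on the first event, the serialized message_start chunk looks roughly like the sketch below (values are illustrative; exclude_unset trims fields the server never set):

first_event = {
    "type": "message_start",
    "message": {
        "role": "assistant",
        "content": [],
        "model": "my-model",  # placeholder model name
    },
    "usage": {
        "input_tokens": 12,  # prompt_tokens copied from the OpenAI chunk, else 0
        "output_tokens": 0,  # nothing generated yet at stream start
    },
}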