# vllm/tests/entrypoints/openai/test_enable_force_include_usage.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import openai
import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer
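
# These tests cover the --enable-force-include-usage server flag (#20983):
# streaming responses must always include token usage, even when the client
# does not request it via stream_options.

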
@pytest.fixture(scope="module")
def chat_server_with_force_include_usage(request): # noqa: F811
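    # Spin up an OpenAI-compatible vLLM server for Qwen/Qwen3-0.6B with
    # --enable-force-include-usage so streamed responses report usage.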
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "128",
        "--enforce-eager",
        "--max-num-seqs",
        "1",
        "--enable-force-include-usage",
        "--port",
        "55857",
        "--gpu-memory-utilization",
        "0.2",
    ]

    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
    async with chat_server_with_force_include_usage.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_chat_with_enable_force_include_usage(
    chat_client_with_force_include_usage: openai.AsyncOpenAI,
):
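    """With --enable-force-include-usage, the final streamed chunk (the one
    with no choices) must carry usage stats even though the request does not
    set stream_options; content chunks must not carry usage.
    """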
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]

    stream = await chat_client_with_force_include_usage.chat.completions.create(
        model="Qwen/Qwen3-0.6B",
        messages=messages,
        max_completion_tokens=10,
        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
    )

    last_completion_tokens = 0
    async for chunk in stream:
        if not len(chunk.choices):
            assert chunk.usage.prompt_tokens >= 0
            assert (
                last_completion_tokens == 0
                or chunk.usage.completion_tokens > last_completion_tokens
                or (
                    not chunk.choices
                    and chunk.usage.completion_tokens == last_completion_tokens
                )
            )
            assert chunk.usage.total_tokens == (
                chunk.usage.prompt_tokens + chunk.usage.completion_tokens
            )
        else:
            assert chunk.usage is None


@pytest.fixture(scope="module")
def transcription_server_with_force_include_usage():
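    # Spin up a Whisper transcription server with --enable-force-include-usage
    # so streamed transcription responses report usage.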
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-num-seqs",
        "1",
        "--enforce-eager",
        "--enable-force-include-usage",
        "--gpu-memory-utilization",
        "0.2",
    ]

    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def transcription_client_with_force_include_usage(
    transcription_server_with_force_include_usage,
):
    async with (
        transcription_server_with_force_include_usage.get_async_client() as async_client
    ):
        yield async_client


@pytest.mark.asyncio
async def test_transcription_with_enable_force_include_usage(
    transcription_client_with_force_include_usage, winning_call
):
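    """With --enable-force-include-usage, streamed transcriptions must end
    with a usage-only chunk (no choices) whose token counts are populated;
    content chunks must not expose a usage field.
    """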
    res = (
        await transcription_client_with_force_include_usage.audio.transcriptions.create(
            model="openai/whisper-large-v3-turbo",
            file=winning_call,
            language="en",
            temperature=0.0,
            stream=True,
            timeout=30,
        )
    )

    async for chunk in res:
        if not len(chunk.choices):
            # final usage sent
            usage = chunk.usage
            assert isinstance(usage, dict)
            assert usage["prompt_tokens"] > 0
            assert usage["completion_tokens"] > 0
            assert usage["total_tokens"] > 0
        else:
            assert not hasattr(chunk, "usage")