# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import openai
import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer

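# The tests below exercise the --enable-force-include-usage server flag: with it
# enabled, the server is expected to append usage statistics to streaming
# responses even though the clients here never request them via stream_options.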
@pytest.fixture(scope="module")
def chat_server_with_force_include_usage(request):  # noqa: F811
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "128",
        "--enforce-eager",
        "--max-num-seqs",
        "1",
        "--enable-force-include-usage",
        "--port",
        "55857",
        "--gpu-memory-utilization",
        "0.2",
    ]

    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server:
        yield remote_server


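# Async OpenAI client bound to the chat server fixture above.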
@pytest_asyncio.fixture
async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
    async with chat_server_with_force_include_usage.get_async_client() as async_client:
        yield async_client


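# With force-include-usage enabled, every streamed chunk that carries choices
# should have usage == None, while the trailing chunk with no choices should
# report prompt/completion/total token counts that are consistent with each
# other.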
@pytest.mark.asyncio
async def test_chat_with_enable_force_include_usage(
    chat_client_with_force_include_usage: openai.AsyncOpenAI,
):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]

    stream = await chat_client_with_force_include_usage.chat.completions.create(
        model="Qwen/Qwen3-0.6B",
        messages=messages,
        max_completion_tokens=10,
        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
    )
    last_completion_tokens = 0
    async for chunk in stream:
        if not len(chunk.choices):
            assert chunk.usage.prompt_tokens >= 0
            assert (
                last_completion_tokens == 0
                or chunk.usage.completion_tokens > last_completion_tokens
                or (
                    not chunk.choices
                    and chunk.usage.completion_tokens == last_completion_tokens
                )
            )
            assert chunk.usage.total_tokens == (
                chunk.usage.prompt_tokens + chunk.usage.completion_tokens
            )
        else:
            assert chunk.usage is None


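# Same flag, but for the transcriptions endpoint served by a Whisper model.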
@pytest.fixture(scope="module")
def transcription_server_with_force_include_usage():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-num-seqs",
        "1",
        "--enforce-eager",
        "--enable-force-include-usage",
        "--gpu-memory-utilization",
        "0.2",
    ]

    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
        yield remote_server


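# Async OpenAI client bound to the transcription server fixture above.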
@pytest_asyncio.fixture
async def transcription_client_with_force_include_usage(
    transcription_server_with_force_include_usage,
):
    async with (
        transcription_server_with_force_include_usage.get_async_client() as async_client
    ):
        yield async_client


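# The winning_call audio fixture is assumed to be provided by the surrounding
# suite's conftest. For transcription streaming, the final usage payload arrives
# as a plain dict rather than a typed object, hence the isinstance/key checks
# below.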
@pytest.mark.asyncio
async def test_transcription_with_enable_force_include_usage(
    transcription_client_with_force_include_usage, winning_call
):
    res = (
        await transcription_client_with_force_include_usage.audio.transcriptions.create(
            model="openai/whisper-large-v3-turbo",
            file=winning_call,
            language="en",
            temperature=0.0,
            stream=True,
            timeout=30,
        )
    )

    async for chunk in res:
        if not len(chunk.choices):
            # final usage sent
            usage = chunk.usage
            assert isinstance(usage, dict)
            assert usage["prompt_tokens"] > 0
            assert usage["completion_tokens"] > 0
            assert usage["total_tokens"] > 0
        else:
            assert not hasattr(chunk, "usage")