# Mirror of https://git.datalinker.icu/vllm-project/vllm.git
# Synced 2025-12-10 06:35:00 +08:00
# Signed-off-by: NickLucche <nlucches@redhat.com>
# Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
# 193 lines, 6.6 KiB, Python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# imports for guided decoding tests
import io
import json

import httpx
import librosa
import numpy as np
import pytest
import pytest_asyncio
import soundfile as sf

from ...utils import RemoteOpenAIServer

SERVER_ARGS = ["--enforce-eager"]
|
|
|
|
|
|
@pytest.fixture(
    scope="module",
    params=["openai/whisper-small", "google/gemma-3n-E2B-it"],
)
def server(request):
    """Spin up one RemoteOpenAIServer per parametrized model name.

    Yields a ``(remote_server, model_name)`` tuple so tests can reach both
    the server and the model it is serving.
    """
    model_name = request.param
    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
        yield remote_server, model_name
|
|
|
|
|
|
@pytest_asyncio.fixture
async def client_and_model(server):
    """Yield an async OpenAI client bound to the running server plus its model name."""
    remote_server, model_name = server
    async with remote_server.get_async_client() as async_client:
        yield async_client, model_name
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
    """A text-to-text model must reject the Translations API with a 400 error."""
    # text to text model
    model_name = "JackFram/llama-68m"
    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
        # Use the client as an async context manager so its HTTP session is
        # closed (the original leaked it; see the client_and_model fixture
        # for the same pattern).
        async with remote_server.get_async_client() as client:
            res = await client.audio.translations.create(model=model_name,
                                                         file=foscolo,
                                                         temperature=0.0)
            err = res.error
            assert err["code"] == 400 and not res.text
            assert err["message"] == "The model does not support Translations API"
|
|
|
|
|
|
# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo, client_and_model):
    """Translate an Italian audio clip to English and check a known phrase."""
    client, model_name = client_and_model
    # TODO remove `language="it"` once language detection is implemented
    extra = dict(language="it", to_language="en")
    translation = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        response_format="text",
        extra_body=extra,
        temperature=0.0)
    text = json.loads(translation)["text"]
    assert "greek sea" in text.strip().lower()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_audio_prompt(foscolo, client_and_model):
    """Conditioning on a prompt must not leak the prompt into the output."""
    client, model_name = client_and_model
    # Condition whisper on starting text
    prompt = "Nor have I ever"
    result = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        prompt=prompt,
        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0)
    out = json.loads(result)["text"]
    assert "Nor will I ever touch the sacred" not in out
    assert prompt not in out
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_streaming_response(foscolo, client_and_model, server):
    """Streamed translation output should closely match the non-streamed one."""
    client, model_name = client_and_model
    res_no_stream = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        response_format="json",
        extra_body=dict(language="it", to_language="en", seed=42),
        temperature=0.0)

    # Stream via HTTPX since OpenAI translation client doesn't expose streaming
    remote_server, model_name = server
    url = remote_server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {remote_server.DUMMY_API_KEY}"}
    data = {
        "model": model_name,
        "language": "it",
        "to_language": "en",
        "stream": True,
        "temperature": 0.0,
        "seed": 42,
    }
    foscolo.seek(0)
    translation = ""
    async with httpx.AsyncClient() as http_client:
        async with http_client.stream("POST",
                                      url,
                                      headers=headers,
                                      data=data,
                                      files={"file": foscolo}) as response:
            async for raw_line in response.aiter_lines():
                if not raw_line:
                    continue
                payload = raw_line
                # SSE frames are prefixed with "data: "; strip it before parsing.
                if payload.startswith("data: "):
                    payload = payload[len("data: "):]
                if payload.strip() == "[DONE]":
                    break
                chunk = json.loads(payload)
                delta = chunk["choices"][0].get("delta", {})
                translation += delta.get("content") or ""

    res_stream = translation.split()
    # NOTE There's a small non-deterministic issue here, likely in the attn
    # computation, which will cause a few tokens to be different, while still
    # being very close semantically.
    matching = sum(
        x == y for x, y in zip(res_stream, res_no_stream.text.split()))
    assert matching >= len(res_stream) * 0.9
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_stream_options(foscolo, server):
    """With usage stats enabled, every chunk carries usage and a final usage frame is sent."""
    remote_server, model_name = server
    url = remote_server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {remote_server.DUMMY_API_KEY}"}
    data = {
        "model": model_name,
        "language": "it",
        "to_language": "en",
        "stream": True,
        "stream_include_usage": True,
        "stream_continuous_usage_stats": True,
        "temperature": 0.0,
    }
    foscolo.seek(0)
    saw_final_usage = False
    usage_in_every_chunk = True
    async with httpx.AsyncClient() as http_client:
        async with http_client.stream("POST",
                                      url,
                                      headers=headers,
                                      data=data,
                                      files={"file": foscolo}) as response:
            async for raw_line in response.aiter_lines():
                if not raw_line:
                    continue
                payload = raw_line
                # Strip the SSE "data: " prefix before JSON-decoding.
                if payload.startswith("data: "):
                    payload = payload[len("data: "):]
                if payload.strip() == "[DONE]":
                    break
                chunk = json.loads(payload)
                if chunk.get("choices", []):
                    usage_in_every_chunk = (usage_in_every_chunk
                                            and "usage" in chunk)
                else:
                    # final usage sent
                    saw_final_usage = True
    assert saw_final_usage and usage_in_every_chunk
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_long_audio_request(foscolo, client_and_model):
    """Repeating the clip twice should produce the known phrase twice in the output."""
    client, model_name = client_and_model
    if model_name == "google/gemma-3n-E2B-it":
        pytest.skip("Gemma3n does not support long audio requests")
    foscolo.seek(0)
    audio, sr = librosa.load(foscolo)
    # Repeat the audio twice and write it to an in-memory WAV buffer.
    buffer = io.BytesIO()
    sf.write(buffer, np.tile(audio, 2), sr, format='WAV')
    buffer.seek(0)
    translation = await client.audio.translations.create(
        model=model_name,
        file=buffer,
        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0)
    out = json.loads(translation)["text"].strip().lower()
    assert out.count("greek sea") == 2
|