[CI] Speed up Whisper tests by reusing server (#22859)

Signed-off-by: mgoin <mgoin64@gmail.com>
Michael Goin 2025-08-15 16:56:31 -04:00 committed by GitHub
parent a344a1a7da
commit 8a87cd27d9
2 changed files with 260 additions and 288 deletions
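
The heart of the change: instead of each test paying the startup cost of launching its own vLLM server, a module-scoped pytest fixture starts the server once and an async fixture hands every test a fresh OpenAI client bound to it. A condensed sketch of the pattern as applied in both test files (the example test body is illustrative, not part of the diff):

    import pytest
    import pytest_asyncio

    from ...utils import RemoteOpenAIServer

    MODEL_NAME = "openai/whisper-large-v3-turbo"
    SERVER_ARGS = ["--enforce-eager"]


    @pytest.fixture(scope="module")
    def server():
        # Launched once per test module; every test below reuses this process.
        with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
            yield remote_server


    @pytest_asyncio.fixture
    async def client(server):
        # Cheap per-test client against the shared server.
        async with server.get_async_client() as async_client:
            yield async_client


    @pytest.mark.asyncio
    async def test_transcribes(client, mary_had_lamb):  # illustrative test
        out = await client.audio.transcriptions.create(
            model=MODEL_NAME, file=mary_had_lamb, language="en",
            response_format="text", temperature=0.0)

Only tests that deliberately need a different model (e.g. the non-ASR llama-68m checks) still spin up their own server.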

View File

@@ -4,19 +4,20 @@
 # imports for guided decoding tests
 import io
 import json
-from unittest.mock import patch

 import librosa
 import numpy as np
 import openai
 import pytest
+import pytest_asyncio
 import soundfile as sf
-from openai._base_client import AsyncAPIClient
 from vllm.assets.audio import AudioAsset

 from ...utils import RemoteOpenAIServer

+MODEL_NAME = "openai/whisper-large-v3-turbo"
+SERVER_ARGS = ["--enforce-eager"]
+
 MISTRAL_FORMAT_ARGS = [
     "--tokenizer_mode", "mistral", "--config_format", "mistral",
     "--load_format", "mistral"
@@ -37,6 +38,18 @@ def winning_call():
         yield f


+@pytest.fixture(scope="module")
+def server():
+    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
@@ -60,54 +73,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
     assert "Mary had a little lamb," in out


-@pytest.mark.asyncio
-async def test_bad_requests(mary_had_lamb):
-    model_name = "openai/whisper-small"
-    server_args = ["--enforce-eager"]
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        client = remote_server.get_async_client()
-
-        # invalid language
-        with pytest.raises(openai.BadRequestError):
-            await client.audio.transcriptions.create(model=model_name,
-                                                     file=mary_had_lamb,
-                                                     language="hh",
-                                                     temperature=0.0)
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"])
-async def test_long_audio_request(mary_had_lamb, model_name):
-    server_args = ["--enforce-eager"]
-    mary_had_lamb.seek(0)
-    audio, sr = librosa.load(mary_had_lamb)
-    # Add small silence after each audio for repeatability in the split process
-    audio = np.pad(audio, (0, 1600))
-    repeated_audio = np.tile(audio, 10)
-    # Repeated audio to buffer
-    buffer = io.BytesIO()
-    sf.write(buffer, repeated_audio, sr, format='WAV')
-    buffer.seek(0)
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        client = remote_server.get_async_client()
-        transcription = await client.audio.transcriptions.create(
-            model=model_name,
-            file=buffer,
-            language="en",
-            response_format="text",
-            temperature=0.0)
-        out = json.loads(transcription)['text']
-        counts = out.count("Mary had a little lamb")
-        assert counts == 10, counts
-
-
 @pytest.mark.asyncio
 async def test_non_asr_model(winning_call):
     # text to text model
     model_name = "JackFram/llama-68m"
-    server_args = ["--enforce-eager"]
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
         client = remote_server.get_async_client()
         res = await client.audio.transcriptions.create(model=model_name,
                                                        file=winning_call,
@@ -120,62 +90,73 @@ async def test_non_asr_model(winning_call):

 @pytest.mark.asyncio
-async def test_completion_endpoints():
+async def test_bad_requests(mary_had_lamb, client):
+    # invalid language
+    with pytest.raises(openai.BadRequestError):
+        await client.audio.transcriptions.create(model=MODEL_NAME,
+                                                 file=mary_had_lamb,
+                                                 language="hh",
+                                                 temperature=0.0)
+
+
+@pytest.mark.asyncio
+async def test_long_audio_request(mary_had_lamb, client):
+    mary_had_lamb.seek(0)
+    audio, sr = librosa.load(mary_had_lamb)
+    # Add small silence after each audio for repeatability in the split process
+    audio = np.pad(audio, (0, 1600))
+    repeated_audio = np.tile(audio, 10)
+    # Repeated audio to buffer
+    buffer = io.BytesIO()
+    sf.write(buffer, repeated_audio, sr, format='WAV')
+    buffer.seek(0)
+    transcription = await client.audio.transcriptions.create(
+        model=MODEL_NAME,
+        file=buffer,
+        language="en",
+        response_format="text",
+        temperature=0.0)
+    out = json.loads(transcription)['text']
+    counts = out.count("Mary had a little lamb")
+    assert counts == 10, counts
+
+
+@pytest.mark.asyncio
+async def test_completion_endpoints(client):
     # text to text model
-    model_name = "openai/whisper-small"
-    server_args = ["--enforce-eager"]
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        client = remote_server.get_async_client()
     res = await client.chat.completions.create(
-        model=model_name,
+        model=MODEL_NAME,
         messages=[{
             "role": "system",
             "content": "You are a helpful assistant."
         }])
     err = res.error
     assert err["code"] == 400
-        assert err[
-            "message"] == "The model does not support Chat Completions API"
+    assert err["message"] == "The model does not support Chat Completions API"

-    res = await client.completions.create(model=model_name, prompt="Hello")
+    res = await client.completions.create(model=MODEL_NAME, prompt="Hello")
     err = res.error
     assert err["code"] == 400
     assert err["message"] == "The model does not support Completions API"


 @pytest.mark.asyncio
-async def test_streaming_response(winning_call):
+async def test_streaming_response(winning_call, client):
-    model_name = "openai/whisper-small"
-    server_args = ["--enforce-eager"]
     transcription = ""
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        client = remote_server.get_async_client()
     res_no_stream = await client.audio.transcriptions.create(
-        model=model_name,
+        model=MODEL_NAME,
         file=winning_call,
         response_format="json",
         language="en",
         temperature=0.0)
-        # Unfortunately this only works when the openai client is patched
-        # to use streaming mode, not exposed in the transcription api.
-        original_post = AsyncAPIClient.post
-
-        async def post_with_stream(*args, **kwargs):
-            kwargs['stream'] = True
-            return await original_post(*args, **kwargs)
-
-        with patch.object(AsyncAPIClient, "post", new=post_with_stream):
-            client = remote_server.get_async_client()
-            res = await client.audio.transcriptions.create(
-                model=model_name,
+    res = await client.audio.transcriptions.create(model=MODEL_NAME,
                                                    file=winning_call,
                                                    language="en",
                                                    temperature=0.0,
-                                                   extra_body=dict(stream=True),
+                                                   stream=True,
                                                    timeout=30)
     # Reconstruct from chunks and validate
     async for chunk in res:
-                # just a chunk
         text = chunk.choices[0]['delta']['content']
         transcription += text
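
Worth noting: streaming no longer requires patching the OpenAI SDK's internal AsyncAPIClient.post. vLLM's transcription endpoint accepts stream=True directly, so deltas can be consumed straight from the SDK. A minimal sketch, with audio_file standing in for any file-like object:

    transcription = ""
    res = await client.audio.transcriptions.create(model=MODEL_NAME,
                                                   file=audio_file,
                                                   language="en",
                                                   temperature=0.0,
                                                   stream=True)
    async for chunk in res:
        # Each chunk carries an incremental delta, as asserted in the test above.
        transcription += chunk.choices[0]['delta']['content']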
@@ -183,25 +164,14 @@ async def test_streaming_response(winning_call):

 @pytest.mark.asyncio
-async def test_stream_options(winning_call):
-    model_name = "openai/whisper-small"
-    server_args = ["--enforce-eager"]
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        original_post = AsyncAPIClient.post
-
-        async def post_with_stream(*args, **kwargs):
-            kwargs['stream'] = True
-            return await original_post(*args, **kwargs)
-
-        with patch.object(AsyncAPIClient, "post", new=post_with_stream):
-            client = remote_server.get_async_client()
+async def test_stream_options(winning_call, client):
     res = await client.audio.transcriptions.create(
-        model=model_name,
+        model=MODEL_NAME,
         file=winning_call,
         language="en",
         temperature=0.0,
-        extra_body=dict(stream=True,
-        stream_include_usage=True,
+        stream=True,
+        extra_body=dict(stream_include_usage=True,
                         stream_continuous_usage_stats=True),
         timeout=30)
     final = False
@@ -216,17 +186,13 @@ async def test_stream_options(winning_call):

 @pytest.mark.asyncio
-async def test_sampling_params(mary_had_lamb):
+async def test_sampling_params(mary_had_lamb, client):
     """
     Compare sampling with params and greedy sampling to assert results
     are different when extreme sampling parameters values are picked.
     """
-    model_name = "openai/whisper-small"
-    server_args = ["--enforce-eager"]
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        client = remote_server.get_async_client()
     transcription = await client.audio.transcriptions.create(
-        model=model_name,
+        model=MODEL_NAME,
         file=mary_had_lamb,
         language="en",
         temperature=0.8,
@@ -239,7 +205,7 @@ async def test_sampling_params(mary_had_lamb):
             presence_penalty=2.0))

     greedy_transcription = await client.audio.transcriptions.create(
-        model=model_name,
+        model=MODEL_NAME,
         file=mary_had_lamb,
         language="en",
         temperature=0.0,
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb): async def test_audio_prompt(mary_had_lamb, client):
model_name = "openai/whisper-large-v3-turbo"
server_args = ["--enforce-eager"]
prompt = "This is a speech, recorded in a phonograph." prompt = "This is a speech, recorded in a phonograph."
with RemoteOpenAIServer(model_name, server_args) as remote_server:
#Prompts should not omit the part of original prompt while transcribing. #Prompts should not omit the part of original prompt while transcribing.
prefix = "The first words I spoke in the original phonograph" prefix = "The first words I spoke in the original phonograph"
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create( transcription = await client.audio.transcriptions.create(
model=model_name, model=MODEL_NAME,
file=mary_had_lamb, file=mary_had_lamb,
language="en", language="en",
response_format="text", response_format="text",
@@ -266,7 +228,7 @@ async def test_audio_prompt(mary_had_lamb):
     out = json.loads(transcription)['text']
     assert prefix in out
     transcription_wprompt = await client.audio.transcriptions.create(
-        model=model_name,
+        model=MODEL_NAME,
         file=mary_had_lamb,
         language="en",
         response_format="text",

View File

@@ -4,18 +4,21 @@
 import io
 # imports for guided decoding tests
 import json
-from unittest.mock import patch

+import httpx
 import librosa
 import numpy as np
 import pytest
+import pytest_asyncio
 import soundfile as sf
-from openai._base_client import AsyncAPIClient
 from vllm.assets.audio import AudioAsset

 from ...utils import RemoteOpenAIServer

+MODEL_NAME = "openai/whisper-small"
+SERVER_ARGS = ["--enforce-eager"]
+

 @pytest.fixture
 def foscolo():
@@ -25,15 +28,37 @@ def foscolo():
     yield f


+@pytest.fixture(scope="module")
+def server():
+    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_non_asr_model(foscolo):
+    # text to text model
+    model_name = "JackFram/llama-68m"
+    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
+        client = remote_server.get_async_client()
+        res = await client.audio.translations.create(model=model_name,
+                                                     file=foscolo,
+                                                     temperature=0.0)
+        err = res.error
+        assert err["code"] == 400 and not res.text
+        assert err["message"] == "The model does not support Translations API"
+
+
 # NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
 @pytest.mark.asyncio
-async def test_basic_audio(foscolo):
+async def test_basic_audio(foscolo, client):
-    model_name = "openai/whisper-small"
-    server_args = ["--enforce-eager"]
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        client = remote_server.get_async_client()
     translation = await client.audio.translations.create(
-            model=model_name,
+        model=MODEL_NAME,
         file=foscolo,
         response_format="text",
         # TODO remove once language detection is implemented
@@ -44,15 +69,11 @@ async def test_basic_audio(foscolo):

 @pytest.mark.asyncio
-async def test_audio_prompt(foscolo):
+async def test_audio_prompt(foscolo, client):
-    model_name = "openai/whisper-small"
-    server_args = ["--enforce-eager"]
     # Condition whisper on starting text
     prompt = "Nor have I ever"
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        client = remote_server.get_async_client()
     transcription = await client.audio.translations.create(
-            model=model_name,
+        model=MODEL_NAME,
         file=foscolo,
         prompt=prompt,
         extra_body=dict(language="it"),
@@ -64,95 +85,86 @@ async def test_audio_prompt(foscolo):

 @pytest.mark.asyncio
-async def test_non_asr_model(foscolo):
-    # text to text model
-    model_name = "JackFram/llama-68m"
-    server_args = ["--enforce-eager"]
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        client = remote_server.get_async_client()
-        res = await client.audio.translations.create(model=model_name,
-                                                     file=foscolo,
-                                                     temperature=0.0)
-        err = res.error
-        assert err["code"] == 400 and not res.text
-        assert err["message"] == "The model does not support Translations API"
-
-
-@pytest.mark.asyncio
-async def test_streaming_response(foscolo):
+async def test_streaming_response(foscolo, client, server):
-    model_name = "openai/whisper-small"
-    server_args = ["--enforce-eager"]
     translation = ""
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        client = remote_server.get_async_client()
     res_no_stream = await client.audio.translations.create(
-            model=model_name,
+        model=MODEL_NAME,
         file=foscolo,
         response_format="json",
         extra_body=dict(language="it"),
         temperature=0.0)
-        # Unfortunately this only works when the openai client is patched
-        # to use streaming mode, not exposed in the translation api.
-        original_post = AsyncAPIClient.post
-
-        async def post_with_stream(*args, **kwargs):
-            kwargs['stream'] = True
-            return await original_post(*args, **kwargs)
-
-        with patch.object(AsyncAPIClient, "post", new=post_with_stream):
-            client = remote_server.get_async_client()
-            res = await client.audio.translations.create(model=model_name,
-                                                          file=foscolo,
-                                                          temperature=0.0,
-                                                          extra_body=dict(
-                                                              stream=True,
-                                                              language="it"))
-            # Reconstruct from chunks and validate
-            async for chunk in res:
-                # just a chunk
-                text = chunk.choices[0]['delta']['content']
-                translation += text
+    # Stream via HTTPX since OpenAI translation client doesn't expose streaming
+    url = server.url_for("v1/audio/translations")
+    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
+    data = {
+        "model": MODEL_NAME,
+        "language": "it",
+        "stream": True,
+        "temperature": 0.0,
+    }
+    foscolo.seek(0)
+    async with httpx.AsyncClient() as http_client:
+        files = {"file": foscolo}
+        async with http_client.stream("POST",
+                                      url,
+                                      headers=headers,
+                                      data=data,
+                                      files=files) as response:
+            async for line in response.aiter_lines():
+                if not line:
+                    continue
+                if line.startswith("data: "):
+                    line = line[len("data: "):]
+                if line.strip() == "[DONE]":
+                    break
+                chunk = json.loads(line)
+                text = chunk["choices"][0].get("delta", {}).get("content")
+                translation += text or ""
     assert translation == res_no_stream.text
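
The translation client in the OpenAI SDK has no streaming mode to pass through, so the rewritten test POSTs the multipart request itself with httpx and parses the server-sent events by hand. The inline parsing could be captured in a helper like this (a hypothetical refactor for readability, not part of the commit; assumes json is imported as in the test module):

    async def iter_sse_json(response):
        """Yield decoded JSON chunks from an SSE stream until [DONE]."""
        async for line in response.aiter_lines():
            if not line:
                continue
            if line.startswith("data: "):
                line = line[len("data: "):]
            if line.strip() == "[DONE]":
                return
            yield json.loads(line)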

 @pytest.mark.asyncio
-async def test_stream_options(foscolo):
+async def test_stream_options(foscolo, client, server):
-    model_name = "openai/whisper-small"
-    server_args = ["--enforce-eager"]
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        original_post = AsyncAPIClient.post
-
-        async def post_with_stream(*args, **kwargs):
-            kwargs['stream'] = True
-            return await original_post(*args, **kwargs)
-
-        with patch.object(AsyncAPIClient, "post", new=post_with_stream):
-            client = remote_server.get_async_client()
-            res = await client.audio.translations.create(
-                model=model_name,
-                file=foscolo,
-                temperature=0.0,
-                extra_body=dict(language="it",
-                                stream=True,
-                                stream_include_usage=True,
-                                stream_continuous_usage_stats=True))
+    url = server.url_for("v1/audio/translations")
+    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
+    data = {
+        "model": MODEL_NAME,
+        "language": "it",
+        "stream": True,
+        "stream_include_usage": True,
+        "stream_continuous_usage_stats": True,
+        "temperature": 0.0,
+    }
+    foscolo.seek(0)
     final = False
     continuous = True
-            async for chunk in res:
-                if not len(chunk.choices):
+    async with httpx.AsyncClient() as http_client:
+        files = {"file": foscolo}
+        async with http_client.stream("POST",
+                                      url,
+                                      headers=headers,
+                                      data=data,
+                                      files=files) as response:
+            async for line in response.aiter_lines():
+                if not line:
+                    continue
+                if line.startswith("data: "):
+                    line = line[len("data: "):]
+                if line.strip() == "[DONE]":
+                    break
+                chunk = json.loads(line)
+                choices = chunk.get("choices", [])
+                if not choices:
                     # final usage sent
                     final = True
                 else:
-                    continuous = continuous and hasattr(chunk, 'usage')
+                    continuous = continuous and ("usage" in chunk)
     assert final and continuous
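
The assertions above encode the expected shape of the usage-bearing stream: with stream_continuous_usage_stats every content chunk carries a usage object, and stream_include_usage makes the server send one final chunk with no choices at all. Roughly (field values are illustrative, not taken from a real response):

    content_chunk = {
        "choices": [{"delta": {"content": " parola"}}],
        "usage": {"prompt_tokens": 1, "completion_tokens": 5, "total_tokens": 6},
    }
    final_chunk = {
        "choices": [],  # empty choices -> the test sets final = True
        "usage": {"prompt_tokens": 1, "completion_tokens": 42, "total_tokens": 43},
    }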

 @pytest.mark.asyncio
-async def test_long_audio_request(foscolo):
+async def test_long_audio_request(foscolo, client):
-    model_name = "openai/whisper-small"
-    server_args = ["--enforce-eager"]
     foscolo.seek(0)
     audio, sr = librosa.load(foscolo)
     repeated_audio = np.tile(audio, 2)
@@ -160,10 +172,8 @@ async def test_long_audio_request(foscolo):
     buffer = io.BytesIO()
     sf.write(buffer, repeated_audio, sr, format='WAV')
     buffer.seek(0)
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
-        client = remote_server.get_async_client()
     translation = await client.audio.translations.create(
-            model=model_name,
+        model=MODEL_NAME,
         file=buffer,
         extra_body=dict(language="it"),
         response_format="text",