[CI] Add batched audios Whisper test (#29308)

Signed-off-by: NickLucche <nlucches@redhat.com>
2026-05-07 18:20:07 +08:00 · 2025-11-27 20:31:52 +01:00 · 2025-11-27 20:31:52 +01:00 · e5a621b724
commit e5a621b724
parent 38658ec6f3
2 changed files with 238 additions and 196 deletions
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@ -2,20 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # imports for audio transcription tests
 import io
 import json
 import librosa
 import numpy as np
 import openai
 import pytest
 import pytest_asyncio
 import soundfile as sf
 from ...utils import RemoteOpenAIServer
 MODEL_NAME = "openai/whisper-large-v3-turbo"
 SERVER_ARGS = ["--enforce-eager"]
 MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
    "mistral",
@ -26,22 +18,8 @@ MISTRAL_FORMAT_ARGS = [
 ]
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME`` shared across this module.

    The server object is built with ``SERVER_ARGS`` and used as a context
    manager, so it is exited once the module-scoped fixture is finalized.
    """
    launcher = RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS)
    with launcher as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async API client obtained from the module-scoped ``server``.

    The client is entered as an async context manager, so it is exited
    again after each test that requested it.
    """
    async with server.get_async_client() as api_client:
        yield api_client
@pytest.mark.asyncio
-@pytest.mark.parametrize(
+@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
    "model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"]
 )
 async def test_basic_audio(mary_had_lamb, model_name):
    server_args = ["--enforce-eager"]
@ -120,176 +98,3 @@ async def test_basic_audio_gemma(foscolo):
        )
        out = json.loads(transcription)["text"]
        assert "da cui vergine nacque Venere" in out
@pytest.mark.asyncio
async def test_non_asr_model(winning_call):
    """A text-to-text model must reject transcription requests with a 400 error.

    A dedicated server is started for ``JackFram/llama-68m`` and the response
    is expected to carry an ``error`` payload and no transcription text.
    """
    # text to text model
    model_name = "JackFram/llama-68m"
    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
        # Enter the client as an async context manager (as the `client`
        # fixture does) so it is closed before the server shuts down.
        async with remote_server.get_async_client() as client:
            res = await client.audio.transcriptions.create(
                model=model_name, file=winning_call, language="en", temperature=0.0
            )
        err = res.error
        # Separate assertions so a failure pinpoints the offending condition.
        assert err["code"] == 400
        assert not res.text
        assert err["message"] == "The model does not support Transcriptions API"
@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, client):
    """An invalid language code must raise ``openai.BadRequestError``."""
    invalid_language = "hh"
    with pytest.raises(openai.BadRequestError):
        await client.audio.transcriptions.create(
            model=MODEL_NAME,
            file=mary_had_lamb,
            language=invalid_language,
            temperature=0.0,
        )
@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, client):
    """Transcribe the clip repeated ten times; check phrase count and duration."""
    repetitions = 10

    mary_had_lamb.seek(0)
    samples, sample_rate = librosa.load(mary_had_lamb)
    # Add small silence after each audio for repeatability in the split process
    padded = np.pad(samples, (0, 1600))
    long_audio = np.tile(padded, repetitions)

    # Write the repeated audio into an in-memory WAV file.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, long_audio, sample_rate, format="WAV")
    wav_buffer.seek(0)

    response = await client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=wav_buffer,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    payload = json.loads(response)
    text = payload["text"]
    usage = payload["usage"]

    occurrences = text.count("Mary had a little lamb")
    assert occurrences == repetitions, occurrences
    assert usage["seconds"] == 161, usage["seconds"]
@pytest.mark.asyncio
async def test_completion_endpoints(client):
    """Chat and text completion requests must each return a 400 error payload."""
    # Chat Completions API
    chat_res = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "system", "content": "You are a helpful assistant."}],
    )
    chat_err = chat_res.error
    assert chat_err["code"] == 400
    assert chat_err["message"] == "The model does not support Chat Completions API"

    # Completions API
    completion_res = await client.completions.create(model=MODEL_NAME, prompt="Hello")
    completion_err = completion_res.error
    assert completion_err["code"] == 400
    assert completion_err["message"] == "The model does not support Completions API"
@pytest.mark.asyncio
async def test_streaming_response(winning_call, client):
    """Text rebuilt from streamed chunks must equal the non-streamed transcription."""
    full_response = await client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        response_format="json",
        language="en",
        temperature=0.0,
    )
    stream = await client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        temperature=0.0,
        stream=True,
        timeout=30,
    )
    # Collect the delta of every chunk, then stitch them back together.
    pieces = [chunk.choices[0]["delta"]["content"] async for chunk in stream]
    assert "".join(pieces) == full_response.text
@pytest.mark.asyncio
async def test_stream_options(winning_call, client):
    """With usage streaming enabled, expect per-chunk usage and a final usage chunk."""
    usage_options = {
        "stream_include_usage": True,
        "stream_continuous_usage_stats": True,
    }
    stream = await client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        temperature=0.0,
        stream=True,
        extra_body=usage_options,
        timeout=30,
    )
    saw_final_usage = False
    every_chunk_has_usage = True
    async for chunk in stream:
        if len(chunk.choices) == 0:
            # final usage sent
            saw_final_usage = True
            continue
        if every_chunk_has_usage and not hasattr(chunk, "usage"):
            every_chunk_has_usage = False
    assert saw_final_usage and every_chunk_has_usage
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, client):
    """
    Request one transcription with extreme sampling parameters and one with
    greedy decoding, then assert that the two texts differ.
    """
    extreme_sampling = {
        "seed": 42,
        "repetition_penalty": 1.9,
        "top_k": 12,
        "top_p": 0.4,
        "min_p": 0.5,
        "frequency_penalty": 1.8,
        "presence_penalty": 2.0,
    }
    sampled = await client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        temperature=0.8,
        extra_body=extreme_sampling,
    )

    greedy = await client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        temperature=0.0,
        extra_body={"seed": 42},
    )

    assert greedy.text != sampled.text
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, client):
    """The expected opening words must appear both with and without a prompt."""
    prompt = "This is a speech, recorded in a phonograph."
    # Supplying a prompt must not drop these words from the transcription.
    prefix = "The first words I spoke in the original phonograph"
    base_kwargs = dict(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )

    plain = await client.audio.transcriptions.create(**base_kwargs)
    assert prefix in json.loads(plain)["text"]

    prompted = await client.audio.transcriptions.create(prompt=prompt, **base_kwargs)
    assert prefix in json.loads(prompted)["text"]
--- a/tests/entrypoints/openai/test_transcription_validation_whisper.py
+++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py
@ -0,0 +1,237 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # imports for audio transcription tests
 import asyncio
 import io
 import json
 import librosa
 import numpy as np
 import openai
 import pytest
 import pytest_asyncio
 import soundfile as sf
 from ...utils import RemoteOpenAIServer
 MODEL_NAME = "openai/whisper-large-v3-turbo"
 SERVER_ARGS = ["--enforce-eager"]
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME`` shared across this module.

    The server object is built with ``SERVER_ARGS`` and used as a context
    manager, so it is exited once the module-scoped fixture is finalized.
    """
    launcher = RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS)
    with launcher as running_server:
        yield running_server
@pytest_asyncio.fixture
async def whisper_client(server):
    """Yield an async API client obtained from the module-scoped ``server``.

    The client is entered as an async context manager, so it is exited
    again after each test that requested it.
    """
    async with server.get_async_client() as api_client:
        yield api_client
@pytest.mark.asyncio
async def test_basic_audio(mary_had_lamb, whisper_client):
    """Transcribe one English clip and check the text and the reported duration.

    The request goes through the ``whisper_client`` fixture, i.e. the
    module-scoped server for ``MODEL_NAME``, like the other tests in this
    module. Launching a private server with the same model and the same
    ``--enforce-eager`` argument only added a redundant start-up and left its
    client unclosed.
    """
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    out = json.loads(transcription)
    out_text = out["text"]
    out_usage = out["usage"]
    assert "Mary had a little lamb," in out_text
    assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
async def test_basic_audio_batched(mary_had_lamb, winning_call, whisper_client):
    """Send two transcription requests together and check each transcript."""
    shared_kwargs = dict(
        model=MODEL_NAME,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    # Build both requests without awaiting them yet.
    pending_mary = whisper_client.audio.transcriptions.create(
        file=mary_had_lamb, **shared_kwargs
    )
    pending_call = whisper_client.audio.transcriptions.create(
        file=winning_call, **shared_kwargs
    )
    # Await both transcriptions by scheduling coroutines together
    raw_mary, raw_call = await asyncio.gather(pending_mary, pending_call)

    mary_text = json.loads(raw_mary)["text"]
    assert "Mary had a little lamb," in mary_text

    call_text = json.loads(raw_call)["text"]
    assert "Edgar Martinez" in call_text
@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, whisper_client):
    """An invalid language code must raise ``openai.BadRequestError``."""
    invalid_language = "hh"
    with pytest.raises(openai.BadRequestError):
        await whisper_client.audio.transcriptions.create(
            model=MODEL_NAME,
            file=mary_had_lamb,
            language=invalid_language,
            temperature=0.0,
        )
@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, whisper_client):
    """Transcribe the clip repeated ten times; check phrase count and duration."""
    repetitions = 10

    mary_had_lamb.seek(0)
    samples, sample_rate = librosa.load(mary_had_lamb)
    # Add small silence after each audio for repeatability in the split process
    padded = np.pad(samples, (0, 1600))
    long_audio = np.tile(padded, repetitions)

    # Write the repeated audio into an in-memory WAV file.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, long_audio, sample_rate, format="WAV")
    wav_buffer.seek(0)

    response = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=wav_buffer,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    payload = json.loads(response)
    text = payload["text"]
    usage = payload["usage"]

    occurrences = text.count("Mary had a little lamb")
    assert occurrences == repetitions, occurrences
    assert usage["seconds"] == 161, usage["seconds"]
@pytest.mark.asyncio
async def test_completion_endpoints(whisper_client):
    """Chat and text completion requests must each return a 400 error payload."""
    # Chat Completions API
    chat_res = await whisper_client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "system", "content": "You are a helpful assistant."}],
    )
    chat_err = chat_res.error
    assert chat_err["code"] == 400
    assert chat_err["message"] == "The model does not support Chat Completions API"

    # Completions API
    completion_res = await whisper_client.completions.create(
        model=MODEL_NAME, prompt="Hello"
    )
    completion_err = completion_res.error
    assert completion_err["code"] == 400
    assert completion_err["message"] == "The model does not support Completions API"
@pytest.mark.asyncio
async def test_streaming_response(winning_call, whisper_client):
    """Text rebuilt from streamed chunks must equal the non-streamed transcription."""
    full_response = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        response_format="json",
        language="en",
        temperature=0.0,
    )
    stream = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        temperature=0.0,
        stream=True,
        timeout=30,
    )
    # Collect the delta of every chunk, then stitch them back together.
    pieces = [chunk.choices[0]["delta"]["content"] async for chunk in stream]
    assert "".join(pieces) == full_response.text
@pytest.mark.asyncio
async def test_stream_options(winning_call, whisper_client):
    """With usage streaming enabled, expect per-chunk usage and a final usage chunk."""
    usage_options = {
        "stream_include_usage": True,
        "stream_continuous_usage_stats": True,
    }
    stream = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        temperature=0.0,
        stream=True,
        extra_body=usage_options,
        timeout=30,
    )
    saw_final_usage = False
    every_chunk_has_usage = True
    async for chunk in stream:
        if len(chunk.choices) == 0:
            # final usage sent
            saw_final_usage = True
            continue
        if every_chunk_has_usage and not hasattr(chunk, "usage"):
            every_chunk_has_usage = False
    assert saw_final_usage and every_chunk_has_usage
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, whisper_client):
    """
    Request one transcription with extreme sampling parameters and one with
    greedy decoding, then assert that the two texts differ.
    """
    extreme_sampling = {
        "seed": 42,
        "repetition_penalty": 1.9,
        "top_k": 12,
        "top_p": 0.4,
        "min_p": 0.5,
        "frequency_penalty": 1.8,
        "presence_penalty": 2.0,
    }
    sampled = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        temperature=0.8,
        extra_body=extreme_sampling,
    )

    greedy = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        temperature=0.0,
        extra_body={"seed": 42},
    )

    assert greedy.text != sampled.text
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, whisper_client):
    """The expected opening words must appear both with and without a prompt."""
    prompt = "This is a speech, recorded in a phonograph."
    # Supplying a prompt must not drop these words from the transcription.
    prefix = "The first words I spoke in the original phonograph"
    base_kwargs = dict(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )

    plain = await whisper_client.audio.transcriptions.create(**base_kwargs)
    assert prefix in json.loads(plain)["text"]

    prompted = await whisper_client.audio.transcriptions.create(
        prompt=prompt, **base_kwargs
    )
    assert prefix in json.loads(prompted)["text"]