[CI] Whisper logprobs tests (#30504)

Signed-off-by: NickLucche <nlucches@redhat.com>
2026-05-21 20:30:14 +08:00 · 2025-12-13 03:49:11 +01:00 · 2025-12-13 03:49:11 +01:00 · 57e9bf1864
commit 57e9bf1864
parent 2f32a68d75
3 changed files with 133 additions and 112 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -702,10 +702,16 @@ class HfRunner:
                **kwargs,
            )
            # Encoder-decoder models return decoder_hidden_states instead of
            # hidden_states
            hidden_states = (
                getattr(output, "hidden_states", None) or output.decoder_hidden_states
            )
            (
                seq_logprobs_lst,
                output_len,
-            ) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs)
+            ) = self._hidden_states_to_logprobs(hidden_states, num_logprobs)
            all_logprobs.append(seq_logprobs_lst)
            seq_ids = output.sequences[0]
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@ -1,150 +1,146 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
 from typing import Any
 import librosa
 import pytest
 from transformers import AutoModelForSpeechSeq2Seq
 from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.platforms import current_platform
-from ....conftest import VllmRunner
+from ....conftest import HfRunner, PromptAudioInput, VllmRunner
 from ....utils import create_new_process_for_each_test, multi_gpu_test
 from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
-PROMPTS = [
+VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
-    {
+HF_PROMPT = ""
-        "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
+# Whisper expects 16kHz audio
-        "multi_modal_data": {
+WHISPER_SAMPLE_RATE = 16000
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    },
    {  # Test explicit encoder/decoder prompt
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {
                "audio": AudioAsset("winning_call").audio_and_sample_rate,
            },
        },
        "decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
    },
 ]
-EXPECTED = {
+
-    "openai/whisper-tiny": [
+@pytest.fixture(autouse=True)
-        " He has birth words I spoke in the original corner of that. And a"
+def use_spawn_for_whisper(monkeypatch):
-        " little piece of black coat poetry. Mary had a little sandwich,"
+    """Whisper has issues with forked workers, use spawn instead."""
-        " sweet, with white and snow. And everyone had it very went the last"
+    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
        " would sure to go.",
        " >> And the old one, fit John the way to Edgar Martinez. >> One more"
        " to line down the field line for our base camp. Here comes joy. Here"
        " is June and the third base. They're going to wave him in. The throw"
        " to the plate will be late. The Mariners are going to play for the"
        " American League Championship. I don't believe it. It just continues"
        " by all five.",
    ],
    "openai/whisper-small": [
        " The first words I spoke in the original pornograph. A little piece"
        " of practical poetry. Mary had a little lamb, its fleece was quite a"
        " slow, and everywhere that Mary went the lamb was sure to go.",
        " And the old one pitch on the way to Edgar Martinez one month. Here"
        " comes joy. Here is Junior to third base. They're gonna wave him"
        " in. The throw to the plate will be late. The Mariners are going to"
        " play for the American League Championship. I don't believe it. It"
        " just continues. My, oh my.",
    ],
    "openai/whisper-medium": [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its fleece was quite as"
        " slow, and everywhere that Mary went the lamb was sure to go.",
        " And the 0-1 pitch on the way to Edgar Martinez swung on the line"
        " down the left field line for Obeyshev. Here comes Joy. Here is"
        " Jorgen at third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh"
        " my.",
    ],
    "openai/whisper-large-v3": [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its feet were quite as"
        " slow, and everywhere that Mary went, the lamb was sure to go.",
        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
        " Now the left field line for a base hit. Here comes Joy. Here is"
        " Junior to third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh,"
        " my.",
    ],
    "openai/whisper-large-v3-turbo": [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its streets were quite"
        " as slow, and everywhere that Mary went the lamb was sure to go.",
        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
        " down the left field line for a base hit. Here comes Joy. Here is"
        " Junior to third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh,"
        " my.",
    ],
 }
 def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
-    dtype: str = "half",
+    enforce_eager: bool = True,
 ) -> None:
-    prompt_list = PROMPTS * 10
+    """Inference result should be the same between hf and vllm.
    expected_list = EXPECTED[model] * 10
    All the audio fixtures for the test are from AudioAsset.
    For huggingface runner, we provide the audio as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    """
    with vllm_runner(
        model,
        dtype=dtype,
-        max_model_len=448,
+        max_model_len=max_model_len,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
-        # TODO (NickLucche) figure out output differences with non-eager and re-enable
+        limit_mm_per_prompt={"audio": 2},
-        enforce_eager=True,
+        enforce_eager=enforce_eager,
        disable_custom_all_reduce=True,
    ) as vllm_model:
-        llm = vllm_model.llm
+        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                vllm_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
            )
            for vllm_prompts, _, audios in inputs
        ]
-        sampling_params = SamplingParams(
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
-            temperature=0,
+        hf_outputs_per_case = [
-            top_p=1.0,
+            hf_model.generate_greedy_logprobs_limit(
-            max_tokens=200,
+                hf_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
            )
            for _, hf_prompts, audios in inputs
        ]
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
        outputs = llm.generate(prompt_list, sampling_params)
-    for output, expected in zip(outputs, expected_list):
+@pytest.fixture
-        print(output.outputs[0].text)
+def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
-        assert output.outputs[0].text == expected
+    audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
    inputs = []
    for asset in audio_assets:
        audio, orig_sr = asset.audio_and_sample_rate
        # Resample to Whisper's expected sample rate (16kHz)
        if orig_sr != WHISPER_SAMPLE_RATE:
            audio = librosa.resample(
                audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
            )
        # vLLM prompts, HF prompts, audio inputs
        inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
    return inputs
 def check_model_available(model: str) -> None:
    """Run the registry's availability checks for ``model``.

    Looks up the example-model entry for ``model`` in ``HF_EXAMPLE_MODELS``
    and invokes its online-availability and transformers-version checks,
    both with ``on_fail="skip"``.
    """
    # NOTE(review): presumably on_fail="skip" turns a failed check into a
    # pytest skip rather than a test failure -- confirm in the registry.
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
 def test_models(vllm_runner, model, dtype) -> None:
    run_test(
        vllm_runner,
        model,
        tensor_parallel_size=1,
        dtype=dtype,
    )
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
-def test_models_cpu(vllm_runner, model, dtype) -> None:
+@pytest.mark.parametrize("num_logprobs", [5])
-    # @create_new_process_for_each_test() does not work for some runners
+@pytest.mark.parametrize("enforce_eager", [True, False])
-    # TODO: to fix cpu privilege issues in run-cpu-test-arm.sh
+@create_new_process_for_each_test("spawn")
 def test_models(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
    num_logprobs: int,
    input_audios,
    enforce_eager: bool,
 ) -> None:
    """Compare HF and vLLM Whisper logprobs with ``tensor_parallel_size=1``.

    Delegates to ``run_test`` with a fixed ``max_model_len`` of 448 and
    ``max_tokens`` of 200, forwarding the parametrized ``dtype``,
    ``num_logprobs`` and ``enforce_eager`` values.

    Args:
        hf_runner: HF runner fixture, forwarded to ``run_test``.
        vllm_runner: vLLM runner fixture, forwarded to ``run_test``.
        model: Model name to test.
        dtype: Dtype string passed to both runners.
        num_logprobs: Number of logprobs requested per generated token.
        input_audios: Audio test cases produced by the ``input_audios``
            fixture.
        enforce_eager: Whether the vLLM runner is created in eager mode.
    """
    check_model_available(model)

    # The non-eager variant is not exercised on CPU.
    if current_platform.is_cpu() and not enforce_eager:
        pytest.skip("Skipping test for CPU with non-eager mode")

    # Each keyword is passed exactly once; a repeated keyword argument
    # (previously tensor_parallel_size) is a SyntaxError.
    run_test(
        hf_runner,
        vllm_runner,
        input_audios,
        model,
        dtype=dtype,
        max_model_len=448,
        max_tokens=200,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
        enforce_eager=enforce_eager,
    )
@ -152,15 +148,31 @@ def test_models_cpu(vllm_runner, model, dtype) -> None:
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-@create_new_process_for_each_test()
+@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
 def test_models_distributed(
    hf_runner,
    vllm_runner,
-    model,
+    model: str,
-    distributed_executor_backend,
+    distributed_executor_backend: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    input_audios,
 ) -> None:
    check_model_available(model)
    run_test(
        hf_runner,
        vllm_runner,
        input_audios,
        model,
        dtype=dtype,
        max_model_len=448,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=False,
    )
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -840,7 +840,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        is_available_online=False,
    ),
    # [Encoder-decoder]
-    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),
+    "WhisperForConditionalGeneration": _HfExamplesInfo(
        "openai/whisper-large-v3-turbo",
        extras={"v3": "openai/whisper-large-v3"},
    ),
    # [Cross-encoder]
    "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),
 }