[CI] Whisper logprobs tests (#30504)

Signed-off-by: NickLucche <nlucches@redhat.com>
2026-06-22 05:17:13 +08:00 · 2025-12-13 03:49:11 +01:00 · 2025-12-13 03:49:11 +01:00 · 57e9bf1864
commit 57e9bf1864
parent 2f32a68d75
3 changed files with 133 additions and 112 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -702,10 +702,16 @@ class HfRunner:
                **kwargs,
            )

+            # Encoder-decoder models return decoder_hidden_states instead of
+            # hidden_states
+            hidden_states = (
+                getattr(output, "hidden_states", None) or output.decoder_hidden_states
+            )
+
            (
                seq_logprobs_lst,
                output_len,
-            ) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs)
+            ) = self._hidden_states_to_logprobs(hidden_states, num_logprobs)

            all_logprobs.append(seq_logprobs_lst)
            seq_ids = output.sequences[0]
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@ -1,150 +1,146 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from collections.abc import Sequence
+from typing import Any
+
+import librosa
 import pytest
+from transformers import AutoModelForSpeechSeq2Seq

-from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset
+from vllm.platforms import current_platform

-from ....conftest import VllmRunner
+from ....conftest import HfRunner, PromptAudioInput, VllmRunner
 from ....utils import create_new_process_for_each_test, multi_gpu_test
+from ...registry import HF_EXAMPLE_MODELS
+from ...utils import check_logprobs_close

-PROMPTS = [
-    {
-        "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
-        "multi_modal_data": {
-            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
-        },
-    },
-    {  # Test explicit encoder/decoder prompt
-        "encoder_prompt": {
-            "prompt": "",
-            "multi_modal_data": {
-                "audio": AudioAsset("winning_call").audio_and_sample_rate,
-            },
-        },
-        "decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
-    },
-]
+VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
+HF_PROMPT = ""
+# Whisper expects 16kHz audio
+WHISPER_SAMPLE_RATE = 16000

-EXPECTED = {
-    "openai/whisper-tiny": [
-        " He has birth words I spoke in the original corner of that. And a"
-        " little piece of black coat poetry. Mary had a little sandwich,"
-        " sweet, with white and snow. And everyone had it very went the last"
-        " would sure to go.",
-        " >> And the old one, fit John the way to Edgar Martinez. >> One more"
-        " to line down the field line for our base camp. Here comes joy. Here"
-        " is June and the third base. They're going to wave him in. The throw"
-        " to the plate will be late. The Mariners are going to play for the"
-        " American League Championship. I don't believe it. It just continues"
-        " by all five.",
-    ],
-    "openai/whisper-small": [
-        " The first words I spoke in the original pornograph. A little piece"
-        " of practical poetry. Mary had a little lamb, its fleece was quite a"
-        " slow, and everywhere that Mary went the lamb was sure to go.",
-        " And the old one pitch on the way to Edgar Martinez one month. Here"
-        " comes joy. Here is Junior to third base. They're gonna wave him"
-        " in. The throw to the plate will be late. The Mariners are going to"
-        " play for the American League Championship. I don't believe it. It"
-        " just continues. My, oh my.",
-    ],
-    "openai/whisper-medium": [
-        " The first words I spoke in the original phonograph, a little piece"
-        " of practical poetry. Mary had a little lamb, its fleece was quite as"
-        " slow, and everywhere that Mary went the lamb was sure to go.",
-        " And the 0-1 pitch on the way to Edgar Martinez swung on the line"
-        " down the left field line for Obeyshev. Here comes Joy. Here is"
-        " Jorgen at third base. They're going to wave him in. The throw to the"
-        " plate will be late. The Mariners are going to play for the American"
-        " League Championship. I don't believe it. It just continues. My, oh"
-        " my.",
-    ],
-    "openai/whisper-large-v3": [
-        " The first words I spoke in the original phonograph, a little piece"
-        " of practical poetry. Mary had a little lamb, its feet were quite as"
-        " slow, and everywhere that Mary went, the lamb was sure to go.",
-        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
-        " Now the left field line for a base hit. Here comes Joy. Here is"
-        " Junior to third base. They're going to wave him in. The throw to the"
-        " plate will be late. The Mariners are going to play for the American"
-        " League Championship. I don't believe it. It just continues. My, oh,"
-        " my.",
-    ],
-    "openai/whisper-large-v3-turbo": [
-        " The first words I spoke in the original phonograph, a little piece"
-        " of practical poetry. Mary had a little lamb, its streets were quite"
-        " as slow, and everywhere that Mary went the lamb was sure to go.",
-        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
-        " down the left field line for a base hit. Here comes Joy. Here is"
-        " Junior to third base. They're going to wave him in. The throw to the"
-        " plate will be late. The Mariners are going to play for the American"
-        " League Championship. I don't believe it. It just continues. My, oh,"
-        " my.",
-    ],
-}
+
+@pytest.fixture(autouse=True)
+def use_spawn_for_whisper(monkeypatch):
+    """Use spawn for all tests in this file; Whisper has issues with forked workers."""
+    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")


 def run_test(
+    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
+    inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
    model: str,
    *,
+    max_model_len: int,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
-    dtype: str = "half",
+    enforce_eager: bool = True,
 ) -> None:
-    prompt_list = PROMPTS * 10
-    expected_list = EXPECTED[model] * 10
+    """Check that HF and vLLM produce close greedy logprobs for the same audio.

+    Each entry of `inputs` is (vLLM prompts, HF prompts, audios); the two
+    runners get the same audios but their own text prompts.
+    vLLM runs first, in its own `with` block, and HF runs after it exits;
+    the outputs are then compared case by case via check_logprobs_close.
+    """
    with vllm_runner(
        model,
        dtype=dtype,
-        max_model_len=448,
+        max_model_len=max_model_len,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
-        # TODO (NickLucche) figure out output differences with non-eager and re-enable
-        enforce_eager=True,
+        limit_mm_per_prompt={"audio": 2},
+        enforce_eager=enforce_eager,
+        disable_custom_all_reduce=True,
    ) as vllm_model:
-        llm = vllm_model.llm
+        vllm_outputs_per_case = [
+            vllm_model.generate_greedy_logprobs(
+                vllm_prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                audios=audios,
+            )
+            for vllm_prompts, _, audios in inputs
+        ]

-        sampling_params = SamplingParams(
-            temperature=0,
-            top_p=1.0,
-            max_tokens=200,
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
+        hf_outputs_per_case = [
+            hf_model.generate_greedy_logprobs_limit(
+                hf_prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                audios=audios,
+            )
+            for _, hf_prompts, audios in inputs
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
        )

-        outputs = llm.generate(prompt_list, sampling_params)

-    for output, expected in zip(outputs, expected_list):
-        print(output.outputs[0].text)
-        assert output.outputs[0].text == expected
+@pytest.fixture
+def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
+    audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
+    inputs = []
+    for asset in audio_assets:
+        audio, orig_sr = asset.audio_and_sample_rate
+        # Resample only when the asset is not already at Whisper's 16 kHz rate
+        if orig_sr != WHISPER_SAMPLE_RATE:
+            audio = librosa.resample(
+                audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
+            )
+        # One case per asset: ([vLLM prompt], [HF prompt], [(audio, sample rate)])
+        inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
+    return inputs
+
+
+def check_model_available(model: str) -> None:
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")


@pytest.mark.core_model
-@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
-@pytest.mark.parametrize("dtype", ["half"])
-@create_new_process_for_each_test()
-def test_models(vllm_runner, model, dtype) -> None:
-    run_test(
-        vllm_runner,
-        model,
-        tensor_parallel_size=1,
-        dtype=dtype,
-    )
-
-
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
-def test_models_cpu(vllm_runner, model, dtype) -> None:
-    # @create_new_process_for_each_test() does not work for some runners
-    # TODO: to fix cpu privilege issues in run-cpu-test-arm.sh
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("enforce_eager", [True, False])
+@create_new_process_for_each_test("spawn")
+def test_models(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+    num_logprobs: int,
+    input_audios,
+    enforce_eager: bool,
+) -> None:
+    check_model_available(model)
+    if current_platform.is_cpu() and not enforce_eager:
+        pytest.skip("Skipping test for CPU with non-eager mode")
    run_test(
+        hf_runner,
        vllm_runner,
+        input_audios,
        model,
-        tensor_parallel_size=1,
        dtype=dtype,
+        max_model_len=448,
+        max_tokens=200,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+        enforce_eager=enforce_eager,
    )


@ -152,15 +148,31 @@ def test_models_cpu(vllm_runner, model, dtype) -> None:
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-@create_new_process_for_each_test()
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [200])
+@pytest.mark.parametrize("num_logprobs", [5])
+@create_new_process_for_each_test("spawn")
 def test_models_distributed(
+    hf_runner,
    vllm_runner,
-    model,
-    distributed_executor_backend,
+    model: str,
+    distributed_executor_backend: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    input_audios,
 ) -> None:
+    check_model_available(model)
    run_test(
+        hf_runner,
        vllm_runner,
+        input_audios,
        model,
+        dtype=dtype,
+        max_model_len=448,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=False,
    )
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -840,7 +840,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        is_available_online=False,
    ),
    # [Encoder-decoder]
-    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),
+    "WhisperForConditionalGeneration": _HfExamplesInfo(
+        "openai/whisper-large-v3-turbo",
+        extras={"v3": "openai/whisper-large-v3"},
+    ),
    # [Cross-encoder]
    "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),
 }