Migrate whisper inputs to TensorSchema (#23505)

Signed-off-by: Benji Beck <benjibeck@meta.com>
This commit is contained in:
Benji Beck 2025-09-03 11:04:00 -07:00 committed by GitHub
parent e9b92dcd89
commit 731a6940e3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -4,7 +4,7 @@
import math
from collections.abc import Iterable, Mapping, Sequence
from contextlib import nullcontext
from typing import Literal, Optional, TypedDict, Union, cast
from typing import Annotated, Literal, Optional, Union, cast
import numpy as np
import torch
@ -40,6 +40,7 @@ from vllm.multimodal.processing import (BaseProcessingInfo,
PromptReplacement, PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.transformers_utils.processor import cached_get_processor
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
SupportsTranscription, SupportsV0Only)
@ -111,9 +112,16 @@ ISO639_1_SUPPORTED_LANGS = {
}
class WhisperAudioInputs(TypedDict):
    """Pre-migration (TypedDict) container for Whisper audio inputs.

    NOTE(review): this is the old form being replaced by the TensorSchema
    version in this commit; shown here as the removed side of the diff.
    """
    input_features: NestedTensors
    """Shape: `(batch_size, 128, M)`"""
class WhisperAudioInputs(TensorSchema):
    """
    Dimensions:
        - b: Batch size
        - nmb: Number of mel bins
        - t: Time frames (M)
    """
    # Log-mel spectrogram features; Optional because a text-only request may
    # carry no audio. TensorShape symbols above name the expected axes
    # (presumably validated by TensorSchema at construction — confirm against
    # vllm.utils.tensor_schema).
    input_features: Annotated[Optional[NestedTensors],
                              TensorShape("b", "nmb", "t")]
class WhisperPositionalEmbedding(nn.Embedding):