From e17a4d3bf9cffe32ec308a5979790732818e4919 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 29 Jul 2025 02:19:21 +0800
Subject: [PATCH] [Bugfix] Fix granite speech shape validation (#21762)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/granite_speech.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index 5a3e715c3e748..c9e3b74e7c3c4 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -64,14 +64,15 @@ class GraniteSpeechAudioInputs(TensorSchema):
     
     Dimensions:
         - b: Batch size
-        - nf: Number of audio features (variable length)
+        - fi: Number of input features from the Mel spectrogram
+        - fo: Number of output features, i.e. the embedding size
         - 160: Fixed feature dimension for Mel spectrogram features
     """
 
-    input_features: Annotated[torch.Tensor, TensorShape("b", "nf", 160)]
+    input_features: Annotated[torch.Tensor, TensorShape("b", "fi", 160)]
     """Audio input features."""
 
-    input_features_mask: Annotated[torch.Tensor, TensorShape("b", "nf")]
+    input_features_mask: Annotated[torch.Tensor, TensorShape("b", "fo")]
     """Mask for variable length audio features."""
 
     audio_embed_sizes: Annotated[list[int], TensorShape("b")]