feat: add usage to TranscriptionResponse (text and json response_format) (#23576)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
parent 384dd1b0a8
commit ebd5a77bb5
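For context, here is a minimal client-side sketch (not part of the diff below) of how the new usage block can be read from a JSON transcription response. The server URL, model name, and audio file name are placeholders for whatever the reader is running.

import requests  # assumes a running vLLM server exposing the OpenAI-compatible API

with open("sample.wav", "rb") as f:  # placeholder audio file
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",  # placeholder URL
        files={"file": f},
        data={"model": "openai/whisper-small",  # placeholder model name
              "response_format": "json"},
    )
body = resp.json()
print(body["text"])              # the transcribed text
print(body["usage"]["seconds"])  # audio duration in seconds, rounded up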
@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
         language="en",
         response_format="text",
         temperature=0.0)
-    out = json.loads(transcription)['text']
-    assert "Mary had a little lamb," in out
+    out = json.loads(transcription)
+    out_text = out['text']
+    out_usage = out['usage']
+    assert "Mary had a little lamb," in out_text
+    assert out_usage["seconds"] == 16, out_usage["seconds"]


 @pytest.mark.asyncio
@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client):
         language="en",
         response_format="text",
         temperature=0.0)
-    out = json.loads(transcription)['text']
-    counts = out.count("Mary had a little lamb")
+    out = json.loads(transcription)
+    out_text = out['text']
+    out_usage = out['usage']
+    counts = out_text.count("Mary had a little lamb")
     assert counts == 10, counts
+    assert out_usage["seconds"] == 161, out_usage["seconds"]


 @pytest.mark.asyncio
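Both usage assertions above depend on the audio duration being rounded up to whole seconds. A small illustration of that rounding, mirroring the serving change further down; the raw clip durations used here are assumptions, only the rounded values 16 and 161 come from the tests.

import math

def usage_seconds(duration_s: float) -> int:
    # Round the audio duration up to whole seconds.
    return int(math.ceil(duration_s))

print(usage_seconds(15.3))   # 16  (hypothetical duration of the short clip)
print(usage_seconds(160.2))  # 161 (hypothetical duration of the long clip)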
@@ -2232,9 +2232,15 @@ class TranscriptionRequest(OpenAIBaseModel):


 # Transcription response objects
+class TranscriptionUsageAudio(OpenAIBaseModel):
+    type: Literal["duration"] = "duration"
+    seconds: int
+
+
 class TranscriptionResponse(OpenAIBaseModel):
     text: str
     """The transcribed text."""
+    usage: TranscriptionUsageAudio


 class TranscriptionWord(OpenAIBaseModel):
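A quick sketch of how the new models serialize, with plain pydantic standing in for OpenAIBaseModel (an assumption made only to keep the sketch self-contained); the nested object is what clients receive under the "usage" key.

from typing import Literal
from pydantic import BaseModel  # stand-in for OpenAIBaseModel

class TranscriptionUsageAudio(BaseModel):
    type: Literal["duration"] = "duration"
    seconds: int

class TranscriptionResponse(BaseModel):
    text: str
    usage: TranscriptionUsageAudio

resp = TranscriptionResponse(text="Mary had a little lamb,",
                             usage=TranscriptionUsageAudio(seconds=16))
print(resp.model_dump())
# {'text': 'Mary had a little lamb,', 'usage': {'type': 'duration', 'seconds': 16}}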
@@ -200,7 +200,22 @@ class OpenAISpeechToText(OpenAIServing):
             for result_generator in list_result_generator:
                 async for op in result_generator:
                     text += op.outputs[0].text
-            return cast(T, response_class(text=text))
+
+            if self.task_type == "transcribe":
+                # add usage in TranscriptionResponse.
+                usage = {
+                    "type": "duration",
+                    # rounded up as per openAI specs
+                    "seconds": int(math.ceil(duration_s)),
+                }
+                final_response = cast(T, response_class(text=text,
+                                                        usage=usage))
+            else:
+                # no usage in response for translation task
+                final_response = cast(
+                    T, response_class(text=text))  # type: ignore[call-arg]
+
+            return final_response
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
         except ValueError as e:
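Worth noting: the transcribe branch passes usage as a plain dict, which pydantic validates into TranscriptionUsageAudio when the response model is constructed, while the translation branch omits usage entirely (hence the call-arg ignore on the second cast). A short sketch of that coercion, again using pydantic BaseModel as a stand-in for OpenAIBaseModel.

from typing import Literal
from pydantic import BaseModel  # stand-in for OpenAIBaseModel

class TranscriptionUsageAudio(BaseModel):
    type: Literal["duration"] = "duration"
    seconds: int

class TranscriptionResponse(BaseModel):
    text: str
    usage: TranscriptionUsageAudio

# The plain dict is coerced into TranscriptionUsageAudio during validation.
resp = TranscriptionResponse(text="...", usage={"type": "duration", "seconds": 16})
print(type(resp.usage).__name__)  # TranscriptionUsageAudio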