feat: add usage to TranscriptionResponse (text and json response_format) (#23576)

Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
Authored by Guillaume Calmettes on 2025-08-26 14:26:26 +02:00, committed by GitHub
parent 384dd1b0a8
commit ebd5a77bb5
3 changed files with 32 additions and 5 deletions


@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
             language="en",
             response_format="text",
             temperature=0.0)
-        out = json.loads(transcription)['text']
-        assert "Mary had a little lamb," in out
+        out = json.loads(transcription)
+        out_text = out['text']
+        out_usage = out['usage']
+        assert "Mary had a little lamb," in out_text
+        assert out_usage["seconds"] == 16, out_usage["seconds"]


 @pytest.mark.asyncio
@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client):
             language="en",
             response_format="text",
             temperature=0.0)
-        out = json.loads(transcription)['text']
-        counts = out.count("Mary had a little lamb")
+        out = json.loads(transcription)
+        out_text = out['text']
+        out_usage = out['usage']
+        counts = out_text.count("Mary had a little lamb")
         assert counts == 10, counts
+        assert out_usage["seconds"] == 161, out_usage["seconds"]


 @pytest.mark.asyncio
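For context (not part of the diff): a minimal client-side sketch of how the new usage block surfaces with the "text" response format, following what the updated tests above do. The server URL, model name, and audio file name are placeholders.

import json

import requests

# Placeholder audio file and endpoint; any ASR model served by vLLM works here.
with open("mary_had_lamb.ogg", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": f},
        data={
            "model": "openai/whisper-large-v3",
            "language": "en",
            "response_format": "text",
            "temperature": 0.0,
        },
    )

payload = json.loads(resp.text)
print(payload["text"])              # transcribed text
print(payload["usage"]["seconds"])  # audio duration in whole seconds, rounded up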


@@ -2232,9 +2232,15 @@ class TranscriptionRequest(OpenAIBaseModel):


 # Transcription response objects
+class TranscriptionUsageAudio(OpenAIBaseModel):
+    type: Literal["duration"] = "duration"
+    seconds: int
+
+
 class TranscriptionResponse(OpenAIBaseModel):
     text: str
     """The transcribed text."""
+    usage: TranscriptionUsageAudio


 class TranscriptionWord(OpenAIBaseModel):
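A rough standalone sketch of the new response schema, assuming OpenAIBaseModel serializes like a plain pydantic BaseModel; it only illustrates the JSON shape clients should expect.

from typing import Literal

from pydantic import BaseModel


class TranscriptionUsageAudio(BaseModel):
    # Usage is reported as an audio duration, in whole seconds.
    type: Literal["duration"] = "duration"
    seconds: int


class TranscriptionResponse(BaseModel):
    text: str
    usage: TranscriptionUsageAudio


resp = TranscriptionResponse(
    text="Mary had a little lamb,",
    usage=TranscriptionUsageAudio(seconds=16),
)
print(resp.model_dump_json())
# {"text":"Mary had a little lamb,","usage":{"type":"duration","seconds":16}}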


@@ -200,7 +200,22 @@ class OpenAISpeechToText(OpenAIServing):
             for result_generator in list_result_generator:
                 async for op in result_generator:
                     text += op.outputs[0].text
-            return cast(T, response_class(text=text))
+
+            if self.task_type == "transcribe":
+                # add usage in TranscriptionResponse.
+                usage = {
+                    "type": "duration",
+                    # rounded up as per openAI specs
+                    "seconds": int(math.ceil(duration_s)),
+                }
+                final_response = cast(T, response_class(text=text,
+                                                        usage=usage))
+            else:
+                # no usage in response for translation task
+                final_response = cast(
+                    T, response_class(text=text))  # type: ignore[call-arg]
+            return final_response
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
         except ValueError as e:
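A small illustration of the rounding rule used above: the reported seconds value is the input audio duration rounded up to the next whole second with math.ceil, matching the OpenAI convention the code comment refers to. duration_s stands in for the duration the server derives from the uploaded audio.

import math

# Placeholder durations; the last one mirrors the long-audio test above.
for duration_s in (15.02, 16.0, 160.4):
    print(f"{duration_s} s -> {int(math.ceil(duration_s))} s")
# 15.02 s -> 16 s
# 16.0 s -> 16 s
# 160.4 s -> 161 s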