From ebd5a77bb5a6b7643f047f61294da0ce92baf3f6 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Tue, 26 Aug 2025 14:26:26 +0200 Subject: [PATCH] feat: add usage to TranscriptionResponse (text and json response_format) (#23576) Signed-off-by: Guillaume Calmettes --- .../openai/test_transcription_validation.py | 14 ++++++++++---- vllm/entrypoints/openai/protocol.py | 6 ++++++ vllm/entrypoints/openai/speech_to_text.py | 17 ++++++++++++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 93239f41a4ae..6009d9aeec93 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name): language="en", response_format="text", temperature=0.0) - out = json.loads(transcription)['text'] - assert "Mary had a little lamb," in out + out = json.loads(transcription) + out_text = out['text'] + out_usage = out['usage'] + assert "Mary had a little lamb," in out_text + assert out_usage["seconds"] == 16, out_usage["seconds"] @pytest.mark.asyncio @@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client): language="en", response_format="text", temperature=0.0) - out = json.loads(transcription)['text'] - counts = out.count("Mary had a little lamb") + out = json.loads(transcription) + out_text = out['text'] + out_usage = out['usage'] + counts = out_text.count("Mary had a little lamb") assert counts == 10, counts + assert out_usage["seconds"] == 161, out_usage["seconds"] @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index a3d7b78cf455..5cb41bd93d4b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2232,9 +2232,15 @@ class TranscriptionRequest(OpenAIBaseModel): # Transcription response objects +class TranscriptionUsageAudio(OpenAIBaseModel): + type: Literal["duration"] = "duration" + seconds: int + + class TranscriptionResponse(OpenAIBaseModel): text: str """The transcribed text.""" + usage: TranscriptionUsageAudio class TranscriptionWord(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 01140a4bfea7..de2619a78f8e 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -200,7 +200,22 @@ class OpenAISpeechToText(OpenAIServing): for result_generator in list_result_generator: async for op in result_generator: text += op.outputs[0].text - return cast(T, response_class(text=text)) + + if self.task_type == "transcribe": + # add usage in TranscriptionResponse. + usage = { + "type": "duration", + # rounded up as per openAI specs + "seconds": int(math.ceil(duration_s)), + } + final_response = cast(T, response_class(text=text, + usage=usage)) + else: + # no usage in response for translation task + final_response = cast( + T, response_class(text=text)) # type: ignore[call-arg] + + return final_response except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: