[Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hardcoding (#21374)

Signed-off-by: Deven Labovitch <deven@videa.ai>
This commit is contained in:
deven-labovitch 2025-07-23 23:22:19 -04:00 committed by GitHub
parent 11599b0e1f
commit 63d92abb7c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 16 additions and 5 deletions

View File

@ -351,6 +351,11 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
Code example: <gh-file:examples/online_serving/openai_transcription_client.py> Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
<!-- TODO: api enforced limits + uploading audios --> <!-- TODO: api enforced limits + uploading audios -->
#### API Enforced Limits
Set the maximum audio file size (in MB) that vLLM will accept via the
`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. The value must be an integer; the default is 25 MB.
#### Extra Parameters #### Extra Parameters
The following [sampling parameters][sampling-params] are supported. The following [sampling parameters][sampling-params] are supported.

View File

@ -11,6 +11,7 @@ from typing import Callable, Literal, Optional, TypeVar, Union, cast
import numpy as np import numpy as np
from fastapi import Request from fastapi import Request
import vllm.envs as envs
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
@ -38,10 +39,6 @@ T = TypeVar("T", bound=SpeechToTextResponse)
logger = init_logger(__name__) logger = init_logger(__name__)
# As per https://platform.openai.com/docs/guides/speech-to-text#overview.
# TODO configurable
MAX_AUDIO_CLIP_FILESIZE_MB = 25
class OpenAISpeechToText(OpenAIServing): class OpenAISpeechToText(OpenAIServing):
"""Base class for speech-to-text operations like transcription and """Base class for speech-to-text operations like transcription and
@ -70,6 +67,8 @@ class OpenAISpeechToText(OpenAIServing):
self.asr_config = self.model_cls.get_speech_to_text_config( self.asr_config = self.model_cls.get_speech_to_text_config(
model_config, task_type) model_config, task_type)
self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
if self.default_sampling_params: if self.default_sampling_params:
logger.info( logger.info(
"Overwriting default completion sampling param with: %s", "Overwriting default completion sampling param with: %s",
@ -93,7 +92,7 @@ class OpenAISpeechToText(OpenAIServing):
lang = request.language or "en" lang = request.language or "en"
self.model_cls.validate_language(lang) self.model_cls.validate_language(lang)
if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB: if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
raise ValueError("Maximum file size exceeded.") raise ValueError("Maximum file size exceeded.")
with io.BytesIO(audio_data) as bytes_: with io.BytesIO(audio_data) as bytes_:

View File

@ -61,6 +61,7 @@ if TYPE_CHECKING:
VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_VIDEO_FETCH_TIMEOUT: int = 30
VLLM_AUDIO_FETCH_TIMEOUT: int = 10 VLLM_AUDIO_FETCH_TIMEOUT: int = 10
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
VLLM_MM_INPUT_CACHE_GIB: int = 8 VLLM_MM_INPUT_CACHE_GIB: int = 8
VLLM_TARGET_DEVICE: str = "cuda" VLLM_TARGET_DEVICE: str = "cuda"
@ -519,6 +520,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_AUDIO_FETCH_TIMEOUT": "VLLM_AUDIO_FETCH_TIMEOUT":
lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
# Maximum file size in MB (1 MB = 1024**2 bytes) for a single audio file
# when processing speech-to-text requests. Files larger than this will be
# rejected. The value is parsed as an integer. Default is 25 MB.
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),
# Backend for Video IO # Backend for Video IO
# - "opencv": Default backend that uses OpenCV stream buffered backend. # - "opencv": Default backend that uses OpenCV stream buffered backend.
# #