[Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hardcoding (#21374)

Signed-off-by: Deven Labovitch <deven@videa.ai>
deven-labovitch 2025-07-23 23:22:19 -04:00 committed by GitHub
parent 11599b0e1f
commit 63d92abb7c
3 changed files with 16 additions and 5 deletions

View File

@ -351,6 +351,11 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
<!-- TODO: api enforced limits + uploading audios -->
#### API Enforced Limits
Set the maximum audio file size (in MB) that vLLM will accept via the
`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. The default is 25 MB.
#### Extra Parameters
The following [sampling parameters][sampling-params] are supported.
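
Below is a minimal client-side sketch of what the documented limit means in practice. It assumes a vLLM OpenAI-compatible server running at `http://localhost:8000/v1` with a Whisper-style model loaded; the model name, file path, and the client-side 25 MB figure (mirroring the server default) are illustrative assumptions, not part of this commit.

```python
import os

from openai import OpenAI

MAX_FILESIZE_MB = 25  # keep in sync with VLLM_MAX_AUDIO_CLIP_FILESIZE_MB on the server
audio_path = "sample.wav"  # hypothetical local audio file

# Pre-check the upload size so an oversized clip fails fast on the client
# instead of being rejected by the server.
size_mb = os.path.getsize(audio_path) / 1024**2
if size_mb > MAX_FILESIZE_MB:
    raise ValueError(f"{audio_path} is {size_mb:.1f} MB, above the configured limit")

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
with open(audio_path, "rb") as f:
    transcription = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",  # assumed model name
        file=f,
    )
print(transcription.text)
```

To accept larger files, export `VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` (for example, set it to `100`) before starting the server.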

View File

@ -11,6 +11,7 @@ from typing import Callable, Literal, Optional, TypeVar, Union, cast
import numpy as np
from fastapi import Request
import vllm.envs as envs
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
@ -38,10 +39,6 @@ T = TypeVar("T", bound=SpeechToTextResponse)
logger = init_logger(__name__)
# As per https://platform.openai.com/docs/guides/speech-to-text#overview.
# TODO configurable
MAX_AUDIO_CLIP_FILESIZE_MB = 25
class OpenAISpeechToText(OpenAIServing):
"""Base class for speech-to-text operations like transcription and
@ -70,6 +67,8 @@ class OpenAISpeechToText(OpenAIServing):
self.asr_config = self.model_cls.get_speech_to_text_config(
model_config, task_type)
self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
if self.default_sampling_params:
logger.info(
"Overwriting default completion sampling param with: %s",
@ -93,7 +92,7 @@ class OpenAISpeechToText(OpenAIServing):
lang = request.language or "en"
self.model_cls.validate_language(lang)
if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB:
if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
raise ValueError("Maximum file size exceeded.")
with io.BytesIO(audio_data) as bytes_:
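
As a standalone sketch of the behavior this hunk introduces (not the actual `OpenAISpeechToText` code), the limit is read from the environment with a 25 MB default, and any payload larger than that is rejected before the bytes are decoded:

```python
import io
import os


def _max_audio_filesize_mb() -> int:
    # Mirrors envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB without importing vLLM.
    return int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25"))


def load_audio(audio_data: bytes) -> io.BytesIO:
    # Reject payloads above the configured limit, as the diff does.
    if len(audio_data) / 1024**2 > _max_audio_filesize_mb():
        raise ValueError("Maximum file size exceeded.")
    return io.BytesIO(audio_data)


if __name__ == "__main__":
    load_audio(b"\x00" * 1024)                  # 1 KiB: accepted
    try:
        load_audio(b"\x00" * (26 * 1024 ** 2))  # 26 MiB: rejected at the default limit
    except ValueError as exc:
        print(exc)
```

Reading the value through `envs` instead of a module-level constant means the limit can be changed per deployment without touching the code, which is the point of this commit.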

View File

@ -61,6 +61,7 @@ if TYPE_CHECKING:
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30
VLLM_AUDIO_FETCH_TIMEOUT: int = 10
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
VLLM_MM_INPUT_CACHE_GIB: int = 8
VLLM_TARGET_DEVICE: str = "cuda"
@ -519,6 +520,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_AUDIO_FETCH_TIMEOUT":
lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
# Maximum filesize in MB for a single audio file when processing
# speech-to-text requests. Files larger than this will be rejected.
# Default is 25 MB
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),
# Backend for Video IO
# - "opencv": Default backend that uses OpenCV stream buffered backend.
#
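
For context, the entry above follows the lazy lookup pattern used throughout `vllm/envs.py`: each variable maps to a zero-argument callable, so the value is parsed from the environment only when it is first accessed. The sketch below imitates that pattern with an illustrative helper; `getenv_lazy` is not part of vLLM.

```python
import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    # Maximum filesize in MB for a single audio file (default 25 MB).
    "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
    lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),
    # Timeout in seconds for fetching audio inputs (default 10 s).
    "VLLM_AUDIO_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
}


def getenv_lazy(name: str) -> Any:
    """Resolve a variable on demand, deferring parsing until it is needed."""
    return environment_variables[name]()


if __name__ == "__main__":
    os.environ["VLLM_MAX_AUDIO_CLIP_FILESIZE_MB"] = "100"
    print(getenv_lazy("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB"))  # -> 100
```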