[Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hardcoding (#21374)

Signed-off-by: Deven Labovitch <deven@videa.ai>
2025-12-10 09:06:03 +08:00 · 2025-07-23 23:22:19 -04:00 · 2025-07-23 23:22:19 -04:00 · 63d92abb7c
commit 63d92abb7c
parent 11599b0e1f
3 changed files with 16 additions and 5 deletions
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@ -351,6 +351,11 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
 Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
 <!-- TODO: api enforced limits + uploading audios -->
 #### API Enforced Limits
 Set the maximum audio file size (in MB) that vLLM will accept via the
 `VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. The default is 25 MB.
 #### Extra Parameters
 The following [sampling parameters][sampling-params] are supported.
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@ -11,6 +11,7 @@ from typing import Callable, Literal, Optional, TypeVar, Union, cast
 import numpy as np
 from fastapi import Request
 import vllm.envs as envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
@ -38,10 +39,6 @@ T = TypeVar("T", bound=SpeechToTextResponse)
 logger = init_logger(__name__)
 # As per https://platform.openai.com/docs/guides/speech-to-text#overview.
 # TODO configurable
 MAX_AUDIO_CLIP_FILESIZE_MB = 25
 class OpenAISpeechToText(OpenAIServing):
    """Base class for speech-to-text operations like transcription and 
@ -70,6 +67,8 @@ class OpenAISpeechToText(OpenAIServing):
        self.asr_config = self.model_cls.get_speech_to_text_config(
            model_config, task_type)
        self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
        if self.default_sampling_params:
            logger.info(
                "Overwriting default completion sampling param with: %s",
@ -93,7 +92,7 @@ class OpenAISpeechToText(OpenAIServing):
        lang = request.language or "en"
        self.model_cls.validate_language(lang)
-        if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB:
+        if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
            raise ValueError("Maximum file size exceeded.")
        with io.BytesIO(audio_data) as bytes_:
--- a/vllm/envs.py
+++ b/vllm/envs.py
@ -61,6 +61,7 @@ if TYPE_CHECKING:
    VLLM_IMAGE_FETCH_TIMEOUT: int = 5
    VLLM_VIDEO_FETCH_TIMEOUT: int = 30
    VLLM_AUDIO_FETCH_TIMEOUT: int = 10
    VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
    VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
    VLLM_MM_INPUT_CACHE_GIB: int = 8
    VLLM_TARGET_DEVICE: str = "cuda"
@ -519,6 +520,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_AUDIO_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
    # Maximum file size in MB for a single audio file when processing
    # speech-to-text requests. Files larger than this will be rejected.
    # Default is 25 MB.
    "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
    lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),
    # Backend for Video IO
    # - "opencv": Default backend that uses OpenCV stream buffered backend.
    #