diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 2cf45eeaab4d..edec40f41760 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -351,6 +351,11 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai Code example: +#### API Enforced Limits + +Set the maximum audio file size (in MB) that vLLM will accept, via the +`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. Default is 25 MB. + #### Extra Parameters The following [sampling parameters][sampling-params] are supported. diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index e26e1b748b86..c2227a21a4b9 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -11,6 +11,7 @@ from typing import Callable, Literal, Optional, TypeVar, Union, cast import numpy as np from fastapi import Request +import vllm.envs as envs from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger @@ -38,10 +39,6 @@ T = TypeVar("T", bound=SpeechToTextResponse) logger = init_logger(__name__) -# As per https://platform.openai.com/docs/guides/speech-to-text#overview. 
-# TODO configurable -MAX_AUDIO_CLIP_FILESIZE_MB = 25 - class OpenAISpeechToText(OpenAIServing): """Base class for speech-to-text operations like transcription and @@ -70,6 +67,8 @@ class OpenAISpeechToText(OpenAIServing): self.asr_config = self.model_cls.get_speech_to_text_config( model_config, task_type) + self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB + if self.default_sampling_params: logger.info( "Overwriting default completion sampling param with: %s", @@ -93,7 +92,7 @@ class OpenAISpeechToText(OpenAIServing): lang = request.language or "en" self.model_cls.validate_language(lang) - if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB: + if len(audio_data) / 1024**2 > self.max_audio_filesize_mb: raise ValueError("Maximum file size exceeded.") with io.BytesIO(audio_data) as bytes_: diff --git a/vllm/envs.py b/vllm/envs.py index ca45d69eec1b..5c414e82d93b 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -61,6 +61,7 @@ if TYPE_CHECKING: VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_AUDIO_FETCH_TIMEOUT: int = 10 + VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MM_INPUT_CACHE_GIB: int = 8 VLLM_TARGET_DEVICE: str = "cuda" @@ -519,6 +520,12 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), + # Maximum filesize in MB for a single audio file when processing + # speech-to-text requests. Files larger than this will be rejected. + # Default is 25 MB + "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": + lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")), + # Backend for Video IO # - "opencv": Default backend that uses OpenCV stream buffered backend. #