mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 09:06:03 +08:00
[Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hardcoding (#21374)
Signed-off-by: Deven Labovitch <deven@videa.ai>
This commit is contained in:
parent
11599b0e1f
commit
63d92abb7c
@ -351,6 +351,11 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
|
|||||||
Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
|
Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
|
||||||
<!-- TODO: api enforced limits + uploading audios -->
|
<!-- TODO: api enforced limits + uploading audios -->
|
||||||
|
|
||||||
|
#### API Enforced Limits
|
||||||
|
|
||||||
|
Set the maximum audio file size (in MB) that vLLM will accept, via the
|
||||||
|
`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. The default is 25 MB.
|
||||||
|
|
||||||
#### Extra Parameters
|
#### Extra Parameters
|
||||||
|
|
||||||
The following [sampling parameters][sampling-params] are supported.
|
The following [sampling parameters][sampling-params] are supported.
|
||||||
|
|||||||
@ -11,6 +11,7 @@ from typing import Callable, Literal, Optional, TypeVar, Union, cast
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from fastapi import Request
|
from fastapi import Request
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
from vllm.entrypoints.logger import RequestLogger
|
from vllm.entrypoints.logger import RequestLogger
|
||||||
@ -38,10 +39,6 @@ T = TypeVar("T", bound=SpeechToTextResponse)
|
|||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
# As per https://platform.openai.com/docs/guides/speech-to-text#overview.
|
|
||||||
# TODO configurable
|
|
||||||
MAX_AUDIO_CLIP_FILESIZE_MB = 25
|
|
||||||
|
|
||||||
|
|
||||||
class OpenAISpeechToText(OpenAIServing):
|
class OpenAISpeechToText(OpenAIServing):
|
||||||
"""Base class for speech-to-text operations like transcription and
|
"""Base class for speech-to-text operations like transcription and
|
||||||
@ -70,6 +67,8 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
self.asr_config = self.model_cls.get_speech_to_text_config(
|
self.asr_config = self.model_cls.get_speech_to_text_config(
|
||||||
model_config, task_type)
|
model_config, task_type)
|
||||||
|
|
||||||
|
self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
|
||||||
|
|
||||||
if self.default_sampling_params:
|
if self.default_sampling_params:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Overwriting default completion sampling param with: %s",
|
"Overwriting default completion sampling param with: %s",
|
||||||
@ -93,7 +92,7 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
lang = request.language or "en"
|
lang = request.language or "en"
|
||||||
self.model_cls.validate_language(lang)
|
self.model_cls.validate_language(lang)
|
||||||
|
|
||||||
if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB:
|
if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
|
||||||
raise ValueError("Maximum file size exceeded.")
|
raise ValueError("Maximum file size exceeded.")
|
||||||
|
|
||||||
with io.BytesIO(audio_data) as bytes_:
|
with io.BytesIO(audio_data) as bytes_:
|
||||||
|
|||||||
@ -61,6 +61,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
|
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
|
||||||
VLLM_VIDEO_FETCH_TIMEOUT: int = 30
|
VLLM_VIDEO_FETCH_TIMEOUT: int = 30
|
||||||
VLLM_AUDIO_FETCH_TIMEOUT: int = 10
|
VLLM_AUDIO_FETCH_TIMEOUT: int = 10
|
||||||
|
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
|
||||||
VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
|
VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
|
||||||
VLLM_MM_INPUT_CACHE_GIB: int = 8
|
VLLM_MM_INPUT_CACHE_GIB: int = 8
|
||||||
VLLM_TARGET_DEVICE: str = "cuda"
|
VLLM_TARGET_DEVICE: str = "cuda"
|
||||||
@ -519,6 +520,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"VLLM_AUDIO_FETCH_TIMEOUT":
|
"VLLM_AUDIO_FETCH_TIMEOUT":
|
||||||
lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
|
lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
|
||||||
|
|
||||||
|
# Maximum filesize in MB for a single audio file when processing
|
||||||
|
# speech-to-text requests. Files larger than this will be rejected.
|
||||||
|
# Default is 25 MB
|
||||||
|
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
|
||||||
|
lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),
|
||||||
|
|
||||||
# Backend for Video IO
|
# Backend for Video IO
|
||||||
# - "opencv": Default backend that uses OpenCV stream buffered backend.
|
# - "opencv": Default backend that uses OpenCV stream buffered backend.
|
||||||
#
|
#
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user