# Mirror of https://git.datalinker.icu/vllm-project/vllm.git
# Synced 2025-12-23 09:55:46 +08:00
# (39 lines, 1.4 KiB, Python)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from pydantic.dataclasses import dataclass

from vllm.config.utils import config

@config
@dataclass
class SpeechToTextConfig:
    """Configuration for speech-to-text models."""

    sample_rate: float = 16_000
    """Target sample rate in Hz for the input audio. Incoming audio is
    resampled to this rate before it reaches the model; most speech models
    are trained on 16kHz audio."""

    max_audio_clip_s: int = 30
    """Longest audio clip, in seconds, that is processed as a single unit.
    Longer inputs are split into smaller chunks when `allow_audio_chunking`
    evaluates to True; otherwise they are rejected."""

    overlap_chunk_second: int = 1
    """Seconds of overlap shared by consecutive chunks when long audio is
    split. The overlap preserves context across chunk boundaries and
    improves transcription quality near the split points."""

    min_energy_split_window_size: int | None = 1600
    """Size in samples of the window used to locate low-energy (quiet)
    regions when choosing where to split audio. The quietest moment inside
    the window is picked so cuts avoid active speech. The default of 1600
    samples is roughly 100ms at 16kHz. Set to None to disable chunking
    entirely."""

    @property
    def allow_audio_chunking(self) -> bool:
        """Whether long audio may be split into chunks (enabled iff a
        min-energy split window size is configured)."""
        window = self.min_energy_split_window_size
        return window is not None