vllm/vllm/multimodal/audio.py
Cyrus Leung 6142ef0ada
[VLM] Merged multimodal processor for Qwen2-Audio (#11303)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-19 06:14:17 +00:00

42 lines
1.1 KiB
Python

import numpy as np
import numpy.typing as npt
from vllm.inputs.registry import InputContext
from .base import MultiModalPlugin
from .inputs import AudioItem, MultiModalData, MultiModalKwargs
class AudioPlugin(MultiModalPlugin):
"""Plugin for audio data."""
def get_data_key(self) -> str:
return "audio"
def _default_input_mapper(
self,
ctx: InputContext,
data: MultiModalData[AudioItem],
**mm_processor_kwargs,
) -> MultiModalKwargs:
raise NotImplementedError("There is no default audio input mapper")
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
raise NotImplementedError(
"There is no default maximum multimodal tokens")
def resample_audio(
audio: npt.NDArray[np.floating],
*,
orig_sr: float,
target_sr: float,
) -> npt.NDArray[np.floating]:
try:
import librosa
except ImportError as exc:
msg = "Please install vllm[audio] for audio support."
raise ImportError(msg) from exc
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)