diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md
index a590ecd6a1a23..fca941acd5076 100644
--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@@ -249,7 +249,7 @@ No extra registration is required beyond having your model class available via t
 ## Examples in-tree
 
 - Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
-- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
+- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to install `mistral-common[audio]`.
 - Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
 
 ## Test with the API
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 0439e9cf23644..9cdf644c3cc52 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -785,6 +785,9 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
 | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
 
+!!! note
+    `VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed.
+
 ### Pooling Models
 
 See [this page](./pooling_models.md) for more information on how to use pooling models.
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 53d69bbdbdc7d..04e6f99f8957e 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -43,6 +43,7 @@ class ModelRequestData(NamedTuple):
 
 
 # Voxtral
+# Make sure to install mistral-common[audio].
 def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
     from mistral_common.audio import Audio
     from mistral_common.protocol.instruct.chunk import (
diff --git a/requirements/common.txt b/requirements/common.txt
index 90efb79a845d3..ad92ba3ad8278 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs
 pyzmq >= 25.0.0
 msgspec
 gguf >= 0.13.0
-mistral_common[image,audio] >= 1.8.5
+mistral_common[image] >= 1.8.5
 opencv-python-headless >= 4.11.0 # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
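
Since this change narrows the default requirement to `mistral_common[image]`, the `audio` extra is no longer pulled in by vLLM itself and Voxtral users must install it separately, e.g. `pip install "mistral-common[audio]"`. Below is a minimal preflight sketch of the failure mode this introduces; it assumes only that the `[audio]` extra provides the `mistral_common.audio` module, which is the same import `run_voxtral` in the example above relies on:

```python
# Minimal sketch: check that the mistral-common audio extra is present
# before attempting to run Voxtral. With this change, a default vLLM
# install only includes mistral_common[image], so this import fails
# unless the user ran: pip install "mistral-common[audio]"
try:
    from mistral_common.audio import Audio  # noqa: F401  # provided by the [audio] extra
except ImportError as exc:
    raise SystemExit(
        "Voxtral requires the audio extra: pip install 'mistral-common[audio]'"
    ) from exc

print("mistral-common audio support is available")
```
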