diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index bf4e243203489..ec65bfca10096 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -765,7 +765,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
 |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
 | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | |
-| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ |
+| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
 
 ### Pooling Models
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 2831c4df78ba7..9451670bc60c2 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -25,6 +25,7 @@ from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models import SupportsPP
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 # yapf: disable
 from vllm.model_executor.models.whisper import WhisperEncoder
 # yapf: enable
@@ -44,8 +45,8 @@ from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import (MistralTokenizer,
                                                cached_tokenizer_from_config)
 
-from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
-                         SupportsTranscription)
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
+                         SupportsMultiModal, SupportsTranscription)
 from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings)
 
@@ -313,9 +314,15 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
                                         info=VoxtralProcessingInfo,
                                         dummy_inputs=VoxtralDummyInputsBuilder)
 class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsPP, SupportsTranscription):
+                                      SupportsPP, SupportsLoRA,
+                                      SupportsTranscription):
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
@@ -341,6 +348,14 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """Get module prefix for multimodal models to filter LoRA modules."""
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="audio_language_adapter",
+            tower_model=["whisper_encoder"],
+        )
+
     def forward(
         self,
         input_ids: torch.Tensor,
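
The `packed_modules_mapping` added above tells vLLM's LoRA machinery that adapters trained against HF-style `q_proj`/`k_proj`/`v_proj` and `gate_proj`/`up_proj` modules should be applied to the fused `qkv_proj` and `gate_up_proj` layers that vLLM actually instantiates. Conceptually, the per-projection LoRA B matrices line up when stacked along the fused layer's output dimension. The sketch below is a shape-only illustration with invented dimensions, not vLLM's internal packing code:

```python
# Shape-only illustration of packing per-projection LoRA B matrices for a
# fused qkv_proj layer. All dimensions here are invented for the example.
import torch

rank = 8
q_out, kv_out = 2048, 512  # hypothetical query / grouped key-value sizes

lora_b = {
    "q_proj": torch.randn(q_out, rank),
    "k_proj": torch.randn(kv_out, rank),
    "v_proj": torch.randn(kv_out, rank),
}

# The fused qkv_proj output dimension is q_out + k_out + v_out, so the
# per-slice B matrices concatenate along dim 0 in q, k, v order.
packed_b = torch.cat([lora_b[k] for k in ("q_proj", "k_proj", "v_proj")], dim=0)
print(packed_b.shape)  # torch.Size([3072, 8])
```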
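`get_mm_mapping` gives the LoRA manager the module prefixes it needs to apply adapters only to the language model while skipping the Whisper encoder tower and the audio-language connector. The snippet below is illustrative only; `is_lora_target` is a hypothetical helper, not part of vLLM, and it assumes `from_string_field` normalizes its arguments to lists of prefixes:

```python
# Illustrative filter built on the same MultiModelKeys the patch returns.
from vllm.model_executor.models.module_mapping import MultiModelKeys

mm_keys = MultiModelKeys.from_string_field(
    language_model="language_model",
    connector="audio_language_adapter",
    tower_model=["whisper_encoder"],
)
# Skip everything under the connector and tower prefixes.
skip_prefixes = [*mm_keys.connector, *mm_keys.tower_model]

def is_lora_target(module_name: str) -> bool:
    return not any(module_name.startswith(p) for p in skip_prefixes)

modules = [
    "language_model.layers.0.attention.qkv_proj",
    "audio_language_adapter.w_in",
    "whisper_encoder.layers.0.self_attn.q_proj",
]
print([m for m in modules if is_lora_target(m)])
# ['language_model.layers.0.attention.qkv_proj']
```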
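With `SupportsLoRA` wired up, a Voxtral LoRA adapter loads through vLLM's standard LoRA entry points. A minimal offline-inference sketch, assuming an adapter already exists at the hypothetical path `/path/to/voxtral-lora` (the adapter name and path are placeholders):

```python
# Minimal sketch: run Voxtral with a LoRA adapter via the offline LLM API.
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="mistralai/Voxtral-Mini-3B-2507",
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
    enable_lora=True,
    max_lora_rank=64,  # must be >= the adapter's rank
)

out = llm.generate(
    "Summarize the meeting.",  # text-only prompt; audio prompts work the same way
    SamplingParams(temperature=0.0, max_tokens=64),
    lora_request=LoRARequest("voxtral_adapter", 1, "/path/to/voxtral-lora"),
)
print(out[0].outputs[0].text)
```

The same adapter should also work with the online server via `vllm serve ... --enable-lora --lora-modules voxtral_adapter=/path/to/voxtral-lora`.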