diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 8f071eac22019..1fc34f48401df 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -69,18 +69,16 @@ class UltravoxAudioFeatureInputs(TensorSchema):
     type: Literal["audio_features"]
     data: Annotated[
         Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]],
-        TensorShape("b", "n", "nmb", "t", dynamic_dims={"n"}),
+        TensorShape("bn", "nmb", "t"),
     ]
-    lens: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
-        TensorShape("b", "n", dynamic_dims={"n"}),
-    ]
-    """Length of the audio frames. Used for attention mask in WhisperEncoder."""
-    token_len: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
-        TensorShape("b", "n", dynamic_dims={"n"}),
-    ]
-    """Length of the audio tokens. Used for flattening the audio features."""
+    lens: Annotated[torch.Tensor, TensorShape("bn")]
+    """
+    Length of the audio frames per chunk. Used for attention mask in WhisperEncoder.
+    """
+    token_len: Annotated[torch.Tensor, TensorShape("bn")]
+    """Length of the audio tokens per chunk. Used for flattening the audio features."""
+    num_chunks: Annotated[torch.Tensor, TensorShape("n")]
+    """Number of chunks per audio. Used for flattening the audio features."""
 
 
 class UltravoxAudioEmbeddingInputs(TensorSchema):
@@ -421,6 +419,8 @@ class ModifiedWhisperEncoder(WhisperEncoder):
     dummy_inputs=UltravoxDummyInputsBuilder,
 )
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -519,6 +519,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         audio_embeds = kwargs.pop("audio_embeds", None)
         audio_lens = kwargs.pop("audio_lens", None)
         audio_token_len = kwargs.pop("audio_token_len", None)
+        audio_num_chunks = kwargs.pop("audio_num_chunks", None)
 
         if audio_features is None and audio_embeds is None:
             return None
@@ -529,6 +530,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
                 data=audio_features,
                 lens=audio_lens,
                 token_len=audio_token_len,
+                num_chunks=audio_num_chunks,
             )
 
         if audio_embeds is not None:
@@ -547,9 +549,8 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
         audio_features = pad_and_concat_to_dim3(audio_input["data"])
 
-        # [B1, B2] -> [B1+B2]
-        audio_lens = flatten_bn(audio_input["lens"], concat=True)
-        audio_token_len = flatten_bn(audio_input["token_len"], concat=True)
+        audio_lens = audio_input["lens"]
+        audio_token_len = audio_input["token_len"]
 
         embeddings = self._audio_features_to_embeddings(audio_features, audio_lens)
 
@@ -568,7 +569,8 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
 
         # Return one tensor per input audio
         embed_lens = [
-            token_len_item.sum().item() for token_len_item in audio_input["token_len"]
+            chunk_lens.sum().item()
+            for chunk_lens in audio_token_len.split(audio_input["num_chunks"].tolist())
         ]
         return flattened_embeddings.split(embed_lens)
 
@@ -663,6 +665,7 @@ def pad_and_concat_to_dim3(
         if features.ndim > 3:
             # Flatten [B, N, 80, M] -> [B * N, 80, M]
            features = flatten_bn(features)
+        return features
 
     features = [pad_and_concat_to_dim3(f) for f in features]
 
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 0d77b72675e24..f929ba9913ecf 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -61,7 +61,7 @@ from vllm.transformers_utils.tokenizer import (
 )
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription
-from .utils import flatten_bn, init_vllm_registered_model, maybe_prefix
+from .utils import init_vllm_registered_model, maybe_prefix
 
 logger = init_logger(__name__)
 
@@ -337,6 +337,8 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo])
 class VoxtralForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
 ):
+    merge_by_field_config = True
+
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
@@ -445,7 +447,6 @@ class VoxtralForConditionalGeneration(
                 f"Incorrect type of audio_arrays. Got type: {type(audio_arrays)}"
             )
 
-        audio_arrays = flatten_bn(audio_arrays)
         if isinstance(audio_arrays, torch.Tensor):
             audio_arrays = list(audio_arrays.unbind(0))
         return audio_arrays
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index ce9634935d24c..397556cbbcc47 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -36,7 +36,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFieldConfig,
@@ -51,6 +51,7 @@ from vllm.multimodal.processing import (
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.transformers_utils.processor import cached_get_processor
+from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription
@@ -135,7 +136,10 @@ class WhisperAudioInputs(TensorSchema):
     - t: Time frames (M)
     """
 
-    input_features: Annotated[Optional[NestedTensors], TensorShape("b", "nmb", "t")]
+    input_features: Annotated[
+        Optional[list[torch.Tensor]],
+        TensorShape("b", "nmb", "t"),
+    ]
 
 
 class WhisperEncoderAttention(MultiHeadAttention):
@@ -781,6 +785,7 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo]
 class WhisperForConditionalGeneration(
     nn.Module, SupportsTranscription, SupportsMultiModal
 ):
+    merge_by_field_config = True
     packed_modules_mapping = {
         "self_attn.qkv_proj": [
             "self_attn.q_proj",
@@ -936,12 +941,7 @@ class WhisperForConditionalGeneration(
         input_features = kwargs.pop("input_features", None)
 
         if input_features is not None:
-            if not isinstance(input_features, (torch.Tensor, list)):
-                raise ValueError(
-                    "Incorrect type of audio features. "
-                    f"Got type: {type(input_features)}"
-                )
-            input_features = torch.cat([feat.to(self.dtype) for feat in input_features])
+            input_features = json_map_leaves(lambda x: x.to(self.dtype), input_features)
 
         return WhisperAudioInputs(input_features=input_features)
 
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 45e6ac2adacaf..bec3099a99bc5 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -677,6 +677,9 @@ class MultiModalFieldConfig:
         self.field = field
         self.modality = modality
 
+    def __repr__(self) -> str:
+        return f"MultiModalFieldConfig(field={self.field}, modality={self.modality})"
+
     def build_elems(
         self,
         key: str,