diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 677d34dea39b3..7e970ebbe2bbc 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -130,6 +130,8 @@ class Qwen2_5OmniAudioFeatureInputs(TensorSchema):
         TensorShape("nmb", "tsl", dynamic_dims={"tsl"}),
     ]
 
+    audio_feature_lengths: Annotated[torch.Tensor, TensorShape("na")]
+
     feature_attention_mask: Annotated[
         torch.Tensor | list[torch.Tensor],
         TensorShape("na", "msl", dynamic_dims={"msl"}),
@@ -732,13 +734,6 @@ class Qwen2_5OmniConditionalGenerationMixin:
         input_features = audio_input["input_features"]
         audio_feature_lengths = audio_input["audio_feature_lengths"]
 
-        if audio_feature_lengths.shape[0] == 1:
-            audio_feature_lengths = audio_feature_lengths.squeeze(0)
-        elif audio_feature_lengths.shape[1] == 1:
-            audio_feature_lengths = audio_feature_lengths.squeeze(1)
-        else:
-            raise AssertionError(audio_feature_lengths.shape)
-
         audio_feat_lengths, audio_output_lengths = (
             self.audio_tower._get_feat_extract_output_lengths(audio_feature_lengths)
         )
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index efcd003fbbda7..f20e679027214 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -99,7 +99,6 @@ from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
     _merge_multimodal_embeddings,
-    flatten_bn,
     maybe_prefix,
 )
 from .vision import (
@@ -1065,8 +1064,6 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix
         input_features = audio_input["input_features"]
         audio_feature_lengths = audio_input["audio_feature_lengths"]
 
-        audio_feature_lengths = flatten_bn(audio_feature_lengths, concat=True)
-
         audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(
             audio_feature_lengths
         )
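
For reviewers, a minimal sketch of what the deleted code paths did, assuming plain PyTorch (`normalize_lengths` is a hypothetical stand-in, not vLLM code). Both files were hand-normalizing `audio_feature_lengths` to a 1-D tensor of per-audio lengths; declaring the field as `Annotated[torch.Tensor, TensorShape("na")]` lets the schema enforce that shape up front, so the `squeeze` branches and the `flatten_bn(..., concat=True)` call become redundant:

```python
import torch

def normalize_lengths(audio_feature_lengths: torch.Tensor) -> torch.Tensor:
    """Old behavior: squeeze a singleton dim so the result has shape (na,)."""
    if audio_feature_lengths.ndim == 1:
        return audio_feature_lengths
    if audio_feature_lengths.shape[0] == 1:
        return audio_feature_lengths.squeeze(0)
    if audio_feature_lengths.shape[1] == 1:
        return audio_feature_lengths.squeeze(1)
    raise AssertionError(audio_feature_lengths.shape)

# (1, na) -> (na,): the case the removed branches handled by hand; with the
# TensorShape("na") annotation, inputs arrive already 1-D and this is a no-op.
assert normalize_lengths(torch.tensor([[3, 5, 7]])).shape == (3,)
```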