From 9f1c6422549d37eee22bfa4dbadaaa91d95e98ba Mon Sep 17 00:00:00 2001 From: double7 <33449816+DoubleVII@users.noreply.github.com> Date: Mon, 18 Aug 2025 13:09:11 +0800 Subject: [PATCH] [Bugfix] fix Qwen2.5-Omni processor output mapping (#23058) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: double7 <33449816+DoubleVII@users.noreply.github.com> Co-authored-by: 杨森 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/models/qwen2_5_omni_thinker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index e95295c31885a..59411eb7503bf 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -88,6 +88,11 @@ def _qwen2_5_omni_thinker_field_config(hf_inputs: Mapping[str, torch.Tensor]): video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) video_grid_sizes = video_grid_thw.prod(-1) + # vLLM uses `second_per_grid_ts` to compute the multimodal rotary embedding + video_second_per_grid = hf_inputs.get("video_second_per_grid", None) + if video_second_per_grid is not None: + hf_inputs["second_per_grid_ts"] = video_second_per_grid + return dict( input_audio_features=MultiModalFieldConfig.flat_from_sizes( "audio", audio_feature_lengths, dim=1),