From ad0012a0ac507973656cfcf5750d603af4a5fdcc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 21 May 2025 13:39:22 +0800 Subject: [PATCH] Revert "[Bugfix] Fix MRoPE Errors in the Qwen-VL Model When Processing Pure Text (#18407)" (#18456) Signed-off-by: DarkLight1337 --- vllm/worker/model_runner.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 15f40bcef8969..12025617e5127 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -729,6 +729,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( seq_group_metadata, range(positions[0], positions[0] + len(positions))) + if not mm_kwargs: + return inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps @@ -739,6 +741,12 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): video_grid_thw = mm_kwargs.get("video_grid_thw", None) audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", None) + assert ( + image_grid_thw is not None or video_grid_thw is not None + or audio_feature_lengths is not None), ( + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw' or " + "'audio_feature_lengths'.") second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) use_audio_in_video = mm_kwargs.get("use_audio_in_video", False)