From d84d8f4429a5246a9d9f179b47fac7e13801710d Mon Sep 17 00:00:00 2001 From: ZiTian Zhao Date: Sat, 22 Nov 2025 22:48:59 +0800 Subject: [PATCH] Fix EVS crash when using `video_embeds` inputs in Qwen2.5-VL (#29232) Signed-off-by: zitian.zhao Co-authored-by: Cyrus Leung --- vllm/model_executor/models/qwen2_5_vl.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8e3c0e84dfe51..1500a437613cc 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -230,6 +230,9 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema): - hidden_size must match the hidden size of language model backbone. - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w) format + - second_per_grid_ts: The video time interval (in seconds) for each + grid along the temporal dimension in the 3D position IDs. Returned + when `videos` is not `None`. """ type: Literal["video_embeds"] @@ -244,6 +247,11 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema): TensorShape("nv", 3), ] + second_per_grid_ts: Annotated[ + torch.Tensor | None, + TensorShape("nv"), + ] = None + Qwen2_5_VLVideoInputs: TypeAlias = ( Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs @@ -1311,6 +1319,7 @@ class Qwen2_5_VLForConditionalGeneration( type="video_embeds", video_embeds=video_embeds, video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, ) def _process_image_input( @@ -1422,7 +1431,13 @@ class Qwen2_5_VLForConditionalGeneration( # Cast to long to match the original code # https://github.com/huggingface/transformers/blob/41980ce93e775f6c88500c51c8db7946fc6a2add/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L491 # noqa - second_per_grid_ts = video_input["second_per_grid_ts"].long() + second_per_grid_ts = video_input.get("second_per_grid_ts") + if second_per_grid_ts is None: + raise ValueError( + "second_per_grid_ts is required when video_pruning_rate > 0 " + "is enabled for video inputs, including the video_embeds path." + ) + second_per_grid_ts = second_per_grid_ts.long() tokens_per_second = self.config.vision_config.tokens_per_second video_embeds_out = []