From eddaafc1c77b0690194cbd1b73747d572793838c Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Sat, 6 Sep 2025 02:33:19 -0700
Subject: [PATCH] [Multimodal] Improve max video embedding length estimation
 in V1 (#24312)

Signed-off-by: Roger Wang
Co-authored-by: Roger Wang
---
 vllm/model_executor/models/llava_onevision.py | 5 +----
 vllm/model_executor/models/qwen2_vl.py        | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index e4ac0cd919101..bc340a9e2d8f8 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -216,12 +216,9 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> int:
-        max_images = mm_counts.get("image", 0)
         max_videos = mm_counts.get("video", 0)
 
-        max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self._get_max_video_frames(seq_len -
-                                                      max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                                    _MAX_FRAMES_PER_VIDEO)
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index ae7a8d8d7a5b9..b708719e4f9b8 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -915,12 +915,9 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> int:
-        max_images = mm_counts.get("image", 0)
         max_videos = mm_counts.get("video", 0)
 
-        max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self._get_max_video_frames(seq_len -
-                                                      max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                                    _MAX_FRAMES_PER_VIDEO)
 