From 60f0843ef8fb4b0c4e6788acc042873a0a2ea2a1 Mon Sep 17 00:00:00 2001 From: Chatcharin Sangbutsarakum <67754293+what-in-the-nim@users.noreply.github.com> Date: Mon, 8 Sep 2025 13:11:12 +0700 Subject: [PATCH] [Model] Remove unnecessary CUDA sync of Qwen2VL image and video preprocess (#24334) Signed-off-by: Win Co-authored-by: Roger Wang --- vllm/model_executor/models/qwen2_vl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f00b214b1ef18..90a1ad2a658ab 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1218,6 +1218,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, grid_thw = image_input["image_grid_thw"] assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() if image_input["type"] == "image_embeds": image_embeds = image_input["image_embeds"] @@ -1227,15 +1228,17 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, # Split concatenated embeddings for each image item. merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() - return image_embeds.split(sizes.tolist()) + return image_embeds.split(sizes) def _process_video_input( self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() if video_input["type"] == "video_embeds": video_embeds = video_input["video_embeds"] @@ -1245,9 +1248,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() - return video_embeds.split(sizes.tolist()) + return video_embeds.split(sizes) def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: modalities = {}