diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 42a87c4a796e9..8ae096536fdc5 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -974,7 +974,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, grid_thw_list = grid_thw.tolist() if image_input["type"] == "image_embeds": - image_embeds = image_input["image_embeds"] + image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"] image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) @@ -994,7 +994,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, grid_thw_list = grid_thw.tolist() if video_input["type"] == "video_embeds": - video_embeds = video_input["video_embeds"] + video_embeds = video_input["video_embeds"].type(self.visual.dtype) else: pixel_values_videos = video_input["pixel_values_videos"] video_embeds = self.visual(pixel_values_videos,