From 80679f108ffd94c165ea11adbc3afcc43f24a06e Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Thu, 6 Nov 2025 04:05:12 +0000 Subject: [PATCH] [Core][MM] Use non-blocking CPU-GPU copy of multimodal data (#28141) Signed-off-by: Lukas Geiger --- vllm/multimodal/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3fad11a2cb4d..aa61bcc11f9f 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -444,7 +444,9 @@ def group_mm_kwargs_by_modality( if device is not None: mm_kwargs_group = json_map_leaves( - lambda x: x.to(device=device) if isinstance(x, torch.Tensor) else x, + lambda x: x.to(device=device, non_blocking=True) + if isinstance(x, torch.Tensor) + else x, mm_kwargs_group, ) else: