[Core][MM] Use non-blocking CPU-GPU copy of multimodal data (#28141)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2026-01-27 16:32:34 +08:00 · 2025-11-06 04:05:12 +00:00 · 2025-11-06 04:05:12 +00:00 · 80679f108f
commit 80679f108f
parent 43ecd0a900
1 changed files with 3 additions and 1 deletions
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -444,7 +444,9 @@ def group_mm_kwargs_by_modality(

            if device is not None:
                mm_kwargs_group = json_map_leaves(
-                    lambda x: x.to(device=device) if isinstance(x, torch.Tensor) else x,
+                    lambda x: x.to(device=device, non_blocking=True)
+                    if isinstance(x, torch.Tensor)
+                    else x,
                    mm_kwargs_group,
                )
        else: