From e84e0735c71e983440b2194e0ea7f9117694dc71 Mon Sep 17 00:00:00 2001
From: Andrew Sansom <andrew@protopia.ai>
Date: Fri, 26 Sep 2025 03:18:05 -0500
Subject: [PATCH] fix: revert cast to cpu in `MsgpackEncoder._encode_tensor` to
 avoid hidden performance regressions (#25738)

Signed-off-by: Andrew Sansom <andrew@protopia.ai>
---
 vllm/inputs/preprocess.py | 5 +++++
 vllm/v1/serial_utils.py   | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index a24307b79d95..7518cd8fc897 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -278,6 +278,11 @@ class InputPreprocessor:
             raise ValueError(
                 "prompt_embeds must be of shape (seq_len, hidden_size).")
 
+        # Tensors must be on CPU to be serialized between processes by the
+        # MsgpackEncoder. Moving them to CPU here ensures that there is no
+        # hidden device transfer in the critical path of generation.
+        prompt_embeds = prompt_embeds.cpu()
+
         return embeds_inputs(prompt_embeds=prompt_embeds,
                              cache_salt=parsed_content.get("cache_salt"))
 
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index c812a2ec6427..876838084b9a 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -208,7 +208,7 @@ class MsgpackEncoder:
     ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
         assert self.aux_buffers is not None
         # view the tensor as a contiguous 1D array of bytes
-        arr = obj.flatten().contiguous().cpu().view(torch.uint8).numpy()
+        arr = obj.flatten().contiguous().view(torch.uint8).numpy()
         if obj.nbytes < self.size_threshold:
             # Smaller tensors are encoded inline, just like ndarrays.
             data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)