From e84e0735c71e983440b2194e0ea7f9117694dc71 Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Fri, 26 Sep 2025 03:18:05 -0500 Subject: [PATCH] fix: revert cast to cpu in `MsgpackEncoder._encode_tensor` to avoid hidden performance regressions (#25738) Signed-off-by: Andrew Sansom --- vllm/inputs/preprocess.py | 5 +++++ vllm/v1/serial_utils.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index a24307b79d95..7518cd8fc897 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -278,6 +278,11 @@ class InputPreprocessor: raise ValueError( "prompt_embeds must be of shape (seq_len, hidden_size).") + # Tensors must be on CPU for serialization between processes + # in the MsgpackEncoder. Casting to CPU here ensures that there is no + # hidden device transfer in the critical path of generation. + prompt_embeds = prompt_embeds.cpu() + return embeds_inputs(prompt_embeds=prompt_embeds, cache_salt=parsed_content.get("cache_salt")) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index c812a2ec6427..876838084b9a 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -208,7 +208,7 @@ class MsgpackEncoder: ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]: assert self.aux_buffers is not None # view the tensor as a contiguous 1D array of bytes - arr = obj.flatten().contiguous().cpu().view(torch.uint8).numpy() + arr = obj.flatten().contiguous().view(torch.uint8).numpy() if obj.nbytes < self.size_threshold: # Smaller tensors are encoded inline, just like ndarrays. data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)