fix: revert cast to cpu in MsgpackEncoder._encode_tensor to avoid hidden performance regressions (#25738)

Signed-off-by: Andrew Sansom <andrew@protopia.ai>
This commit is contained in:
Andrew Sansom 2025-09-26 03:18:05 -05:00 committed by GitHub
parent 3edf87d25f
commit e84e0735c7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 6 additions and 1 deletions

View File

@ -278,6 +278,11 @@ class InputPreprocessor:
raise ValueError(
"prompt_embeds must be of shape (seq_len, hidden_size).")
# Tensors must be on CPU for serialization between processes
# in the MsgpackEncoder. Casting to CPU here ensures that there is no
# hidden device transfer in the critical path of generation.
prompt_embeds = prompt_embeds.cpu()
return embeds_inputs(prompt_embeds=prompt_embeds,
cache_salt=parsed_content.get("cache_salt"))

View File

@ -208,7 +208,7 @@ class MsgpackEncoder:
) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
assert self.aux_buffers is not None
# view the tensor as a contiguous 1D array of bytes
arr = obj.flatten().contiguous().cpu().view(torch.uint8).numpy()
arr = obj.flatten().contiguous().view(torch.uint8).numpy()
if obj.nbytes < self.size_threshold:
# Smaller tensors are encoded inline, just like ndarrays.
data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)