[PERF] Use pybase64 to more quickly decode prompt embeddings (#22469)

Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-12-17 04:05:01 +08:00 · 2025-08-07 21:15:32 -05:00 · 2025-08-07 21:15:32 -05:00 · e2c8f1edec
commit e2c8f1edec
parent 1ee5ead5f8
1 changed file with 3 additions and 2 deletions
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-import base64
 import io
 import json
 import sys
@@ -12,6 +11,7 @@ from http import HTTPStatus
 from typing import (Annotated, Any, Callable, ClassVar, Generic, Optional,
                    TypeVar, Union, cast, overload)
+import pybase64
 import torch
 from fastapi import Request
 from pydantic import BaseModel, ConfigDict, Field
@@ -1008,7 +1008,8 @@ class OpenAIServing:
    ) -> list[EmbedsPrompt]:
        def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
-            tensor = torch.load(io.BytesIO(base64.b64decode(embed)),
+            tensor = torch.load(io.BytesIO(
+                pybase64.b64decode(embed, validate=True)),
                                weights_only=True)
            assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
                torch.float32,