From 2e660c24349944ac78abcf2c35d17e3caf6cdd08 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 8 Dec 2025 20:01:21 +0800 Subject: [PATCH] [Frontend] Binary embedding response does not return metadata by setting encoding_format to bytes_only. (#30249) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../embed/embedding_requests_bytes_client.py | 37 +++++++++++- .../entrypoints/pooling/embed/test_online.py | 50 +++++++++++++++++ .../pooling/pooling/test_online.py | 56 +++++++++++++++++++ vllm/entrypoints/pooling/embed/api_router.py | 4 +- vllm/entrypoints/pooling/embed/protocol.py | 4 +- vllm/entrypoints/pooling/embed/serving.py | 34 ++++++----- .../entrypoints/pooling/pooling/api_router.py | 4 +- vllm/entrypoints/pooling/pooling/protocol.py | 4 +- vllm/entrypoints/pooling/pooling/serving.py | 35 +++++++----- vllm/utils/serial_utils.py | 43 ++++++++++++-- 10 files changed, 230 insertions(+), 41 deletions(-) diff --git a/examples/pooling/embed/embedding_requests_bytes_client.py b/examples/pooling/embed/embedding_requests_bytes_client.py index c2832f1b54ce..5ea452524149 100644 --- a/examples/pooling/embed/embedding_requests_bytes_client.py +++ b/examples/pooling/embed/embedding_requests_bytes_client.py @@ -16,6 +16,7 @@ from vllm.utils.serial_utils import ( EMBED_DTYPE_TO_TORCH_DTYPE, ENDIANNESS, MetadataItem, + build_metadata_items, decode_pooling_output, ) @@ -38,6 +39,11 @@ def parse_args(): def main(args): api_url = f"http://{args.host}:{args.port}/v1/embeddings" model_name = args.model + embedding_size = 0 + + input_texts = [ + "The best thing about vLLM is that it supports many different models", + ] * 2 # The OpenAI client does not support the bytes encoding_format. # The OpenAI client does not support the embed_dtype and endianness parameters. @@ -45,7 +51,7 @@ def main(args): for endianness in ENDIANNESS: prompt = { "model": model_name, - "input": "vLLM is great!", + "input": input_texts, "encoding_format": "bytes", "embed_dtype": embed_dtype, "endianness": endianness, @@ -57,7 +63,34 @@ def main(args): embedding = decode_pooling_output(items=items, body=body) embedding = [x.to(torch.float32) for x in embedding] - embedding = torch.cat(embedding) + embedding = torch.stack(embedding) + embedding_size = embedding.shape[-1] + print(embed_dtype, endianness, embedding.shape) + + # The vllm server always sorts the returned embeddings in the order of input. So + # returning metadata is not necessary. You can set encoding_format to bytes_only + # to let the server not return metadata. + for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE: + for endianness in ENDIANNESS: + prompt = { + "model": model_name, + "input": input_texts, + "encoding_format": "bytes_only", + "embed_dtype": embed_dtype, + "endianness": endianness, + } + response = post_http_request(prompt=prompt, api_url=api_url) + body = response.content + + items = build_metadata_items( + embed_dtype=embed_dtype, + endianness=endianness, + shape=(embedding_size,), + n_request=len(input_texts), + ) + embedding = decode_pooling_output(items=items, body=body) + embedding = [x.to(torch.float32) for x in embedding] + embedding = torch.stack(embedding) print(embed_dtype, endianness, embedding.shape) diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py index ddba1c790ba8..f96338c47f0b 100644 --- a/tests/entrypoints/pooling/embed/test_online.py +++ b/tests/entrypoints/pooling/embed/test_online.py @@ -24,6 +24,7 @@ from vllm.utils.serial_utils import ( ENDIANNESS, MetadataItem, binary2tensor, + build_metadata_items, decode_pooling_output, ) @@ -344,6 +345,55 @@ async def test_bytes_embed_dtype_and_endianness( ) +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_bytes_only_embed_dtype_and_endianness( + server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str +): + input_texts = [ + "The best thing about vLLM is that it supports many different models", + ] * 2 + + responses_float = await client.embeddings.create( + input=input_texts, model=model_name, encoding_format="float" + ) + float_data = [d.embedding for d in responses_float.data] + embedding_size = len(float_data[0]) + + for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()): + for endianness in ENDIANNESS: + responses_bytes = requests.post( + server.url_for("/v1/embeddings"), + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "bytes_only", + "embed_dtype": embed_dtype, + "endianness": endianness, + }, + ) + + assert "metadata" not in responses_bytes.headers + body = responses_bytes.content + items = build_metadata_items( + embed_dtype=embed_dtype, + endianness=endianness, + shape=(embedding_size,), + n_request=len(input_texts), + ) + + bytes_data = decode_pooling_output(items=items, body=body) + bytes_data = [x.to(torch.float32).tolist() for x in bytes_data] + + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=bytes_data, + name_0="float_data", + name_1="bytes_data", + tol=1e-2, + ) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"]) diff --git a/tests/entrypoints/pooling/pooling/test_online.py b/tests/entrypoints/pooling/pooling/test_online.py index cc5c2f26f80f..33add5bdaef4 100644 --- a/tests/entrypoints/pooling/pooling/test_online.py +++ b/tests/entrypoints/pooling/pooling/test_online.py @@ -18,6 +18,7 @@ from vllm.utils.serial_utils import ( ENDIANNESS, MetadataItem, binary2tensor, + build_metadata_items, decode_pooling_output, ) @@ -352,6 +353,61 @@ async def test_bytes_embed_dtype_and_endianness( ) +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_bytes_only_embed_dtype_and_endianness( + server: RemoteOpenAIServer, model_name: str +): + input_texts = [ + "The best thing about vLLM is that it supports many different models", + ] * 2 + + url = server.url_for("pooling") + float_response = requests.post( + url, + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "float", + }, + ) + responses_float = PoolingResponse.model_validate(float_response.json()) + float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data] + n_tokens = responses_float.usage.prompt_tokens // len(input_texts) + + for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()): + for endianness in ENDIANNESS: + responses_bytes = requests.post( + url, + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "bytes_only", + "embed_dtype": embed_dtype, + "endianness": endianness, + }, + ) + + assert "metadata" not in responses_bytes.headers + body = responses_bytes.content + items = build_metadata_items( + embed_dtype=embed_dtype, + endianness=endianness, + shape=(n_tokens, 1), + n_request=len(input_texts), + ) + bytes_data = decode_pooling_output(items=items, body=body) + bytes_data = [x.to(torch.float32).view(-1).tolist() for x in bytes_data] + + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=bytes_data, + name_0="float_data", + name_1="bytes_data", + tol=1e-2, + ) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"]) diff --git a/vllm/entrypoints/pooling/embed/api_router.py b/vllm/entrypoints/pooling/embed/api_router.py index 5b10a32e79f8..24b0c8c2b3cf 100644 --- a/vllm/entrypoints/pooling/embed/api_router.py +++ b/vllm/entrypoints/pooling/embed/api_router.py @@ -59,8 +59,8 @@ async def create_embedding( return JSONResponse(content=generator.model_dump()) elif isinstance(generator, EmbeddingBytesResponse): return StreamingResponse( - content=generator.body, - headers={"metadata": generator.metadata}, + content=generator.content, + headers=generator.headers, media_type=generator.media_type, ) diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py index 7eb53e14d5d8..6a8f8c4434e5 100644 --- a/vllm/entrypoints/pooling/embed/protocol.py +++ b/vllm/entrypoints/pooling/embed/protocol.py @@ -203,6 +203,6 @@ class EmbeddingResponse(OpenAIBaseModel): class EmbeddingBytesResponse(OpenAIBaseModel): - body: list[bytes] - metadata: str + content: list[bytes] + headers: dict[str, str] | None = None media_type: str = "application/octet-stream" diff --git a/vllm/entrypoints/pooling/embed/serving.py b/vllm/entrypoints/pooling/embed/serving.py index 868a3cb017a6..aafc35489710 100644 --- a/vllm/entrypoints/pooling/embed/serving.py +++ b/vllm/entrypoints/pooling/embed/serving.py @@ -163,29 +163,35 @@ class EmbeddingMixin(OpenAIServing): usage=usage, ) - def encode_bytes(): - body, items, usage = encode_pooling_bytes( + def encode_bytes(bytes_only: bool) -> EmbeddingBytesResponse: + content, items, usage = encode_pooling_bytes( pooling_outputs=final_res_batch_checked, embed_dtype=embed_dtype, endianness=endianness, ) - metadata = { - "id": ctx.request_id, - "created": ctx.created_time, - "model": ctx.model_name, - "data": items, - "usage": usage, - } - return EmbeddingBytesResponse( - body=body, - metadata=json.dumps(metadata), + headers = ( + None + if bytes_only + else { + "metadata": json.dumps( + { + "id": ctx.request_id, + "created": ctx.created_time, + "model": ctx.model_name, + "data": items, + "usage": usage, + } + ) + } ) + return EmbeddingBytesResponse(content=content, headers=headers) + if encoding_format == "float" or encoding_format == "base64": return encode_float_base64() - elif encoding_format == "bytes": - return encode_bytes() + elif encoding_format == "bytes" or encoding_format == "bytes_only": + return encode_bytes(bytes_only=encoding_format == "bytes_only") else: assert_never(encoding_format) diff --git a/vllm/entrypoints/pooling/pooling/api_router.py b/vllm/entrypoints/pooling/pooling/api_router.py index 674da94d126c..4baaf8f30f6b 100644 --- a/vllm/entrypoints/pooling/pooling/api_router.py +++ b/vllm/entrypoints/pooling/pooling/api_router.py @@ -55,8 +55,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): return JSONResponse(content=generator.model_dump()) elif isinstance(generator, PoolingBytesResponse): return StreamingResponse( - content=generator.body, - headers={"metadata": generator.metadata}, + content=generator.content, + headers=generator.headers, media_type=generator.media_type, ) diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py index 364cd93738b8..76b361b49b66 100644 --- a/vllm/entrypoints/pooling/pooling/protocol.py +++ b/vllm/entrypoints/pooling/pooling/protocol.py @@ -143,6 +143,6 @@ class PoolingResponse(OpenAIBaseModel): class PoolingBytesResponse(OpenAIBaseModel): - body: list[bytes] - metadata: str + content: list[bytes] + headers: dict[str, str] | None = None media_type: str = "application/octet-stream" diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index 7fb767e26d01..57f1a6440cf7 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -314,29 +314,38 @@ class OpenAIServingPooling(OpenAIServing): usage=usage, ) - def encode_bytes(): - body, items, usage = encode_pooling_bytes( + def encode_bytes(bytes_only: bool) -> PoolingBytesResponse: + content, items, usage = encode_pooling_bytes( pooling_outputs=final_res_batch, embed_dtype=embed_dtype, endianness=endianness, ) - metadata = { - "id": request_id, - "created": created_time, - "model": model_name, - "data": items, - "usage": usage, - } + headers = ( + None + if bytes_only + else { + "metadata": json.dumps( + { + "id": request_id, + "created": created_time, + "model": model_name, + "data": items, + "usage": usage, + } + ) + } + ) + return PoolingBytesResponse( - body=body, - metadata=json.dumps(metadata), + content=content, + headers=headers, ) if encoding_format == "float" or encoding_format == "base64": return encode_float_base64() - elif encoding_format == "bytes": - return encode_bytes() + elif encoding_format == "bytes" or encoding_format == "bytes_only": + return encode_bytes(bytes_only=encoding_format == "bytes_only") else: assert_never(encoding_format) diff --git a/vllm/utils/serial_utils.py b/vllm/utils/serial_utils.py index a6d717e03d37..07db5eaf74c8 100644 --- a/vllm/utils/serial_utils.py +++ b/vllm/utils/serial_utils.py @@ -2,15 +2,19 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 import io +import math import sys from dataclasses import dataclass -from typing import Literal +from typing import TYPE_CHECKING, Any, Literal import numpy as np import torch from typing_extensions import assert_never -from vllm import PoolingRequestOutput +if TYPE_CHECKING: + from vllm import PoolingRequestOutput +else: + PoolingRequestOutput = Any sys_byteorder = sys.byteorder @@ -27,6 +31,14 @@ EMBED_DTYPE_TO_TORCH_DTYPE = { "fp8_e5m2": torch.float8_e5m2, } +EMBED_DTYPE_TO_N_BYTES = { + "float32": 4, + "float16": 2, + "bfloat16": 2, + "fp8_e4m3": 1, + "fp8_e5m2": 1, +} + EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = { "float32": torch.float32, @@ -50,7 +62,7 @@ ENDIANNESS = ["native", "big", "little"] EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] Endianness = Literal["native", "big", "little"] -EncodingFormat = Literal["float", "base64", "bytes"] +EncodingFormat = Literal["float", "base64", "bytes", "bytes_only"] def tensor2base64(x: torch.Tensor) -> str: @@ -114,7 +126,7 @@ def encode_pooling_output( elif encoding_format == "base64": embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness) return base64.b64encode(embedding_bytes).decode("utf-8") - elif encoding_format == "bytes": + elif encoding_format == "bytes" or encoding_format == "bytes_only": return tensor2binary(output.outputs.data, embed_dtype, endianness) assert_never(encoding_format) @@ -129,6 +141,29 @@ class MetadataItem: shape: tuple[int, ...] +def build_metadata_items( + embed_dtype: EmbedDType, + endianness: Endianness, + shape: tuple[int, ...], + n_request: int, +): + n_bytes = EMBED_DTYPE_TO_N_BYTES[embed_dtype] + size = math.prod(shape) + items = [ + MetadataItem( + index=i, + embed_dtype=embed_dtype, + endianness=endianness, + start=i * size * n_bytes, + end=(i + 1) * size * n_bytes, + shape=shape, + ) + for i in range(n_request) + ] + + return items + + def encode_pooling_bytes( pooling_outputs: list[PoolingRequestOutput], embed_dtype: EmbedDType,