[Frontend] Binary embedding response does not return metadata by setting encoding_format to bytes_only. (#30249)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
wang.yuqi 2025-12-08 20:01:21 +08:00 committed by GitHub
parent 408cf42f67
commit 2e660c2434
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 230 additions and 41 deletions

View File

@ -16,6 +16,7 @@ from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE, EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS, ENDIANNESS,
MetadataItem, MetadataItem,
build_metadata_items,
decode_pooling_output, decode_pooling_output,
) )
@ -38,6 +39,11 @@ def parse_args():
def main(args): def main(args):
api_url = f"http://{args.host}:{args.port}/v1/embeddings" api_url = f"http://{args.host}:{args.port}/v1/embeddings"
model_name = args.model model_name = args.model
embedding_size = 0
input_texts = [
"The best thing about vLLM is that it supports many different models",
] * 2
# The OpenAI client does not support the bytes encoding_format. # The OpenAI client does not support the bytes encoding_format.
# The OpenAI client does not support the embed_dtype and endianness parameters. # The OpenAI client does not support the embed_dtype and endianness parameters.
@ -45,7 +51,7 @@ def main(args):
for endianness in ENDIANNESS: for endianness in ENDIANNESS:
prompt = { prompt = {
"model": model_name, "model": model_name,
"input": "vLLM is great!", "input": input_texts,
"encoding_format": "bytes", "encoding_format": "bytes",
"embed_dtype": embed_dtype, "embed_dtype": embed_dtype,
"endianness": endianness, "endianness": endianness,
@ -57,7 +63,34 @@ def main(args):
embedding = decode_pooling_output(items=items, body=body) embedding = decode_pooling_output(items=items, body=body)
embedding = [x.to(torch.float32) for x in embedding] embedding = [x.to(torch.float32) for x in embedding]
embedding = torch.cat(embedding) embedding = torch.stack(embedding)
embedding_size = embedding.shape[-1]
print(embed_dtype, endianness, embedding.shape)
# The vllm server always sorts the returned embeddings in the order of input. So
# returning metadata is not necessary. You can set encoding_format to bytes_only
# to let the server not return metadata.
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
for endianness in ENDIANNESS:
prompt = {
"model": model_name,
"input": input_texts,
"encoding_format": "bytes_only",
"embed_dtype": embed_dtype,
"endianness": endianness,
}
response = post_http_request(prompt=prompt, api_url=api_url)
body = response.content
items = build_metadata_items(
embed_dtype=embed_dtype,
endianness=endianness,
shape=(embedding_size,),
n_request=len(input_texts),
)
embedding = decode_pooling_output(items=items, body=body)
embedding = [x.to(torch.float32) for x in embedding]
embedding = torch.stack(embedding)
print(embed_dtype, endianness, embedding.shape) print(embed_dtype, endianness, embedding.shape)

View File

@ -24,6 +24,7 @@ from vllm.utils.serial_utils import (
ENDIANNESS, ENDIANNESS,
MetadataItem, MetadataItem,
binary2tensor, binary2tensor,
build_metadata_items,
decode_pooling_output, decode_pooling_output,
) )
@ -344,6 +345,55 @@ async def test_bytes_embed_dtype_and_endianness(
) )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
] * 2
responses_float = await client.embeddings.create(
input=input_texts, model=model_name, encoding_format="float"
)
float_data = [d.embedding for d in responses_float.data]
embedding_size = len(float_data[0])
for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
for endianness in ENDIANNESS:
responses_bytes = requests.post(
server.url_for("/v1/embeddings"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "bytes_only",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)
assert "metadata" not in responses_bytes.headers
body = responses_bytes.content
items = build_metadata_items(
embed_dtype=embed_dtype,
endianness=endianness,
shape=(embedding_size,),
n_request=len(input_texts),
)
bytes_data = decode_pooling_output(items=items, body=body)
bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=bytes_data,
name_0="float_data",
name_1="bytes_data",
tol=1e-2,
)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"]) @pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])

View File

@ -18,6 +18,7 @@ from vllm.utils.serial_utils import (
ENDIANNESS, ENDIANNESS,
MetadataItem, MetadataItem,
binary2tensor, binary2tensor,
build_metadata_items,
decode_pooling_output, decode_pooling_output,
) )
@ -352,6 +353,61 @@ async def test_bytes_embed_dtype_and_endianness(
) )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
server: RemoteOpenAIServer, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
] * 2
url = server.url_for("pooling")
float_response = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "float",
},
)
responses_float = PoolingResponse.model_validate(float_response.json())
float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
n_tokens = responses_float.usage.prompt_tokens // len(input_texts)
for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
for endianness in ENDIANNESS:
responses_bytes = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "bytes_only",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)
assert "metadata" not in responses_bytes.headers
body = responses_bytes.content
items = build_metadata_items(
embed_dtype=embed_dtype,
endianness=endianness,
shape=(n_tokens, 1),
n_request=len(input_texts),
)
bytes_data = decode_pooling_output(items=items, body=body)
bytes_data = [x.to(torch.float32).view(-1).tolist() for x in bytes_data]
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=bytes_data,
name_0="float_data",
name_1="bytes_data",
tol=1e-2,
)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"]) @pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])

View File

@ -59,8 +59,8 @@ async def create_embedding(
return JSONResponse(content=generator.model_dump()) return JSONResponse(content=generator.model_dump())
elif isinstance(generator, EmbeddingBytesResponse): elif isinstance(generator, EmbeddingBytesResponse):
return StreamingResponse( return StreamingResponse(
content=generator.body, content=generator.content,
headers={"metadata": generator.metadata}, headers=generator.headers,
media_type=generator.media_type, media_type=generator.media_type,
) )

View File

@ -203,6 +203,6 @@ class EmbeddingResponse(OpenAIBaseModel):
class EmbeddingBytesResponse(OpenAIBaseModel): class EmbeddingBytesResponse(OpenAIBaseModel):
body: list[bytes] content: list[bytes]
metadata: str headers: dict[str, str] | None = None
media_type: str = "application/octet-stream" media_type: str = "application/octet-stream"

View File

@ -163,29 +163,35 @@ class EmbeddingMixin(OpenAIServing):
usage=usage, usage=usage,
) )
def encode_bytes(): def encode_bytes(bytes_only: bool) -> EmbeddingBytesResponse:
body, items, usage = encode_pooling_bytes( content, items, usage = encode_pooling_bytes(
pooling_outputs=final_res_batch_checked, pooling_outputs=final_res_batch_checked,
embed_dtype=embed_dtype, embed_dtype=embed_dtype,
endianness=endianness, endianness=endianness,
) )
metadata = { headers = (
"id": ctx.request_id, None
"created": ctx.created_time, if bytes_only
"model": ctx.model_name, else {
"data": items, "metadata": json.dumps(
"usage": usage, {
} "id": ctx.request_id,
return EmbeddingBytesResponse( "created": ctx.created_time,
body=body, "model": ctx.model_name,
metadata=json.dumps(metadata), "data": items,
"usage": usage,
}
)
}
) )
return EmbeddingBytesResponse(content=content, headers=headers)
if encoding_format == "float" or encoding_format == "base64": if encoding_format == "float" or encoding_format == "base64":
return encode_float_base64() return encode_float_base64()
elif encoding_format == "bytes": elif encoding_format == "bytes" or encoding_format == "bytes_only":
return encode_bytes() return encode_bytes(bytes_only=encoding_format == "bytes_only")
else: else:
assert_never(encoding_format) assert_never(encoding_format)

View File

@ -55,8 +55,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
return JSONResponse(content=generator.model_dump()) return JSONResponse(content=generator.model_dump())
elif isinstance(generator, PoolingBytesResponse): elif isinstance(generator, PoolingBytesResponse):
return StreamingResponse( return StreamingResponse(
content=generator.body, content=generator.content,
headers={"metadata": generator.metadata}, headers=generator.headers,
media_type=generator.media_type, media_type=generator.media_type,
) )

View File

@ -143,6 +143,6 @@ class PoolingResponse(OpenAIBaseModel):
class PoolingBytesResponse(OpenAIBaseModel): class PoolingBytesResponse(OpenAIBaseModel):
body: list[bytes] content: list[bytes]
metadata: str headers: dict[str, str] | None = None
media_type: str = "application/octet-stream" media_type: str = "application/octet-stream"

View File

@ -314,29 +314,38 @@ class OpenAIServingPooling(OpenAIServing):
usage=usage, usage=usage,
) )
def encode_bytes(): def encode_bytes(bytes_only: bool) -> PoolingBytesResponse:
body, items, usage = encode_pooling_bytes( content, items, usage = encode_pooling_bytes(
pooling_outputs=final_res_batch, pooling_outputs=final_res_batch,
embed_dtype=embed_dtype, embed_dtype=embed_dtype,
endianness=endianness, endianness=endianness,
) )
metadata = { headers = (
"id": request_id, None
"created": created_time, if bytes_only
"model": model_name, else {
"data": items, "metadata": json.dumps(
"usage": usage, {
} "id": request_id,
"created": created_time,
"model": model_name,
"data": items,
"usage": usage,
}
)
}
)
return PoolingBytesResponse( return PoolingBytesResponse(
body=body, content=content,
metadata=json.dumps(metadata), headers=headers,
) )
if encoding_format == "float" or encoding_format == "base64": if encoding_format == "float" or encoding_format == "base64":
return encode_float_base64() return encode_float_base64()
elif encoding_format == "bytes": elif encoding_format == "bytes" or encoding_format == "bytes_only":
return encode_bytes() return encode_bytes(bytes_only=encoding_format == "bytes_only")
else: else:
assert_never(encoding_format) assert_never(encoding_format)

View File

@ -2,15 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64 import base64
import io import io
import math
import sys import sys
from dataclasses import dataclass from dataclasses import dataclass
from typing import Literal from typing import TYPE_CHECKING, Any, Literal
import numpy as np import numpy as np
import torch import torch
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm import PoolingRequestOutput if TYPE_CHECKING:
from vllm import PoolingRequestOutput
else:
PoolingRequestOutput = Any
sys_byteorder = sys.byteorder sys_byteorder = sys.byteorder
@ -27,6 +31,14 @@ EMBED_DTYPE_TO_TORCH_DTYPE = {
"fp8_e5m2": torch.float8_e5m2, "fp8_e5m2": torch.float8_e5m2,
} }
EMBED_DTYPE_TO_N_BYTES = {
"float32": 4,
"float16": 2,
"bfloat16": 2,
"fp8_e4m3": 1,
"fp8_e5m2": 1,
}
EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = { EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = {
"float32": torch.float32, "float32": torch.float32,
@ -50,7 +62,7 @@ ENDIANNESS = ["native", "big", "little"]
EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"]
Endianness = Literal["native", "big", "little"] Endianness = Literal["native", "big", "little"]
EncodingFormat = Literal["float", "base64", "bytes"] EncodingFormat = Literal["float", "base64", "bytes", "bytes_only"]
def tensor2base64(x: torch.Tensor) -> str: def tensor2base64(x: torch.Tensor) -> str:
@ -114,7 +126,7 @@ def encode_pooling_output(
elif encoding_format == "base64": elif encoding_format == "base64":
embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness) embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness)
return base64.b64encode(embedding_bytes).decode("utf-8") return base64.b64encode(embedding_bytes).decode("utf-8")
elif encoding_format == "bytes": elif encoding_format == "bytes" or encoding_format == "bytes_only":
return tensor2binary(output.outputs.data, embed_dtype, endianness) return tensor2binary(output.outputs.data, embed_dtype, endianness)
assert_never(encoding_format) assert_never(encoding_format)
@ -129,6 +141,29 @@ class MetadataItem:
shape: tuple[int, ...] shape: tuple[int, ...]
def build_metadata_items(
embed_dtype: EmbedDType,
endianness: Endianness,
shape: tuple[int, ...],
n_request: int,
):
n_bytes = EMBED_DTYPE_TO_N_BYTES[embed_dtype]
size = math.prod(shape)
items = [
MetadataItem(
index=i,
embed_dtype=embed_dtype,
endianness=endianness,
start=i * size * n_bytes,
end=(i + 1) * size * n_bytes,
shape=shape,
)
for i in range(n_request)
]
return items
def encode_pooling_bytes( def encode_pooling_bytes(
pooling_outputs: list[PoolingRequestOutput], pooling_outputs: list[PoolingRequestOutput],
embed_dtype: EmbedDType, embed_dtype: EmbedDType,