diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 91345e0ae7785..3b6da20d5f0fe 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -6,10 +6,16 @@ python examples/online_serving/pooling/cohere_rerank_client.py ``` -## Embedding embed_dtype usage +## Embedding requests base64 encoding_format usage ```bash -python examples/online_serving/pooling/embedding_embed_dtype_client.py +python examples/online_serving/pooling/embedding_requests_base64_client.py +``` + +## Embedding requests bytes encoding_format usage + +```bash +python examples/online_serving/pooling/embedding_requests_bytes_client.py ``` ## Jinaai rerank usage diff --git a/examples/online_serving/pooling/embedding_embed_dtype_client.py b/examples/online_serving/pooling/embedding_requests_base64_client.py similarity index 50% rename from examples/online_serving/pooling/embedding_embed_dtype_client.py rename to examples/online_serving/pooling/embedding_requests_base64_client.py index c769fe613806e..4c2399b58c11f 100644 --- a/examples/online_serving/pooling/embedding_embed_dtype_client.py +++ b/examples/online_serving/pooling/embedding_requests_base64_client.py @@ -12,7 +12,11 @@ import base64 import requests import torch -from vllm.entrypoints.openai.protocol import EMBED_DTYPE_TO_TORCH_DTYPE +from vllm.utils.serial_utils import ( + EMBED_DTYPE_TO_TORCH_DTYPE, + ENDIANNESS, + binary2tensor, +) def post_http_request(prompt: dict, api_url: str) -> requests.Response: @@ -34,24 +38,25 @@ def main(args): api_url = f"http://{args.host}:{args.port}/v1/embeddings" model_name = args.model - for embed_dtype, torch_dtype in EMBED_DTYPE_TO_TORCH_DTYPE.items(): - prompt = { - "model": model_name, - "input": "vLLM is great!", - "encoding_format": "base64", - "embed_dtype": embed_dtype, - } - response = post_http_request(prompt=prompt, api_url=api_url) + # The OpenAI client does not support the embed_dtype and endianness parameters. + for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE: + for endianness in ENDIANNESS: + prompt = { + "model": model_name, + "input": "vLLM is great!", + "encoding_format": "base64", + "embed_dtype": embed_dtype, + "endianness": endianness, + } + response = post_http_request(prompt=prompt, api_url=api_url) - embedding = [] - for data in response.json()["data"]: - embedding.append( - torch.frombuffer( - base64.b64decode(data["embedding"]), dtype=torch_dtype - ).to(torch.float32) - ) - embedding = torch.cat(embedding) - print(embed_dtype, embedding.shape) + embedding = [] + for data in response.json()["data"]: + binary = base64.b64decode(data["embedding"]) + tensor = binary2tensor(binary, (-1,), embed_dtype, endianness) + embedding.append(tensor.to(torch.float32)) + embedding = torch.cat(embedding) + print(embed_dtype, endianness, embedding.shape) if __name__ == "__main__": diff --git a/examples/online_serving/pooling/embedding_requests_bytes_client.py b/examples/online_serving/pooling/embedding_requests_bytes_client.py new file mode 100644 index 0000000000000..c2832f1b54ce7 --- /dev/null +++ b/examples/online_serving/pooling/embedding_requests_bytes_client.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Example Python client for embedding API using vLLM API server +NOTE: + start a supported embeddings model server with `vllm serve`, e.g. 
+ vllm serve intfloat/e5-small +""" + +import argparse +import json + +import requests +import torch + +from vllm.utils.serial_utils import ( + EMBED_DTYPE_TO_TORCH_DTYPE, + ENDIANNESS, + MetadataItem, + decode_pooling_output, +) + + +def post_http_request(prompt: dict, api_url: str) -> requests.Response: + headers = {"User-Agent": "Test Client"} + response = requests.post(api_url, headers=headers, json=prompt) + return response + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model", type=str, default="intfloat/e5-small") + + return parser.parse_args() + + +def main(args): + api_url = f"http://{args.host}:{args.port}/v1/embeddings" + model_name = args.model + + # The OpenAI client does not support the bytes encoding_format. + # The OpenAI client does not support the embed_dtype and endianness parameters. + for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE: + for endianness in ENDIANNESS: + prompt = { + "model": model_name, + "input": "vLLM is great!", + "encoding_format": "bytes", + "embed_dtype": embed_dtype, + "endianness": endianness, + } + response = post_http_request(prompt=prompt, api_url=api_url) + metadata = json.loads(response.headers["metadata"]) + body = response.content + items = [MetadataItem(**x) for x in metadata["data"]] + + embedding = decode_pooling_output(items=items, body=body) + embedding = [x.to(torch.float32) for x in embedding] + embedding = torch.cat(embedding) + print(embed_dtype, endianness, embedding.shape) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index ab8ca9d68e0e7..b3f12283fdbdf 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 +import json import numpy as np import openai @@ -15,11 +16,17 @@ from tests.models.language.pooling.embed_utils import run_embedding_correctness_ from tests.models.utils import check_embeddings_close from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import ( - EMBED_DTYPE_TO_TORCH_DTYPE, EmbeddingResponse, PoolingResponse, ) from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.utils.serial_utils import ( + EMBED_DTYPE_TO_TORCH_DTYPE, + ENDIANNESS, + MetadataItem, + binary2tensor, + decode_pooling_output, +) MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 @@ -250,8 +257,8 @@ async def test_batch_base64_embedding( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_base64_embed_dtype( - hf_model, server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str +async def test_base64_embed_dtype_and_endianness( + server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str ): input_texts = [ "The best thing about vLLM is that it supports many different models", @@ -262,44 +269,86 @@ async def test_base64_embed_dtype( ) float_data = [d.embedding for d in responses_float.data] - for embed_dtype, torch_dtype in EMBED_DTYPE_TO_TORCH_DTYPE.items(): - responses_base64 = requests.post( - server.url_for("/v1/embeddings"), - json={ - "model": model_name, 
- "input": input_texts, - "encoding_format": "base64", - "embed_dtype": embed_dtype, - }, - ) - - base64_data = [] - for data in responses_base64.json()["data"]: - base64_data.append( - torch.frombuffer(base64.b64decode(data["embedding"]), dtype=torch_dtype) - .to(torch.float32) - .tolist() + for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE: + for endianness in ENDIANNESS: + responses_base64 = requests.post( + server.url_for("/v1/embeddings"), + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "base64", + "embed_dtype": embed_dtype, + "endianness": endianness, + }, ) - check_embeddings_close( - embeddings_0_lst=float_data, - embeddings_1_lst=base64_data, - name_0="float_data", - name_1="base64_data", - tol=1e-2, - ) + base64_data = [] + for data in responses_base64.json()["data"]: + binary = base64.b64decode(data["embedding"]) + tensor = binary2tensor(binary, (-1,), embed_dtype, endianness) + base64_data.append(tensor.to(torch.float32).tolist()) + + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=base64_data, + name_0="float_data", + name_1="base64_data", + tol=1e-2, + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_base64_embed_dtype_not_supported( - hf_model, server: RemoteOpenAIServer, model_name: str +async def test_bytes_embed_dtype_and_endianness( + server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str ): input_texts = [ "The best thing about vLLM is that it supports many different models", ] - bad_embed_dtype = "bad_embed_dtype" + responses_float = await client.embeddings.create( + input=input_texts, model=model_name, encoding_format="float" + ) + float_data = [d.embedding for d in responses_float.data] + + for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()): + for endianness in ENDIANNESS: + responses_bytes = requests.post( + server.url_for("/v1/embeddings"), + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "bytes", + "embed_dtype": embed_dtype, + "endianness": endianness, + }, + ) + + metadata = json.loads(responses_bytes.headers["metadata"]) + body = responses_bytes.content + items = [MetadataItem(**x) for x in metadata["data"]] + + bytes_data = decode_pooling_output(items=items, body=body) + bytes_data = [x.to(torch.float32).tolist() for x in bytes_data] + + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=bytes_data, + name_0="float_data", + name_1="bytes_data", + tol=1e-2, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"]) +async def test_params_not_supported( + server: RemoteOpenAIServer, model_name: str, param_name: str +): + input_texts = [ + "The best thing about vLLM is that it supports many different models", + ] responses_base64 = requests.post( server.url_for("/v1/embeddings"), @@ -307,14 +356,13 @@ async def test_base64_embed_dtype_not_supported( "model": model_name, "input": input_texts, "encoding_format": "base64", - "embed_dtype": bad_embed_dtype, + param_name: f"bad_{param_name}", }, ) assert responses_base64.status_code == 400 - assert responses_base64.json()["error"]["message"].startswith( - f"embed_dtype={bad_embed_dtype!r} is not supported." 
- ) + assert "literal_error" in responses_base64.json()["error"]["message"] + assert f"bad_{param_name}" in responses_base64.json()["error"]["message"] @pytest.mark.asyncio diff --git a/tests/entrypoints/pooling/openai/test_pooling.py b/tests/entrypoints/pooling/openai/test_pooling.py index e4e395f9eb6cf..4b20c5b0fa84d 100644 --- a/tests/entrypoints/pooling/openai/test_pooling.py +++ b/tests/entrypoints/pooling/openai/test_pooling.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 +import json import numpy as np import pytest @@ -10,8 +11,15 @@ import torch from tests.models.utils import check_embeddings_close from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.openai.protocol import EMBED_DTYPE_TO_TORCH_DTYPE, PoolingResponse +from vllm.entrypoints.openai.protocol import PoolingResponse from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.utils.serial_utils import ( + EMBED_DTYPE_TO_TORCH_DTYPE, + ENDIANNESS, + MetadataItem, + binary2tensor, + decode_pooling_output, +) MODEL_NAME = "internlm/internlm2-1_8b-reward" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 @@ -251,7 +259,9 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer, model_name: str) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_base64_embed_dtype(server: RemoteOpenAIServer, model_name: str): +async def test_base64_embed_dtype_and_endianness( + server: RemoteOpenAIServer, model_name: str +): input_texts = [ "The best thing about vLLM is that it supports many different models", ] @@ -268,44 +278,93 @@ async def test_base64_embed_dtype(server: RemoteOpenAIServer, model_name: str): responses_float = PoolingResponse.model_validate(float_response.json()) float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data] - for embed_dtype, torch_dtype in EMBED_DTYPE_TO_TORCH_DTYPE.items(): - responses_base64 = requests.post( - url, - json={ - "model": model_name, - "input": input_texts, - "encoding_format": "base64", - "embed_dtype": embed_dtype, - }, - ) - - base64_data = [] - for data in responses_base64.json()["data"]: - base64_data.append( - torch.frombuffer(base64.b64decode(data["data"]), dtype=torch_dtype) - .to(torch.float32) - .tolist() + for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE: + for endianness in ENDIANNESS: + responses_base64 = requests.post( + url, + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "base64", + "embed_dtype": embed_dtype, + "endianness": endianness, + }, ) - check_embeddings_close( - embeddings_0_lst=float_data, - embeddings_1_lst=base64_data, - name_0="float_data", - name_1="base64_data", - tol=1e-2, - ) + base64_data = [] + for data in responses_base64.json()["data"]: + binary = base64.b64decode(data["data"]) + tensor = binary2tensor(binary, (-1,), embed_dtype, endianness) + base64_data.append(tensor.to(torch.float32).tolist()) + + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=base64_data, + name_0="float_data", + name_1="base64_data", + tol=1e-2, + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_base64_embed_dtype_not_supported( +async def test_bytes_embed_dtype_and_endianness( server: RemoteOpenAIServer, model_name: str ): input_texts = [ "The best thing about vLLM is that it supports many different models", ] - bad_embed_dtype = "bad_embed_dtype" + url = 
server.url_for("pooling") + float_response = requests.post( + url, + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "float", + }, + ) + responses_float = PoolingResponse.model_validate(float_response.json()) + float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data] + + for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()): + for endianness in ENDIANNESS: + responses_bytes = requests.post( + url, + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "bytes", + "embed_dtype": embed_dtype, + "endianness": endianness, + }, + ) + + metadata = json.loads(responses_bytes.headers["metadata"]) + body = responses_bytes.content + items = [MetadataItem(**x) for x in metadata["data"]] + + bytes_data = decode_pooling_output(items=items, body=body) + bytes_data = [x.to(torch.float32).view(-1).tolist() for x in bytes_data] + + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=bytes_data, + name_0="float_data", + name_1="bytes_data", + tol=1e-2, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"]) +async def test_params_not_supported( + server: RemoteOpenAIServer, model_name: str, param_name: str +): + input_texts = [ + "The best thing about vLLM is that it supports many different models", + ] responses_base64 = requests.post( server.url_for("pooling"), @@ -313,14 +372,13 @@ async def test_base64_embed_dtype_not_supported( "model": model_name, "input": input_texts, "encoding_format": "base64", - "embed_dtype": bad_embed_dtype, + param_name: f"bad_{param_name}", }, ) assert responses_base64.status_code == 400 - assert responses_base64.json()["error"]["message"].startswith( - f"embed_dtype={bad_embed_dtype!r} is not supported." 
- ) + assert "literal_error" in responses_base64.json()["error"]["message"] + assert f"bad_{param_name}" in responses_base64.json()["error"]["message"] @pytest.mark.asyncio diff --git a/tests/utils_/test_serial_utils.py b/tests/utils_/test_serial_utils.py new file mode 100644 index 0000000000000..7f2c1bdacf90e --- /dev/null +++ b/tests/utils_/test_serial_utils.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from tests.models.utils import check_embeddings_close +from vllm.utils.serial_utils import ( + EMBED_DTYPE_TO_TORCH_DTYPE, + ENDIANNESS, + binary2tensor, + tensor2binary, +) + + +@pytest.mark.parametrize("endianness", ENDIANNESS) +@pytest.mark.parametrize("embed_dtype", EMBED_DTYPE_TO_TORCH_DTYPE.keys()) +@torch.inference_mode +def test_encode_and_decode(embed_dtype: str, endianness: str): + for i in range(10): + tensor = torch.rand(2, 3, 5, 7, 11, 13, device="cpu", dtype=torch.float32) + shape = tensor.shape + binary = tensor2binary(tensor, embed_dtype, endianness) + new_tensor = binary2tensor(binary, shape, embed_dtype, endianness).to( + torch.float32 + ) + + if embed_dtype in ["float32", "float16"]: + torch.testing.assert_close(tensor, new_tensor, atol=0.001, rtol=0.001) + elif embed_dtype == "bfloat16": + torch.testing.assert_close(tensor, new_tensor, atol=0.01, rtol=0.01) + else: # for fp8 + torch.testing.assert_close(tensor, new_tensor, atol=0.1, rtol=0.1) + + check_embeddings_close( + embeddings_0_lst=tensor.view(1, -1), + embeddings_1_lst=new_tensor.view(1, -1), + name_0="gt", + name_1="new", + tol=1e-2, + ) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 555c95effd1dd..f84a530fd004e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -58,12 +58,14 @@ from vllm.entrypoints.openai.protocol import ( CompletionResponse, DetokenizeRequest, DetokenizeResponse, + EmbeddingBytesResponse, EmbeddingRequest, EmbeddingResponse, ErrorInfo, ErrorResponse, IOProcessorResponse, LoadLoRAAdapterRequest, + PoolingBytesResponse, PoolingRequest, PoolingResponse, RerankRequest, @@ -681,7 +683,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request): ) @with_cancellation @load_aware_call -async def create_embedding(request: EmbeddingRequest, raw_request: Request): +async def create_embedding( + request: EmbeddingRequest, + raw_request: Request, +): handler = embedding(raw_request) if handler is None: return base(raw_request).create_error_response( @@ -701,6 +706,12 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): ) elif isinstance(generator, EmbeddingResponse): return JSONResponse(content=generator.model_dump()) + elif isinstance(generator, EmbeddingBytesResponse): + return StreamingResponse( + content=generator.body, + headers={"metadata": generator.metadata}, + media_type=generator.media_type, + ) assert_never(generator) @@ -733,6 +744,12 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): ) elif isinstance(generator, (PoolingResponse, IOProcessorResponse)): return JSONResponse(content=generator.model_dump()) + elif isinstance(generator, PoolingBytesResponse): + return StreamingResponse( + content=generator.body, + headers={"metadata": generator.metadata}, + media_type=generator.media_type, + ) assert_never(generator) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 
0d27e6707c233..6d0149960f3cc 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -48,6 +48,12 @@ from openai.types.responses.response_reasoning_item import ( Content as ResponseReasoningTextContent, ) +from vllm.utils.serial_utils import ( + EmbedDType, + EncodingFormat, + Endianness, +) + # Backward compatibility for OpenAI client versions try: # For older openai versions (< 1.100.0) from openai.types.responses import ResponseTextConfig @@ -84,18 +90,6 @@ from vllm.sampling_params import ( from vllm.utils import random_uuid from vllm.utils.import_utils import resolve_obj_by_qualname -EMBED_DTYPE_TO_TORCH_DTYPE = { - "float32": torch.float32, - "float16": torch.float16, - "bfloat16": torch.bfloat16, - # I'm not sure if other platforms' CPUs support the fp8 data format. - # EMBED_DTYPE only uses the fp8 data representation, - # does not use fp8 computation, and only occurs on the CPU. - # Apologize for any possible break. - "fp8_e4m3": torch.float8_e4m3fn, - "fp8_e5m2": torch.float8_e5m2, -} - logger = init_logger(__name__) _LONG_INFO = torch.iinfo(torch.long) @@ -1531,7 +1525,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): # https://platform.openai.com/docs/api-reference/embeddings model: str | None = None input: list[int] | list[list[int]] | str | list[str] - encoding_format: Literal["float", "base64"] = "float" + encoding_format: EncodingFormat = "float" dimensions: int | None = None user: str | None = None truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None @@ -1564,11 +1558,20 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): default=None, description="Whether to normalize the embeddings outputs. Default is True.", ) - embed_dtype: str = Field( + embed_dtype: EmbedDType = Field( default="float32", description=( - "What dtype to use for base64 encoding. Default to using " - "float32 for base64 encoding to match the OpenAI python client behavior." + "What dtype to use for encoding. Default to using float32 for base64 " + "encoding to match the OpenAI python client behavior. " + "This parameter will affect base64 and binary_response." + ), + ) + endianness: Endianness = Field( + default="native", + description=( + "What endianness to use for encoding. Default to using native for " + "base64 encoding to match the OpenAI python client behavior." + "This parameter will affect base64 and binary_response." ), ) # --8<-- [end:embedding-extra-params] @@ -1585,7 +1588,7 @@ class EmbeddingChatRequest(OpenAIBaseModel): model: str | None = None messages: list[ChatCompletionMessageParam] - encoding_format: Literal["float", "base64"] = "float" + encoding_format: EncodingFormat = "float" dimensions: int | None = None user: str | None = None truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None @@ -1650,11 +1653,20 @@ class EmbeddingChatRequest(OpenAIBaseModel): default=None, description="Whether to normalize the embeddings outputs. Default is True.", ) - embed_dtype: str = Field( + embed_dtype: EmbedDType = Field( default="float32", description=( - "Which dtype to use for base64 encoding. Defaults to float32 " - "to match OpenAI API." + "What dtype to use for encoding. Default to using float32 for base64 " + "encoding to match the OpenAI python client behavior. " + "This parameter will affect base64 and binary_response." + ), + ) + endianness: Endianness = Field( + default="native", + description=( + "What endianness to use for encoding. 
Default to using native for " + "base64 encoding to match the OpenAI python client behavior." + "This parameter will affect base64 and binary_response." ), ) # --8<-- [end:chat-embedding-extra-params] @@ -1701,11 +1713,21 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]): """ activation: bool = False - embed_dtype: str = Field( + encoding_format: EncodingFormat = "float" + embed_dtype: EmbedDType = Field( default="float32", description=( - "What dtype to use for base64 encoding. Default to using " - "float32 for base64 encoding to match the OpenAI python client behavior." + "What dtype to use for encoding. Default to using float32 for base64 " + "encoding to match the OpenAI python client behavior. " + "This parameter will affect base64 and binary_response." + ), + ) + endianness: Endianness = Field( + default="native", + description=( + "What endianness to use for encoding. Default to using native for " + "base64 encoding to match the OpenAI python client behavior." + "This parameter will affect base64 and binary_response." ), ) @@ -1905,6 +1927,12 @@ class EmbeddingResponse(OpenAIBaseModel): usage: UsageInfo +class EmbeddingBytesResponse(OpenAIBaseModel): + body: list[bytes] + metadata: str + media_type: str = "application/octet-stream" + + class PoolingResponseData(OpenAIBaseModel): index: int object: str = "pooling" @@ -1920,6 +1948,12 @@ class PoolingResponse(OpenAIBaseModel): usage: UsageInfo +class PoolingBytesResponse(OpenAIBaseModel): + body: list[bytes] + metadata: str + media_type: str = "application/octet-stream" + + class ScoreResponseData(OpenAIBaseModel): index: int object: str = "score" diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 55f58e7757faf..3308198c8bd19 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -1,18 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import json from collections.abc import AsyncGenerator, Mapping from typing import Any, Final, cast import torch from fastapi import Request -from typing_extensions import override +from fastapi.responses import Response +from typing_extensions import assert_never, override from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( - EMBED_DTYPE_TO_TORCH_DTYPE, + EmbeddingBytesResponse, EmbeddingChatRequest, EmbeddingCompletionRequest, EmbeddingRequest, @@ -28,7 +29,6 @@ from vllm.entrypoints.openai.serving_engine import ( TextTokensPrompt, ) from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.openai.utils import encoding_pooling_output from vllm.entrypoints.renderer import RenderConfig from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger @@ -41,6 +41,13 @@ from vllm.outputs import ( from vllm.pooling_params import PoolingParams from vllm.utils.async_utils import merge_async_iterators from vllm.utils.collection_utils import chunk_list +from vllm.utils.serial_utils import ( + EmbedDType, + EncodingFormat, + Endianness, + encode_pooling_bytes, + encode_pooling_output, +) logger = init_logger(__name__) @@ -68,12 +75,6 @@ class EmbeddingMixin(OpenAIServing): ) -> ErrorResponse | None: ctx = cast(EmbeddingServeContext, ctx) try: - if ctx.request.embed_dtype not in 
EMBED_DTYPE_TO_TORCH_DTYPE: - return self.create_error_response( - f"embed_dtype={ctx.request.embed_dtype!r} is not supported. " - f"Supported types: {EMBED_DTYPE_TO_TORCH_DTYPE.keys()}" - ) - ctx.lora_request = self._maybe_get_adapters(ctx.request) tokenizer = await self.engine_client.get_tokenizer() @@ -121,36 +122,70 @@ class EmbeddingMixin(OpenAIServing): def _build_response( self, ctx: ServeContext, - ) -> EmbeddingResponse | ErrorResponse: - items: list[EmbeddingResponseData] = [] - num_prompt_tokens = 0 - + ) -> EmbeddingResponse | Response | ErrorResponse: final_res_batch_checked = cast(list[PoolingRequestOutput], ctx.final_res_batch) - for idx, final_res in enumerate(final_res_batch_checked): - item = EmbeddingResponseData( - index=idx, - embedding=encoding_pooling_output( - final_res, ctx.request.encoding_format, ctx.request.embed_dtype - ), + encoding_format: EncodingFormat = ctx.request.encoding_format + embed_dtype: EmbedDType = ctx.request.embed_dtype + endianness: Endianness = ctx.request.endianness + + def encode_float_base64(): + items: list[EmbeddingResponseData] = [] + num_prompt_tokens = 0 + + for idx, final_res in enumerate(final_res_batch_checked): + item = EmbeddingResponseData( + index=idx, + embedding=encode_pooling_output( + final_res, + encoding_format=encoding_format, + embed_dtype=embed_dtype, + endianness=endianness, + ), + ) + prompt_token_ids = final_res.prompt_token_ids + + items.append(item) + num_prompt_tokens += len(prompt_token_ids) + + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + total_tokens=num_prompt_tokens, ) - prompt_token_ids = final_res.prompt_token_ids - items.append(item) - num_prompt_tokens += len(prompt_token_ids) + return EmbeddingResponse( + id=ctx.request_id, + created=ctx.created_time, + model=ctx.model_name, + data=items, + usage=usage, + ) - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - total_tokens=num_prompt_tokens, - ) + def encode_bytes(): + body, items, usage = encode_pooling_bytes( + pooling_outputs=final_res_batch_checked, + embed_dtype=embed_dtype, + endianness=endianness, + ) - return EmbeddingResponse( - id=ctx.request_id, - created=ctx.created_time, - model=ctx.model_name, - data=items, - usage=usage, - ) + metadata = { + "id": ctx.request_id, + "created": ctx.created_time, + "model": ctx.model_name, + "data": items, + "usage": usage, + } + return EmbeddingBytesResponse( + body=body, + metadata=json.dumps(metadata), + ) + + if encoding_format == "float" or encoding_format == "base64": + return encode_float_base64() + elif encoding_format == "bytes": + return encode_bytes() + else: + assert_never(encoding_format) def _get_max_position_embeddings(self) -> int: """Get the model's effective maximum sequence length for chunking.""" diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 7a27348da35b8..5d4a638808b1e 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -2,14 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -import base64 +import json import time from collections.abc import AsyncGenerator -from typing import Final, Literal, cast +from typing import Final, cast import jinja2 -import numpy as np -import torch from fastapi import Request from typing_extensions import assert_never @@ -17,10 +15,10 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import 
RequestLogger from vllm.entrypoints.openai.protocol import ( - EMBED_DTYPE_TO_TORCH_DTYPE, ErrorResponse, IOProcessorRequest, IOProcessorResponse, + PoolingBytesResponse, PoolingChatRequest, PoolingCompletionRequest, PoolingRequest, @@ -30,33 +28,23 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.openai.utils import encoding_pooling_output from vllm.entrypoints.renderer import RenderConfig from vllm.entrypoints.utils import _validate_truncation_size from vllm.logger import init_logger -from vllm.outputs import PoolingOutput, PoolingRequestOutput +from vllm.outputs import PoolingRequestOutput from vllm.tasks import SupportedTask from vllm.utils.async_utils import merge_async_iterators +from vllm.utils.serial_utils import ( + EmbedDType, + EncodingFormat, + Endianness, + encode_pooling_bytes, + encode_pooling_output, +) logger = init_logger(__name__) -def _get_data( - output: PoolingOutput, - encoding_format: Literal["float", "base64"], -) -> list[float] | str: - if encoding_format == "float": - return output.data.tolist() - elif encoding_format == "base64": - # Force to use float32 for base64 encoding - # to match the OpenAI python client behavior - pt_float32 = output.data.to(dtype=torch.float32) - pooling_bytes = np.array(pt_float32, dtype="float32").tobytes() - return base64.b64encode(pooling_bytes).decode("utf-8") - - assert_never(encoding_format) - - class OpenAIServingPooling(OpenAIServing): def __init__( self, @@ -86,7 +74,7 @@ class OpenAIServingPooling(OpenAIServing): self, request: PoolingRequest, raw_request: Request | None = None, - ) -> PoolingResponse | IOProcessorResponse | ErrorResponse: + ) -> PoolingResponse | IOProcessorResponse | PoolingBytesResponse | ErrorResponse: """ See https://platform.openai.com/docs/api-reference/embeddings/create for the API specification. This API mimics the OpenAI Embedding API. @@ -95,12 +83,6 @@ class OpenAIServingPooling(OpenAIServing): if error_check_ret is not None: return error_check_ret - if request.embed_dtype not in EMBED_DTYPE_TO_TORCH_DTYPE: - return self.create_error_response( - f"embed_dtype={request.embed_dtype!r} is not supported. 
" - f"Supported types: {EMBED_DTYPE_TO_TORCH_DTYPE.keys()}" - ) - model_name = self.models.model_name() request_id = f"pool-{self._base_request_id(raw_request)}" @@ -256,6 +238,7 @@ class OpenAIServingPooling(OpenAIServing): model_name, request.encoding_format, request.embed_dtype, + request.endianness, ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") @@ -271,34 +254,67 @@ class OpenAIServingPooling(OpenAIServing): request_id: str, created_time: int, model_name: str, - encoding_format: Literal["float", "base64"], - embed_dtype: str, - ) -> PoolingResponse: - items: list[PoolingResponseData] = [] - num_prompt_tokens = 0 + encoding_format: EncodingFormat, + embed_dtype: EmbedDType, + endianness: Endianness, + ) -> PoolingResponse | PoolingBytesResponse: + def encode_float_base64(): + items: list[PoolingResponseData] = [] + num_prompt_tokens = 0 - for idx, final_res in enumerate(final_res_batch): - item = PoolingResponseData( - index=idx, - data=encoding_pooling_output(final_res, encoding_format, embed_dtype), + for idx, final_res in enumerate(final_res_batch): + item = PoolingResponseData( + index=idx, + data=encode_pooling_output( + final_res, + encoding_format=encoding_format, + embed_dtype=embed_dtype, + endianness=endianness, + ), + ) + prompt_token_ids = final_res.prompt_token_ids + + items.append(item) + num_prompt_tokens += len(prompt_token_ids) + + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + total_tokens=num_prompt_tokens, ) - prompt_token_ids = final_res.prompt_token_ids - items.append(item) - num_prompt_tokens += len(prompt_token_ids) + return PoolingResponse( + id=request_id, + created=created_time, + model=model_name, + data=items, + usage=usage, + ) - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - total_tokens=num_prompt_tokens, - ) + def encode_bytes(): + body, items, usage = encode_pooling_bytes( + pooling_outputs=final_res_batch, + embed_dtype=embed_dtype, + endianness=endianness, + ) - return PoolingResponse( - id=request_id, - created=created_time, - model=model_name, - data=items, - usage=usage, - ) + metadata = { + "id": request_id, + "created": created_time, + "model": model_name, + "data": items, + "usage": usage, + } + return PoolingBytesResponse( + body=body, + metadata=json.dumps(metadata), + ) + + if encoding_format == "float" or encoding_format == "base64": + return encode_float_base64() + elif encoding_format == "bytes": + return encode_bytes() + else: + assert_never(encoding_format) def _build_render_config(self, request: PoolingCompletionRequest) -> RenderConfig: return RenderConfig( diff --git a/vllm/entrypoints/openai/utils.py b/vllm/entrypoints/openai/utils.py deleted file mode 100644 index 1fff9b0b501ac..0000000000000 --- a/vllm/entrypoints/openai/utils.py +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import base64 -from typing import Literal - -import torch -from typing_extensions import assert_never - -from vllm import PoolingRequestOutput -from vllm.entrypoints.openai.protocol import EMBED_DTYPE_TO_TORCH_DTYPE - - -def encoding_pooling_output( - output: PoolingRequestOutput, - encoding_format: Literal["float", "base64"], - embed_dtype: str, -) -> list[float] | str: - if encoding_format == "float": - return output.outputs.data.tolist() - elif encoding_format == "base64": - assert embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE - torch_dtype = EMBED_DTYPE_TO_TORCH_DTYPE[embed_dtype] - embedding_bytes = ( - 
output.outputs.data.to(torch_dtype) - .flatten() - .contiguous() - .view(torch.uint8) - .numpy() - .tobytes() - ) - return base64.b64encode(embedding_bytes).decode("utf-8") - - assert_never(encoding_format) diff --git a/vllm/utils/serial_utils.py b/vllm/utils/serial_utils.py new file mode 100644 index 0000000000000..b89fa6ce4db66 --- /dev/null +++ b/vllm/utils/serial_utils.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import base64 +import sys +from dataclasses import dataclass +from typing import Literal + +import numpy as np +import torch +from typing_extensions import assert_never + +from vllm import PoolingRequestOutput + +sys_byteorder = sys.byteorder + + +EMBED_DTYPE_TO_TORCH_DTYPE = { + "float32": torch.float32, + "float16": torch.float16, + "bfloat16": torch.bfloat16, + # I'm not sure if other platforms' CPUs support the fp8 data format. + # EMBED_DTYPE only uses the fp8 data representation, + # does not use fp8 computation, and only occurs on the CPU. + # Apologize for any possible break. + "fp8_e4m3": torch.float8_e4m3fn, + "fp8_e5m2": torch.float8_e5m2, +} + + +EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = { + "float32": torch.float32, + "float16": torch.float16, + # numpy does not support bfloat16 and fp8 + "bfloat16": torch.float16, + "fp8_e4m3": torch.uint8, + "fp8_e5m2": torch.uint8, +} + +EMBED_DTYPE_TO_NUMPY_DTYPE_VIEW = { + "float32": np.float32, + "float16": np.float16, + # numpy does not support bfloat16 and fp8 + "bfloat16": np.float16, + "fp8_e4m3": np.uint8, + "fp8_e5m2": np.uint8, +} + +ENDIANNESS = ["native", "big", "little"] + +EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] +Endianness = Literal["native", "big", "little"] +EncodingFormat = Literal["float", "base64", "bytes"] + + +def tensor2binary( + tensor: torch.Tensor, embed_dtype: EmbedDType, endianness: Endianness +) -> bytes: + assert isinstance(tensor, torch.Tensor) + assert embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE + assert endianness in ENDIANNESS + + torch_dtype = EMBED_DTYPE_TO_TORCH_DTYPE[embed_dtype] + torch_view_dtype = EMBED_DTYPE_TO_TORCH_DTYPE_VIEW[embed_dtype] + + np_array = ( + tensor.to(torch_dtype).flatten().contiguous().view(torch_view_dtype).numpy() + ) + + if endianness != "native" and endianness != sys_byteorder: + np_array = np_array.byteswap() + + return np_array.tobytes() + + +def binary2tensor( + binary: bytes, + shape: tuple[int, ...], + embed_dtype: EmbedDType, + endianness: Endianness, +) -> torch.Tensor: + assert embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE + assert embed_dtype in EMBED_DTYPE_TO_NUMPY_DTYPE_VIEW + assert endianness in ENDIANNESS + + torch_dtype = EMBED_DTYPE_TO_TORCH_DTYPE[embed_dtype] + np_dtype = EMBED_DTYPE_TO_NUMPY_DTYPE_VIEW[embed_dtype] + + np_array = np.frombuffer(binary, dtype=np_dtype).reshape(shape) + + if endianness != "native" and endianness != sys_byteorder: + np_array = np_array.byteswap() + + return torch.from_numpy(np_array).view(torch_dtype) + + +def encode_pooling_output( + output: PoolingRequestOutput, + encoding_format: EncodingFormat, + embed_dtype: EmbedDType, + endianness: Endianness, +) -> list[float] | str | bytes: + if encoding_format == "float": + return output.outputs.data.tolist() + elif encoding_format == "base64": + embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness) + return base64.b64encode(embedding_bytes).decode("utf-8") + elif encoding_format == "bytes": + return tensor2binary(output.outputs.data, 
embed_dtype, endianness)
+    assert_never(encoding_format)
+
+
+@dataclass
+class MetadataItem:
+    index: int
+    embed_dtype: EmbedDType
+    endianness: Endianness
+    start: int
+    end: int
+    shape: tuple[int, ...]
+
+
+def encode_pooling_bytes(
+    pooling_outputs: list[PoolingRequestOutput],
+    embed_dtype: EmbedDType,
+    endianness: Endianness,
+):
+    num_prompt_tokens = 0
+    items: list[dict[str, object]] = []
+    body: list[bytes] = []
+    offset = 0
+    for idx, output in enumerate(pooling_outputs):
+        binary = tensor2binary(
+            tensor=output.outputs.data,
+            embed_dtype=embed_dtype,
+            endianness=endianness,
+        )
+        size = len(binary)
+
+        item = {
+            "index": idx,
+            "embed_dtype": embed_dtype,
+            "endianness": endianness,
+            "start": offset,
+            "end": offset + size,
+            "shape": output.outputs.data.shape,
+        }
+
+        body.append(binary)
+        items.append(item)
+        prompt_token_ids = output.prompt_token_ids
+        num_prompt_tokens += len(prompt_token_ids)
+        offset += size
+
+    usage = {
+        "prompt_tokens": num_prompt_tokens,
+        "total_tokens": num_prompt_tokens,
+    }
+    return body, items, usage
+
+
+def decode_pooling_output(items: list[MetadataItem], body: bytes) -> list[torch.Tensor]:
+    items.sort(key=lambda x: x.index)
+
+    tensor_list: list[torch.Tensor] = []
+    for item in items:
+        binary = body[item.start : item.end]
+        tensor = binary2tensor(binary, item.shape, item.embed_dtype, item.endianness)
+        tensor_list.append(tensor)
+    return tensor_list
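
For clients that cannot import vLLM, the `bytes` response can also be decoded by hand from the `metadata` header and the raw response body. The sketch below is a minimal illustration rather than part of the diff: it assumes a server already running at `localhost:8000` with an embedding model (`intfloat/e5-small`, as in the example clients above) and only handles `embed_dtype="float32"` with an explicit little-endian byte order.

```python
# Minimal sketch: decode a "bytes" embeddings response without importing vLLM.
# Assumes a local server and an embedding model (placeholders); only covers
# embed_dtype="float32" with endianness="little".
import json

import numpy as np
import requests

response = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={
        "model": "intfloat/e5-small",
        "input": ["vLLM is great!"],
        "encoding_format": "bytes",
        "embed_dtype": "float32",
        "endianness": "little",
    },
)

metadata = json.loads(response.headers["metadata"])
body = response.content

embeddings = []
for item in sorted(metadata["data"], key=lambda x: x["index"]):
    # Each metadata item records the byte range and shape of one output tensor.
    chunk = body[item["start"] : item["end"]]
    embeddings.append(np.frombuffer(chunk, dtype="<f4").reshape(item["shape"]))

print([e.shape for e in embeddings])
```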