From b7030d962b082077e11c863746708866c4c9d83f Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Fri, 24 Oct 2025 19:16:50 +0800
Subject: [PATCH] [Benchmark] Enable benchmark to run with `encoding_format="bytes"` (#27467)

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/lib/endpoint_request_func.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index 6e09c722bec71..ed0fdec251863 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -498,10 +498,17 @@ async def _run_pooling_request(
         async with session.post(url=api_url, headers=headers, json=payload) as response:
             if response.status == 200:
                 output.ttft = output.latency = time.perf_counter() - st
-                data = await response.json()
+
+                if payload.get("encoding_format", "float") == "bytes":
+                    metadata = json.loads(response.headers["metadata"])
+                    usage = metadata.get("usage", {})
+                else:
+                    data = await response.json()
+                    usage = data.get("usage", {})
+
                 output.success = True
                 output.generated_text = ""
-                output.prompt_len = data.get("usage", {}).get("prompt_tokens", 0)
+                output.prompt_len = usage.get("prompt_tokens", 0)
             else:
                 output.success = False
                 output.error = response.reason or ""