From b169d5f7b6ad65586c5352030bda154d589a4d89 Mon Sep 17 00:00:00 2001
From: Duyi-Wang
Date: Thu, 29 May 2025 20:02:08 +0800
Subject: [PATCH] [Misc][Tools][Benchmark] Add benchmark_serving support for
 llama.cpp. (#18692)

llama.cpp's OpenAI-compatible server can stream its usage summary in the
same chunk as the final piece of generated text, so the previous `elif`
branch never captured `completion_tokens`; check `usage` with a plain
`if` instead. Prompt caching is also disabled for the llama.cpp backend
so that repeated prompts are not served from the cache, which would skew
latency measurements.

Signed-off-by: Duyi-Wang
---
 benchmarks/backend_request_func.py | 3 ++-
 benchmarks/benchmark_serving.py    | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 88616e1108c52..85e6eda7f36fd 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -324,7 +324,7 @@ async def async_request_openai_completions(
 
                                 most_recent_timestamp = timestamp
                                 generated_text += text or ""
-                            elif usage := data.get("usage"):
+                            if usage := data.get("usage"):
                                 output.output_tokens = usage.get("completion_tokens")
                     if first_chunk_received:
                         output.success = True
@@ -611,6 +611,7 @@ ASYNC_REQUEST_FUNCS = {
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
+    "llama.cpp": async_request_openai_completions,
 }
 
 OPENAI_COMPATIBLE_BACKENDS = [
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index a887e7150dc78..79024a9d61c51 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -762,6 +762,10 @@ def main(args: argparse.Namespace):
         if "temperature" not in sampling_params:
             sampling_params["temperature"] = 0.0  # Default to greedy decoding.
 
+        if args.backend == "llama.cpp":
+            # Disable prompt caching in llama.cpp backend
+            sampling_params["cache_prompt"] = False
+
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
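
Note: a minimal sketch of the stream-parsing behavior that the `elif` ->
`if` change enables. The chunk payload below is illustrative (hand-written,
not captured from a real llama.cpp server), modeling a final chunk that
carries both `choices` and `usage`:

    import json

    # Final chunk carrying both the last piece of text and the usage
    # summary; with `elif`, the usage branch would never run for it.
    chunk = '{"choices": [{"text": "!"}], "usage": {"completion_tokens": 42}}'

    data = json.loads(chunk)
    generated_text = ""
    output_tokens = None

    if choices := data.get("choices"):
        generated_text += choices[0].get("text") or ""
    if usage := data.get("usage"):  # `elif` would skip this chunk's usage
        output_tokens = usage.get("completion_tokens")

    assert generated_text == "!" and output_tokens == 42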
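
A run against a local llama.cpp server might look like the following
(model name, port, and prompt count are placeholders; llama.cpp serves
its OpenAI-compatible API on port 8080 by default):

    python benchmarks/benchmark_serving.py \
        --backend llama.cpp \
        --base-url http://localhost:8080 \
        --endpoint /v1/completions \
        --model meta-llama/Llama-3.1-8B-Instruct \
        --dataset-name random \
        --num-prompts 100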