From b169d5f7b6ad65586c5352030bda154d589a4d89 Mon Sep 17 00:00:00 2001
From: Duyi-Wang
Date: Thu, 29 May 2025 20:02:08 +0800
Subject: [PATCH] [Misc][Tools][Benchmark] Add benchmark_serving support for
 llama.cpp. (#18692)

llama.cpp's OpenAI-compatible server can stream its usage summary in the
same chunk as the final piece of generated text, so the previous `elif`
branch never captured `completion_tokens`; check `usage` with a plain
`if` instead. Prompt caching is also disabled for the llama.cpp backend
so that repeated prompts are not served from the cache, which would skew
latency measurements.

Signed-off-by: Duyi-Wang
---
 benchmarks/backend_request_func.py | 3 ++-
 benchmarks/benchmark_serving.py    | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 88616e1108c52..85e6eda7f36fd 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -324,7 +324,7 @@ async def async_request_openai_completions(
 
                                 most_recent_timestamp = timestamp
                                 generated_text += text or ""
-                            elif usage := data.get("usage"):
+                            if usage := data.get("usage"):
                                 output.output_tokens = usage.get("completion_tokens")
                     if first_chunk_received:
                         output.success = True
@@ -611,6 +611,7 @@ ASYNC_REQUEST_FUNCS = {
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
+    "llama.cpp": async_request_openai_completions,
 }
 
 OPENAI_COMPATIBLE_BACKENDS = [
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index a887e7150dc78..79024a9d61c51 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -762,6 +762,10 @@ def main(args: argparse.Namespace):
         if "temperature" not in sampling_params:
             sampling_params["temperature"] = 0.0  # Default to greedy decoding.
 
+        if args.backend == "llama.cpp":
+            # Disable prompt caching in llama.cpp backend
+            sampling_params["cache_prompt"] = False
+
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
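
Note: a minimal sketch of the stream-parsing behavior that the `elif` ->
`if` change enables. The chunk payload below is illustrative (hand-written,
not captured from a real llama.cpp server), modeling a final chunk that
carries both `choices` and `usage`:

    import json

    # Final chunk carrying both the last piece of text and the usage
    # summary; with `elif`, the usage branch would never run for it.
    chunk = '{"choices": [{"text": "!"}], "usage": {"completion_tokens": 42}}'

    data = json.loads(chunk)
    generated_text = ""
    output_tokens = None

    if choices := data.get("choices"):
        generated_text += choices[0].get("text") or ""
    if usage := data.get("usage"):  # `elif` would skip this chunk's usage
        output_tokens = usage.get("completion_tokens")

    assert generated_text == "!" and output_tokens == 42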
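
A run against a local llama.cpp server might look like the following
(model name, port, and prompt count are placeholders; llama.cpp serves
its OpenAI-compatible API on port 8080 by default):

    python benchmarks/benchmark_serving.py \
        --backend llama.cpp \
        --base-url http://localhost:8080 \
        --endpoint /v1/completions \
        --model meta-llama/Llama-3.1-8B-Instruct \
        --dataset-name random \
        --num-prompts 100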