From a27b288e4a389e3ece5e178bc0219c6c0e1db7d1 Mon Sep 17 00:00:00 2001
From: "rongfu.leng" <rongfu.leng@daocloud.io>
Date: Wed, 15 Oct 2025 12:23:44 +0800
Subject: [PATCH] [Feature] default --extra-body param to disable thinking in
 vllm bench serve (#26784)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
---
 vllm/benchmarks/serve.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index c52e384a40023..3c85a1e8fdd9e 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -1230,6 +1230,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "the ready check will be skipped.",
     )
 
+    parser.add_argument(
+        "--extra-body",
+        help="A JSON string representing extra body parameters to include "
+        "in each request."
+        'Example: \'{"chat_template_kwargs":{"enable_thinking":false}}\'',
+        type=json.loads,
+        default=None,
+    )
+
 
 def main(args: argparse.Namespace) -> dict[str, Any]:
     return asyncio.run(main_async(args))
@@ -1330,6 +1339,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     else:
         sampling_params = {}
 
+    extra_body = args.extra_body or {}
+    extra_body = {**sampling_params, **extra_body}
+
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
@@ -1355,7 +1367,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         max_concurrency=args.max_concurrency,
         lora_modules=args.lora_modules,
         extra_headers=headers,
-        extra_body=sampling_params,
+        extra_body=extra_body,
         ramp_up_strategy=args.ramp_up_strategy,
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,