[Feature] default --extra-body param to disable thinking in vllm bench serve (#26784)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
This commit is contained in:
rongfu.leng 2025-10-15 12:23:44 +08:00 committed by GitHub
parent e471d7ca7e
commit a27b288e4a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1230,6 +1230,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
"the ready check will be skipped.",
)
# Extra request-body parameters supplied as a JSON string; argparse parses
# the value with json.loads, and it defaults to None when the flag is absent.
parser.add_argument(
    "--extra-body",
    # Adjacent string literals are concatenated verbatim, so the separating
    # space after the period must be written explicitly.
    help="A JSON string representing extra body parameters to include "
    "in each request. "
    'Example: \'{"chat_template_kwargs":{"enable_thinking":false}}\'',
    type=json.loads,
    default=None,
)
def main(args: argparse.Namespace) -> dict[str, Any]:
    """Run the asynchronous benchmark entry point to completion.

    Args:
        args: Parsed command-line arguments, forwarded unchanged to
            ``main_async``.

    Returns:
        The value produced by ``main_async(args)``.
    """
    benchmark_coro = main_async(args)
    return asyncio.run(benchmark_coro)
@ -1330,6 +1339,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
else:
sampling_params = {}
extra_body = args.extra_body or {}
extra_body = {**sampling_params, **extra_body}
# Avoid GC processing "static" data - reduce pause times.
gc.collect()
gc.freeze()
@ -1355,7 +1367,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
extra_headers=headers,
extra_body=sampling_params,
extra_body=extra_body,
ramp_up_strategy=args.ramp_up_strategy,
ramp_up_start_rps=args.ramp_up_start_rps,
ramp_up_end_rps=args.ramp_up_end_rps,