From aaa4ac1c95aaf70afab51582c56d80554a21bbd0 Mon Sep 17 00:00:00 2001
From: cascade
Date: Tue, 27 May 2025 05:06:34 -0700
Subject: [PATCH] Disable prefix cache by default for benchmark (#18639)

Signed-off-by: cascade812
---
 benchmarks/benchmark_latency.py | 3 +++
 vllm/benchmarks/latency.py      | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index d5aaceeb8c9c..84759c5c354d 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -189,5 +189,8 @@ if __name__ == "__main__":
     )
 
     parser = EngineArgs.add_cli_args(parser)
+    # V1 enables prefix caching by default which skews the latency
+    # numbers. We need to disable prefix caching by default.
+    parser.set_defaults(enable_prefix_caching=False)
     args = parser.parse_args()
     main(args)
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index 06f6848f50cb..2c992727b139 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -80,6 +80,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
     )
 
     parser = EngineArgs.add_cli_args(parser)
+    # V1 enables prefix caching by default which skews the latency
+    # numbers. We need to disable prefix caching by default.
+    parser.set_defaults(enable_prefix_caching=False)
 
 
 def main(args: argparse.Namespace):