diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 93bc94123aaa9..be926764e4948 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -277,8 +277,9 @@ def test_prefix_cache_default(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) args = parser.parse_args([]) + # should be None by default (depends on model). engine_args = EngineArgs.from_cli_args(args=args) - assert engine_args.enable_prefix_caching, "prefix caching should default to on." + assert engine_args.enable_prefix_caching is None # with flag to turn it on. args = parser.parse_args(["--enable-prefix-caching"]) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6b5c8ba87ecbf..177915715140f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -880,7 +880,11 @@ class EngineArgs: "--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"] ) cache_group.add_argument( - "--enable-prefix-caching", **cache_kwargs["enable_prefix_caching"] + "--enable-prefix-caching", + **{ + **cache_kwargs["enable_prefix_caching"], + "default": None, + }, ) cache_group.add_argument( "--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]