From 44bc46da6008c04d351d8fd0bf026bff8ab57dab Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 30 Jul 2025 11:36:04 +0800 Subject: [PATCH] [Bugfix] Actually disable processing cache when API server is scaled out (#21839) Signed-off-by: DarkLight1337 --- vllm/entrypoints/cli/serve.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index a69363e3d98f..7dcba2cccdb5 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -140,11 +140,16 @@ def run_multi_api_server(args: argparse.Namespace): num_api_servers = args.api_server_count assert num_api_servers > 0 + orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache + # set_process_title("ProcManager") if num_api_servers > 1: setup_multiprocess_prometheus() + # Not compatible with API server scale-out + args.disable_mm_preprocessor_cache = True + listen_address, sock = setup_server(args) engine_args = vllm.AsyncEngineArgs.from_cli_args(args) @@ -161,11 +166,9 @@ def run_multi_api_server(args: argparse.Namespace): "with api_server_count > 1") if model_config.is_multimodal_model and not ( - model_config.disable_mm_preprocessor_cache): - logger.warning( - "Multi-model preprocessor cache will be disabled for" - " api_server_count > 1") - model_config.disable_mm_preprocessor_cache = True + orig_disable_mm_preprocessor_cache): + logger.warning("Multi-model preprocessor cache will be disabled " + "for api_server_count > 1") executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats