diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index a69363e3d98f..7dcba2cccdb5 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -140,11 +140,16 @@ def run_multi_api_server(args: argparse.Namespace): num_api_servers = args.api_server_count assert num_api_servers > 0 + orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache + # set_process_title("ProcManager") if num_api_servers > 1: setup_multiprocess_prometheus() + # Not compatible with API server scale-out + args.disable_mm_preprocessor_cache = True + listen_address, sock = setup_server(args) engine_args = vllm.AsyncEngineArgs.from_cli_args(args) @@ -161,11 +166,9 @@ def run_multi_api_server(args: argparse.Namespace): "with api_server_count > 1") if model_config.is_multimodal_model and not ( - model_config.disable_mm_preprocessor_cache): - logger.warning( - "Multi-model preprocessor cache will be disabled for" - " api_server_count > 1") - model_config.disable_mm_preprocessor_cache = True + orig_disable_mm_preprocessor_cache): + logger.warning("Multi-modal preprocessor cache will be disabled " + "for api_server_count > 1") executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats