[Bugfix] Actually disable processing cache when API server is scaled out (#21839)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung 2025-07-30 11:36:04 +08:00 committed by GitHub
parent b7b23da4d2
commit 44bc46da60
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -140,11 +140,16 @@ def run_multi_api_server(args: argparse.Namespace):
num_api_servers = args.api_server_count
assert num_api_servers > 0
orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache
# set_process_title("ProcManager")
if num_api_servers > 1:
setup_multiprocess_prometheus()
# Not compatible with API server scale-out
args.disable_mm_preprocessor_cache = True
listen_address, sock = setup_server(args)
engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
@@ -161,11 +166,9 @@ def run_multi_api_server(args: argparse.Namespace):
"with api_server_count > 1")
if model_config.is_multimodal_model and not (
model_config.disable_mm_preprocessor_cache):
logger.warning(
"Multi-model preprocessor cache will be disabled for"
" api_server_count > 1")
model_config.disable_mm_preprocessor_cache = True
orig_disable_mm_preprocessor_cache):
logger.warning("Multi-model preprocessor cache will be disabled "
"for api_server_count > 1")
executor_class = Executor.get_class(vllm_config)
log_stats = not engine_args.disable_log_stats