[Bugfix] Actually disable processing cache when API server is scaled out (#21839)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-28 18:37:15 +08:00 · 2025-07-30 11:36:04 +08:00 · 2025-07-30 11:36:04 +08:00 · 44bc46da60
commit 44bc46da60
parent b7b23da4d2
1 changed files with 8 additions and 5 deletions
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -140,11 +140,16 @@ def run_multi_api_server(args: argparse.Namespace):
    num_api_servers = args.api_server_count
    assert num_api_servers > 0

+    orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache
+
    # set_process_title("ProcManager")

    if num_api_servers > 1:
        setup_multiprocess_prometheus()

+        # Not compatible with API server scale-out
+        args.disable_mm_preprocessor_cache = True
+
    listen_address, sock = setup_server(args)

    engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
@@ -161,11 +166,9 @@ def run_multi_api_server(args: argparse.Namespace):
                             "with api_server_count > 1")

        if model_config.is_multimodal_model and not (
-                model_config.disable_mm_preprocessor_cache):
-            logger.warning(
-                "Multi-model preprocessor cache will be disabled for"
-                " api_server_count > 1")
-            model_config.disable_mm_preprocessor_cache = True
+                orig_disable_mm_preprocessor_cache):
+            logger.warning("Multi-model preprocessor cache will be disabled "
+                           "for api_server_count > 1")

    executor_class = Executor.get_class(vllm_config)
    log_stats = not engine_args.disable_log_stats