[Perf] Move gc.freeze logic from EngineCoreProc to EngineCore for better coverage (#27896)

Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2026-07-03 10:07:10 +08:00 · 2025-11-10 15:34:18 -08:00 · 2025-11-10 15:34:18 -08:00 · b30372cbd0
commit b30372cbd0
parent d17ecc6b19
5 changed files with 30 additions and 14 deletions
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@ -19,7 +19,6 @@ On the client side, run:
 import argparse
 import asyncio
 import contextlib
 import gc
 import importlib.util
 import json
 import os
@ -49,6 +48,7 @@ from vllm.benchmarks.lib.endpoint_request_func import (
 from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
 from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils.gc_utils import freeze_gc_heap
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -1414,8 +1414,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
    percentile_metrics: str = args.percentile_metrics or default_percentile_metrics
    # Avoid GC processing "static" data - reduce pause times.
-    gc.collect()
+    freeze_gc_heap()
    gc.freeze()
    benchmark_result = await benchmark(
        task_type=task_type,
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@ -1483,6 +1483,9 @@ def destroy_distributed_environment():
 def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
    # Ensure all objects are not freezed before cleanup
    gc.unfreeze()
    destroy_model_parallel()
    destroy_distributed_environment()
    if shutdown_ray:
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import gc
 import hashlib
 import importlib
 import inspect
@ -118,6 +116,7 @@ from vllm.reasoning import ReasoningParserManager
 from vllm.tasks import POOLING_TASKS
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.gc_utils import freeze_gc_heap
 from vllm.utils.network_utils import is_valid_ipv6_address
 from vllm.utils.system_utils import decorate_logs, set_ulimit
 from vllm.v1.engine.exceptions import EngineDeadError
@ -153,8 +152,7 @@ async def lifespan(app: FastAPI):
        # Mark the startup heap as static so that it's ignored by GC.
        # Reduces pause times of oldest generation collections.
-        gc.collect()
+        freeze_gc_heap()
        gc.freeze()
        try:
            yield
        finally:
--- a/vllm/utils/gc_utils.py
+++ b/vllm/utils/gc_utils.py
@ -89,6 +89,21 @@ class GCDebugger:
            )
 def freeze_gc_heap() -> None:
    """
    Freeze all objects tracked by the garbage collector. It should be invoked
    after server init / warmup, to reduce GC overhead from static objects
    during serving time.
    """
    # Ensure all static objects are pushed down to the oldest generation for
    # freeze
    gc.collect(0)
    gc.collect(1)
    gc.collect(2)
    # Freeze all GC tracked objects
    gc.freeze()
 def maybe_attach_gc_debug_callback() -> None:
    """
    Attached a callback for GC debug when VLLM_GC_DEBUG is enabled.
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
 import os
 import queue
 import signal
@ -27,7 +26,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import engine_receiver_cache_from_config
 from vllm.tasks import POOLING_TASKS, SupportedTask
 from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
-from vllm.utils.gc_utils import maybe_attach_gc_debug_callback
+from vllm.utils.gc_utils import (
    freeze_gc_heap,
    maybe_attach_gc_debug_callback,
 )
 from vllm.utils.hashing import get_hash_fn_by_name
 from vllm.utils.network_utils import make_zmq_socket
 from vllm.utils.system_utils import decorate_logs, set_process_title
@ -197,6 +199,10 @@ class EngineCore:
            self.step if self.batch_queue is None else self.step_with_batch_queue
        )
        # Mark the startup heap as static so that it's ignored by GC.
        # Reduces pause times of oldest generation collections.
        freeze_gc_heap()
    def _initialize_kv_caches(
        self, vllm_config: VllmConfig
    ) -> tuple[int, int, KVCacheConfig]:
@ -651,11 +657,6 @@ class EngineCoreProc(EngineCore):
                assert addresses.coordinator_input is not None
                logger.info("Waiting for READY message from DP Coordinator...")
        # Mark the startup heap as static so that it's ignored by GC.
        # Reduces pause times of oldest generation collections.
        gc.collect()
        gc.freeze()
        # If enable, attach GC debugger after static variable freeze.
        maybe_attach_gc_debug_callback()