vllm/vllm/utils/gc_utils.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import json
import time
from collections import Counter
from contextlib import suppress
from typing import Any

import vllm.envs as envs
from vllm.logger import init_logger

logger = init_logger(__name__)


class GCDebugConfig:
    """
    Config for GC Debugger.
    - 0: disable GC debugger
    - 1: enable GC debugger with gc.collect elpased times
    - '{"top_objects":5}': enable GC debugger with top 5 collected objects
    """

    def __init__(self, gc_debug_conf: str | None = None) -> None:
        self.enabled: bool = False
        self.top_objects: int = -1

        if not gc_debug_conf or gc_debug_conf == "0":
            pass
        elif gc_debug_conf == "1":
            self.enabled = True
        else:
            try:
                json_conf = json.loads(gc_debug_conf)
                self.enabled = True
                self.top_objects = json_conf.get("top_objects", -1)
            except Exception:
                self.enabled = False
                logger.error("Failed to parse VLLM_GC_DEBUG(%s)", envs.VLLM_GC_DEBUG)
        logger.debug("GC Debug Config. %s", str(self))

    def __repr__(self) -> str:
        return f"enabled:{self.enabled},top_objects:{self.top_objects}"


class GCDebugger:
    """
    Debugger for GC which logs helpful information for GC understanding.
    To enable, you should call maybe_attach_gc_debug_callback in the process.
    """

    def __init__(self, config: GCDebugConfig) -> None:
        self.config = config
        # Start time in micro second of this GC cycle
        self.start_time_ns: int = time.monotonic_ns()
        # If config.top_objects is positive,
        # compute top collected objects by object types
        self.gc_top_collected_objects: str = ""

    def handle(self, phase: str, info: dict[str, int]) -> None:
        """
        Handles a GC event (e.g. GC start or GC finish)
        """
        generation = info.get("generation")
        if generation is None:
            return
        if phase == "start":
            # Before GC started, record GC start time
            # and top collected objects
            self.start_time_ns = time.monotonic_ns()
            self.gc_top_collected_objects = _compute_top_gc_collected_objects(
                gc.get_objects(generation), self.config.top_objects
            )
        elif phase == "stop":
            # After GC finished, Record GC elapsed time and
            # optionally top collected objects
            elpased_ms = (time.monotonic_ns() - self.start_time_ns) / 1e6
            logger.info(
                "GC took %.3fms to complete. "
                "Collected %s objects in GC generation %d.%s",
                elpased_ms,
                str(info.get("collected", "?")),
                generation,
                (
                    f" Top collected objects: \n{self.gc_top_collected_objects}"
                    if self.gc_top_collected_objects
                    else ""
                ),
            )


def maybe_attach_gc_debug_callback() -> None:
    """
    Attached a callback for GC debug when VLLM_GC_DEBUG is enabled.
    """
    config = GCDebugConfig(envs.VLLM_GC_DEBUG)
    if config.enabled:
        debugger: GCDebugger = GCDebugger(config)

        def gc_callback(phase: str, info: dict[str, int]) -> None:
            debugger.handle(phase, info)

        gc.callbacks.append(gc_callback)


def _compute_detailed_type(o: Any) -> str:
    """
    Detailed object type.

    TODO(Jialin): Further enhance the detailed type with element types for
    easier debugging. We tried but occasionally it would run into signals
    which kills the engine.
    """
    size_str: str = ""
    # Object doesn't support len() - this can happen with type objects
    # or other objects that don't implement __len__ properly
    with suppress(Exception):
        size_str = f"(size:{len(o)})"
    return f"{str(type(o))}{size_str}"


def _compute_top_gc_collected_objects(objects: list[Any], top: int) -> str:
    """
    Group collected objects by types.
    """
    if top <= 0:
        return ""
    object_types = [_compute_detailed_type(o) for o in objects]
    return "\n".join(
        f"{count:>5}:{object_type}"
        for object_type, count in Counter(object_types).most_common(top)
    )