Add logging for cudagraph-related info (#29825)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Yong Hoon Shin 2025-12-02 23:01:48 -10:00 committed by GitHub
parent 3a7751485b
commit 69520bc695
9 changed files with 161 additions and 6 deletions

View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections import Counter
from collections.abc import Callable
from contextlib import ExitStack
from typing import Any
@@ -22,6 +23,99 @@ from vllm.utils.torch_utils import weak_ref_tensors
logger = init_logger(__name__)
@dataclasses.dataclass(frozen=True)
class CUDAGraphStat:
    num_unpadded_tokens: int
    num_padded_tokens: int
    num_paddings: int
    runtime_mode: str


class CUDAGraphLogging:
    """Aggregate and log cudagraph metrics"""

    COLUMN_HEADERS = [
        "Unpadded Tokens",
        "Padded Tokens",
        "Num Paddings",
        "Runtime Mode",
        "Count",
    ]

    def __init__(self, cg_mode: CUDAGraphMode, cg_capture_sizes: list[int] | None):
        self.reset()
        self.cg_mode = str(cg_mode)
        self.cg_capture_sizes = str(cg_capture_sizes or [])
        self.settings_header = (
            "**CUDAGraph Config Settings:**\n\n"
            f"- Mode: {self.cg_mode}\n"
            f"- Capture sizes: {self.cg_capture_sizes}\n\n"
            "**CUDAGraph Stats:**\n\n"
        )

    def reset(self):
        self.stats = []

    def observe(self, cudagraph_stat: CUDAGraphStat):
        self.stats.append(cudagraph_stat)

    def generate_metric_table(self) -> str:
        stats_counts = Counter(self.stats)

        # Convert stats to rows of strings, in descending order of observed frequencies
        rows = []
        for stat, count in sorted(
            stats_counts.items(), key=lambda item: item[1], reverse=True
        ):
            rows.append(
                [
                    str(stat.num_unpadded_tokens),
                    str(stat.num_padded_tokens),
                    str(stat.num_paddings),
                    stat.runtime_mode,
                    str(count),
                ]
            )

        # Calculate column widths (max of header and data)
        col_widths = []
        for i, header_text in enumerate(self.COLUMN_HEADERS):
            max_width = len(header_text)
            for row in rows:
                max_width = max(max_width, len(row[i]))
            col_widths.append(max_width)

        table_header_list = [
            h.ljust(w) for h, w in zip(self.COLUMN_HEADERS, col_widths)
        ]
        table_header = "| " + " | ".join(table_header_list) + " |\n"
        table_separator = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|\n"

        # Create data rows with proper alignment
        data_rows = []
        for row in rows:
            formatted_row = [
                str(val).ljust(width) for val, width in zip(row, col_widths)
            ]
            data_rows.append("| " + " | ".join(formatted_row) + " |")

        return (
            self.settings_header
            + table_header
            + table_separator
            + "\n".join(data_rows)
            + "\n"
        )

    def log(self, log_fn=logger.info):
        if not self.stats:
            return
        log_fn(self.generate_metric_table())
        self.reset()


@dataclasses.dataclass
class CUDAGraphEntry:
    batch_descriptor: BatchDescriptor
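
For illustration only (not part of this diff): a minimal sketch of how the two new classes above fit together. Identical observations are deduplicated with Counter and logged as a markdown-style table, most frequent row first. CUDAGraphMode.FULL, the capture sizes, and the stat values are arbitrary example inputs, and the import of CUDAGraphMode assumes the usual vllm.config re-export.

from vllm.compilation.cuda_graph import CUDAGraphLogging, CUDAGraphStat
from vllm.config import CUDAGraphMode

cg_logging = CUDAGraphLogging(CUDAGraphMode.FULL, [1, 2, 4, 8])
for _ in range(2):
    # Two identical observations collapse into a single table row with Count=2.
    cg_logging.observe(
        CUDAGraphStat(
            num_unpadded_tokens=5,
            num_padded_tokens=8,
            num_paddings=3,
            runtime_mode="FULL",
        )
    )
cg_logging.observe(
    CUDAGraphStat(
        num_unpadded_tokens=13,
        num_padded_tokens=16,
        num_paddings=3,
        runtime_mode="FULL",
    )
)
# Prints the settings header followed by the stats table, then resets the buffer.
cg_logging.log(log_fn=print)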

View File

@@ -55,6 +55,10 @@ class ObservabilityConfig:
kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1)
"""Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks."""
cudagraph_metrics: bool = False
"""Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
dispatch modes, and their observed frequencies at every logging interval)."""
@cached_property
def collect_model_forward_time(self) -> bool:
"""Whether to collect model forward time for the request."""

View File

@@ -518,6 +518,7 @@ class EngineArgs:
kv_cache_metrics_sample: float = get_field(
ObservabilityConfig, "kv_cache_metrics_sample"
)
cudagraph_metrics: bool = ObservabilityConfig.cudagraph_metrics
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
@@ -1021,6 +1022,10 @@ class EngineArgs:
"--kv-cache-metrics-sample",
**observability_kwargs["kv_cache_metrics_sample"],
)
observability_group.add_argument(
"--cudagraph-metrics",
**observability_kwargs["cudagraph_metrics"],
)
# Scheduler arguments
scheduler_kwargs = get_kwargs(SchedulerConfig)
@@ -1698,6 +1703,7 @@ class EngineArgs:
collect_detailed_traces=self.collect_detailed_traces,
kv_cache_metrics=self.kv_cache_metrics,
kv_cache_metrics_sample=self.kv_cache_metrics_sample,
cudagraph_metrics=self.cudagraph_metrics,
)
# Compilation config overrides

View File

@@ -7,6 +7,7 @@ from collections.abc import Iterable
from typing import Any
from vllm import envs
from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.config import VllmConfig
from vllm.distributed.ec_transfer.ec_connector.base import (
ECConnectorMetadata,
@@ -1037,6 +1038,7 @@ class Scheduler(SchedulerInterface):
pooler_outputs = model_runner_output.pooler_output
num_nans_in_logits = model_runner_output.num_nans_in_logits
kv_connector_output = model_runner_output.kv_connector_output
cudagraph_stats = model_runner_output.cudagraph_stats
outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
spec_decoding_stats: SpecDecodingStats | None = None
@@ -1219,7 +1221,9 @@
finished_req_ids.clear()
if (
stats := self.make_stats(spec_decoding_stats, kv_connector_stats)
stats := self.make_stats(
spec_decoding_stats, kv_connector_stats, cudagraph_stats
)
) is not None:
# Return stats to only one of the front-ends.
if (eco := next(iter(engine_core_outputs.values()), None)) is None:
@@ -1420,6 +1424,7 @@
self,
spec_decoding_stats: SpecDecodingStats | None = None,
kv_connector_stats: KVConnectorStats | None = None,
cudagraph_stats: CUDAGraphStat | None = None,
) -> SchedulerStats | None:
if not self.log_stats:
return None
@@ -1444,6 +1449,7 @@
kv_cache_eviction_events=eviction_events,
spec_decoding_stats=spec_stats,
kv_connector_stats=connector_stats_payload,
cudagraph_stats=cudagraph_stats,
)
def make_spec_decoding_stats(

View File

@@ -10,6 +10,7 @@ from typing import TypeAlias
from prometheus_client import Counter, Gauge, Histogram
import vllm.envs as envs
from vllm.compilation.cuda_graph import CUDAGraphLogging
from vllm.config import SupportsMetricsInfo, VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorLogging,
@@ -106,6 +107,12 @@ class LoggingStatLogger(StatLoggerBase):
self.spec_decoding_logging = SpecDecodingLogging()
kv_transfer_config = self.vllm_config.kv_transfer_config
self.kv_connector_logging = KVConnectorLogging(kv_transfer_config)
self.cudagraph_logging = None
if self.vllm_config.observability_config.cudagraph_metrics:
self.cudagraph_logging = CUDAGraphLogging(
self.vllm_config.compilation_config.cudagraph_mode,
self.vllm_config.compilation_config.cudagraph_capture_sizes,
)
self.last_prompt_throughput: float = 0.0
self.last_generation_throughput: float = 0.0
self.engine_is_idle = False
@@ -161,6 +168,11 @@
self.spec_decoding_logging.observe(scheduler_stats.spec_decoding_stats)
if kv_connector_stats := scheduler_stats.kv_connector_stats:
self.kv_connector_logging.observe(kv_connector_stats)
if (
self.cudagraph_logging is not None
and scheduler_stats.cudagraph_stats is not None
):
self.cudagraph_logging.observe(scheduler_stats.cudagraph_stats)
if not self.aggregated:
self.last_scheduler_stats = scheduler_stats
if mm_cache_stats:
@@ -240,6 +252,8 @@
self.spec_decoding_logging.log(log_fn=log_fn)
self.kv_connector_logging.log(log_fn=log_fn)
if self.cudagraph_logging is not None:
self.cudagraph_logging.log(log_fn=log_fn)
def log_engine_initialized(self):
if self.vllm_config.cache_config.num_gpu_blocks:

View File

@@ -7,6 +7,7 @@ from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any
import vllm.envs as envs
from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.v1.spec_decode.metrics import SpecDecodingStats
if TYPE_CHECKING:
@@ -183,6 +184,8 @@ class SchedulerStats:
waiting_lora_adapters: dict[str, int] = field(default_factory=dict)
running_lora_adapters: dict[str, int] = field(default_factory=dict)
cudagraph_stats: CUDAGraphStat | None = None
@dataclass
class RequestStateStats:

View File

@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, NamedTuple
import numpy as np
import torch
from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.v1.core.sched.output import SchedulerOutput
if TYPE_CHECKING:
@@ -169,6 +170,9 @@ class ModelRunnerOutput:
# req_id -> num_nans_in_logits
num_nans_in_logits: dict[str, int] | None = None
# information related to cudagraph execution
cudagraph_stats: CUDAGraphStat | None = None
# ModelRunnerOutput wrapper for async scheduling.
class AsyncModelRunnerOutput(ABC):

View File

@@ -27,7 +27,7 @@ from vllm.attention.backends.abstract import (
)
from vllm.attention.layer import Attention, MLAAttention
from vllm.compilation.counter import compilation_counter
from vllm.compilation.cuda_graph import CUDAGraphWrapper
from vllm.compilation.cuda_graph import CUDAGraphStat, CUDAGraphWrapper
from vllm.compilation.monitor import set_cudagraph_capturing_enabled
from vllm.config import (
CompilationMode,
@@ -257,6 +257,7 @@ class ExecuteModelState(NamedTuple):
sample_hidden_states: torch.Tensor
aux_hidden_states: list[torch.Tensor] | None
ec_connector_output: ECConnectorOutput | None
cudagraph_stats: CUDAGraphStat | None
class GPUModelRunner(
@@ -2755,7 +2756,11 @@
force_uniform_decode: bool | None = None,
force_has_lora: bool | None = None,
) -> tuple[
CUDAGraphMode, BatchDescriptor, UBatchSlices | None, torch.Tensor | None
CUDAGraphMode,
BatchDescriptor,
UBatchSlices | None,
torch.Tensor | None,
CUDAGraphStat | None,
]:
num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
uniform_decode = (
@@ -2820,7 +2825,22 @@
# num_tokens_across_dp will no-longer be valid
assert batch_descriptor.num_tokens == num_tokens_padded
return cudagraph_mode, batch_descriptor, ubatch_slices, num_tokens_across_dp
cudagraph_stats = None
if self.vllm_config.observability_config.cudagraph_metrics:
cudagraph_stats = CUDAGraphStat(
num_unpadded_tokens=num_tokens,
num_padded_tokens=batch_descriptor.num_tokens,
num_paddings=batch_descriptor.num_tokens - num_tokens,
runtime_mode=str(cudagraph_mode),
)
return (
cudagraph_mode,
batch_descriptor,
ubatch_slices,
num_tokens_across_dp,
cudagraph_stats,
)
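
A small standalone sketch (hypothetical helper, not in the diff) of how the three token counts in CUDAGraphStat relate, assuming the runner pads a batch up to the smallest captured size that fits it, which is what the padded BatchDescriptor reflects:

def sketch_cudagraph_stat(num_tokens: int, capture_sizes: list[int]) -> dict[str, int]:
    # Pad up to the nearest capture size; fall back to the raw count if the
    # batch exceeds every captured size (eager execution, no padding).
    padded = next((s for s in sorted(capture_sizes) if s >= num_tokens), num_tokens)
    return {
        "num_unpadded_tokens": num_tokens,
        "num_padded_tokens": padded,
        "num_paddings": padded - num_tokens,
    }

# With capture sizes [1, 2, 4, 8, 16], a 5-token batch pads to 8:
# {"num_unpadded_tokens": 5, "num_padded_tokens": 8, "num_paddings": 3}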
@torch.inference_mode()
def execute_model(
@@ -2918,6 +2938,7 @@
batch_desc,
ubatch_slices,
num_tokens_across_dp,
cudagraph_stats,
) = self._determine_batch_execution_and_padding(
num_tokens=num_tokens_unpadded,
num_reqs=num_reqs,
@@ -3067,6 +3088,7 @@
sample_hidden_states,
aux_hidden_states,
ec_connector_output,
cudagraph_stats,
)
self.kv_connector_output = kv_connector_output
return None
@@ -3102,6 +3124,7 @@
sample_hidden_states,
aux_hidden_states,
ec_connector_output,
cudagraph_stats,
) = self.execute_model_state
# Clear ephemeral state.
self.execute_model_state = None
@@ -3217,6 +3240,7 @@
if self.supports_mm_inputs
else None,
num_nans_in_logits=num_nans_in_logits,
cudagraph_stats=cudagraph_stats,
)
if not self.use_async_scheduling:
@@ -3937,7 +3961,7 @@
num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
_cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp = (
_cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp, _ = (
self._determine_batch_execution_and_padding(
num_tokens=num_tokens_unpadded,
num_reqs=num_reqs,

View File

@@ -564,7 +564,7 @@
# TODO(lucas): This is pretty gross; ideally we should only ever call
# `_determine_batch_execution_and_padding` once (will get called again
# in `execute_model`) but this requires a larger refactor of PP.
_, batch_desc, _, _ = (
_, batch_desc, _, _, _ = (
self.model_runner._determine_batch_execution_and_padding(
num_tokens=num_scheduled_tokens,
num_reqs=len(num_scheduled_tokens_np),