[Misc] Change buckets of histogram_iteration_tokens to [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096] to represent number of tokens (#17033)

Signed-off-by: sfc-gh-zhwang <flex.wang@snowflake.com>
2026-01-18 19:04:31 +08:00 · 2025-04-27 05:30:53 -07:00 · 2025-04-27 05:30:53 -07:00 · 18445edd0f
commit 18445edd0f
parent 30215ca61f
2 changed files with 7 additions and 17 deletions
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@ -140,16 +140,13 @@ class Metrics:
            name="vllm:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames)
-        buckets = [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]
-        if not vllm_config.model_config.enforce_eager:
-            buckets = vllm_config.compilation_config.\
-                cudagraph_capture_sizes.copy()
-            buckets.sort()
        self.histogram_iteration_tokens = self._histogram_cls(
            name="vllm:iteration_tokens_total",
            documentation="Histogram of number of tokens per engine_step.",
            labelnames=labelnames,
-            buckets=buckets)
+            buckets=[
+                1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096, 16192
+            ])
        self.histogram_time_to_first_token = self._histogram_cls(
            name="vllm:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@ -232,7 +232,10 @@ class PrometheusStatLogger(StatLoggerBase):
            prometheus_client.Histogram(
                name="vllm:iteration_tokens_total",
                documentation="Histogram of number of tokens per engine_step.",
-                buckets=build_cudagraph_buckets(vllm_config),
+                buckets=[
+                    1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096,
+                    16192
+                ],
                labelnames=labelnames).labels(*labelvalues)

        self.histogram_max_num_generation_tokens_request = \
@ -467,16 +470,6 @@ def build_1_2_5_buckets(max_value: int) -> list[int]:
    return build_buckets([1, 2, 5], max_value)


-def build_cudagraph_buckets(vllm_config: VllmConfig) -> list[int]:
-    if not vllm_config.model_config.enforce_eager:
-        buckets = vllm_config.compilation_config.\
-            cudagraph_capture_sizes.copy()
-        buckets.sort()
-        return buckets
-    else:
-        return [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]
-
-
 def setup_default_loggers(
    vllm_config: VllmConfig,
    log_stats: bool,