From 18445edd0f19b3d734315f968ed9a554937aab20 Mon Sep 17 00:00:00 2001 From: Flex Wang Date: Sun, 27 Apr 2025 05:30:53 -0700 Subject: [PATCH] [Misc] Change buckets of histogram_iteration_tokens to [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096, 16192] to represent number of tokens (#17033) Signed-off-by: sfc-gh-zhwang --- vllm/engine/metrics.py | 9 +++------ vllm/v1/metrics/loggers.py | 15 ++++----------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 7c4265fac20b0..0f79b7e79d384 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -140,16 +140,13 @@ class Metrics: name="vllm:generation_tokens_total", documentation="Number of generation tokens processed.", labelnames=labelnames) - buckets = [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096] - if not vllm_config.model_config.enforce_eager: - buckets = vllm_config.compilation_config.\ - cudagraph_capture_sizes.copy() - buckets.sort() self.histogram_iteration_tokens = self._histogram_cls( name="vllm:iteration_tokens_total", documentation="Histogram of number of tokens per engine_step.", labelnames=labelnames, - buckets=buckets) + buckets=[ + 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096, 16192 + ]) self.histogram_time_to_first_token = self._histogram_cls( name="vllm:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 22d1d9724c8cd..e2e0b305e81fa 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -232,7 +232,10 @@ class PrometheusStatLogger(StatLoggerBase): prometheus_client.Histogram( name="vllm:iteration_tokens_total", documentation="Histogram of number of tokens per engine_step.", - buckets=build_cudagraph_buckets(vllm_config), + buckets=[ + 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096, + 16192 + ], labelnames=labelnames).labels(*labelvalues) 
self.histogram_max_num_generation_tokens_request = \ @@ -467,16 +470,6 @@ def build_1_2_5_buckets(max_value: int) -> list[int]: return build_buckets([1, 2, 5], max_value) -def build_cudagraph_buckets(vllm_config: VllmConfig) -> list[int]: - if not vllm_config.model_config.enforce_eager: - buckets = vllm_config.compilation_config.\ - cudagraph_capture_sizes.copy() - buckets.sort() - return buckets - else: - return [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096] - - def setup_default_loggers( vllm_config: VllmConfig, log_stats: bool,