From 9cf4edae6ef2c972429560ca8f72d40688d10495 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin
Date: Tue, 25 Nov 2025 03:15:13 +0000
Subject: [PATCH] [Metrics] Scheduled removal of deprecated metrics (#29330)

Signed-off-by: Mark McLoughlin
---
 tests/entrypoints/openai/test_metrics.py |   3 -
 vllm/v1/metrics/loggers.py               | 134 +++++++----------------
 2 files changed, 37 insertions(+), 100 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 4e7b765d7713f..65a6fd20bd0d1 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -183,9 +183,6 @@ async def test_metrics_counts(
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
-    "vllm:gpu_cache_usage_perc",
-    "vllm:gpu_prefix_cache_queries",
-    "vllm:gpu_prefix_cache_hits",
     "vllm:kv_cache_usage_perc",
     "vllm:prefix_cache_queries",
     "vllm:prefix_cache_hits",
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index e2d82241ce210..bd18a152ffc08 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -440,57 +440,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
         # Setting default values
         self.record_sleep_state()
 
-        # GPU cache
-        #
-        # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
-        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
-        # TODO: remove in 0.12.0
-        if self.show_hidden_metrics:
-            gauge_gpu_cache_usage = self._gauge_cls(
-                name="vllm:gpu_cache_usage_perc",
-                documentation=(
-                    "GPU KV-cache usage. 1 means 100 percent usage."
-                    "DEPRECATED: Use vllm:kv_cache_usage_perc instead."
-                ),
-                multiprocess_mode="mostrecent",
-                labelnames=labelnames,
-            )
-            self.gauge_gpu_cache_usage = make_per_engine(
-                gauge_gpu_cache_usage, engine_indexes, model_name
-            )
-
-        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
-        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
-        # TODO: remove in 0.12.0
-        if self.show_hidden_metrics:
-            counter_gpu_prefix_cache_queries = self._counter_cls(
-                name="vllm:gpu_prefix_cache_queries",
-                documentation=(
-                    "GPU prefix cache queries, in terms of number of queried"
-                    "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
-                ),
-                labelnames=labelnames,
-            )
-            self.counter_gpu_prefix_cache_queries = make_per_engine(
-                counter_gpu_prefix_cache_queries, engine_indexes, model_name
-            )
-
-        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
-        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
-        # TODO: remove in 0.12.0
-        if self.show_hidden_metrics:
-            counter_gpu_prefix_cache_hits = self._counter_cls(
-                name="vllm:gpu_prefix_cache_hits",
-                documentation=(
-                    "GPU prefix cache hits, in terms of number of cached "
-                    "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."
-                ),
-                labelnames=labelnames,
-            )
-            self.counter_gpu_prefix_cache_hits = make_per_engine(
-                counter_gpu_prefix_cache_hits, engine_indexes, model_name
-            )
-
         gauge_kv_cache_usage = self._gauge_cls(
             name="vllm:kv_cache_usage_perc",
             documentation="KV-cache usage. 1 means 100 percent usage.",
@@ -735,39 +684,41 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
         )
 
         # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
-        # TODO: in 0.12, only enable if show_hidden_metrics=True
-        histogram_time_per_output_token = self._histogram_cls(
-            name="vllm:time_per_output_token_seconds",
-            documentation=(
-                "Histogram of time per output token in seconds."
- "DEPRECATED: Use vllm:inter_token_latency_seconds instead." - ), - buckets=[ - 0.01, - 0.025, - 0.05, - 0.075, - 0.1, - 0.15, - 0.2, - 0.3, - 0.4, - 0.5, - 0.75, - 1.0, - 2.5, - 5.0, - 7.5, - 10.0, - 20.0, - 40.0, - 80.0, - ], - labelnames=labelnames, - ) - self.histogram_time_per_output_token = make_per_engine( - histogram_time_per_output_token, engine_indexes, model_name - ) + # With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11 + # TODO: remove in 0.13.0 + if self.show_hidden_metrics: + histogram_time_per_output_token = self._histogram_cls( + name="vllm:time_per_output_token_seconds", + documentation=( + "Histogram of time per output token in seconds." + "DEPRECATED: Use vllm:inter_token_latency_seconds instead." + ), + buckets=[ + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.15, + 0.2, + 0.3, + 0.4, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 20.0, + 40.0, + 80.0, + ], + labelnames=labelnames, + ) + self.histogram_time_per_output_token = make_per_engine( + histogram_time_per_output_token, engine_indexes, model_name + ) histogram_inter_token_latency = self._histogram_cls( name="vllm:inter_token_latency_seconds", @@ -966,20 +917,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.gauge_scheduler_waiting[engine_idx].set( scheduler_stats.num_waiting_reqs ) - if self.show_hidden_metrics: - self.gauge_gpu_cache_usage[engine_idx].set( - scheduler_stats.kv_cache_usage - ) self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage) - if self.show_hidden_metrics: - self.counter_gpu_prefix_cache_queries[engine_idx].inc( - scheduler_stats.prefix_cache_stats.queries - ) - self.counter_gpu_prefix_cache_hits[engine_idx].inc( - scheduler_stats.prefix_cache_stats.hits - ) - self.counter_prefix_cache_queries[engine_idx].inc( scheduler_stats.prefix_cache_stats.queries ) @@ -1050,7 +989,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.histogram_time_to_first_token[engine_idx].observe(ttft) for itl in iteration_stats.inter_token_latencies_iter: self.histogram_inter_token_latency[engine_idx].observe(itl) - self.histogram_time_per_output_token[engine_idx].observe(itl) + if self.show_hidden_metrics: + self.histogram_time_per_output_token[engine_idx].observe(itl) for finished_request in iteration_stats.finished_requests: self.counter_request_success[finished_request.finish_reason][