diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index a4e1aca8bcac..0c9e0f3a5142 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -232,6 +232,9 @@ EXPECTED_METRICS_V1 = [
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
     "vllm:gpu_prefix_cache_hits",
+    "vllm:kv_cache_usage_perc",
+    "vllm:prefix_cache_queries",
+    "vllm:prefix_cache_hits",
     "vllm:num_preemptions_total",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
@@ -277,6 +280,9 @@ EXPECTED_METRICS_V1 = [
 ]

 HIDDEN_DEPRECATED_METRICS: list[str] = [
+    "vllm:gpu_cache_usage_perc",
+    "vllm:gpu_prefix_cache_queries",
+    "vllm:gpu_prefix_cache_hits",
     "vllm:time_per_output_token_seconds_sum",
     "vllm:time_per_output_token_seconds_bucket",
     "vllm:time_per_output_token_seconds_count",
@@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
                                    client: openai.AsyncClient, use_v1: bool):

     running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server))
+        _get_running_metrics_from_api(server, use_v1))

     # Expect no running requests or kvcache usage
     assert running_requests == 0
@@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,

     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server))
+        _get_running_metrics_from_api(server, use_v1))

     # Expect running requests and kvcache usage
     assert running_requests > 0
@@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,

     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server))
+        _get_running_metrics_from_api(server, use_v1))

     assert running_requests_after == 0,\
         (f"Expected 0 running requests after abort, got "
@@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
          f"{kv_cache_usage_after}")


-def _get_running_metrics_from_api(server: RemoteOpenAIServer):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     """Return (running_count, waiting_count, kv_cache_usage)"""

     response = requests.get(server.url_for("metrics"))
@@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None

+    kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
+                             if use_v1 else "vllm:gpu_cache_usage_perc")
+
     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
             for sample in family.samples:
@@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
                 if sample.name == "vllm:num_requests_waiting":
                     waiting_requests = sample.value
                     break
-        elif family.name == "vllm:gpu_cache_usage_perc":
+        elif family.name == kv_cache_usage_metric:
             for sample in family.samples:
-                if sample.name == "vllm:gpu_cache_usage_perc":
+                if sample.name == kv_cache_usage_metric:
                     kv_cache_usage = sample.value
                     break

diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 3f4699281cce..b30036a6f8e8 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -202,40 +202,46 @@ class PrometheusStatLogger(StatLoggerBase):
         #
         # GPU cache
         #
-        # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
-        # TODO: in 0.10, only enable if show_hidden_metrics=True
-        gauge_gpu_cache_usage = self._gauge_cls(
-            name="vllm:gpu_cache_usage_perc",
-            documentation=(
-                "GPU KV-cache usage. 1 means 100 percent usage."
-                "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
-            multiprocess_mode="mostrecent",
-            labelnames=labelnames)
-        self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage,
-                                                     engine_indexes,
-                                                     model_name)
+        # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
+        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
+        # TODO: remove in 0.12.0
+        if self.show_hidden_metrics:
+            gauge_gpu_cache_usage = self._gauge_cls(
+                name="vllm:gpu_cache_usage_perc",
+                documentation=(
+                    "GPU KV-cache usage. 1 means 100 percent usage."
+                    "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
+                multiprocess_mode="mostrecent",
+                labelnames=labelnames)
+            self.gauge_gpu_cache_usage = make_per_engine(
+                gauge_gpu_cache_usage, engine_indexes, model_name)

-        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
-        # TODO: in 0.10, only enable if show_hidden_metrics=True
-        counter_gpu_prefix_cache_queries = self._counter_cls(
-            name="vllm:gpu_prefix_cache_queries",
-            documentation=(
-                "GPU prefix cache queries, in terms of number of queried"
-                "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."),
-            labelnames=labelnames)
-        self.counter_gpu_prefix_cache_queries = make_per_engine(
-            counter_gpu_prefix_cache_queries, engine_indexes, model_name)
+        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
+        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
+        # TODO: remove in 0.12.0
+        if self.show_hidden_metrics:
+            counter_gpu_prefix_cache_queries = self._counter_cls(
+                name="vllm:gpu_prefix_cache_queries",
+                documentation=(
+                    "GPU prefix cache queries, in terms of number of queried"
+                    "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
+                ),
+                labelnames=labelnames)
+            self.counter_gpu_prefix_cache_queries = make_per_engine(
+                counter_gpu_prefix_cache_queries, engine_indexes, model_name)

-        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
-        # TODO: in 0.10, only enable if show_hidden_metrics=True
-        counter_gpu_prefix_cache_hits = self._counter_cls(
-            name="vllm:gpu_prefix_cache_hits",
-            documentation=(
-                "GPU prefix cache hits, in terms of number of cached "
-                "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
-            labelnames=labelnames)
-        self.counter_gpu_prefix_cache_hits = make_per_engine(
-            counter_gpu_prefix_cache_hits, engine_indexes, model_name)
+        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
+        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
+        # TODO: remove in 0.12.0
+        if self.show_hidden_metrics:
+            counter_gpu_prefix_cache_hits = self._counter_cls(
+                name="vllm:gpu_prefix_cache_hits",
+                documentation=(
+                    "GPU prefix cache hits, in terms of number of cached "
+                    "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
+                labelnames=labelnames)
+            self.counter_gpu_prefix_cache_hits = make_per_engine(
+                counter_gpu_prefix_cache_hits, engine_indexes, model_name)

         gauge_kv_cache_usage = self._gauge_cls(
             name="vllm:kv_cache_usage_perc",
@@ -509,15 +515,17 @@ class PrometheusStatLogger(StatLoggerBase):
             self.gauge_scheduler_waiting[engine_idx].set(
                 scheduler_stats.num_waiting_reqs)

-            self.gauge_gpu_cache_usage[engine_idx].set(
-                scheduler_stats.kv_cache_usage)
+            if self.show_hidden_metrics:
+                self.gauge_gpu_cache_usage[engine_idx].set(
+                    scheduler_stats.kv_cache_usage)
             self.gauge_kv_cache_usage[engine_idx].set(
                 scheduler_stats.kv_cache_usage)

-            self.counter_gpu_prefix_cache_queries[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.queries)
-            self.counter_gpu_prefix_cache_hits[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.hits)
+            if self.show_hidden_metrics:
+                self.counter_gpu_prefix_cache_queries[engine_idx].inc(
+                    scheduler_stats.prefix_cache_stats.queries)
+                self.counter_gpu_prefix_cache_hits[engine_idx].inc(
+                    scheduler_stats.prefix_cache_stats.hits)

             self.counter_prefix_cache_queries[engine_idx].inc(
                 scheduler_stats.prefix_cache_stats.queries)
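
The gating above only changes what the Prometheus endpoint exposes, so it can also be sanity-checked from outside the test suite. Below is a minimal sketch, not part of this PR, that reports which of the renamed and deprecated cache metric families a running vLLM OpenAI server currently serves. The script name, BASE_URL, and the helper function are assumptions for illustration; the expectation that the deprecated names only appear when the server was started with --show-hidden-metrics-for-version=0.10 follows from the logger change above.

# check_cache_metrics.py -- hypothetical helper, not part of the PR.
# Queries the /metrics endpoint and reports which of the new and deprecated
# KV-cache / prefix-cache metric families are exposed. Assumes a vLLM
# OpenAI-compatible server is already running at BASE_URL.
import requests
from prometheus_client.parser import text_string_to_metric_families

BASE_URL = "http://localhost:8000"  # assumption: adjust to your deployment

NEW_METRICS = {
    "vllm:kv_cache_usage_perc",
    "vllm:prefix_cache_queries",
    "vllm:prefix_cache_hits",
}
DEPRECATED_METRICS = {
    "vllm:gpu_cache_usage_perc",
    "vllm:gpu_prefix_cache_queries",
    "vllm:gpu_prefix_cache_hits",
}


def exposed_families(base_url: str = BASE_URL) -> set[str]:
    """Return the metric family names served at /metrics."""
    text = requests.get(f"{base_url}/metrics").text
    return {family.name for family in text_string_to_metric_families(text)}


if __name__ == "__main__":
    names = exposed_families()
    print("new metrics present:       ", sorted(NEW_METRICS & names))
    # Expected to be empty unless the server was started with
    # --show-hidden-metrics-for-version=0.10.
    print("deprecated metrics present:", sorted(DEPRECATED_METRICS & names))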