diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md
index 9ad7253184d9..6603aa83b4af 100644
--- a/docs/usage/metrics.md
+++ b/docs/usage/metrics.md
@@ -35,19 +35,6 @@ The following metrics are exposed:
     --8<-- "vllm/engine/metrics.py:metrics-definitions"
 ```

-The following metrics are deprecated and due to be removed in a future version:
-
-- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and
-  `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
-  used in V1.
-- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
-  counters in V1.
-- `vllm:time_in_queue_requests` because it duplicates
-  `vllm:request_queue_time_seconds`.
-- `vllm:model_forward_time_milliseconds` and
-  `vllm:model_execute_time_milliseconds` because
-  prefill/decode/inference time metrics should be used instead.
-
 Note: when metrics are deprecated in version `X.Y`, they are hidden in version
 `X.Y+1` but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y`
 escape hatch, and are then removed in version `X.Y+2`.
diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json
index fbe96b48e799..3488956a5b24 100644
--- a/examples/online_serving/prometheus_grafana/grafana.json
+++ b/examples/online_serving/prometheus_grafana/grafana.json
@@ -577,23 +577,6 @@
           "refId": "A",
           "useBackend": false
         },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "disableTextWrap": false,
-          "editorMode": "builder",
-          "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
-          "fullMetaSearch": false,
-          "hide": false,
-          "includeNullMetadata": true,
-          "instant": false,
-          "legendFormat": "Num Swapped",
-          "range": true,
-          "refId": "B",
-          "useBackend": false
-        },
         {
           "datasource": {
             "type": "prometheus",
@@ -874,19 +857,6 @@
           "legendFormat": "GPU Cache Usage",
           "range": true,
           "refId": "A"
-        },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "editorMode": "code",
-          "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
-          "hide": false,
-          "instant": false,
-          "legendFormat": "CPU Cache Usage",
-          "range": true,
-          "refId": "B"
         }
       ],
       "title": "Cache Utilization",
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 42f7b098f917..b21c0173c7b8 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,

 EXPECTED_METRICS = [
     "vllm:num_requests_running",
-    "vllm:num_requests_swapped",  # deprecated
     "vllm:num_requests_waiting",
     "vllm:gpu_cache_usage_perc",
-    "vllm:cpu_cache_usage_perc",  # deprecated
     "vllm:time_to_first_token_seconds_sum",
     "vllm:time_to_first_token_seconds_bucket",
     "vllm:time_to_first_token_seconds_count",
@@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_decode_time_seconds_count",
 ]

-HIDDEN_DEPRECATED_METRICS = [
-    "vllm:num_requests_swapped",
-    "vllm:cpu_cache_usage_perc",
-]
+HIDDEN_DEPRECATED_METRICS: list[str] = []


 @pytest.mark.asyncio
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index ff33d566ab68..a9600a2c8aa3 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1680,9 +1680,6 @@ class LLMEngine:
         time_inference_requests: List[float] = []
         time_prefill_requests: List[float] = []
         time_decode_requests: List[float] = []
-        time_in_queue_requests: List[float] = []
-        model_forward_time_requests: List[float] = []
-        model_execute_time_requests: List[float] = []
         # Metadata
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
@@ -1790,15 +1787,6 @@ class LLMEngine:
                         now - seq_group.metrics.first_token_time)
                     time_inference_requests.append(
                         now - seq_group.metrics.first_scheduled_time)
-                if seq_group.metrics.time_in_queue is not None:
-                    time_in_queue_requests.append(
-                        seq_group.metrics.time_in_queue)
-                if seq_group.metrics.model_forward_time is not None:
-                    model_forward_time_requests.append(
-                        seq_group.metrics.model_forward_time)
-                if seq_group.metrics.model_execute_time is not None:
-                    model_execute_time_requests.append(
-                        seq_group.metrics.model_execute_time * 1000)
                 # Metadata
                 num_prompt_tokens_requests.append(
                     len(seq_group.prompt_token_ids))
@@ -1867,9 +1855,6 @@ class LLMEngine:
            time_inference_requests=time_inference_requests,
            time_prefill_requests=time_prefill_requests,
            time_decode_requests=time_decode_requests,
-            time_in_queue_requests=time_in_queue_requests,
-            model_forward_time_requests=model_forward_time_requests,
-            model_execute_time_requests=model_execute_time_requests,
            # Metadata
            num_prompt_tokens_requests=num_prompt_tokens_requests,
            num_generation_tokens_requests=num_generation_tokens_requests,
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 34b48f83b643..916afe0c8e5f 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -80,17 +80,6 @@ class Metrics:
             multiprocess_mode="livemostrecent",
         )

-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_scheduler_swapped = self._gauge_cls(
-                name="vllm:num_requests_swapped",
-                documentation=(
-                    "Number of requests swapped to CPU. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         # KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
@@ -98,35 +87,6 @@ class Metrics:
             labelnames=labelnames,
             multiprocess_mode="sum")

-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_cpu_cache_usage = self._gauge_cls(
-                name="vllm:cpu_cache_usage_perc",
-                documentation=(
-                    "CPU KV-cache usage. 1 means 100 percent usage. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:cpu_prefix_cache_hit_rate",
-                documentation=(
-                    "CPU prefix cache block hit rate. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
-        # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:gpu_prefix_cache_hit_rate",
-                documentation=("GPU prefix cache block hit rate. "
-                               "DEPRECATED: use vllm:gpu_prefix_cache_queries "
-                               "and vllm:gpu_prefix_cache_queries in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         # Iteration stats
         self.counter_num_preemption = self._counter_cls(
             name="vllm:num_preemptions_total",
@@ -200,36 +160,6 @@ class Metrics:
             "Histogram of time spent in DECODE phase for request.",
             labelnames=labelnames,
             buckets=request_latency_buckets)
-        # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_time_in_queue_request = self._histogram_cls(
-                name="vllm:time_in_queue_requests",
-                documentation=
-                ("Histogram of time the request spent in the queue in seconds. "
-                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
-                labelnames=labelnames,
-                buckets=request_latency_buckets)
-
-        # Deprecated in 0.8 - use prefill/decode/inference time metrics
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_model_forward_time_request = self._histogram_cls(
-                name="vllm:model_forward_time_milliseconds",
-                documentation=
-                ("Histogram of time spent in the model forward pass in ms. "
-                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-                 ),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))
-            self.histogram_model_execute_time_request = self._histogram_cls(
-                name="vllm:model_execute_time_milliseconds",
-                documentation=
-                ("Histogram of time spent in the model execute function in ms."
-                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-                 ),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))

         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
@@ -580,20 +510,10 @@ class PrometheusStatLogger(StatLoggerBase):
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
                         stats.num_running_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_scheduler_swapped,
-                            stats.num_swapped_sys)
         self._log_gauge(self.metrics.gauge_scheduler_waiting,
                         stats.num_waiting_sys)
         self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                         stats.gpu_cache_usage_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-                            stats.cpu_cache_usage_sys)
-            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                            stats.cpu_prefix_cache_hit_rate)
-            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                            stats.gpu_prefix_cache_hit_rate)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {
@@ -631,15 +551,6 @@ class PrometheusStatLogger(StatLoggerBase):
                             stats.time_prefill_requests)
         self._log_histogram(self.metrics.histogram_decode_time_request,
                             stats.time_decode_requests)
-        if self.metrics.show_hidden_metrics:
-            self._log_histogram(self.metrics.histogram_time_in_queue_request,
-                                stats.time_in_queue_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_forward_time_request,
-                stats.model_forward_time_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_execute_time_request,
-                stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 9e6d5ef29bed..acc83011d6c8 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -53,9 +53,6 @@ class Stats:
     time_inference_requests: List[float]
     time_prefill_requests: List[float]
     time_decode_requests: List[float]
-    time_in_queue_requests: List[float]
-    model_forward_time_requests: List[float]
-    model_execute_time_requests: List[float]
     # Metadata
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]
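
For reviewers who want to spot-check this change against a live deployment, the minimal sketch below scrapes the Prometheus endpoint and confirms that the removed metric names no longer appear while the surviving gauges still do. It is illustrative only, not part of the patch: it assumes an OpenAI-compatible vLLM server is already listening on `localhost:8000`, and the `check_metrics` helper name is made up for this example.

```python
# Minimal sketch: verify the /metrics endpoint of a running vLLM server no
# longer exposes the deprecated V0 metrics removed by this change.
# Assumption: a server started with `vllm serve ...` is reachable at base_url.
import urllib.request

REMOVED_METRICS = [
    "vllm:num_requests_swapped",
    "vllm:cpu_cache_usage_perc",
    "vllm:cpu_prefix_cache_hit_rate",
    "vllm:gpu_prefix_cache_hit_rate",
    "vllm:time_in_queue_requests",
    "vllm:model_forward_time_milliseconds",
    "vllm:model_execute_time_milliseconds",
]

STILL_EXPECTED = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:gpu_cache_usage_perc",
]


def check_metrics(base_url: str = "http://localhost:8000") -> None:
    """Fetch the Prometheus text exposition and check metric names."""
    with urllib.request.urlopen(f"{base_url}/metrics") as resp:
        body = resp.read().decode()
    for name in REMOVED_METRICS:
        assert name not in body, f"deprecated metric still exposed: {name}"
    for name in STILL_EXPECTED:
        assert name in body, f"expected metric missing: {name}"
    print("metrics endpoint is consistent with the removal")


if __name__ == "__main__":
    check_metrics()
```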