diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index b2a1a7d3f931d..0288fe2f2197b 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -4,7 +4,7 @@ import logging import time from abc import ABC, abstractmethod -from typing import Callable, Optional +from typing import Callable, Optional, Union import numpy as np import prometheus_client @@ -153,15 +153,15 @@ class PrometheusStatLogger(StatLoggerBase): # unregister_vllm_metrics() self.vllm_config = vllm_config - self.engine_indexes = range(engine_num) # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future self.show_hidden_metrics = \ vllm_config.observability_config.show_hidden_metrics labelnames = ["model_name", "engine"] - model_name = vllm_config.model_config.served_model_name, + model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len + engine_indexes = list(range(engine_num)) # self.spec_decoding_prom = self._spec_decoding_cls( # vllm_config.speculative_config, labelnames, labelvalues) @@ -169,133 +169,112 @@ class PrometheusStatLogger(StatLoggerBase): # # Scheduler state # - self.gauge_scheduler_running = { - idx: - self._gauge_cls( - name="vllm:num_requests_running", - documentation="Number of requests in model execution batches.", - multiprocess_mode="mostrecent", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + gauge_scheduler_running = self._gauge_cls( + name="vllm:num_requests_running", + documentation="Number of requests in model execution batches.", + multiprocess_mode="mostrecent", + labelnames=labelnames) + self.gauge_scheduler_running = make_per_engine(gauge_scheduler_running, + engine_indexes, + model_name) - self.gauge_scheduler_waiting = { - idx: - self._gauge_cls( - name="vllm:num_requests_waiting", - documentation="Number of requests waiting to be processed.", - multiprocess_mode="mostrecent", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + gauge_scheduler_waiting = self._gauge_cls( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + multiprocess_mode="mostrecent", + labelnames=labelnames) + self.gauge_scheduler_waiting = make_per_engine(gauge_scheduler_waiting, + engine_indexes, + model_name) # # GPU cache # # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc # TODO: in 0.10, only enable if show_hidden_metrics=True - self.gauge_gpu_cache_usage = { - idx: - self._gauge_cls( - name="vllm:gpu_cache_usage_perc", - documentation=( - "GPU KV-cache usage. 1 means 100 percent usage." - "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), - multiprocess_mode="mostrecent", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + gauge_gpu_cache_usage = self._gauge_cls( + name="vllm:gpu_cache_usage_perc", + documentation=( + "GPU KV-cache usage. 1 means 100 percent usage." + "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), + multiprocess_mode="mostrecent", + labelnames=labelnames) + self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage, + engine_indexes, + model_name) # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries # TODO: in 0.10, only enable if show_hidden_metrics=True - self.counter_gpu_prefix_cache_queries = { - idx: - self._counter_cls( - name="vllm:gpu_prefix_cache_queries", - documentation=( - "GPU prefix cache queries, in terms of number of queried" - "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead." - ), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + counter_gpu_prefix_cache_queries = self._counter_cls( + name="vllm:gpu_prefix_cache_queries", + documentation=( + "GPU prefix cache queries, in terms of number of queried" + "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."), + labelnames=labelnames) + self.counter_gpu_prefix_cache_queries = make_per_engine( + counter_gpu_prefix_cache_queries, engine_indexes, model_name) # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits # TODO: in 0.10, only enable if show_hidden_metrics=True - self.counter_gpu_prefix_cache_hits = { - idx: - self._counter_cls( - name="vllm:gpu_prefix_cache_hits", - documentation=( - "GPU prefix cache hits, in terms of number of cached " - "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + counter_gpu_prefix_cache_hits = self._counter_cls( + name="vllm:gpu_prefix_cache_hits", + documentation=( + "GPU prefix cache hits, in terms of number of cached " + "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), + labelnames=labelnames) + self.counter_gpu_prefix_cache_hits = make_per_engine( + counter_gpu_prefix_cache_hits, engine_indexes, model_name) - self.gauge_kv_cache_usage = { - idx: - self._gauge_cls( - name="vllm:kv_cache_usage_perc", - documentation="KV-cache usage. 1 means 100 percent usage.", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + gauge_kv_cache_usage = self._gauge_cls( + name="vllm:kv_cache_usage_perc", + documentation="KV-cache usage. 1 means 100 percent usage.", + labelnames=labelnames) + self.gauge_kv_cache_usage = make_per_engine(gauge_kv_cache_usage, + engine_indexes, model_name) - self.counter_prefix_cache_queries = { - idx: - self._counter_cls( - name="vllm:prefix_cache_queries", - documentation= - ("Prefix cache queries, in terms of number of queried tokens." - ), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + counter_prefix_cache_queries = self._counter_cls( + name="vllm:prefix_cache_queries", + documentation=( + "Prefix cache queries, in terms of number of queried tokens."), + labelnames=labelnames) + self.counter_prefix_cache_queries = make_per_engine( + counter_prefix_cache_queries, engine_indexes, model_name) - self.counter_prefix_cache_hits = { - idx: - self._counter_cls( - name="vllm:prefix_cache_hits", - documentation=( - "Prefix cache hits, in terms of number of cached tokens."), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + counter_prefix_cache_hits = self._counter_cls( + name="vllm:prefix_cache_hits", + documentation=( + "Prefix cache hits, in terms of number of cached tokens."), + labelnames=labelnames) + self.counter_prefix_cache_hits = make_per_engine( + counter_prefix_cache_hits, engine_indexes, model_name) # # Counters # - self.counter_num_preempted_reqs = { - idx: - self._counter_cls( - name="vllm:num_preemptions", - documentation= - "Cumulative number of preemption from the engine.", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + counter_num_preempted_reqs = self._counter_cls( + name="vllm:num_preemptions", + documentation="Cumulative number of preemption from the engine.", + labelnames=labelnames) + self.counter_num_preempted_reqs = make_per_engine( + counter_num_preempted_reqs, engine_indexes, model_name) - self.counter_prompt_tokens = { - idx: - self._counter_cls( - name="vllm:prompt_tokens", - documentation="Number of prefill tokens processed.", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + counter_prompt_tokens = self._counter_cls( + name="vllm:prompt_tokens", + documentation="Number of prefill tokens processed.", + labelnames=labelnames) + self.counter_prompt_tokens = make_per_engine(counter_prompt_tokens, + engine_indexes, + model_name) - self.counter_generation_tokens = { - idx: - self._counter_cls( - name="vllm:generation_tokens", - documentation="Number of generation tokens processed.", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + counter_generation_tokens = self._counter_cls( + name="vllm:generation_tokens", + documentation="Number of generation tokens processed.", + labelnames=labelnames) + self.counter_generation_tokens = make_per_engine( + counter_generation_tokens, engine_indexes, model_name) - self.counter_request_success: dict[FinishReason, - prometheus_client.Counter] = {} + self.counter_request_success: dict[FinishReason, dict[ + int, prometheus_client.Counter]] = {} counter_request_success_base = self._counter_cls( name="vllm:request_success", documentation="Count of successfully processed requests.", @@ -305,166 +284,141 @@ class PrometheusStatLogger(StatLoggerBase): idx: counter_request_success_base.labels(model_name, str(idx), str(reason)) - for idx in self.engine_indexes + for idx in engine_indexes } # # Histograms of counts # - self.histogram_num_prompt_tokens_request = { - idx: - self._histogram_cls( - name="vllm:request_prompt_tokens", - documentation="Number of prefill tokens processed.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_num_prompt_tokens_request = self._histogram_cls( + name="vllm:request_prompt_tokens", + documentation="Number of prefill tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_num_prompt_tokens_request = make_per_engine( + histogram_num_prompt_tokens_request, engine_indexes, model_name) - self.histogram_num_generation_tokens_request = { - idx: - self._histogram_cls( - name="vllm:request_generation_tokens", - documentation="Number of generation tokens processed.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_generation_tokens", + documentation="Number of generation tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_num_generation_tokens_request = make_per_engine( + histogram_num_generation_tokens_request, engine_indexes, + model_name) # TODO: This metric might be incorrect in case of using multiple # api_server counts which uses prometheus mp. # See: https://github.com/vllm-project/vllm/pull/18053 - self.histogram_iteration_tokens = { - idx: - self._histogram_cls( - name="vllm:iteration_tokens_total", - documentation="Histogram of number of tokens per engine_step.", - buckets=[ - 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, - 16384 - ], - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_iteration_tokens = self._histogram_cls( + name="vllm:iteration_tokens_total", + documentation="Histogram of number of tokens per engine_step.", + buckets=[ + 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 + ], + labelnames=labelnames) + self.histogram_iteration_tokens = make_per_engine( + histogram_iteration_tokens, engine_indexes, model_name) - self.histogram_max_num_generation_tokens_request = { - idx: - self._histogram_cls( - name="vllm:request_max_num_generation_tokens", - documentation= - "Histogram of maximum number of requested generation tokens.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_max_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_max_num_generation_tokens", + documentation= + "Histogram of maximum number of requested generation tokens.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_max_num_generation_tokens_request = make_per_engine( + histogram_max_num_generation_tokens_request, engine_indexes, + model_name) - self.histogram_n_request = { - idx: - self._histogram_cls( - name="vllm:request_params_n", - documentation="Histogram of the n request parameter.", - buckets=[1, 2, 5, 10, 20], - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_n_request = self._histogram_cls( + name="vllm:request_params_n", + documentation="Histogram of the n request parameter.", + buckets=[1, 2, 5, 10, 20], + labelnames=labelnames) + self.histogram_n_request = make_per_engine(histogram_n_request, + engine_indexes, model_name) - self.histogram_max_tokens_request = { - idx: - self._histogram_cls( - name="vllm:request_params_max_tokens", - documentation="Histogram of the max_tokens request parameter.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_max_tokens_request = self._histogram_cls( + name="vllm:request_params_max_tokens", + documentation="Histogram of the max_tokens request parameter.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_max_tokens_request = make_per_engine( + histogram_max_tokens_request, engine_indexes, model_name) # # Histogram of timing intervals # - self.histogram_time_to_first_token = { - idx: - self._histogram_cls( - name="vllm:time_to_first_token_seconds", - documentation="Histogram of time to first token in seconds.", - buckets=[ - 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, - 640.0, 2560.0 - ], - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_time_to_first_token = self._histogram_cls( + name="vllm:time_to_first_token_seconds", + documentation="Histogram of time to first token in seconds.", + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, + 2560.0 + ], + labelnames=labelnames) + self.histogram_time_to_first_token = make_per_engine( + histogram_time_to_first_token, engine_indexes, model_name) - self.histogram_time_per_output_token = { - idx: - self._histogram_cls( - name="vllm:time_per_output_token_seconds", - documentation="Histogram of time per output token in seconds.", - buckets=[ - 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 - ], - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_time_per_output_token = self._histogram_cls( + name="vllm:time_per_output_token_seconds", + documentation="Histogram of time per output token in seconds.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 + ], + labelnames=labelnames) + self.histogram_time_per_output_token = make_per_engine( + histogram_time_per_output_token, engine_indexes, model_name) request_latency_buckets = [ 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 ] - self.histogram_e2e_time_request = { - idx: - self._histogram_cls( - name="vllm:e2e_request_latency_seconds", - documentation="Histogram of e2e request latency in seconds.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_e2e_time_request = self._histogram_cls( + name="vllm:e2e_request_latency_seconds", + documentation="Histogram of e2e request latency in seconds.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_e2e_time_request = make_per_engine( + histogram_e2e_time_request, engine_indexes, model_name) - self.histogram_queue_time_request = { - idx: - self._histogram_cls( - name="vllm:request_queue_time_seconds", - documentation= - "Histogram of time spent in WAITING phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_queue_time_request = self._histogram_cls( + name="vllm:request_queue_time_seconds", + documentation= + "Histogram of time spent in WAITING phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_queue_time_request = make_per_engine( + histogram_queue_time_request, engine_indexes, model_name) - self.histogram_inference_time_request = { - idx: - self._histogram_cls( - name="vllm:request_inference_time_seconds", - documentation= - "Histogram of time spent in RUNNING phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_inference_time_request = self._histogram_cls( + name="vllm:request_inference_time_seconds", + documentation= + "Histogram of time spent in RUNNING phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_inference_time_request = make_per_engine( + histogram_inference_time_request, engine_indexes, model_name) - self.histogram_prefill_time_request = { - idx: - self._histogram_cls( - name="vllm:request_prefill_time_seconds", - documentation= - "Histogram of time spent in PREFILL phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_prefill_time_request = self._histogram_cls( + name="vllm:request_prefill_time_seconds", + documentation= + "Histogram of time spent in PREFILL phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_prefill_time_request = make_per_engine( + histogram_prefill_time_request, engine_indexes, model_name) - self.histogram_decode_time_request = { - idx: - self._histogram_cls( - name="vllm:request_decode_time_seconds", - documentation= - "Histogram of time spent in DECODE phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_decode_time_request = self._histogram_cls( + name="vllm:request_decode_time_seconds", + documentation= + "Histogram of time spent in DECODE phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_decode_time_request = make_per_engine( + histogram_decode_time_request, engine_indexes, model_name) # # # # LoRA metrics @@ -603,6 +557,18 @@ class PrometheusStatLogger(StatLoggerBase): self.log_metrics_info("cache_config", self.vllm_config.cache_config) +PromMetric = Union[ + prometheus_client.Gauge, + prometheus_client.Counter, + prometheus_client.Histogram, +] + + +def make_per_engine(metric: PromMetric, engine_idxs: list[int], + model_name: str) -> dict[int, PromMetric]: + return {idx: metric.labels(model_name, str(idx)) for idx in engine_idxs} + + def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by