Signed-off-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
Robert Shaw 2025-07-20 15:22:43 +00:00
parent 9a2e26d049
commit 3956d8ccad

View File

@ -4,7 +4,7 @@
import logging import logging
import time import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Callable, Optional from typing import Callable, Optional, Union
import numpy as np import numpy as np
import prometheus_client import prometheus_client
@ -153,15 +153,15 @@ class PrometheusStatLogger(StatLoggerBase):
# unregister_vllm_metrics() # unregister_vllm_metrics()
self.vllm_config = vllm_config self.vllm_config = vllm_config
self.engine_indexes = range(engine_num)
# Use this flag to hide metrics that were deprecated in # Use this flag to hide metrics that were deprecated in
# a previous release and which will be removed future # a previous release and which will be removed future
self.show_hidden_metrics = \ self.show_hidden_metrics = \
vllm_config.observability_config.show_hidden_metrics vllm_config.observability_config.show_hidden_metrics
labelnames = ["model_name", "engine"] labelnames = ["model_name", "engine"]
model_name = vllm_config.model_config.served_model_name, model_name = vllm_config.model_config.served_model_name
max_model_len = vllm_config.model_config.max_model_len max_model_len = vllm_config.model_config.max_model_len
engine_indexes = list(range(engine_num))
# self.spec_decoding_prom = self._spec_decoding_cls( # self.spec_decoding_prom = self._spec_decoding_cls(
# vllm_config.speculative_config, labelnames, labelvalues) # vllm_config.speculative_config, labelnames, labelvalues)
@ -169,133 +169,112 @@ class PrometheusStatLogger(StatLoggerBase):
# #
# Scheduler state # Scheduler state
# #
self.gauge_scheduler_running = { gauge_scheduler_running = self._gauge_cls(
idx: name="vllm:num_requests_running",
self._gauge_cls( documentation="Number of requests in model execution batches.",
name="vllm:num_requests_running", multiprocess_mode="mostrecent",
documentation="Number of requests in model execution batches.", labelnames=labelnames)
multiprocess_mode="mostrecent", self.gauge_scheduler_running = make_per_engine(gauge_scheduler_running,
labelnames=labelnames).labels(model_name, str(idx)) engine_indexes,
for idx in self.engine_indexes model_name)
}
self.gauge_scheduler_waiting = { gauge_scheduler_waiting = self._gauge_cls(
idx: name="vllm:num_requests_waiting",
self._gauge_cls( documentation="Number of requests waiting to be processed.",
name="vllm:num_requests_waiting", multiprocess_mode="mostrecent",
documentation="Number of requests waiting to be processed.", labelnames=labelnames)
multiprocess_mode="mostrecent", self.gauge_scheduler_waiting = make_per_engine(gauge_scheduler_waiting,
labelnames=labelnames).labels(model_name, str(idx)) engine_indexes,
for idx in self.engine_indexes model_name)
}
# #
# GPU cache # GPU cache
# #
# Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
# TODO: in 0.10, only enable if show_hidden_metrics=True # TODO: in 0.10, only enable if show_hidden_metrics=True
self.gauge_gpu_cache_usage = { gauge_gpu_cache_usage = self._gauge_cls(
idx: name="vllm:gpu_cache_usage_perc",
self._gauge_cls( documentation=(
name="vllm:gpu_cache_usage_perc", "GPU KV-cache usage. 1 means 100 percent usage."
documentation=( "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
"GPU KV-cache usage. 1 means 100 percent usage." multiprocess_mode="mostrecent",
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."), labelnames=labelnames)
multiprocess_mode="mostrecent", self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage,
labelnames=labelnames).labels(model_name, str(idx)) engine_indexes,
for idx in self.engine_indexes model_name)
}
# Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
# TODO: in 0.10, only enable if show_hidden_metrics=True # TODO: in 0.10, only enable if show_hidden_metrics=True
self.counter_gpu_prefix_cache_queries = { counter_gpu_prefix_cache_queries = self._counter_cls(
idx: name="vllm:gpu_prefix_cache_queries",
self._counter_cls( documentation=(
name="vllm:gpu_prefix_cache_queries", "GPU prefix cache queries, in terms of number of queried"
documentation=( "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."),
"GPU prefix cache queries, in terms of number of queried" labelnames=labelnames)
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead." self.counter_gpu_prefix_cache_queries = make_per_engine(
), counter_gpu_prefix_cache_queries, engine_indexes, model_name)
labelnames=labelnames).labels(model_name, str(idx))
for idx in self.engine_indexes
}
# Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
# TODO: in 0.10, only enable if show_hidden_metrics=True # TODO: in 0.10, only enable if show_hidden_metrics=True
self.counter_gpu_prefix_cache_hits = { counter_gpu_prefix_cache_hits = self._counter_cls(
idx: name="vllm:gpu_prefix_cache_hits",
self._counter_cls( documentation=(
name="vllm:gpu_prefix_cache_hits", "GPU prefix cache hits, in terms of number of cached "
documentation=( "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
"GPU prefix cache hits, in terms of number of cached " labelnames=labelnames)
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), self.counter_gpu_prefix_cache_hits = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) counter_gpu_prefix_cache_hits, engine_indexes, model_name)
for idx in self.engine_indexes
}
self.gauge_kv_cache_usage = { gauge_kv_cache_usage = self._gauge_cls(
idx: name="vllm:kv_cache_usage_perc",
self._gauge_cls( documentation="KV-cache usage. 1 means 100 percent usage.",
name="vllm:kv_cache_usage_perc", labelnames=labelnames)
documentation="KV-cache usage. 1 means 100 percent usage.", self.gauge_kv_cache_usage = make_per_engine(gauge_kv_cache_usage,
labelnames=labelnames).labels(model_name, str(idx)) engine_indexes, model_name)
for idx in self.engine_indexes
}
self.counter_prefix_cache_queries = { counter_prefix_cache_queries = self._counter_cls(
idx: name="vllm:prefix_cache_queries",
self._counter_cls( documentation=(
name="vllm:prefix_cache_queries", "Prefix cache queries, in terms of number of queried tokens."),
documentation= labelnames=labelnames)
("Prefix cache queries, in terms of number of queried tokens." self.counter_prefix_cache_queries = make_per_engine(
), counter_prefix_cache_queries, engine_indexes, model_name)
labelnames=labelnames).labels(model_name, str(idx))
for idx in self.engine_indexes
}
self.counter_prefix_cache_hits = { counter_prefix_cache_hits = self._counter_cls(
idx: name="vllm:prefix_cache_hits",
self._counter_cls( documentation=(
name="vllm:prefix_cache_hits", "Prefix cache hits, in terms of number of cached tokens."),
documentation=( labelnames=labelnames)
"Prefix cache hits, in terms of number of cached tokens."), self.counter_prefix_cache_hits = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) counter_prefix_cache_hits, engine_indexes, model_name)
for idx in self.engine_indexes
}
# #
# Counters # Counters
# #
self.counter_num_preempted_reqs = { counter_num_preempted_reqs = self._counter_cls(
idx: name="vllm:num_preemptions",
self._counter_cls( documentation="Cumulative number of preemption from the engine.",
name="vllm:num_preemptions", labelnames=labelnames)
documentation= self.counter_num_preempted_reqs = make_per_engine(
"Cumulative number of preemption from the engine.", counter_num_preempted_reqs, engine_indexes, model_name)
labelnames=labelnames).labels(model_name, str(idx))
for idx in self.engine_indexes
}
self.counter_prompt_tokens = { counter_prompt_tokens = self._counter_cls(
idx: name="vllm:prompt_tokens",
self._counter_cls( documentation="Number of prefill tokens processed.",
name="vllm:prompt_tokens", labelnames=labelnames)
documentation="Number of prefill tokens processed.", self.counter_prompt_tokens = make_per_engine(counter_prompt_tokens,
labelnames=labelnames).labels(model_name, str(idx)) engine_indexes,
for idx in self.engine_indexes model_name)
}
self.counter_generation_tokens = { counter_generation_tokens = self._counter_cls(
idx: name="vllm:generation_tokens",
self._counter_cls( documentation="Number of generation tokens processed.",
name="vllm:generation_tokens", labelnames=labelnames)
documentation="Number of generation tokens processed.", self.counter_generation_tokens = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) counter_generation_tokens, engine_indexes, model_name)
for idx in self.engine_indexes
}
self.counter_request_success: dict[FinishReason, self.counter_request_success: dict[FinishReason, dict[
prometheus_client.Counter] = {} int, prometheus_client.Counter]] = {}
counter_request_success_base = self._counter_cls( counter_request_success_base = self._counter_cls(
name="vllm:request_success", name="vllm:request_success",
documentation="Count of successfully processed requests.", documentation="Count of successfully processed requests.",
@ -305,166 +284,141 @@ class PrometheusStatLogger(StatLoggerBase):
idx: idx:
counter_request_success_base.labels(model_name, str(idx), counter_request_success_base.labels(model_name, str(idx),
str(reason)) str(reason))
for idx in self.engine_indexes for idx in engine_indexes
} }
# #
# Histograms of counts # Histograms of counts
# #
self.histogram_num_prompt_tokens_request = { histogram_num_prompt_tokens_request = self._histogram_cls(
idx: name="vllm:request_prompt_tokens",
self._histogram_cls( documentation="Number of prefill tokens processed.",
name="vllm:request_prompt_tokens", buckets=build_1_2_5_buckets(max_model_len),
documentation="Number of prefill tokens processed.", labelnames=labelnames)
buckets=build_1_2_5_buckets(max_model_len), self.histogram_num_prompt_tokens_request = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_num_prompt_tokens_request, engine_indexes, model_name)
for idx in self.engine_indexes
}
self.histogram_num_generation_tokens_request = { histogram_num_generation_tokens_request = self._histogram_cls(
idx: name="vllm:request_generation_tokens",
self._histogram_cls( documentation="Number of generation tokens processed.",
name="vllm:request_generation_tokens", buckets=build_1_2_5_buckets(max_model_len),
documentation="Number of generation tokens processed.", labelnames=labelnames)
buckets=build_1_2_5_buckets(max_model_len), self.histogram_num_generation_tokens_request = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_num_generation_tokens_request, engine_indexes,
for idx in self.engine_indexes model_name)
}
# TODO: This metric might be incorrect in case of using multiple # TODO: This metric might be incorrect in case of using multiple
# api_server counts which uses prometheus mp. # api_server counts which uses prometheus mp.
# See: https://github.com/vllm-project/vllm/pull/18053 # See: https://github.com/vllm-project/vllm/pull/18053
self.histogram_iteration_tokens = { histogram_iteration_tokens = self._histogram_cls(
idx: name="vllm:iteration_tokens_total",
self._histogram_cls( documentation="Histogram of number of tokens per engine_step.",
name="vllm:iteration_tokens_total", buckets=[
documentation="Histogram of number of tokens per engine_step.", 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384
buckets=[ ],
1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, labelnames=labelnames)
16384 self.histogram_iteration_tokens = make_per_engine(
], histogram_iteration_tokens, engine_indexes, model_name)
labelnames=labelnames).labels(model_name, str(idx))
for idx in self.engine_indexes
}
self.histogram_max_num_generation_tokens_request = { histogram_max_num_generation_tokens_request = self._histogram_cls(
idx: name="vllm:request_max_num_generation_tokens",
self._histogram_cls( documentation=
name="vllm:request_max_num_generation_tokens", "Histogram of maximum number of requested generation tokens.",
documentation= buckets=build_1_2_5_buckets(max_model_len),
"Histogram of maximum number of requested generation tokens.", labelnames=labelnames)
buckets=build_1_2_5_buckets(max_model_len), self.histogram_max_num_generation_tokens_request = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_max_num_generation_tokens_request, engine_indexes,
for idx in self.engine_indexes model_name)
}
self.histogram_n_request = { histogram_n_request = self._histogram_cls(
idx: name="vllm:request_params_n",
self._histogram_cls( documentation="Histogram of the n request parameter.",
name="vllm:request_params_n", buckets=[1, 2, 5, 10, 20],
documentation="Histogram of the n request parameter.", labelnames=labelnames)
buckets=[1, 2, 5, 10, 20], self.histogram_n_request = make_per_engine(histogram_n_request,
labelnames=labelnames).labels(model_name, str(idx)) engine_indexes, model_name)
for idx in self.engine_indexes
}
self.histogram_max_tokens_request = { histogram_max_tokens_request = self._histogram_cls(
idx: name="vllm:request_params_max_tokens",
self._histogram_cls( documentation="Histogram of the max_tokens request parameter.",
name="vllm:request_params_max_tokens", buckets=build_1_2_5_buckets(max_model_len),
documentation="Histogram of the max_tokens request parameter.", labelnames=labelnames)
buckets=build_1_2_5_buckets(max_model_len), self.histogram_max_tokens_request = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_max_tokens_request, engine_indexes, model_name)
for idx in self.engine_indexes
}
# #
# Histogram of timing intervals # Histogram of timing intervals
# #
self.histogram_time_to_first_token = { histogram_time_to_first_token = self._histogram_cls(
idx: name="vllm:time_to_first_token_seconds",
self._histogram_cls( documentation="Histogram of time to first token in seconds.",
name="vllm:time_to_first_token_seconds", buckets=[
documentation="Histogram of time to first token in seconds.", 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
buckets=[ 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 2560.0
0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, ],
640.0, 2560.0 labelnames=labelnames)
], self.histogram_time_to_first_token = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_time_to_first_token, engine_indexes, model_name)
for idx in self.engine_indexes
}
self.histogram_time_per_output_token = { histogram_time_per_output_token = self._histogram_cls(
idx: name="vllm:time_per_output_token_seconds",
self._histogram_cls( documentation="Histogram of time per output token in seconds.",
name="vllm:time_per_output_token_seconds", buckets=[
documentation="Histogram of time per output token in seconds.", 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
buckets=[ 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, ],
0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 labelnames=labelnames)
], self.histogram_time_per_output_token = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_time_per_output_token, engine_indexes, model_name)
for idx in self.engine_indexes
}
request_latency_buckets = [ request_latency_buckets = [
0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
] ]
self.histogram_e2e_time_request = { histogram_e2e_time_request = self._histogram_cls(
idx: name="vllm:e2e_request_latency_seconds",
self._histogram_cls( documentation="Histogram of e2e request latency in seconds.",
name="vllm:e2e_request_latency_seconds", buckets=request_latency_buckets,
documentation="Histogram of e2e request latency in seconds.", labelnames=labelnames)
buckets=request_latency_buckets, self.histogram_e2e_time_request = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_e2e_time_request, engine_indexes, model_name)
for idx in self.engine_indexes
}
self.histogram_queue_time_request = { histogram_queue_time_request = self._histogram_cls(
idx: name="vllm:request_queue_time_seconds",
self._histogram_cls( documentation=
name="vllm:request_queue_time_seconds", "Histogram of time spent in WAITING phase for request.",
documentation= buckets=request_latency_buckets,
"Histogram of time spent in WAITING phase for request.", labelnames=labelnames)
buckets=request_latency_buckets, self.histogram_queue_time_request = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_queue_time_request, engine_indexes, model_name)
for idx in self.engine_indexes
}
self.histogram_inference_time_request = { histogram_inference_time_request = self._histogram_cls(
idx: name="vllm:request_inference_time_seconds",
self._histogram_cls( documentation=
name="vllm:request_inference_time_seconds", "Histogram of time spent in RUNNING phase for request.",
documentation= buckets=request_latency_buckets,
"Histogram of time spent in RUNNING phase for request.", labelnames=labelnames)
buckets=request_latency_buckets, self.histogram_inference_time_request = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_inference_time_request, engine_indexes, model_name)
for idx in self.engine_indexes
}
self.histogram_prefill_time_request = { histogram_prefill_time_request = self._histogram_cls(
idx: name="vllm:request_prefill_time_seconds",
self._histogram_cls( documentation=
name="vllm:request_prefill_time_seconds", "Histogram of time spent in PREFILL phase for request.",
documentation= buckets=request_latency_buckets,
"Histogram of time spent in PREFILL phase for request.", labelnames=labelnames)
buckets=request_latency_buckets, self.histogram_prefill_time_request = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_prefill_time_request, engine_indexes, model_name)
for idx in self.engine_indexes
}
self.histogram_decode_time_request = { histogram_decode_time_request = self._histogram_cls(
idx: name="vllm:request_decode_time_seconds",
self._histogram_cls( documentation=
name="vllm:request_decode_time_seconds", "Histogram of time spent in DECODE phase for request.",
documentation= buckets=request_latency_buckets,
"Histogram of time spent in DECODE phase for request.", labelnames=labelnames)
buckets=request_latency_buckets, self.histogram_decode_time_request = make_per_engine(
labelnames=labelnames).labels(model_name, str(idx)) histogram_decode_time_request, engine_indexes, model_name)
for idx in self.engine_indexes
}
# # # #
# # LoRA metrics # # LoRA metrics
@ -603,6 +557,18 @@ class PrometheusStatLogger(StatLoggerBase):
self.log_metrics_info("cache_config", self.vllm_config.cache_config) self.log_metrics_info("cache_config", self.vllm_config.cache_config)
# Type alias covering the three prometheus_client metric classes listed
# below (Gauge, Counter, Histogram). It is used as the parameter and value
# type of make_per_engine().
PromMetric = Union[
prometheus_client.Gauge,
prometheus_client.Counter,
prometheus_client.Histogram,
]
def make_per_engine(metric: PromMetric, engine_idxs: list[int],
                    model_name: str) -> dict[int, PromMetric]:
    """Build a per-engine lookup of labelled children for one metric.

    For every index in ``engine_idxs`` this calls
    ``metric.labels(model_name, str(idx))`` and stores the result under
    that index.

    Args:
        metric: The parent metric on which ``labels`` is called.
        engine_idxs: Engine indexes to create entries for.
        model_name: Value passed as the first positional label.

    Returns:
        A dict mapping each engine index to the object returned by
        ``metric.labels`` for that index.
    """
    per_engine = {}
    for engine_idx in engine_idxs:
        # The engine index is passed as a string label value.
        per_engine[engine_idx] = metric.labels(model_name, str(engine_idx))
    return per_engine
def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
""" """
Builds a list of buckets with increasing powers of 10 multiplied by Builds a list of buckets with increasing powers of 10 multiplied by