mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-29 23:37:12 +08:00
[Metrics] Scheduled removal of deprecated metrics (#29330)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
parent
7012d8b45e
commit
9cf4edae6e
@ -183,9 +183,6 @@ async def test_metrics_counts(
|
|||||||
EXPECTED_METRICS_V1 = [
|
EXPECTED_METRICS_V1 = [
|
||||||
"vllm:num_requests_running",
|
"vllm:num_requests_running",
|
||||||
"vllm:num_requests_waiting",
|
"vllm:num_requests_waiting",
|
||||||
"vllm:gpu_cache_usage_perc",
|
|
||||||
"vllm:gpu_prefix_cache_queries",
|
|
||||||
"vllm:gpu_prefix_cache_hits",
|
|
||||||
"vllm:kv_cache_usage_perc",
|
"vllm:kv_cache_usage_perc",
|
||||||
"vllm:prefix_cache_queries",
|
"vllm:prefix_cache_queries",
|
||||||
"vllm:prefix_cache_hits",
|
"vllm:prefix_cache_hits",
|
||||||
|
|||||||
@ -440,57 +440,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
|||||||
# Setting default values
|
# Setting default values
|
||||||
self.record_sleep_state()
|
self.record_sleep_state()
|
||||||
|
|
||||||
# GPU cache
|
|
||||||
#
|
|
||||||
# Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
|
|
||||||
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
|
|
||||||
# TODO: remove in 0.12.0
|
|
||||||
if self.show_hidden_metrics:
|
|
||||||
gauge_gpu_cache_usage = self._gauge_cls(
|
|
||||||
name="vllm:gpu_cache_usage_perc",
|
|
||||||
documentation=(
|
|
||||||
"GPU KV-cache usage. 1 means 100 percent usage."
|
|
||||||
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."
|
|
||||||
),
|
|
||||||
multiprocess_mode="mostrecent",
|
|
||||||
labelnames=labelnames,
|
|
||||||
)
|
|
||||||
self.gauge_gpu_cache_usage = make_per_engine(
|
|
||||||
gauge_gpu_cache_usage, engine_indexes, model_name
|
|
||||||
)
|
|
||||||
|
|
||||||
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
|
|
||||||
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
|
|
||||||
# TODO: remove in 0.12.0
|
|
||||||
if self.show_hidden_metrics:
|
|
||||||
counter_gpu_prefix_cache_queries = self._counter_cls(
|
|
||||||
name="vllm:gpu_prefix_cache_queries",
|
|
||||||
documentation=(
|
|
||||||
"GPU prefix cache queries, in terms of number of queried"
|
|
||||||
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
|
|
||||||
),
|
|
||||||
labelnames=labelnames,
|
|
||||||
)
|
|
||||||
self.counter_gpu_prefix_cache_queries = make_per_engine(
|
|
||||||
counter_gpu_prefix_cache_queries, engine_indexes, model_name
|
|
||||||
)
|
|
||||||
|
|
||||||
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
|
|
||||||
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
|
|
||||||
# TODO: remove in 0.12.0
|
|
||||||
if self.show_hidden_metrics:
|
|
||||||
counter_gpu_prefix_cache_hits = self._counter_cls(
|
|
||||||
name="vllm:gpu_prefix_cache_hits",
|
|
||||||
documentation=(
|
|
||||||
"GPU prefix cache hits, in terms of number of cached "
|
|
||||||
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."
|
|
||||||
),
|
|
||||||
labelnames=labelnames,
|
|
||||||
)
|
|
||||||
self.counter_gpu_prefix_cache_hits = make_per_engine(
|
|
||||||
counter_gpu_prefix_cache_hits, engine_indexes, model_name
|
|
||||||
)
|
|
||||||
|
|
||||||
gauge_kv_cache_usage = self._gauge_cls(
|
gauge_kv_cache_usage = self._gauge_cls(
|
||||||
name="vllm:kv_cache_usage_perc",
|
name="vllm:kv_cache_usage_perc",
|
||||||
documentation="KV-cache usage. 1 means 100 percent usage.",
|
documentation="KV-cache usage. 1 means 100 percent usage.",
|
||||||
@ -735,39 +684,41 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
|
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
|
||||||
# TODO: in 0.12, only enable if show_hidden_metrics=True
|
# With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11
|
||||||
histogram_time_per_output_token = self._histogram_cls(
|
# TODO: remove in 0.13.0
|
||||||
name="vllm:time_per_output_token_seconds",
|
if self.show_hidden_metrics:
|
||||||
documentation=(
|
histogram_time_per_output_token = self._histogram_cls(
|
||||||
"Histogram of time per output token in seconds."
|
name="vllm:time_per_output_token_seconds",
|
||||||
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."
|
documentation=(
|
||||||
),
|
"Histogram of time per output token in seconds."
|
||||||
buckets=[
|
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."
|
||||||
0.01,
|
),
|
||||||
0.025,
|
buckets=[
|
||||||
0.05,
|
0.01,
|
||||||
0.075,
|
0.025,
|
||||||
0.1,
|
0.05,
|
||||||
0.15,
|
0.075,
|
||||||
0.2,
|
0.1,
|
||||||
0.3,
|
0.15,
|
||||||
0.4,
|
0.2,
|
||||||
0.5,
|
0.3,
|
||||||
0.75,
|
0.4,
|
||||||
1.0,
|
0.5,
|
||||||
2.5,
|
0.75,
|
||||||
5.0,
|
1.0,
|
||||||
7.5,
|
2.5,
|
||||||
10.0,
|
5.0,
|
||||||
20.0,
|
7.5,
|
||||||
40.0,
|
10.0,
|
||||||
80.0,
|
20.0,
|
||||||
],
|
40.0,
|
||||||
labelnames=labelnames,
|
80.0,
|
||||||
)
|
],
|
||||||
self.histogram_time_per_output_token = make_per_engine(
|
labelnames=labelnames,
|
||||||
histogram_time_per_output_token, engine_indexes, model_name
|
)
|
||||||
)
|
self.histogram_time_per_output_token = make_per_engine(
|
||||||
|
histogram_time_per_output_token, engine_indexes, model_name
|
||||||
|
)
|
||||||
|
|
||||||
histogram_inter_token_latency = self._histogram_cls(
|
histogram_inter_token_latency = self._histogram_cls(
|
||||||
name="vllm:inter_token_latency_seconds",
|
name="vllm:inter_token_latency_seconds",
|
||||||
@ -966,20 +917,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
|||||||
self.gauge_scheduler_waiting[engine_idx].set(
|
self.gauge_scheduler_waiting[engine_idx].set(
|
||||||
scheduler_stats.num_waiting_reqs
|
scheduler_stats.num_waiting_reqs
|
||||||
)
|
)
|
||||||
if self.show_hidden_metrics:
|
|
||||||
self.gauge_gpu_cache_usage[engine_idx].set(
|
|
||||||
scheduler_stats.kv_cache_usage
|
|
||||||
)
|
|
||||||
self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage)
|
self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage)
|
||||||
|
|
||||||
if self.show_hidden_metrics:
|
|
||||||
self.counter_gpu_prefix_cache_queries[engine_idx].inc(
|
|
||||||
scheduler_stats.prefix_cache_stats.queries
|
|
||||||
)
|
|
||||||
self.counter_gpu_prefix_cache_hits[engine_idx].inc(
|
|
||||||
scheduler_stats.prefix_cache_stats.hits
|
|
||||||
)
|
|
||||||
|
|
||||||
self.counter_prefix_cache_queries[engine_idx].inc(
|
self.counter_prefix_cache_queries[engine_idx].inc(
|
||||||
scheduler_stats.prefix_cache_stats.queries
|
scheduler_stats.prefix_cache_stats.queries
|
||||||
)
|
)
|
||||||
@ -1050,7 +989,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
|||||||
self.histogram_time_to_first_token[engine_idx].observe(ttft)
|
self.histogram_time_to_first_token[engine_idx].observe(ttft)
|
||||||
for itl in iteration_stats.inter_token_latencies_iter:
|
for itl in iteration_stats.inter_token_latencies_iter:
|
||||||
self.histogram_inter_token_latency[engine_idx].observe(itl)
|
self.histogram_inter_token_latency[engine_idx].observe(itl)
|
||||||
self.histogram_time_per_output_token[engine_idx].observe(itl)
|
if self.show_hidden_metrics:
|
||||||
|
self.histogram_time_per_output_token[engine_idx].observe(itl)
|
||||||
|
|
||||||
for finished_request in iteration_stats.finished_requests:
|
for finished_request in iteration_stats.finished_requests:
|
||||||
self.counter_request_success[finished_request.finish_reason][
|
self.counter_request_success[finished_request.finish_reason][
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user