mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-21 16:27:57 +08:00
Introduces three new Prometheus histograms for fine-grained observability of KV cache residency behavior: vllm:kv_block_lifetime_seconds — total lifetime from allocation to free vllm:kv_block_idle_before_evict_seconds — idle duration before eviction vllm:kv_block_reuse_gap_seconds — time between consecutive reuses of the same block These metrics help operators analyze KV cache efficiency, reuse patterns, and eviction timing beyond simple utilization rates. Implementation uses monotonic timestamps for accuracy, 1% sampling for minimal overhead (~48 bytes/block), and is fully thread-safe with zero runtime cost when disabled. Two new runtime flags are introduced: --kv-cache-metrics – enable KV cache residency metrics --kv-cache-metrics-sample – control sampling ratio (default: 0.01) Signed-off-by: Shivam <shivamprasad91@gmail.com>
132 lines
5.2 KiB
Python
132 lines
5.2 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
from functools import cached_property
|
|
from typing import Any, Literal, cast
|
|
|
|
from packaging.version import parse
|
|
from pydantic import Field, field_validator, model_validator
|
|
from pydantic.dataclasses import dataclass
|
|
|
|
from vllm import version
|
|
from vllm.config.utils import config
|
|
from vllm.utils.hashing import safe_hash
|
|
|
|
DetailedTraceModules = Literal["model", "worker", "all"]
|
|
|
|
|
|
@config
|
|
@dataclass
|
|
class ObservabilityConfig:
|
|
"""Configuration for observability - metrics and tracing."""
|
|
|
|
show_hidden_metrics_for_version: str | None = None
|
|
"""Enable deprecated Prometheus metrics that have been hidden since the
|
|
specified version. For example, if a previously deprecated metric has been
|
|
hidden since the v0.7.0 release, you use
|
|
`--show-hidden-metrics-for-version=0.7` as a temporary escape hatch while
|
|
you migrate to new metrics. The metric is likely to be removed completely
|
|
in an upcoming release."""
|
|
|
|
@cached_property
|
|
def show_hidden_metrics(self) -> bool:
|
|
"""Check if the hidden metrics should be shown."""
|
|
if self.show_hidden_metrics_for_version is None:
|
|
return False
|
|
return version._prev_minor_version_was(self.show_hidden_metrics_for_version)
|
|
|
|
otlp_traces_endpoint: str | None = None
|
|
"""Target URL to which OpenTelemetry traces will be sent."""
|
|
|
|
collect_detailed_traces: list[DetailedTraceModules] | None = None
|
|
"""It makes sense to set this only if `--otlp-traces-endpoint` is set. If
|
|
set, it will collect detailed traces for the specified modules. This
|
|
involves use of possibly costly and or blocking operations and hence might
|
|
have a performance impact.
|
|
|
|
Note that collecting detailed timing information for each request can be
|
|
expensive."""
|
|
|
|
kv_cache_metrics: bool = False
|
|
"""Enable KV cache residency metrics (lifetime, idle time, reuse gaps).
|
|
Uses sampling to minimize overhead.
|
|
Requires log stats to be enabled (i.e., --disable-log-stats not set)."""
|
|
|
|
kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1)
|
|
"""Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks."""
|
|
|
|
@cached_property
|
|
def collect_model_forward_time(self) -> bool:
|
|
"""Whether to collect model forward time for the request."""
|
|
return self.collect_detailed_traces is not None and (
|
|
"model" in self.collect_detailed_traces
|
|
or "all" in self.collect_detailed_traces
|
|
)
|
|
|
|
@cached_property
|
|
def collect_model_execute_time(self) -> bool:
|
|
"""Whether to collect model execute time for the request."""
|
|
return self.collect_detailed_traces is not None and (
|
|
"worker" in self.collect_detailed_traces
|
|
or "all" in self.collect_detailed_traces
|
|
)
|
|
|
|
def compute_hash(self) -> str:
|
|
"""
|
|
WARNING: Whenever a new field is added to this config,
|
|
ensure that it is included in the factors list if
|
|
it affects the computation graph.
|
|
|
|
Provide a hash that uniquely identifies all the configs
|
|
that affect the structure of the computation
|
|
graph from input ids/embeddings to the final hidden states,
|
|
excluding anything before input ids/embeddings and after
|
|
the final hidden states.
|
|
"""
|
|
# no factors to consider.
|
|
# this config will not affect the computation graph.
|
|
factors: list[Any] = []
|
|
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
|
return hash_str
|
|
|
|
@field_validator("show_hidden_metrics_for_version")
|
|
@classmethod
|
|
def _validate_show_hidden_metrics_for_version(cls, value: str | None) -> str | None:
|
|
if value is not None:
|
|
# Raises an exception if the string is not a valid version.
|
|
parse(value)
|
|
return value
|
|
|
|
@field_validator("otlp_traces_endpoint")
|
|
@classmethod
|
|
def _validate_otlp_traces_endpoint(cls, value: str | None) -> str | None:
|
|
if value is not None:
|
|
from vllm.tracing import is_otel_available, otel_import_error_traceback
|
|
|
|
if not is_otel_available():
|
|
raise ValueError(
|
|
"OpenTelemetry is not available. Unable to configure "
|
|
"'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
|
|
f"installed. Original error:\n{otel_import_error_traceback}"
|
|
)
|
|
return value
|
|
|
|
@field_validator("collect_detailed_traces")
|
|
@classmethod
|
|
def _validate_collect_detailed_traces(
|
|
cls, value: list[DetailedTraceModules] | None
|
|
) -> list[DetailedTraceModules] | None:
|
|
"""Handle the legacy case where users might provide a comma-separated
|
|
string instead of a list of strings."""
|
|
if value is not None and len(value) == 1 and "," in value[0]:
|
|
value = cast(list[DetailedTraceModules], value[0].split(","))
|
|
return value
|
|
|
|
@model_validator(mode="after")
|
|
def _validate_tracing_config(self):
|
|
if self.collect_detailed_traces and not self.otlp_traces_endpoint:
|
|
raise ValueError(
|
|
"collect_detailed_traces requires `--otlp-traces-endpoint` to be set."
|
|
)
|
|
return self
|