add cache_config's info to prometheus metrics. (#3100)

Allen.Dou 2024-02-29 14:15:18 +08:00 committed by GitHub
parent a6d471c759
commit 9289e577ec
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 14 additions and 1 deletion

@@ -308,6 +308,10 @@ class CacheConfig:
         self.num_gpu_blocks = None
         self.num_cpu_blocks = None

+    def metrics_info(self):
+        # convert cache_config to dict(key: str, value: str) for prometheus metrics info
+        return {key: str(value) for key, value in self.__dict__.items()}
+
     def _verify_args(self) -> None:
         if self.gpu_memory_utilization > 1.0:
             raise ValueError(
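The new metrics_info() simply stringifies every attribute of the config object so the values can be attached as Prometheus labels. A minimal, self-contained sketch of that pattern; the Config class and its field values below are illustrative, not the actual vLLM CacheConfig:

class Config:
    def __init__(self):
        # illustrative fields only
        self.block_size = 16
        self.gpu_memory_utilization = 0.9
        self.num_gpu_blocks = None

    def metrics_info(self):
        # Prometheus Info labels must be strings, so every value is stringified.
        return {key: str(value) for key, value in self.__dict__.items()}

print(Config().metrics_info())
# {'block_size': '16', 'gpu_memory_utilization': '0.9', 'num_gpu_blocks': 'None'}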

@@ -138,6 +138,7 @@ class LLMEngine:
             self.stat_logger = StatLogger(
                 local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                 labels=dict(model_name=model_config.model))
+            self.stat_logger.info("cache_config", self.cache_config)

         self.forward_dag = None
         if USE_RAY_COMPILED_DAG:
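The engine publishes the config exactly once, right after the StatLogger is constructed. A pared-down sketch of that call path; the Fake* classes are stand-ins for illustration, not the real vLLM implementations:

class FakeCacheConfig:
    def __init__(self):
        self.block_size = 16

    def metrics_info(self):
        return {key: str(value) for key, value in self.__dict__.items()}

class FakeStatLogger:
    def info(self, type: str, obj: object) -> None:
        if type == "cache_config":
            # In vLLM this forwards to the prometheus_client Info metric.
            print("cache_config ->", obj.metrics_info())

# Mirrors the one-line LLMEngine change: publish the config once at startup.
stat_logger = FakeStatLogger()
stat_logger.info("cache_config", FakeCacheConfig())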

@@ -1,5 +1,5 @@
 from vllm.logger import init_logger
-from prometheus_client import Counter, Gauge, Histogram, REGISTRY, disable_created_metrics
+from prometheus_client import Counter, Gauge, Histogram, Info, REGISTRY, disable_created_metrics

 import time
 import numpy as np
@@ -23,6 +23,10 @@ class Metrics:
             if hasattr(collector, "_name") and "vllm" in collector._name:
                 REGISTRY.unregister(collector)

+        self.info_cache_config = Info(
+            name='vllm:cache_config',
+            documentation='information of cache_config')
+
         # System stats
         self.gauge_scheduler_running = Gauge(
             name="vllm:num_requests_running",
@@ -128,6 +132,10 @@ class StatLogger:
         self.labels = labels
         self.metrics = Metrics(labelnames=list(labels.keys()))

+    def info(self, type: str, obj: object) -> None:
+        if type == "cache_config":
+            self.metrics.info_cache_config.info(obj.metrics_info())
+
     def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
         return float(np.sum(tracked_stats) / (now - self.last_local_log))
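For reference, a prometheus_client Info metric named 'vllm:cache_config' is exposed as a single sample 'vllm:cache_config_info' whose labels carry the stringified config. A standalone sketch of what the scraped output would look like; the label values here are made up for illustration:

from prometheus_client import CollectorRegistry, Info, generate_latest

registry = CollectorRegistry()
info_cache_config = Info(
    name='vllm:cache_config',
    documentation='information of cache_config',
    registry=registry)

# StatLogger.info("cache_config", cache_config) effectively ends up here,
# with the dict produced by CacheConfig.metrics_info().
info_cache_config.info({'block_size': '16', 'gpu_memory_utilization': '0.9'})

print(generate_latest(registry).decode())
# ...
# vllm:cache_config_info{block_size="16",gpu_memory_utilization="0.9"} 1.0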