[V1][Metrics] Add GPU cache usage % gauge (#12561)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
parent 1c1bb0bbf2, commit f17f1d4608
@@ -200,6 +200,7 @@ EXPECTED_METRICS = [
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
+    "vllm:gpu_cache_usage_perc",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
     "vllm:request_prompt_tokens_sum",
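Side note: a quick way to see the new gauge on a live /metrics endpoint (a minimal sketch, not part of this commit; the localhost:8000 address is an assumption for illustration):

    import requests

    # Scrape the Prometheus text endpoint and print the new gauge's samples.
    resp = requests.get("http://localhost:8000/metrics")
    for line in resp.text.splitlines():
        if line.startswith("vllm:gpu_cache_usage_perc"):
            print(line)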
@@ -69,6 +69,11 @@ class KVCacheManager:
         # is finished.
         self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}
 
+    @property
+    def usage(self) -> float:
+        return 1.0 - (self.free_block_queue.num_free_blocks /
+                      self.num_gpu_blocks)
+
     def get_computed_blocks(
             self, request: Request) -> Tuple[List[KVCacheBlock], int]:
         """Get the computed (cached) blocks for the request.
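The usage property is just the occupied fraction of the GPU block pool. A worked example with made-up block counts:

    # Standalone sketch of the computation in KVCacheManager.usage above;
    # the block counts are invented for illustration.
    num_gpu_blocks = 1024    # total KV-cache blocks allocated on the GPU
    num_free_blocks = 256    # blocks currently sitting in the free queue

    usage = 1.0 - (num_free_blocks / num_gpu_blocks)
    print(usage)             # 0.75, i.e. 75% of the KV cache is in use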
@@ -544,6 +544,7 @@ class Scheduler:
         return SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
+            gpu_cache_usage=self.kv_cache_manager.usage,
         )
@@ -69,11 +69,13 @@ class LoggingStatLogger(StatLoggerBase):
         logger.info(
             "Avg prompt throughput: %.1f tokens/s, "
             "Avg generation throughput: %.1f tokens/s, "
-            "Running: %d reqs, Waiting: %d reqs ",
+            "Running: %d reqs, Waiting: %d reqs "
+            "GPU KV cache usage: %.1f%%.",
             prompt_throughput,
             generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
+            scheduler_stats.gpu_cache_usage * 100,
         )
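Rendered with %-formatting, the new log line looks like this (sample values invented for illustration):

    # How the format string above renders; the numbers are made up.
    msg = ("Avg prompt throughput: %.1f tokens/s, "
           "Avg generation throughput: %.1f tokens/s, "
           "Running: %d reqs, Waiting: %d reqs "
           "GPU KV cache usage: %.1f%%.")
    print(msg % (10.0, 120.5, 3, 1, 0.25 * 100))
    # -> ... Running: 3 reqs, Waiting: 1 reqs GPU KV cache usage: 25.0%.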
@@ -97,6 +99,11 @@ class PrometheusStatLogger(StatLoggerBase):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.gauge_gpu_cache_usage = prometheus_client.Gauge(
+            name="vllm:gpu_cache_usage_perc",
+            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames).labels(*labelvalues)
+
         self.counter_prompt_tokens = prometheus_client.Counter(
             name="vllm:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
@@ -147,6 +154,8 @@
         self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
 
+        self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
+
         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
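The gauge pattern used here (create once with labels bound, then one set() per logging interval) can be tried standalone. A minimal sketch; the metric name and label value are placeholders, not vLLM's actual wiring:

    import prometheus_client

    # Create the gauge once, bind its labels, then update it repeatedly.
    gauge = prometheus_client.Gauge(
        name="demo_gpu_cache_usage_perc",
        documentation="GPU KV-cache usage. 1 means 100 percent usage.",
        labelnames=["model_name"]).labels("my-model")

    gauge.set(0.75)  # one update per stats interval, as in log() above
    print(prometheus_client.generate_latest().decode())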
@@ -14,7 +14,7 @@ class SchedulerStats:
     num_running_reqs: int = 0
     num_waiting_reqs: int = 0
 
-    # gpu_cache_usage: float = 0.0
+    gpu_cache_usage: float = 0.0
     # gpu_prefix_cache_hit_rate: float = 0.0
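End to end, the new field flows from the scheduler's stats snapshot into every stat logger. A self-contained sketch of that flow (class and field names mirror the diff; the print stands in for the loggers):

    from dataclasses import dataclass

    @dataclass
    class SchedulerStats:
        num_running_reqs: int = 0
        num_waiting_reqs: int = 0
        gpu_cache_usage: float = 0.0

    # The scheduler fills the snapshot; loggers only read it.
    stats = SchedulerStats(num_running_reqs=3, num_waiting_reqs=1,
                           gpu_cache_usage=0.25)
    print(f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%")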