diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 901ba8e8e5ef3..941f465711ef1 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -200,6 +200,7 @@ EXPECTED_METRICS = [
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
+    "vllm:gpu_cache_usage_perc",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
     "vllm:request_prompt_tokens_sum",
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 18fdfdfe4a010..d6c612f155f01 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -69,6 +69,11 @@ class KVCacheManager:
         # is finished.
         self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}
 
+    @property
+    def usage(self) -> float:
+        return 1.0 - (self.free_block_queue.num_free_blocks /
+                      self.num_gpu_blocks)
+
     def get_computed_blocks(
             self, request: Request) -> Tuple[List[KVCacheBlock], int]:
         """Get the computed (cached) blocks for the request.
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index da2e31b1fb75b..910fc4ff4d2b6 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -544,6 +544,7 @@ class Scheduler:
         return SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
+            gpu_cache_usage=self.kv_cache_manager.usage,
         )
 
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 9bb24d1948651..f901822c7887c 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -69,11 +69,13 @@ class LoggingStatLogger(StatLoggerBase):
         logger.info(
             "Avg prompt throughput: %.1f tokens/s, "
             "Avg generation throughput: %.1f tokens/s, "
-            "Running: %d reqs, Waiting: %d reqs ",
+            "Running: %d reqs, Waiting: %d reqs, "
+            "GPU KV cache usage: %.1f%%.",
             prompt_throughput,
             generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
+            scheduler_stats.gpu_cache_usage * 100,
         )
 
@@ -97,6 +99,11 @@ class PrometheusStatLogger(StatLoggerBase):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.gauge_gpu_cache_usage = prometheus_client.Gauge(
+            name="vllm:gpu_cache_usage_perc",
+            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames).labels(*labelvalues)
+
         self.counter_prompt_tokens = prometheus_client.Counter(
             name="vllm:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
@@ -147,6 +154,8 @@ class PrometheusStatLogger(StatLoggerBase):
         self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
 
+        self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
+
         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index f4c276f0b6902..5277505128a63 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -14,7 +14,7 @@ class SchedulerStats:
     num_running_reqs: int = 0
     num_waiting_reqs: int = 0
 
-    # gpu_cache_usage: float = 0.0
+    gpu_cache_usage: float = 0.0
     # gpu_prefix_cache_hit_rate: float = 0.0
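
A minimal, self-contained sketch (not vLLM source) of the arithmetic behind the new `usage` property: utilization is the complement of the free-block fraction, logged as a percentage and exported to Prometheus as a raw 0-1 gauge. `FakeFreeBlockQueue` and `FakeKVCacheManager` are hypothetical stand-ins for the real block-pool bookkeeping.

# Sketch only: hypothetical stand-ins, not the vLLM classes patched above.
from dataclasses import dataclass


@dataclass
class FakeFreeBlockQueue:
    num_free_blocks: int


class FakeKVCacheManager:
    def __init__(self, num_gpu_blocks: int, num_free_blocks: int) -> None:
        self.num_gpu_blocks = num_gpu_blocks
        self.free_block_queue = FakeFreeBlockQueue(num_free_blocks)

    @property
    def usage(self) -> float:
        # Fraction of GPU KV-cache blocks currently in use, in [0.0, 1.0].
        return 1.0 - (self.free_block_queue.num_free_blocks /
                      self.num_gpu_blocks)


if __name__ == "__main__":
    manager = FakeKVCacheManager(num_gpu_blocks=1000, num_free_blocks=250)
    # Logged as a percentage; the Prometheus gauge stores the raw fraction.
    print(f"GPU KV cache usage: {manager.usage * 100:.1f}%")  # 75.0%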