diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 1a175e9e110b..21280b9c84cf 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -118,12 +118,14 @@ class LoggingStatLogger(StatLoggerBase):
         self.num_prompt_tokens: int = 0
         self.num_generation_tokens: int = 0
         self.num_corrupted_reqs: int = 0
+        self.num_preemptions: int = 0

     def _track_iteration_stats(self, iteration_stats: IterationStats):
         # Save tracked stats for token counters.
         self.num_prompt_tokens += iteration_stats.num_prompt_tokens
         self.num_generation_tokens += iteration_stats.num_generation_tokens
         self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs
+        self.num_preemptions += iteration_stats.num_preempted_reqs

     def _get_throughput(self, tracked_stats: int, now: float) -> float:
         # Compute summary metrics for tracked stats
@@ -196,18 +198,31 @@ class LoggingStatLogger(StatLoggerBase):
             "Avg generation throughput: %.1f tokens/s",
             "Running: %d reqs",
             "Waiting: %d reqs",
-            "GPU KV cache usage: %.1f%%",
-            "Prefix cache hit rate: %.1f%%",
         ]
         log_args = [
             self.last_prompt_throughput,
             self.last_generation_throughput,
             self.last_scheduler_stats.num_running_reqs,
             self.last_scheduler_stats.num_waiting_reqs,
-            self.last_scheduler_stats.kv_cache_usage * 100,
-            self.prefix_caching_metrics.hit_rate * 100,
         ]
+        if self.num_preemptions > 0:
+            log_parts.append("Preemptions: %d")
+            log_args.append(self.num_preemptions)
+
+        log_parts.extend(
+            [
+                "GPU KV cache usage: %.1f%%",
+                "Prefix cache hit rate: %.1f%%",
+            ]
+        )
+        log_args.extend(
+            [
+                self.last_scheduler_stats.kv_cache_usage * 100,
+                self.prefix_caching_metrics.hit_rate * 100,
+            ]
+        )
+
         if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
             log_parts.append("Corrupted: %d reqs")
             log_args.append(self.num_corrupted_reqs)
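
The second hunk relies on keeping `log_parts` (a list of %-style format fragments) and `log_args` (the matching values) in lockstep, so optional fields like the preemption count can be spliced in positionally without touching the unconditional fields. Below is a minimal standalone sketch of that pattern, assuming the fragments are joined with ", " and handed to `logger.info` as a single format string; the join separator, the `log_stats` helper, and the sample values are illustrative and are not taken from `loggers.py`.

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("vllm.sketch")


def log_stats(num_preemptions: int, kv_cache_usage: float, hit_rate: float) -> None:
    # Unconditional fields come first, with format fragments and values
    # kept index-aligned across the two lists.
    log_parts = [
        "Running: %d reqs",
        "Waiting: %d reqs",
    ]
    log_args: list = [3, 1]

    # Preemptions are spliced in between the queue counters and the cache
    # metrics, and only when at least one request was preempted -- mirroring
    # the ordering the hunk establishes.
    if num_preemptions > 0:
        log_parts.append("Preemptions: %d")
        log_args.append(num_preemptions)

    log_parts.extend(["GPU KV cache usage: %.1f%%", "Prefix cache hit rate: %.1f%%"])
    log_args.extend([kv_cache_usage * 100, hit_rate * 100])

    # Joining fragments and deferring %-interpolation to the logger keeps
    # formatting cost out of the path when the log level is disabled.
    logger.info(", ".join(log_parts), *log_args)


log_stats(num_preemptions=0, kv_cache_usage=0.42, hit_rate=0.87)
log_stats(num_preemptions=5, kv_cache_usage=0.42, hit_rate=0.87)
```

The first call omits the preemption field entirely, the second includes it, which matches the diff's intent: preemption counts appear in the periodic stats line only when they are nonzero.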