[Metrics] Log number of preempted requests (#28522)

Add tracking and periodic logging of the number of preempted requests to the metrics logger. This helps monitor system behavior under load.

Signed-off-by: Yining Liu <610lyn@gmail.com>
2025-12-13 04:15:01 +08:00 · 2025-11-14 17:47:45 +08:00 · 2025-11-14 17:47:45 +08:00 · ecf8230d4d
commit ecf8230d4d
parent 8cfbe89b93
1 changed files with 19 additions and 4 deletions
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -118,12 +118,14 @@ class LoggingStatLogger(StatLoggerBase):
        self.num_prompt_tokens: int = 0
        self.num_generation_tokens: int = 0
        self.num_corrupted_reqs: int = 0
        self.num_preemptions: int = 0
    def _track_iteration_stats(self, iteration_stats: IterationStats) -> None:
        """Add one iteration's counts onto this logger's running totals.

        Args:
            iteration_stats: Stats for a single iteration. Its prompt-token,
                generation-token, corrupted-request and preempted-request
                counts are each added to the matching counter on ``self``.
        """
        # Accumulate (+=) rather than overwrite: every counter is a running sum
        # across iterations, covering both token and request counters.
        # NOTE(review): presumably these totals are reset elsewhere once they
        # have been logged -- confirm against the rest of the class.
        self.num_prompt_tokens += iteration_stats.num_prompt_tokens
        self.num_generation_tokens += iteration_stats.num_generation_tokens
        self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs
        self.num_preemptions += iteration_stats.num_preempted_reqs
    def _get_throughput(self, tracked_stats: int, now: float) -> float:
        # Compute summary metrics for tracked stats
@@ -196,17 +198,30 @@ class LoggingStatLogger(StatLoggerBase):
            "Avg generation throughput: %.1f tokens/s",
            "Running: %d reqs",
            "Waiting: %d reqs",
            "GPU KV cache usage: %.1f%%",
            "Prefix cache hit rate: %.1f%%",
        ]
        log_args = [
            self.last_prompt_throughput,
            self.last_generation_throughput,
            self.last_scheduler_stats.num_running_reqs,
            self.last_scheduler_stats.num_waiting_reqs,
        ]
        if self.num_preemptions > 0:
            log_parts.append("Preemptions: %d")
            log_args.append(self.num_preemptions)
        log_parts.extend(
            [
                "GPU KV cache usage: %.1f%%",
                "Prefix cache hit rate: %.1f%%",
            ]
        )
        log_args.extend(
            [
                self.last_scheduler_stats.kv_cache_usage * 100,
                self.prefix_caching_metrics.hit_rate * 100,
            ]
        )
        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
            log_parts.append("Corrupted: %d reqs")