mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 04:15:01 +08:00
[Metrics] Log number of preempted requests (#28522)
Add tracking and periodic logging for the number of preempted requests in the metrics logger. This helps monitor system behavior under load.

Signed-off-by: Yining Liu <610lyn@gmail.com>
This commit is contained in:
parent
8cfbe89b93
commit
ecf8230d4d
@ -118,12 +118,14 @@ class LoggingStatLogger(StatLoggerBase):
|
|||||||
self.num_prompt_tokens: int = 0
|
self.num_prompt_tokens: int = 0
|
||||||
self.num_generation_tokens: int = 0
|
self.num_generation_tokens: int = 0
|
||||||
self.num_corrupted_reqs: int = 0
|
self.num_corrupted_reqs: int = 0
|
||||||
|
self.num_preemptions: int = 0
|
||||||
|
|
||||||
def _track_iteration_stats(self, iteration_stats: IterationStats):
|
def _track_iteration_stats(self, iteration_stats: IterationStats):
|
||||||
# Save tracked stats for token counters.
|
# Save tracked stats for token counters.
|
||||||
self.num_prompt_tokens += iteration_stats.num_prompt_tokens
|
self.num_prompt_tokens += iteration_stats.num_prompt_tokens
|
||||||
self.num_generation_tokens += iteration_stats.num_generation_tokens
|
self.num_generation_tokens += iteration_stats.num_generation_tokens
|
||||||
self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs
|
self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs
|
||||||
|
self.num_preemptions += iteration_stats.num_preempted_reqs
|
||||||
|
|
||||||
def _get_throughput(self, tracked_stats: int, now: float) -> float:
|
def _get_throughput(self, tracked_stats: int, now: float) -> float:
|
||||||
# Compute summary metrics for tracked stats
|
# Compute summary metrics for tracked stats
|
||||||
@ -196,17 +198,30 @@ class LoggingStatLogger(StatLoggerBase):
|
|||||||
"Avg generation throughput: %.1f tokens/s",
|
"Avg generation throughput: %.1f tokens/s",
|
||||||
"Running: %d reqs",
|
"Running: %d reqs",
|
||||||
"Waiting: %d reqs",
|
"Waiting: %d reqs",
|
||||||
"GPU KV cache usage: %.1f%%",
|
|
||||||
"Prefix cache hit rate: %.1f%%",
|
|
||||||
]
|
]
|
||||||
log_args = [
|
log_args = [
|
||||||
self.last_prompt_throughput,
|
self.last_prompt_throughput,
|
||||||
self.last_generation_throughput,
|
self.last_generation_throughput,
|
||||||
self.last_scheduler_stats.num_running_reqs,
|
self.last_scheduler_stats.num_running_reqs,
|
||||||
self.last_scheduler_stats.num_waiting_reqs,
|
self.last_scheduler_stats.num_waiting_reqs,
|
||||||
|
]
|
||||||
|
|
||||||
|
if self.num_preemptions > 0:
|
||||||
|
log_parts.append("Preemptions: %d")
|
||||||
|
log_args.append(self.num_preemptions)
|
||||||
|
|
||||||
|
log_parts.extend(
|
||||||
|
[
|
||||||
|
"GPU KV cache usage: %.1f%%",
|
||||||
|
"Prefix cache hit rate: %.1f%%",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
log_args.extend(
|
||||||
|
[
|
||||||
self.last_scheduler_stats.kv_cache_usage * 100,
|
self.last_scheduler_stats.kv_cache_usage * 100,
|
||||||
self.prefix_caching_metrics.hit_rate * 100,
|
self.prefix_caching_metrics.hit_rate * 100,
|
||||||
]
|
]
|
||||||
|
)
|
||||||
|
|
||||||
if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
|
if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
|
||||||
log_parts.append("Corrupted: %d reqs")
|
log_parts.append("Corrupted: %d reqs")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user