From a35a8a83920ed202a815ca2fead0969758229996 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 4 Apr 2025 16:52:41 +0100 Subject: [PATCH] [V1][Spec Decode] Avoid logging useless nan metrics (#16023) Signed-off-by: Mark McLoughlin --- tests/v1/core/test_scheduler.py | 17 +++++++++-------- vllm/v1/core/sched/scheduler.py | 25 +++++++++++++++++++------ 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index a087c41ab3a9f..21a1cbf540ae0 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -671,10 +671,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i]) # No draft or accepted tokens counted yet - assert engine_core_outputs.scheduler_stats.spec_decoding_stats is not None - stats = engine_core_outputs.scheduler_stats.spec_decoding_stats - assert stats.num_draft_tokens == 0 - assert stats.num_accepted_tokens == 0 + assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None # Schedule the speculated tokens for validation output = scheduler.schedule() @@ -702,7 +699,11 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): engine_core_outputs = scheduler.update_from_output(output, model_runner_output) - assert engine_core_outputs.scheduler_stats.spec_decoding_stats is not None - stats = engine_core_outputs.scheduler_stats.spec_decoding_stats - assert stats.num_draft_tokens == expected[0] - assert stats.num_accepted_tokens == expected[1] + scheduler_stats = engine_core_outputs.scheduler_stats + if expected[0] == 0: + assert scheduler_stats.spec_decoding_stats is None + else: + assert scheduler_stats.spec_decoding_stats is not None + stats = scheduler_stats.spec_decoding_stats + assert stats.num_draft_tokens == expected[0] + assert stats.num_accepted_tokens == expected[1] diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index a0865c8fd8457..81f8ad25051c3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -553,11 +553,11 @@ class Scheduler(SchedulerInterface): spec_token_ids = model_runner_output.spec_token_ids logprobs = model_runner_output.logprobs prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict - spec_decoding_stats = SpecDecodingStats() if self.log_stats else None num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: list[Request] = [] outputs: list[EngineCoreOutput] = [] + spec_decoding_stats: Optional[SpecDecodingStats] = None # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below # loop can be a performance bottleneck. We should do our best to avoid @@ -585,11 +585,10 @@ class Scheduler(SchedulerInterface): num_tokens_rejected = (len(scheduled_spec_token_ids) + 1 - len(generated_token_ids)) request.num_computed_tokens -= num_tokens_rejected - - if spec_decoding_stats is not None: - spec_decoding_stats.observe( - num_draft_tokens=len(scheduled_spec_token_ids), - num_accepted_tokens=len(generated_token_ids) - 1) + spec_decoding_stats = self.make_spec_decoding_stats( + spec_decoding_stats, + num_draft_tokens=len(scheduled_spec_token_ids), + num_accepted_tokens=len(generated_token_ids) - 1) cached_encoder_input_ids = ( self.encoder_cache_manager.get_cached_input_ids(request)) @@ -744,3 +743,17 @@ class Scheduler(SchedulerInterface): prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(), spec_decoding_stats=spec_decoding_stats, ) + + def make_spec_decoding_stats( + self, + spec_decoding_stats: Optional[SpecDecodingStats], + num_draft_tokens: int, + num_accepted_tokens: int, + ) -> Optional[SpecDecodingStats]: + if not self.log_stats: + return None + if spec_decoding_stats is None: + spec_decoding_stats = SpecDecodingStats() + spec_decoding_stats.observe(num_draft_tokens=num_draft_tokens, + num_accepted_tokens=num_accepted_tokens) + return spec_decoding_stats