diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 61640e856ac1c..4323141c435b7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -472,9 +472,9 @@ class Scheduler(SchedulerInterface): num_computed_tokens = ( num_new_local_computed_tokens + num_external_computed_tokens ) - # KVTransfer: WAITING reqs have num_computed_tokens > 0 - # after async KV recvs are completed. else: + # KVTransfer: WAITING reqs have num_computed_tokens > 0 + # after async KV recvs are completed. new_computed_blocks = self.kv_cache_manager.empty_kv_cache_blocks num_new_local_computed_tokens = 0 num_computed_tokens = request.num_computed_tokens @@ -483,12 +483,12 @@ class Scheduler(SchedulerInterface): external_load_encoder_input = [] new_encoder_compute_budget = encoder_compute_budget - # KVTransfer: loading remote KV, do not allocate for new work. if load_kv_async: + # KVTransfer: loading remote KV, do not allocate for new work. assert num_external_computed_tokens > 0 num_new_tokens = 0 - # Number of tokens to be scheduled. else: + # Number of tokens to be scheduled. # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed # requests, which have output tokens.