[Core] Simplify setting new_token_ids in CachedRequestData (#26388)

Signed-off-by: Nick Hill <nhill@redhat.com>
2025-12-19 06:45:01 +08:00 · 2025-10-07 20:32:37 -07:00 · 2025-10-07 20:32:37 -07:00 · 067da2d1df
commit 067da2d1df
parent 046118b938
1 changed file with 1 addition and 7 deletions
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -712,7 +712,6 @@ class Scheduler(SchedulerInterface):
        num_computed_tokens: list[int] = []
        num_output_tokens: list[int] = []
        use_connector = self.connector is not None
        for req in itertools.chain(running_reqs, resumed_reqs):
            req_id = req.request_id
            req_ids.append(req_id)
@@ -729,16 +728,11 @@ class Scheduler(SchedulerInterface):
                    req.num_computed_tokens : req.num_computed_tokens + num_tokens
                ]
                new_token_ids.append(token_ids)
            elif use_connector:
                # When using a KVConnector, we add a placeholder to avoid index
                # out of bounds errors. TODO: Remove this once the KVConnector
                # is updated to handle token IDs properly.
                new_token_ids.append([])
            new_block_ids.append(
                req_to_new_blocks[req_id].get_block_ids(allow_none=True)
            )
            num_computed_tokens.append(req.num_computed_tokens)
-            num_output_tokens.append(len(req.output_token_ids))
+            num_output_tokens.append(req.num_output_tokens)
        # Because resumed_reqs is usually empty, it is more efficient to do
        # in-place appending so that we don't need to allocate a new list.
        resumed_from_preemption = [False] * len(running_reqs)