diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 729ce462cf186..04814b5991ebc 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2323,11 +2323,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None
             else:
                 sampled_ids = valid_sampled_token_ids[req_idx]
+
+            num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0
+
+            if cu_num_accepted_tokens is not None:
+                cu_num_accepted_tokens.append(
+                    cu_num_accepted_tokens[-1] + num_sampled_ids
+                )
+
             if not sampled_ids:
                 continue
 
             start_idx = self.input_batch.num_tokens_no_spec[req_idx]
-            end_idx = start_idx + len(sampled_ids)
+            end_idx = start_idx + num_sampled_ids
             assert end_idx <= self.max_model_len, (
                 "Sampled token IDs exceed the max model length. "
                 f"Total number of tokens: {end_idx} > max_model_len: "
@@ -2343,11 +2351,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             req_state = self.requests[req_id]
             req_state.output_token_ids.extend(sampled_ids)
 
-            if cu_num_accepted_tokens is not None:
-                cu_num_accepted_tokens.append(
-                    cu_num_accepted_tokens[-1] + len(sampled_ids)
-                )
-
         logprobs_lists = (
             logprobs_tensors.tolists(cu_num_accepted_tokens)
             if not self.use_async_scheduling and logprobs_tensors is not None
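
To illustrate the behavioral change in the hunks above, here is a minimal standalone sketch (not vLLM code; the function and variable names below are purely illustrative). The diff moves the `cu_num_accepted_tokens` update ahead of the `if not sampled_ids: continue` early exit, so a request that produced no sampled tokens still contributes an entry that repeats the previous cumulative value, keeping the prefix list aligned one entry per request.

```python
def cumulative_counts(sampled_per_req):
    """Build a cu_num_accepted_tokens-style prefix list with one entry per request."""
    cu = [0]
    for sampled_ids in sampled_per_req:
        num = len(sampled_ids) if sampled_ids else 0
        # Append before any early exit, so empty/invalid requests still get an
        # entry (repeating the previous cumulative value).
        cu.append(cu[-1] + num)
        if not sampled_ids:
            continue  # per-request bookkeeping would follow here in the real loop
    return cu

# Example: the second request produced no tokens, yet the prefix list still
# carries an entry for it.
print(cumulative_counts([[7, 8], None, [9]]))  # [0, 2, 2, 3]
```

This appears to be what the later `logprobs_tensors.tolists(cu_num_accepted_tokens)` call relies on: cumulative offsets that cover every request, not only those that produced tokens.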