mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-09 03:04:34 +08:00
[Core][Bookkeeping] Update cu_num_accepted_tokens for all req_index (#27629)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
This commit is contained in:
parent
ab98f6556f
commit
4574d48bab
@@ -2323,11 +2323,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None
|
sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None
|
||||||
else:
|
else:
|
||||||
sampled_ids = valid_sampled_token_ids[req_idx]
|
sampled_ids = valid_sampled_token_ids[req_idx]
|
||||||
|
|
||||||
|
num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0
|
||||||
|
|
||||||
|
if cu_num_accepted_tokens is not None:
|
||||||
|
cu_num_accepted_tokens.append(
|
||||||
|
cu_num_accepted_tokens[-1] + num_sampled_ids
|
||||||
|
)
|
||||||
|
|
||||||
if not sampled_ids:
|
if not sampled_ids:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
start_idx = self.input_batch.num_tokens_no_spec[req_idx]
|
start_idx = self.input_batch.num_tokens_no_spec[req_idx]
|
||||||
end_idx = start_idx + len(sampled_ids)
|
end_idx = start_idx + num_sampled_ids
|
||||||
assert end_idx <= self.max_model_len, (
|
assert end_idx <= self.max_model_len, (
|
||||||
"Sampled token IDs exceed the max model length. "
|
"Sampled token IDs exceed the max model length. "
|
||||||
f"Total number of tokens: {end_idx} > max_model_len: "
|
f"Total number of tokens: {end_idx} > max_model_len: "
|
||||||
@@ -2343,11 +2351,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
req_state = self.requests[req_id]
|
req_state = self.requests[req_id]
|
||||||
req_state.output_token_ids.extend(sampled_ids)
|
req_state.output_token_ids.extend(sampled_ids)
|
||||||
|
|
||||||
if cu_num_accepted_tokens is not None:
|
|
||||||
cu_num_accepted_tokens.append(
|
|
||||||
cu_num_accepted_tokens[-1] + len(sampled_ids)
|
|
||||||
)
|
|
||||||
|
|
||||||
logprobs_lists = (
|
logprobs_lists = (
|
||||||
logprobs_tensors.tolists(cu_num_accepted_tokens)
|
logprobs_tensors.tolists(cu_num_accepted_tokens)
|
||||||
if not self.use_async_scheduling and logprobs_tensors is not None
|
if not self.use_async_scheduling and logprobs_tensors is not None
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user