mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-17 03:35:54 +08:00
[V1][Minor] Optimize token_ids_cpu copy (#11692)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
2f385183f3
commit
b55ed6ef8a
@ -66,8 +66,9 @@ class InputBatch:
|
|||||||
pin_memory=False,
|
pin_memory=False,
|
||||||
)
|
)
|
||||||
self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
|
self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
|
||||||
self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
|
self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
|
||||||
self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
|
self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
|
||||||
|
self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
|
||||||
|
|
||||||
# Attention-related.
|
# Attention-related.
|
||||||
self.block_table = torch.zeros(
|
self.block_table = torch.zeros(
|
||||||
@ -189,6 +190,7 @@ class InputBatch:
|
|||||||
end_idx = start_idx + len(request.output_token_ids)
|
end_idx = start_idx + len(request.output_token_ids)
|
||||||
self.token_ids_cpu[req_index,
|
self.token_ids_cpu[req_index,
|
||||||
start_idx:end_idx] = request.output_token_ids
|
start_idx:end_idx] = request.output_token_ids
|
||||||
|
self.num_tokens[req_index] = request.num_tokens
|
||||||
|
|
||||||
self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
|
self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
|
||||||
num_blocks = len(request.block_ids)
|
num_blocks = len(request.block_ids)
|
||||||
@ -290,14 +292,15 @@ class InputBatch:
|
|||||||
self.req_ids[last_req_index] = None
|
self.req_ids[last_req_index] = None
|
||||||
self.req_id_to_index[req_id] = empty_index
|
self.req_id_to_index[req_id] = empty_index
|
||||||
|
|
||||||
# TODO(woosuk): Optimize the copy of token_ids_cpu and
|
num_tokens = self.num_tokens[last_req_index]
|
||||||
# block_table_cpu.
|
self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
|
||||||
self.token_ids_cpu[empty_index] = self.token_ids_cpu[
|
last_req_index, :num_tokens]
|
||||||
last_req_index]
|
self.num_tokens[empty_index] = num_tokens
|
||||||
self.num_prompt_tokens[empty_index] = \
|
self.num_prompt_tokens[empty_index] = \
|
||||||
self.num_prompt_tokens[last_req_index]
|
self.num_prompt_tokens[last_req_index]
|
||||||
self.num_computed_tokens_cpu[
|
self.num_computed_tokens_cpu[
|
||||||
empty_index] = self.num_computed_tokens_cpu[last_req_index]
|
empty_index] = self.num_computed_tokens_cpu[last_req_index]
|
||||||
|
# TODO(woosuk): Optimize the copy of block_table_cpu.
|
||||||
self.block_table_cpu[empty_index] = self.block_table_cpu[
|
self.block_table_cpu[empty_index] = self.block_table_cpu[
|
||||||
last_req_index]
|
last_req_index]
|
||||||
self.temperature_cpu[empty_index] = self.temperature_cpu[
|
self.temperature_cpu[empty_index] = self.temperature_cpu[
|
||||||
|
|||||||
@ -644,6 +644,7 @@ class GPUModelRunner:
|
|||||||
# Append the sampled token to the output token ids.
|
# Append the sampled token to the output token ids.
|
||||||
token_id = sampled_token_ids[i]
|
token_id = sampled_token_ids[i]
|
||||||
self.input_batch.token_ids_cpu[i, seq_len] = token_id
|
self.input_batch.token_ids_cpu[i, seq_len] = token_id
|
||||||
|
self.input_batch.num_tokens[i] += 1
|
||||||
req_state.output_token_ids.append(token_id)
|
req_state.output_token_ids.append(token_id)
|
||||||
else:
|
else:
|
||||||
# Ignore the sampled token from the partial request.
|
# Ignore the sampled token from the partial request.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user