mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 10:17:13 +08:00
minimize fill_
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
parent
34fb0cbbd0
commit
1244c25908
@@ -457,15 +457,10 @@ class GPUModelRunner:
|
||||
self.slot_mapping_cpu[:total_num_scheduled_tokens],
|
||||
non_blocking=True)
|
||||
|
||||
# TODO: Which of these are needed?
|
||||
# Fill unused with -1. Needed for reshape_and_cache
|
||||
self.slot_mapping[total_num_scheduled_tokens:].fill_(-1)
|
||||
self.seq_lens[num_reqs:].fill_(0)
|
||||
self.query_start_loc[num_reqs + 1:].fill_(-1)
|
||||
self.positions[total_num_scheduled_tokens:].fill_(0)
|
||||
self.input_batch.block_table.get_device_tensor()[num_reqs:].fill_(-1)
|
||||
|
||||
# Fill with -1s -- needed for reshape_and_cache
|
||||
self.slot_mapping[total_num_scheduled_tokens:].fill_(
|
||||
-1) # Definitely needed
|
||||
|
||||
# Prepare for cascade attention if needed.
|
||||
common_prefix_len = (scheduler_output.num_common_prefix_blocks *
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user