mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-27 00:29:19 +08:00
[Model Runner V2] Don't use UVA buffer for prefill_len (#29713)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
4b17ce6815
commit
4a80ad0a25
@@ -410,6 +410,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
cu_num_new_blocks[i].append(x + len(block_ids))
|
||||
new_block_ids[i].extend(block_ids)
|
||||
overwrite.append(True)
|
||||
+ if scheduler_output.scheduled_new_reqs:
|
||||
+ self.req_states.prefill_len.copy_to_gpu()
|
||||
|
||||
# Add new blocks for the existing requests.
|
||||
cached_reqs = scheduler_output.scheduled_cached_reqs
|
||||
|
||||
@@ -117,7 +117,10 @@ class RequestState:
|
||||
self.prefill_token_ids = UvaBuffer(
|
||||
self.max_num_reqs, self.max_model_len, dtype=torch.int32
|
||||
)
|
||||
- self.prefill_len = UvaBuffer(self.max_num_reqs, dtype=torch.int32)
|
||||
+ # NOTE(woosuk): We don't use UVA for prefill_len because its GPU view
|
||||
+ # can be used outside of update_states and prepare_inputs.
|
||||
+ # Without async barrier, using UVA can cause race conditions.
|
||||
+ self.prefill_len = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
|
||||
# Number of computed tokens.
|
||||
self.num_computed_prefill_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
|
||||
self.num_computed_tokens = torch.zeros(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user