mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 04:15:01 +08:00
[Minor] Group async_scheduling related fields in model runner init (#26736)
Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
parent
9d6964926e
commit
ff4810ba73
@ -375,9 +375,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.use_async_scheduling = self.scheduler_config.async_scheduling
|
self.use_async_scheduling = self.scheduler_config.async_scheduling
|
||||||
self.async_output_copy_stream = (
|
# Separate cuda stream for overlapping transfer of sampled token ids from
|
||||||
torch.cuda.Stream() if self.use_async_scheduling else None
|
# GPU to CPU when async scheduling is enabled.
|
||||||
)
|
self.async_output_copy_stream: torch.cuda.Stream | None = None
|
||||||
|
# cuda event to synchronize use of reused CPU tensors between steps
|
||||||
|
# when async scheduling is enabled.
|
||||||
|
self.prepare_inputs_event: torch.cuda.Event | None = None
|
||||||
|
if self.use_async_scheduling:
|
||||||
|
self.async_output_copy_stream = torch.cuda.Stream()
|
||||||
|
self.prepare_inputs_event = torch.cuda.Event()
|
||||||
|
|
||||||
# TODO(woosuk): Provide an option to tune the max cudagraph batch size.
|
# TODO(woosuk): Provide an option to tune the max cudagraph batch size.
|
||||||
# The convention is different.
|
# The convention is different.
|
||||||
@ -444,14 +450,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
(3, self.max_num_tokens + 1), dtype=torch.int64
|
(3, self.max_num_tokens + 1), dtype=torch.int64
|
||||||
)
|
)
|
||||||
|
|
||||||
# CUDA event to synchronize use of reused CPU tensors between steps
|
|
||||||
# when async scheduling is enabled.
|
|
||||||
self.prepare_inputs_event: torch.cuda.Event | None = None
|
|
||||||
if self.use_async_scheduling:
|
|
||||||
self.prepare_inputs_event = torch.cuda.Event()
|
|
||||||
# Start in a completed state.
|
|
||||||
self.prepare_inputs_event.record(torch.cuda.default_stream())
|
|
||||||
|
|
||||||
# None in the first PP rank. The rest are set after load_model.
|
# None in the first PP rank. The rest are set after load_model.
|
||||||
self.intermediate_tensors: IntermediateTensors | None = None
|
self.intermediate_tensors: IntermediateTensors | None = None
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user