From ff4810ba73168237f090b0195fbe0ae4d64e0730 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Tue, 14 Oct 2025 14:46:37 -0700
Subject: [PATCH] [Minor] Group async_scheduling related fields in model
 runner init (#26736)

Signed-off-by: Nick Hill
---
 vllm/v1/worker/gpu_model_runner.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 67a93fed5274..bbb63d28289c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -375,9 +375,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         )
 
         self.use_async_scheduling = self.scheduler_config.async_scheduling
-        self.async_output_copy_stream = (
-            torch.cuda.Stream() if self.use_async_scheduling else None
-        )
+        # Separate cuda stream for overlapping transfer of sampled token ids from
+        # GPU to CPU when async scheduling is enabled.
+        self.async_output_copy_stream: torch.cuda.Stream | None = None
+        # cuda event to synchronize use of reused CPU tensors between steps
+        # when async scheduling is enabled.
+        self.prepare_inputs_event: torch.cuda.Event | None = None
+        if self.use_async_scheduling:
+            self.async_output_copy_stream = torch.cuda.Stream()
+            self.prepare_inputs_event = torch.cuda.Event()
 
         # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
         # The convention is different.
@@ -444,14 +450,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             (3, self.max_num_tokens + 1), dtype=torch.int64
         )
 
-        # CUDA event to synchronize use of reused CPU tensors between steps
-        # when async scheduling is enabled.
-        self.prepare_inputs_event: torch.cuda.Event | None = None
-        if self.use_async_scheduling:
-            self.prepare_inputs_event = torch.cuda.Event()
-            # Start in a completed state.
-            self.prepare_inputs_event.record(torch.cuda.default_stream())
-
         # None in the first PP rank. The rest are set after load_model.
         self.intermediate_tensors: IntermediateTensors | None = None
 
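---

Note for reviewers: below is a minimal, self-contained sketch of the pattern the two grouped fields implement — a dedicated CUDA stream overlaps the GPU-to-CPU copy of sampled token ids with subsequent work, while a CUDA event gates reuse of shared CPU tensors across steps. This is an illustration only, not vLLM's actual model-runner code; `sample_tokens`, the buffer shape, and the loop structure are hypothetical stand-ins.

```python
# Illustrative sketch only -- not vLLM's implementation. Assumes a CUDA GPU.
import torch

assert torch.cuda.is_available()

use_async_scheduling = True  # stands in for scheduler_config.async_scheduling

# Grouped init, mirroring the patch: both fields are None unless async
# scheduling is on, in which case both are created together.
async_output_copy_stream = torch.cuda.Stream() if use_async_scheduling else None
prepare_inputs_event = torch.cuda.Event() if use_async_scheduling else None

# Reused pinned CPU buffer; pinned memory is needed for truly async D2H copies.
token_ids_cpu = torch.empty(8, dtype=torch.int64, pin_memory=True)

def sample_tokens() -> torch.Tensor:
    """Hypothetical stand-in for the model's sampling step."""
    return torch.randint(0, 32000, (8,), device="cuda")

for step in range(3):
    if prepare_inputs_event is not None:
        # Block until the previous step's copy into the shared CPU buffer has
        # finished, so this step does not overwrite data still being read.
        # A never-recorded torch.cuda.Event reports complete, so step 0 passes.
        prepare_inputs_event.synchronize()

    sampled = sample_tokens()

    if async_output_copy_stream is not None:
        # Issue the device-to-host copy on the side stream so the main stream
        # stays free for the next step's kernels.
        async_output_copy_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(async_output_copy_stream):
            token_ids_cpu.copy_(sampled, non_blocking=True)
            # Tell the caching allocator `sampled` is in use on this stream.
            sampled.record_stream(async_output_copy_stream)
            prepare_inputs_event.record()
    else:
        token_ids_cpu.copy_(sampled)  # synchronous fallback

torch.cuda.synchronize()
print(token_ids_cpu.tolist())
```

One behavioral detail worth noting: the removed code recorded the event at init time ("Start in a completed state"), while the grouped version does not. That appears safe because a `torch.cuda.Event` that has never been recorded already reports as complete, so the first `synchronize()` or `query()` on it returns immediately.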