From ae0ce1be272105f02a3ac6a63e646690be2481fb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 27 Nov 2025 12:38:53 -0800 Subject: [PATCH] [Model Runner V2][BugFix] Keep reference to GPU tensors in AsyncOutput (#29623) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/async_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py index 421fb29a7f87..f6bc607c1ae6 100644 --- a/vllm/v1/worker/gpu/async_utils.py +++ b/vllm/v1/worker/gpu/async_utils.py @@ -21,6 +21,9 @@ class AsyncOutput(AsyncModelRunnerOutput): copy_stream: torch.cuda.Stream, copy_event: torch.cuda.Event, ): + # NOTE(woosuk): We must retain references to the GPU tensors, + # as the copy operations are performed on a different CUDA stream than + # the one where the tensors were created. self.model_runner_output = model_runner_output self.sampler_output = sampler_output self.num_sampled_tokens = num_sampled_tokens @@ -51,7 +54,9 @@ class AsyncOutput(AsyncModelRunnerOutput): ) else: self.logprobs_tensors = None - self.num_sampled_tokens = num_sampled_tokens.to("cpu", non_blocking=True) + self.num_sampled_tokens_cpu = num_sampled_tokens.to( + "cpu", non_blocking=True + ) self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {} if self.model_runner_output.prompt_logprobs_dict: for k, v in self.model_runner_output.prompt_logprobs_dict.items(): @@ -63,7 +68,7 @@ class AsyncOutput(AsyncModelRunnerOutput): def get_output(self) -> ModelRunnerOutput: self.copy_event.synchronize() - num_sampled_tokens_np = self.num_sampled_tokens.numpy() + num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy() # NOTE(woosuk): The following code is to ensure compatibility with # the existing model runner.