From 8a23e933026bdb66b0b141c69454457428aa056d Mon Sep 17 00:00:00 2001
From: WANGWEI
Date: Fri, 13 Sep 2024 01:47:42 +0800
Subject: [PATCH] [BugFix] lazy init _copy_stream to avoid torch init wrong gpu instance (#8403)

---
 vllm/worker/multi_step_model_runner.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 9a196c3dfcd1f..cd9b20083c1a6 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -230,12 +230,15 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]):
         self._base_model_runner: GPUModelRunnerBase = base_model_runner
 
         self.is_multi_step = self.scheduler_config.is_multi_step
-        # used to copy tensors from GPU to CPU asynchronously
-        self._copy_stream = torch.cuda.Stream()
         self.pinned_sampled_token_ids: Optional[torch.Tensor] = None
 
         self.pythonization_cache = PythonizationCache()
 
+    @functools.cached_property
+    def _copy_stream(self):
+        # used to copy tensors from GPU to CPU asynchronously
+        return torch.cuda.Stream()
+
     def make_model_input_from_broadcasted_tensor_dict(
             self, tensor_dict: Dict[str, Any]) -> StatefulModelInput:
         model_input = (StatefulModelInput.from_broadcasted_tensor_dict(
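
Note (not part of the patch): a minimal standalone sketch of the lazy-initialization pattern this change applies. Constructing torch.cuda.Stream() eagerly in __init__ can bind the stream to whatever device is current at construction time (typically GPU 0), before the worker has pinned its own device; functools.cached_property defers stream creation until first access, after the device has been selected. The class name and device index below are hypothetical, for illustration only.

    import functools

    import torch


    class _StreamOwner:
        """Hypothetical example: defer CUDA stream creation until first use."""

        @functools.cached_property
        def _copy_stream(self) -> "torch.cuda.Stream":
            # Created lazily, so the stream is allocated on the device that is
            # current when the property is first read (e.g. after the worker
            # called torch.cuda.set_device), not on the default device 0.
            return torch.cuda.Stream()


    # Usage sketch: pin the worker's GPU first, then touch the property.
    # torch.cuda.set_device(1)                  # worker selects its GPU
    # stream = _StreamOwner()._copy_stream      # stream is created on GPU 1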