[BugFix] Lazily initialize _copy_stream so that torch does not initialize the wrong GPU instance (#8403)

2025-12-30 12:51:49 +08:00 · 2024-09-13 01:47:42 +08:00 · 2024-09-13 01:47:42 +08:00 · 8a23e93302
commit 8a23e93302
parent c6202daeed
1 changed files with 5 additions and 2 deletions
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -230,12 +230,15 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]):
        self._base_model_runner: GPUModelRunnerBase = base_model_runner

        self.is_multi_step = self.scheduler_config.is_multi_step
-        # used to copy tensors from GPU to CPU asynchronously
-        self._copy_stream = torch.cuda.Stream()
        self.pinned_sampled_token_ids: Optional[torch.Tensor] = None

        self.pythonization_cache = PythonizationCache()

+    @functools.cached_property
+    def _copy_stream(self):
+        # used to copy tensors from GPU to CPU asynchronously
+        return torch.cuda.Stream()
+
    def make_model_input_from_broadcasted_tensor_dict(
            self, tensor_dict: Dict[str, Any]) -> StatefulModelInput:
        model_input = (StatefulModelInput.from_broadcasted_tensor_dict(