diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 94a433c36b872..c43917baaa9eb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1716,7 +1716,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # num_tokens = ubatch_slices[1][1].stop print(f"RUNNING UBATCH {ubatch_slices} is_dummy_run: {is_dummy_run} num_tokens_across_dp{num_tokens_across_dp}") # assert not is_dummy_run - compute_stream = torch.cuda.Stream(device=self.device) + compute_stream = torch.cuda.current_stream() ubatch_metadata = _make_ubatch_metadata( ubatch_slices=ubatch_slices, attn_metadata=attn_metadata,