diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index f49f5bdd9703b..cb000d53a923d 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -400,8 +400,10 @@ class Worker(WorkerBase):
             self.profiler.start()
         else:
             self.profiler.stop()
-            print(self.profiler.key_averages().table(
-                sort_by="self_cuda_time_total"))
+            # only print profiler results on rank 0
+            if self.local_rank == 0:
+                print(self.profiler.key_averages().table(
+                    sort_by="self_cuda_time_total"))
 
     def execute_dummy_batch(self) -> None:
         self.model_runner._dummy_run(1)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 2e20c89c632c5..2d2e51c329e74 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -128,8 +128,10 @@ class Worker(LocalOrDistributedWorkerBase):
         if self.profiler is None:
             raise RuntimeError("Profiler is not enabled.")
         self.profiler.stop()
-        print(
-            self.profiler.key_averages().table(sort_by="self_cuda_time_total"))
+        # only print profiler results on rank 0
+        if self.local_rank == 0:
+            print(self.profiler.key_averages().table(
+                sort_by="self_cuda_time_total"))
 
     def sleep(self, level: int = 1) -> None:
         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]