From 4157f563b4e82881309521c5aecabd4527db0a52 Mon Sep 17 00:00:00 2001
From: Siyuan Liu <lsiyuan@google.com>
Date: Tue, 25 Mar 2025 01:43:00 -0700
Subject: [PATCH] [Hardware][TPU][Bugfix] Fix v1 mp profiler (#15409)

Signed-off-by: Siyuan Liu <lsiyuan@google.com>
---
 vllm/v1/worker/tpu_worker.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index d56c25dd9da24..9a380373d4617 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -66,14 +66,18 @@ class TPUWorker:
             from vllm.utils import init_cached_hf_modules
             init_cached_hf_modules()
 
+        # Delay profiler initialization until profiling actually starts.
+        # This is because in vLLM V1, the MP runtime is initialized before the
+        # TPU Worker is initialized. The profiler server needs to start after
+        # the MP runtime is initialized.
         self.profiler = None
+        self.profile_dir = None
         if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1:
             # For TPU, we can only have 1 active profiler session for 1 profiler
             # server. So we only profile on rank0.
             self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR
             logger.info("Profiling enabled. Traces will be saved to: %s",
                         self.profile_dir)
-            self.profiler = xp.start_server(9012)
 
         if self.model_config.seed is None:
             self.model_config.seed = 0
@@ -168,9 +172,11 @@ class TPUWorker:
 
     def profile(self, is_start: bool = True):
         if self.rank < 1:
-            if self.profiler is None:
+            if self.profile_dir is None:
                 raise RuntimeError("Profiler is not enabled.")
             if is_start:
+                if self.profiler is None:
+                    self.profiler = xp.start_server(9012)
                 xp.start_trace(self.profile_dir)
             else:
                 xp.stop_trace()