[Hardware][TPU][Bugfix] Fix v1 mp profiler (#15409)

Signed-off-by: Siyuan Liu <lsiyuan@google.com>
This commit is contained in:
Siyuan Liu 2025-03-25 01:43:00 -07:00 committed by GitHub
parent 051da7efe3
commit 4157f563b4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -66,14 +66,18 @@ class TPUWorker:
from vllm.utils import init_cached_hf_modules
init_cached_hf_modules()
# Delay profiler initialization to the start of the profiling.
# This is because in vLLM V1, MP runtime is initialized before the
# TPU Worker is initialized. The profiler server needs to start after
# MP runtime is initialized.
self.profiler = None
self.profile_dir = None
if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1:
# For TPU, we can only have 1 active profiler session for 1 profiler
# server. So we only profile on rank0.
self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR
logger.info("Profiling enabled. Traces will be saved to: %s",
self.profile_dir)
self.profiler = xp.start_server(9012)
if self.model_config.seed is None:
self.model_config.seed = 0
@ -168,9 +172,11 @@ class TPUWorker:
def profile(self, is_start: bool = True):
if self.rank < 1:
if self.profiler is None:
if self.profile_dir is None:
raise RuntimeError("Profiler is not enabled.")
if is_start:
if self.profiler is None:
self.profiler = xp.start_server(9012)
xp.start_trace(self.profile_dir)
else:
xp.stop_trace()