[MISC] Rename the torch profiler filename as instance_id+rank_id for merging the Profiler results of each Rank (#25867)

Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
wang.yuqi 2025-10-12 17:29:08 +08:00 committed by GitHub
parent 82e64c7a20
commit 76852017ea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 8 additions and 2 deletions

View File

@ -5,6 +5,7 @@ import copy
import hashlib
import json
import os
import time
from contextlib import contextmanager
from dataclasses import field, replace
from functools import lru_cache
@ -270,6 +271,9 @@ class VllmConfig:
def __post_init__(self):
"""Verify configs are valid & consistent with each other."""
# To give each torch profile run a unique instance name.
self.instance_id = f"{time.time_ns()}"
self.try_verify_and_update_config()
if self.model_config is not None:

View File

@ -79,6 +79,7 @@ class Worker(WorkerBase):
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR:
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
logger.info(
"Profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir,
@ -101,7 +102,7 @@ class Worker(WorkerBase):
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir, use_gzip=True
torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
),
)
else:

View File

@ -39,6 +39,7 @@ class XPUWorker(Worker):
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR:
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
logger.info(
"Profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir,
@ -61,7 +62,7 @@ class XPUWorker(Worker):
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir, use_gzip=True
torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
),
)
else: