[MISC] Name torch profiler trace files as instance_id + rank_id so that the per-rank profiler results can be merged (#25867)

Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
wang.yuqi 2025-10-12 17:29:08 +08:00 committed by GitHub
parent 82e64c7a20
commit 76852017ea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 8 additions and 2 deletions

View File

@ -5,6 +5,7 @@ import copy
import hashlib import hashlib
import json import json
import os import os
import time
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import field, replace from dataclasses import field, replace
from functools import lru_cache from functools import lru_cache
@ -270,6 +271,9 @@ class VllmConfig:
def __post_init__(self): def __post_init__(self):
"""Verify configs are valid & consistent with each other.""" """Verify configs are valid & consistent with each other."""
# Unique instance ID (nanosecond timestamp) that gives each torch profiler run a distinct instance name.
self.instance_id = f"{time.time_ns()}"
self.try_verify_and_update_config() self.try_verify_and_update_config()
if self.model_config is not None: if self.model_config is not None:

View File

@ -79,6 +79,7 @@ class Worker(WorkerBase):
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR: if envs.VLLM_TORCH_PROFILER_DIR:
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
logger.info( logger.info(
"Profiling enabled. Traces will be saved to: %s", "Profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir, torch_profiler_trace_dir,
@ -101,7 +102,7 @@ class Worker(WorkerBase):
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
on_trace_ready=torch.profiler.tensorboard_trace_handler( on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir, use_gzip=True torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
), ),
) )
else: else:

View File

@ -39,6 +39,7 @@ class XPUWorker(Worker):
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR: if envs.VLLM_TORCH_PROFILER_DIR:
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
logger.info( logger.info(
"Profiling enabled. Traces will be saved to: %s", "Profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir, torch_profiler_trace_dir,
@ -61,7 +62,7 @@ class XPUWorker(Worker):
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
on_trace_ready=torch.profiler.tensorboard_trace_handler( on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir, use_gzip=True torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
), ),
) )
else: else: