Make PyTorch profiler gzip and CUDA time dump configurable (#29568)
Signed-off-by: Yifei Zhang <yifei.zhang1992@outlook.com>
parent f72a817bdf
commit 1ab8fc8197
@@ -11,6 +11,8 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
 - `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default
 - `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default
 - `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default
+- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default
+- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default

 The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
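For context (not part of the diff): a minimal offline sketch of the new flags in use. The trace directory and model name are placeholders, and it assumes vLLM's offline `LLM.start_profile()`/`LLM.stop_profile()` helpers; `VLLM_TORCH_PROFILER_DIR` must be set before the engine is constructed, since the profiler is wired up at worker init.

# Hedged usage sketch; paths and model name are placeholders.
import os

os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"      # placeholder dir
os.environ["VLLM_TORCH_PROFILER_USE_GZIP"] = "0"              # write raw .json traces
os.environ["VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL"] = "0"  # skip the summary table

from vllm import LLM  # imported after the env vars are set

llm = LLM(model="facebook/opt-125m")  # placeholder model
llm.start_profile()
llm.generate(["Hello, world!"])
llm.stop_profile()  # traces land in VLLM_TORCH_PROFILER_DIR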
vllm/envs.py (13 lines changed)
@@ -100,6 +100,8 @@ if TYPE_CHECKING:
     VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
     VLLM_PROFILER_DELAY_ITERS: int = 0
     VLLM_PROFILER_MAX_ITERS: int = 0
+    VLLM_TORCH_PROFILER_USE_GZIP: bool = True
+    VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: bool = True
     VLLM_USE_TRITON_AWQ: bool = False
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False
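These annotations exist only for type checkers. At runtime, vllm/envs.py resolves attribute access through a module-level `__getattr__` (PEP 562) that evaluates the matching lambda on every lookup, so flags always reflect the current process environment. A condensed sketch of that pattern, with one entry shown and the rest elided:

# envs_sketch.py -- condensed sketch of the vllm/envs.py lookup pattern.
import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_TORCH_PROFILER_USE_GZIP": lambda: bool(
        os.getenv("VLLM_TORCH_PROFILER_USE_GZIP", "1") != "0"
    ),
}

def __getattr__(name: str) -> Any:
    # Called for attribute access on the module, e.g. envs_sketch.VLLM_...
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module has no attribute {name!r}")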
@@ -890,6 +892,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Maximum number of iterations to profile when using the torch/torch CUDA profiler.
     # If set to 0, will not limit the number of iterations.
     "VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")),
+    # Control whether torch profiler gzip-compresses profiling files.
+    # Set VLLM_TORCH_PROFILER_USE_GZIP=0 to disable gzip (enabled by default).
+    "VLLM_TORCH_PROFILER_USE_GZIP": lambda: bool(
+        os.getenv("VLLM_TORCH_PROFILER_USE_GZIP", "1") != "0"
+    ),
+    # Control whether torch profiler dumps the self_cuda_time_total table.
+    # Set VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0 to disable dumping
+    # (enabled by default).
+    "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: bool(
+        os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", "1") != "0"
+    ),
     # If set, vLLM will use Triton implementations of AWQ.
     "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
     # If set, allow loading or unloading lora adapters in runtime,
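Two parsing conventions are visible in this hunk: the new flags treat any value other than the literal "0" as enabled, while bool(int(...)) entries such as `VLLM_USE_TRITON_AWQ` accept only integer literals. A small demo of the difference:

# Demo of the two env-var parsing styles used in environment_variables.
import os

os.environ["DEMO_FLAG"] = "true"

# New style: enabled unless the value is exactly "0"; tolerant of "true", "yes", ...
print(os.environ.get("DEMO_FLAG", "1") != "0")  # True

# bool(int(...)) style: raises on anything that is not an integer literal.
try:
    bool(int(os.environ.get("DEMO_FLAG", "0")))
except ValueError as exc:
    print(f"rejected: {exc}")  # invalid literal for int() with base 10: 'true'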
@@ -162,7 +162,9 @@ class TorchProfilerWrapper(WorkerProfiler):
             with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
             with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
             on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
+                torch_profiler_trace_dir,
+                worker_name=worker_name,
+                use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
             ),
         )
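The same handler can be exercised outside vLLM. A self-contained, CPU-only sketch (with a local `use_gzip` standing in for `envs.VLLM_TORCH_PROFILER_USE_GZIP`, and `./trace_dir`/`worker_0` as placeholders):

# Standalone torch.profiler sketch mirroring the wrapper's wiring.
import torch
from torch.profiler import ProfilerActivity, profile, tensorboard_trace_handler

use_gzip = True  # stands in for envs.VLLM_TORCH_PROFILER_USE_GZIP

prof = profile(
    activities=[ProfilerActivity.CPU],  # add ProfilerActivity.CUDA on GPU hosts
    with_stack=True,
    on_trace_ready=tensorboard_trace_handler(
        "./trace_dir", worker_name="worker_0", use_gzip=use_gzip
    ),
)
prof.start()
torch.randn(512, 512) @ torch.randn(512, 512)  # some work to record
prof.stop()  # handler fires here and writes worker_0.<ts>.pt.trace.json(.gz)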
@@ -174,18 +176,19 @@ class TorchProfilerWrapper(WorkerProfiler):
     def _stop(self) -> None:
         self.profiler.stop()

-        rank = self.local_rank
-        profiler_dir = envs.VLLM_TORCH_PROFILER_DIR
-        profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
-        sort_key = "self_cuda_time_total"
-        table = self.profiler.key_averages().table(sort_by=sort_key)
-
-        with open(profiler_out_file, "w") as f:
-            print(table, file=f)
-
-        # only print profiler results on rank 0
-        if rank == 0:
-            print(table)
+        if envs.VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL:
+            rank = self.local_rank
+            profiler_dir = envs.VLLM_TORCH_PROFILER_DIR
+            profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
+            sort_key = "self_cuda_time_total"
+            table = self.profiler.key_averages().table(sort_by=sort_key)
+
+            with open(profiler_out_file, "w") as f:
+                print(table, file=f)
+
+            # only print profiler results on rank 0
+            if rank == 0:
+                print(table)

     @override
     def annotate_context_manager(self, name: str):
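The table this branch now optionally skips can be reproduced in isolation. A CPU-only sketch (the wrapper itself sorts by `self_cuda_time_total`; on a machine without CUDA the equivalent column is `self_cpu_time_total`):

# Reproduces the summary table that _stop() writes to profiler_out_<rank>.txt.
import torch
from torch.profiler import ProfilerActivity, profile

with profile(activities=[ProfilerActivity.CPU]) as prof:
    torch.randn(512, 512) @ torch.randn(512, 512)

print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))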
@@ -190,7 +190,9 @@ class AsyncLLM(EngineClient):
                 ],
                 with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    envs.VLLM_TORCH_PROFILER_DIR, worker_name=worker_name, use_gzip=True
+                    envs.VLLM_TORCH_PROFILER_DIR,
+                    worker_name=worker_name,
+                    use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
                 ),
             )
         else:
@@ -64,7 +64,9 @@ class XPUWorker(Worker):
             with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
             with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
             on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
+                torch_profiler_trace_dir,
+                worker_name=worker_name,
+                use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
             ),
         )
     else: