[V1] fix torch profiling for V1 offline scenarios (#18445)
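
With V1, model execution runs in the engine and worker processes, so the
torch.profiler context the latency benchmarks opened around llm_generate()
no longer captured the GPU work. Drive profiling through the engine's
llm.start_profile() / llm.stop_profile() hooks instead, keyed off
VLLM_TORCH_PROFILER_DIR, drop the now-unused --profile-result-dir flag, and
print the self_cuda_time_total summary table from the workers when the
profiler stops.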

Signed-off-by: Divakar Verma <divakar.verma@amd.com>
Divakar Verma authored on 2025-05-27 23:16:30 -05:00; committed by GitHub
parent 9a21e331ff
commit 774c5fde30
4 changed files with 23 additions and 51 deletions

benchmarks/benchmark_latency.py

@@ -6,13 +6,12 @@ import dataclasses
 import json
 import os
 import time
-from pathlib import Path
 from typing import Any, Optional

 import numpy as np
-import torch
 from tqdm import tqdm

+import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
@@ -80,17 +79,9 @@ def main(args: argparse.Namespace):

     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                    torch.profiler.ProfilerActivity.CUDA,
-                ],
-                on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    str(profile_dir)
-                ),
-            ) as p:
-                llm_generate()
-            print(p.key_averages().table(sort_by="self_cuda_time_total"))
+            llm.start_profile()
+            llm_generate()
+            llm.stop_profile()
         else:
             start_time = time.perf_counter()
             llm_generate()
@@ -103,11 +94,7 @@ def main(args: argparse.Namespace):
         run_to_completion(profile_dir=None)

     if args.profile:
-        profile_dir = args.profile_result_dir
-        if not profile_dir:
-            profile_dir = (
-                Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
-            )
+        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -164,15 +151,6 @@ if __name__ == "__main__":
         action="store_true",
         help="profile the generation process of a single batch",
     )
-    parser.add_argument(
-        "--profile-result-dir",
-        type=str,
-        default=None,
-        help=(
-            "path to save the pytorch profiler output. Can be visualized "
-            "with ui.perfetto.dev or Tensorboard."
-        ),
-    )
     parser.add_argument(
         "--output-json",
         type=str,
@@ -193,4 +171,9 @@ if __name__ == "__main__":
     # numbers. We need to disable prefix caching by default.
     parser.set_defaults(enable_prefix_caching=False)
     args = parser.parse_args()
+    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
+        raise OSError(
+            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
+            "Please set it to a valid path to use torch profiler."
+        )
     main(args)
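
For reference, a minimal sketch of the offline profiling flow the benchmark
now relies on (model name and trace directory are illustrative, not from
this commit). The environment variable must be set before the LLM is
constructed, because the workers build their torch.profiler from it at
startup:

    import os

    # Must be set before vLLM spins up its workers; they read it once at
    # startup to construct their torch.profiler instances.
    os.environ["VLLM_TORCH_PROFILER_DIR"] = "/tmp/vllm_profile"

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")

    llm.start_profile()   # forwarded to every worker's profiler
    llm.generate(["Hello, world"], SamplingParams(max_tokens=16))
    llm.stop_profile()    # traces land under VLLM_TORCH_PROFILER_DIR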

vllm/benchmarks/latency.py

@@ -6,13 +6,12 @@ import dataclasses
 import json
 import os
 import time
-from pathlib import Path
 from typing import Any, Optional

 import numpy as np
-import torch
 from tqdm import tqdm

+import vllm.envs as envs
 from vllm import LLM, SamplingParams
 from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
                                    write_to_json)
@@ -59,13 +58,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         action="store_true",
         help="profile the generation process of a single batch",
     )
-    parser.add_argument(
-        "--profile-result-dir",
-        type=str,
-        default=None,
-        help=("path to save the pytorch profiler output. Can be visualized "
-              "with ui.perfetto.dev or Tensorboard."),
-    )
     parser.add_argument(
         "--output-json",
         type=str,
@@ -87,7 +79,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
 def main(args: argparse.Namespace):
     print(args)
-
+    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
+        raise OSError(
+            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
+            "Please set it to a valid path to use torch profiler.")

     engine_args = EngineArgs.from_cli_args(args)

     # NOTE(woosuk): If the request cannot be processed in a single batch,
@@ -131,16 +126,9 @@ def main(args: argparse.Namespace):

     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                    torch.profiler.ProfilerActivity.CUDA,
-                ],
-                on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    str(profile_dir)),
-            ) as p:
-                llm_generate()
-            print(p.key_averages().table(sort_by="self_cuda_time_total"))
+            llm.start_profile()
+            llm_generate()
+            llm.stop_profile()
         else:
             start_time = time.perf_counter()
             llm_generate()
@@ -153,10 +141,7 @@ def main(args: argparse.Namespace):
         run_to_completion(profile_dir=None)

     if args.profile:
-        profile_dir = args.profile_result_dir
-        if not profile_dir:
-            profile_dir = (Path(".") / "vllm_benchmark_result" /
-                           f"latency_result_{time.time()}")
+        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
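
Both entry points now fail fast when --profile is requested without a
destination. A sketch of the guard in isolation, assuming vllm.envs
resolves VLLM_TORCH_PROFILER_DIR from os.environ at attribute-access time
and yields None when the variable is unset:

    import argparse

    import vllm.envs as envs

    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true")
    args = parser.parse_args(["--profile"])  # simulate passing --profile

    # Unset (None) and empty values are both falsy, so either aborts the
    # run before any engine start-up cost is paid.
    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
        raise OSError(
            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
            "Please set it to a valid path to use torch profiler.")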

vllm/v1/worker/gpu_worker.py

@@ -292,6 +292,8 @@ class Worker(WorkerBase):
             self.profiler.start()
         else:
             self.profiler.stop()
+            print(self.profiler.key_averages().table(
+                sort_by="self_cuda_time_total"))

     def execute_dummy_batch(self) -> None:
         self.model_runner._dummy_run(1)
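
The self.profiler started and stopped here is the torch.profiler instance
the worker constructs when VLLM_TORCH_PROFILER_DIR is set. A standalone
sketch of the equivalent pattern (trace directory illustrative), the same
shape the benchmarks previously built inline and now delegate to the
worker:

    import torch

    profiler = torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        # Writes TensorBoard/Perfetto-readable traces when profiling stops.
        on_trace_ready=torch.profiler.tensorboard_trace_handler(
            "/tmp/vllm_profile"),
    )

    profiler.start()
    # ... run the forward passes to be profiled ...
    profiler.stop()

    # The summary the worker now prints, sorted by GPU self time.
    print(profiler.key_averages().table(sort_by="self_cuda_time_total"))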

vllm/worker/worker.py

@@ -128,6 +128,8 @@ class Worker(LocalOrDistributedWorkerBase):
         if self.profiler is None:
             raise RuntimeError("Profiler is not enabled.")
         self.profiler.stop()
+        print(
+            self.profiler.key_averages().table(sort_by="self_cuda_time_total"))

     def sleep(self, level: int = 1) -> None:
         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
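
With both the V1 and V0 workers printing the key_averages() table on
stop_profile(), offline runs again surface a per-op CUDA time summary on
the console, and the full traces written under VLLM_TORCH_PROFILER_DIR can
be inspected with ui.perfetto.dev or TensorBoard as before.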