[Misc] Support nsys profiling for bench latency (#29776)

Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
This commit is contained in:
zhrrr 2025-12-18 22:52:20 +08:00 committed by GitHub
parent 100f93d2be
commit eee600c34f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -79,10 +79,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
engine_args = EngineArgs.from_cli_args(args) engine_args = EngineArgs.from_cli_args(args)
if args.profile and not engine_args.profiler_config.profiler == "torch":
raise ValueError(
"The torch profiler is not enabled. Please provide profiler_config."
)
# Lazy import to avoid importing LLM when the bench command is not selected. # Lazy import to avoid importing LLM when the bench command is not selected.
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
@ -125,8 +121,8 @@ def main(args: argparse.Namespace):
), ),
) )
def run_to_completion(profile_dir: str | None = None): def run_to_completion(do_profile: bool = False):
if profile_dir: if do_profile:
llm.start_profile() llm.start_profile()
llm_generate() llm_generate()
llm.stop_profile() llm.stop_profile()
@ -139,18 +135,24 @@ def main(args: argparse.Namespace):
print("Warming up...") print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
run_to_completion(profile_dir=None) run_to_completion(do_profile=False)
if args.profile: if args.profile:
profile_dir = engine_args.profiler_config.torch_profiler_dir profiler_config = engine_args.profiler_config
print(f"Profiling (results will be saved to '{profile_dir}')...") if profiler_config.profiler == "torch":
run_to_completion(profile_dir=profile_dir) print(
"Profiling with torch profiler (results will be saved to"
f" {profiler_config.torch_profiler_dir})..."
)
elif profiler_config.profiler == "cuda":
print("Profiling with cuda profiler ...")
run_to_completion(do_profile=True)
return return
# Benchmark. # Benchmark.
latencies = [] latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): for _ in tqdm(range(args.num_iters), desc="Bench iterations"):
latencies.append(run_to_completion(profile_dir=None)) latencies.append(run_to_completion(do_profile=False))
latencies = np.array(latencies) latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90, 99] percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages) percentiles = np.percentile(latencies, percentages)