Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-05-21 18:27:03 +08:00)
[Misc] support nsys profile for bench latency (#29776)
Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
This commit is contained in:
parent 100f93d2be
commit eee600c34f
@ -79,10 +79,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
|||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
engine_args = EngineArgs.from_cli_args(args)
|
engine_args = EngineArgs.from_cli_args(args)
|
||||||
if args.profile and not engine_args.profiler_config.profiler == "torch":
|
|
||||||
raise ValueError(
|
|
||||||
"The torch profiler is not enabled. Please provide profiler_config."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Lazy import to avoid importing LLM when the bench command is not selected.
|
# Lazy import to avoid importing LLM when the bench command is not selected.
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
@ -125,8 +121,8 @@ def main(args: argparse.Namespace):
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
def run_to_completion(profile_dir: str | None = None):
|
def run_to_completion(do_profile: bool = False):
|
||||||
if profile_dir:
|
if do_profile:
|
||||||
llm.start_profile()
|
llm.start_profile()
|
||||||
llm_generate()
|
llm_generate()
|
||||||
llm.stop_profile()
|
llm.stop_profile()
|
||||||
@ -139,18 +135,24 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
print("Warming up...")
|
print("Warming up...")
|
||||||
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
|
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
|
||||||
run_to_completion(profile_dir=None)
|
run_to_completion(do_profile=False)
|
||||||
|
|
||||||
if args.profile:
|
if args.profile:
|
||||||
profile_dir = engine_args.profiler_config.torch_profiler_dir
|
profiler_config = engine_args.profiler_config
|
||||||
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
if profiler_config.profiler == "torch":
|
||||||
run_to_completion(profile_dir=profile_dir)
|
print(
|
||||||
|
"Profiling with torch profiler (results will be saved to"
|
||||||
|
f" {profiler_config.torch_profiler_dir})..."
|
||||||
|
)
|
||||||
|
elif profiler_config.profiler == "cuda":
|
||||||
|
print("Profiling with cuda profiler ...")
|
||||||
|
run_to_completion(do_profile=True)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Benchmark.
|
# Benchmark.
|
||||||
latencies = []
|
latencies = []
|
||||||
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
|
for _ in tqdm(range(args.num_iters), desc="Bench iterations"):
|
||||||
latencies.append(run_to_completion(profile_dir=None))
|
latencies.append(run_to_completion(do_profile=False))
|
||||||
latencies = np.array(latencies)
|
latencies = np.array(latencies)
|
||||||
percentages = [10, 25, 50, 75, 90, 99]
|
percentages = [10, 25, 50, 75, 90, 99]
|
||||||
percentiles = np.percentile(latencies, percentages)
|
percentiles = np.percentile(latencies, percentages)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user