# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Generates GPU kernel analysis output from an nsys-rep file.

Calls `nsys stats -r cuda_gpu_trace`, computes non-overlapped GPU cycles,
then generates csv and html output for analysis.
"""

import argparse
import logging
import os

import regex as re

logger = logging.getLogger(__name__)


# helper for loading the kernel-annotation mapping
def load_engine_model():
    """returns engine_model built from all json files in the current dir"""
    import glob
    import json

    engine_model = {}
    json_files = glob.glob(os.path.join(os.path.dirname(__file__) or ".", "*.json"))
    for fname in json_files:
        with open(fname, encoding="utf-8") as f:
            engine_model.update(json.load(f))
    return engine_model


class GPUTrace2Graph:
    """
    Parses output of an nsys report, generates csv and bar chart output
    """

    def __init__(self):
        import pandas as pd  # avoid importing until needed

        self.pd = pd
        self.pd.options.mode.copy_on_write = True

    # helper functions for generating trace->summary csvs
    def gen_nonoverlapped_sum_from_gputrace(self, in_file, out_file):
        logger.info("loading %s", in_file)
        df = self.pd.read_csv(
            in_file, usecols=["Start (ns)", "Duration (ns)", "Device", "Strm", "Name"]
        )
        df["End (ns)"] = df["Start (ns)"] + df["Duration (ns)"]
        df = self.sum_non_overlapping_intervals(df)
        # build table with elapsed times per kernel
        df["Instances"] = 1
        df_sum = df.groupby("Name", as_index=False).agg(
            {"Elapsed Time (ns)": "sum", "Duration (ns)": "sum", "Instances": "size"}
        )
        # generate csv; Total Time is the raw sum of per-launch durations,
        # Elapsed Time excludes portions that overlapped earlier kernels
        df_sum["Total Time (sec)"] = df_sum["Duration (ns)"] / 1e9
        df_sum["Elapsed Time (sec)"] = df_sum["Elapsed Time (ns)"] / 1e9
        df_sum = df_sum.sort_values(by="Elapsed Time (sec)", ascending=False)
        df_sum[["Elapsed Time (sec)", "Total Time (sec)", "Instances", "Name"]].to_csv(
            out_file, index=False
        )

    def sum_non_overlapping_intervals(self, df):
        """
        returns a new df, sorted by start time, with an Elapsed Time (ns)
        column that counts only time not already covered by earlier kernels
        """
        logger.info("sorting %d trace records by start time", len(df))
        # Sort by start time and reset index
        df = df.sort_values(by="Start (ns)").reset_index(drop=True)
        # Initialize elapsed time as duration
        df["Elapsed Time (ns)"] = df["Duration (ns)"]
        # Get numpy arrays for faster element access
        starts = df["Start (ns)"].values
        ends = df["End (ns)"].values
        # Keep track of current interval end
        current_end = ends[0]
        elapsed_col = df.columns.get_loc("Elapsed Time (ns)")
        # print progress roughly every 1% of rows (at least every row,
        # so small traces don't divide by zero)
        display_units = max(1, len(df) // 100)
        # Update current_end for overlapping intervals
        for i in range(1, len(df)):
            if i % display_units == 0:
                print(f"processing trace: {int(i / len(df) * 100)} %", end="\r")
            if starts[i] <= current_end:
                if ends[i] > current_end:
                    # Partial overlap: count only the non-overlapped tail
                    df.iloc[i, elapsed_col] = ends[i] - current_end
                    current_end = ends[i]
                else:
                    # Complete overlap: contributes no new elapsed time
                    df.iloc[i, elapsed_col] = 0
            else:
                # No overlap
                current_end = ends[i]
        return df
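    # Minimal sketch of the overlap accounting above on made-up data: kernel
    # "a" spans [0, 10) and kernel "b" spans [5, 15), so "b" only contributes
    # its non-overlapped tail of 5 ns. Kept as a commented reference so it is
    # not executed at import time:
    #
    #   toy = pd.DataFrame(
    #       {"Start (ns)": [0, 5], "Duration (ns)": [10, 10],
    #        "Device": 0, "Strm": [1, 2], "Name": ["a", "b"]}
    #   )
    #   toy["End (ns)"] = toy["Start (ns)"] + toy["Duration (ns)"]
    #   out = GPUTrace2Graph().sum_non_overlapping_intervals(toy)
    #   # out["Elapsed Time (ns)"] -> [10, 5]; raw durations would sum to 20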
    # functions for generating html files
    def make_html(self, df, output_dir, title):
        """make an html graph and csv mapping table from df"""
        import plotly.express as px

        if df.empty:
            return
        output_name = output_dir + "/result"
        if not title:
            title = "Model_Engine"
        x = "Model_Engine"
        y = "Elapsed Time (sec)"
        color = "Category"
        # generate kernel mapping table;
        # sort Model_Engine categories by last field after underscore
        df["Model_Engine"] = self.pd.Categorical(
            df["Model_Engine"],
            sorted(df["Model_Engine"].unique(), key=lambda x: x.split("_")[-1]),
        )
        df[["Model_Engine", color, "Instances", "Name", y]].sort_values(
            by=color
        ).to_csv(f"{output_name}.csv", index=False)
        graph = px.histogram(
            df.round(2),
            x=x,
            y=y,
            title=f"{y} for {title}",
            color=color,
            text_auto=True,
        )
        # wrap x axis labels
        graph.update_xaxes(automargin=True)
        graph.write_html(f"{output_name}.html")
        # generate a data table with one column per Model_Engine and append
        # it to result.html
        pivot_df = df.pivot_table(
            values="Elapsed Time (sec)",
            index="Category",
            columns="Model_Engine",
            aggfunc="sum",
            observed=False,
        ).round(2)
        # Add sum row at bottom
        pivot_df.loc["total_elapsed_sec"] = pivot_df.sum()
        pivot_df.fillna("").to_html("temp.html")
        with (
            open(f"{output_name}.html", "a", encoding="utf-8") as outfile,
            open("temp.html", encoding="utf-8") as infile,
        ):
            outfile.write(infile.read())
        os.remove("temp.html")
        print(
            f"Finished generating: \n"
            f"  {output_name}.html for stacked bar chart \n"
            f"  {output_name}.csv for Kernel-Category mapping"
        )

    def anno_gpu_kernname(self, df, mapping):
        """add "Category" column by matching kernel names against mapping"""

        def anno_gpu_kernname_helper(name):
            # first matching regex wins; unmatched kernels stay uncategorized
            for kern_name, val in mapping.items():
                if re.search(kern_name, name):
                    return val
            return None

        df["Category"] = df["Name"].apply(anno_gpu_kernname_helper)

    def make_nongpu_row(self, df, nongpu_sec):
        """build a non-GPU time row, modeled on the last row of df, for the
        caller to append"""
        nongpu_row = self.pd.DataFrame([df.iloc[-1]])
        nongpu_row["Category"] = nongpu_row["Name"] = "CPU(non-GPU)"
        nongpu_row["Instances"] = 1
        nongpu_row["Elapsed Time (sec)"] = nongpu_sec
        return nongpu_row

    def is_valid_file(self, base_file):
        """asserts that base_file exists and is non-empty"""
        assert os.path.isfile(base_file) and os.path.getsize(base_file) > 0, (
            f"{base_file} doesn't exist or is empty"
        )

    def should_gen_file(self, new_file, base_file):
        """figure out if new_file should be (re)generated from base_file"""
        self.is_valid_file(base_file)
        if os.path.exists(new_file) and os.path.getmtime(new_file) > os.path.getmtime(
            base_file
        ):
            logger.info("reusing %s", new_file)
            return False
        logger.info("generating %s", new_file)
        return True

    def gen_sum_file(self, file, nsys_cmd):
        """
        generates a sum file from the nsys trace with times per kernel and
        returns the name of the sum file
        """
        import subprocess

        file_dir = os.path.dirname(file)
        file_name = os.path.basename(file)
        if not file_dir:
            file_dir = "."
        # Walk through trace and get the total non-overlapped time
        nsys_stats_file = f"{file_dir}/{file_name}_cuda_gpu_trace.csv"
        sum_file = f"{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv"
        if self.should_gen_file(nsys_stats_file, file):
            cmd = [
                nsys_cmd,
                "stats",
                "-r",
                "cuda_gpu_trace",
                file,
                "-o",
                f"{file_dir}/{file_name}",
            ]
            cmd_str = " ".join(cmd)
            logger.info("+ %s", cmd_str)
            # estimate time based on calibrated 240 MB/min
            file_size_mb = os.path.getsize(file) / 1e6
            logger.info(
                "nsys stats for %.2f MB file expected to take %.2f min",
                file_size_mb,
                file_size_mb / 240,
            )
            try:
                subprocess.run(cmd, check=True)
            except Exception:
                logger.error("%s failed; use --nsys_cmd to specify nsys path", cmd_str)
                raise SystemExit(1) from None
        logger.info("generating non-overlapped sum %s", sum_file)
        self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file)
        self.is_valid_file(sum_file)
        logger.info("Finished generating %s", sum_file)
        return sum_file
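    # Sketch of the cuda_gpu_trace CSV that gen_sum_file() consumes. Only the
    # columns read by gen_nonoverlapped_sum_from_gputrace() are shown, and the
    # row values are made up; a real nsys export carries more columns:
    #
    #   Start (ns),Duration (ns),Strm,Device,Name
    #   1000,2000,7,NVIDIA H100,ampere_fp16_s16816gemm...
    #
    # nsys is expected to write it as <rep>_cuda_gpu_trace.csv, matching the
    # nsys_stats_file path constructed above.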
    def gen_graph(self, in_file, out_dir, title, nsys_cmd, engine_model):
        """generates graph and csv file from in_file into out_dir"""
        # Initialize an empty DataFrame to store combined data
        combined_df = self.pd.DataFrame()
        for idx, (file, engine, model, total_sec) in enumerate(in_file):
            file_name = os.path.basename(file)
            sum_file = self.gen_sum_file(file, nsys_cmd)
            # read kernel summary file
            df = self.pd.read_csv(sum_file)
            # annotate kernels with their categories
            assert engine_model.get(engine), f"engine {engine} unknown"
            assert engine_model[engine].get(model), f"model {model} unknown"
            # remove nsys-rep from file_name for a shorter x-label
            file_name = file_name.replace(".nsys-rep", "")
            df["Model_Engine"] = f"{model}_{engine}_{file_name}_{idx}"
            self.anno_gpu_kernname(df, engine_model[engine][model])
            # patch in non-gpu time
            gpu_sec = round(df["Elapsed Time (sec)"].sum(), 1)
            total_sec = round(float(total_sec), 1)
            if total_sec < gpu_sec:
                logger.warning(
                    "elapsed sec %.2f < GPU sec %.2f; resetting elapsed sec",
                    total_sec,
                    gpu_sec,
                )
                total_sec = gpu_sec
            nongpu_row = self.make_nongpu_row(df, total_sec - gpu_sec)
            df = self.pd.concat([df, nongpu_row], ignore_index=True)
            combined_df = self.pd.concat([combined_df, df], ignore_index=True)
        if out_dir is None:
            out_dir = "."
        else:
            os.makedirs(out_dir, exist_ok=True)
        # generate html file
        self.make_html(combined_df, out_dir, title)


def parse_tuple(s):
    """split a comma-separated --in_file argument into a tuple"""
    return tuple(s.split(","))


def main():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
    )
    parser = argparse.ArgumentParser(
        description=(
            "Process nsys rep and generate kernel non-overlapped cycles. \n"
            "Example:\n"
            "gputrc2graph.py --in_file d1.nsys-rep,vllm,llama,100 \n"
            "d2.nsys-rep,vllm,gpt-oss,102 "
            '--out_dir results/ --title "Model=gpt-oss vLLM chart"'
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # load supported engine/model combinations
    engine_model_supported = load_engine_model()
    # Get a string representation of supported engine/model combinations
    engine_model_supported_str = ", ".join(
        f"{engine}:[{', '.join(models.keys())}]"
        for engine, models in engine_model_supported.items()
    )
    parser.add_argument(
        "--in_file",
        type=parse_tuple,
        nargs="+",
        help=(
            "list of (nsys-rep, engine, model, elapsed_nonprofiled_sec) "
            "separated by spaces. elapsed_nonprofiled_sec is the runtime "
            "without profiling, used to calculate non-GPU time. Specify 0 "
            "to use the elapsed time from the nsys-rep, but that might "
            "inflate non-GPU time. "
            f"Available engine:[model] are: {engine_model_supported_str} "
            "Example: --in_file d1.nsys-rep,vllm,llama,100 "
            "d2.nsys-rep,vllm,gpt-oss,102"
        ),
        required=True,
    )
    parser.add_argument("--out_dir", help="output dir for result.csv/html")
    parser.add_argument("--title", help="title for html chart")
    parser.add_argument(
        "--nsys_cmd",
        help="nsys cmd, e.g. /usr/bin/nsys. Default: nsys",
        default="nsys",
    )
    args = parser.parse_args()
    gputrace = GPUTrace2Graph()
    gputrace.gen_graph(
        args.in_file, args.out_dir, args.title, args.nsys_cmd, engine_model_supported
    )


if __name__ == "__main__":
    main()
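# Illustrative shape of a category-mapping *.json file that load_engine_model()
# picks up from this script's directory. The engine, model, regex, and category
# names below are made-up examples, not the mappings shipped with the script.
# Keys are matched against kernel names with re.search(); the first match wins,
# so a trailing catch-all pattern can be used as a fallback category:
#
# {
#     "vllm": {
#         "llama": {
#             ".*gemm.*": "gemm",
#             ".*attention.*": "attention",
#             ".*": "misc"
#         }
#     }
# }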