# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This generates gpu kernel analysis output from nsys rep.
Will call nsys stats -r cuda_gpu_kern_trace, get non-overlapped gpu cycles,
then generate csv and html output for analysis
"""
import argparse
import logging
import os
import sys

# `regex` is the project-preferred engine, but every pattern in this file is
# stdlib-compatible, so fall back to `re` when `regex` is not installed.
try:
    import regex as re
except ImportError:
    import re

logger = logging.getLogger(__name__)


# helper data class for annotating kernels
class EngineModelData:
    """Static engine/model -> kernel-name-regex -> category mappings.

    For each (engine, model) pair, ``layer_anno`` maps a column name
    ('Stage' / 'Substage') to an ordered dict of ``regex -> label``.
    The FIRST regex that matches a kernel name wins, and every table ends
    with a ``'.*'`` catch-all so each kernel always gets a label.
    """

    # engine + model mappings
    engine_model = {
        'vllm': {
            'llama': {
                'layer_anno': {
                    'Stage': {
                        '.*': 'layer',
                    },
                    'Substage': {
                        'gemm': 'gemm',
                        'fused_moe_kernel|GroupProblemShape|group_gemm_starts':
                        'moe_gemm',  # llama4
                        'moe|sigmoid': 'moe',  # llama4
                        'CatArrayBatched|prepare_inputs': 'prepare_next',
                        'flash': 'attn',
                        'ncclDevKernel|cross_device_reduce':
                        'nccl_and_custom_ar',
                        '_norm_': 'norm',
                        'act_and_mul_': 'silu',
                        'rotary_embedding_kernel': 'rope',
                        'SoftMax': 'softmax',
                        'elementwise': 'elementwise',
                        'fp8_quant': 'quantize',
                        'reduce_kernel': 'reduce',
                        'triton': 'triton_kernel',
                        'CUDA mem': 'non-gpu-H_D_memops',
                        '.*': 'misc'
                    }
                }
            },
            'ds': {
                'layer_anno': {
                    'Stage': {
                        '.*': 'layer',
                    },
                    'Substage': {
                        'block_fp8|gemm_fp8_blockwise': 'block_fp8_gemm',
                        'fused_moe_kernel|_group_gemm|GroupProblemShape|'
                        'GemmUniversal': 'moe_gemm',
                        'gemm|matmul|nvjet': 'gemm',
                        'moe|sigmoid|expert': 'moe',
                        '_fwd_|FlashAttn|_mla_|_attn_': 'attn',
                        'CatArrayBatched': 'prepare_next',
                        'ncclDevKernel|cross_device_reduce':
                        'nccl_and_custom_ar',
                        'Norm|_norm_': 'norm',
                        'sbtopk': 'topk',
                        'act_and_mul_': 'activation',
                        'compute_position_kernel': 'rope',
                        'elementwise': 'elementwise',
                        'fp8_quant|quant_fp8|cvt_fp16_to_fp4': 'quantize',
                        'reduce': 'reduce',
                        'SoftMax': 'softmax',
                        'triton': 'triton_kernel',
                        'CUDA mem': 'non-gpu-H_D_memops',
                        '.*': 'misc'
                    }
                }
            },
            'gpt-oss': {
                'layer_anno': {
                    'Stage': {
                        '.*': 'layer',
                    },
                    'Substage': {
                        'block_fp8|gemm_fp8_blockwise': 'block_fp8_gemm',
                        'fused_moe_kernel|_group_gemm|GroupProblemShape|'
                        'GemmUniversal|bmm_'
                        # this section is triton_moe_gemm
                        '|matmul_ogs_|_topk_forward|_combined_routing'
                        '|_sum_bitmatrix_rows|_compute_writeback_idx':
                        'moe_gemm',
                        'gemm|matmul|nvjet': 'gemm',
                        'moe|sigmoid|expert|splitKreduce': 'moe',
                        '_fwd_|FlashAttn|_mla_|_attn_|_flash_|'
                        'flash::prepare_varlen|fmha': 'attn',
                        'CatArrayBatched': 'prepare_next',
                        'ncclDevKernel|cross_device_reduce':
                        'nccl_and_custom_ar',
                        'Norm|_norm_': 'norm',
                        'sbtopk': 'topk',
                        'act_and_mul_': 'activation',
                        'compute_position_kernel': 'rope',
                        'elementwise': 'elementwise',
                        'fp8_quant|quant_fp8|cvt_fp16_to_fp4|quantize':
                        'quantize',
                        'reduce': 'reduce',
                        'SoftMax': 'softmax',
                        'triton': 'triton_kernel',
                        'CUDA mem': 'non-gpu-H_D_memops',
                        '.*': 'misc'
                    }
                }
            }
        },
    }


class GPUTrace2Graph:
    """
    Parses output of nsys report, generates csv and bar chart output
    """

    def __init__(self, nsys_cmd):
        # path/name of the nsys binary used to produce trace csvs
        self.nsys_cmd = nsys_cmd
        import pandas as pd  # avoid importing till needed
        self.pd = pd
        self.pd.options.mode.copy_on_write = True

    # helper functions for generating trace->summary csvs
    def gen_nonoverlapped_sum_from_gputrace(self, in_file, out_file):
        """Read an nsys cuda_gpu_trace csv and write a per-kernel summary.

        The summary csv has one row per kernel name with its total
        (wall-clock, non-overlapped) elapsed time, total duration, and
        instance count, sorted by elapsed time descending.
        """
        logger.info('loading %s', in_file)
        df = self.pd.read_csv(
            in_file,
            usecols=['Start (ns)', 'Duration (ns)', 'Device', 'Strm', 'Name'])
        df['End (ns)'] = df['Start (ns)'] + df['Duration (ns)']
        df = self.sum_non_overlapping_intervals(df)
        # get ready to print table with elapsed times per kernel
        df['Instances'] = 1
        df_sum = df.groupby('Name', as_index=False).agg({
            'Elapsed Time (ns)': 'sum',
            'Duration (ns)': 'sum',
            'Instances': 'size'
        })
        # generate csv
        df_sum['Total Time (sec)'] = df_sum['Duration (ns)'] / 1e9
        df_sum['Elapsed Time (sec)'] = df_sum['Elapsed Time (ns)'] / 1e9
        df_sum = df_sum.sort_values(by='Elapsed Time (sec)', ascending=False)
        df_sum[['Elapsed Time (sec)', 'Total Time (sec)', 'Instances',
                'Name']].to_csv(out_file, index=False)

    def sum_non_overlapping_intervals(self, df):
        """Return a new start-time-sorted df with an 'Elapsed Time (ns)'
        column holding each interval's contribution to wall-clock time
        (i.e. the part of its duration not already covered by an earlier
        interval). Assumes a single timeline; overlapping GPU work is only
        counted once.
        """
        logger.info("sorting %s trace records by start time", str(df.shape))
        # Sort by start time and reset index
        df = df.sort_values(by='Start (ns)').reset_index(drop=True)
        # Initialize elapsed time as duration
        df['Elapsed Time (ns)'] = df['Duration (ns)']
        if df.empty:
            # nothing to merge; avoid indexing ends[0] below
            return df
        # Get numpy arrays for faster operations; write results into a
        # plain array instead of per-row df.iloc (which is O(n) per write)
        starts = df['Start (ns)'].values
        ends = df['End (ns)'].values
        elapsed = df['Duration (ns)'].values.copy()
        # Keep track of current interval end
        current_end = ends[0]
        # progress display step; max(1, ...) avoids i % 0 on small traces
        display_units = max(1, len(df) // 100)
        # Update current_end for overlapping intervals
        for i in range(1, len(df)):
            if i % display_units == 0:
                print(f'processing trace: {int(i/len(df) * 100)} %', end="\r")
            if starts[i] <= current_end:
                if ends[i] > current_end:
                    # Partial overlap: only the tail past current_end counts
                    elapsed[i] = ends[i] - current_end
                    current_end = ends[i]
                else:
                    # Complete overlap: contributes no wall-clock time
                    elapsed[i] = 0
            else:
                # No overlap
                current_end = ends[i]
        df['Elapsed Time (ns)'] = elapsed
        return df

    # functions for generating html files
    def make_html(self, df, output_dir, title):
        """Write result.csv, a stacked bar chart and a pivot table into
        ``output_dir``/result.{csv,html}. No-op on an empty df.
        """
        import plotly.express as px
        if df.empty:
            return
        output_name = output_dir + '/result'
        if not title:
            title = 'Model_Engine'
        x = 'Model_Engine'
        y = 'Elapsed Time (sec)'
        color = 'Substage'
        """ generate kernel mapping table """
        # Sort Model_Engine categories by last field after underscore
        df['Model_Engine'] = self.pd.Categorical(
            df['Model_Engine'],
            sorted(df['Model_Engine'].unique(),
                   key=lambda me: me.split('_')[-1]))
        df[['Model_Engine', color, 'Instances', 'Name',
            y]].sort_values(by=color).to_csv(f'{output_name}.csv',
                                             index=False)
        graph = px.histogram(df.round(2),
                             x=x,
                             y=y,
                             title=(f'{y} for {title}'),
                             color=color,
                             text_auto=True)
        # wrap x axis labels
        graph.update_xaxes(automargin=True)
        graph.write_html(f'{output_name}.html')
        """
        Generate data table with columns per Model_Engine into result.html
        """
        pivot_df = df.pivot_table(values='Elapsed Time (sec)',
                                  index='Substage',
                                  columns='Model_Engine',
                                  aggfunc='sum',
                                  observed=False).round(2)
        # Add sum row at bottom
        pivot_df.loc['total_elapsed_sec'] = pivot_df.sum()
        # append the pivot table below the chart; to_html() returns a
        # string, so no temp file is needed
        with open(f'{output_name}.html', 'a', encoding='utf-8') as outfile:
            outfile.write(pivot_df.fillna('').to_html())
        print(f'Finished generating: \n'
              f'  {output_name}.html for stack bar chart \n'
              f'  {output_name}.csv for Kernel-Substage mapping')

    def anno_gpu_kernname(self, df, mapping):
        """ add "stage" and "substage" columns """

        def anno_gpu_kernname_helper(name, stage):
            # first matching regex wins; the '.*' catch-all guarantees
            # a label for every kernel name
            for kern_name, val in mapping['layer_anno'][stage].items():
                if re.search(kern_name, name):
                    return val

        for stage in ['Stage', 'Substage']:
            df[stage] = df['Name'].apply(anno_gpu_kernname_helper,
                                         stage=stage)

    def make_nongpu_row(self, df, nongpu_sec):
        """Return a one-row df representing CPU (non-GPU) time, cloned from
        the last row of ``df`` so all other columns stay schema-compatible.
        """
        nongpu_row = self.pd.DataFrame([df.iloc[-1]])
        nongpu_row['Substage'] = nongpu_row['Name'] = 'CPU(non-GPU)'
        nongpu_row['Instances'] = 1
        nongpu_row['Elapsed Time (sec)'] = nongpu_sec
        return nongpu_row

    def is_valid_file(self, base_file):
        """Raise if base_file is non-existent or empty (explicit raise
        instead of assert so it still fires under ``python -O``)."""
        if not (os.path.isfile(base_file)
                and os.path.getsize(base_file) > 0):
            raise FileNotFoundError(
                f"{base_file} doesn't exist or is empty")

    def should_gen_file(self, new_file, base_file):
        """Return True when new_file is missing or older than base_file."""
        self.is_valid_file(base_file)
        if (os.path.exists(new_file) and
                os.path.getmtime(new_file) > os.path.getmtime(base_file)):
            logger.info('reusing %s', new_file)
            return False
        logger.info('generating %s', new_file)
        return True

    def gen_sum_file(self, file):
        """Generate (or reuse) the per-kernel summary csv for one nsys-rep
        and return its path. Runs ``nsys stats -r cuda_gpu_trace`` when the
        trace csv is missing or stale.
        """
        import subprocess
        file_dir = os.path.dirname(file)
        file_name = os.path.basename(file)
        if not file_dir:
            file_dir = '.'
        # Walk through trace and get the total non-overlapped time
        nsys_stats_file = f'{file_dir}/{file_name}_cuda_gpu_trace.csv'
        sum_file = f'{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv'
        if self.should_gen_file(nsys_stats_file, file):
            cmd = [
                self.nsys_cmd, 'stats', '-r', 'cuda_gpu_trace', file, '-o',
                f'{file_dir}/{file_name}'
            ]
            cmd_str = ' '.join(cmd)
            logger.info('+ %s', cmd_str)
            try:
                # check=True so a failing nsys is caught here rather than
                # surfacing later as a missing/empty csv
                subprocess.run(cmd, check=True)
            except (OSError, subprocess.CalledProcessError):
                logger.error(
                    "%s failed, specify --nsys_cmd for correct nsys path",
                    cmd_str)
                sys.exit(1)
            logger.info('generating non-overlapped sum %s', sum_file)
            self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file,
                                                     sum_file)
        self.is_valid_file(sum_file)
        logger.info('Finished generating %s', sum_file)
        return sum_file

    def gen_graph(self, in_file, out_dir, title):
        """Generate graph and csv files from ``in_file`` into ``out_dir``.

        ``in_file`` is a list of (nsys-rep, engine, model,
        elapsed_nonprofiled_sec) tuples; each rep becomes one bar in the
        chart, with a synthetic CPU(non-GPU) row patched in.
        """
        # Initialize an empty DataFrame to store combined data
        combined_df = self.pd.DataFrame()
        for idx, (file, engine, model, total_sec) in enumerate(in_file):
            file_dir = os.path.dirname(file)
            file_name = os.path.basename(file)
            if not file_dir:
                file_dir = '.'
            sum_file = self.gen_sum_file(file)
            # read kernel summary file
            df = self.pd.read_csv(sum_file)
            # validate kernel-annotation mapping exists (explicit raise
            # instead of assert so it still fires under python -O)
            if not EngineModelData.engine_model.get(engine, {}).get(model):
                raise ValueError(
                    f'no kernel mapping for engine={engine} model={model}')
            # remove nsys-rep from file_name for shorter x-label
            file_name = file_name.replace('.nsys-rep', '')
            df['Model_Engine'] = f'{model}_{engine}_{file_name}_{idx}'
            self.anno_gpu_kernname(df,
                                   EngineModelData.engine_model[engine][model])
            # patch in non-gpu time
            gpu_sec = round(df['Elapsed Time (sec)'].sum(), 1)
            total_sec = round(float(total_sec), 1)
            if total_sec < gpu_sec:
                logger.warning(
                    "Elapsed sec %.2f < GPU sec %.2f resetting Elapsed sec ",
                    total_sec,
                    gpu_sec,
                )
                total_sec = gpu_sec
            nongpu_row = self.make_nongpu_row(df, total_sec - gpu_sec)
            df = self.pd.concat([df, nongpu_row], ignore_index=True)
            combined_df = self.pd.concat([combined_df, df],
                                         ignore_index=True)
        if out_dir is None:
            out_dir = '.'
        else:
            os.makedirs(out_dir, exist_ok=True)
        # generate html file
        self.make_html(combined_df, out_dir, title)


def parse_tuple(s):
    """argparse type: split a comma-separated argument into a tuple."""
    return tuple(s.split(','))


def main():
    logging.basicConfig(format=('%(asctime)s - %(levelname)s - %(message)s'),
                        level=logging.INFO)
    parser = argparse.ArgumentParser(
        description=(
            'Process nsys rep and generate kernel non-overlapped cycles. \n'
            'Example:\n'
            "gputrc2graph.py --in_file d1.nsys-rep,vllm,llama,100 \n"
            "d2.nsys-rep,vllm,gpt-oss,102 "
            "--out_dir results/ --title \"Model=gpt-oss vLLM chart\""),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # Build help string showing available engine/model combinations
    engine_model_help = []
    for engine, models in EngineModelData.engine_model.items():
        model_list = list(models.keys())
        engine_model_help.append(f"{engine}:[{','.join(model_list)}]")
    engine_model_str = ' '.join(engine_model_help)
    parser.add_argument(
        '--in_file',
        type=parse_tuple,
        nargs='+',
        help=(
            'list of (nsys-rep, engine, model, elapsed_nonprofiled_sec) '
            'separated by space. Elapsed_nonprofiled_sec is runtime without '
            'profiling used to calculate non-gpu time. Specify 0 to use '
            'elapsed time from nsys-rep but that might inflate non-gpu time. '
            f'Available engine:[model] are: {engine_model_str} '
            f'Example: --in_file d1.nsys-rep,vllm,llama,100 '
            'd2.nsys-rep,vllm,gpt-oss,102'),
        required=True)
    parser.add_argument('--out_dir', help=('output dir for result.csv/html'))
    parser.add_argument('--title', help=('title for html chart'))
    parser.add_argument('--nsys_cmd',
                        help=('nsys cmd, e.g. /usr/bin/nsys, Default: nsys'),
                        default="nsys")
    args = parser.parse_args()
    gputrace = GPUTrace2Graph(args.nsys_cmd)
    gputrace.gen_graph(args.in_file, args.out_dir, args.title)


if __name__ == '__main__':
    main()