# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
    This generates gpu kernel analysis output from nsys rep. Will call nsys
    stats  -r cuda_gpu_kern_trace, get non-overlapped gpu cycles, then generate
    csv and html output for analysis
"""
import argparse
import logging
import os

import regex as re

logger = logging.getLogger(__name__)


# helper for loading the kernel-name -> category annotation tables
def load_engine_model(json_dir=None):
    """Build the engine -> model -> kernel mapping from ``*.json`` files.

    Args:
        json_dir: directory to scan for ``*.json`` files. When ``None``
            (the default) the directory containing this script is used,
            falling back to "." if that directory name is empty.

    Returns:
        A dict assembled from the top-level JSON object of every file
        found. Files are processed in sorted filename order so the result
        is deterministic. When several files define the same engine key and
        both values are JSON objects, their entries are merged (later files
        win on conflicting keys) instead of the later file replacing the
        earlier engine entry wholesale.

    Raises:
        ValueError: if a file's top-level JSON value is not an object.
    """
    import glob
    import json

    if json_dir is None:
        json_dir = os.path.dirname(__file__) or "."

    engine_model = {}
    # glob order is filesystem dependent; sort so merge precedence is stable
    for fname in sorted(glob.glob(os.path.join(json_dir, "*.json"))):
        with open(fname, encoding="utf-8") as f:
            loaded = json.load(f)
        if not isinstance(loaded, dict):
            raise ValueError(
                f"{fname}: expected a JSON object at the top level")
        for engine, models in loaded.items():
            existing = engine_model.get(engine)
            if isinstance(existing, dict) and isinstance(models, dict):
                # same engine described by several files: keep all models
                existing.update(models)
            else:
                engine_model[engine] = models
    return engine_model


class GPUTrace2Graph:
    """
        Parses output of nsys report, generates csv and bar chart output.

        pandas is imported lazily in __init__ and kept on the instance as
        ``self.pd``; every method reaches pandas through that attribute.
    """

    def __init__(self):
        import pandas as pd  # avoid importing till needed
        self.pd = pd
        self.pd.options.mode.copy_on_write = True

    # helper functions for generating trace->summary csvs
    def gen_nonoverlapped_sum_from_gputrace(self, in_file, out_file):
        """
            Reads the trace csv ``in_file`` and writes a per-kernel summary
            csv to ``out_file``.

            The summary has one row per kernel 'Name' with columns
            'Elapsed Time (sec)' (non-overlapped time), 'Total Time (sec)'
            (sum of raw durations), 'Instances' and 'Name', sorted by
            elapsed time in descending order.
        """
        logger.info('loading %s', in_file)
        df = self.pd.read_csv(
            in_file,
            usecols=['Start (ns)', 'Duration (ns)', 'Device', 'Strm', 'Name'])
        df['End (ns)'] = df['Start (ns)'] + df['Duration (ns)']
        df = self.sum_non_overlapping_intervals(df)
        # get ready to print table with elapsed times per kernel
        df['Instances'] = 1
        df_sum = df.groupby('Name', as_index=False).agg({
            'Elapsed Time (ns)': 'sum',
            'Duration (ns)': 'sum',
            'Instances': 'size'
        })

        # generate csv
        df_sum['Total Time (sec)'] = df_sum['Duration (ns)'] / 1e9
        df_sum['Elapsed Time (sec)'] = df_sum['Elapsed Time (ns)'] / 1e9
        df_sum = df_sum.sort_values(by='Elapsed Time (sec)', ascending=False)
        df_sum[['Elapsed Time (sec)', 'Total Time (sec)', 'Instances',
                'Name']].to_csv(out_file, index=False)

    def sum_non_overlapping_intervals(self, df):
        """
            Returns a new df sorted by 'Start (ns)' with an added
            'Elapsed Time (ns)' column.

            Rows are swept in start order while tracking the furthest end
            time seen so far. Each row is credited only with the part of
            its [start, end] interval that extends past that point: the
            full duration when it does not overlap, the tail when it
            partially overlaps, and 0 when it is completely covered.

            ``df`` must contain 'Start (ns)', 'Duration (ns)' and
            'End (ns)' columns. An empty ``df`` yields an empty result.
        """
        logger.info("sorting %s trace records by start time", str(df.shape))

        # Sort by start time and reset index
        df = df.sort_values(by='Start (ns)').reset_index(drop=True)

        num_rows = len(df)
        if num_rows == 0:
            # nothing to sweep; still provide the output column
            df['Elapsed Time (ns)'] = df['Duration (ns)']
            return df

        # Work on plain numpy arrays; writing into the DataFrame row by row
        # inside the loop is far slower and gives the same values.
        starts = df['Start (ns)'].to_numpy()
        ends = df['End (ns)'].to_numpy()
        # elapsed starts out as the raw duration; use the dtype of the end
        # times so that "end - current_end" can be stored without truncation
        elapsed = df['Duration (ns)'].to_numpy(dtype=ends.dtype, copy=True)

        # Keep track of current interval end
        current_end = ends[0]
        # report progress roughly every 1%; never 0 so that the modulo below
        # is safe for traces with fewer than 100 rows
        display_units = max(1, num_rows // 100)
        for i in range(1, num_rows):
            if i % display_units == 0:
                print(f'processing trace: {int(i/num_rows * 100)} %',
                      end="\r")
            if starts[i] <= current_end:
                if ends[i] > current_end:
                    # Partial overlap
                    elapsed[i] = ends[i] - current_end
                    current_end = ends[i]
                else:
                    # Complete overlap
                    elapsed[i] = 0
            else:
                # No overlap
                current_end = ends[i]

        df['Elapsed Time (ns)'] = elapsed
        return df

    # functions for generating html files
    def make_html(self, df, output_dir, title):
        """
            Makes html graph and csv from df.

            Writes ``result.csv`` (kernel to category mapping) and
            ``result.html`` (stacked bar chart followed by a
            Category x Model_Engine table of elapsed seconds) into
            ``output_dir``. Does nothing when ``df`` is empty.

            Note: the 'Model_Engine' column of ``df`` is replaced by a
            categorical column as part of generating the output.
        """
        if df.empty:
            return
        import plotly.express as px
        output_name = os.path.join(output_dir, 'result')
        if not title:
            title = 'Model_Engine'
        x = 'Model_Engine'
        y = 'Elapsed Time (sec)'
        color = 'Category'

        # generate kernel mapping table
        # Sort Model_Engine categories by last field after underscore
        df['Model_Engine'] = self.pd.Categorical(
            df['Model_Engine'],
            sorted(df['Model_Engine'].unique(),
                   key=lambda label: label.split('_')[-1]))
        df[['Model_Engine', color, 'Instances', 'Name',
            y]].sort_values(by=color).to_csv(f'{output_name}.csv', index=False)
        graph = px.histogram(df.round(2),
                             x=x,
                             y=y,
                             title=(f'{y} for {title}'),
                             color=color,
                             text_auto=True)
        # wrap x axis labels
        graph.update_xaxes(automargin=True)
        graph.write_html(f'{output_name}.html')

        # Generate data table with columns per Model_Engine into result.html
        pivot_df = df.pivot_table(values='Elapsed Time (sec)',
                                  index='Category',
                                  columns='Model_Engine',
                                  aggfunc='sum',
                                  observed=False).round(2)
        # Add sum row at bottom
        pivot_df.loc['total_elapsed_sec'] = pivot_df.sum()
        # Render the table in memory and append it to the chart; this avoids
        # a fixed-name scratch file in the current working directory.
        table_html = pivot_df.fillna('').to_html()
        with open(f'{output_name}.html', 'a', encoding='utf-8') as outfile:
            outfile.write(table_html)

        print(f'Finished generating: \n'
              f' {output_name}.html for stack bar chart \n'
              f' {output_name}.csv for Kernel-Category mapping')

    def anno_gpu_kernname(self, df, mapping):
        """
            Adds a "Category" column to df in place.

            ``mapping`` maps a regular expression to a category. Each
            kernel 'Name' gets the category of the first pattern (in the
            mapping's iteration order) that is found in it, or None when
            no pattern matches.
        """
        # compile once instead of handing the pattern text to re for
        # every row
        patterns = [(re.compile(kern_name), category)
                    for kern_name, category in mapping.items()]

        def anno_gpu_kernname_helper(name):
            for pattern, category in patterns:
                if pattern.search(name):
                    return category
            return None

        df['Category'] = df['Name'].apply(anno_gpu_kernname_helper)

    def make_nongpu_row(self, df, nongpu_sec):
        """
            Returns a one-row DataFrame describing the non-GPU time.

            The row is a copy of the last row of ``df`` (so the remaining
            columns carry over) with 'Category' and 'Name' set to
            'CPU(non-GPU)', 'Instances' set to 1 and 'Elapsed Time (sec)'
            set to ``nongpu_sec``. ``df`` must have at least one row.
        """
        nongpu_row = self.pd.DataFrame([df.iloc[-1]])
        nongpu_row['Category'] = nongpu_row['Name'] = 'CPU(non-GPU)'
        nongpu_row['Instances'] = 1
        nongpu_row['Elapsed Time (sec)'] = nongpu_sec
        return nongpu_row

    def is_valid_file(self, base_file):
        """
            Raises AssertionError if base_file is non-existent or is empty.
        """
        # raised explicitly (rather than via `assert`) so that the check is
        # not stripped when python runs with -O
        if not (os.path.isfile(base_file) and os.path.getsize(base_file) > 0):
            raise AssertionError(f"{base_file} doesn't exist or is empty")

    def should_gen_file(self, new_file, base_file):
        """
            Figures out if new_file should be generated from base_file.

            Returns False (reuse) only when new_file exists, is non-empty
            and is newer than base_file; returns True otherwise.
            Raises AssertionError when base_file is missing or empty.
        """
        self.is_valid_file(base_file)
        if (os.path.exists(new_file) and os.path.getsize(new_file) > 0
                and os.path.getmtime(new_file) > os.path.getmtime(base_file)):
            logger.info('reusing %s', new_file)
            return False
        logger.info('generating %s', new_file)
        return True

    def gen_sum_file(self, file, nsys_cmd):
        """
            Generates sum file from nsys trace with times per kernel and
            returns the name of the sum file.

            Two files are produced next to ``file``:
            ``<file>_cuda_gpu_trace.csv`` (via ``nsys_cmd stats``) and
            ``<file>_cuda_gpu_kernel_tracesum.csv`` (the summary). Each is
            regenerated only when missing, empty or older than its input.
            The process exits with status 1 if the nsys command fails.
        """
        import subprocess
        import sys
        file_dir = os.path.dirname(file)
        file_name = os.path.basename(file)

        if not file_dir:
            file_dir = '.'
        # Walk through trace and get the total non-overlapped time
        nsys_stats_file = f'{file_dir}/{file_name}_cuda_gpu_trace.csv'
        sum_file = f'{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv'
        stats_regenerated = self.should_gen_file(nsys_stats_file, file)
        if stats_regenerated:
            cmd = [
                nsys_cmd, 'stats', '-r', 'cuda_gpu_trace', file, '-o',
                f'{file_dir}/{file_name}'
            ]
            cmd_str = ' '.join(cmd)
            logger.info('+ %s', cmd_str)
            # estimate time based on calibrated 240M/min
            file_size_mb = os.path.getsize(file) / 1e6
            logger.info(
                'nsys stats for %.2f MB file expected to take %.2f min',
                file_size_mb, file_size_mb / 240)
            try:
                subprocess.run(cmd, check=True)
            except (OSError, subprocess.SubprocessError):
                # OSError: nsys binary missing / not executable;
                # SubprocessError: nsys returned a non-zero exit status
                logger.error("%s failed; Use --nsys_cmd to specify nsys path",
                             cmd_str)
                sys.exit(1)
        # The summary must also be rebuilt when the trace csv was reused but
        # the summary itself is missing or out of date.
        if stats_regenerated or self.should_gen_file(sum_file,
                                                     nsys_stats_file):
            logger.info('generating non-overalapped sum %s', sum_file)
            self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file)
        self.is_valid_file(sum_file)
        logger.info('Finished generating %s', sum_file)
        return sum_file

    def gen_graph(self, in_file, out_dir, title, nsys_cmd, engine_model):
        """
            Generates graph and csv file from in_file into out_dir.

            ``in_file`` is an iterable of (nsys-rep path, engine, model,
            total elapsed seconds) tuples. For each entry the per-kernel
            summary is loaded, kernels are categorized using
            ``engine_model[engine][model]`` and a 'CPU(non-GPU)' row
            holding ``total_sec - gpu_sec`` is appended. If the given total
            is smaller than the GPU time it is reset to the GPU time.
            ``out_dir`` defaults to '.' and is created if needed.

            Raises AssertionError when an engine or model is not present
            in ``engine_model``.
        """
        frames = []
        for idx, (file, engine, model, total_sec) in enumerate(in_file):
            # validate before running the (potentially slow) nsys step;
            # raised explicitly so the checks survive python -O
            if not engine_model.get(engine):
                raise AssertionError(f'engine {engine} unknown')
            if not engine_model[engine].get(model):
                raise AssertionError(f'model {model} unknown')
            sum_file = self.gen_sum_file(file, nsys_cmd)
            # read kernel summary file
            df = self.pd.read_csv(sum_file)
            # remove nsys-rep from file_name for shorter x-label
            file_name = os.path.basename(file).replace('.nsys-rep', '')
            df['Model_Engine'] = f'{model}_{engine}_{file_name}_{idx}'
            # annotate kernel to their categories
            self.anno_gpu_kernname(df, engine_model[engine][model])
            # patch in non-gpu time
            gpu_sec = round(df['Elapsed Time (sec)'].sum(), 1)
            total_sec = round(float(total_sec), 1)
            if total_sec < gpu_sec:
                logger.warning(
                    "Elapsed sec %.2f < GPU sec %.2f resetting Elapsed sec ",
                    total_sec,
                    gpu_sec,
                )
                total_sec = gpu_sec
            nongpu_row = self.make_nongpu_row(df, total_sec - gpu_sec)
            frames.append(self.pd.concat([df, nongpu_row], ignore_index=True))
        # combine once at the end instead of growing a DataFrame per input
        if frames:
            combined_df = self.pd.concat(frames, ignore_index=True)
        else:
            combined_df = self.pd.DataFrame()
        if out_dir is None:
            out_dir = '.'
        else:
            os.makedirs(out_dir, exist_ok=True)
        # generate html file
        self.make_html(combined_df, out_dir, title)


def parse_tuple(s):
    """Split the comma-separated string ``s`` into a tuple of its fields."""
    fields = s.split(',')
    return tuple(fields)


def main():
    """Command-line entry point.

    Configures logging, loads the supported engine/model table, parses the
    command line and generates the result csv/html for the given nsys
    reports.
    """
    logging.basicConfig(format=('%(asctime)s - %(levelname)s - %(message)s'),
                        level=logging.INFO)
    parser = argparse.ArgumentParser(
        description=(
            'Process nsys rep and generate kernel non-overlapped cycles. \n'
            'Example:\n'
            "gputrc2graph.py --in_file d1.nsys-rep,vllm,llama,100 \n"
            "d2.nsys-rep,vllm,gpt-oss,102 "
            "--out_dir results/ --title \"Model=gpt-oss vLLM chart\""),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # load supported engine_model
    engine_model_supported = load_engine_model()
    # Get a string representation of supported engine/model combinations
    engine_model_supported_str = ', '.join(
        f"{engine}:[{', '.join(models.keys())}]"
        for engine, models in engine_model_supported.items())
    parser.add_argument(
        '--in_file',
        type=parse_tuple,
        nargs='+',
        help=(
            'list of (nsys-rep, engine, model, elapsed_nonprofiled_sec) '
            'separated by space. Elapsed_nonprofiled_sec is runtime without '
            'profiling used to calculate non-gpu time. Specify 0 to use '
            'elapsed time from nsys-rep but that might inflate non-gpu time. '
            f'Available engine:[model] are: {engine_model_supported_str} '
            # the flag is spelled --in_file; the example must match it
            'Example: --in_file d1.nsys-rep,vllm,llama,100 '
            'd2.nsys-rep,vllm,gpt-oss,102'),
        required=True)
    parser.add_argument('--out_dir', help=('output dir for result.csv/html'))
    parser.add_argument('--title', help=('title for html chart'))
    parser.add_argument('--nsys_cmd',
                        help=('nsys cmd, e.g. /usr/bin/nsys, Default: nsys'),
                        default="nsys")
    args = parser.parse_args()
    gputrace = GPUTrace2Graph()
    gputrace.gen_graph(args.in_file, args.out_dir, args.title, args.nsys_cmd,
                       engine_model_supported)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()