mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-13 17:20:23 +08:00
[misc] split engine_model into json file for nsys profile tool (#23117)
Signed-off-by: Grace Ho <grho@nvidia.com> Signed-off-by: Grace Ho <146482179+gracehonv@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
parent
fda9537c5e
commit
01a08739e0
@ -36,8 +36,7 @@ profiling and analyzing nsys profile output.
|
|||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
- Make sure you have pandas installed.
|
- Make sure you have pandas installed.
|
||||||
- Make sure nsys is installed, and specify the path to the `nsys` command with
|
- Make sure [nsys](https://developer.nvidia.com/nsight-systems/get-started) is installed, and specify the path to the `nsys` command with `--nsys_cmd` if it is not in your PATH.
|
||||||
`--nsys_cmd` if it is not in your PATH.
|
|
||||||
- For more details on available engines and models, see the help string in
|
- For more details on available engines and models, see the help string in
|
||||||
the script or run:
|
the script or run:
|
||||||
|
|
||||||
@ -135,34 +134,31 @@ time which would cause a difference for the overall category.
|
|||||||
|
|
||||||
## Example 3: add new classification for a new model
|
## Example 3: add new classification for a new model
|
||||||
|
|
||||||
Suppose there's a new model ABC that is available for engine DEF, and say there
|
To create a new engine DEF with model ABC, just add another json file in the same directory as
|
||||||
are 4 kernels to be classified into "gemm" and "attn", where the gemm kernels
|
gputrc2graph.py with the same format as the other json files. The script will automatically pick up all the json files in the same directory as engine/model specifications.
|
||||||
have names with "*H*" or "*I*" in them, and attn kernels have names with "*J*"
|
|
||||||
or "*K*" in them, add a new entry like so:
|
|
||||||
|
|
||||||
```python
|
Then, for this new model, suppose there are 4 kernels to be classified into "gemm" and "attn", where the gemm kernels
|
||||||
engine_model = {
|
have names with "*H*" or "*I*" in them, and attn kernels have names with "*J*"
|
||||||
'DEF': {
|
or "*K*" in them, just add another .json file in the same directory as
|
||||||
'ABC': {
|
gputrc2graph.py with the same format as the other json files, like the following:
|
||||||
'layer_anno': {
|
|
||||||
'Stage': {
|
```json
|
||||||
'.*': 'layer',
|
{
|
||||||
},
|
"DEF": {
|
||||||
'Substage': {
|
"ABC": {
|
||||||
'H|I': 'gemm',
|
"H|I": "gemm",
|
||||||
'J|K': 'attn',
|
"J|K": "attn",
|
||||||
'CUDA mem': 'non-gpu-H_D_memops',
|
"CUDA mem": "non-gpu-H_D_memops",
|
||||||
'.*': 'misc'
|
".*": "misc"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
}
|
|
||||||
'vllm': {...}
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Basically Substage is a dictionary with a list of key/value pairs, where the
|
Each entry in the dictionary consists of:
|
||||||
keys are regex's of the kernel names to be classified, and values are the
|
|
||||||
classification bins which one wishes to compare across engines/models.
|
- key: a regex used to classify the kernels
|
||||||
|
- value: the category to classify the kernels into.
|
||||||
|
|
||||||
The last 2 entries are common for all engine/models, consisting of CUDA memory
|
The last 2 entries are common for all engine/models, consisting of CUDA memory
|
||||||
operations and a 'misc' for anything that's leftover and can't be classified.
|
operations and a 'misc' for anything that's leftover and can't be classified.
|
||||||
@ -173,3 +169,6 @@ like the following:
|
|||||||
```bash
|
```bash
|
||||||
--infile new.nsys-rep,DEF,ABC,<runtime>
|
--infile new.nsys-rep,DEF,ABC,<runtime>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If the engine_DEF.json file already exists, just add the model as a new node in
|
||||||
|
the existing engine file, after the other models.
|
||||||
|
|||||||
@ -15,132 +15,18 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
# helper data class for annotating kernels
|
# helper data class for annotating kernels
|
||||||
class EngineModelData:
|
def load_engine_model():
|
||||||
# engine + model mappings
|
""" returns engine_model built from all json files in the current dir """
|
||||||
engine_model = {
|
import glob
|
||||||
'vllm': {
|
import json
|
||||||
'llama': {
|
engine_model = {}
|
||||||
'layer_anno': {
|
|
||||||
'Stage': {
|
json_files = glob.glob(
|
||||||
'.*': 'layer',
|
os.path.join(os.path.dirname(__file__) or ".", "*.json"))
|
||||||
},
|
for fname in json_files:
|
||||||
'Substage': {
|
with open(fname, encoding="utf-8") as f:
|
||||||
'gemm': 'gemm',
|
engine_model.update(json.load(f))
|
||||||
'fused_moe_kernel|GroupProblemShape|group_gemm_starts':
|
return engine_model
|
||||||
'moe_gemm', #llama4
|
|
||||||
'moe|sigmoid': 'moe', #llama4
|
|
||||||
'CatArrayBatched|prepare_inputs': 'prepare_next',
|
|
||||||
'flash': 'attn',
|
|
||||||
'ncclDevKernel|cross_device_reduce':
|
|
||||||
'nccl_and_custom_ar',
|
|
||||||
'_norm_': 'norm',
|
|
||||||
'act_and_mul_': 'silu',
|
|
||||||
'rotary_embedding_kernel': 'rope',
|
|
||||||
'SoftMax': 'softmax',
|
|
||||||
'elementwise': 'elementwise',
|
|
||||||
'fp8_quant': 'quantize',
|
|
||||||
'reduce_kernel': 'reduce',
|
|
||||||
'triton': 'triton_kernel',
|
|
||||||
'CUDA mem': 'non-gpu-H_D_memops',
|
|
||||||
'.*': 'misc'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
'ds': {
|
|
||||||
'layer_anno': {
|
|
||||||
'Stage': {
|
|
||||||
'.*': 'layer',
|
|
||||||
},
|
|
||||||
'Substage': {
|
|
||||||
'block_fp8|gemm_fp8_blockwise':
|
|
||||||
'block_fp8_gemm',
|
|
||||||
'fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal':
|
|
||||||
'moe_gemm',
|
|
||||||
'gemm|matmul|nvjet':
|
|
||||||
'gemm',
|
|
||||||
'moe|sigmoid|expert':
|
|
||||||
'moe',
|
|
||||||
'_fwd_|FlashAttn|_mla_|_attn_':
|
|
||||||
'attn',
|
|
||||||
'CatArrayBatched':
|
|
||||||
'prepare_next',
|
|
||||||
'ncclDevKernel|cross_device_reduce':
|
|
||||||
'nccl_and_custom_ar',
|
|
||||||
'Norm|_norm_':
|
|
||||||
'norm',
|
|
||||||
'sbtopk':
|
|
||||||
'topk',
|
|
||||||
'act_and_mul_':
|
|
||||||
'activation',
|
|
||||||
'compute_position_kernel':
|
|
||||||
'rope',
|
|
||||||
'elementwise':
|
|
||||||
'elementwise',
|
|
||||||
'fp8_quant|quant_fp8|cvt_fp16_to_fp4':
|
|
||||||
'quantize',
|
|
||||||
'reduce':
|
|
||||||
'reduce',
|
|
||||||
'SoftMax':
|
|
||||||
'softmax',
|
|
||||||
'triton':
|
|
||||||
'triton_kernel',
|
|
||||||
'CUDA mem':
|
|
||||||
'non-gpu-H_D_memops',
|
|
||||||
'.*':
|
|
||||||
'misc'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
'gpt-oss': {
|
|
||||||
'layer_anno': {
|
|
||||||
'Stage': {
|
|
||||||
'.*': 'layer',
|
|
||||||
},
|
|
||||||
'Substage': {
|
|
||||||
'block_fp8|gemm_fp8_blockwise':
|
|
||||||
'block_fp8_gemm',
|
|
||||||
'fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_'
|
|
||||||
# this section is triton_moe_gemm
|
|
||||||
'|matmul_ogs_|_topk_forward|_combined_routing'
|
|
||||||
'|_sum_bitmatrix_rows|_compute_writeback_idx':
|
|
||||||
'moe_gemm',
|
|
||||||
'gemm|matmul|nvjet':
|
|
||||||
'gemm',
|
|
||||||
'moe|sigmoid|expert|splitKreduce':
|
|
||||||
'moe',
|
|
||||||
'_fwd_|FlashAttn|_mla_|_attn_|_flash_|flash::prepare_varlen|fmha':
|
|
||||||
'attn',
|
|
||||||
'CatArrayBatched':
|
|
||||||
'prepare_next',
|
|
||||||
'ncclDevKernel|cross_device_reduce':
|
|
||||||
'nccl_and_custom_ar',
|
|
||||||
'Norm|_norm_':
|
|
||||||
'norm',
|
|
||||||
'sbtopk':
|
|
||||||
'topk',
|
|
||||||
'act_and_mul_':
|
|
||||||
'activation',
|
|
||||||
'compute_position_kernel':
|
|
||||||
'rope',
|
|
||||||
'elementwise':
|
|
||||||
'elementwise',
|
|
||||||
'fp8_quant|quant_fp8|cvt_fp16_to_fp4|quantize':
|
|
||||||
'quantize',
|
|
||||||
'reduce':
|
|
||||||
'reduce',
|
|
||||||
'SoftMax':
|
|
||||||
'softmax',
|
|
||||||
'triton':
|
|
||||||
'triton_kernel',
|
|
||||||
'CUDA mem':
|
|
||||||
'non-gpu-H_D_memops',
|
|
||||||
'.*':
|
|
||||||
'misc'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class GPUTrace2Graph:
|
class GPUTrace2Graph:
|
||||||
@ -148,8 +34,7 @@ class GPUTrace2Graph:
|
|||||||
Parses output of nsys report, generates csv and bar chart output
|
Parses output of nsys report, generates csv and bar chart output
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, nsys_cmd):
|
def __init__(self):
|
||||||
self.nsys_cmd = nsys_cmd
|
|
||||||
import pandas as pd # avoid importing till needed
|
import pandas as pd # avoid importing till needed
|
||||||
self.pd = pd
|
self.pd = pd
|
||||||
self.pd.options.mode.copy_on_write = True
|
self.pd.options.mode.copy_on_write = True
|
||||||
@ -227,7 +112,7 @@ class GPUTrace2Graph:
|
|||||||
title = 'Model_Engine'
|
title = 'Model_Engine'
|
||||||
x = 'Model_Engine'
|
x = 'Model_Engine'
|
||||||
y = 'Elapsed Time (sec)'
|
y = 'Elapsed Time (sec)'
|
||||||
color = 'Substage'
|
color = 'Category'
|
||||||
""" generate kernel mapping table """
|
""" generate kernel mapping table """
|
||||||
# Sort Model_Engine categories by last field after underscore
|
# Sort Model_Engine categories by last field after underscore
|
||||||
df['Model_Engine'] = self.pd.Categorical(
|
df['Model_Engine'] = self.pd.Categorical(
|
||||||
@ -249,14 +134,13 @@ class GPUTrace2Graph:
|
|||||||
Generate data table with columns per Model_Engine into result.html
|
Generate data table with columns per Model_Engine into result.html
|
||||||
"""
|
"""
|
||||||
pivot_df = df.pivot_table(values='Elapsed Time (sec)',
|
pivot_df = df.pivot_table(values='Elapsed Time (sec)',
|
||||||
index='Substage',
|
index='Category',
|
||||||
columns='Model_Engine',
|
columns='Model_Engine',
|
||||||
aggfunc='sum',
|
aggfunc='sum',
|
||||||
observed=False).round(2)
|
observed=False).round(2)
|
||||||
# Add sum row at bottom
|
# Add sum row at bottom
|
||||||
pivot_df.loc['total_elapsed_sec'] = pivot_df.sum()
|
pivot_df.loc['total_elapsed_sec'] = pivot_df.sum()
|
||||||
pivot_df.fillna('').to_html('temp.html')
|
pivot_df.fillna('').to_html('temp.html')
|
||||||
print('got')
|
|
||||||
with (open(f'{output_name}.html', 'a', encoding='utf-8') as
|
with (open(f'{output_name}.html', 'a', encoding='utf-8') as
|
||||||
outfile, open('temp.html', encoding='utf-8') as infile):
|
outfile, open('temp.html', encoding='utf-8') as infile):
|
||||||
outfile.write(infile.read())
|
outfile.write(infile.read())
|
||||||
@ -264,23 +148,22 @@ class GPUTrace2Graph:
|
|||||||
|
|
||||||
print(f'Finished generating: \n'
|
print(f'Finished generating: \n'
|
||||||
f' {output_name}.html for stack bar chart \n'
|
f' {output_name}.html for stack bar chart \n'
|
||||||
f' {output_name}.csv for Kernel-Substage mapping')
|
f' {output_name}.csv for Kernel-Category mapping')
|
||||||
|
|
||||||
def anno_gpu_kernname(self, df, mapping):
|
def anno_gpu_kernname(self, df, mapping):
|
||||||
""" add "stage" and "substage" columns """
|
""" add "Category" column """
|
||||||
|
|
||||||
def anno_gpu_kernname_helper(name, stage):
|
def anno_gpu_kernname_helper(name):
|
||||||
for kern_name, val in mapping['layer_anno'][stage].items():
|
for kern_name, val in mapping.items():
|
||||||
if re.search(kern_name, name):
|
if re.search(kern_name, name):
|
||||||
return val
|
return val
|
||||||
|
|
||||||
for stage in ['Stage', 'Substage']:
|
df['Category'] = df['Name'].apply(anno_gpu_kernname_helper)
|
||||||
df[stage] = df['Name'].apply(anno_gpu_kernname_helper, stage=stage)
|
|
||||||
|
|
||||||
def make_nongpu_row(self, df, nongpu_sec):
|
def make_nongpu_row(self, df, nongpu_sec):
|
||||||
""" this will append non-gpu time entry at end of df """
|
""" this will append non-gpu time entry at end of df """
|
||||||
nongpu_row = self.pd.DataFrame([df.iloc[-1]])
|
nongpu_row = self.pd.DataFrame([df.iloc[-1]])
|
||||||
nongpu_row['Substage'] = nongpu_row['Name'] = 'CPU(non-GPU)'
|
nongpu_row['Category'] = nongpu_row['Name'] = 'CPU(non-GPU)'
|
||||||
nongpu_row['Instances'] = 1
|
nongpu_row['Instances'] = 1
|
||||||
nongpu_row['Elapsed Time (sec)'] = nongpu_sec
|
nongpu_row['Elapsed Time (sec)'] = nongpu_sec
|
||||||
return (nongpu_row)
|
return (nongpu_row)
|
||||||
@ -302,7 +185,7 @@ class GPUTrace2Graph:
|
|||||||
logger.info('generating %s', new_file)
|
logger.info('generating %s', new_file)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def gen_sum_file(self, file):
|
def gen_sum_file(self, file, nsys_cmd):
|
||||||
"""
|
"""
|
||||||
generates sum file from nsys trace with times per kernel and
|
generates sum file from nsys trace with times per kernel and
|
||||||
returns the name of the sum file
|
returns the name of the sum file
|
||||||
@ -318,17 +201,21 @@ class GPUTrace2Graph:
|
|||||||
sum_file = f'{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv'
|
sum_file = f'{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv'
|
||||||
if self.should_gen_file(nsys_stats_file, file):
|
if self.should_gen_file(nsys_stats_file, file):
|
||||||
cmd = [
|
cmd = [
|
||||||
self.nsys_cmd, 'stats', '-r', 'cuda_gpu_trace', file, '-o',
|
nsys_cmd, 'stats', '-r', 'cuda_gpu_trace', file, '-o',
|
||||||
f'{file_dir}/{file_name}'
|
f'{file_dir}/{file_name}'
|
||||||
]
|
]
|
||||||
cmd_str = ' '.join(cmd)
|
cmd_str = ' '.join(cmd)
|
||||||
logger.info('+ %s', cmd_str)
|
logger.info('+ %s', cmd_str)
|
||||||
|
# estimate time based on calibrated 240M/min
|
||||||
|
file_size_mb = os.path.getsize(file) / 1e6
|
||||||
|
logger.info(
|
||||||
|
'nsys stats for %.2f MB file expected to take %.2f min',
|
||||||
|
file_size_mb, file_size_mb / 240)
|
||||||
try:
|
try:
|
||||||
subprocess.run(cmd)
|
subprocess.run(cmd, check=True)
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.error(
|
logger.error("%s failed; Use --nsys_cmd to specify nsys path",
|
||||||
"%s failed, specify --nsys_cmd for correct nsys path",
|
cmd_str)
|
||||||
cmd_str)
|
|
||||||
exit(1)
|
exit(1)
|
||||||
logger.info('generating non-overalapped sum %s', sum_file)
|
logger.info('generating non-overalapped sum %s', sum_file)
|
||||||
self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file)
|
self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file)
|
||||||
@ -336,7 +223,7 @@ class GPUTrace2Graph:
|
|||||||
logger.info('Finished generating %s', sum_file)
|
logger.info('Finished generating %s', sum_file)
|
||||||
return sum_file
|
return sum_file
|
||||||
|
|
||||||
def gen_graph(self, in_file, out_dir, title):
|
def gen_graph(self, in_file, out_dir, title, nsys_cmd, engine_model):
|
||||||
""" generates graph and csv file from in_file into out_dir """
|
""" generates graph and csv file from in_file into out_dir """
|
||||||
# Initialize an empty DataFrame to store combined data
|
# Initialize an empty DataFrame to store combined data
|
||||||
combined_df = self.pd.DataFrame()
|
combined_df = self.pd.DataFrame()
|
||||||
@ -345,17 +232,16 @@ class GPUTrace2Graph:
|
|||||||
file_name = os.path.basename(file)
|
file_name = os.path.basename(file)
|
||||||
if not file_dir:
|
if not file_dir:
|
||||||
file_dir = '.'
|
file_dir = '.'
|
||||||
sum_file = self.gen_sum_file(file)
|
sum_file = self.gen_sum_file(file, nsys_cmd)
|
||||||
# read kernel summary file
|
# read kernel summary file
|
||||||
df = self.pd.read_csv(sum_file)
|
df = self.pd.read_csv(sum_file)
|
||||||
# annotate kernel to their categories
|
# annotate kernel to their categories
|
||||||
assert EngineModelData.engine_model.get(engine)
|
assert engine_model.get(engine), f'engine {engine} unknown'
|
||||||
assert EngineModelData.engine_model[engine].get(model)
|
assert engine_model[engine].get(model), f'model {model} unknown'
|
||||||
# remove nsys-rep from file_name for shorter x-label
|
# remove nsys-rep from file_name for shorter x-label
|
||||||
file_name = file_name.replace('.nsys-rep', '')
|
file_name = file_name.replace('.nsys-rep', '')
|
||||||
df['Model_Engine'] = f'{model}_{engine}_{file_name}_{idx}'
|
df['Model_Engine'] = f'{model}_{engine}_{file_name}_{idx}'
|
||||||
self.anno_gpu_kernname(df,
|
self.anno_gpu_kernname(df, engine_model[engine][model])
|
||||||
EngineModelData.engine_model[engine][model])
|
|
||||||
# patch in non-gpu time
|
# patch in non-gpu time
|
||||||
gpu_sec = round(df['Elapsed Time (sec)'].sum(), 1)
|
gpu_sec = round(df['Elapsed Time (sec)'].sum(), 1)
|
||||||
total_sec = round(float(total_sec), 1)
|
total_sec = round(float(total_sec), 1)
|
||||||
@ -393,12 +279,12 @@ def main():
|
|||||||
"--out_dir results/ --title \"Model=gpt-oss vLLM chart\""),
|
"--out_dir results/ --title \"Model=gpt-oss vLLM chart\""),
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter)
|
formatter_class=argparse.RawDescriptionHelpFormatter)
|
||||||
|
|
||||||
# Build help string showing available engine/model combinations
|
# load supported engine_model
|
||||||
engine_model_help = []
|
engine_model_supported = load_engine_model()
|
||||||
for engine, models in EngineModelData.engine_model.items():
|
# Get a string representation of supported engine/model combinations
|
||||||
model_list = list(models.keys())
|
engine_model_supported_str = ', '.join(
|
||||||
engine_model_help.append(f"{engine}:[{','.join(model_list)}]")
|
f"{engine}:[{', '.join(models.keys())}]"
|
||||||
engine_model_str = ' '.join(engine_model_help)
|
for engine, models in engine_model_supported.items())
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--in_file',
|
'--in_file',
|
||||||
type=parse_tuple,
|
type=parse_tuple,
|
||||||
@ -408,7 +294,7 @@ def main():
|
|||||||
'separated by space. Elapsed_nonprofiled_sec is runtime without '
|
'separated by space. Elapsed_nonprofiled_sec is runtime without '
|
||||||
'profiling used to calculate non-gpu time. Specify 0 to use '
|
'profiling used to calculate non-gpu time. Specify 0 to use '
|
||||||
'elapsed time from nsys-rep but that might inflate non-gpu time. '
|
'elapsed time from nsys-rep but that might inflate non-gpu time. '
|
||||||
f'Available engine:[model] are: {engine_model_str} '
|
f'Available engine:[model] are: {engine_model_supported_str} '
|
||||||
f'Example: --infile d1.nsys-rep,vllm,llama,100 '
|
f'Example: --infile d1.nsys-rep,vllm,llama,100 '
|
||||||
'd2.nsys-rep,vllm,gpt-oss,102'),
|
'd2.nsys-rep,vllm,gpt-oss,102'),
|
||||||
required=True)
|
required=True)
|
||||||
@ -418,8 +304,9 @@ def main():
|
|||||||
help=('nsys cmd, e.g. /usr/bin/nsys, Default: nsys'),
|
help=('nsys cmd, e.g. /usr/bin/nsys, Default: nsys'),
|
||||||
default="nsys")
|
default="nsys")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
gputrace = GPUTrace2Graph(args.nsys_cmd)
|
gputrace = GPUTrace2Graph()
|
||||||
gputrace.gen_graph(args.in_file, args.out_dir, args.title)
|
gputrace.gen_graph(args.in_file, args.out_dir, args.title, args.nsys_cmd,
|
||||||
|
engine_model_supported)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
63
tools/profiler/nsys_profile_tools/vllm_engine_model.json
Normal file
63
tools/profiler/nsys_profile_tools/vllm_engine_model.json
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
{
|
||||||
|
"vllm": {
|
||||||
|
"llama": {
|
||||||
|
"fused_moe_kernel|GroupProblemShape|group_gemm_starts|bmm_|GemmUniversal": "moe_gemm",
|
||||||
|
"gemm|nvjet": "gemm",
|
||||||
|
"moe|sigmoid": "moe",
|
||||||
|
"CatArrayBatched|prepare_inputs": "prepare_next",
|
||||||
|
"ncclDevKernel|cross_device_reduce": "nccl_and_custom_ar",
|
||||||
|
"_norm_|Norm": "norm",
|
||||||
|
"act_and_mul_": "activation",
|
||||||
|
"Rotary": "rope",
|
||||||
|
"SoftMax": "softmax",
|
||||||
|
"flash|fmha": "attn",
|
||||||
|
"elementwise": "elementwise",
|
||||||
|
"fp8_quant|cvt_": "quantize",
|
||||||
|
"reduce_kernel": "reduce",
|
||||||
|
"triton": "triton_kernel",
|
||||||
|
"CUDA mem": "non-gpu-H_D_memops",
|
||||||
|
".*": "misc"
|
||||||
|
},
|
||||||
|
"ds": {
|
||||||
|
"block_fp8|gemm_fp8_blockwise": "block_fp8_gemm",
|
||||||
|
"fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_": "moe_gemm",
|
||||||
|
"gemm|matmul|nvjet": "gemm",
|
||||||
|
"moe|sigmoid|expert": "moe",
|
||||||
|
"CatArrayBatched": "prepare_next",
|
||||||
|
"ncclDevKernel|cross_device_reduce": "nccl_and_custom_ar",
|
||||||
|
"Norm|_norm_": "norm",
|
||||||
|
"sbtopk": "topk",
|
||||||
|
"act_and_mul_": "activation",
|
||||||
|
"compute_position_kernel": "rope",
|
||||||
|
"elementwise": "elementwise",
|
||||||
|
"fp8_quant|quant_fp8|cvt_": "quantize",
|
||||||
|
"reduce": "reduce",
|
||||||
|
"SoftMax": "softmax",
|
||||||
|
"_fwd_|FlashAttn|_mla_|_attn_|fmha": "attn",
|
||||||
|
"triton": "triton_kernel",
|
||||||
|
"topk": "topk",
|
||||||
|
"CUDA mem": "non-gpu-H_D_memops",
|
||||||
|
".*": "misc"
|
||||||
|
},
|
||||||
|
"gpt-oss": {
|
||||||
|
"block_fp8|gemm_fp8_blockwise": "block_fp8_gemm",
|
||||||
|
"fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_|matmul_ogs_|_topk_forward|_combined_routing|_sum_bitmatrix_rows|_compute_writeback_idx": "moe_gemm",
|
||||||
|
"gemm|matmul|nvjet": "gemm",
|
||||||
|
"moe|sigmoid|expert|splitKreduce": "moe",
|
||||||
|
"CatArrayBatched": "prepare_next",
|
||||||
|
"ncclDevKernel|cross_device_reduce": "nccl_and_custom_ar",
|
||||||
|
"Norm|_norm_": "norm",
|
||||||
|
"topk": "topk",
|
||||||
|
"act_and_mul_": "activation",
|
||||||
|
"compute_position_kernel": "rope",
|
||||||
|
"elementwise": "elementwise",
|
||||||
|
"fp8_quant|quant_fp8|cvt_|quantize": "quantize",
|
||||||
|
"reduce": "reduce",
|
||||||
|
"SoftMax": "softmax",
|
||||||
|
"_fwd_|FlashAttn|_mla_|_attn_|_flash_|flash::prepare_varlen|fmha": "attn",
|
||||||
|
"triton": "triton_kernel",
|
||||||
|
"CUDA mem": "non-gpu-H_D_memops",
|
||||||
|
".*": "misc"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user