mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-07 23:27:06 +08:00
Signed-off-by: Nick Hill <nhill@redhat.com> Signed-off-by: Lucas Kabela <lucaskabela@meta.com> Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Signed-off-by: Andrew Sansom <andrew@protopia.ai> Signed-off-by: Boyuan Feng <boyuan@meta.com> Signed-off-by: Boyuan Feng <fby.1994@gmail.com> Signed-off-by: boyuanfeng <boyuan@meta.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: JartX <sagformas@epdcenter.es> Signed-off-by: Chendi Xue <Chendi.Xue@intel.com> Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> Signed-off-by: Manoel Marques <manoel.marques@ibm.com> Signed-off-by: Manoel Marques <manoelmrqs@gmail.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: pengdrumli <pengdrumli@tencent.com> Signed-off-by: windsonsea <haifeng.yao@daocloud.io> Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: Huamin Li <3ericli@gmail.com> Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com> Signed-off-by: Rahul Tuli <rtuli@redhat.com> Signed-off-by: Yang <lymailforjob@gmail.com> Signed-off-by: Debolina Roy <debroy@redhat.com> Signed-off-by: David Chen <530634352@qq.com> Signed-off-by: wangzi <3220100013@zju.edu.cn> Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com> Signed-off-by: NickLucche <nlucches@redhat.com> Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com> Signed-off-by: Sara Kokkila Schumacher <saraks@ibm.com> Signed-off-by: Csrayz <jover@cmbchina.com> Signed-off-by: ivyilike <pww123@cmbchina.com> Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com> Signed-off-by: Bowen Wang <abmfy@icloud.com> Signed-off-by: qqma <qqma@amazon.com> Signed-off-by: ElizaWszola 
<ewszola@redhat.com> Signed-off-by: Lu Fang <fanglu@fb.com> Signed-off-by: Zhuohan Li <zhuohan123@gmail.com> Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: luka <lgovedic@redhat.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Signed-off-by: Or Ozeri <oro@il.ibm.com> Signed-off-by: Johnny Yang <johnnyyang@google.com> Signed-off-by: Alec Solder <alecs@fb.com> Signed-off-by: Alec S <10566873+alecsolder@users.noreply.github.com> Signed-off-by: Russell Bryant <rbryant@redhat.com> Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> Signed-off-by: Alexander Matveev <amatveev@redhat.com> Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com> Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Signed-off-by: Ming Yang <minos.future@gmail.com> Signed-off-by: Zhikaiiii <1658973216@qq.com> Signed-off-by: Andreas Hartel <andreas.hartel@aleph-alpha.com> Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> Signed-off-by: wuxibin <wuxibin@bytedance.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Signed-off-by: Peter Pan <Peter.Pan@daocloud.io> Signed-off-by: Peter Pan <peter.pan@daocloud.io> Signed-off-by: Nicolò Lucchesi<nicolo.lucchesi@gmail.com> Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Signed-off-by: Sage Moore <sage@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com> Signed-off-by: Bill Nell <bnell@redhat.com> Signed-off-by: Shreeasish Kumar <shreeasish@rivosinc.com> Signed-off-by: Weida Hong <wdhongtw@google.com> Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> 
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com> Signed-off-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> Signed-off-by: Amir Samani <asamani@nvidia.com> Signed-off-by: ElizaWszola <elizaw.9289@gmail.com> Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Signed-off-by: ilmarkov <markovilya197@gmail.com> Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com> Signed-off-by: rouchenzi <ruochenwen@gmail.com> Signed-off-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com> Signed-off-by: Andrew Xia <axia@meta.com> Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com> Signed-off-by: Corey Lowman <clowman1993@gmail.com> Signed-off-by: jpvillam <jpvillam@amd.com> Signed-off-by: dougbtv <dosmith@redhat.com> Signed-off-by: Chenxi Yang <cxyang@fb.com> Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Signed-off-by: ahao-anyscale <ahao@anyscale.com> Signed-off-by: Yan Lu <luyan@nvidia.com> Signed-off-by: baxingpiaochong <771405853@qq.com> Signed-off-by: Kyle Sayers <kylesayrs@gmail.com> Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com> Signed-off-by: Yong Hoon Shin <yhshin@meta.com> Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai> Signed-off-by: Benjamin Chislett <bchislett@nvidia.com> Signed-off-by: Ben Browning <bbrownin@redhat.com> Signed-off-by: Chengji Yao <chengjiyao@google.com> Signed-off-by: jiang1.li <jiang1.li@intel.com> Signed-off-by: Jackmin801 <ongjackm@gmail.com> Signed-off-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com> Signed-off-by: taohui <taohui3@gmail.com> Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io> Signed-off-by: Shu Wang <shuw@nvidia.com> Signed-off-by: Shu Wang. 
<shuw@nvidia.com> Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Signed-off-by: Duncan Moss <djm.moss@gmail.com> Signed-off-by: Shiyan Deng <dsy842974287@meta.com> Signed-off-by: Wei Wei <wwei6@meta.com> Signed-off-by: Saman Keon <samanamp@outlook.com> Signed-off-by: yangxurui <yangxurui@meituan.com> Signed-off-by: nicole-lihui <nicole.li@daocloud.io> Signed-off-by: courage17340 <courage17340@163.com> Signed-off-by: Jacob Kahn <jacobkahn1@gmail.com> Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com> Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai> Signed-off-by: zxw <1020938856@qq.com> Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Signed-off-by: chenlang <chen.lang5@zte.com.cn> Signed-off-by: Jonas Kuebler <kuebj@amazon.com> Signed-off-by: AlonKejzman <alonkeizman@gmail.com> Signed-off-by: Tao Hui <taohui3@gmail.com> Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com> Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Signed-off-by: Aleksandr Malyshev <maleksan@amd.com> Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com> Signed-off-by: Eugene Khvedchenya <ekhvedchenya@gmail.com> Signed-off-by: yiting.jiang <yiting.jiang@daocloud.io> Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com> Signed-off-by: Iceber Gu <caiwei95@hotmail.com> Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com> Signed-off-by: Icey <1790571317@qq.com> Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com> Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: Nick Hill <nhill@redhat.com> Co-authored-by: Lucas Kabela <lucasakabela@gmail.com> Co-authored-by: Maximilien de Bayser <mbayser@br.ibm.com> Co-authored-by: Andrew Sansom <andrew@protopia.ai> Co-authored-by: Boyuan Feng <boyuan@meta.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> 
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: JartX <sagformas@epdcenter.es> Co-authored-by: Chendi.Xue <chendi.xue@intel.com> Co-authored-by: Chauncey <chaunceyjiang@gmail.com> Co-authored-by: xin.li <xin.li@daocloud.io> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Wenlong Wang <wangwenlong2755@gmail.com> Co-authored-by: Manoel Marques <manoelmrqs@gmail.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: lirong <56789630+lirong-lirong@users.noreply.github.com> Co-authored-by: Michael Yao <haifeng.yao@daocloud.io> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: Huamin Li <3ericli@gmail.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com> Co-authored-by: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com> Co-authored-by: Rahul Tuli <rtuli@redhat.com> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Yang Liu <127183760+KKSK-DON@users.noreply.github.com> Co-authored-by: Deboleina <debroy@redhat.com> Co-authored-by: yinz-aizip <yinz@aizip.ai> Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Co-authored-by: wangzi <3220100013@zju.edu.cn> Co-authored-by: Eldar Kurtić <8884008+eldarkurtic@users.noreply.github.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com> Co-authored-by: Yizhou <136800916+yiz-liu@users.noreply.github.com> Co-authored-by: Sara-KS <50249410+Sara-KS@users.noreply.github.com> Co-authored-by: Csrayz <jover@cmbchina.com> Co-authored-by: ivyilike <pww123@cmbchina.com> Co-authored-by: Burkhard Ringlein <ngl@zurich.ibm.com> Co-authored-by: Bowen Wang <abmfy@icloud.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Daisy-Ma-coder 
<daisy.ma.0117@gmail.com> Co-authored-by: qqma <qqma@amazon.com> Co-authored-by: ElizaWszola <ewszola@redhat.com> Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> Co-authored-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Or Ozeri <oro@il.ibm.com> Co-authored-by: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Co-authored-by: Chengji Yao <chengjiyao@google.com> Co-authored-by: Alec S <10566873+alecsolder@users.noreply.github.com> Co-authored-by: Alec Solder <alecs@fb.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Matthew Bonanni <mbonanni@redhat.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Chris Bamford <chrisbam4d@gmail.com> Co-authored-by: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com> Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com> Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com> Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Co-authored-by: Ming Yang <yming@meta.com> Co-authored-by: Zhikaiiii <55917203+Zhikaiiii@users.noreply.github.com> Co-authored-by: Andreas Hartel <andreas@hartel.me> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com> Co-authored-by: Joel <wuxibin89@163.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Mark McLoughlin <markmc@redhat.com> Co-authored-by: Peter Pan <peter.pan@daocloud.io> Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com> Co-authored-by: Fanli Lin <fanli.lin@intel.com> Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Lucas Wilkinson 
<LucasWilkinson@users.noreply.github.com> Co-authored-by: Sage Moore <sage@neuralmagic.com> Co-authored-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: bnellnm <49004751+bnellnm@users.noreply.github.com> Co-authored-by: rivos-shreeasish <shreeasish@rivosinc.com> Co-authored-by: Chih-Chieh Yang <chih.chieh.yang@ibm.com> Co-authored-by: Weida Hong <wdhongtw@gmail.com> Co-authored-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Co-authored-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> Co-authored-by: Amir Samani <samani@ualberta.ca> Co-authored-by: Luka Govedič <lgovedic@redhat.com> Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Co-authored-by: Ilya Markov <markovilya197@gmail.com> Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com> Co-authored-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com> Co-authored-by: Andrew Xia <axia@meta.com> Co-authored-by: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com> Co-authored-by: Corey Lowman <clowman1993@gmail.com> Co-authored-by: Juan Villamizar <100237675+jpvillam-amd@users.noreply.github.com> Co-authored-by: jpvillam <jpvillam@amd.com> Co-authored-by: Doug Smith <dosmith@redhat.com> Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu> Co-authored-by: Chenxi Yang <cxyang@fb.com> Co-authored-by: ahao-anyscale <ahao@anyscale.com> Co-authored-by: 0xNullPath <luyanfcp@foxmail.com> Co-authored-by: baxingpiaochong <771405853@qq.com> Co-authored-by: Benjamin Chislett <bchislett@nvidia.com> Co-authored-by: Kyle Sayers <kylesayrs@gmail.com> Co-authored-by: Nikhil Gupta <nikhil.gupta2@arm.com> Co-authored-by: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Co-authored-by: lhsjohn <huashuoli@tencent.com> Co-authored-by: Ben Browning <bbrownin@redhat.com> Co-authored-by: Li, Jiang <jiang1.li@intel.com> Co-authored-by: Jackmin801 
<56836461+Jackmin801@users.noreply.github.com> Co-authored-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com> Co-authored-by: Tao Hui <taohui3@gmail.com> Co-authored-by: rongfu.leng <rongfu.leng@daocloud.io> Co-authored-by: Shu Wang <shuw@nvidia.com> Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Duncan Moss <djm.moss@gmail.com> Co-authored-by: Shiyan Deng <dsy842974287@meta.com> Co-authored-by: Wei Wei <wwei6@meta.com> Co-authored-by: Saman A. Pour <samanamp@outlook.com> Co-authored-by: XuruiYang <530534756@qq.com> Co-authored-by: yangxurui <yangxurui@meituan.com> Co-authored-by: Nicole LiHui 🥜 <nicolelihui@outlook.com> Co-authored-by: courage17340 <courage17340@users.noreply.github.com> Co-authored-by: Jacob Kahn <jacobkahn1@gmail.com> Co-authored-by: Nicole LiHui 🥜 <nicole.li@daocloud.io> Co-authored-by: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com> Co-authored-by: yyzxw <34639446+yyzxw@users.noreply.github.com> Co-authored-by: wang.yuqi <noooop@126.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: chenlang <chen.lang5@zte.com.cn> Co-authored-by: chenlang <10346245@zte.com.cn> Co-authored-by: AlonKejzman <alonkeizman@gmail.com> Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com> Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Co-authored-by: Aleksandr Malyshev <maleksan@amd.com> Co-authored-by: Doug Lehr <douglehr@amd.com> Co-authored-by: Eugene Khvedchenya <ekhvedchenya@gmail.com> Co-authored-by: yitingdc <59356937+yitingdc@users.noreply.github.com> Co-authored-by: xaguilar-amd <xavier.aguilarfruto@amd.com> Co-authored-by: Iceber Gu <caiwei95@hotmail.com> Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com> Co-authored-by: Icey <1790571317@qq.com> Co-authored-by: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com> Co-authored-by: Chih-Chieh 
Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: RishiAstra <40644327+RishiAstra@users.noreply.github.com>
594 lines
22 KiB
Python
594 lines
22 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
import argparse
|
|
import copy
|
|
import json
|
|
import math
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import matplotlib.pyplot as plt
|
|
import pandas as pd
|
|
|
|
## JSON parsing utils ####
|
|
|
|
|
|
def largest_dist_from_leaf(node: dict, depth: int = 0):
    """Return the length of the longest path from `node` down to a leaf.

    `depth` carries the distance already travelled from the original root,
    so the value returned for the root is the height of the whole tree.
    """
    children = node["children"]
    if not children:
        return depth
    return max(
        largest_dist_from_leaf(child, depth=depth + 1) for child in children)
|
|
|
|
|
|
def get_entries_at_depth(depth: int,
                         entries_and_traces: list[tuple[Any, Any]],
                         node: dict,
                         curr_depth: int = 0,
                         trace=()):
    """Collect (entry, ancestor-trace) pairs that sit `abs(depth)` levels
    above the leaves of the summary tree rooted at `node`.

    depth == -1 selects kernel-level entries, depth == -2 module-level
    entries. Results are appended to `entries_and_traces` in place; each
    trace is a tuple of ancestor names, nearest ancestor first.
    """

    def _height(n: dict, d: int = 0) -> int:
        # Longest distance from `n` down to any leaf (inlined helper so
        # this routine stands on its own).
        if not n["children"]:
            return d
        return max(_height(c, d + 1) for c in n["children"])

    # Only kernel (-1) or module (-2) level queries are supported.
    assert depth in (-1, -2)

    target_height = abs(depth) - 1

    if curr_depth == 0 and _height(node) <= target_height:
        # The tree is not tall enough; record the root itself.
        entries_and_traces.append((node["entry"], trace))
        return

    if _height(node) == target_height:
        entries_and_traces.append((node["entry"], trace))

    child_trace = (node["entry"]["name"], ) + trace
    for child in node["children"]:
        get_entries_at_depth(depth,
                             entries_and_traces,
                             child,
                             curr_depth=curr_depth + 1,
                             trace=child_trace)
|
|
|
|
|
|
def fold_nodes(root: dict, nodes_to_fold: list[str]):
    """Turn every node named in `nodes_to_fold` into a leaf.

    The folded node then represents the aggregate of its entire subtree.
    The tree is mutated in place and the (same) root is returned.
    """
    pending = [root]
    while pending:
        current = pending.pop()
        if current["entry"]["name"] in nodes_to_fold:
            # Detach the subtree; do not descend any further here.
            current["children"] = []
        else:
            pending.extend(current["children"])
    return root
|
|
|
|
|
|
## Operation name cleanup utils ####
|
|
|
|
|
|
def trim_string_back(string: str, width: int) -> str:
    """Truncate `string` so it fits in `width` chars, marking the cut
    with a trailing "..." when enough of the string survives."""
    if len(string) <= width:
        return string
    # Remove enough characters to leave room for the 3-char ellipsis.
    cut = len(string) - width + 3
    trimmed = string[:-cut]
    if len(trimmed) > 3:
        trimmed = trimmed + "..."
    return trimmed
|
|
|
|
|
|
def shorten_plot_legend_strings(legend, max_char_len: int):
    """Abbreviate and trim every label of a matplotlib legend so no label
    exceeds `max_char_len` characters."""
    for text_obj in legend.get_texts():
        shortened = abbreviate_known_names(text_obj.get_text())
        text_obj.set_text(trim_string_back(shortened, max_char_len))
|
|
|
|
|
|
def abbreviate_known_names(name: str) -> str:
    """Replace well-known verbose substrings in `name` with short forms."""
    # Order matters: "bfloat16" must be rewritten before "float16" so the
    # latter does not match inside the former.
    replacements = (
        ("MergedColumnParallelLinear", "MCPLinear"),
        ("QKVParallelLinear", "QKVPLinear"),
        ("RowParallelLinear", "RPLinear"),
        ("weight=", "w="),
        ("bfloat16", "bf16"),
        ("float16", "f16"),
    )
    for long_form, short_form in replacements:
        name = name.replace(long_form, short_form)
    return name
|
|
|
|
|
|
def attempt_to_make_names_unique(entries_and_traces):
    """Disambiguate duplicate entry names by suffixing ancestor names.

    For every name that occurs more than once, the ancestor traces of the
    duplicates are walked in lockstep until they first diverge; the name is
    then extended with " <- " joined ancestors up to that point. Entries are
    mutated in place. Duplicates with identical traces are left untouched so
    the downstream pivot_table call can aggregate them.
    """
    seen: set = set()
    duplicated: set = set()
    for entry, _ in entries_and_traces:
        if entry["name"] in seen:
            duplicated.add(entry["name"])
        seen.add(entry["name"])

    def _all_equal(values) -> bool:
        return all(v == values[0] for v in values)

    for dup_name in duplicated:
        with_name = [(entry, trace) for entry, trace in entries_and_traces
                     if entry["name"] == dup_name]

        # Transpose the traces so we can compare ancestors level by level.
        columns = list(zip(*[trace for _, trace in with_name]))
        divergence = next(
            (i for i, column in enumerate(columns) if not _all_equal(column)),
            None)

        if divergence is None:
            # Can't create a unique name; leave the names as they are and
            # they will get aggregated by the pivot_table call.
            continue

        for entry, trace in with_name:
            entry["name"] = " <- ".join((entry["name"], ) +
                                        trace[:divergence + 1])
|
|
|
|
|
|
## Operation grouping utils ####
|
|
'''
|
|
Group operations in the given dataframe by some high-level ops like,
|
|
- gemms
|
|
- attention
|
|
- rms_norm
|
|
etc.
|
|
'''
|
|
|
|
|
|
def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame:
    """Collapse fine-grained kernel columns into high-level op categories.

    Each column of `trace_df` whose name matches a category predicate
    (attention, quant, LoRA sgmv/bgmv, gemms, rms_norm, nccl, ...) is
    summed into a new category column and the original column is dropped.
    Columns that match no category are kept unchanged. The dataframe is
    modified in place and also returned.
    """

    # --- Category predicates over kernel / op names ----------------------
    def is_rms_norm(op_name: str) -> bool:
        return "rms_norm_kernel" in op_name

    def is_attention_block(op_name: str) -> bool:
        return "flash_fwd" in op_name or \
            "reshape_and_cache_flash_kernel" in op_name

    def is_quant(op_name: str) -> bool:
        return "scaled_fp8_quant" in op_name or \
            "scaled_int8_quant" in op_name

    # LoRA ops
    def is_sgmv_shrink(op_name: str) -> bool:
        return "sgmv_shrink" in op_name

    def is_sgmv_expand(op_name: str) -> bool:
        return "sgmv_expand" in op_name

    def is_bgmv_shrink(op_name: str) -> bool:
        return "bgmv_shrink" in op_name

    def is_bgmv_expand(op_name: str) -> bool:
        return "bgmv_expand" in op_name

    def is_cutlass_gemm_op(op_name: str) -> bool:
        return "void cutlass::Kernel" in op_name or \
            "void cutlass::device_kernel" in op_name

    def is_gemm_op(op_name: str) -> bool:
        # Quant kernels also look gemm-like; they belong to `quant_ops`.
        if is_quant(op_name):
            return False
        return is_cutlass_gemm_op(op_name) or \
            "xmma_gemm" in op_name or \
            "gemv2T_kernel" in op_name or \
            "splitKreduce" in op_name or \
            "s16816gemm" in op_name

    def is_elementwise_op(op_name: str) -> bool:
        return "elementwise_kernel" in op_name

    def is_mem_op(op_name: str) -> bool:
        return "memcpy" in op_name.lower() or \
            "memset" in op_name.lower()

    def is_vocab_embedding_op(op_name: str) -> bool:
        return "vocabparallelembed" in op_name.lower()

    # nccl ops
    def is_nccl_op(op_name: str) -> bool:
        return "nccl" in op_name.lower()

    def is_nccl_all_reduce(op_name: str) -> bool:
        return is_nccl_op(op_name) and \
            ("all_reduce" in op_name.lower() or
             "allreduce" in op_name.lower())

    def is_nccl_gather(op_name: str) -> bool:
        return is_nccl_op(op_name) and \
            "gather" in op_name.lower()

    def is_nccl_broadcast(op_name: str) -> bool:
        return is_nccl_op(op_name) and \
            "broadcast" in op_name.lower()

    # Reduce op types
    def is_cross_device_reduce_1stage(op_name: str) -> bool:
        return "cross_device_reduce_1stage" in op_name

    def is_cross_device_reduce_2stage(op_name: str) -> bool:
        return "cross_device_reduce_2stage" in op_name

    def is_custom_ar_all_reduce(op_name: str) -> bool:
        return "_C_custom_ar::all_reduce" in op_name

    def is_reduce_kernel(op_name: str) -> bool:
        return "reduce_kernel" in op_name

    # --- Partition the columns into categories ---------------------------
    remaining = list(trace_df)

    def _take(pred):
        # Pull the columns matching `pred` out of `remaining` so later,
        # more general predicates cannot claim them again.
        nonlocal remaining
        matched = [op for op in remaining if pred(op)]
        remaining = [op for op in remaining if op not in matched]
        return matched

    # Order matters: specific categories (quant, specific nccl collectives)
    # must claim their columns before the broader ones (gemms, nccl_other).
    groups = [
        ("attention", _take(is_attention_block)),
        ("quant_ops", _take(is_quant)),
        ("sgmv_shrink_ops", _take(is_sgmv_shrink)),
        ("sgmv_expand_ops", _take(is_sgmv_expand)),
        ("bgmv_shrink_ops", _take(is_bgmv_shrink)),
        ("bgmv_expand_ops", _take(is_bgmv_expand)),
        ("cutlass_gemm_ops", _take(is_cutlass_gemm_op)),
        ("gemm_ops", _take(is_gemm_op)),
        ("rms_norm_ops", _take(is_rms_norm)),
        ("vocab_embed_ops", _take(is_vocab_embedding_op)),
        ("mem_ops", _take(is_mem_op)),
        ("elementwise_ops", _take(is_elementwise_op)),
        ("nccl_all_reduce_ops", _take(is_nccl_all_reduce)),
        ("nccl_gather_ops", _take(is_nccl_gather)),
        ("nccl_broadcast_ops", _take(is_nccl_broadcast)),
        ("nccl_other_ops", _take(is_nccl_op)),
        ("cross_device_reduce_1stage_ops",
         _take(is_cross_device_reduce_1stage)),
        ("cross_device_reduce_2stage_ops",
         _take(is_cross_device_reduce_2stage)),
        ("custom_ar_all_reduce_ops", _take(is_custom_ar_all_reduce)),
        ("reduce_kernel_ops", _take(is_reduce_kernel)),
    ]

    # Sum each non-empty category into its aggregate column, then drop the
    # constituent columns.
    for column, ops in groups:
        if ops:
            trace_df[column] = trace_df[ops].agg("sum", axis=1)

    trace_df.drop([op for _, ops in groups for op in ops],
                  axis=1,
                  inplace=True)
    return trace_df
|
|
|
|
|
|
## Data plotting utils ####
|
|
|
|
|
|
def plot_trace_df(traces_df: pd.DataFrame,
                  plot_metric: str,
                  plot_title: str,
                  output: Optional[Path] = None):
    """Render `traces_df` as a stacked bar chart (one bar per phase, one
    stack segment per op group) and save the figure to `output`.

    Expects `traces_df` to carry 'phase', 'phase_desc', 'name' columns and
    the column named by `plot_metric` (as produced by `prepare_data`).
    """

    def get_phase_description(traces_df: pd.DataFrame, phase: str) -> str:
        # All rows of one phase carry the same description; return it.
        phase_df = traces_df.query(f'phase == "{phase}"')
        descs = phase_df['phase_desc'].to_list()
        assert all([desc == descs[0] for desc in descs])
        return descs[0]

    phases = traces_df['phase'].unique()
    phase_descs = [get_phase_description(traces_df, p) for p in phases]
    # One row per phase, one column per op name; cells hold the summed metric.
    traces_df = traces_df.pivot_table(index="phase",
                                      columns="name",
                                      values=plot_metric,
                                      aggfunc="sum")

    # Collapse kernel-level columns into high-level op categories.
    traces_df = group_trace_by_operations(traces_df)

    # Make the figure; widen it with the number of phases plotted.
    fig_size_x = max(5, len(phases))
    fig, ax = plt.subplots(1, figsize=(fig_size_x, 8), sharex=True)

    # Draw the stacked bars
    ops = list(traces_df)
    bottom = [0] * len(phases)
    for op in ops:
        values = [traces_df[op][phase] for phase in phases]
        # NaN means the op did not occur in that phase; plot it as zero.
        values = list(map(lambda x: 0.0 if math.isnan(x) else x, values))
        ax.bar(phase_descs, values, label=op, bottom=bottom)
        bottom = [bottom[j] + values[j] for j in range(len(phases))]

    # Write the values as text on the bars
    for bar in ax.patches:
        if bar.get_height() != 0:
            ax.text(bar.get_x() + bar.get_width() / 2,
                    bar.get_height() / 2 + bar.get_y(),
                    f"{round(bar.get_height(), 2)}",
                    ha='center',
                    color='w',
                    weight='bold',
                    size=5)

    # Setup legend
    handles, labels = plt.gca().get_legend_handles_labels()
    legend = fig.legend(handles,
                        labels,
                        loc='center left',
                        bbox_to_anchor=(1, 1))
    shorten_plot_legend_strings(legend, 50)

    # Setup labels and title
    plt.setp(ax.get_xticklabels(), rotation=90)
    ax.set_ylabel(plot_metric)
    plt.suptitle(plot_title)

    # NOTE(review): `output` defaults to None, but savefig needs a real
    # destination; both call sites in this file pass one — confirm before
    # relying on the default.
    plt.savefig(output, bbox_inches='tight')
    print("Created: ", output)
|
|
|
|
|
|
def main(
        json_trace: Path,
        output_directory: Path,
        depth: int,  # Fetch/Plot operations at this depth of the Json tree
        plot_metric: str,
        make_names_unique: bool,
        top_k: int,
        json_nodes_to_fold: list[str],
        step_plot_interval: Optional[int] = None):
    """Load a layerwise-profile JSON and write `prefill.png` and
    `decode_steps.png` plots into `output_directory`.

    Args:
        json_trace: profile JSON produced by the offline profiling script.
        output_directory: where the plot images are written (str accepted).
        depth: tree depth to plot; -1 = kernel level, -2 = module level.
        plot_metric: entry field to plot, e.g. "cuda_time_ms".
        make_names_unique: disambiguate duplicate op names via their traces.
        top_k: keep only the top-k entries by cuda time, fold rest into
            "others"; 0/None disables.
        json_nodes_to_fold: node names whose subtrees are collapsed.
        step_plot_interval: plot every Nth decode step. Defaults to the
            module-level CLI `args.step_plot_interval` for backward
            compatibility with script usage.
    """

    def prepare_data(profile_json: dict, step_keys: list[str]) -> pd.DataFrame:
        """Build one dataframe of op entries covering all `step_keys`."""

        def get_entries_and_traces(key: str):
            # Collect (entry, ancestor-trace) pairs at the requested depth
            # from every summary tree recorded for this step.
            entries_and_traces: list[tuple[Any, Any]] = []
            for root in profile_json[key]["summary_stats"]:
                # Fold nodes in the traces as per user request, i.e. simply
                # make the requested nodes leaf-nodes.
                root = fold_nodes(root, json_nodes_to_fold)
                get_entries_at_depth(depth, entries_and_traces, root)
            return entries_and_traces

        def keep_only_top_entries(df: pd.DataFrame,
                                  metric: str,
                                  top_k: int = 9) -> pd.DataFrame:
            # Rename all but the top-k rows (by `metric`) to "others" so
            # they collapse into one bucket when pivoted downstream.
            df.loc[df.nsmallest(len(df) - top_k + 1, metric).index,
                   ["name"]] = "others"
            return df

        def get_phase_description(key: str) -> str:
            # Tag the phase with its running-sequence count when available.
            num_running_seqs = profile_json[key]['metadata'][
                'num_running_seqs']
            if num_running_seqs is not None:
                return f"{key}-seqs-{num_running_seqs}"
            else:
                return key

        # Get data for each key
        traces = list(map(lambda x: get_entries_and_traces(x), step_keys))

        # Attempt some cleanup
        if make_names_unique:
            for trace in traces:
                attempt_to_make_names_unique(trace)

        # To pandas dataframe
        trace_dfs = list(
            map(lambda t: pd.DataFrame([entry for entry, _ in t]).fillna(0),
                traces))

        # Respect top_k
        if top_k:
            trace_dfs = list(
                map(
                    lambda trace_df: keep_only_top_entries(
                        trace_df, "cuda_time_us", top_k), trace_dfs))

        # Fill in information about the step-keys
        for trace_df, step_key in zip(trace_dfs, step_keys):
            trace_df['phase'] = step_key
            trace_df['phase_desc'] = get_phase_description(step_key)

        # Combine all data frames so they can be put in a single plot
        traces_df = pd.concat(trace_dfs)

        # Add a derived metric `cuda_time_ms`
        traces_df["cuda_time_ms"] = traces_df["cuda_time_us"] / 1000
        traces_df = traces_df.fillna(0)

        return traces_df

    def make_plot_title_suffix(profile_json: dict) -> str:
        """Compose a title describing the model, batch and run config."""
        context = profile_json["context"]
        sparsity = context.get('sparsity', None)
        run_type = \
            f'Run {context["num_steps"]} steps' if context['num_steps'] else \
            (f'Complete {context["complete_num_requests_per_step"]} per '
             f'step; Run till completion')
        return (f"{context['engine_args']['model']}\n"
                f"Batch={context['batch_size']}, "
                f"PromptLen={context['prompt_len']}, "
                f"NumGpus={context['engine_args']['tensor_parallel_size']}"
                f"{', Sparsity ' + sparsity if sparsity else ''}\n"
                f"Run Type: {run_type}")

    # BUGFIX: this function previously read the module-level `args` for the
    # step interval, which broke any caller that imported `main` directly.
    # Keep the global read only as the backward-compatible fallback.
    if step_plot_interval is None:
        step_plot_interval = args.step_plot_interval

    # BUGFIX: the CLI hands us a `str` for --output-directory; coerce so the
    # `/` path joins below work for both str and Path callers.
    output_directory = Path(output_directory)

    with open(json_trace) as f:
        profile_json = json.load(f)
    assert profile_json is not None

    # Get all `llm.generate.step()` profiles
    step_traces = list(profile_json.keys())
    assert (step_traces[0] == 'context')
    step_traces = step_traces[1:]  # have only prefill and decodes
    prefills = list(filter(lambda x: "prefill" in x, step_traces))
    all_decodes = list(filter(lambda x: "decode" in x, step_traces))
    assert len(prefills) + len(all_decodes) == len(step_traces)
    assert len(prefills) == 1

    decodes = all_decodes[::step_plot_interval]
    # Guard against an empty decode list (e.g. prefill-only profiles).
    if decodes and decodes[-1] != all_decodes[-1]:
        # Always have the last decode
        decodes.append(all_decodes[-1])

    prefill_traces = prepare_data(profile_json, prefills)
    decode_traces = prepare_data(profile_json, decodes)

    plot_title_suffix = make_plot_title_suffix(profile_json)

    plot_trace_df(prefill_traces, plot_metric, "prefill " + plot_title_suffix,
                  output_directory / Path("prefill.png"))
    plot_trace_df(decode_traces, plot_metric, "decodes " + plot_title_suffix,
                  output_directory / Path("decode_steps.png"))
|
|
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--json-trace",
                        type=str,
                        required=True,
                        help="json trace file output by \
                              examples/offline_inference/profiling.py")
    parser.add_argument("--output-directory",
                        type=str,
                        required=False,
                        help="Directory to output plots")
    parser.add_argument("--level",
                        type=str,
                        default="module",
                        choices=["module", "kernel"])
    parser.add_argument("--top-k",
                        type=int,
                        default=12,
                        help="Only graph the top `top_k` entries by time.")
    parser.add_argument("--fold-json-node",
                        nargs='+',
                        default=['Sampler', 'LogitsProcessor'],
                        help='Do not plot the children of these nodes. Let, \
                              the node represent the aggregate of all its \
                              children')
    parser.add_argument("--plot-metric",
                        type=str,
                        default="cuda_time_ms",
                        help='Metric to plot. some options are cuda_time_ms, \
                              pct_cuda_time')
    parser.add_argument(
        "--step-plot-interval",
        type=int,
        default=4,
        help="For every `step_plot_interval` steps, plot 1 step")

    args = parser.parse_args()

    # Prepare/Extract relevant args
    make_names_unique = False
    if args.level == "module":
        depth = -2
        make_names_unique = True
    elif args.level == "kernel":
        depth = -1
    else:
        # Unreachable in practice (argparse `choices` rejects other values);
        # kept as a defensive check.
        raise ValueError(f"Unexpected level value ({args.level})")

    # BUGFIX: `args.output_directory` is a `str`; it must be wrapped in
    # `Path` or `main`'s `output_directory / Path(...)` joins raise a
    # TypeError whenever --output-directory is given.
    output_directory = Path(args.output_directory) \
        if args.output_directory else Path(args.json_trace).parent

    # mkdir with exist_ok avoids the exists()/makedirs() race.
    output_directory.mkdir(parents=True, exist_ok=True)

    main(Path(args.json_trace), output_directory, depth, args.plot_metric,
         make_names_unique, args.top_k, args.fold_json_node)
|