#include <list>
#include <optional>

// Standard headers used directly in this file (assumed not to be pulled in
// transitively through dnnl_helper.h).
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <sstream>
#include <unordered_map>
#include <utility>

#include "common/memory_desc.hpp"
#include "common/memory.hpp"

#include "dnnl_helper.h"

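// Every oneDNN primitive executes on an (engine, stream) pair. This file
// shares one lazily constructed CPU engine and stream across all handlers;
// function-local statics make the construction thread-safe in C++11 and
// later.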
static dnnl::engine& default_engine() {
  static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
  return engine;
}

static dnnl::stream& default_stream() {
  static dnnl::stream stream(default_engine());
  return stream;
}

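// Handlers are passed around as opaque int64_t handles (e.g. across a
// torch-extension boundary); releasing one reinterprets the integer back
// into a pointer and deletes it.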
void release_dnnl_matmul_handler(int64_t handler) {
  DNNLMatMulPrimitiveHandler* ptr =
      reinterpret_cast<DNNLMatMulPrimitiveHandler*>(handler);
  delete ptr;
}

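// Process-wide scratchpad for primitives created with
// dnnl::scratchpad_mode::user. The buffer is 64-byte aligned and only ever
// grows: realloc() rounds the request up and is a no-op unless a primitive
// needs more space than the current capacity.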
DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
  this->realloc(allocation_unit * 128);
}

void DNNLScratchPadManager::realloc(size_t new_size) {
  new_size = round(new_size);
  if (new_size > size_) {
    // Free the previous buffer before allocating a larger one; no primitive
    // holds the old pointer here, since the scratchpad handle is refreshed
    // on every execute() call.
    std::free(ptr_);
    ptr_ = std::aligned_alloc(64, new_size);
    size_ = new_size;
  }
}

DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
  static DNNLScratchPadManager manager;
  return &manager;
}

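// Small LRU cache used to memoize oneDNN primitives. Entries live in a
// std::list ordered most-recently-used first; an unordered_map gives O(1)
// key lookup into the list. A hit splices the entry to the front, and when
// the cache is full the entry at the back is evicted.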
template <typename KT, typename VT>
class DNNLPrimitiveCache {
 public:
  using cache_value_t = std::pair<KT, VT>;
  using result_value_t = VT;
  using container_t = std::list<cache_value_t>;
  using value_iterator_t = typename container_t::iterator;
  using map_t = std::unordered_map<KT, value_iterator_t>;
  using creator_t = VT (*)();

 public:
  DNNLPrimitiveCache(size_t capacity)
      : capacity_(capacity),
        values_(),
        key_to_value_(std::min(256lu, capacity)) {
    assert(capacity > 0);
  }

  template <typename F>
  result_value_t get_or_create(const KT& key, F&& creator) {
    std::optional<value_iterator_t> value = get_value(key);
    if (value.has_value()) {
      return value.value()->second;
    } else {
      return add_value({key, creator()})->second;
    }
  }

  size_t size() const { return values_.size(); }

 private:
  void dump_data() {
    std::stringstream ss;
    ss << "table_id: " << std::hex << reinterpret_cast<size_t>(this) << std::dec
       << "\n";
    ss << "container: [";
    for (auto&& iter : values_) {
      ss << "(" << iter.first << ", " << std::hex
         << reinterpret_cast<size_t>(iter.second.get()) << "), " << std::dec;
    }
    ss << "]\n";

    ss << "map: [";
    for (auto&& iter : key_to_value_) {
      ss << "(" << iter.first << ", " << iter.second->first << ", " << std::hex
         << reinterpret_cast<size_t>(iter.second->second.get()) << std::dec
         << "), ";
    }
    ss << "]\n";
    std::printf("%s\n", ss.str().c_str());
  }

  value_iterator_t add_value(cache_value_t&& new_value) {
    if (size() == capacity_) {
      cache_value_t& last_item = values_.back();
      key_to_value_.erase(last_item.first);
      values_.pop_back();
    }

    auto& added_value_ = values_.emplace_front(std::move(new_value));
    key_to_value_.emplace(added_value_.first, values_.begin());
    return values_.begin();
  }

  std::optional<value_iterator_t> get_value(const KT& key) {
    if (key_to_value_.size() > 0 && key == values_.begin()->first) {
      return values_.begin();
    }

    auto value_map_iterator = key_to_value_.find(key);
    if (value_map_iterator != key_to_value_.end()) {
      values_.splice(values_.begin(), values_, value_map_iterator->second);
      return value_map_iterator->second;
    } else {
      return {};
    }
  }

 private:
  const size_t capacity_;
  container_t values_;
  map_t key_to_value_;
};

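// Usage sketch (hypothetical key and primitive types, not part of this
// file): a cache keyed on the runtime M dimension, creating the matmul
// primitive only on a miss.
//
//   DNNLPrimitiveCache<dnnl_dim_t, std::shared_ptr<dnnl::matmul>> cache(64);
//   auto matmul = cache.get_or_create(m_size, [&]() {
//     return std::make_shared<dnnl::matmul>(make_primitive_desc(m_size));
//   });
//
// make_primitive_desc is a placeholder for whatever builds the primitive
// descriptor; the creator runs at most once per cached key.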
DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
    const Args& args, dnnl::memory::data_type b_type)
    : b_n_size_(args.b_n_size),
      b_n_stride_(args.b_n_stride),
      b_k_size_(args.b_k_size),
      b_k_stride_(args.b_k_stride),
      b_type_(b_type),
      c_type_(args.c_type),
      runtime_memory_ptrs_(8),
      primitive_cache_size_(args.primitive_cache_size) {
  assert(primitive_cache_size_ > 0);
}

void DNNLMatMulPrimitiveHandler::prepack_weight(
    void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
                                   {b_k_stride_, b_n_stride_});
  dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
  dnnl::memory packed_weight(b_target_mem_desc, default_engine());
  {
    dnnl::reorder(original_weight, packed_weight)
        .execute(default_stream(), original_weight, packed_weight);
    default_stream().wait();
  }
  memory_cache_[DNNL_ARG_WEIGHTS] = packed_weight;
  b_target_mem_desc_ = b_target_mem_desc;
}

void DNNLMatMulPrimitiveHandler::set_runtime_memory_ptr(
    size_t index, dnnl_memory* memory_ptr) {
  dnnl::impl::memory_storage_t* mem_storage_ptr = memory_ptr->memory_storage();
  dnnl_memory_desc* mem_desc = const_cast<dnnl_memory_desc*>(memory_ptr->md());
  runtime_memory_ptrs_[index] = {mem_storage_ptr, mem_desc};
}

std::pair<dnnl::impl::memory_storage_t*, dnnl_memory_desc*>
DNNLMatMulPrimitiveHandler::get_runtime_memory_ptr(size_t index) {
  return runtime_memory_ptrs_[index];
}

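// Cache keys are hashed by XOR-combining std::hash of each field. XOR is
// commutative, so keys whose equal-typed fields are swapped collide, but
// the key space is tiny and a collision only costs an operator== check.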
namespace std {
template <>
struct hash<W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey> {
  size_t operator()(
      const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size) ^
           hash<int>()(static_cast<int>(val.a_qs)) ^
           hash<int>()(static_cast<int>(val.b_qs)) ^ hash<bool>()(val.use_azp) ^
           hash<int>()(static_cast<int>(val.c_type));
  }
};

template <>
struct hash<W8A8MatMulPrimitiveHandler::MSizeCacheKey> {
  size_t operator()(
      const W8A8MatMulPrimitiveHandler::MSizeCacheKey& val) const {
    return hash<dnnl_dim_t>()(val.a_m_size) ^ hash<bool>()(val.use_bias) ^
           hash<int>()(static_cast<int>(val.bias_type));
  }
};

template <>
struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
  size_t operator()(
      const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
  }
};

template <>
struct hash<MatMulPrimitiveHandler::MSizeCacheKey> {
  size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& val) const {
    return hash<dnnl_dim_t>()(val.a_m_size) ^
           hash<dnnl_dim_t>()(val.a_m_stride) ^ hash<bool>()(val.use_bias) ^
           hash<int>()(static_cast<int>(val.bias_type));
  }
};
}  // namespace std

bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
                const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size &&
         l.a_qs == r.a_qs && l.b_qs == r.b_qs && l.use_azp == r.use_azp &&
         l.c_type == r.c_type;
}

bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
                const W8A8MatMulPrimitiveHandler::MSizeCacheKey& r) {
  return l.use_bias == r.use_bias && l.a_m_size == r.a_m_size &&
         l.bias_type == r.bias_type;
}

bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
                const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
}

bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
                const MatMulPrimitiveHandler::MSizeCacheKey& r) {
  return l.a_m_size == r.a_m_size && l.a_m_stride == r.a_m_stride &&
         l.use_bias == r.use_bias && l.bias_type == r.bias_type;
}

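// Primitive caching is two-level: a process-wide outer cache keyed on the
// weight shape and quantization configuration (ClassMatmulCacheKey) maps to
// a per-handler inner cache keyed on the runtime M size (MSizeCacheKey).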
static std::shared_ptr<W8A8MatMulPrimitiveHandler::MSizeCache>
get_w8a8_class_primitive_cache(
    const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
    int64_t cache_size) {
  static W8A8MatMulPrimitiveHandler::ClassMatmulCache cache(128);
  assert(cache_size > 0);
  return cache.get_or_create(key, [&]() {
    return std::make_shared<W8A8MatMulPrimitiveHandler::MSizeCache>(cache_size);
  });
}

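// The int8 weights are prepacked once at construction. Passing
// DNNL_RUNTIME_DIM_VAL for M lets oneDNN pick a single packed layout
// (format_tag::any on the first primitive_desc) that serves every batch
// size at runtime.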
W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
    : DNNLMatMulPrimitiveHandler(
          static_cast<const DNNLMatMulPrimitiveHandler::Args&>(args),
          dnnl::memory::data_type::s8),
      use_azp_(args.use_a_zero_point),
      a_qs_(args.a_quantization_strategy),
      b_qs_(args.b_quantization_strategy),
      m_size_cache_(nullptr) {
  assert(a_qs_ != QuantizationStrategy::PER_OUTPUT_CHANNEL);
  assert(b_qs_ != QuantizationStrategy::PER_TOKEN);
  if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
    assert(!use_azp_);
  }
  prepack_weight(args.b_ptr,
                 create_primitive_desc(
                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
                                   .use_bias = false,
                                   .bias_type = dnnl::memory::data_type::undef},
                     true)
                     .weights_desc());
  init_runtime_memory_cache(args);
}

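// Runtime memory slots used by execute(): 0 = SRC, 1 = DST, 2 = per-tensor
// src scale, 3 = src zero point, 4 = bias, 5 = scratchpad. execute() swaps
// the raw data handles and patches the M dimension in the cached memory
// descriptors in place, so no dnnl::memory objects are recreated per call.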
void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
  auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
  auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
  a_storage->set_data_handle((void*)args.a_ptr);
  a_mem_desc->dims[0] = args.a_m_size;
  c_storage->set_data_handle((void*)args.c_ptr);
  c_mem_desc->dims[0] = args.a_m_size;

  if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
    auto&& [a_scale_storage, a_scale_mem_desc] = get_runtime_memory_ptr(2);
    a_scale_storage->set_data_handle((void*)args.a_scales_ptr);
  }
  if (use_azp_) {
    auto&& [a_zero_point_storage, a_zero_point_mem_desc] =
        get_runtime_memory_ptr(3);
    a_zero_point_storage->set_data_handle((void*)args.a_zero_points_ptr);
  }

  if (args.use_bias) {
    auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(4);
    bias_storage->set_data_handle((void*)args.bias_ptr);
  }

  dnnl::matmul matmul = get_matmul_cache(args);

  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
  scratchpad_storage->set_data_handle(
      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());

  matmul.execute(default_stream(), memory_cache_);
  default_stream().wait();
}

dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
    const MSizeCacheKey& key) {
  if (m_size_cache_.get() == nullptr) {
    ClassMatmulCacheKey class_key = {.b_n_size = b_n_size_,
                                     .b_k_size = b_k_size_,
                                     .a_qs = a_qs_,
                                     .b_qs = b_qs_,
                                     .use_azp = use_azp_,
                                     .c_type = c_type_};
    m_size_cache_ =
        get_w8a8_class_primitive_cache(class_key, primitive_cache_size_);
  }

  return m_size_cache_->get_or_create(key, [&]() {
    dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
    auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
    manager->realloc(desc.scratchpad_desc().get_size());
    return dnnl::matmul(desc);
  });
}

void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
  memory_cache_[DNNL_ARG_SRC] = dnnl::memory({{1, b_k_size_},
                                              dnnl::memory::data_type::s8,
                                              dnnl::memory::format_tag::ab},
                                             default_engine(), nullptr);
  set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
  memory_cache_[DNNL_ARG_DST] =
      dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());

  // For PER_TOKEN, scales are applied outside, in the epilogue.
  if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
    memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = dnnl::memory(
        {{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), nullptr);
    set_runtime_memory_ptr(
        2, memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC].get());
    if (use_azp_) {
      memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = dnnl::memory(
          {{1}, dnnl::memory::data_type::s32, {1}}, default_engine(), nullptr);
      set_runtime_memory_ptr(
          3, memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC].get());
    }
  }

  if (b_qs_ == QuantizationStrategy::PER_TENSOR) {
    memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] =
        dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, default_engine(),
                     (void*)args.b_scales_ptr);
  } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) {
    memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] =
        dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                     default_engine(), (void*)args.b_scales_ptr);
  }

  memory_cache_[DNNL_ARG_BIAS] =
      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get());

  // The scratchpad descriptor here is a placeholder; the real pointer comes
  // from the shared DNNLScratchPadManager at execute() time.
  memory_cache_[DNNL_ARG_SCRATCHPAD] =
      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
}

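// first_time = true is only used from the constructor: it describes B with
// format_tag::any so oneDNN may choose a blocked layout, which
// prepack_weight() then materializes via a reorder. Later calls reuse the
// saved b_target_mem_desc_.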
dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
    const MSizeCacheKey& key, bool first_time) {
  dnnl::memory::desc a_md({key.a_m_size, b_k_size_},
                          dnnl::memory::data_type::s8,
                          dnnl::memory::format_tag::ab);
  dnnl::memory::desc b_md;
  if (first_time) {
    b_md =
        dnnl::memory::desc({b_k_size_, b_n_size_}, dnnl::memory::data_type::s8,
                           dnnl::memory::format_tag::any);
  } else {
    b_md = b_target_mem_desc_;
  }
  dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
                          dnnl::memory::format_tag::ab);

  dnnl::primitive_attr attr;

  attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);

  // For PER_TOKEN, scales are applied outside, in the epilogue.
  if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
    attr.set_scales_mask(DNNL_ARG_SRC, 0);
    if (use_azp_) {
      attr.set_zero_points_mask(DNNL_ARG_SRC, 0);
    }
  }

  if (b_qs_ == QuantizationStrategy::PER_TENSOR) {
    attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
  } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) {
    // Mask 2 (1 << 1) selects the N dimension of the {K, N} weights, i.e.
    // one scale per output channel.
    attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
  }

  if (key.use_bias) {
    // For PER_TOKEN, bias is applied in the epilogue instead.
    assert(a_qs_ == QuantizationStrategy::PER_TENSOR);
    dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
                                        c_md, attr);
  } else {
    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
                                        attr);
  }
}

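// Floating-point (f32 / bf16 / f16) counterpart of the W8A8 handler. Its
// M-size cache key additionally carries a_m_stride, because the activation
// matrix may be a strided view rather than contiguous.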
MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
    : DNNLMatMulPrimitiveHandler(
          static_cast<DNNLMatMulPrimitiveHandler::Args>(args), args.ab_type),
      m_size_cache_(nullptr) {
  assert(ab_type_ == dnnl::memory::data_type::f32 ||
         ab_type_ == dnnl::memory::data_type::bf16 ||
         ab_type_ == dnnl::memory::data_type::f16);
  prepack_weight(args.b_ptr,
                 create_primitive_desc(
                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
                                   .a_m_stride = DNNL_RUNTIME_DIM_VAL,
                                   .use_bias = false,
                                   .bias_type = dnnl::memory::data_type::undef},
                     true)
                     .weights_desc());
  init_runtime_memory_cache(args);
}

static std::shared_ptr<MatMulPrimitiveHandler::MSizeCache>
get_matmul_class_primitive_cache(
    const MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
    int64_t cache_size) {
  static MatMulPrimitiveHandler::ClassMatmulCache cache(128);
  assert(cache_size > 0);
  return cache.get_or_create(key, [&]() {
    return std::make_shared<MatMulPrimitiveHandler::MSizeCache>(cache_size);
  });
}

void MatMulPrimitiveHandler::execute(ExecArgs& args) {
  auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
  auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
  a_storage->set_data_handle((void*)args.a_ptr);
  a_mem_desc->dims[0] = args.a_m_size;
  a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride;
  c_storage->set_data_handle((void*)args.c_ptr);
  c_mem_desc->dims[0] = args.a_m_size;

  if (args.use_bias) {
    auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
    bias_storage->set_data_handle((void*)args.bias_ptr);
  }

  dnnl::matmul matmul = get_matmul_cache(args);

  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
  scratchpad_storage->set_data_handle(
      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());

  matmul.execute(default_stream(), memory_cache_);
  default_stream().wait();
}

dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
    const MSizeCacheKey& key) {
  if (m_size_cache_.get() == nullptr) {
    ClassMatmulCacheKey class_key = {.b_n_size = b_n_size_,
                                     .b_k_size = b_k_size_};
    m_size_cache_ =
        get_matmul_class_primitive_cache(class_key, primitive_cache_size_);
  }
  return m_size_cache_->get_or_create(key, [&]() {
    dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
    auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
    manager->realloc(desc.scratchpad_desc().get_size());
    return dnnl::matmul(desc);
  });
}

dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
    const MSizeCacheKey& key, bool first_time) {
  dnnl::memory::desc a_md;
  dnnl::memory::desc b_md;
  if (first_time) {
    a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
                              dnnl::memory::format_tag::ab);
    b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
                              dnnl::memory::format_tag::any);
  } else {
    a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
                              {key.a_m_stride, 1});
    b_md = b_target_mem_desc_;
  }
  dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
                          dnnl::memory::format_tag::ab);

  dnnl::primitive_attr attr;
  attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);

  if (key.use_bias) {
    dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
                                        c_md, attr);
  } else {
    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
                                        attr);
  }
}

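// As in the W8A8 handler, these dnnl::memory objects are created with null
// handles and placeholder shapes (slot 0 = SRC, 1 = DST, 2 = bias,
// 3 = scratchpad); execute() patches in the real pointers and dimensions.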
void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
  memory_cache_[DNNL_ARG_SRC] = dnnl::memory(
      {{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr);
  set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
  memory_cache_[DNNL_ARG_DST] =
      dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());

  memory_cache_[DNNL_ARG_BIAS] =
      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());

  memory_cache_[DNNL_ARG_SCRATCHPAD] =
      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
}