cmake_minimum_required(VERSION 3.26)

# When building directly using CMake, make sure you run the install step
# (it places the .so files in the correct location).
#
# Example:
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
# cmake --build . --target install
#
# If you want to only build one target, make sure to install it manually:
# cmake --build . --target _C
# cmake --install . --component _C
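#
# A minimal sketch of a debug configure using the same flow (an illustration,
# not part of the documented workflow; CMAKE_BUILD_TYPE is standard CMake and
# is echoed by the message() call further down):
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=.. ..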
project(vllm_extensions LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)


# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")

# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)

#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

#
# Supported/expected torch versions for CUDA/ROCm.
#
# Currently, having an incorrect pytorch version results in a warning
# rather than an error.
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")

#
# Try to find python package with an executable that exactly matches
# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions.
#
if (VLLM_PYTHON_EXECUTABLE)
  find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}")
else()
  message(FATAL_ERROR
    "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version"
    " before running cmake configure.")
endif()

#
# Update cmake's `CMAKE_PREFIX_PATH` with torch location.
#
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
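# Roughly equivalent by hand (illustrative sketch only; the helper in
# cmake/utils.cmake is what actually runs). It queries the interpreter found
# above for torch's bundled cmake config directory:
#   execute_process(
#     COMMAND ${Python_EXECUTABLE} -c "import torch; print(torch.utils.cmake_prefix_path)"
#     OUTPUT_VARIABLE _torch_prefix OUTPUT_STRIP_TRAILING_WHITESPACE)
#   list(APPEND CMAKE_PREFIX_PATH "${_torch_prefix}")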

# Ensure the 'nvcc' command is in the PATH
find_program(NVCC_EXECUTABLE nvcc)
if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
  message(FATAL_ERROR "nvcc not found")
endif()

#
# Import torch cmake configuration.
# Torch also imports CUDA (and partially HIP) languages with some customizations,
# so there is no need to do this explicitly with check_language/enable_language,
# etc.
#
find_package(Torch REQUIRED)

# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
       CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
endif()
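# The floor moves from 7.0 to 7.5 on the nvcc 13 branch because, to the best
# of our knowledge, CUDA 13.0 drops offline compilation for pre-Turing targets
# (sm_70/sm_72), while the newer toolkits add the Blackwell-generation
# entries (10.x/11.0/12.0).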

#
# Forward the non-CUDA device extensions to external CMake scripts.
#
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
    NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
  if (VLLM_TARGET_DEVICE STREQUAL "cpu")
    include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
  else()
    return()
  endif()
  return()
endif()

#
# Set up GPU language and check the torch version and warn if it isn't
# what is expected.
#
if (NOT HIP_FOUND AND CUDA_FOUND)
  set(VLLM_GPU_LANG "CUDA")

  if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
      "expected for CUDA build, saw ${Torch_VERSION} instead.")
  endif()
elseif(HIP_FOUND)
  set(VLLM_GPU_LANG "HIP")

  # Importing torch recognizes and sets up some HIP/ROCm configuration but does
  # not let cmake recognize .hip files. In order to get cmake to understand the
  # .hip extension automatically, HIP must be enabled explicitly.
  enable_language(HIP)

  # ROCm 5.X and 6.X
  if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
    message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
      "expected for ROCm build, saw ${Torch_VERSION} instead.")
  endif()
else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif()


if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
  # For cuda we want to be able to control which architectures we compile for on
  # a per-file basis in order to cut down on compile time. So here we extract
  # the set of architectures we want to compile for and remove them from the
  # CMAKE_CUDA_FLAGS so that they are not applied globally.
  #
  clear_cuda_arches(CUDA_ARCH_FLAGS)
  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
  # Filter the target architectures by the supported archs
  # since for some files we will build for all CUDA_ARCHS.
  cuda_archs_loose_intersection(CUDA_ARCHS
    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
  #
  # For other GPU targets override the GPU architectures detected by cmake/torch
  # and filter them by the supported versions for the current language.
  # The final set of arches is stored in `VLLM_GPU_ARCHES`.
  #
  override_gpu_arches(VLLM_GPU_ARCHES
    ${VLLM_GPU_LANG}
    "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
endif()
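# Illustrative sketch of the filtering above (our reading of the helper in
# cmake/utils.cmake; the values are hypothetical): if torch reports
# CUDA_ARCHS "8.0;8.6;9.0;12.0" and the toolkit's CUDA_SUPPORTED_ARCHS tops
# out at 9.0, the loose intersection keeps "8.0;8.6;9.0" and drops 12.0, so
# per-file gencode flags are only ever emitted for buildable archs.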

#
# Query torch for additional GPU compilation flags for the given
# `VLLM_GPU_LANG`.
# The final set of arches is stored in `VLLM_GPU_FLAGS`.
#
get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})

#
# Set nvcc parallelism.
#
if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
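# Example (sketch): enabling 4-way nvcc parallelism at configure time. Only
# the cache-variable spelling is assumed here; --threads itself is a standard
# nvcc flag:
#   cmake -G Ninja -DNVCC_THREADS=4 ...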

#
# Set compression mode for CUDA >=13.x.
#
if(VLLM_GPU_LANG STREQUAL "CUDA" AND
   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
endif()

#
# Set CUDA include flags for CXX compiler.
#
if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
  if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
  endif()
endif()

#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/<dependency>.
#
include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose debug info
  #
  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")

  #
  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
  #
  set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
endif()

#
# Define other extension targets
#

#
# cumem_allocator extension
#

set(VLLM_CUMEM_EXT_SRC
  "csrc/cumem_allocator.cpp")

set_gencode_flags_for_srcs(
  SRCS "${VLLM_CUMEM_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  message(STATUS "Enabling cumem allocator extension.")
  # link against cuda driver library
  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
  define_gpu_extension_target(
    cumem_allocator
    DESTINATION vllm
    LANGUAGE CXX
    SOURCES ${VLLM_CUMEM_EXT_SRC}
    LIBRARIES ${CUMEM_LIBS}
    USE_SABI 3.8
    WITH_SOABI)
endif()
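# Reading the target definition above (our understanding of the helper in
# cmake/utils.cmake, stated as an assumption): USE_SABI builds against the
# Python stable ABI of the given version (Py_LIMITED_API), and WITH_SOABI
# appends the platform SOABI suffix, so the install step should yield
# something like vllm/cumem_allocator.abi3.so.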

#
# _C extension
#

set(VLLM_EXT_SRC
  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
  "csrc/cache_kernels.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/attention/merge_attn_states.cu"
  "csrc/attention/vertical_slash_index.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/w8a8/int8/scaled_quant.cu"
  "csrc/quantization/w8a8/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
    set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
  endif()

  if(VLLM_CUTLASS_SRC_DIR)
    if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR)
      get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
    endif()
    message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
    FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
  else()
    FetchContent_Declare(
      cutlass
      GIT_REPOSITORY https://github.com/nvidia/cutlass.git
      # Please keep this in sync with CUTLASS_REVISION line above.
      GIT_TAG ${CUTLASS_REVISION}
      GIT_PROGRESS TRUE

      # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
      # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
      # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
      GIT_SHALLOW TRUE
    )
  endif()
  FetchContent_MakeAvailable(cutlass)
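  # Example (sketch): building against a local CUTLASS checkout instead of the
  # pinned download; both forms feed the logic above (the paths are
  # hypothetical):
  #   VLLM_CUTLASS_SRC_DIR=$HOME/src/cutlass cmake -G Ninja ...
  #   cmake -G Ninja -DVLLM_CUTLASS_SRC_DIR=$HOME/src/cutlass ...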

  list(APPEND VLLM_EXT_SRC
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
    CUDA_ARCHS "${CUDA_ARCHS}")

  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
  # 9.0 for latest bf16 atomicAdd PTX
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
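  # Notation note (standard CMake CUDA_ARCHITECTURES semantics; how the helper
  # consumes it is our reading of cmake/utils.cmake): a bare "9.0" emits SASS
  # for sm_90 only, while "9.0+PTX" also embeds PTX so newer GPUs can JIT the
  # kernels.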
  if (MARLIN_ARCHS)

    #
    # For the Marlin kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
    set(MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)

    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")

    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=$PYTHONPATH
        ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
        RESULT_VARIABLE marlin_generation_result
        OUTPUT_VARIABLE marlin_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
      )

      if (NOT marlin_generation_result EQUAL 0)
        message(FATAL_ERROR "Marlin generation failed."
                " Result: \"${marlin_generation_result}\""
                "\nCheck the log for details: "
                "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
      else()
        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
            CACHE STRING "Last run Marlin generate script hash" FORCE)
        message(STATUS "Marlin generation completed successfully.")
      endif()
    else()
      message(STATUS "Marlin generation script has not changed, skipping generation.")
    endif()

    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()

    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

    set(MARLIN_SRCS
      "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
      "csrc/quantization/gptq_marlin/gptq_marlin.cu"
      "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
      "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_SRCS}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
      set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")

    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
  else()
    message(STATUS "Not building Marlin kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()

  # Only build AllSpark kernels if we are building for at least some compatible archs.
  cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
  if (ALLSPARK_ARCHS)
    set(ALLSPARK_SRCS
      "csrc/quantization/gptq_allspark/allspark_repack.cu"
      "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
    set_gencode_flags_for_srcs(
      SRCS "${ALLSPARK_SRCS}"
      CUDA_ARCHS "${ALLSPARK_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
    message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
  else()
    message(STATUS "Not building AllSpark kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()


  set(SCALED_MM_3X_ARCHS)
  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.0 or later
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
    # Let scaled_mm_c2x know it doesn't need to build these arches
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running FP8 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()


  # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.8 or later
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
  endif()
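  # Suffix note (our reading of nvcc/CMake arch notation, stated as an
  # assumption): "12.0a" targets the architecture-specific sm_120a feature set,
  # while "12.0f" targets the family-conditional variant accepted by newer
  # toolkits, which is why the selection switches on the compiler version.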
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM120=1")
    # Let scaled_mm_c2x know it doesn't need to build these arches
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                     "later if you intend on running FP8 quantized models on "
                     "Blackwell.")
    else()
      message(STATUS "Not building scaled_mm_c3x_sm120 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()


  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
  # require CUDA 12.8 or later
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
    # Let scaled_mm_c2x know it doesn't need to build these arches
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                     "later if you intend on running FP8 quantized models on "
                     "Blackwell.")
    else()
      message(STATUS "Not building scaled_mm_c3x_sm100 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()

  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
  # (Build 8.9 for FP8)
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
    "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
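  # Illustrative example (hypothetical values): with SCALED_MM_2X_ARCHS =
  # "7.5;8.0;8.7;8.9+PTX" and SCALED_MM_3X_ARCHS = "9.0a", nothing overlaps and
  # the full 2x list is kept; if the 3x list already covered every requested
  # arch, the subtraction would empty the 2x list and the branch below would
  # skip scaled_mm_c2x entirely.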
  if (SCALED_MM_2X_ARCHS)
    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
    message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
  else()
    if (SCALED_MM_3X_ARCHS)
      message(STATUS "Not building scaled_mm_c2x as all archs are already built"
                     " for and covered by scaled_mm_c3x")
    else()
      message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()

  #
  # 2:4 Sparse Kernels

  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
  # require CUDA 12.2 or later (and only work on Hopper).
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                     "if you intend on running FP8 sparse quantized models on Hopper.")
    else()
      message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()

  # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
  # CUDA 12.8 or later
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
  else()
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
    # clear FP4_ARCHS
    set(FP4_ARCHS)
  endif()

  # FP4 Archs and flags
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM100=1")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
  else()
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
    # clear FP4_ARCHS
    set(FP4_ARCHS)
  endif()

  # CUTLASS MLA Archs and flags
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
    set(SRCS
      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${MLA_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
    # Add MLA-specific include directories only to MLA source files
    set_source_files_properties(${SRCS}
      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
  else()
    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
    # clear MLA_ARCHS
    set(MLA_ARCHS)
  endif()

  # CUTLASS MoE kernels

  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
  # if it's possible to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
                     "if you intend on running FP8 quantized MoE models on Hopper.")
    else()
      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
                     "in CUDA target architectures.")
    endif()
  endif()

  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
                     "if you intend on running FP8 quantized MoE models on Blackwell.")
    else()
      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
                     "in CUDA target architectures.")
    endif()
  endif()

  # moe_data.cu is used by all CUTLASS MoE kernels.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
      message(STATUS "Not building moe_data as CUDA Compiler version is "
                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
                     "if you intend on running FP8 quantized MoE models on Hopper or Blackwell.")
    else()
      message(STATUS "Not building moe_data as no compatible archs found "
                     "in CUDA target architectures.")
    endif()
  endif()

  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
    message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
                     "if you intend on running FP8 quantized MoE models on Blackwell.")
    else()
      message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()

  #
  # Machete kernels

  # The machete kernels only work on Hopper and require CUDA 12.0 or later.
  # Only build Machete kernels if we are building for something compatible with sm90a
  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS)
    #
    # For the Machete kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
    set(MACHETE_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
    file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)

    message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
    message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")

    if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
        ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
        RESULT_VARIABLE machete_generation_result
        OUTPUT_VARIABLE machete_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
      )

      if (NOT machete_generation_result EQUAL 0)
        message(FATAL_ERROR "Machete generation failed."
                " Result: \"${machete_generation_result}\""
                "\nCheck the log for details: "
                "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
      else()
        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
            CACHE STRING "Last run machete generate script hash" FORCE)
        message(STATUS "Machete generation completed successfully.")
      endif()
    else()
      message(STATUS "Machete generation script has not changed, skipping generation.")
    endif()

    # Add machete generated sources
    file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
    list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})

    # forward compatible
    set_gencode_flags_for_srcs(
      SRCS "${MACHETE_GEN_SOURCES}"
      CUDA_ARCHS "${MACHETE_ARCHS}")

    list(APPEND VLLM_EXT_SRC
      csrc/quantization/machete/machete_pytorch.cu)

    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
        AND MACHETE_ARCHS)
      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running w4a16 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building Machete kernels as no compatible archs "
                     "found in CUDA target architectures")
    endif()
  endif()

  # Only build W4A8 kernels if we are building for something compatible with sm90a
  cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")

    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${W4A8_ARCHS}")

    list(APPEND VLLM_EXT_SRC "${SRCS}")

    message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
        AND W4A8_ARCHS)
      message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running w4a8 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building W4A8 kernels as no compatible archs "
                     "found in CUDA target architectures")
    endif()
  endif()

  # Hadacore kernels
  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
  if(HADACORE_ARCHS)
    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${HADACORE_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    message(STATUS "Building hadacore")
  endif()

# if CUDA endif
endif()

if (VLLM_GPU_LANG STREQUAL "HIP")
  # Add QuickReduce kernels
  list(APPEND VLLM_EXT_SRC
    "csrc/custom_quickreduce.cu"
  )
# if ROCM endif
endif()

message(STATUS "Enabling C extension.")
define_gpu_extension_target(
  _C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
  SOURCES ${VLLM_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)

# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)

#
# _moe_C extension
#

set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC
    "csrc/moe/moe_wna16.cu"
    "csrc/moe/grouped_topk_kernels.cu")
endif()

if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(MOE_PERMUTE_SRC
    "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
    "csrc/moe/moe_permute_unpermute_op.cu")

  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
endif()

set_gencode_flags_for_srcs(
  SRCS "${VLLM_MOE_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(VLLM_MOE_WNA16_SRC
    "csrc/moe/moe_wna16.cu")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_MOE_WNA16_SRC}"
    CUDA_ARCHS "${CUDA_ARCHS}")

  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
  # 9.0 for latest bf16 atomicAdd PTX
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)

    #
    # For the Marlin MOE kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
    set(MOE_MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)

    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")

    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=$PYTHONPATH
        ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
        RESULT_VARIABLE moe_marlin_generation_result
        OUTPUT_VARIABLE moe_marlin_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
      )

      if (NOT moe_marlin_generation_result EQUAL 0)
        message(FATAL_ERROR "Marlin MOE generation failed."
                " Result: \"${moe_marlin_generation_result}\""
                "\nCheck the log for details: "
                "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
      else()
        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
        message(STATUS "Marlin MOE generation completed successfully.")
      endif()
    else()
      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
    endif()

    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MOE_WNAA16_MARLIN_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
      set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()

    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})

    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()
endif()

message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)

if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # _rocm_C extension
  #
  set(VLLM_ROCM_EXT_SRC
    "csrc/rocm/torch_bindings.cpp"
    "csrc/rocm/skinny_gemms.cu"
    "csrc/rocm/attention.cu")

  define_gpu_extension_target(
    _rocm_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${VLLM_ROCM_EXT_SRC}
    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
    ARCHITECTURES ${VLLM_GPU_ARCHES}
    USE_SABI 3
    WITH_SOABI)
endif()

# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
  include(cmake/external_projects/flashmla.cmake)

  # vllm-flash-attn should be last as it overwrites some CMake functions
  include(cmake/external_projects/vllm_flash_attn.cmake)
endif ()