From 241b4cfe6609adce0e72b9299d4f2e31150a6d86 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 8 Oct 2025 10:20:48 -0400
Subject: [PATCH] [Refactor] Refactor FP8 & INT8 Quant Folder inside `w8a8` (#25293)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: nicole-lihui
Signed-off-by: yewentao256
Signed-off-by: courage17340
Signed-off-by: DarkLight1337
Signed-off-by: Jacob Kahn
Signed-off-by: Tyler Michael Smith
Signed-off-by: Fadi Arafeh
Signed-off-by: Roger Wang
Signed-off-by: Agata Dobrzyniewicz
Signed-off-by: Isotr0py
Signed-off-by: zxw <1020938856@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: wang.yuqi
Signed-off-by: Cyrus Leung
Signed-off-by: Kunshang Ji
Signed-off-by: chenlang
Signed-off-by: youkaichao
Signed-off-by: Jonas Kuebler
Signed-off-by: jiang1.li
Signed-off-by: Russell Bryant
Signed-off-by: NickLucche
Signed-off-by: Tyler Michael Smith
Signed-off-by: AlonKejzman
Signed-off-by: Lucas Wilkinson
Signed-off-by: taohui
Signed-off-by: Tao Hui
Signed-off-by: Matthew Bonanni
Signed-off-by: Matthew Bonanni
Signed-off-by: Jee Jee Li
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Signed-off-by: Zhuohan Li
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Signed-off-by: Shu Wang.
Signed-off-by: Nick Hill
Signed-off-by: Aleksandr Malyshev
Signed-off-by: Eugene Khvedchenia
Signed-off-by: Eugene Khvedchenya
Signed-off-by: yiting.jiang
Signed-off-by: Andrew Sansom
Signed-off-by: xaguilar
Signed-off-by: Iceber Gu
Signed-off-by: Tao He
Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: Sage Moore
Signed-off-by: 许文卿
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: chaunceyjiang
Signed-off-by: Seiji Eicher
Signed-off-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Signed-off-by: zjy0516
Signed-off-by: Kosseila (CloudThrill)
Signed-off-by: frankwang28
Signed-off-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Signed-off-by: mgoin
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: zixi-qi
Signed-off-by: Bram Wasti
Signed-off-by: Naman Lalit
Signed-off-by: Chenheli Hua
Signed-off-by: Junhong
Signed-off-by: Junhong Liu <98734602+LJH-LBJ@users.noreply.github.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Signed-off-by: rentianyue-jk
Signed-off-by: Peter Pan
Signed-off-by: Patrick Toulme
Signed-off-by: Patrick Toulme
Signed-off-by: Jiangyun Zhu
Signed-off-by: Clayton Coleman
Signed-off-by: Jialin Ouyang
Signed-off-by: Jialin Ouyang
Signed-off-by: Weiliang Liu
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: liuye.hj
Signed-off-by: Juechen Liu
Signed-off-by: simon-mo
Signed-off-by: Robert Shaw
Signed-off-by: Thomas Parnell
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: yingjun-mou
Signed-off-by: zhoukz
Signed-off-by: Chenxi Yang
Signed-off-by: Rahul Tuli
Signed-off-by: Lee Nau
Signed-off-by: adabeyta
Signed-off-by: Gregory Shtrasberg
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Signed-off-by: simondanielsson
Signed-off-by: Chen Zhang
Signed-off-by: Yongye Zhu
Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Signed-off-by: Lucia Fang
Signed-off-by: a120092009
Signed-off-by: sergiopaniego
Signed-off-by: Sergio Paniego Blanco
Signed-off-by: wangyafeng
Signed-off-by: Lehua Ding
Signed-off-by: lyd1992
Signed-off-by: ihb2032 <1355790728@qq.com>
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
Signed-off-by: anion <1005128408@qq.com>
Signed-off-by: Anion <123177548+Anionex@users.noreply.github.com>
Signed-off-by: Pavani Majety
Signed-off-by: Bill Nell
Signed-off-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Signed-off-by: Or Ozeri
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>
Signed-off-by: David Ben-David
Signed-off-by: Andrew Xia
Signed-off-by: Andrew Xia
Signed-off-by: Lu Fang
Signed-off-by: Salvatore Cena
Signed-off-by: padg9912
Signed-off-by: nadathurv
Signed-off-by: WorldExplored
Signed-off-by: wwl2755
Signed-off-by: billishyahao
Signed-off-by: Nathan Scott
Signed-off-by: Kenichi Maehashi
Signed-off-by: Johnny
Signed-off-by: johnnynunez
Signed-off-by: Johnny
Signed-off-by: Huamin Li <3ericli@gmail.com>
Signed-off-by: Hosang Yoon
Signed-off-by: Jerry Zhang
Signed-off-by: Peter Schuurman
Signed-off-by: Huy Do
Signed-off-by: leo-pony
Signed-off-by: vllmellm
Signed-off-by: Lucas Wilkinson
Signed-off-by: ElizaWszola
Signed-off-by: ElizaWszola
Signed-off-by: Luka Govedič
Signed-off-by: Luka Govedič
Signed-off-by: Michael Goin
Signed-off-by: Benjamin Chislett
Signed-off-by: tjtanaa
Signed-off-by: zhewenli
Signed-off-by: ahao-anyscale
Signed-off-by: Varun Sundar Rabindranath
Signed-off-by: huijjj
Signed-off-by: Yannick Schnider
Signed-off-by: kyt
Signed-off-by: Egor
Signed-off-by: Yang
Signed-off-by: Paul Pak
Signed-off-by: whx-sjtu <2952154980@qq.com>
Signed-off-by: Xiang Si
Signed-off-by: Aleksandr Samarin
Signed-off-by: Jun Jiang
Signed-off-by: Chendi Xue
Signed-off-by: Chendi.Xue
Signed-off-by: Nikhil Ghosh
Co-authored-by: Nicole LiHui 🥜
Co-authored-by: courage17340
Co-authored-by: Cyrus Leung
Co-authored-by: Jacob Kahn
Co-authored-by: Roger Wang
Co-authored-by: Nicole LiHui 🥜
Co-authored-by: Tyler Michael Smith
Co-authored-by: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
Co-authored-by: Isotr0py
Co-authored-by: yyzxw <34639446+yyzxw@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: wang.yuqi
Co-authored-by: Cyrus Leung
Co-authored-by: Kunshang Ji
Co-authored-by: chenlang
Co-authored-by: chenlang <10346245@zte.com.cn>
Co-authored-by: youkaichao
Co-authored-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com>
Co-authored-by: Li, Jiang
Co-authored-by: Russell Bryant
Co-authored-by: Nicolò Lucchesi
Co-authored-by: AlonKejzman
Co-authored-by: Michael Goin
Co-authored-by: Lucas Wilkinson
Co-authored-by: Tao Hui
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Matthew Bonanni
Co-authored-by: Jee Jee Li
Co-authored-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Nick Hill
Co-authored-by: Zhuohan Li
Co-authored-by: Ye (Charlotte) Qi
Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Co-authored-by: Shu Wang
Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev
Co-authored-by: Doug Lehr
Co-authored-by: Eugene Khvedchenya
Co-authored-by: yitingdc <59356937+yitingdc@users.noreply.github.com>
Co-authored-by: Andrew Sansom
Co-authored-by: xaguilar-amd
Co-authored-by: Iceber Gu
Co-authored-by: Tao He
Co-authored-by: Icey <1790571317@qq.com>
Co-authored-by: Sage Moore
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com>
Co-authored-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: RishiAstra <40644327+RishiAstra@users.noreply.github.com>
Co-authored-by: Chauncey
Co-authored-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu
Co-authored-by: Luka Govedič
Co-authored-by: 阿丹(adan) <47373076+LDLINGLINGLING@users.noreply.github.com>
Co-authored-by: liudan
Co-authored-by: liudan
Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Co-authored-by: Clouddude
Co-authored-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Co-authored-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Co-authored-by: qizixi <22851944+zixi-qi@users.noreply.github.com>
Co-authored-by: Bram Wasti
Co-authored-by: Naman Lalit
Co-authored-by: Chenheli Hua
Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Co-authored-by: Junhong
Co-authored-by: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Xiaohan Zou
Co-authored-by: rentianyue-jk
Co-authored-by: Tyler Michael Smith
Co-authored-by: Peter Pan
Co-authored-by: Patrick C. Toulme <135739773+patrick-toulme@users.noreply.github.com>
Co-authored-by: Clayton Coleman
Co-authored-by: Jialin Ouyang
Co-authored-by: Jialin Ouyang
Co-authored-by: weiliang
Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
Co-authored-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: liuye.hj
Co-authored-by: Juechen Liu
Co-authored-by: Robert Shaw
Co-authored-by: Thomas Parnell
Co-authored-by: Yingjun Mou
Co-authored-by: Zhou Jiahao
Co-authored-by: Chenxi Yang
Co-authored-by: Chenxi Yang
Co-authored-by: Rahul Tuli
Co-authored-by: Lee Nau
Co-authored-by: Adrian Abeyta
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Co-authored-by: Aaron Pham
Co-authored-by: acisseJZhong <40467976+acisseJZhong@users.noreply.github.com>
Co-authored-by: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com>
Co-authored-by: Yongye Zhu
Co-authored-by: Chen Zhang
Co-authored-by: Lucas Wilkinson
Co-authored-by: Lucia Fang
Co-authored-by: Siyuan Fu
Co-authored-by: Xiaozhu Meng
Co-authored-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Co-authored-by: a120092009 <33205509+a120092009@users.noreply.github.com>
Co-authored-by: Sergio Paniego Blanco
Co-authored-by: CSWYF3634076
Co-authored-by: Lehua Ding
Co-authored-by: Reza Barazesh <3146276+rzabarazesh@users.noreply.github.com>
Co-authored-by: ihb2032 <40718643+ihb2032@users.noreply.github.com>
Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
Co-authored-by: Anion <123177548+Anionex@users.noreply.github.com>
Co-authored-by: Pavani Majety
Co-authored-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Co-authored-by: Or Ozeri
Co-authored-by: cjackal <44624812+cjackal@users.noreply.github.com>
Co-authored-by: David Ben-David
Co-authored-by: David Ben-David
Co-authored-by: Andrew Xia
Co-authored-by: Andrew Xia
Co-authored-by: Salvatore Cena
Co-authored-by: Param
Co-authored-by: Zhewen Li
Co-authored-by: nadathurv
Co-authored-by: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com>
Co-authored-by: Wenlong Wang
Co-authored-by: billishyahao
Co-authored-by: Nathan Scott
Co-authored-by: Kenichi Maehashi <939877+kmaehashi@users.noreply.github.com>
Co-authored-by: Johnny
Co-authored-by: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>
Co-authored-by: Huamin Li <3ericli@gmail.com>
Co-authored-by: rshaw@neuralmagic.com
Co-authored-by: Hosang <156028780+hyoon1@users.noreply.github.com>
Co-authored-by: Jerry Zhang
Co-authored-by: pwschuurman
Co-authored-by: Huy Do
Co-authored-by: leo-pony
Co-authored-by: vllmellm
Co-authored-by: ElizaWszola
Co-authored-by: Luka Govedič
Co-authored-by: Benjamin Chislett
Co-authored-by: Andrew Xia
Co-authored-by: Simon Mo
Co-authored-by: TJian
Co-authored-by: ahao-anyscale
Co-authored-by: Varun Sundar Rabindranath
Co-authored-by: Varun Sundar Rabindranath
Co-authored-by: Liu-congo <1502632128@qq.com>
Co-authored-by: HUIJONG JEONG <64083281+huijjj@users.noreply.github.com>
Co-authored-by: Yannick Schnider
Co-authored-by: kyt
Co-authored-by: Egor
Co-authored-by: Yang Liu <127183760+KKSK-DON@users.noreply.github.com>
Co-authored-by: Paul Pak <52512091+paulpak58@users.noreply.github.com>
Co-authored-by: whx <56632993+whx-sjtu@users.noreply.github.com>
Co-authored-by: Xiang Si
Co-authored-by: Aleksandr Samarin
Co-authored-by: Jun Jiang
Co-authored-by: Chendi.Xue
Co-authored-by: Nikhil G
---
 CMakeLists.txt | 41 ++++++++++---------
 csrc/attention/attention_kernels.cuh | 4 +-
 csrc/cache_kernels.cu | 4 +-
 csrc/cub_helpers.h | 5 ++-
 csrc/layernorm_quant_kernels.cu | 2 +-
 csrc/quantization/activation_kernels.cu | 2 +-
 .../fused_kernels/quant_conversions.cuh | 2 +-
 .../cutlass}/Epilogues.md | 0
 .../cutlass}/c3x/cutlass_gemm_caller.cuh | 0
 .../cutlass}/c3x/scaled_mm.cuh | 0
 .../cutlass}/c3x/scaled_mm_azp_sm90_int8.cu | 0
 .../c3x/scaled_mm_blockwise_sm100_fp8.cu | 0
 ...scaled_mm_blockwise_sm100_fp8_dispatch.cuh | 0
 .../c3x/scaled_mm_blockwise_sm120_fp8.cu | 0
 ...scaled_mm_blockwise_sm120_fp8_dispatch.cuh | 0
 .../c3x/scaled_mm_blockwise_sm90_fp8.cu | 0
 .../scaled_mm_blockwise_sm90_fp8_dispatch.cuh | 0
 .../cutlass}/c3x/scaled_mm_helper.hpp | 0
 .../cutlass}/c3x/scaled_mm_kernels.hpp | 0
 .../cutlass}/c3x/scaled_mm_sm100_fp8.cu | 0
 .../c3x/scaled_mm_sm100_fp8_dispatch.cuh | 0
 .../cutlass}/c3x/scaled_mm_sm120_fp8.cu | 0
 .../c3x/scaled_mm_sm120_fp8_dispatch.cuh | 0
 .../cutlass}/c3x/scaled_mm_sm90_fp8.cu | 0
 .../c3x/scaled_mm_sm90_fp8_dispatch.cuh | 0
 .../cutlass}/c3x/scaled_mm_sm90_int8.cu | 0
 .../c3x/scaled_mm_sm90_int8_dispatch.cuh | 0
 .../moe/blockwise_scaled_group_mm_sm100.cu | 0
 .../cutlass}/moe/get_group_starts.cuh | 0
 .../cutlass}/moe/grouped_mm_c3x.cuh | 0
 .../cutlass}/moe/grouped_mm_c3x_sm100.cu | 0
 .../cutlass}/moe/grouped_mm_c3x_sm90.cu | 0
 .../cutlass}/moe/moe_data.cu | 0
 .../cutlass}/scaled_mm_c2x.cu | 0
 .../cutlass}/scaled_mm_c2x.cuh | 0
 .../cutlass}/scaled_mm_c2x_sm75_dispatch.cuh | 0
 .../cutlass}/scaled_mm_c2x_sm80_dispatch.cuh | 0
 .../scaled_mm_c2x_sm89_fp8_dispatch.cuh | 0
 .../scaled_mm_c2x_sm89_int8_dispatch.cuh | 0
 .../cutlass}/scaled_mm_c3x_sm100.cu | 0
 .../cutlass}/scaled_mm_c3x_sm120.cu | 0
 .../cutlass}/scaled_mm_c3x_sm90.cu | 0
 .../cutlass}/scaled_mm_entry.cu | 0
 .../{ => w8a8}/fp8/amd/quant_utils.cuh | 2 +-
 csrc/quantization/{ => w8a8}/fp8/common.cu | 4 +-
 csrc/quantization/{ => w8a8}/fp8/common.cuh | 0
 .../{ => w8a8}/fp8/nvidia/quant_utils.cuh | 2 +-
 .../{ => w8a8}/fp8/per_token_group_quant.cu | 10 ++---
 .../w8a8/int8/per_token_group_quant.cu | 12 ++++++
 .../int8/scaled_quant.cu} | 28 ++-----
 .../{ => w8a8}/per_token_group_quant_8bit.h | 1 -
 csrc/rocm/attention.cu | 2 +-
 csrc/rocm/skinny_gemms.cu | 2 +-
 .../quantization/kernels/scaled_mm/cutlass.py | 4 +-
 54 files changed, 60 insertions(+), 67 deletions(-)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/Epilogues.md (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/cutlass_gemm_caller.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_azp_sm90_int8.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_blockwise_sm100_fp8.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_blockwise_sm120_fp8.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_blockwise_sm90_fp8.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_helper.hpp (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_kernels.hpp (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_sm100_fp8.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_sm100_fp8_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_sm120_fp8.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_sm120_fp8_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_sm90_fp8.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_sm90_fp8_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_sm90_int8.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/c3x/scaled_mm_sm90_int8_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/moe/blockwise_scaled_group_mm_sm100.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/moe/get_group_starts.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/moe/grouped_mm_c3x.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/moe/grouped_mm_c3x_sm100.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/moe/grouped_mm_c3x_sm90.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/moe/moe_data.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/scaled_mm_c2x.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/scaled_mm_c2x.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/scaled_mm_c2x_sm75_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/scaled_mm_c2x_sm80_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/scaled_mm_c2x_sm89_fp8_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/scaled_mm_c2x_sm89_int8_dispatch.cuh (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/scaled_mm_c3x_sm100.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/scaled_mm_c3x_sm120.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/scaled_mm_c3x_sm90.cu (100%)
 rename csrc/quantization/{cutlass_w8a8 => w8a8/cutlass}/scaled_mm_entry.cu (100%)
 rename csrc/quantization/{ => w8a8}/fp8/amd/quant_utils.cuh (99%)
 rename csrc/quantization/{ => w8a8}/fp8/common.cu (99%)
 rename csrc/quantization/{ => w8a8}/fp8/common.cuh (100%)
 rename csrc/quantization/{ => w8a8}/fp8/nvidia/quant_utils.cuh (99%)
 rename csrc/quantization/{ => w8a8}/fp8/per_token_group_quant.cu (98%)
 create mode 100644 csrc/quantization/w8a8/int8/per_token_group_quant.cu
 rename csrc/quantization/{compressed_tensors/int8_quant_kernels.cu => w8a8/int8/scaled_quant.cu} (94%)
 rename csrc/quantization/{ => w8a8}/per_token_group_quant_8bit.h (84%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ebea1c42e9a..66967b655a1a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
   "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
-  "csrc/quantization/fp8/common.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  "csrc/quantization/w8a8/fp8/common.cu"
   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/gguf/gguf_kernel.cu"
   "csrc/quantization/activation_kernels.cu"
@@ -314,12 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_EXT_SRC
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/cutlass_extensions/common.cpp" - "csrc/quantization/fp8/per_token_group_quant.cu") + "csrc/quantization/w8a8/fp8/per_token_group_quant.cu" + "csrc/quantization/w8a8/int8/per_token_group_quant.cu") set_gencode_flags_for_srcs( SRCS "${VLLM_EXT_SRC}" @@ -423,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") + "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -458,9 +459,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu" + "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu" ) set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -492,9 +493,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu" + "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu" ) set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -525,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu") + set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") @@ -648,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # if it's possible to compile MoE kernels that use its output. 
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -672,7 +673,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -697,7 +698,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -720,7 +721,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh
index 57382c1ddc65..052ff168cec4 100644
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@@ -28,10 +28,10 @@

 #ifdef USE_ROCM
  #include <hip/hip_bf16.h>
-  #include "../quantization/fp8/amd/quant_utils.cuh"
+  #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
 typedef __hip_bfloat16 __nv_bfloat16;
 #else
-  #include "../quantization/fp8/nvidia/quant_utils.cuh"
+  #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
 #endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index f4b116c94f19..0aa0dc14c748 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -9,9 +9,9 @@
 #include "quantization/vectorization_utils.cuh"

 #ifdef USE_ROCM
-  #include "quantization/fp8/amd/quant_utils.cuh"
+  #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
 #else
-  #include "quantization/fp8/nvidia/quant_utils.cuh"
+  #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
 #endif

 #include <algorithm>
diff --git a/csrc/cub_helpers.h b/csrc/cub_helpers.h
index 470a63a22cab..18e4e343ad8b 100644
--- a/csrc/cub_helpers.h
+++ b/csrc/cub_helpers.h
@@ -12,6 +12,7 @@ using CubMaxOp = cub::Max;
 #endif  // CUB_VERSION
 #else
 #include <hipcub/hipcub.hpp>
-using CubAddOp = cub::Sum;
-using CubMaxOp = cub::Max;
+namespace cub = hipcub;
+using CubAddOp = hipcub::Sum;
+using CubMaxOp = hipcub::Max;
 #endif  // USE_ROCM
diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu
index 58c3d9c0981a..0fc462194fcd 100644
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@@ -6,7 +6,7 @@
  */

 #include "type_convert.cuh"
-#include "quantization/fp8/common.cuh"
+#include "quantization/w8a8/fp8/common.cuh"
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"
diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu
index b94cc9ce5086..55da79a12d89 100644
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -7,7 +7,7 @@
 #include "../cuda_compat.h"
 #include "dispatch_utils.h"

-#include "quantization/fp8/common.cuh"
+#include "quantization/w8a8/fp8/common.cuh"

 #include <cmath>
diff --git a/csrc/quantization/fused_kernels/quant_conversions.cuh b/csrc/quantization/fused_kernels/quant_conversions.cuh
index 4e6118e52e8d..2b1eb1d568e4 100644
--- a/csrc/quantization/fused_kernels/quant_conversions.cuh
+++ b/csrc/quantization/fused_kernels/quant_conversions.cuh
@@ -6,7 +6,7 @@
 #include "quantization/vectorization.cuh"

 // TODO(luka/varun):refactor common.cuh to use this file instead
-#include "quantization/fp8/common.cuh"
+#include "quantization/w8a8/fp8/common.cuh"

 namespace vllm {
diff --git a/csrc/quantization/cutlass_w8a8/Epilogues.md b/csrc/quantization/w8a8/cutlass/Epilogues.md
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/Epilogues.md
rename to csrc/quantization/w8a8/cutlass/Epilogues.md
diff --git a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh b/csrc/quantization/w8a8/cutlass/c3x/cutlass_gemm_caller.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/cutlass_gemm_caller.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu b/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
rename to csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu
diff --git a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh b/csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh
rename to csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
rename to csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x.cuh
diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu
rename to csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu
diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu
rename to csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu
diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/moe/moe_data.cu
rename to csrc/quantization/w8a8/cutlass/moe/moe_data.cu
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm75_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm75_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm80_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm80_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_int8_dispatch.cuh
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_int8_dispatch.cuh
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu
rename to csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
similarity index 100%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
rename to csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/w8a8/fp8/amd/quant_utils.cuh
similarity index 99%
rename from csrc/quantization/fp8/amd/quant_utils.cuh
rename to csrc/quantization/w8a8/fp8/amd/quant_utils.cuh
index e51a4e14e518..81f5cb83f3e1 100644
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/w8a8/fp8/amd/quant_utils.cuh
@@ -5,7 +5,7 @@
 #include <hip/hip_fp16.h>
 #include <hip/hip_bf16.h>

-#include "../../../attention/attention_dtypes.h"
+#include "../../../../attention/attention_dtypes.h"

 namespace vllm {
 #ifdef USE_ROCM
diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/w8a8/fp8/common.cu
similarity index 99%
rename from csrc/quantization/fp8/common.cu
rename to csrc/quantization/w8a8/fp8/common.cu
index 45d6d5082ce4..7a822fb8fb8a 100644
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/w8a8/fp8/common.cu
@@ -1,7 +1,7 @@
 #include "common.cuh"
 #include "dispatch_utils.h"
-#include "../../cub_helpers.h"
-#include "../vectorization_utils.cuh"
+#include "cub_helpers.h"
+#include "quantization/vectorization_utils.cuh"

 #include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/CUDAContext.h>
diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/w8a8/fp8/common.cuh
similarity index 100%
rename from csrc/quantization/fp8/common.cuh
rename to csrc/quantization/w8a8/fp8/common.cuh
diff --git a/csrc/quantization/fp8/nvidia/quant_utils.cuh b/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
similarity index 99%
rename from csrc/quantization/fp8/nvidia/quant_utils.cuh
rename to csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
index 5361a8b1a598..421e8092474b 100644
--- a/csrc/quantization/fp8/nvidia/quant_utils.cuh
+++ b/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
@@ -1,6 +1,6 @@
 #pragma once

-#include "../../../attention/attention_dtypes.h"
+#include "../../../../attention/attention_dtypes.h"
 #include <assert.h>
 #include <float.h>
 #include <stdint.h>
diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
similarity index 98%
rename from csrc/quantization/fp8/per_token_group_quant.cu
rename to csrc/quantization/w8a8/fp8/per_token_group_quant.cu
index 91d489fdef86..e3ab0676b254 100644
--- a/csrc/quantization/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
@@ -1,6 +1,6 @@
 #include <ATen/cuda/CUDAContext.h>

-#include "../per_token_group_quant_8bit.h"
+#include "quantization/w8a8/per_token_group_quant_8bit.h"

 #include <cmath>

@@ -8,9 +8,9 @@

 #include <torch/all.h>

-#include "../vectorization.cuh"
-#include "../vectorization_utils.cuh"
-#include "../../dispatch_utils.h"
+#include "quantization/vectorization.cuh"
+#include "quantization/vectorization_utils.cuh"
+#include "dispatch_utils.h"

 __device__ __forceinline__ float GroupReduceMax(float val) {
   unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff;
@@ -212,4 +212,4 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
                                double fp8_max, bool scale_ue8m0) {
   per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
                              fp8_min, fp8_max, scale_ue8m0);
-}
+}
\ No newline at end of file
diff --git a/csrc/quantization/w8a8/int8/per_token_group_quant.cu b/csrc/quantization/w8a8/int8/per_token_group_quant.cu
new file mode 100644
index 000000000000..9d808a176f53
--- /dev/null
+++ b/csrc/quantization/w8a8/int8/per_token_group_quant.cu
@@ -0,0 +1,12 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/all.h>
+
+#include "quantization/w8a8/per_token_group_quant_8bit.h"
+
+void per_token_group_quant_int8(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double int8_min, double int8_max) {
+  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
+                             int8_min, int8_max);
+}
\ No newline at end of file
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/w8a8/int8/scaled_quant.cu
similarity index 94%
rename from csrc/quantization/compressed_tensors/int8_quant_kernels.cu
rename to csrc/quantization/w8a8/int8/scaled_quant.cu
index bcfde9fbcbbe..7fe9e96bfb01 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/w8a8/int8/scaled_quant.cu
@@ -1,15 +1,11 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>

-#ifndef USE_ROCM
-  #include "../per_token_group_quant_8bit.h"
-#endif
-
 #include <cmath>

-#include "../../cub_helpers.h"
-#include "../../dispatch_utils.h"
-#include "../vectorization_utils.cuh"
+#include "dispatch_utils.h"
+#include "quantization/vectorization_utils.cuh"
+#include "cub_helpers.h"

 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
@@ -25,7 +21,6 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
   float dst = std::nearbyint(x);

   // saturate
-  // See https://github.com/pytorch/pytorch/issues/127666
   // See https://github.com/llvm/llvm-project/issues/95183
   // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@@ -84,7 +79,6 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
       static_cast<int32_t>(std::numeric_limits<int8_t>::max());

   // saturate
-  // See https://github.com/pytorch/pytorch/issues/127666
   // See https://github.com/llvm/llvm-project/issues/95183
   // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@@ -176,7 +170,6 @@ __global__ void dynamic_scaled_int8_quant_kernel(
   float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;

-  // 2. quantize
   vectorize_with_alignment<16>(
       row_in, row_out, hidden_size, tid, stride,
       [=] __device__(int8_t& dst, const scalar_t& src) {
@@ -194,7 +187,6 @@ struct MinMax {
   __host__ __device__ explicit MinMax(float v) : min(v), max(v) {}

-  // add a value to the MinMax
   __host__ __device__ MinMax& operator+=(float v) {
     min = fminf(min, v);
     max = fmaxf(max, v);
@@ -228,7 +220,6 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
   const scalar_t* row_in = input + token_idx * hidden_size;
   int8_t* row_out = output + token_idx * hidden_size;

-  // 1. calculate min & max
   MinMax thread_mm;
   vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
                                     [&] __device__(const scalar_t& src) {
@@ -261,7 +252,6 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
   const float inv_s = 1.f / scale_sh;
   const azp_t azp = azp_sh;

-  // 2. quantize
   vectorize_with_alignment<16>(
       row_in, row_out, hidden_size, tid, stride,
       [=] __device__(int8_t& dst, const scalar_t& src) {
@@ -332,14 +322,4 @@ void dynamic_scaled_int8_quant(
                               hidden_size);
         }
       });
-}
-
-#ifndef USE_ROCM
-void per_token_group_quant_int8(const torch::Tensor& input,
-                                torch::Tensor& output_q,
-                                torch::Tensor& output_s, int64_t group_size,
-                                double eps, double int8_min, double int8_max) {
-  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
-                             int8_min, int8_max);
-}
-#endif
+}
\ No newline at end of file
diff --git a/csrc/quantization/per_token_group_quant_8bit.h b/csrc/quantization/w8a8/per_token_group_quant_8bit.h
similarity index 84%
rename from csrc/quantization/per_token_group_quant_8bit.h
rename to csrc/quantization/w8a8/per_token_group_quant_8bit.h
index 537b61bc4303..25d4ecd1131a 100644
--- a/csrc/quantization/per_token_group_quant_8bit.h
+++ b/csrc/quantization/w8a8/per_token_group_quant_8bit.h
@@ -1,7 +1,6 @@
 #pragma once
 #include <torch/all.h>

-// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
 // 8-bit per-token-group quantization helper used by both FP8 and INT8
 void per_token_group_quant_8bit(const torch::Tensor& input,
                                 torch::Tensor& output_q,
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index df3208a120f1..a339c5641bb4 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -23,7 +23,7 @@
 #include <algorithm>

 #include "../attention/dtype_fp8.cuh"
-#include "../quantization/fp8/amd/quant_utils.cuh"
+#include "../quantization/w8a8/fp8/amd/quant_utils.cuh"

 // ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent
 #if !defined(HIP_FP8_TYPE_OCP)
diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu
index e4600350d3ea..2ef579a1b753 100644
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -11,7 +11,7 @@
 #include "../cuda_compat.h"
 #include "dispatch_utils.h"

-#include "quantization/fp8/common.cuh"
+#include "quantization/w8a8/fp8/common.cuh"

 #if defined(__HIPCC__) && \
     (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index f1dafdf14c7a..13dbd55c32df 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -89,8 +89,8 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
         # azp_adj is the AZP adjustment term, used to account for weights.
         # It does not depend on scales or azp, so it is the same for
         # static and dynamic quantization.
-        # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
-        # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
+        # For more details, see csrc/quantization/w8a8/cutlass/Epilogues.md
+        # https://github.com/vllm-project/vllm/blob/main/csrc/quantization/w8a8/cutlass/Epilogues.md
         if not self.config.input_symmetric:
             weight = getattr(layer, self.w_q_name)
             azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.int32)
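
For reviewers who want to sanity-check the azp_adj comment in the cutlass.py hunk above: the following is a minimal, self-contained sketch in plain PyTorch (not vLLM code and not part of the patch; all variable names are illustrative) of why the asymmetric-quantization correction term needs only the column sums of the weight matrix.

import torch

torch.manual_seed(0)
x = torch.randn(4, 8)                                    # fp32 activations
w = torch.randint(-128, 128, (8, 16), dtype=torch.int8)  # int8 weights

# Asymmetric per-tensor activation quantization: x_q = round(x / s) + azp,
# so x is recovered (approximately) as s * (x_q - azp).
s = (x.max() - x.min()) / 255.0
azp = torch.round(-x.min() / s).long()
x_q = torch.round(x / s).long() + azp

# Reference: dequantize the activations, then matmul against the weights.
ref = s * ((x_q - azp).float() @ w.float())

# Epilogue form: one integer GEMM plus a correction that depends only on the
# column sums of the weights -- the azp_adj term computed in the diff above.
azp_adj = w.long().sum(dim=0, keepdim=True)
out = s * (x_q @ w.long() - azp * azp_adj).float()

assert torch.allclose(ref, out)  # identical up to float rounding

Because azp_adj depends on neither the scales nor the zero point itself, it can be precomputed once per weight tensor and reused for both static and dynamic quantization, which is exactly the point the comment in cutlass.py makes.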