mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-08 04:47:03 +08:00
Signed-off-by: Nick Hill <nhill@redhat.com> Signed-off-by: Lucas Kabela <lucaskabela@meta.com> Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Signed-off-by: Andrew Sansom <andrew@protopia.ai> Signed-off-by: Boyuan Feng <boyuan@meta.com> Signed-off-by: Boyuan Feng <fby.1994@gmail.com> Signed-off-by: boyuanfeng <boyuan@meta.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: JartX <sagformas@epdcenter.es> Signed-off-by: Chendi Xue <Chendi.Xue@intel.com> Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> Signed-off-by: Manoel Marques <manoel.marques@ibm.com> Signed-off-by: Manoel Marques <manoelmrqs@gmail.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: pengdrumli <pengdrumli@tencent.com> Signed-off-by: windsonsea <haifeng.yao@daocloud.io> Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: Huamin Li <3ericli@gmail.com> Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com> Signed-off-by: Rahul Tuli <rtuli@redhat.com> Signed-off-by: Yang <lymailforjob@gmail.com> Signed-off-by: Debolina Roy <debroy@redhat.com> Signed-off-by: David Chen <530634352@qq.com> Signed-off-by: wangzi <3220100013@zju.edu.cn> Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com> Signed-off-by: NickLucche <nlucches@redhat.com> Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com> Signed-off-by: Sara Kokkila Schumacher <saraks@ibm.com> Signed-off-by: Csrayz <jover@cmbchina.com> Signed-off-by: ivyilike <pww123@cmbchina.com> Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com> Signed-off-by: Bowen Wang <abmfy@icloud.com> Signed-off-by: qqma <qqma@amazon.com> Signed-off-by: ElizaWszola 
<ewszola@redhat.com> Signed-off-by: Lu Fang <fanglu@fb.com> Signed-off-by: Zhuohan Li <zhuohan123@gmail.com> Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: luka <lgovedic@redhat.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Signed-off-by: Or Ozeri <oro@il.ibm.com> Signed-off-by: Johnny Yang <johnnyyang@google.com> Signed-off-by: Alec Solder <alecs@fb.com> Signed-off-by: Alec S <10566873+alecsolder@users.noreply.github.com> Signed-off-by: Russell Bryant <rbryant@redhat.com> Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> Signed-off-by: Alexander Matveev <amatveev@redhat.com> Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com> Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Signed-off-by: Ming Yang <minos.future@gmail.com> Signed-off-by: Zhikaiiii <1658973216@qq.com> Signed-off-by: Andreas Hartel <andreas.hartel@aleph-alpha.com> Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> Signed-off-by: wuxibin <wuxibin@bytedance.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Signed-off-by: Peter Pan <Peter.Pan@daocloud.io> Signed-off-by: Peter Pan <peter.pan@daocloud.io> Signed-off-by: Nicolò Lucchesi<nicolo.lucchesi@gmail.com> Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Signed-off-by: Sage Moore <sage@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com> Signed-off-by: Bill Nell <bnell@redhat.com> Signed-off-by: Shreeasish Kumar <shreeasish@rivosinc.com> Signed-off-by: Weida Hong <wdhongtw@google.com> Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> 
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com> Signed-off-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> Signed-off-by: Amir Samani <asamani@nvidia.com> Signed-off-by: ElizaWszola <elizaw.9289@gmail.com> Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Signed-off-by: ilmarkov <markovilya197@gmail.com> Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com> Signed-off-by: rouchenzi <ruochenwen@gmail.com> Signed-off-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com> Signed-off-by: Andrew Xia <axia@meta.com> Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com> Signed-off-by: Corey Lowman <clowman1993@gmail.com> Signed-off-by: jpvillam <jpvillam@amd.com> Signed-off-by: dougbtv <dosmith@redhat.com> Signed-off-by: Chenxi Yang <cxyang@fb.com> Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Signed-off-by: ahao-anyscale <ahao@anyscale.com> Signed-off-by: Yan Lu <luyan@nvidia.com> Signed-off-by: baxingpiaochong <771405853@qq.com> Signed-off-by: Kyle Sayers <kylesayrs@gmail.com> Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com> Signed-off-by: Yong Hoon Shin <yhshin@meta.com> Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai> Signed-off-by: Benjamin Chislett <bchislett@nvidia.com> Signed-off-by: Ben Browning <bbrownin@redhat.com> Signed-off-by: Chengji Yao <chengjiyao@google.com> Signed-off-by: jiang1.li <jiang1.li@intel.com> Signed-off-by: Jackmin801 <ongjackm@gmail.com> Signed-off-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com> Signed-off-by: taohui <taohui3@gmail.com> Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io> Signed-off-by: Shu Wang <shuw@nvidia.com> Signed-off-by: Shu Wang. 
<shuw@nvidia.com> Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Signed-off-by: Duncan Moss <djm.moss@gmail.com> Signed-off-by: Shiyan Deng <dsy842974287@meta.com> Signed-off-by: Wei Wei <wwei6@meta.com> Signed-off-by: Saman Keon <samanamp@outlook.com> Signed-off-by: yangxurui <yangxurui@meituan.com> Signed-off-by: nicole-lihui <nicole.li@daocloud.io> Signed-off-by: courage17340 <courage17340@163.com> Signed-off-by: Jacob Kahn <jacobkahn1@gmail.com> Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com> Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai> Signed-off-by: zxw <1020938856@qq.com> Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Signed-off-by: chenlang <chen.lang5@zte.com.cn> Signed-off-by: Jonas Kuebler <kuebj@amazon.com> Signed-off-by: AlonKejzman <alonkeizman@gmail.com> Signed-off-by: Tao Hui <taohui3@gmail.com> Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com> Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Signed-off-by: Aleksandr Malyshev <maleksan@amd.com> Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com> Signed-off-by: Eugene Khvedchenya <ekhvedchenya@gmail.com> Signed-off-by: yiting.jiang <yiting.jiang@daocloud.io> Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com> Signed-off-by: Iceber Gu <caiwei95@hotmail.com> Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com> Signed-off-by: Icey <1790571317@qq.com> Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com> Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: Nick Hill <nhill@redhat.com> Co-authored-by: Lucas Kabela <lucasakabela@gmail.com> Co-authored-by: Maximilien de Bayser <mbayser@br.ibm.com> Co-authored-by: Andrew Sansom <andrew@protopia.ai> Co-authored-by: Boyuan Feng <boyuan@meta.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> 
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: JartX <sagformas@epdcenter.es> Co-authored-by: Chendi.Xue <chendi.xue@intel.com> Co-authored-by: Chauncey <chaunceyjiang@gmail.com> Co-authored-by: xin.li <xin.li@daocloud.io> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Wenlong Wang <wangwenlong2755@gmail.com> Co-authored-by: Manoel Marques <manoelmrqs@gmail.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: lirong <56789630+lirong-lirong@users.noreply.github.com> Co-authored-by: Michael Yao <haifeng.yao@daocloud.io> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: Huamin Li <3ericli@gmail.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com> Co-authored-by: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com> Co-authored-by: Rahul Tuli <rtuli@redhat.com> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Yang Liu <127183760+KKSK-DON@users.noreply.github.com> Co-authored-by: Deboleina <debroy@redhat.com> Co-authored-by: yinz-aizip <yinz@aizip.ai> Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Co-authored-by: wangzi <3220100013@zju.edu.cn> Co-authored-by: Eldar Kurtić <8884008+eldarkurtic@users.noreply.github.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com> Co-authored-by: Yizhou <136800916+yiz-liu@users.noreply.github.com> Co-authored-by: Sara-KS <50249410+Sara-KS@users.noreply.github.com> Co-authored-by: Csrayz <jover@cmbchina.com> Co-authored-by: ivyilike <pww123@cmbchina.com> Co-authored-by: Burkhard Ringlein <ngl@zurich.ibm.com> Co-authored-by: Bowen Wang <abmfy@icloud.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Daisy-Ma-coder 
<daisy.ma.0117@gmail.com> Co-authored-by: qqma <qqma@amazon.com> Co-authored-by: ElizaWszola <ewszola@redhat.com> Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> Co-authored-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Or Ozeri <oro@il.ibm.com> Co-authored-by: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Co-authored-by: Chengji Yao <chengjiyao@google.com> Co-authored-by: Alec S <10566873+alecsolder@users.noreply.github.com> Co-authored-by: Alec Solder <alecs@fb.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Matthew Bonanni <mbonanni@redhat.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Chris Bamford <chrisbam4d@gmail.com> Co-authored-by: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com> Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com> Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com> Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Co-authored-by: Ming Yang <yming@meta.com> Co-authored-by: Zhikaiiii <55917203+Zhikaiiii@users.noreply.github.com> Co-authored-by: Andreas Hartel <andreas@hartel.me> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com> Co-authored-by: Joel <wuxibin89@163.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Mark McLoughlin <markmc@redhat.com> Co-authored-by: Peter Pan <peter.pan@daocloud.io> Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com> Co-authored-by: Fanli Lin <fanli.lin@intel.com> Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Lucas Wilkinson 
<LucasWilkinson@users.noreply.github.com> Co-authored-by: Sage Moore <sage@neuralmagic.com> Co-authored-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: bnellnm <49004751+bnellnm@users.noreply.github.com> Co-authored-by: rivos-shreeasish <shreeasish@rivosinc.com> Co-authored-by: Chih-Chieh Yang <chih.chieh.yang@ibm.com> Co-authored-by: Weida Hong <wdhongtw@gmail.com> Co-authored-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Co-authored-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> Co-authored-by: Amir Samani <samani@ualberta.ca> Co-authored-by: Luka Govedič <lgovedic@redhat.com> Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Co-authored-by: Ilya Markov <markovilya197@gmail.com> Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com> Co-authored-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com> Co-authored-by: Andrew Xia <axia@meta.com> Co-authored-by: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com> Co-authored-by: Corey Lowman <clowman1993@gmail.com> Co-authored-by: Juan Villamizar <100237675+jpvillam-amd@users.noreply.github.com> Co-authored-by: jpvillam <jpvillam@amd.com> Co-authored-by: Doug Smith <dosmith@redhat.com> Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu> Co-authored-by: Chenxi Yang <cxyang@fb.com> Co-authored-by: ahao-anyscale <ahao@anyscale.com> Co-authored-by: 0xNullPath <luyanfcp@foxmail.com> Co-authored-by: baxingpiaochong <771405853@qq.com> Co-authored-by: Benjamin Chislett <bchislett@nvidia.com> Co-authored-by: Kyle Sayers <kylesayrs@gmail.com> Co-authored-by: Nikhil Gupta <nikhil.gupta2@arm.com> Co-authored-by: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Co-authored-by: lhsjohn <huashuoli@tencent.com> Co-authored-by: Ben Browning <bbrownin@redhat.com> Co-authored-by: Li, Jiang <jiang1.li@intel.com> Co-authored-by: Jackmin801 
<56836461+Jackmin801@users.noreply.github.com> Co-authored-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com> Co-authored-by: Tao Hui <taohui3@gmail.com> Co-authored-by: rongfu.leng <rongfu.leng@daocloud.io> Co-authored-by: Shu Wang <shuw@nvidia.com> Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Duncan Moss <djm.moss@gmail.com> Co-authored-by: Shiyan Deng <dsy842974287@meta.com> Co-authored-by: Wei Wei <wwei6@meta.com> Co-authored-by: Saman A. Pour <samanamp@outlook.com> Co-authored-by: XuruiYang <530534756@qq.com> Co-authored-by: yangxurui <yangxurui@meituan.com> Co-authored-by: Nicole LiHui 🥜 <nicolelihui@outlook.com> Co-authored-by: courage17340 <courage17340@users.noreply.github.com> Co-authored-by: Jacob Kahn <jacobkahn1@gmail.com> Co-authored-by: Nicole LiHui 🥜 <nicole.li@daocloud.io> Co-authored-by: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com> Co-authored-by: yyzxw <34639446+yyzxw@users.noreply.github.com> Co-authored-by: wang.yuqi <noooop@126.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: chenlang <chen.lang5@zte.com.cn> Co-authored-by: chenlang <10346245@zte.com.cn> Co-authored-by: AlonKejzman <alonkeizman@gmail.com> Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com> Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Co-authored-by: Aleksandr Malyshev <maleksan@amd.com> Co-authored-by: Doug Lehr <douglehr@amd.com> Co-authored-by: Eugene Khvedchenya <ekhvedchenya@gmail.com> Co-authored-by: yitingdc <59356937+yitingdc@users.noreply.github.com> Co-authored-by: xaguilar-amd <xavier.aguilarfruto@amd.com> Co-authored-by: Iceber Gu <caiwei95@hotmail.com> Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com> Co-authored-by: Icey <1790571317@qq.com> Co-authored-by: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com> Co-authored-by: Chih-Chieh 
Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: RishiAstra <40644327+RishiAstra@users.noreply.github.com>
642 lines
22 KiB
Python
642 lines
22 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
from dataclasses import dataclass
|
|
from typing import Any, Optional, Union
|
|
|
|
import torch
|
|
|
|
import vllm._custom_ops as ops
|
|
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
|
from tests.kernels.moe.utils import make_test_weights, per_token_cast_to_fp8
|
|
from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX,
|
|
FLOAT8_E4M3_MAX,
|
|
dequantize_nvfp4_to_dtype)
|
|
from tests.kernels.utils import torch_experts
|
|
from vllm.config import VllmConfig
|
|
from vllm.distributed import get_dp_group, get_tensor_model_parallel_world_size
|
|
from vllm.forward_context import set_forward_context
|
|
from vllm.model_executor.layers.fused_moe.config import (
|
|
FusedMoEConfig, FusedMoEParallelConfig, FusedMoEQuantConfig)
|
|
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
|
|
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
|
|
|
|
from .mk_objects import (TestMoEQuantConfig, expert_info, make_fused_experts,
|
|
make_prepare_finalize, prepare_finalize_info)
|
|
from .parallel_utils import ProcessGroupInfo
|
|
|
|
|
|
def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
|
|
if t is None:
|
|
return f"{name} : None"
|
|
else:
|
|
return f"{name} : {t.shape} {t.dtype} {t.device}"
|
|
|
|
|
|
@dataclass
class Config:
    """Parameters describing one modular-kernel MoE test case.

    ``Ms`` and ``topks`` may be lists (a sweep) or a single int; the ``M`` and
    ``topk`` properties assert that a single int has been selected.
    """
    # Token count(s); see the ``M`` property.
    Ms: Union[list[int], int]
    K: int
    N: int
    # Total number of experts across all ranks (see ``num_local_experts``).
    E: int
    # Experts selected per token; see the ``topk`` property.
    topks: Union[list[int], int]
    dtype: torch.dtype
    # ``None`` is replaced by an unquantized TestMoEQuantConfig in
    # ``__post_init__``.
    quant_config: Optional[TestMoEQuantConfig]

    # These two fields hold the implementation *classes*, not instances:
    # ``describe`` reads ``__name__`` from them and they are used as lookup
    # keys for ``prepare_finalize_info`` / ``expert_info``.
    prepare_finalize_type: type[mk.FusedMoEPrepareAndFinalize]
    fused_experts_type: type[mk.FusedMoEPermuteExpertsUnpermute]

    # When set, exported as VLLM_FUSED_MOE_CHUNK_SIZE by ``make_env_data``.
    fused_moe_chunk_size: Optional[int]
    world_size: int

    torch_trace_dir_path: Optional[str] = None

    def __post_init__(self):
        """Substitute a default quant config when none was supplied."""
        if self.quant_config is None:
            self.quant_config = TestMoEQuantConfig(None, False, False, None)

    def describe(self) -> str:
        """Return a multi-line, human-readable dump of this configuration."""
        lines = [
            "== Config:",
            f" world_size={self.world_size}",
            f" PF={self.prepare_finalize_type.__name__}",
            f" FE={self.fused_experts_type.__name__}",
            f" E={self.E}",
            f" Ms={self.Ms}",
            f" N={self.N}",
            f" K={self.K}",
            f" topk={self.topks}",
            f" dtype={self.dtype}",
            f" fused_moe_chunk_size={self.fused_moe_chunk_size}",
            " Quant:",
        ]
        if self.quant_config is not None:
            lines += [
                f" q_dtype={self.quant_dtype}",
                f" q_block_shape={self.quant_block_shape}",
                f" q_per_out_ch_quant={self.is_per_out_ch_quant}",
                f" q_per_act_token={self.is_per_act_token_quant}",
            ]
        else:
            lines.append(" quant=None")
        # Every entry is newline-terminated, including the last one.
        return "".join(f"{line}\n" for line in lines)

    @property
    def M(self) -> int:
        """The single token count; asserts ``Ms`` is an int, not a list."""
        assert isinstance(self.Ms, int)
        return self.Ms

    @property
    def quant_dtype(self) -> Union[torch.dtype, str, None]:
        """Quantization dtype taken from ``quant_config``."""
        assert self.quant_config is not None
        return self.quant_config.quant_dtype

    @property
    def is_per_act_token_quant(self) -> bool:
        """``per_act_token_quant`` flag taken from ``quant_config``."""
        assert self.quant_config is not None
        return self.quant_config.per_act_token_quant

    @property
    def is_per_tensor_act_quant(self) -> bool:
        """True when neither per-token nor block quantization is selected."""
        return (not self.is_per_act_token_quant
                and self.quant_block_shape is None)

    @property
    def is_per_out_ch_quant(self) -> bool:
        """``per_out_ch_quant`` flag taken from ``quant_config``."""
        assert self.quant_config is not None
        return self.quant_config.per_out_ch_quant

    @property
    def quant_block_shape(self) -> Optional[list[int]]:
        """``block_shape`` taken from ``quant_config`` (``None`` if unset)."""
        assert self.quant_config is not None
        return self.quant_config.block_shape

    @property
    def topk(self) -> int:
        """The single top-k value; asserts ``topks`` is an int, not a list."""
        assert isinstance(self.topks, int)
        return self.topks

    @property
    def num_local_experts(self) -> int:
        """Experts per rank: ``E`` floor-divided by ``world_size``."""
        return self.E // self.world_size

    def make_env_data(self) -> tuple[VllmConfig, dict[Any, Any]]:
        """
        make env data for vllm launch.

        Returns a fresh ``VllmConfig`` with data parallelism set to
        ``world_size`` and expert parallelism enabled, plus a dict of
        environment variables derived from this configuration.
        """
        vllm_config = VllmConfig()
        vllm_config.parallel_config.data_parallel_size = self.world_size
        vllm_config.parallel_config.enable_expert_parallel = True

        env_dict = {
            "VLLM_USE_DEEP_GEMM": str(int(self.needs_deep_gemm())),
        }

        backend = self.all2all_backend()
        if backend is not None:
            env_dict["VLLM_ALL2ALL_BACKEND"] = backend

        if self.fused_moe_chunk_size is not None:
            env_dict["VLLM_FUSED_MOE_CHUNK_SIZE"] = str(
                self.fused_moe_chunk_size)

        return vllm_config, env_dict

    def is_fp8_block_quantized(self) -> bool:
        """True for float8_e4m3fn quantization with a block shape set."""
        return (self.quant_dtype == torch.float8_e4m3fn
                and self.quant_block_shape is not None)

    def is_batched_prepare_finalize(self) -> bool:
        """True when the prepare/finalize class uses BatchedExperts format."""
        info = prepare_finalize_info(self.prepare_finalize_type)
        return (mk.FusedMoEActivationFormat.BatchedExperts ==
                info.activation_format)

    def is_batched_fused_experts(self) -> bool:
        """True when the fused-experts class uses BatchedExperts format."""
        info = expert_info(self.fused_experts_type)
        return (mk.FusedMoEActivationFormat.BatchedExperts ==
                info.activation_format)

    def is_standard_fused_experts(self) -> bool:
        """True when the fused-experts class uses the Standard format."""
        info = expert_info(self.fused_experts_type)
        return mk.FusedMoEActivationFormat.Standard == info.activation_format

    def fe_supported_types(self):
        """``supported_dtypes`` registered for the fused-experts class."""
        return expert_info(self.fused_experts_type).supported_dtypes

    def pf_supported_types(self):
        """``supported_dtypes`` registered for the prepare/finalize class."""
        return prepare_finalize_info(
            self.prepare_finalize_type).supported_dtypes

    def is_block_quant_supported(self):
        """``blocked_quantization_support`` of the fused-experts class."""
        return expert_info(
            self.fused_experts_type).blocked_quantization_support

    def is_fe_supports_chunking(self):
        """``supports_chunking`` of the fused-experts class."""
        return expert_info(self.fused_experts_type).supports_chunking

    def supports_expert_map(self):
        """``supports_expert_map`` of the fused-experts class."""
        return expert_info(self.fused_experts_type).supports_expert_map

    def supports_apply_weight_on_input(self):
        """``supports_apply_weight_on_input`` of the prepare/finalize class."""
        return prepare_finalize_info(
            self.prepare_finalize_type).supports_apply_weight_on_input

    def needs_deep_gemm(self):
        """``needs_deep_gemm`` of the fused-experts class."""
        return expert_info(self.fused_experts_type).needs_deep_gemm

    def needs_pplx(self) -> bool:
        """True when the prepare/finalize backend is ``"pplx"``."""
        info = prepare_finalize_info(self.prepare_finalize_type)
        return info.backend == "pplx"

    def needs_deep_ep(self) -> bool:
        """True when the prepare/finalize backend is one of the DeepEP ones."""
        info = prepare_finalize_info(self.prepare_finalize_type)
        return (info.backend == "deepep_high_throughput"
                or info.backend == "deepep_low_latency")

    def all2all_backend(self):
        """Backend name registered for the prepare/finalize class."""
        return prepare_finalize_info(self.prepare_finalize_type).backend

    def is_valid(self) -> bool:
        """Return True when this combination of settings can be run.

        Checks, in order: activation-format compatibility between the
        prepare/finalize and fused-experts classes, chunking support,
        quantization sanity, dtype support, block-quantization support,
        the DeepGEMM block-quantization requirement, and availability of the
        optional dependencies.
        """
        # Check prepare-finalize and fused-experts compatibility
        if self.is_batched_prepare_finalize():
            if not self.is_batched_fused_experts():
                return False
        else:
            if not self.is_standard_fused_experts():
                return False

        use_chunking = self.fused_moe_chunk_size is not None
        if use_chunking and not self.is_fe_supports_chunking():
            return False

        # Check quantization sanity: at most one activation-quantization
        # scheme may be selected.
        num_schemes = (int(self.is_per_act_token_quant) +
                       int(self.is_per_tensor_act_quant) +
                       int(self.quant_block_shape is not None))
        if num_schemes > 1:
            # invalid quant config
            return False

        # check type support: the unquantized dtype is checked when there is
        # no quantization, otherwise the quantized dtype is checked.
        checked_dtype = (self.dtype
                         if self.quant_dtype is None else self.quant_dtype)
        if (checked_dtype not in self.pf_supported_types()
                or checked_dtype not in self.fe_supported_types()):
            return False

        # Check block quantization support
        is_block_quantized = self.quant_block_shape is not None
        if is_block_quantized and self.quant_dtype is None:
            return False
        if is_block_quantized and not self.is_block_quant_supported():
            return False

        # deep_gemm only works with block-quantized
        if self.needs_deep_gemm() and not is_block_quantized:
            return False

        # Check dependencies (turn into asserts?)
        if self.needs_deep_ep() and not has_deep_ep():
            return False
        if self.needs_deep_gemm() and not has_deep_gemm():
            return False
        if self.needs_pplx() and not has_pplx():  # noqa: SIM103
            return False

        return True
|
|
|
|
|
|
@dataclass
class WeightTensors:
    """Expert weights together with their optional quantization scales."""
    w1: torch.Tensor
    w2: torch.Tensor
    w1_scale: Optional[torch.Tensor]
    w2_scale: Optional[torch.Tensor]
    # Optional per-expert tensors; sliced along dim 0 in ``slice_weights``.
    w1_gs: Optional[torch.Tensor] = None
    w2_gs: Optional[torch.Tensor] = None

    def describe(self):
        """Return a multi-line summary of every tensor held by this object."""
        labelled = (
            (self.w1, "w1"),
            (self.w2, "w2"),
            (self.w1_scale, "w1_scale"),
            (self.w2_scale, "w2_scale"),
            (self.w1_gs, "w1_gs"),
            (self.w2_gs, "w2_gs"),
        )
        rows = [
            f' - {_describe_tensor(tensor, label)} \n'
            for tensor, label in labelled
        ]
        return "== Weight Tensors: \n" + "".join(rows)

    def is_quantized(self) -> bool:
        """True when ``w1`` is stored as float8_e4m3fn, uint8 or int8."""
        # or w1_scale is not None?
        quantized_dtypes = (torch.float8_e4m3fn, torch.uint8, torch.int8)
        return self.w1.dtype in quantized_dtypes

    def to_current_device(self):
        """Move every tensor, in place, to the current CUDA device."""
        target = torch.cuda.current_device()

        # The weights are mandatory and are always moved.
        self.w1 = self.w1.to(device=target)
        self.w2 = self.w2.to(device=target)

        # Scales and global scales are only moved when present.
        for attr in ("w1_scale", "w2_scale", "w1_gs", "w2_gs"):
            tensor = getattr(self, attr)
            if tensor is not None:
                setattr(self, attr, tensor.to(device=target))

    def slice_weights(self, rank: int,
                      num_local_experts: int) -> "WeightTensors":
        """Return the slice of experts owned by ``rank``.

        Experts ``[rank * num_local_experts, (rank + 1) * num_local_experts)``
        are selected along dim 0 of every tensor that is present.
        """
        first = rank * num_local_experts
        local = slice(first, first + num_local_experts)

        w1_scale = None
        if self.w1_scale is not None:
            w1_scale = self.w1_scale[local, :, :]

        w2_scale = None
        if self.w2_scale is not None:
            w2_scale = self.w2_scale[local, :, :]

        w1_gs = None if self.w1_gs is None else self.w1_gs[local]
        w2_gs = None if self.w2_gs is None else self.w2_gs[local]

        return WeightTensors(self.w1[local, :, :], self.w2[local, :, :],
                             w1_scale, w2_scale, w1_gs, w2_gs)

    @staticmethod
    def make(config: Config) -> "WeightTensors":
        """Create test weights for ``config`` via ``make_test_weights``."""
        w1_pack, w2_pack = make_test_weights(
            e=config.E,
            n=config.N,
            k=config.K,
            in_dtype=config.dtype,
            quant_dtype=config.quant_dtype,
            block_shape=config.quant_block_shape,
            # NOTE(review): the per-act-token flag is what gets passed as
            # per_out_ch_quant here; the original author left open whether
            # config.is_per_out_ch_quant should be used instead.
            per_out_ch_quant=config.
            is_per_act_token_quant,  # or config.is_per_out_ch_quant
        )
        # The first entry of each 4-tuple is not used here.
        _, w1, w1_scale, w1_gs = w1_pack
        _, w2, w2_scale, w2_gs = w2_pack
        return WeightTensors(w1=w1,
                             w2=w2,
                             w1_scale=w1_scale,
                             w2_scale=w2_scale,
                             w1_gs=w1_gs,
                             w2_gs=w2_gs)
|
|
|
|
|
|
@dataclass
class RankTensors:
    """Per-rank inputs for one MoE invocation."""
    hidden_states: torch.Tensor
    hidden_states_scale: Optional[torch.Tensor]

    topk_weights: torch.Tensor
    topk_ids: torch.Tensor
    expert_map: Optional[torch.Tensor]

    def describe(self):
        """Return a multi-line summary of every tensor held by this object."""
        labelled = (
            (self.hidden_states, "HS"),
            (self.hidden_states_scale, "HS_scale"),
            (self.topk_weights, "topk_weights"),
            (self.topk_ids, "topk_ids"),
            (self.expert_map, "expert_map"),
        )
        rows = [
            f' - {_describe_tensor(tensor, label)} \n'
            for tensor, label in labelled
        ]
        return "== Rank Tensors: \n" + "".join(rows)

    @staticmethod
    def make_hidden_states(
            config: Config) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Return hidden_states and, for per-tensor quantization only, the
        activation scale (``None`` in every other case).
        """
        num_tokens, hidden_dim, out_dtype = config.M, config.K, config.dtype
        raw = torch.randn((num_tokens, hidden_dim),
                          device=torch.cuda.current_device(),
                          dtype=out_dtype)
        a = raw / 15.0

        if config.quant_dtype is None:
            return a, None

        # We dequant and use that as hidden_states so the tests are stable.
        # Quantizing and dequantizing yield slightly different results
        # depending on the hardware. Here we quantize and dequantize first,
        # so further quantize/dequantize round trips yield the same values.
        if config.is_per_tensor_act_quant:
            a_q, a_scales = ops.scaled_fp8_quant(
                a, use_per_token_if_dynamic=False)
            round_tripped = a_q.float().mul(a_scales).to(out_dtype)
            return round_tripped, a_scales

        if config.is_per_act_token_quant:
            a_q, a_scales = ops.scaled_fp8_quant(
                a, use_per_token_if_dynamic=True)
            round_tripped = a_q.float().mul(a_scales).to(out_dtype)
            return round_tripped, None

        # Remaining case: block quantization along the hidden dimension.
        assert config.quant_block_shape is not None
        block_k = config.quant_block_shape[1]
        a_q, a_scales = per_token_cast_to_fp8(a, block_size=block_k)
        # Apply one scale per block of ``block_k`` values, then restore the
        # (num_tokens, hidden_dim) shape.
        blocks = a_q.float().view((-1, block_k))
        scaled = blocks.mul(a_scales.view(-1, 1))
        return scaled.view(num_tokens, hidden_dim).to(out_dtype), None

    @staticmethod
    def make(config: Config, pgi: ProcessGroupInfo):
        """Build the input tensors for the rank described by ``pgi``."""
        out_dtype = config.dtype
        topk, num_tokens = config.topk, config.M
        hidden_states, hidden_states_scale = RankTensors.make_hidden_states(
            config)

        num_local_experts = config.num_local_experts
        global_num_experts = config.E

        router_logits = torch.randn((num_tokens, global_num_experts),
                                    device="cuda",
                                    dtype=out_dtype)
        topk_weights, topk_ids, _ = fused_topk(hidden_states, router_logits,
                                               topk, False)

        # distribute topk_ids evenly: each token's ids are overwritten with
        # the first ``topk`` entries of a fresh random permutation.
        for token in range(num_tokens):
            topk_ids[token] = torch.randperm(config.E)[:topk]
        topk_ids = topk_ids.to(device=torch.cuda.current_device())

        expert_map = None
        if config.world_size > 1 and config.supports_expert_map():
            # -1 everywhere except this rank's experts, which are numbered
            # 0..num_local_experts-1.
            first = pgi.rank * num_local_experts
            last = first + num_local_experts
            expert_map = torch.full((global_num_experts, ),
                                    fill_value=-1,
                                    dtype=torch.int32)
            expert_map[first:last] = torch.arange(num_local_experts,
                                                  dtype=torch.int32)
            expert_map = expert_map.to(device=torch.cuda.current_device(),
                                       dtype=torch.int32)

        return RankTensors(
            hidden_states=hidden_states,
            hidden_states_scale=hidden_states_scale,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            expert_map=expert_map,
        )
|
|
|
|
|
|
def reference_moe_impl(config: Config, weights: WeightTensors,
                       rank_tensors: RankTensors) -> torch.Tensor:
    """Compute the reference MoE output with ``torch_experts``.

    For ``quant_dtype == "nvfp4"`` the activations are quantized to fp4 and
    dequantized again, the weights are dequantized expert by expert, and
    ``torch_experts`` is called without any quantization arguments. For every
    other quant dtype the tensors and quantization settings are forwarded
    unchanged.

    ``expert_map`` is always passed as ``None`` and ``global_num_experts`` is
    ``config.E``.
    """
    if config.quant_dtype == "nvfp4":
        fp4_block_size = 16
        out_dtype = config.dtype

        w1_packed = weights.w1
        w1_block_scale = weights.w1_scale
        w1_global_scale = weights.w1_gs

        w2_packed = weights.w2
        w2_block_scale = weights.w2_scale
        w2_global_scale = weights.w2_gs

        fp4_range = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX
        act_max = torch.amax(rank_tensors.hidden_states.flatten(), dim=-1)
        act_global_scale = (fp4_range / act_max).to(torch.float32)

        assert w1_global_scale is not None
        assert w2_global_scale is not None
        assert w1_block_scale is not None
        assert w2_block_scale is not None

        # Block-scale dims 1 and 2 must be multiples of 128 and 4.
        assert w1_block_scale.shape[1] % 128 == 0
        assert w1_block_scale.shape[2] % 4 == 0
        assert w2_block_scale.shape[1] % 128 == 0
        assert w2_block_scale.shape[2] % 4 == 0

        def _dequant(packed, block_scale, global_scale, device):
            return dequantize_nvfp4_to_dtype(packed,
                                             block_scale,
                                             global_scale,
                                             dtype=out_dtype,
                                             device=device,
                                             block_size=fp4_block_size)

        # Round-trip the activations through fp4.
        a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(
            rank_tensors.hidden_states, act_global_scale)
        a = _dequant(a_fp4, a_scale_interleaved, act_global_scale,
                     a_fp4.device)

        num_experts = w1_packed.shape[0]
        n = w1_packed.shape[1] // 2
        k = w2_packed.shape[1]

        w1 = torch.zeros((num_experts, 2 * n, k),
                         device="cuda",
                         dtype=out_dtype)
        w2 = torch.zeros((num_experts, k, n), device="cuda", dtype=out_dtype)

        for expert in range(num_experts):
            w1[expert] = _dequant(w1_packed[expert], w1_block_scale[expert],
                                  w1_global_scale[expert], w1_packed.device)
            w2[expert] = _dequant(w2_packed[expert], w2_block_scale[expert],
                                  w2_global_scale[expert], w2_packed.device)

        # Everything is dequantized now, so no quantization info is passed on.
        a_scale = w1_scale = w2_scale = None
        quant_dtype = None
        per_act_token_quant = False
        block_shape = None
    else:
        a = rank_tensors.hidden_states
        a_scale = rank_tensors.hidden_states_scale
        w1, w1_scale = weights.w1, weights.w1_scale
        w2, w2_scale = weights.w2, weights.w2_scale
        quant_dtype = config.quant_dtype
        per_act_token_quant = config.is_per_act_token_quant
        block_shape = config.quant_block_shape

    weights_on_input = (config.topk == 1
                        and config.supports_apply_weight_on_input())

    return torch_experts(a=a,
                         w1=w1,
                         w2=w2,
                         topk_weight=rank_tensors.topk_weights,
                         topk_ids=rank_tensors.topk_ids,
                         global_num_experts=config.E,
                         expert_map=None,
                         w1_scale=w1_scale,
                         w2_scale=w2_scale,
                         a1_scale=a_scale,
                         quant_dtype=quant_dtype,
                         per_act_token_quant=per_act_token_quant,
                         block_shape=block_shape,
                         apply_router_weights_on_input=weights_on_input)
|
|
|
|
|
|
def _make_gscale(num_experts: int) -> torch.Tensor:
    """Return ``num_experts`` float32 ones on the current CUDA device."""
    shape = (num_experts, )
    return torch.full(shape,
                      1.0,
                      dtype=torch.float32,
                      device=torch.cuda.current_device())
|
|
|
|
|
|
def make_modular_kernel(
    config: Config,
    vllm_config: VllmConfig,
    quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEModularKernel:
    """Assemble the modular kernel described by ``config``.

    Builds a ``FusedMoEConfig`` from ``config`` and the current parallel
    state, creates the prepare/finalize and fused-experts objects selected by
    ``config``, and wraps them in a ``FusedMoEModularKernel``.

    Args:
        config: Test configuration; ``config.M`` and ``config.topk`` must be
            single ints.
        vllm_config: Supplies ``parallel_config`` for the MoE parallel config.
        quant_config: Quantization config handed to both components.
    """

    def next_power_of_2(x: int) -> int:
        # Exact integer arithmetic. The previous 2**ceil(log2(x)) went through
        # floating point and could round down for very large x.
        # Non-positive inputs map to 1.
        if x <= 1:
            return 1
        return 1 << (x - 1).bit_length()

    # make moe config
    moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
        tp_size_=get_tensor_model_parallel_world_size(),
        dp_size_=get_dp_group().world_size,
        vllm_parallel_config=vllm_config.parallel_config,
    )

    moe = FusedMoEConfig(
        num_experts=config.E,
        experts_per_token=config.topk,
        hidden_dim=config.K,
        num_local_experts=config.num_local_experts,
        moe_parallel_config=moe_parallel_config,
        in_dtype=config.dtype,
        # Token capacity is the token count rounded up to a power of two.
        max_num_tokens=next_power_of_2(config.M),
    )

    # make modular kernel
    prepare_finalize = make_prepare_finalize(config.prepare_finalize_type,
                                             config.all2all_backend(), moe,
                                             quant_config)

    fused_experts = make_fused_experts(
        config.fused_experts_type,
        moe,
        quant_config,
        prepare_finalize.num_dispatchers(),
        config.N,
    )

    return mk.FusedMoEModularKernel(prepare_finalize=prepare_finalize,
                                    fused_experts=fused_experts)
|
|
|
|
|
|
def run_modular_kernel(
    pgi: ProcessGroupInfo,
    vllm_config: VllmConfig,
    config: Config,
    weights: WeightTensors,
    rank_tensors: RankTensors,
) -> torch.Tensor:
    """Run the modular kernel for this rank and return its output.

    Args:
        pgi: Process-group info; ``pgi.rank`` selects this rank's experts.
        vllm_config: Config used to build the kernel and the forward context.
        config: Test configuration; ``Ms`` and ``topks`` must be single ints.
        weights: Weights for all experts; sliced down to this rank's experts.
        rank_tensors: Inputs for this rank.
    """
    assert isinstance(config.Ms, int)
    assert isinstance(config.topks, int)

    # weights for rank
    rank_weights = weights.slice_weights(pgi.rank, config.num_local_experts)

    # nvfp4 gets an all-ones activation global scale for both a1 and a2.
    if config.quant_dtype == "nvfp4":
        gscale = _make_gscale(config.num_local_experts)
    else:
        gscale = None

    quant_config = FusedMoEQuantConfig.make(
        config.quant_dtype,
        w1_scale=rank_weights.w1_scale,
        w2_scale=rank_weights.w2_scale,
        a1_scale=rank_tensors.hidden_states_scale,
        # The alphas are the reciprocals of the weight global scales.
        g1_alphas=(1 / rank_weights.w1_gs)
        if rank_weights.w1_gs is not None else None,
        g2_alphas=(1 / rank_weights.w2_gs)
        if rank_weights.w2_gs is not None else None,
        a1_gscale=gscale,
        a2_gscale=gscale,
        block_shape=config.quant_block_shape,
        per_act_token_quant=config.is_per_act_token_quant,
        per_out_ch_quant=config.is_per_out_ch_quant,
    )

    # Named ``modular_kernel`` rather than ``mk`` so the module alias ``mk``
    # is not shadowed inside this function.
    modular_kernel = make_modular_kernel(config, vllm_config, quant_config)

    # impls might update the tensor in place
    hidden_states = rank_tensors.hidden_states.clone()

    topk_ids = rank_tensors.topk_ids.to(
        modular_kernel.prepare_finalize.topk_indices_dtype())

    mk_kwargs = {
        "hidden_states":
        hidden_states,
        "w1":
        rank_weights.w1,
        "w2":
        rank_weights.w2,
        "topk_weights":
        rank_tensors.topk_weights,
        "topk_ids":
        topk_ids,
        "expert_map":
        rank_tensors.expert_map,
        "global_num_experts":
        config.E,
        "apply_router_weight_on_input":
        config.topk == 1 and config.supports_apply_weight_on_input(),
    }

    # Every DP rank is reported with the same token count as this rank.
    num_tokens = rank_tensors.hidden_states.shape[0]
    num_tokens_across_dp = torch.tensor([num_tokens] * config.world_size,
                                        device="cuda",
                                        dtype=torch.int)

    with set_forward_context(
            None,
            vllm_config,
            num_tokens=num_tokens,
            num_tokens_across_dp=num_tokens_across_dp,
    ):
        out = modular_kernel.forward(**mk_kwargs)

    return out
|