# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for v1 attention backends without GPUModelRunner dependency."""

from functools import partial
from typing import Optional, Union

import pytest
import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

from tests.v1.attention.utils import (BatchSpec, _Backend,
                                      create_common_attn_metadata,
                                      create_standard_kv_cache_spec,
                                      create_vllm_config,
                                      get_attention_backend)
from vllm.config import ModelConfig
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_torch_equal_or_newer
from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
                                              set_kv_cache_layout)
from vllm.v1.kv_cache_interface import FullAttentionSpec

BACKENDS_TO_TEST = [
    _Backend.FLASH_ATTN, _Backend.FLASHINFER, _Backend.FLEX_ATTENTION,
    _Backend.TRITON_ATTN, _Backend.TREE_ATTN, "FLEX_ATTENTION_SLOW"
]

# Remove flashinfer from the list if it's not available
try:
    import flashinfer  # noqa: F401
except ImportError:
    BACKENDS_TO_TEST.remove(_Backend.FLASHINFER)

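# NOTE: "FLEX_ATTENTION_SLOW" is a string marker rather than a _Backend enum
# value; run_attention_backend below maps it to _Backend.FLEX_ATTENTION with
# direct block-mask building disabled, so both FlexAttention code paths are
# exercised.
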
def _convert_dtype_to_torch(dtype):
    """Convert ModelDType to torch.dtype."""
    if isinstance(dtype, str):
        if dtype == "auto":
            return torch.float16  # Default dtype for testing
        elif dtype in STR_DTYPE_TO_TORCH_DTYPE:
            return STR_DTYPE_TO_TORCH_DTYPE[dtype]
        else:
            raise ValueError(f"Unknown dtype: {dtype}")
    elif isinstance(dtype, torch.dtype):
        return dtype
    else:
        raise ValueError(f"Unknown dtype: {dtype}")

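# Illustrative usage: _convert_dtype_to_torch("auto") returns torch.float16,
# string names such as "bfloat16" resolve through STR_DTYPE_TO_TORCH_DTYPE,
# and an existing torch.dtype is passed through unchanged.
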
# Define common batch configurations
BATCH_SPECS = {
    "small_decode":
    BatchSpec(seq_lens=[32, 40], query_lens=[1, 1]),
    "small_prefill":
    BatchSpec(seq_lens=[32, 40], query_lens=[8, 8]),
    "mixed_small":
    BatchSpec(seq_lens=[32, 40, 48, 56], query_lens=[1, 1, 5, 5]),
    "medium_decode":
    BatchSpec(seq_lens=[128, 256, 512, 1024, 128, 256, 512, 1024],
              query_lens=[1, 1, 1, 1, 1, 1, 1, 1]),
    "medium_prefill":
    BatchSpec(seq_lens=[256, 512, 1024, 2048], query_lens=[16, 16, 16, 16]),
    "mixed_medium":
    BatchSpec(seq_lens=[512, 1024, 2048, 512, 1024, 2048],
              query_lens=[1, 1, 1, 7, 7, 7]),
    "large_decode":
    BatchSpec(seq_lens=[2048] * 32, query_lens=[1] * 32),
    "large_prefill":
    BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8),
    "single_decode":
    BatchSpec(seq_lens=[1024], query_lens=[1]),
    "single_prefill":
    BatchSpec(seq_lens=[1024], query_lens=[64]),
}

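# NOTE: seq_lens are total per-sequence lengths and query_lens are the new
# tokens being processed, so each sequence carries seq_len - query_len
# context tokens that get pre-populated into the paged KV cache.
# Illustrative example: "small_prefill" runs two sequences of lengths 32 and
# 40 with 8 new query tokens each, i.e. 24 and 32 cached context tokens.
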
def create_and_prepopulate_kv_cache(
        k_contexts: list[torch.Tensor],
        v_contexts: list[torch.Tensor],
        block_size: int,
        num_kv_heads: int,
        head_size: int,
        dtype: torch.dtype,
        device: torch.device,
        num_blocks: int,
        common_attn_metadata: CommonAttentionMetadata,
        randomize_blocks: bool = True) -> torch.Tensor:
    """Create and prepopulate a KV cache with context data.

    Args:
        k_contexts: List of key context tensors for each sequence
        v_contexts: List of value context tensors for each sequence
        block_size: Size of each block
        num_kv_heads: Number of KV heads
        head_size: Size of each head
        dtype: Data type for the cache
        device: Device to create the cache on
        num_blocks: Total number of blocks in the cache
        common_attn_metadata: Metadata whose block table and slot mapping
            are populated in place
        randomize_blocks: Whether to randomly permute blocks
            or use sequential order

    Returns:
        The prepopulated KV cache tensor
    """
    batch_size = len(k_contexts)
    seq_lens = common_attn_metadata.seq_lens_cpu
    query_lens = common_attn_metadata.query_start_loc_cpu[
        1:] - common_attn_metadata.query_start_loc_cpu[:-1]
    context_lens = common_attn_metadata.num_computed_tokens_cpu
    block_table = common_attn_metadata.block_table_tensor
    slot_mapping = common_attn_metadata.slot_mapping

    # Create KV cache
    kv_cache = torch.empty(2,
                           num_blocks,
                           block_size,
                           num_kv_heads,
                           head_size,
                           dtype=dtype,
                           device=device)
    kv_cache_flat = kv_cache.view(2, -1, num_kv_heads, head_size)

    # Populate the cache with the context tokens
    # Start from block_id=1 since block_id=0 is considered the null block
    start_block_idx = 1
    for i in range(batch_size):
        k_context, v_context = k_contexts[i], v_contexts[i]
        start = start_block_idx * block_size
        end = start + k_context.shape[0]
        kv_cache_flat[0, start:end, ...] = k_context
        kv_cache_flat[1, start:end, ...] = v_context

        # Stay block aligned and allocate enough blocks for the new tokens
        start_block_idx += cdiv(int(seq_lens[i]), block_size)

    blocks_end = start_block_idx

    # Permute the context blocks (excluding block 0 which is null)
    if randomize_blocks:
        # Random permutation starting from block 1
        perm = torch.randperm(blocks_end - 1) + 1
    else:
        # Sequential order starting from block 1
        perm = torch.arange(1, blocks_end)

    inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device)
    # Add 1 to account for starting from block 1
    inv_perm[1:] = torch.argsort(perm) + 1
    kv_cache[:, 1:blocks_end, ...] = kv_cache[:, perm, ...]

    # Construct the right block table
    # Start from block_id=1 since block_id=0 is considered the null block
    start_block_idx = 1
    for i in range(batch_size):
        num_blocks_for_seq = cdiv(int(seq_lens[i]), block_size)
        start = start_block_idx
        end = start + num_blocks_for_seq
        block_table[i, :num_blocks_for_seq] = inv_perm[start:end]
        start_block_idx += num_blocks_for_seq

    # Create a realistic slot mapping that corresponds to the block table
    for i in range(batch_size):
        token_offsets = torch.arange(int(query_lens[i])) + int(context_lens[i])
        block_indices = token_offsets // block_size
        token_inter_block_offsets = token_offsets % block_size
        start = common_attn_metadata.query_start_loc_cpu[i]
        end = common_attn_metadata.query_start_loc_cpu[i + 1]
        slot_mapping[start:end] = block_table[
            i,
            block_indices] * block_size + token_inter_block_offsets.to(device)

    return kv_cache

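# Illustrative example of the mapping built above (assuming block_size=16 and
# randomize_blocks=False, so the block permutation is the identity): a
# sequence with seq_len=40 reserves cdiv(40, 16) = 3 blocks starting at
# block_id=1, so its block_table row begins with [1, 2, 3]; a token at offset
# 20 within that sequence then lives in physical slot
# block_table[i, 20 // 16] * 16 + 20 % 16 = 2 * 16 + 4 = 36.
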
class MockAttentionLayer:
    """A mock attention layer for testing."""

    def __init__(self, device: torch.device):
        self._q_scale = torch.tensor(1.0, device=device)
        self._k_scale = torch.tensor(1.0, device=device)
        self._v_scale = torch.tensor(1.0, device=device)
        # Add float versions for flashinfer
        self._q_scale_float = 1.0
        self._k_scale_float = 1.0
        self._v_scale_float = 1.0

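# NOTE: the backend AttentionImpl.forward calls below are only expected to
# read the layer's q/k/v scale attributes, so this stub with unit (no-op)
# scales stands in for a real attention layer module.
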
def run_attention_backend(
    backend: _Backend,
    kv_cache_spec: FullAttentionSpec,
    layer_names: list[str],
    vllm_config,
    device: torch.device,
    common_attn_metadata: CommonAttentionMetadata,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: torch.Tensor,
    sliding_window: Optional[int] = None,
) -> torch.Tensor:
    """Run attention computation using the specified backend's AttentionImpl."""

    # Handle special case for FLEX_ATTENTION_SLOW
    actual_backend = backend

    use_direct_block_mask = is_torch_equal_or_newer("2.9.0.dev0")
    if backend == "FLEX_ATTENTION_SLOW":
        actual_backend = _Backend.FLEX_ATTENTION
        use_direct_block_mask = False

    builder_cls, impl_cls = get_attention_backend(actual_backend)

    # Mock flashinfer's get_per_layer_parameters if needed
    if actual_backend == _Backend.FLASHINFER:
        import unittest.mock

        from vllm.v1.attention.backends.utils import PerLayerParameters

        def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls):
            # Return mock parameters for a single layer
            head_size = vllm_config.model_config.get_head_size()
            return {
                layer_name:
                PerLayerParameters(
                    window_left=-1,  # No sliding window
                    logits_soft_cap=0.0,  # No soft cap
                    sm_scale=1.0 / (head_size**0.5)  # Standard scale
                )
                for layer_name in layer_names
            }

        with unittest.mock.patch(
                'vllm.v1.attention.backends.flashinfer.get_per_layer_parameters',
                mock_get_per_layer_parameters):
            builder = builder_cls(kv_cache_spec, layer_names, vllm_config,
                                  device)
            attn_metadata = builder.build(
                common_prefix_len=0,
                common_attn_metadata=common_attn_metadata,
            )
    else:
        # Build metadata
        builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device)
        if actual_backend == _Backend.FLEX_ATTENTION:
            builder.direct_build = use_direct_block_mask
        attn_metadata = builder.build(
            common_prefix_len=0,
            common_attn_metadata=common_attn_metadata,
        )

    # Instantiate implementation
    num_heads = vllm_config.model_config.get_num_attention_heads(
        vllm_config.parallel_config)
    num_kv_heads = vllm_config.model_config.get_num_kv_heads(
        vllm_config.parallel_config)
    head_size = vllm_config.model_config.get_head_size()
    scale = 1.0 / (head_size**0.5)
    impl = impl_cls(
        num_heads=num_heads,
        head_size=head_size,
        scale=scale,
        num_kv_heads=num_kv_heads,
        alibi_slopes=None,
        sliding_window=sliding_window,
        kv_cache_dtype="auto",
    )

    # Create mock layer and output buffer
    mock_layer = MockAttentionLayer(device)
    output = torch.empty_like(query)

    # Run forward pass
    # NOTE: The query, key, and value are already shaped correctly
    # in the calling test function.
    output = impl.forward(mock_layer,
                          query,
                          key,
                          value,
                          kv_cache,
                          attn_metadata,
                          output=output)

    return output

def _test_backend_correctness(
    batch_spec: BatchSpec,
    model: str,
    backend_to_test: list[Union[_Backend, str]],
    mask_mod,
    *,
    block_size: int = 16,
    atol: float = 1e-2,
    rtol: float = 1e-2,
):
    """
    Test that all backends produce similar outputs to a reference
    implementation computed with FlexAttention, which is mathematically
    equivalent to torch.nn.functional.scaled_dot_product_attention under the
    provided mask_mod.

    This test works by:
    1. Generating a batch of sequences with specified context and query
       lengths.
    2. Computing a ground-truth attention output on contiguous Q, K, and V
       tensors using FlexAttention with a block mask built from mask_mod.
    3. Simulating vLLM's paged KV cache: It takes the context portion of the
       K/V tensors and manually places them into a paged buffer according to
       the test's (randomly generated) block table.
    4. Running each vLLM attention backend with the new queries and the
       simulated paged KV cache.
    5. Comparing the vLLM backend's output to the ground-truth reference
       output.
    """
    current_platform.seed_everything(42)
    vllm_config = create_vllm_config(model_name=model,
                                     max_model_len=max(batch_spec.seq_lens),
                                     block_size=block_size,
                                     num_gpu_blocks=8192)
    device = torch.device("cuda:0")

    kv_cache_spec = create_standard_kv_cache_spec(vllm_config)

    # 1. Setup
    batch_size = batch_spec.batch_size
    seq_lens = batch_spec.seq_lens
    query_lens = batch_spec.query_lens
    num_q_heads = vllm_config.model_config.get_num_attention_heads(
        vllm_config.parallel_config)
    num_kv_heads = vllm_config.model_config.get_num_kv_heads(
        vllm_config.parallel_config)
    head_size = vllm_config.model_config.get_head_size()
    sliding_window = vllm_config.model_config.get_sliding_window()
    dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype)
    block_size = vllm_config.cache_config.block_size
    scale = 1.0 / (head_size**0.5)

    # 2. Generate data and compute the reference output
    all_q_vllm, all_k_vllm, all_v_vllm = [], [], []
    all_sdpa_outputs = []
    k_contexts, v_contexts = [], []

    for i in range(batch_size):
        s_len = seq_lens[i]
        q_len = query_lens[i]
        context_len = s_len - q_len

        # Generate Q, K, V for the whole sequence to be used in the reference
        q = torch.randn(q_len,
                        num_q_heads,
                        head_size,
                        dtype=dtype,
                        device=device)
        k_full = torch.randn(s_len,
                             num_kv_heads,
                             head_size,
                             dtype=dtype,
                             device=device)
        v_full = torch.randn(s_len,
                             num_kv_heads,
                             head_size,
                             dtype=dtype,
                             device=device)

        # SDPA expects (N, H, L, D), so unsqueeze batch and permute
        q_sdpa_in = q.unsqueeze(0).transpose(1, 2)
        k_sdpa_in = k_full.unsqueeze(0).transpose(1, 2)
        v_sdpa_in = v_full.unsqueeze(0).transpose(1, 2)

        if num_q_heads != num_kv_heads:
            assert num_q_heads % num_kv_heads == 0, (
                f"num_q_heads ({num_q_heads}) must be divisible by "
                f"num_kv_heads ({num_kv_heads})")
            repeats = num_q_heads // num_kv_heads
            k_sdpa_in = k_sdpa_in.repeat_interleave(repeats, dim=1)
            v_sdpa_in = v_sdpa_in.repeat_interleave(repeats, dim=1)

        # Build the attention mask for this sequence. With a causal mask_mod,
        # query token i attends to positions 0 to (context_len + i).
        kv_len = s_len

        final_mask_mod = partial(mask_mod, context_len=context_len)
        block_mask = create_block_mask(final_mask_mod,
                                       B=None,
                                       H=None,
                                       Q_LEN=q_len,
                                       KV_LEN=kv_len,
                                       device=device)
        sdpa_out_i = flex_attention(q_sdpa_in,
                                    k_sdpa_in,
                                    v_sdpa_in,
                                    block_mask=block_mask,
                                    scale=scale,
                                    enable_gqa=True)

        all_sdpa_outputs.append(sdpa_out_i.transpose(1, 2).squeeze(0))

        # Inputs for vLLM backends are just the new tokens
        all_q_vllm.append(q)
        all_k_vllm.append(k_full[context_len:])
        all_v_vllm.append(v_full[context_len:])

        # Contextual K/V data used to populate the paged cache
        k_contexts.append(k_full[:context_len])
        v_contexts.append(v_full[:context_len])

    query_vllm = torch.cat(all_q_vllm, dim=0)
    key_vllm = torch.cat(all_k_vllm, dim=0)
    value_vllm = torch.cat(all_v_vllm, dim=0)
    sdpa_output = torch.cat(all_sdpa_outputs, dim=0)

    common_attn_metadata = create_common_attn_metadata(
        batch_spec, vllm_config.cache_config.block_size, device)

    # 3. Simulate Paged KV Cache and a realistic slot_mapping
    kv_cache = create_and_prepopulate_kv_cache(
        k_contexts=k_contexts,
        v_contexts=v_contexts,
        block_size=block_size,
        num_kv_heads=num_kv_heads,
        head_size=head_size,
        dtype=dtype,
        device=device,
        num_blocks=vllm_config.cache_config.num_gpu_blocks or 1000,
        common_attn_metadata=common_attn_metadata,
        randomize_blocks=True)

    # 4. Run vLLM backends and compare
    # Note: flex_attention has known Triton kernel compatibility issues
    # with test infrastructures
    for backend_name in backend_to_test:
        # Select the appropriate KV cache format for each backend:
        # FlashAttention + FlexAttention:
        # [2, num_blocks, block_size, num_kv_heads, head_size]
        # FlashInfer:
        # [num_blocks, 2, block_size, num_kv_heads, head_size]
        kv_cache_for_backend = kv_cache
        if backend_name == _Backend.FLASHINFER:
            kv_cache_for_backend = kv_cache.transpose(0, 1)

            # For FlashInfer, default to the HND KV cache layout
            kv_cache_for_backend = kv_cache_for_backend.transpose(
                2, 3).contiguous().transpose(2, 3)
            set_kv_cache_layout("HND")

        backend_output = run_attention_backend(
            backend_name,
            kv_cache_spec,
            ["placeholder"],
            vllm_config,
            device,
            common_attn_metadata,
            query_vllm,
            key_vllm,
            value_vllm,
            kv_cache_for_backend,
            sliding_window=sliding_window,
        )

        # Check shape and dtype consistency
        assert backend_output.shape == sdpa_output.shape, (
            f"[{backend_name}] shape {backend_output.shape} != "
            f"SDPA shape {sdpa_output.shape}")
        assert backend_output.dtype == sdpa_output.dtype, (
            f"[{backend_name}] dtype {backend_output.dtype} != "
            f"SDPA dtype {sdpa_output.dtype}")

        assert torch.isfinite(backend_output).all(), (
            f"[{backend_name}] produced non-finite values")

        # Check numerical similarity
        def error_msg(msg: str, backend_name: str):
            return (f"[{backend_name}] output differs from SDPA baseline. "
                    f"{msg}")

        torch.testing.assert_close(backend_output,
                                   sdpa_output,
                                   rtol=rtol,
                                   atol=atol,
                                   msg=partial(error_msg,
                                               backend_name=backend_name))

@pytest.mark.parametrize("batch_spec_name", [
|
|
"small_decode", "small_prefill", "mixed_small", "medium_decode",
|
|
"medium_prefill", "mixed_medium", "large_decode", "large_prefill",
|
|
"single_decode", "single_prefill"
|
|
])
|
|
@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
|
|
def test_causal_backend_correctness(batch_spec_name: str, model: str):
|
|
"""Test backend's correctness with causal attention."""
|
|
|
|
def causal_mask_mod(
|
|
b: torch.Tensor,
|
|
h: torch.Tensor,
|
|
q_idx: torch.Tensor,
|
|
kv_idx: torch.Tensor,
|
|
*,
|
|
context_len: int,
|
|
):
|
|
return (q_idx + context_len) >= kv_idx
|
|
|
|
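    # Worked example (illustrative): with context_len=24, the first new query
    # token (q_idx=0) sits at absolute position 24, so it may attend to
    # kv_idx 0..24; q_idx=1 extends that to kv_idx 0..25, and so on.
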
    batch_spec = BATCH_SPECS[batch_spec_name]
    LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION]
                            if is_torch_equal_or_newer("2.9.0.dev0") else [])
    SMALL_BLOCK_BACKENDS = [
        x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
    ]
    _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS,
                              causal_mask_mod)

    # Fast FlexAttention needs to run with block_size=128
    if LARGE_BLOCK_BACKENDS:
        _test_backend_correctness(batch_spec,
                                  model,
                                  LARGE_BLOCK_BACKENDS,
                                  causal_mask_mod,
                                  block_size=128)

SLIDING_WINDOW_BACKENDS_TO_TEST = [
    _Backend.FLASH_ATTN, _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN,
    "FLEX_ATTENTION_SLOW"
]

@pytest.mark.parametrize("batch_spec_name", [
|
|
"small_decode", "small_prefill", "mixed_medium", "large_decode",
|
|
"large_prefill"
|
|
])
|
|
@pytest.mark.parametrize("model", ["microsoft/Phi-tiny-MoE-instruct"])
|
|
def test_sliding_window_backend_correctness(batch_spec_name: str, model: str):
|
|
"""Test backend's correctness with sliding window attention."""
|
|
|
|
def sliding_window_mask_mod(
|
|
b: torch.Tensor,
|
|
h: torch.Tensor,
|
|
q_idx: torch.Tensor,
|
|
kv_idx: torch.Tensor,
|
|
*,
|
|
context_len: int,
|
|
sliding_window: int,
|
|
):
|
|
causal_mask = q_idx + context_len >= kv_idx
|
|
window_mask = q_idx + context_len - kv_idx < sliding_window
|
|
return causal_mask & window_mask
|
|
|
|
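    # Worked example (illustrative): with context_len=10 and sliding_window=4,
    # q_idx=0 is at absolute position 10; the causal term allows kv_idx <= 10
    # and the window term requires 10 - kv_idx < 4, so only kv_idx 7..10 are
    # visible (the current token plus the 3 most recent context tokens).
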
    batch_spec = BATCH_SPECS[batch_spec_name]
    model_config = ModelConfig(model=model,
                               max_model_len=max(batch_spec.seq_lens))
    sliding_window = model_config.get_sliding_window()
    sliding_window_mask_mod_fn = partial(sliding_window_mask_mod,
                                         sliding_window=sliding_window)

    LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION]
                            if is_torch_equal_or_newer("2.9.0.dev0") else [])
    SMALL_BLOCK_BACKENDS = [
        x for x in SLIDING_WINDOW_BACKENDS_TO_TEST
        if x not in LARGE_BLOCK_BACKENDS
    ]
    _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS,
                              sliding_window_mask_mod_fn)

    # Fast FlexAttention needs to run with block_size=128
    if LARGE_BLOCK_BACKENDS:
        _test_backend_correctness(batch_spec,
                                  model,
                                  LARGE_BLOCK_BACKENDS,
                                  sliding_window_mask_mod_fn,
                                  block_size=128)