vllm/tests/v1/spec_decode/test_eagle.py
HAIAI aee76334d9
[amd_dev] branch rebase (#25753)
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Signed-off-by: boyuanfeng <boyuan@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: JartX <sagformas@epdcenter.es>
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Signed-off-by: Manoel Marques <manoel.marques@ibm.com>
Signed-off-by: Manoel Marques <manoelmrqs@gmail.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: pengdrumli <pengdrumli@tencent.com>
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Huamin Li <3ericli@gmail.com>
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Signed-off-by: Yang <lymailforjob@gmail.com>
Signed-off-by: Debolina Roy <debroy@redhat.com>
Signed-off-by: David Chen <530634352@qq.com>
Signed-off-by: wangzi <3220100013@zju.edu.cn>
Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Signed-off-by: Sara Kokkila Schumacher <saraks@ibm.com>
Signed-off-by: Csrayz <jover@cmbchina.com>
Signed-off-by: ivyilike <pww123@cmbchina.com>
Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
Signed-off-by: Bowen Wang <abmfy@icloud.com>
Signed-off-by: qqma <qqma@amazon.com>
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: luka <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Or Ozeri <oro@il.ibm.com>
Signed-off-by: Johnny Yang <johnnyyang@google.com>
Signed-off-by: Alec Solder <alecs@fb.com>
Signed-off-by: Alec S <10566873+alecsolder@users.noreply.github.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Zhikaiiii <1658973216@qq.com>
Signed-off-by: Andreas Hartel <andreas.hartel@aleph-alpha.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: wuxibin <wuxibin@bytedance.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
Signed-off-by: Peter Pan <peter.pan@daocloud.io>
Signed-off-by: Nicolò Lucchesi<nicolo.lucchesi@gmail.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Bill Nell <bnell@redhat.com>
Signed-off-by: Shreeasish Kumar <shreeasish@rivosinc.com>
Signed-off-by: Weida Hong <wdhongtw@google.com>
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
Signed-off-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com>
Signed-off-by: Amir Samani <asamani@nvidia.com>
Signed-off-by: ElizaWszola <elizaw.9289@gmail.com>
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Signed-off-by: rouchenzi <ruochenwen@gmail.com>
Signed-off-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com>
Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Signed-off-by: Corey Lowman <clowman1993@gmail.com>
Signed-off-by: jpvillam <jpvillam@amd.com>
Signed-off-by: dougbtv <dosmith@redhat.com>
Signed-off-by: Chenxi Yang <cxyang@fb.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
Signed-off-by: Yan Lu <luyan@nvidia.com>
Signed-off-by: baxingpiaochong <771405853@qq.com>
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com>
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Signed-off-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com>
Signed-off-by: taohui <taohui3@gmail.com>
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Duncan Moss <djm.moss@gmail.com>
Signed-off-by: Shiyan Deng <dsy842974287@meta.com>
Signed-off-by: Wei Wei <wwei6@meta.com>
Signed-off-by: Saman Keon <samanamp@outlook.com>
Signed-off-by: yangxurui <yangxurui@meituan.com>
Signed-off-by: nicole-lihui <nicole.li@daocloud.io>
Signed-off-by: courage17340 <courage17340@163.com>
Signed-off-by: Jacob Kahn <jacobkahn1@gmail.com>
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai>
Signed-off-by: zxw <1020938856@qq.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Signed-off-by: chenlang <chen.lang5@zte.com.cn>
Signed-off-by: Jonas Kuebler <kuebj@amazon.com>
Signed-off-by: AlonKejzman <alonkeizman@gmail.com>
Signed-off-by: Tao Hui <taohui3@gmail.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Eugene Khvedchenya <ekhvedchenya@gmail.com>
Signed-off-by: yiting.jiang <yiting.jiang@daocloud.io>
Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com>
Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Lucas Kabela <lucasakabela@gmail.com>
Co-authored-by: Maximilien de Bayser <mbayser@br.ibm.com>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
Co-authored-by: Boyuan Feng <boyuan@meta.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: JartX <sagformas@epdcenter.es>
Co-authored-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: xin.li <xin.li@daocloud.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Wenlong Wang <wangwenlong2755@gmail.com>
Co-authored-by: Manoel Marques <manoelmrqs@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: lirong <56789630+lirong-lirong@users.noreply.github.com>
Co-authored-by: Michael Yao <haifeng.yao@daocloud.io>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Huamin Li <3ericli@gmail.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
Co-authored-by: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com>
Co-authored-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Yang Liu <127183760+KKSK-DON@users.noreply.github.com>
Co-authored-by: Deboleina <debroy@redhat.com>
Co-authored-by: yinz-aizip <yinz@aizip.ai>
Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Co-authored-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: Eldar Kurtić <8884008+eldarkurtic@users.noreply.github.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Yizhou <136800916+yiz-liu@users.noreply.github.com>
Co-authored-by: Sara-KS <50249410+Sara-KS@users.noreply.github.com>
Co-authored-by: Csrayz <jover@cmbchina.com>
Co-authored-by: ivyilike <pww123@cmbchina.com>
Co-authored-by: Burkhard Ringlein <ngl@zurich.ibm.com>
Co-authored-by: Bowen Wang <abmfy@icloud.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Daisy-Ma-coder <daisy.ma.0117@gmail.com>
Co-authored-by: qqma <qqma@amazon.com>
Co-authored-by: ElizaWszola <ewszola@redhat.com>
Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
Co-authored-by: Johnny Yang <24908445+jcyang43@users.noreply.github.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Alec S <10566873+alecsolder@users.noreply.github.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Chris Bamford <chrisbam4d@gmail.com>
Co-authored-by: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Ming Yang <yming@meta.com>
Co-authored-by: Zhikaiiii <55917203+Zhikaiiii@users.noreply.github.com>
Co-authored-by: Andreas Hartel <andreas@hartel.me>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: Joel <wuxibin89@163.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Peter Pan <peter.pan@daocloud.io>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
Co-authored-by: Fanli Lin <fanli.lin@intel.com>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Co-authored-by: rivos-shreeasish <shreeasish@rivosinc.com>
Co-authored-by: Chih-Chieh Yang <chih.chieh.yang@ibm.com>
Co-authored-by: Weida Hong <wdhongtw@gmail.com>
Co-authored-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com>
Co-authored-by: Amir Samani <samani@ualberta.ca>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Ilya Markov <markovilya197@gmail.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Co-authored-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com>
Co-authored-by: Andrew Xia <axia@meta.com>
Co-authored-by: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com>
Co-authored-by: Corey Lowman <clowman1993@gmail.com>
Co-authored-by: Juan Villamizar <100237675+jpvillam-amd@users.noreply.github.com>
Co-authored-by: jpvillam <jpvillam@amd.com>
Co-authored-by: Doug Smith <dosmith@redhat.com>
Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu>
Co-authored-by: Chenxi Yang <cxyang@fb.com>
Co-authored-by: ahao-anyscale <ahao@anyscale.com>
Co-authored-by: 0xNullPath <luyanfcp@foxmail.com>
Co-authored-by: baxingpiaochong <771405853@qq.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Nikhil Gupta <nikhil.gupta2@arm.com>
Co-authored-by: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Co-authored-by: lhsjohn <huashuoli@tencent.com>
Co-authored-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: Jackmin801 <56836461+Jackmin801@users.noreply.github.com>
Co-authored-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com>
Co-authored-by: Tao Hui <taohui3@gmail.com>
Co-authored-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Duncan Moss <djm.moss@gmail.com>
Co-authored-by: Shiyan Deng <dsy842974287@meta.com>
Co-authored-by: Wei Wei <wwei6@meta.com>
Co-authored-by: Saman A. Pour <samanamp@outlook.com>
Co-authored-by: XuruiYang <530534756@qq.com>
Co-authored-by: yangxurui <yangxurui@meituan.com>
Co-authored-by: Nicole LiHui 🥜 <nicolelihui@outlook.com>
Co-authored-by: courage17340 <courage17340@users.noreply.github.com>
Co-authored-by: Jacob Kahn <jacobkahn1@gmail.com>
Co-authored-by: Nicole LiHui 🥜 <nicole.li@daocloud.io>
Co-authored-by: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
Co-authored-by: yyzxw <34639446+yyzxw@users.noreply.github.com>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: chenlang <chen.lang5@zte.com.cn>
Co-authored-by: chenlang <10346245@zte.com.cn>
Co-authored-by: AlonKejzman <alonkeizman@gmail.com>
Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Eugene Khvedchenya <ekhvedchenya@gmail.com>
Co-authored-by: yitingdc <59356937+yitingdc@users.noreply.github.com>
Co-authored-by: xaguilar-amd <xavier.aguilarfruto@amd.com>
Co-authored-by: Iceber Gu <caiwei95@hotmail.com>
Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com>
Co-authored-by: Icey <1790571317@qq.com>
Co-authored-by: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com>
Co-authored-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: RishiAstra <40644327+RishiAstra@users.noreply.github.com>
2025-09-26 17:14:31 +01:00

709 lines
27 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
from unittest import mock
import pytest
import torch
from tests.utils import get_attn_backend_list_based_on_platform
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
get_attention_backend)
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, SpeculativeConfig,
VllmConfig)
from vllm.config.load import LoadConfig
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.platforms import current_platform
from vllm.v1.spec_decode.eagle import EagleProposer
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
model_dir = "meta-llama/Llama-3.1-8B-Instruct"
eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
def _create_proposer(
method: str,
num_speculative_tokens: int,
speculative_token_tree: Optional[list[tuple[int, ...]]] = None,
) -> EagleProposer:
model_config = ModelConfig(model=model_dir,
runner="generate",
max_model_len=100)
# Choose model directory based on method
draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir
spec_token_tree_str = None
if speculative_token_tree is not None:
assert num_speculative_tokens == len(speculative_token_tree)
spec_token_tree_str = str(speculative_token_tree)
speculative_config = SpeculativeConfig(
target_model_config=model_config,
target_parallel_config=ParallelConfig(),
model=draft_model_dir,
method=method,
num_speculative_tokens=num_speculative_tokens,
speculative_token_tree=spec_token_tree_str,
)
vllm_config = VllmConfig(
model_config=model_config,
cache_config=CacheConfig(),
speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
scheduler_config=SchedulerConfig())
return EagleProposer(vllm_config=vllm_config,
device=current_platform.device_type)
def test_prepare_next_token_ids():
"""
Test for prepare_next_token_ids_cpu and prepare_next_token_ids_padded.
Each will produce a device tensor of next_token_ids, taking as input
either the GPU tensor of sampled_token_ids with -1 for rejected tokens,
or the CPU python list[list[int]] with the rejected tokens removed.
"""
device = torch.device(current_platform.device_type)
num_requests = 4
num_speculative_tokens = 4
batch_spec = BatchSpec(
seq_lens=[num_speculative_tokens + 1] * num_requests,
query_lens=[num_speculative_tokens + 1] * num_requests,
)
req_ids = [f"req_{i+1}" for i in range(num_requests)]
mock_input_batch = mock.MagicMock(spec=InputBatch)
mock_input_batch.req_ids = req_ids
mock_input_batch.num_reqs = num_requests
mock_input_batch.vocab_size = 100
mock_num_scheduled_tokens = {req_id: 0 for req_id in req_ids}
mock_requests = {}
for req_id in req_ids:
mock_request = mock.MagicMock(spec=CachedRequestState)
# Each request will have a backup next token id of 10, 20, 30, 40
mock_request.get_token_id.return_value = int(req_id.split("_")[1]) * 10
mock_request.num_computed_tokens = 0
mock_requests[req_id] = mock_request
sampled_token_ids = [
[0, 1, -1, -1, -1], # 1 accepted, 3 rejected, "1" sampled
[0, 1, 2, 3, 4], # all accepted, "4" sampled
[-1, -1, -1, -1, -1], # sampling skipped, use backup token "30"
[-1, -1, -1, -1, -1] # this request will be discarded
]
sampled_token_ids_tensor = torch.tensor(sampled_token_ids,
dtype=torch.int32,
device=device)
sampled_token_ids_cpu = [[i for i in seq if i != -1]
for seq in sampled_token_ids]
expected_next_token_ids_cpu = [1, 4, 30, 40]
expected_next_token_ids_tensor = torch.tensor(expected_next_token_ids_cpu,
dtype=torch.int32,
device=device)
proposer = _create_proposer("eagle", num_speculative_tokens)
next_token_ids_from_cpu = proposer.prepare_next_token_ids_cpu(
sampled_token_ids_cpu, mock_requests, mock_input_batch,
mock_num_scheduled_tokens)
assert torch.equal(next_token_ids_from_cpu, expected_next_token_ids_tensor)
common_attn_metadata = create_common_attn_metadata(
batch_spec,
block_size=16,
device=device,
)
discarded_req_indices = torch.tensor([3], dtype=torch.int64, device=device)
num_discarded_reqs = 1
expected_valid_sampled_tokens_count = torch.tensor([2, 5, 0, 0],
dtype=torch.int32,
device=device)
next_token_ids_from_padded, valid_sampled_tokens_count = \
proposer.prepare_next_token_ids_padded(
common_attn_metadata, sampled_token_ids_tensor, mock_requests,
mock_input_batch, discarded_req_indices, num_discarded_reqs)
assert torch.equal(next_token_ids_from_padded,
expected_next_token_ids_tensor)
assert torch.equal(valid_sampled_tokens_count,
expected_valid_sampled_tokens_count)
def test_prepare_inputs():
"""
cu_target_query_lens: [0, a, a + b, a + b + c]
num_rejected_tokens: [n1, n2, n3]
num_tokens_per_req: [a - n1, b - n2, c - n3]
cu_num_tokens: [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3]
token_indices: [0, 1, ..., a - n1 - 1,
a, a + 1, ..., a + b - n2 - 1,
a + b, a + b + 1, ..., a + b + c - n3 - 1]
"""
device = torch.device(current_platform.device_type)
# q1 = 4, q2 = 7, q3 = 5
# n1 = 1, n2 = 3, n3 = 2
batch_spec = BatchSpec(
seq_lens=[4, 7, 5],
query_lens=[4, 7, 5],
)
common_attn_metadata = create_common_attn_metadata(
batch_spec,
block_size=16,
device=device,
)
# If there are `k` sampled tokens, then `k-1` tokens are draft tokens
# from the previous iteration, and the last token is the bonus token sampled
# from the base model.
num_draft_tokens = [3, 6, 4] # one less than query_lens
# num rejected tokens is [1, 3, 2]
ACCEPT_TOKEN = 0
BONUS_TOKEN = 1
REJECT_TOKEN = -1
sampled_token_ids = [
[ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, BONUS_TOKEN],
[
ACCEPT_TOKEN, ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN,
REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN
],
[ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN]
]
sampled_token_ids = [[i for i in seq if i != REJECT_TOKEN]
for seq in sampled_token_ids]
# Expected calculations:
# query_len_per_req = [4, 7, 5]
# num_tokens_per_req = [3, 4, 3] (after subtracting rejected tokens)
# Expected cumulative counts: [0, 3, 7, 10]
expected_cu_num_tokens = torch.tensor([0, 3, 7, 10],
dtype=torch.int32,
device=device)
# Expected token indices (mapped from original positions):
# First request: indices 0, 1, 2 (keeping first 3 from positions 0-3)
# Second request: indices 4, 5, 6, 7 (keeping first 4 from positions 4-10)
# Third request: indices 11, 12, 13 (keeping first 3 from positions 11-15)
expected_token_indices = torch.tensor(
[
0,
1,
2, # First request: 3 tokens (4-1)
4,
5,
6,
7, # Second request: 4 tokens (7-3)
11,
12,
13 # Third request: 3 tokens (5-2)
],
dtype=torch.int32,
device=device)
proposer = _create_proposer("eagle", 1)
updated_metadata, token_indices = proposer.prepare_inputs(
common_attn_metadata, sampled_token_ids, num_draft_tokens)
assert torch.equal(updated_metadata.query_start_loc,
expected_cu_num_tokens)
assert token_indices.shape[0] == expected_cu_num_tokens[-1].item()
assert torch.equal(token_indices, expected_token_indices)
def test_prepare_inputs_padded():
"""
Input scenario is 3 requests with num_speculative_tokens == 2 and:
- Request 1: query_len = 3, rejected = 1
- Request 2: query_len = 3, rejected = 0
- Request 3: query_len = 3, rejected = 2
Expected outputs:
token_indices: [0, 1, 2,
3, 4, 5,
6, 7, 8]
Reason: Deferred computation should not disturb the original indices.
token_indices_to_sample: [1, 5, 6]
Reason: After accounting for rejections, these are the valid token positions
from the original indices to sample from.
"""
device = torch.device(current_platform.device_type)
expected_token_indices = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8],
dtype=torch.int32,
device=device)
expected_token_indices_to_sample = torch.tensor([1, 5, 6],
dtype=torch.int32,
device=device)
num_speculative_tokens = 2
batch_spec = BatchSpec(
seq_lens=[3, 3, 3],
query_lens=[3, 3, 3],
)
common_attn_metadata = create_common_attn_metadata(
batch_spec,
block_size=16,
device=device,
)
# Needed for cu_num_draft_tokens, which is expected to be [3, 6, 9]
expected_query_start_loc = torch.tensor([0, 3, 6, 9],
dtype=torch.int32,
device=device)
spec_decode_metadata = SpecDecodeMetadata.make_dummy(
draft_token_ids=[[0] * num_speculative_tokens] * 3,
device=device,
)
# num_rejected_tokens = [1, 0, 2]
# num_draft_tokens = [2, 2, 2]
# valid_sampled_tokens_count = num_draft_tokens + 1 - num_rejected_tokens
valid_sampled_tokens_count = torch.tensor([2, 3, 1],
dtype=torch.int32,
device=device)
proposer = _create_proposer("eagle", num_speculative_tokens)
output_metadata, token_indices, token_indices_to_sample = \
proposer.prepare_inputs_padded(
common_attn_metadata,
spec_decode_metadata,
valid_sampled_tokens_count)
assert output_metadata.max_query_len == 3
assert torch.equal(output_metadata.query_start_loc,
expected_query_start_loc)
assert torch.equal(token_indices, expected_token_indices)
assert torch.equal(token_indices_to_sample,
expected_token_indices_to_sample)
@pytest.mark.parametrize("method", ["eagle", "eagle3"])
@pytest.mark.parametrize("attn_backend",
get_attn_backend_list_based_on_platform())
@pytest.mark.parametrize("pp_size", [1, 2])
@pytest.mark.parametrize("use_distinct_embed_tokens", [True, False])
@mock.patch('vllm.v1.spec_decode.eagle.get_pp_group')
@mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config')
@mock.patch('vllm.v1.spec_decode.eagle.get_model')
def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
attn_backend, pp_size, use_distinct_embed_tokens,
monkeypatch):
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# Setup draft model mock
mock_model = mock.MagicMock()
if use_distinct_embed_tokens:
# Some models can have a different hidden size than the target model,
# so we test that their embed_tokens doesn't get overwritten
mock_model.model.embed_tokens.weight.shape = (131072, 2048)
else:
mock_model.model.embed_tokens.weight.shape = (131072, 4096)
mock_get_model.return_value = mock_model
# Setup mocks for attention layers
target_attn_layers = {
"target_attn_1": mock.MagicMock(),
"target_attn_2": mock.MagicMock()
}
# Draft model has one extra attention layer compared to target model
all_attn_layers = {
**target_attn_layers, "draft_extra_attn": mock.MagicMock()
}
# Make mock_get_layers return different values for each call
mock_get_layers.side_effect = [target_attn_layers, all_attn_layers]
# Setup mock for pp group to return the appropriate value for world size
mock_pp_group = mock.MagicMock()
mock_pp_group.world_size = pp_size
mock_get_pp_group.return_value = mock_pp_group
# Set up the target model mock with a custom class so that
# isinstance() checks match the expected type.
class _TargetModelStub(LlamaForCausalLM):
model: mock.MagicMock
lm_head: mock.MagicMock
target_model = mock.create_autospec(_TargetModelStub, instance=True)
target_model.model = mock.MagicMock()
target_model.model.embed_tokens.weight.shape = (131072, 4096)
from vllm.model_executor.models import SupportsMultiModal
assert not isinstance(target_model, SupportsMultiModal)
if method == "eagle":
target_model.lm_head = mock.MagicMock()
# Create proposer using the helper function
proposer = _create_proposer(method, num_speculative_tokens=8)
# Call the method under test
proposer.load_model(target_model)
# Verify common interactions
mock_get_model.assert_called_once()
# Verify that EAGLE models gain the lm head from the target model
if method == "eagle":
assert proposer.model.lm_head == target_model.lm_head
# Verify that the embed tokens are set correctly
# If pp_size is > 1, the embed tokens should be distinct
if pp_size > 1 or use_distinct_embed_tokens:
assert proposer.model.model.embed_tokens != \
target_model.model.embed_tokens
else:
# When pp_size is 1 and the draft and target models have
# embed_tokens of the same shape, they should be shared.
assert proposer.model.model.embed_tokens == \
target_model.model.embed_tokens
@pytest.mark.parametrize("method", ["eagle", "eagle3"])
@pytest.mark.parametrize("attn_backend",
get_attn_backend_list_based_on_platform())
@pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8])
def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")
if (attn_backend == "TREE_ATTN"):
pytest.skip("TREE_ATTN is tested separately in test_propose_tree"
"because it requires special input mocking.")
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# Use GPU device
device = torch.device(current_platform.device_type)
# Setup test parameters
batch_size = 2
seq_len_1 = 5
seq_len_2 = 3
total_tokens = seq_len_1 + seq_len_2
vocab_size = 100
seq_lens = [seq_len_1, seq_len_2]
# Create proposer first so we can use its actual hidden_size
proposer = _create_proposer("eagle", num_speculative_tokens)
# Get the hidden_size from the proposer to ensure consistency
hidden_size = proposer.hidden_size
# Helper to create deterministic logits that will produce specific tokens
def create_deterministic_logits(token_ids):
logits = torch.full((batch_size, vocab_size), -100.0, device=device)
for i, token_id in enumerate(token_ids):
logits[i, token_id] = 100.0
return logits
# We mock a model that returns deterministic logits
# Sequence 1: 42, 43, 44, ...
# Sequence 2: 60, 61, 62, ...
base_token_ids = [42, 60]
# Skip loading the model and replace it with a mock directly
# Create the mock model with deterministic outputs
model_mock = mock.MagicMock()
# Setup for model forward calls
forward_returns = []
for i in range(num_speculative_tokens):
if i == 0:
# First call uses all tokens
h_logits = torch.zeros(total_tokens, hidden_size, device=device)
h_states = torch.zeros(total_tokens, hidden_size, device=device)
else:
# Subsequent calls use batch_size tokens
h_logits = torch.zeros(batch_size, hidden_size, device=device)
h_states = torch.zeros(batch_size, hidden_size, device=device)
forward_returns.append((h_logits, h_states))
# For single token case, we only need the first item;
# for multi-token, we need the sequence
if num_speculative_tokens == 1:
model_mock.return_value = forward_returns[0]
else:
model_mock.side_effect = forward_returns
# Setup for compute_logits calls
logits_returns = []
for i in range(num_speculative_tokens):
# For each call, increment the base token IDs
current_tokens = [base_id + i for base_id in base_token_ids]
logits_returns.append(create_deterministic_logits(current_tokens))
if num_speculative_tokens == 1:
model_mock.compute_logits.return_value = logits_returns[0]
else:
model_mock.compute_logits.side_effect = logits_returns
# Assign the mock to the proposer
proposer.model = model_mock
# Assign draft attn_layer_names since load_model is not invoked
proposer.attn_layer_names = ["layer.0"]
# Create input tensors
batch_spec = BatchSpec(
seq_lens=seq_lens,
query_lens=seq_lens,
)
common_attn_metadata = create_common_attn_metadata(
batch_spec,
block_size=16,
device=device,
)
target_token_ids = torch.randint(0,
vocab_size, (total_tokens, ),
device=device)
target_positions = torch.cat([
torch.arange(seq_len_1, device=device),
torch.arange(seq_len_2, device=device)
])
target_hidden_states = torch.randn(total_tokens,
hidden_size,
device=device)
next_token_ids = torch.randint(0,
vocab_size, (batch_size, ),
dtype=torch.int32,
device=device)
sampling_metadata = mock.MagicMock()
if attn_backend == "FLASH_ATTN":
attn_metadata_builder_cls, _ = get_attention_backend(
_Backend.FLASH_ATTN)
elif attn_backend == "TRITON_ATTN":
attn_metadata_builder_cls, _ = get_attention_backend(
_Backend.TRITON_ATTN)
elif attn_backend == "TREE_ATTN":
attn_metadata_builder_cls, _ = get_attention_backend(
_Backend.TREE_ATTN)
else:
raise ValueError(f"Unsupported attention backend: {attn_backend}")
attn_metadata_builder = attn_metadata_builder_cls(
kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config),
layer_names=proposer.attn_layer_names,
vllm_config=proposer.vllm_config,
device=device,
)
# Mock runner for attention metadata building
proposer.runner = mock.MagicMock()
proposer.runner.attn_groups.append([mock.MagicMock()])
proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \
attn_metadata_builder
proposer._get_attention_metadata_builder = mock.MagicMock(
return_value=attn_metadata_builder)
result = proposer.propose(target_token_ids=target_token_ids,
target_positions=target_positions,
target_hidden_states=target_hidden_states,
next_token_ids=next_token_ids,
last_token_indices=None,
common_attn_metadata=common_attn_metadata,
sampling_metadata=sampling_metadata)
assert result.shape == (batch_size, num_speculative_tokens)
# Create expected tokens based on our token pattern
if num_speculative_tokens == 1:
# Example for num_speculative_tokens=1:
# [[42], [60]]
expected_tokens = torch.tensor(
[[base_token_ids[0]], [base_token_ids[1]]], device=device)
else:
# Example for num_speculative_tokens=3:
# [[42, 43, 44], [60, 61, 62]]
expected_tokens = torch.zeros((batch_size, num_speculative_tokens),
dtype=torch.int64,
device=device)
for i in range(batch_size):
for j in range(num_speculative_tokens):
expected_tokens[i, j] = base_token_ids[i] + j
# Verify all tokens match our expectations
assert torch.equal(result, expected_tokens)
@pytest.mark.parametrize(
"spec_token_tree",
[
[(0, )], # A single token
[(0, ), (0, 0), (0, 0, 0)], # Chain
[(0, ), (1, ), (2, )], # Parallel
[(0, ), (1, ), (2, ), (0, 0), (0, 1), (1, 0), (1, 1), (2, 0),
(2, 1)], # Tree
])
def test_propose_tree(spec_token_tree):
# Get GPU device.
device = torch.device(current_platform.device_type)
# Setup test parameters.
batch_size = 2
seq_len_1 = 5
seq_len_2 = 3
total_tokens = seq_len_1 + seq_len_2
vocab_size = 100
seq_lens = [seq_len_1, seq_len_2]
num_speculative_tokens = len(spec_token_tree)
# Create proposer first so we can use its actual hidden_size.
proposer = _create_proposer("eagle",
num_speculative_tokens,
speculative_token_tree=spec_token_tree)
# Get the hidden_size from the proposer to ensure consistency.
hidden_size = proposer.hidden_size
# Helper to create deterministic logits that will produce specific tokens
def create_deterministic_logits(token_ids, k: int):
logits = torch.full((batch_size, vocab_size), -100.0, device=device)
for i, token_id in enumerate(token_ids):
# Assign decreasing values to the k, consecutive, tokens.
for j in range(k):
logits[i, token_id + j] = 100.0 - j
return logits
# Mock a model that returns deterministic logits.
base_token_ids = torch.tensor([42, 60], dtype=torch.int64, device=device)
# Skip loading the model and replace it with a mock that returns
# deterministic outputs.
model_mock = mock.MagicMock()
# Mock the model forward calls.
forward_returns = [(torch.zeros(total_tokens, hidden_size, device=device),
torch.zeros(total_tokens, hidden_size, device=device))]
for cu_num_drafts in proposer.cu_drafts_per_level:
h_logits = torch.zeros(batch_size * cu_num_drafts,
hidden_size,
device=device)
h_states = torch.zeros(batch_size * cu_num_drafts,
hidden_size,
device=device)
forward_returns.append((h_logits, h_states))
model_mock.side_effect = forward_returns
# Mock the compute_logits calls.
cu_num_drafts_tensor = torch.tensor([0] + proposer.cu_drafts_per_level,
dtype=torch.int32,
device=device)
logits_returns = []
for level, num_children in enumerate(proposer.child_drafts_per_level):
token_ids = base_token_ids + cu_num_drafts_tensor[level]
level_num_drafts = cu_num_drafts_tensor[
level + 1] - cu_num_drafts_tensor[level]
level_logits = []
for i in range(level_num_drafts // num_children):
level_logits.append(
create_deterministic_logits(token_ids + i * num_children,
num_children))
logits_returns.append(torch.stack(level_logits, dim=1))
model_mock.compute_logits.side_effect = logits_returns
# Assign the mock to the proposer
proposer.model = model_mock
# Assign draft attn_layer_names since load_model is not invoked
proposer.attn_layer_names = ["layer.0"]
# Get the tree attention metadata builder.
attn_metadata_builder_cls, _ = get_attention_backend(_Backend.TREE_ATTN)
attn_metadata_builder = attn_metadata_builder_cls(
kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config),
layer_names=proposer.attn_layer_names,
vllm_config=proposer.vllm_config,
device=device,
)
# Mock runner for attention metadata building.
proposer.runner = mock.MagicMock()
proposer.runner.attn_groups.append([mock.MagicMock()])
proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \
attn_metadata_builder
proposer._get_attention_metadata_builder = mock.MagicMock(
return_value=attn_metadata_builder)
# Setup inputs for the proposer.
target_token_ids = torch.randint(0,
vocab_size, (total_tokens, ),
device=device)
target_positions = torch.cat([
torch.arange(seq_len_1, device=device),
torch.arange(seq_len_2, device=device)
])
target_hidden_states = torch.randn(total_tokens,
hidden_size,
device=device)
next_token_ids = torch.randint(0,
vocab_size, (batch_size, ),
dtype=torch.int32,
device=device)
batch_spec = BatchSpec(
seq_lens=seq_lens,
query_lens=seq_lens,
)
common_attn_metadata = create_common_attn_metadata(
batch_spec,
block_size=16,
device=device,
)
sampling_metadata = mock.MagicMock()
# Propose draft tokens.
result = proposer.propose(target_token_ids=target_token_ids,
target_positions=target_positions,
target_hidden_states=target_hidden_states,
next_token_ids=next_token_ids,
last_token_indices=None,
common_attn_metadata=common_attn_metadata,
sampling_metadata=sampling_metadata)
assert result.shape == (batch_size, num_speculative_tokens)
# The tokens are expected to be consecutive integers starting
# from the base token IDs.
expected_tokens = base_token_ids[:, None] + torch.arange(
num_speculative_tokens, dtype=torch.int64, device=device)
# Verify that the draft tokens match our expectations.
assert torch.equal(result, expected_tokens)