mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-07 19:57:08 +08:00
Signed-off-by: Nick Hill <nhill@redhat.com> Signed-off-by: Lucas Kabela <lucaskabela@meta.com> Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Signed-off-by: Andrew Sansom <andrew@protopia.ai> Signed-off-by: Boyuan Feng <boyuan@meta.com> Signed-off-by: Boyuan Feng <fby.1994@gmail.com> Signed-off-by: boyuanfeng <boyuan@meta.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: JartX <sagformas@epdcenter.es> Signed-off-by: Chendi Xue <Chendi.Xue@intel.com> Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> Signed-off-by: Manoel Marques <manoel.marques@ibm.com> Signed-off-by: Manoel Marques <manoelmrqs@gmail.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: pengdrumli <pengdrumli@tencent.com> Signed-off-by: windsonsea <haifeng.yao@daocloud.io> Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: Huamin Li <3ericli@gmail.com> Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com> Signed-off-by: Rahul Tuli <rtuli@redhat.com> Signed-off-by: Yang <lymailforjob@gmail.com> Signed-off-by: Debolina Roy <debroy@redhat.com> Signed-off-by: David Chen <530634352@qq.com> Signed-off-by: wangzi <3220100013@zju.edu.cn> Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com> Signed-off-by: NickLucche <nlucches@redhat.com> Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com> Signed-off-by: Sara Kokkila Schumacher <saraks@ibm.com> Signed-off-by: Csrayz <jover@cmbchina.com> Signed-off-by: ivyilike <pww123@cmbchina.com> Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com> Signed-off-by: Bowen Wang <abmfy@icloud.com> Signed-off-by: qqma <qqma@amazon.com> Signed-off-by: ElizaWszola 
<ewszola@redhat.com> Signed-off-by: Lu Fang <fanglu@fb.com> Signed-off-by: Zhuohan Li <zhuohan123@gmail.com> Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: luka <lgovedic@redhat.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Signed-off-by: Or Ozeri <oro@il.ibm.com> Signed-off-by: Johnny Yang <johnnyyang@google.com> Signed-off-by: Alec Solder <alecs@fb.com> Signed-off-by: Alec S <10566873+alecsolder@users.noreply.github.com> Signed-off-by: Russell Bryant <rbryant@redhat.com> Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> Signed-off-by: Alexander Matveev <amatveev@redhat.com> Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com> Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Signed-off-by: Ming Yang <minos.future@gmail.com> Signed-off-by: Zhikaiiii <1658973216@qq.com> Signed-off-by: Andreas Hartel <andreas.hartel@aleph-alpha.com> Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> Signed-off-by: wuxibin <wuxibin@bytedance.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Signed-off-by: Peter Pan <Peter.Pan@daocloud.io> Signed-off-by: Peter Pan <peter.pan@daocloud.io> Signed-off-by: Nicolò Lucchesi<nicolo.lucchesi@gmail.com> Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Signed-off-by: Sage Moore <sage@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com> Signed-off-by: Bill Nell <bnell@redhat.com> Signed-off-by: Shreeasish Kumar <shreeasish@rivosinc.com> Signed-off-by: Weida Hong <wdhongtw@google.com> Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> 
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com> Signed-off-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> Signed-off-by: Amir Samani <asamani@nvidia.com> Signed-off-by: ElizaWszola <elizaw.9289@gmail.com> Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Signed-off-by: ilmarkov <markovilya197@gmail.com> Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com> Signed-off-by: rouchenzi <ruochenwen@gmail.com> Signed-off-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com> Signed-off-by: Andrew Xia <axia@meta.com> Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com> Signed-off-by: Corey Lowman <clowman1993@gmail.com> Signed-off-by: jpvillam <jpvillam@amd.com> Signed-off-by: dougbtv <dosmith@redhat.com> Signed-off-by: Chenxi Yang <cxyang@fb.com> Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Signed-off-by: ahao-anyscale <ahao@anyscale.com> Signed-off-by: Yan Lu <luyan@nvidia.com> Signed-off-by: baxingpiaochong <771405853@qq.com> Signed-off-by: Kyle Sayers <kylesayrs@gmail.com> Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com> Signed-off-by: Yong Hoon Shin <yhshin@meta.com> Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai> Signed-off-by: Benjamin Chislett <bchislett@nvidia.com> Signed-off-by: Ben Browning <bbrownin@redhat.com> Signed-off-by: Chengji Yao <chengjiyao@google.com> Signed-off-by: jiang1.li <jiang1.li@intel.com> Signed-off-by: Jackmin801 <ongjackm@gmail.com> Signed-off-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com> Signed-off-by: taohui <taohui3@gmail.com> Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io> Signed-off-by: Shu Wang <shuw@nvidia.com> Signed-off-by: Shu Wang. 
<shuw@nvidia.com> Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Signed-off-by: Duncan Moss <djm.moss@gmail.com> Signed-off-by: Shiyan Deng <dsy842974287@meta.com> Signed-off-by: Wei Wei <wwei6@meta.com> Signed-off-by: Saman Keon <samanamp@outlook.com> Signed-off-by: yangxurui <yangxurui@meituan.com> Signed-off-by: nicole-lihui <nicole.li@daocloud.io> Signed-off-by: courage17340 <courage17340@163.com> Signed-off-by: Jacob Kahn <jacobkahn1@gmail.com> Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com> Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai> Signed-off-by: zxw <1020938856@qq.com> Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Signed-off-by: chenlang <chen.lang5@zte.com.cn> Signed-off-by: Jonas Kuebler <kuebj@amazon.com> Signed-off-by: AlonKejzman <alonkeizman@gmail.com> Signed-off-by: Tao Hui <taohui3@gmail.com> Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com> Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Signed-off-by: Aleksandr Malyshev <maleksan@amd.com> Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com> Signed-off-by: Eugene Khvedchenya <ekhvedchenya@gmail.com> Signed-off-by: yiting.jiang <yiting.jiang@daocloud.io> Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com> Signed-off-by: Iceber Gu <caiwei95@hotmail.com> Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com> Signed-off-by: Icey <1790571317@qq.com> Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com> Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: Nick Hill <nhill@redhat.com> Co-authored-by: Lucas Kabela <lucasakabela@gmail.com> Co-authored-by: Maximilien de Bayser <mbayser@br.ibm.com> Co-authored-by: Andrew Sansom <andrew@protopia.ai> Co-authored-by: Boyuan Feng <boyuan@meta.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> 
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: JartX <sagformas@epdcenter.es> Co-authored-by: Chendi.Xue <chendi.xue@intel.com> Co-authored-by: Chauncey <chaunceyjiang@gmail.com> Co-authored-by: xin.li <xin.li@daocloud.io> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Wenlong Wang <wangwenlong2755@gmail.com> Co-authored-by: Manoel Marques <manoelmrqs@gmail.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: lirong <56789630+lirong-lirong@users.noreply.github.com> Co-authored-by: Michael Yao <haifeng.yao@daocloud.io> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: Huamin Li <3ericli@gmail.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com> Co-authored-by: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com> Co-authored-by: Rahul Tuli <rtuli@redhat.com> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Yang Liu <127183760+KKSK-DON@users.noreply.github.com> Co-authored-by: Deboleina <debroy@redhat.com> Co-authored-by: yinz-aizip <yinz@aizip.ai> Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Co-authored-by: wangzi <3220100013@zju.edu.cn> Co-authored-by: Eldar Kurtić <8884008+eldarkurtic@users.noreply.github.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com> Co-authored-by: Yizhou <136800916+yiz-liu@users.noreply.github.com> Co-authored-by: Sara-KS <50249410+Sara-KS@users.noreply.github.com> Co-authored-by: Csrayz <jover@cmbchina.com> Co-authored-by: ivyilike <pww123@cmbchina.com> Co-authored-by: Burkhard Ringlein <ngl@zurich.ibm.com> Co-authored-by: Bowen Wang <abmfy@icloud.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Daisy-Ma-coder 
<daisy.ma.0117@gmail.com> Co-authored-by: qqma <qqma@amazon.com> Co-authored-by: ElizaWszola <ewszola@redhat.com> Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> Co-authored-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Or Ozeri <oro@il.ibm.com> Co-authored-by: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Co-authored-by: Chengji Yao <chengjiyao@google.com> Co-authored-by: Alec S <10566873+alecsolder@users.noreply.github.com> Co-authored-by: Alec Solder <alecs@fb.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Matthew Bonanni <mbonanni@redhat.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Chris Bamford <chrisbam4d@gmail.com> Co-authored-by: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com> Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com> Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com> Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Co-authored-by: Ming Yang <yming@meta.com> Co-authored-by: Zhikaiiii <55917203+Zhikaiiii@users.noreply.github.com> Co-authored-by: Andreas Hartel <andreas@hartel.me> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com> Co-authored-by: Joel <wuxibin89@163.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Mark McLoughlin <markmc@redhat.com> Co-authored-by: Peter Pan <peter.pan@daocloud.io> Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com> Co-authored-by: Fanli Lin <fanli.lin@intel.com> Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Lucas Wilkinson 
<LucasWilkinson@users.noreply.github.com> Co-authored-by: Sage Moore <sage@neuralmagic.com> Co-authored-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: bnellnm <49004751+bnellnm@users.noreply.github.com> Co-authored-by: rivos-shreeasish <shreeasish@rivosinc.com> Co-authored-by: Chih-Chieh Yang <chih.chieh.yang@ibm.com> Co-authored-by: Weida Hong <wdhongtw@gmail.com> Co-authored-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Co-authored-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> Co-authored-by: Amir Samani <samani@ualberta.ca> Co-authored-by: Luka Govedič <lgovedic@redhat.com> Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Co-authored-by: Ilya Markov <markovilya197@gmail.com> Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com> Co-authored-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com> Co-authored-by: Andrew Xia <axia@meta.com> Co-authored-by: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com> Co-authored-by: Corey Lowman <clowman1993@gmail.com> Co-authored-by: Juan Villamizar <100237675+jpvillam-amd@users.noreply.github.com> Co-authored-by: jpvillam <jpvillam@amd.com> Co-authored-by: Doug Smith <dosmith@redhat.com> Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu> Co-authored-by: Chenxi Yang <cxyang@fb.com> Co-authored-by: ahao-anyscale <ahao@anyscale.com> Co-authored-by: 0xNullPath <luyanfcp@foxmail.com> Co-authored-by: baxingpiaochong <771405853@qq.com> Co-authored-by: Benjamin Chislett <bchislett@nvidia.com> Co-authored-by: Kyle Sayers <kylesayrs@gmail.com> Co-authored-by: Nikhil Gupta <nikhil.gupta2@arm.com> Co-authored-by: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Co-authored-by: lhsjohn <huashuoli@tencent.com> Co-authored-by: Ben Browning <bbrownin@redhat.com> Co-authored-by: Li, Jiang <jiang1.li@intel.com> Co-authored-by: Jackmin801 
<56836461+Jackmin801@users.noreply.github.com> Co-authored-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com> Co-authored-by: Tao Hui <taohui3@gmail.com> Co-authored-by: rongfu.leng <rongfu.leng@daocloud.io> Co-authored-by: Shu Wang <shuw@nvidia.com> Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Duncan Moss <djm.moss@gmail.com> Co-authored-by: Shiyan Deng <dsy842974287@meta.com> Co-authored-by: Wei Wei <wwei6@meta.com> Co-authored-by: Saman A. Pour <samanamp@outlook.com> Co-authored-by: XuruiYang <530534756@qq.com> Co-authored-by: yangxurui <yangxurui@meituan.com> Co-authored-by: Nicole LiHui 🥜 <nicolelihui@outlook.com> Co-authored-by: courage17340 <courage17340@users.noreply.github.com> Co-authored-by: Jacob Kahn <jacobkahn1@gmail.com> Co-authored-by: Nicole LiHui 🥜 <nicole.li@daocloud.io> Co-authored-by: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com> Co-authored-by: yyzxw <34639446+yyzxw@users.noreply.github.com> Co-authored-by: wang.yuqi <noooop@126.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: chenlang <chen.lang5@zte.com.cn> Co-authored-by: chenlang <10346245@zte.com.cn> Co-authored-by: AlonKejzman <alonkeizman@gmail.com> Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com> Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Co-authored-by: Aleksandr Malyshev <maleksan@amd.com> Co-authored-by: Doug Lehr <douglehr@amd.com> Co-authored-by: Eugene Khvedchenya <ekhvedchenya@gmail.com> Co-authored-by: yitingdc <59356937+yitingdc@users.noreply.github.com> Co-authored-by: xaguilar-amd <xavier.aguilarfruto@amd.com> Co-authored-by: Iceber Gu <caiwei95@hotmail.com> Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com> Co-authored-by: Icey <1790571317@qq.com> Co-authored-by: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com> Co-authored-by: Chih-Chieh 
Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: RishiAstra <40644327+RishiAstra@users.noreply.github.com>
1384 lines
42 KiB
Python
1384 lines
42 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||
"""
|
||
This example shows how to use vLLM for running offline inference with
|
||
multi-image input on vision language models for text generation,
|
||
using the chat template defined by the model.
|
||
"""
|
||
|
||
import os
|
||
from argparse import Namespace
|
||
from dataclasses import asdict
|
||
from typing import NamedTuple, Optional
|
||
|
||
from huggingface_hub import snapshot_download
|
||
from PIL.Image import Image
|
||
from transformers import AutoProcessor, AutoTokenizer
|
||
|
||
from vllm import LLM, EngineArgs, SamplingParams
|
||
from vllm.lora.request import LoRARequest
|
||
from vllm.multimodal.utils import fetch_image
|
||
from vllm.utils import FlexibleArgumentParser
|
||
|
||
QUESTION = "What is the content of each image?"
|
||
IMAGE_URLS = [
|
||
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
|
||
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
|
||
"https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
|
||
"https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
|
||
"https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
|
||
"https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
|
||
"https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
|
||
"https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
|
||
"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
|
||
"https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
|
||
"https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
|
||
"https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
|
||
]
|
||
|
||
|
||
class ModelRequestData(NamedTuple):
    """Everything a model loader returns for one multi-image request.

    ``engine_args``, ``prompt`` and ``image_data`` are required; the
    remaining fields are optional and default to ``None``.
    """

    # Engine configuration for the selected model.
    engine_args: EngineArgs
    # Prompt text, including the model-specific image placeholders.
    prompt: str
    # Images that accompany the prompt.
    image_data: list[Image]
    # Token ids supplied by the loader as stop tokens, if any.
    stop_token_ids: Optional[list[int]] = None
    # Chat template string, if the loader provides one.
    chat_template: Optional[str] = None
    # LoRA requests, if the loader provides any.
    lora_requests: Optional[list[LoRARequest]] = None
|
||
|
||
|
||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||
# lower-end GPUs.
|
||
# Unless specified, these settings have been tested to work on a single L4.
|
||
|
||
|
||
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for the ``rhymes-ai/Aria`` model.

    Args:
        question: Text question appended after the image placeholders.
        image_urls: URLs of the images; one placeholder is emitted per URL.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the hand-built
        chat prompt, a fixed list of stop token ids and the fetched images.
    """
    num_images = len(image_urls)
    engine_args = EngineArgs(
        model="rhymes-ai/Aria",
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": num_images},
    )

    # One placeholder line per input image, placed ahead of the question.
    image_slots = "<fim_prefix><|img|><fim_suffix>\n" * num_images
    prompt = f"<|im_start|>user\n{image_slots}{question}<|im_end|>\n<|im_start|>assistant\n"

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
        image_data=images,
    )
|
||
|
||
|
||
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``CohereForAI/aya-vision-8b``.

    The prompt is rendered with the chat template of the model's
    ``AutoProcessor`` from a single user turn that lists every image
    followed by the question.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the rendered
        prompt and the fetched images.
    """
    model_name = "CohereForAI/aya-vision-8b"
    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, then the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_processor = AutoProcessor.from_pretrained(model_name)
    prompt = chat_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``CohereLabs/command-a-vision-07-2025``.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and the fetched images.
    """
    model_name = "CohereLabs/command-a-vision-07-2025"

    # NOTE: This model is too large for a single GPU and requires tensor
    # parallelism; tp=4 on H100 GPUs is the recommended setting.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, then the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_processor = AutoProcessor.from_pretrained(model_name)
    prompt = chat_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``deepseek-ai/deepseek-vl2-tiny``.

    The prompt is built by hand: one numbered ``image_<i>:<image>`` line per
    image (numbering starts at 1), followed by the question.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt and the
        fetched images.
    """
    engine_args = EngineArgs(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Numbered placeholder lines, concatenated without a separator.
    slot_lines = [f"image_{idx}:<image>\n" for idx in range(1, len(image_urls) + 1)]
    image_slots = "".join(slot_lines)
    prompt = f"<|User|>: {image_slots}{question}\n\n<|Assistant|>:"

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``google/gemma-3-4b-it``.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and the fetched images.
    """
    model_name = "google/gemma-3-4b-it"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, then the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_processor = AutoProcessor.from_pretrained(model_name)
    prompt = chat_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``h2oai/h2ovl-mississippi-800m``.

    Numbered ``Image-<i>: <image>`` placeholders (starting at 1) and the
    question form a single user message, which is rendered with the
    tokenizer's chat template.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the rendered
        prompt, the tokenizer's EOS id as the only stop token, and the
        fetched images.
    """
    model_name = "h2oai/h2ovl-mississippi-800m"
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Each entry already ends with a newline and entries are joined with
    # another one, exactly as the prompt format is defined here.
    slot_lines = [f"Image-{idx}: <image>\n" for idx in range(1, len(image_urls) + 1)]
    image_slots = "\n".join(slot_lines)
    conversation = [{"role": "user", "content": f"{image_slots}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    eos_only = [tokenizer.eos_token_id]

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=eos_only,
        image_data=images,
    )
|
||
|
||
|
||
def load_hyperclovax_seed_vision(
    question: str, image_urls: list[str]
) -> ModelRequestData:
    """Prepare a multi-image request for HyperCLOVAX-SEED-Vision-Instruct-3B.

    Each image entry of the user message carries empty ``ocr``,
    ``lens_keywords`` and ``lens_local_keywords`` fields in addition to the
    image URL. The message is rendered with the tokenizer's chat template.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the rendered
        prompt, no stop token ids, and the fetched images.
    """
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    # The tokenizer is loaded before the engine arguments are created.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=16384,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries first, then a single text entry with the question.
    user_content = [
        {
            "type": "image",
            "image": url,
            "ocr": "",
            "lens_keywords": "",
            "lens_local_keywords": "",
        }
        for url in image_urls
    ]
    user_content.append({"type": "text", "text": question})
    user_turn = {"role": "user", "content": user_content}

    prompt = tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=None,
        image_data=images,
    )
|
||
|
||
|
||
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``HuggingFaceM4/Idefics3-8B-Llama3``.

    The prompt is built by hand from numbered ``Image-<i>: <image>``
    placeholders (starting at 1) followed by the question.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt and the
        fetched images.
    """
    # If you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_kwargs = {"size": {"longest_edge": 2 * 364}}

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs=processor_kwargs,
    )

    slot_lines = [f"Image-{idx}: <image>\n" for idx in range(1, len(image_urls) + 1)]
    image_slots = "\n".join(slot_lines)
    prompt = f"<|begin_of_text|>User:{image_slots}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``internlm/Intern-S1``.

    Numbered ``Image-<i>: <IMG_CONTEXT>`` placeholders (starting at 1) and
    the question form a single user message, which is rendered with the
    tokenizer's chat template.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the rendered
        prompt and the fetched images.
    """
    model_name = "internlm/Intern-S1"
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    slot_lines = [
        f"Image-{idx}: <IMG_CONTEXT>\n" for idx in range(1, len(image_urls) + 1)
    ]
    image_slots = "\n".join(slot_lines)
    conversation = [{"role": "user", "content": f"{image_slots}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``OpenGVLab/InternVL2-2B``.

    Numbered ``Image-<i>: <image>`` placeholders (starting at 1) and the
    question form a single user message, which is rendered with the
    tokenizer's chat template.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the rendered
        prompt, the ids of the known stop tokens, and the fetched images.
    """
    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    # A token that is absent from this variant's vocabulary can map to
    # ``None``; drop such entries so only real token ids are passed on.
    stop_token_ids = [
        token_id
        for token_id in map(tokenizer.convert_tokens_to_ids, stop_tokens)
        if token_id is not None
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
|
||
|
||
|
||
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for Llama-4-Scout-17B-16E-Instruct.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and the fetched images.
    """
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=131072,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, then the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_processor = AutoProcessor.from_pretrained(model_name)
    prompt = chat_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``llava-hf/llava-1.5-7b-hf``.

    NOTE: CAUTION! The original Llava models weren't really trained on
    multi-image inputs, so responses for multi-image prompts will be poor.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and the fetched images.
    """
    model_name = "llava-hf/llava-1.5-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, then the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_processor = AutoProcessor.from_pretrained(model_name)
    prompt = chat_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``llava-hf/llava-v1.6-mistral-7b-hf``.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and the fetched images.
    """
    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, then the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_processor = AutoProcessor.from_pretrained(model_name)
    prompt = chat_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for llava-onevision-qwen2-7b-ov-hf.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to include in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and the fetched images.
    """
    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, then the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_processor = AutoProcessor.from_pretrained(model_name)
    prompt = chat_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Kwai-Keye/Keye-VL-8B-Preview``.

    Both the engine and the Hugging Face processor are created with
    ``trust_remote_code=True``.

    Args:
        question: Text placed after the image entries in the user turn.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Request data holding the engine arguments, the chat-templated prompt
        and ``fetch_image(url)`` for every URL.
    """
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": num_images},
    )

    # Image entries come first, then the question as a text entry.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    hf_processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = hf_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = []
    for url in image_urls:
        images.append(fetch_image(url))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Kwai-Keye/Keye-VL-1_5-8B``.

    Both the engine and the Hugging Face processor are created with
    ``trust_remote_code=True``.

    Args:
        question: Text placed after the image entries in the user turn.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Request data holding the engine arguments, the chat-templated prompt
        and ``fetch_image(url)`` for every URL.
    """
    model_name = "Kwai-Keye/Keye-VL-1_5-8B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    image_entries = [{"type": "image", "image": url} for url in image_urls]
    question_entry = {"type": "text", "text": question}
    conversation = [
        {"role": "user", "content": [*image_entries, question_entry]},
    ]

    hf_processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = hf_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
|
||
|
||
|
||
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``moonshotai/Kimi-VL-A3B-Instruct``.

    Both the engine and the Hugging Face processor are created with
    ``trust_remote_code=True``.

    Args:
        question: Text placed after the image entries in the user turn.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Request data holding the engine arguments, the chat-templated prompt
        and ``fetch_image(url)`` for every URL.
    """
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        limit_mm_per_prompt={"image": num_images},
    )

    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    hf_processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = hf_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request for ``mistralai/Mistral-Small-3.1-24B-Instruct-2503``.

    The prompt is assembled by hand: the question, a newline, then one
    ``[IMG]`` marker per image, all wrapped in ``[INST]...[/INST]``.

    Args:
        question: Text placed before the image markers.
        image_urls: Image URLs; one ``[IMG]`` marker is emitted per URL.

    Returns:
        Request data holding the engine arguments, the prompt and
        ``fetch_image(url)`` for every URL.
    """
    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
    num_images = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": num_images},
        ignore_patterns=["consolidated.safetensors"],
    )

    image_markers = "".join("[IMG]" for _ in image_urls)
    prompt = f"<s>[INST]{question}\n{image_markers}[/INST]"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``nvidia/NVLM-D-72B``.

    Each image contributes an ``Image-<n>: <image>`` line (numbered from 1);
    those lines and the question form a single user message that is rendered
    with the tokenizer's chat template.

    Args:
        question: Text placed after the numbered image lines.
        image_urls: Image URLs; one numbered line is emitted per URL.

    Returns:
        Request data holding the engine arguments, the rendered prompt and
        ``fetch_image(url)`` for every URL.
    """
    model_name = "nvidia/NVLM-D-72B"
    num_images = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    image_lines = [f"Image-{idx}: <image>\n" for idx in range(1, num_images + 1)]
    placeholder_text = "\n".join(image_lines)
    conversation = [{"role": "user", "content": f"{placeholder_text}\n{question}"}]

    hf_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = hf_tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
# Ovis
|
||
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``AIDC-AI/Ovis2-1B``.

    Each image contributes an ``Image-<n>: <image>`` line (numbered from 1);
    those lines and the question form a single user message that is rendered
    with the tokenizer's chat template.

    Args:
        question: Text placed after the numbered image lines.
        image_urls: Image URLs; one numbered line is emitted per URL.

    Returns:
        Request data holding the engine arguments, the rendered prompt and
        ``fetch_image(url)`` for every URL.
    """
    model_name = "AIDC-AI/Ovis2-1B"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": num_images},
    )

    image_lines = [f"Image-{idx}: <image>\n" for idx in range(1, num_images + 1)]
    placeholder_text = "\n".join(image_lines)
    conversation = [{"role": "user", "content": f"{placeholder_text}\n{question}"}]

    hf_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = hf_tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
# ovis2_5
|
||
def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``AIDC-AI/Ovis2.5-2B``.

    Each image contributes an ``Image-<n>: <image>`` line (numbered from 1);
    those lines and the question form a single user message that is rendered
    with the tokenizer's chat template.

    Args:
        question: Text placed after the numbered image lines.
        image_urls: Image URLs; one numbered line is emitted per URL.

    Returns:
        Request data holding the engine arguments, the rendered prompt and
        ``fetch_image(url)`` for every URL.
    """
    model_name = "AIDC-AI/Ovis2.5-2B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    numbered_lines = []
    for idx in range(1, len(image_urls) + 1):
        numbered_lines.append(f"Image-{idx}: <image>\n")
    placeholder_text = "\n".join(numbered_lines)
    conversation = [{"role": "user", "content": f"{placeholder_text}\n{question}"}]

    hf_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = hf_tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
|
||
|
||
|
||
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``mistral-community/pixtral-12b``.

    The prompt is assembled by hand: the question, a newline, then one
    ``[IMG]`` marker per image, all wrapped in ``[INST]...[/INST]``.

    Args:
        question: Text placed before the image markers.
        image_urls: Image URLs; one ``[IMG]`` marker is emitted per URL.

    Returns:
        Request data holding the engine arguments, the prompt and
        ``fetch_image(url)`` for every URL.
    """
    model_name = "mistral-community/pixtral-12b"
    num_images = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": num_images},
    )

    image_markers = "".join("[IMG]" for _ in image_urls)
    prompt = f"<s>[INST]{question}\n{image_markers}[/INST]"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``microsoft/Phi-3.5-vision-instruct``.

    The prompt is assembled by hand with one ``<|image_<n>|>`` tag per image
    (numbered from 1, newline-separated) ahead of the question.

    Args:
        question: Text placed after the image tags in the user turn.
        image_urls: Image URLs; one numbered tag is emitted per URL.

    Returns:
        Request data holding the engine arguments, the prompt and
        ``fetch_image(url)`` for every URL.
    """
    num_images = len(image_urls)

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"num_crops": 4},
    )

    image_tags = [f"<|image_{idx}|>" for idx in range(1, num_images + 1)]
    placeholder_text = "\n".join(image_tags)
    prompt = f"<|user|>\n{placeholder_text}\n{question}<|end|>\n<|assistant|>\n"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Build a multi-image request for Phi-4-multimodal-instruct.

    The model snapshot is downloaded first so that the ``vision-lora``
    sub-directory can be passed as an explicit LoRA request. The prompt uses
    one ``<|image_<n>|>`` tag per image (numbered from 1), followed directly
    by the question.
    """
    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    num_images = len(image_urls)

    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    image_tags = [f"<|image_{idx}|>" for idx in range(1, num_images + 1)]
    placeholder_text = "".join(image_tags)
    prompt = f"<|user|>{placeholder_text}{question}<|end|><|assistant|>"

    images = [fetch_image(url) for url in image_urls]
    vision_lora = LoRARequest("vision", 1, vision_lora_path)
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
        lora_requests=[vision_lora],
    )
|
||
|
||
|
||
def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Build a multi-image request for Phi-4-multimodal-instruct (``refs/pr/70``).

    The model snapshot at revision ``refs/pr/70`` is downloaded first so that
    the ``vision-lora`` sub-directory can be passed as an explicit LoRA
    request. The prompt uses one ``<|image|>`` tag per image, followed
    directly by the question.
    """
    model_path = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )
    num_images = len(image_urls)

    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    engine_args = EngineArgs(
        model=model_path,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    image_tags = "".join("<|image|>" for _ in image_urls)
    prompt = f"<|user|>{image_tags}{question}<|end|><|assistant|>"

    images = [fetch_image(url) for url in image_urls]
    vision_lora = LoRARequest("vision", 1, vision_lora_path)
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
        lora_requests=[vision_lora],
    )
|
||
|
||
|
||
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen-VL-Chat``.

    A ChatML template is supplied explicitly, both when rendering the prompt
    and in the returned request data, and three ChatML control tokens are
    converted to ids and returned as stop tokens.

    Args:
        question: Text placed after the ``Picture <n>: <img></img>`` lines.
        image_urls: Image URLs; one numbered picture line is emitted per URL.

    Returns:
        Request data holding the engine arguments, the rendered prompt, the
        stop token ids, ``fetch_image(url)`` for every URL and the template.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    num_images = len(image_urls)
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": num_images},
    )
    picture_lines = [
        f"Picture {idx}: <img></img>\n" for idx in range(1, num_images + 1)
    ]
    placeholder_text = "".join(picture_lines)

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    conversation = [{"role": "user", "content": f"{placeholder_text}\n{question}"}]
    prompt = hf_tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    stop_token_ids = []
    for token in ("<|endoftext|>", "<|im_start|>", "<|im_end|>"):
        stop_token_ids.append(hf_tokenizer.convert_tokens_to_ids(token))

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=images,
        chat_template=chat_template,
    )
|
||
|
||
|
||
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen2-VL-7B-Instruct``.

    When ``qwen_vl_utils.smart_resize`` can be imported, every fetched image
    is resized to the dimensions it returns for ``max_pixels=1024 * 28 * 28``
    and ``max_model_len`` is 4096. Otherwise a warning is printed, the images
    are used as fetched and ``max_model_len`` is 32768.

    Args:
        question: Text placed after the image entries in the user turn.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Request data holding the engine arguments, the chat-templated prompt
        and the (possibly resized) images.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    can_resize = smart_resize is not None
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096 if can_resize else 32768,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_content},
    ]

    hf_processor = AutoProcessor.from_pretrained(model_name)
    prompt = hf_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    # Fetch each image and, when possible, resize it right away.
    image_data = []
    for url in image_urls:
        image = fetch_image(url)
        if can_resize:
            width, height = image.size
            new_height, new_width = smart_resize(
                height, width, max_pixels=1024 * 28 * 28
            )
            image = image.resize((new_width, new_height))
        image_data.append(image)

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
|
||
|
||
|
||
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen2.5-VL-3B-Instruct``.

    When ``qwen_vl_utils.smart_resize`` can be imported, every fetched image
    is resized to the dimensions it returns for ``max_pixels=1024 * 28 * 28``
    and ``max_model_len`` is 4096. Otherwise a warning is printed, the images
    are used as fetched and ``max_model_len`` is 32768.

    Args:
        question: Text placed after the image entries in the user turn.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Request data holding the engine arguments, the chat-templated prompt
        and the (possibly resized) images.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    can_resize = smart_resize is not None
    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096 if can_resize else 32768,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    image_entries = [{"type": "image", "image": url} for url in image_urls]
    question_entry = {"type": "text", "text": question}
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [*image_entries, question_entry]},
    ]

    hf_processor = AutoProcessor.from_pretrained(model_name)
    prompt = hf_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    # Fetch each image and, when possible, resize it right away.
    image_data = []
    for url in image_urls:
        image = fetch_image(url)
        if can_resize:
            width, height = image.size
            new_height, new_width = smart_resize(
                height, width, max_pixels=1024 * 28 * 28
            )
            image = image.resize((new_width, new_height))
        image_data.append(image)

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
|
||
|
||
|
||
def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``YannQi/R-4B``.

    The Hugging Face processor is loaded with ``trust_remote_code=True``.

    Args:
        question: Text placed after the image entries in the user turn.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Request data holding the engine arguments, the chat-templated prompt
        and ``fetch_image(url)`` for every URL.
    """
    model_name = "YannQi/R-4B"
    num_images = len(image_urls)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": num_images},
    )

    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    hf_processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = hf_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``HuggingFaceTB/SmolVLM2-2.2B-Instruct``.

    The prompt is assembled by hand: one ``Image-<n>: <image>`` line per image
    (numbered from 1) followed by the question, in a ``User:``/``Assistant:``
    layout.

    Args:
        question: Text placed after the numbered image lines.
        image_urls: Image URLs; one numbered line is emitted per URL.

    Returns:
        Request data holding the engine arguments, the prompt and
        ``fetch_image(url)`` for every URL.
    """
    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
    num_images = len(image_urls)

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={
            "max_image_size": {"longest_edge": 384},
        },
    )

    image_lines = [f"Image-{idx}: <image>\n" for idx in range(1, num_images + 1)]
    placeholder_text = "\n".join(image_lines)
    prompt = (
        f"<|im_start|>User:{placeholder_text}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``stepfun-ai/step3-fp8``.

    The prompt is assembled by hand with one ``<im_patch>`` marker per image
    placed directly before the question, and ends with an opened ``<think>``
    block in the assistant turn.

    Args:
        question: Text placed after the image markers in the user turn.
        image_urls: Image URLs; one ``<im_patch>`` marker is emitted per URL.

    Returns:
        Request data holding the engine arguments, the prompt and
        ``fetch_image(url)`` for every URL.
    """
    model_name = "stepfun-ai/step3-fp8"
    num_images = len(image_urls)

    # NOTE: Below are verified configurations for step3-fp8
    # on 8xH100 GPUs.
    engine_args = EngineArgs(
        model=model_name,
        max_num_batched_tokens=4096,
        gpu_memory_utilization=0.85,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": num_images},
        reasoning_parser="step3",
    )

    image_patches = "<im_patch>" * num_images
    prompt = (
        "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n "
        f"{image_patches}{question} <|EOT|><|BOT|"
        ">assistant\n<think>\n"
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
|
||
|
||
|
||
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``omni-research/Tarsier-7b``.

    The prompt is assembled by hand in a ``USER:``/``ASSISTANT:`` layout with
    one ``<image>`` marker per image ahead of the question.

    Args:
        question: Text placed on the line after the image markers.
        image_urls: Image URLs; one ``<image>`` marker is emitted per URL.

    Returns:
        Request data holding the engine arguments, the prompt and
        ``fetch_image(url)`` for every URL.
    """
    model_name = "omni-research/Tarsier-7b"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": num_images},
    )

    image_markers = "<image>" * num_images
    prompt = f"USER: {image_markers}\n{question}\n ASSISTANT:"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
|
||
|
||
|
||
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``omni-research/Tarsier2-Recap-7b``.

    The engine is configured with an ``hf_overrides`` entry that sets the
    architecture to ``Tarsier2ForConditionalGeneration``. The prompt is
    assembled by hand in ChatML form with one ``<|image_pad|>`` marker per
    image between ``<|vision_start|>`` and ``<|vision_end|>``.

    Args:
        question: Text placed after the vision section in the user turn.
        image_urls: Image URLs; one ``<|image_pad|>`` marker is emitted per URL.

    Returns:
        Request data holding the engine arguments, the prompt and
        ``fetch_image(url)`` for every URL.
    """
    model_name = "omni-research/Tarsier2-Recap-7b"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=32768,
        limit_mm_per_prompt={"image": num_images},
        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
    )

    image_pads = "<|image_pad|>" * num_images
    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n<|vision_start|>{image_pads}"
        f"<|vision_end|>{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
|
||
|
||
|
||
# GLM-4.5V
|
||
def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``zai-org/GLM-4.5V``.

    Args:
        question: Text placed after the image entries in the user turn.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Request data holding the engine arguments, the prompt rendered by the
        processor's chat template, and ``fetch_image(url)`` for every URL.
    """
    model_name = "zai-org/GLM-4.5V"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    hf_processor = AutoProcessor.from_pretrained(model_name)
    prompt = hf_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
|
||
|
||
|
||
# GLM-4.5V-FP8
|
||
def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``zai-org/GLM-4.5V-FP8``.

    Args:
        question: Text placed after the image entries in the user turn.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Request data holding the engine arguments, the prompt rendered by the
        processor's chat template, and ``fetch_image(url)`` for every URL.
    """
    model_name = "zai-org/GLM-4.5V-FP8"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    image_entries = [{"type": "image", "image": url} for url in image_urls]
    question_entry = {"type": "text", "text": question}
    conversation = [
        {"role": "user", "content": image_entries + [question_entry]},
    ]

    hf_processor = AutoProcessor.from_pretrained(model_name)
    prompt = hf_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = []
    for url in image_urls:
        images.append(fetch_image(url))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||
|
||
|
||
# Registry of loader functions keyed by the value accepted for `--model-type`.
# Each loader is called as `loader(question, image_urls)` and returns the
# request data for one model. The keys are also used as the CLI `choices`.
model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
    "command_a_vision": load_command_a_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
    "idefics3": load_idefics3,
    "interns1": load_interns1,
    "internvl_chat": load_internvl,
    "keye_vl": load_keye_vl,
    "keye_vl1_5": load_keye_vl1_5,
    "kimi_vl": load_kimi_vl,
    "llama4": load_llama4,
    "llava": load_llava,
    "llava-next": load_llava_next,
    "llava-onevision": load_llava_onevision,
    "mistral3": load_mistral3,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "ovis2_5": load_ovis2_5,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "phi4_multimodal": load_phi4_multimodal,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
    "rvl": load_r_vl,
    "smolvlm": load_smolvlm,
    "step3": load_step3,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
    "glm4_5v": load_glm4_5v,
    "glm4_5v_fp8": load_glm4_5v_fp8,
}
|
||
|
||
|
||
def run_generate(
    model: str, question: str, image_urls: list[str], seed: Optional[int]
):
    """Run the demo through ``LLM.generate`` with a pre-rendered prompt.

    Args:
        model: Key into ``model_example_map`` selecting the loader to use.
        question: Question text handed to the loader.
        image_urls: Image URLs handed to the loader.
        seed: Seed merged into the engine arguments used to construct ``LLM``.

    Raises:
        KeyError: If ``model`` is not a key of ``model_example_map``.
    """
    req_data = model_example_map[model](question, image_urls)

    # Use the `seed` parameter, not the script-level `args` namespace: the
    # global only exists when this file is executed as a script, and reading
    # it would silently ignore the value the caller passed in.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    # Greedy decoding, capped at 256 new tokens, with loader-provided stops.
    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
        lora_request=req_data.lora_requests,
    )

    # Print the first completion of every output, separated by rules.
    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
|
||
|
||
|
||
def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
    """Run the demo through ``LLM.chat`` with OpenAI-style message content.

    The loader's engine arguments are reused, but its prompt is not: the
    question and the image URLs are sent as a single user message instead.

    Args:
        model: Key into ``model_example_map`` selecting the loader to use.
        question: Question text, sent as the first content entry.
        image_urls: Image URLs, each sent as an ``image_url`` content entry.
        seed: Seed merged into the engine arguments used to construct ``LLM``.
    """
    req_data = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory
    configured_limits = dict(req_data.engine_args.limit_mm_per_prompt or {})
    req_data.engine_args.limit_mm_per_prompt = {
        "image": 0,
        "video": 0,
        "audio": 0,
        **configured_limits,
    }

    llm = LLM(**{**asdict(req_data.engine_args), "seed": seed})

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    # The text entry comes first, followed by one entry per image URL.
    user_content = [{"type": "text", "text": question}]
    for image_url in image_urls:
        user_content.append({"type": "image_url", "image_url": {"url": image_url}})

    outputs = llm.chat(
        [{"role": "user", "content": user_content}],
        sampling_params=sampling_params,
        chat_template=req_data.chat_template,
        lora_request=req_data.lora_requests,
    )

    separator = "-" * 50
    print(separator)
    for request_output in outputs:
        print(request_output.outputs[0].text)
        print(separator)
|
||
|
||
|
||
def parse_args():
    """Parse the command-line options of the multi-image demo.

    Options: ``--model-type``/``-m`` (a key of ``model_example_map``, default
    ``phi3_v``), ``--method`` (``generate`` or ``chat``, default ``generate``),
    ``--seed`` (default ``None``) and ``--num-images``/``-n`` (between 1 and
    ``len(IMAGE_URLS)``, default 2).

    Returns:
        The namespace produced by the parser.
    """
    description = (
        "Demo on using vLLM for offline inference with "
        "vision language models that support multi-image input for text "
        "generation"
    )
    arg_parser = FlexibleArgumentParser(description=description)

    arg_parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="phi3_v",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    arg_parser.add_argument(
        "--method",
        type=str,
        default="generate",
        choices=["generate", "chat"],
        help="The method to run in `vllm.LLM`.",
    )
    arg_parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    # The number of available URLs is the max number of images.
    allowed_image_counts = list(range(1, len(IMAGE_URLS) + 1))
    arg_parser.add_argument(
        "--num-images",
        "-n",
        type=int,
        choices=allowed_image_counts,
        default=2,
        help="Number of images to use for the demo.",
    )

    return arg_parser.parse_args()
|
||
|
||
|
||
def main(args: Namespace):
    """Dispatch to ``run_generate`` or ``run_chat`` based on ``args.method``.

    Args:
        args: Parsed options; ``model_type``, ``method``, ``seed`` and
            ``num_images`` are read from it.

    Raises:
        ValueError: If ``args.method`` is neither ``generate`` nor ``chat``.
    """
    model, method, seed = args.model_type, args.method, args.seed

    # Use only the first `num_images` demo URLs.
    image_urls = IMAGE_URLS[: args.num_images]

    if method == "generate":
        run_generate(model, QUESTION, image_urls, seed)
        return
    if method == "chat":
        run_chat(model, QUESTION, image_urls, seed)
        return
    raise ValueError(f"Invalid method: {method}")
|
||
|
||
|
||
# Script entry point: parse the command-line options and run the demo.
if __name__ == "__main__":
    # NOTE: `args` is bound at module scope here, so it is visible as a
    # global to every function in this file while running as a script.
    args = parse_args()
    main(args)
|