# Multimodal Inputs

This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM.

!!! note
    We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
    and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.

## Offline Inference

To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:

- `prompt`: The prompt should follow the format that is documented on HuggingFace.
- `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][].
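
For example, a basic single-image request combines these two fields. This is a minimal sketch; the model name and image path are illustrative, and fuller examples follow in the sections below:

```python
import PIL.Image

from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")

outputs = llm.generate({
    # `prompt` follows the format documented on the model's HuggingFace repo
    "prompt": "USER: <image>\nWhat is in this image?\nASSISTANT:",
    # `multi_modal_data` maps each modality name to its data item(s)
    "multi_modal_data": {"image": PIL.Image.open("/path/to/image.jpg")},
})
```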

### Stable UUIDs for Caching (multi_modal_uuids)

When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content.

??? code

    ```python
    from vllm import LLM
    from PIL import Image

    # Qwen2.5-VL example with two images
    llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")

    prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
    img_a = Image.open("/path/to/a.jpg")
    img_b = Image.open("/path/to/b.jpg")

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": [img_a, img_b]},
        # Provide stable IDs for caching.
        # Requirements (matched by this example):
        # - Include every modality present in multi_modal_data.
        # - For lists, provide the same number of entries.
        # - Use None to fall back to content hashing for that item.
        "multi_modal_uuids": {"image": ["sku-1234-a", None]},
    })

    for o in outputs:
        print(o.outputs[0].text)
    ```

Using UUIDs, you can also skip sending media data entirely if you expect cache hits for the respective items. Note that the request will fail if the skipped media doesn't have a corresponding UUID, or if the UUID fails to hit the cache.

??? code

    ```python
    from vllm import LLM
    from PIL import Image

    # Qwen2.5-VL example with two images
    llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")

    prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
    img_b = Image.open("/path/to/b.jpg")

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": [None, img_b]},
        # Since img_a is expected to be cached, we can skip sending the actual
        # image entirely.
        "multi_modal_uuids": {"image": ["sku-1234-a", None]},
    })

    for o in outputs:
        print(o.outputs[0].text)
    ```

!!! warning
    If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.

### Image Inputs

You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:

??? code

    ```python
    import PIL.Image

    from vllm import LLM

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

    # Refer to the HuggingFace repo for the correct format to use
    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

    # Load the image using PIL.Image
    image = PIL.Image.open(...)

    # Single prompt inference
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)

    # Batch inference
    image_1 = PIL.Image.open(...)
    image_2 = PIL.Image.open(...)
    outputs = llm.generate(
        [
            {
                "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
                "multi_modal_data": {"image": image_1},
            },
            {
                "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
                "multi_modal_data": {"image": image_2},
            },
        ]
    )

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```

Full example: <gh-file:examples/offline_inference/vision_language.py>

To substitute multiple images inside the same text prompt, you can pass in a list of images instead:

??? code

    ```python
    import PIL.Image

    from vllm import LLM

    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,  # Required to load Phi-3.5-vision
        max_model_len=4096,  # Otherwise, it may not fit in smaller GPUs
        limit_mm_per_prompt={"image": 2},  # The maximum number to accept
    )

    # Refer to the HuggingFace repo for the correct format to use
    prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"

    # Load the images using PIL.Image
    image1 = PIL.Image.open(...)
    image2 = PIL.Image.open(...)

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {
            "image": [image1, image2]
        },
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```

Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>

If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:

```python
import torch

from vllm import LLM
from vllm.assets.image import ImageAsset

llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image_url = "https://picsum.photos/id/32/512/512"
image_pil = ImageAsset('cherry_blossom').pil_image
image_embeds = torch.load(...)

conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {"url": image_url},
            },
            {
                "type": "image_pil",
                "image_pil": image_pil,
            },
            {
                "type": "image_embeds",
                "image_embeds": image_embeds,
            },
            {
                "type": "text",
                "text": "What's in these images?",
            },
        ],
    },
]

# Perform inference and log output.
outputs = llm.chat(conversation)

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:

??? code

    ```python
    from vllm import LLM

    def encode_image(frame) -> str:
        """Placeholder: encode a frame as base64 JPEG data (user-provided)."""
        ...

    # Specify the maximum number of frames per video to be 4. This can be changed.
    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})

    # Create the request payload.
    video_frames = ...  # Load your video, making sure it only has the number of frames specified earlier.
    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
        ],
    }
    for i in range(len(video_frames)):
        base64_image = encode_image(video_frames[i])  # base64 encoding.
        new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
        message["content"].append(new_image)

    # Perform inference and log output.
    outputs = llm.chat([message])

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```

#### Custom RGBA Background Color

When loading RGBA images (images with transparency), vLLM converts them to RGB format. By default, transparent pixels are replaced with a white background. You can customize this background color using the `rgba_background_color` parameter in `media_io_kwargs`.

??? code

    ```python
    from vllm import LLM

    # Default white background (no configuration needed)
    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

    # Custom black background for dark theme
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}},
    )

    # Custom brand color background (e.g., blue)
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}},
    )
    ```

!!! note
    - `rgba_background_color` accepts RGB values as a list `[R, G, B]` or tuple `(R, G, B)`, where each value is 0-255.
    - This setting only affects RGBA images with transparency; RGB images are unchanged.
    - If not specified, the default white background `(255, 255, 255)` is used for backward compatibility.

### Video Inputs

You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
instead of using multi-image input.
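
A minimal sketch of raw-array video input, assuming Qwen2.5-VL. The dummy clip follows the usual `(num_frames, height, width, 3)` RGB layout, and the prompt string is an assumption based on Qwen2-VL's documented video placeholder; verify it against the model card:

```python
import numpy as np

from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"video": 1})

# 8 RGB frames of 224x224; replace with real decoded frames
video = np.random.randint(0, 256, size=(8, 224, 224, 3), dtype=np.uint8)

outputs = llm.generate({
    "prompt": "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>Describe this video.<|im_end|>\n<|im_start|>assistant\n",
    "multi_modal_data": {"video": video},
})
```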

Instead of NumPy arrays, you can also pass `torch.Tensor` instances, as shown in this example using Qwen2.5-VL:

??? code

    ```python
    from transformers import AutoProcessor

    from vllm import LLM, SamplingParams
    from qwen_vl_utils import process_vision_info

    model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
    video_path = "https://content.pexels.com/videos/free-videos.mp4"

    llm = LLM(
        model=model_path,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
        limit_mm_per_prompt={"video": 1},
    )

    sampling_params = SamplingParams(
        max_tokens=1024,
    )

    video_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "text", "text": "Describe this video."},
            {
                "type": "video",
                "video": video_path,
                "total_pixels": 20480 * 28 * 28,
                "min_pixels": 16 * 28 * 28,
            },
        ]},
    ]

    messages = video_messages
    processor = AutoProcessor.from_pretrained(model_path)
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    image_inputs, video_inputs = process_vision_info(messages)
    mm_data = {}
    if video_inputs is not None:
        mm_data["video"] = video_inputs

    llm_inputs = {
        "prompt": prompt,
        "multi_modal_data": mm_data,
    }

    outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```

!!! note
    `process_vision_info` is only applicable to Qwen2.5-VL and similar models.

Full example: <gh-file:examples/offline_inference/vision_language.py>

### Audio Inputs

You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
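
A minimal sketch, assuming an Ultravox model and a local audio file decoded with librosa; the file path is a placeholder, and the prompt is elided because it depends on the model:

```python
import librosa

from vllm import LLM

llm = LLM(model="fixie-ai/ultravox-v0_5-llama-3_2-1b")

# librosa returns (audio_array, sampling_rate), matching the expected tuple form
audio, sr = librosa.load("/path/to/audio.wav", sr=None)

# Refer to the HuggingFace repo for the correct prompt format to use
prompt = ...

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"audio": (audio, sr)},
})
```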

Full example: <gh-file:examples/offline_inference/audio_language.py>

### Embedding Inputs

To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.

??? code

    ```python
    import torch

    from vllm import LLM

    # Inference with image embeddings as input
    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

    # Refer to the HuggingFace repo for the correct format to use
    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

    # Embeddings for single image
    # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
    image_embeds = torch.load(...)

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image_embeds},
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```

For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:

??? code

    ```python
    import torch

    from vllm import LLM

    # Construct the prompt based on your model
    prompt = ...

    # Embeddings for multiple images
    # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
    image_embeds = torch.load(...)

    # Qwen2-VL
    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
    mm_data = {
        "image": {
            "image_embeds": image_embeds,
            # image_grid_thw is needed to calculate positional encoding.
            "image_grid_thw": torch.load(...),  # torch.Tensor of shape (1, 3)
        }
    }

    # MiniCPM-V
    llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
    mm_data = {
        "image": {
            "image_embeds": image_embeds,
            # image_sizes is needed to calculate details of the sliced image.
            "image_sizes": [image.size for image in images],  # list of image sizes
        }
    }

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": mm_data,
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```

## Online Serving

Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs can also carry an optional user-provided UUID that uniquely identifies each media item, which is used to cache the media results across requests.

!!! important
    A chat template is **required** to use the Chat Completions API.
    For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.

    If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
    If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.

    For certain models, we provide alternative chat templates inside <gh-dir:examples>.
    For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
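
If you do need to supply a template manually, it is a flag on the serve command. A sketch using a placeholder model name with the VLM2Vec template mentioned above:

```bash
vllm serve <your-model> \
    --chat-template examples/template_vlm2vec.jinja
```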

### Image Inputs

Image input is supported according to [OpenAI Vision API](https://platform.openai.com/docs/guides/vision).
Here is a simple example using Phi-3.5-Vision.

First, launch the OpenAI-compatible server:

```bash
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
```

Then, you can use the OpenAI client as follows:

??? code

    ```python
    from openai import OpenAI

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Single-image input inference
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

    chat_response = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[{
            "role": "user",
            "content": [
                # NOTE: The prompt formatting with the image token `<image>` is not needed
                # since the prompt will be processed automatically by the API server.
                {"type": "text", "text": "What's in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                    "uuid": image_url  # Optional
                },
            ],
        }],
    )
    print("Chat completion output:", chat_response.choices[0].message.content)

    # Multi-image input inference
    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"

    chat_response = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "What are the animals in these images?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url_duck
                    },
                    "uuid": image_url_duck  # Optional
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url_lion
                    },
                    "uuid": image_url_lion  # Optional
                },
            ],
        }],
    )
    print("Chat completion output:", chat_response.choices[0].message.content)
    ```

Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

!!! tip
    Loading media from local file paths is also supported in vLLM: you can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
    and pass the file path as `url` in the API request.
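
A sketch of this flow, assuming local files are referenced with a `file://` URL; the directory and file names are placeholders:

```bash
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
    --trust-remote-code --max-model-len 4096 \
    --allowed-local-media-path /data/images
```

```python
# In the request, point the URL at a file under the allowed directory
{
    "type": "image_url",
    "image_url": {"url": "file:///data/images/photo.jpg"},
}
```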

!!! tip
    There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
    In fact, you can place image placeholders in the middle of the text by interleaving text and image content.

!!! note
    By default, the timeout for fetching images through an HTTP URL is `5` seconds.
    You can override this by setting the environment variable:

    ```bash
    export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
    ```

### Video Inputs

Instead of `image_url`, you can pass a video file via `video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).

First, launch the OpenAI-compatible server:

```bash
vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --runner generate --max-model-len 8192
```

Then, you can use the OpenAI client as follows:

??? code

    ```python
    from openai import OpenAI

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    model = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"

    ## Use video url in the payload
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?"
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": video_url
                    },
                    "uuid": video_url  # Optional
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from video url:", result)
    ```

Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

!!! note
    By default, the timeout for fetching videos through an HTTP URL is `30` seconds.
    You can override this by setting the environment variable:

    ```bash
    export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
    ```

#### Custom RGBA Background Color

To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`:

```bash
# Example: Black background for dark theme
vllm serve llava-hf/llava-1.5-7b-hf \
    --media-io-kwargs '{"image": {"rgba_background_color": [0, 0, 0]}}'

# Example: Custom gray background
vllm serve llava-hf/llava-1.5-7b-hf \
    --media-io-kwargs '{"image": {"rgba_background_color": [128, 128, 128]}}'
```

### Audio Inputs

Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
Here is a simple example using Ultravox-v0.5-1B.

First, launch the OpenAI-compatible server:

```bash
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
```

Then, you can use the OpenAI client as follows:

??? code

    ```python
    import base64

    import requests
    from openai import OpenAI

    from vllm.assets.audio import AudioAsset

    def encode_base64_content_from_url(content_url: str) -> str:
        """Encode content retrieved from a remote URL to base64 format."""

        with requests.get(content_url) as response:
            response.raise_for_status()
            result = base64.b64encode(response.content).decode('utf-8')

        return result

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    model = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

    # Any format supported by librosa is supported
    audio_url = AudioAsset("winning_call").url
    audio_base64 = encode_base64_content_from_url(audio_url)

    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": audio_base64,
                        "format": "wav"
                    },
                    "uuid": audio_url  # Optional
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from input audio:", result)
    ```

Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:

??? code

    ```python
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": audio_url
                    },
                    "uuid": audio_url  # Optional
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from audio url:", result)
    ```

Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

!!! note
    By default, the timeout for fetching audio through an HTTP URL is `10` seconds.
    You can override this by setting the environment variable:

    ```bash
    export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
    ```

### Embedding Inputs

To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.

#### Image Embedding Inputs

For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
The following example demonstrates how to pass image embeddings to the OpenAI server:

??? code

    ```python
    import base64
    import io

    import torch
    from openai import OpenAI

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    image_embedding = torch.load(...)
    grid_thw = torch.load(...)  # Required by Qwen/Qwen2-VL-2B-Instruct

    buffer = io.BytesIO()
    torch.save(image_embedding, buffer)
    buffer.seek(0)
    binary_data = buffer.read()
    base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')

    # Additional tensors (e.g. grid_thw, image_sizes) are base64-encoded
    # the same way as the embeddings.
    base64_image_grid_thw = ...
    base64_image_sizes = ...

    image_url = ...  # Any stable identifier for the image, used as its UUID

    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Basic usage - this is equivalent to the LLaVA example for offline inference
    model = "llava-hf/llava-1.5-7b-hf"
    embeds = {
        "type": "image_embeds",
        "image_embeds": f"{base64_image_embedding}",
        "uuid": image_url  # Optional
    }

    # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
    model = "Qwen/Qwen2-VL-2B-Instruct"
    embeds = {
        "type": "image_embeds",
        "image_embeds": {
            "image_embeds": f"{base64_image_embedding}",  # Required
            "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
        },
        "uuid": image_url  # Optional
    }
    model = "openbmb/MiniCPM-V-2_6"
    embeds = {
        "type": "image_embeds",
        "image_embeds": {
            "image_embeds": f"{base64_image_embedding}",  # Required
            "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
        },
        "uuid": image_url  # Optional
    }
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": [
                {
                    "type": "text",
                    "text": "What's in this image?",
                },
                embeds,
            ]},
        ],
        model=model,
    )
    ```

For Online Serving, you can also skip sending media if you expect cache hits with the provided UUIDs. You can do so by sending media like this:

```python
# Image/video/audio URL:
{
    "type": "image_url",
    "image_url": None,
    "uuid": image_uuid,
},

# image_embeds
{
    "type": "image_embeds",
    "image_embeds": None,
    "uuid": image_uuid,
},

# input_audio:
{
    "type": "input_audio",
    "input_audio": None,
    "uuid": audio_uuid,
},

# PIL Image:
{
    "type": "image_pil",
    "image_pil": None,
    "uuid": image_uuid,
}
```

!!! note
    Only one message can contain `{"type": "image_embeds"}`.
    If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.