[Doc] Add video example to openai client for multimodal (#11521)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
This commit is contained in:
parent eec906d811
commit b85a977822

@@ -294,12 +294,58 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>

### Video

Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).

You can use [these tests](gh-file:entrypoints/openai/test_video.py) as reference.

First, launch the OpenAI-compatible server:

```bash
vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192
```

Then, you can use the OpenAI client as follows:
```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Use the same model name that was passed to `vllm serve` above.
model = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"

## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:", result)
```
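
The full example linked below also sends the same video as base64-encoded data instead of a URL. The sketch below shows that variant, reusing the `client`, `model`, and `video_url` from above; the inline download-and-encode step here stands in for the small helper the full example uses.

```python
import base64

import requests

# Download the video and base64-encode it so it can be sent inline
# as a data URI instead of a remote URL.
response = requests.get(video_url)
response.raise_for_status()
video_base64 = base64.b64encode(response.content).decode("utf-8")

## Use base64 encoded video in the payload
chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": f"data:video/mp4;base64,{video_base64}"
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

print("Chat completion output from base64 encoded video:",
      chat_completion_from_base64.choices[0].message.content)
```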

Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>

````{note}
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable:

```console
$ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
```
````

@@ -18,7 +18,6 @@ import base64
import requests
from openai import OpenAI

from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser

# Modify OpenAI's API key and API base to use vLLM's API server.

@@ -151,8 +150,66 @@ def run_multi_image() -> None:
    print("Chat completion output:", result)


# Video input inference
def run_video() -> None:
    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
    video_base64 = encode_base64_content_from_url(video_url)

    ## Use video url in the payload
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?"
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": video_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from video url:", result)

    ## Use base64 encoded video in the payload
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?"
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": f"data:video/mp4;base64,{video_base64}"
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded video:", result)


# Audio input inference
def run_audio() -> None:
    from vllm.assets.audio import AudioAsset

    audio_url = AudioAsset("winning_call").url
    audio_base64 = encode_base64_content_from_url(audio_url)

@@ -240,6 +297,7 @@ example_function_map = {
    "text-only": run_text_only,
    "single-image": run_single_image,
    "multi-image": run_multi_image,
    "video": run_video,
    "audio": run_audio,
}

@@ -253,12 +311,11 @@ if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using OpenAI client for online inference with '
        'multimodal language models served with vLLM.')
    parser.add_argument(
        '--chat-type',
        '-c',
        type=str,
        default="single-image",
        choices=["text-only", "single-image", "multi-image", "audio"],
        help='Conversation type with multimodal data.')
    parser.add_argument('--chat-type',
                        '-c',
                        type=str,
                        default="single-image",
                        choices=list(example_function_map.keys()),
                        help='Conversation type with multimodal data.')
    args = parser.parse_args()
    main(args)
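
With `"video"` now registered in `example_function_map` and the argparse choices derived from that map, the new video path can be exercised from the command line. For illustration, assuming the server launched above is still running:

```bash
python examples/openai_chat_completion_client_for_multimodal.py --chat-type video
```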