[Doc] Add video example to openai client for multimodal (#11521)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Isotr0py 2024-12-27 01:31:29 +08:00 committed by GitHub
parent eec906d811
commit b85a977822
2 changed files with 114 additions and 11 deletions


@@ -294,12 +294,58 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
### Video
Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).
You can use [these tests](gh-file:entrypoints/openai/test_video.py) as a reference.
First, launch the OpenAI-compatible server:
```bash
vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192
```
Then, you can use the OpenAI client as follows:
```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Query the server for the model it is serving
models = client.models.list()
model = models.data[0].id

video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"

## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:", result)
```
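If the server cannot reach the URL itself, you can instead embed the video as a base64 data URL, as the full example below also does. Here is a minimal sketch, assuming the same `client`, `model`, and `video_url` as above and using `requests` to fetch the bytes:

```python
import base64

import requests

# Download the video and encode it as base64 (assumes `video_url` from above)
video_base64 = base64.b64encode(requests.get(video_url).content).decode("utf-8")

# Pass the encoded video as a data URL in place of the remote link
chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": f"data:video/mp4;base64,{video_base64}"
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

print("Chat completion output from base64 encoded video:",
      chat_completion_from_base64.choices[0].message.content)
```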
Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
````{note}
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable:
```console
$ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
```
````

examples/openai_chat_completion_client_for_multimodal.py

@@ -18,7 +18,6 @@ import base64
import requests
from openai import OpenAI
from vllm.utils import FlexibleArgumentParser
# Modify OpenAI's API key and API base to use vLLM's API server.
@@ -151,8 +150,66 @@ def run_multi_image() -> None:
print("Chat completion output:", result)
# Video input inference
def run_video() -> None:
    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
    video_base64 = encode_base64_content_from_url(video_url)

    ## Use video url in the payload
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?"
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": video_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from video url:", result)

    ## Use base64 encoded video in the payload
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?"
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": f"data:video/mp4;base64,{video_base64}"
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded video:", result)
# Audio input inference
def run_audio() -> None:
    from vllm.assets.audio import AudioAsset

    audio_url = AudioAsset("winning_call").url
    audio_base64 = encode_base64_content_from_url(audio_url)
@@ -240,6 +297,7 @@ example_function_map = {
    "text-only": run_text_only,
    "single-image": run_single_image,
    "multi-image": run_multi_image,
    "video": run_video,
    "audio": run_audio,
}
@@ -253,12 +311,11 @@ if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using OpenAI client for online inference with '
        'multimodal language models served with vLLM.')
    parser.add_argument('--chat-type',
                        '-c',
                        type=str,
                        default="single-image",
                        choices=list(example_function_map.keys()),
                        help='Conversation type with multimodal data.')
    args = parser.parse_args()

    main(args)
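With `run_video` registered in `example_function_map`, the new conversation type can be selected from the command line. Assuming the server from the documentation section above is already running, an invocation would look like:

```bash
python examples/openai_chat_completion_client_for_multimodal.py --chat-type video
```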