[Doc] Add video example to openai client for multimodal (#11521)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Commit b85a977822 (parent eec906d811)
@@ -294,12 +294,58 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>

### Video

Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).

First, launch the OpenAI-compatible server:

```bash
vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192
```

Then, you can use the OpenAI client as follows:

```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# The model name must match the one passed to `vllm serve` above.
model = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"

## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:", result)
```

Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>

````{note}
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable:

```console
$ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
```
````
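The snippet above passes the video by URL. The video can also be embedded directly in the request as a base64 data URL, the same way the full example's `run_video` does. Here is a minimal sketch, assuming the same server and model as above and a hypothetical local file `sample.mp4`:

```python
import base64

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
model = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

# Read a local video file (hypothetical path) and base64-encode its bytes.
with open("sample.mp4", "rb") as f:
    video_base64 = base64.b64encode(f.read()).decode("utf-8")

chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this video?"},
            {
                "type": "video_url",
                # Embed the encoded video as a data URL instead of an HTTP URL.
                "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

print(chat_completion_from_base64.choices[0].message.content)
```

Embedding the content means the server does not fetch the URL itself (so the fetch timeout in the note above does not apply), at the cost of a larger request payload.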
```diff
@@ -18,7 +18,6 @@ import base64
 import requests
 from openai import OpenAI
 
-from vllm.assets.audio import AudioAsset
 from vllm.utils import FlexibleArgumentParser
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
```
```diff
@@ -151,8 +150,66 @@ def run_multi_image() -> None:
     print("Chat completion output:", result)
 
 
+# Video input inference
+def run_video() -> None:
+    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+    video_base64 = encode_base64_content_from_url(video_url)
+
+    ## Use video url in the payload
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": video_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from video url:", result)
+
+    ## Use base64 encoded video in the payload
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": f"data:video/mp4;base64,{video_base64}"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from base64 encoded video:", result)
+
+
 # Audio input inference
 def run_audio() -> None:
+    from vllm.assets.audio import AudioAsset
+
     audio_url = AudioAsset("winning_call").url
     audio_base64 = encode_base64_content_from_url(audio_url)
 
```
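`run_video` relies on `encode_base64_content_from_url`, which is defined earlier in the example file and does not appear in this diff. A minimal sketch of such a helper, assuming it simply downloads the content with `requests` and base64-encodes the raw bytes (the actual implementation may differ):

```python
import base64

import requests


def encode_base64_content_from_url(content_url: str) -> str:
    """Fetch remote content over HTTP and return it as a base64 string."""
    with requests.get(content_url) as response:
        response.raise_for_status()
        return base64.b64encode(response.content).decode("utf-8")
```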
```diff
@@ -240,6 +297,7 @@ example_function_map = {
     "text-only": run_text_only,
     "single-image": run_single_image,
     "multi-image": run_multi_image,
+    "video": run_video,
     "audio": run_audio,
 }
 
```
```diff
@@ -253,12 +311,11 @@ if __name__ == "__main__":
     parser = FlexibleArgumentParser(
         description='Demo on using OpenAI client for online inference with '
         'multimodal language models served with vLLM.')
-    parser.add_argument(
-        '--chat-type',
-        '-c',
-        type=str,
-        default="single-image",
-        choices=["text-only", "single-image", "multi-image", "audio"],
-        help='Conversation type with multimodal data.')
+    parser.add_argument('--chat-type',
+                        '-c',
+                        type=str,
+                        default="single-image",
+                        choices=list(example_function_map.keys()),
+                        help='Conversation type with multimodal data.')
     args = parser.parse_args()
     main(args)
```
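With `"video"` registered in `example_function_map` and the `--chat-type` choices now derived from it, the new path can be exercised against a running server. An illustrative invocation, assuming the LLaVA-OneVision server from the documentation section above is already serving:

```console
$ python examples/openai_chat_completion_client_for_multimodal.py --chat-type video
```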