[Doc] Add video example to openai client for multimodal (#11521)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Commit b85a977822 (parent eec906d811)
@@ -294,12 +294,58 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>

### Video

Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).

First, launch the OpenAI-compatible server:

```bash
vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192
```

Then, you can use the OpenAI client as follows:

```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# The model name must match the one passed to `vllm serve` above.
model = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"

## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:", result)
```

Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>

````{note}
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable:

```console
$ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
```
````
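The snippet above passes the video by URL. The video can also be embedded directly in the request as a base64 data URL, the same way the full example's `run_video` does. Here is a minimal sketch, assuming the same server and model as above and a hypothetical local file `sample.mp4`:

```python
import base64

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
model = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

# Read a local video file (hypothetical path) and base64-encode its bytes.
with open("sample.mp4", "rb") as f:
    video_base64 = base64.b64encode(f.read()).decode("utf-8")

chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this video?"},
            {
                "type": "video_url",
                # Embed the encoded video as a data URL instead of an HTTP URL.
                "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

print(chat_completion_from_base64.choices[0].message.content)
```

Embedding the content means the server does not fetch the URL itself (so the fetch timeout in the note above does not apply), at the cost of a larger request payload.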
```diff
@@ -18,7 +18,6 @@ import base64
 import requests
 from openai import OpenAI
 
-from vllm.assets.audio import AudioAsset
 from vllm.utils import FlexibleArgumentParser
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
```
```diff
@@ -151,8 +150,66 @@ def run_multi_image() -> None:
     print("Chat completion output:", result)
 
 
+# Video input inference
+def run_video() -> None:
+    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+    video_base64 = encode_base64_content_from_url(video_url)
+
+    ## Use video url in the payload
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": video_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from video url:", result)
+
+    ## Use base64 encoded video in the payload
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": f"data:video/mp4;base64,{video_base64}"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from base64 encoded video:", result)
+
+
 # Audio input inference
 def run_audio() -> None:
+    from vllm.assets.audio import AudioAsset
+
     audio_url = AudioAsset("winning_call").url
     audio_base64 = encode_base64_content_from_url(audio_url)
 
```
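`run_video` relies on `encode_base64_content_from_url`, which is defined earlier in the example file and does not appear in this diff. A minimal sketch of such a helper, assuming it simply downloads the content with `requests` and base64-encodes the raw bytes (the actual implementation may differ):

```python
import base64

import requests


def encode_base64_content_from_url(content_url: str) -> str:
    """Fetch remote content over HTTP and return it as a base64 string."""
    with requests.get(content_url) as response:
        response.raise_for_status()
        return base64.b64encode(response.content).decode("utf-8")
```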
```diff
@@ -240,6 +297,7 @@ example_function_map = {
     "text-only": run_text_only,
     "single-image": run_single_image,
     "multi-image": run_multi_image,
+    "video": run_video,
     "audio": run_audio,
 }
 
```
```diff
@@ -253,12 +311,11 @@ if __name__ == "__main__":
     parser = FlexibleArgumentParser(
         description='Demo on using OpenAI client for online inference with '
         'multimodal language models served with vLLM.')
-    parser.add_argument(
-        '--chat-type',
-        '-c',
-        type=str,
-        default="single-image",
-        choices=["text-only", "single-image", "multi-image", "audio"],
-        help='Conversation type with multimodal data.')
+    parser.add_argument('--chat-type',
+                        '-c',
+                        type=str,
+                        default="single-image",
+                        choices=list(example_function_map.keys()),
+                        help='Conversation type with multimodal data.')
     args = parser.parse_args()
     main(args)
```
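With `"video"` registered in `example_function_map` and the `--chat-type` choices now derived from it, the new path can be exercised against a running server. An illustrative invocation, assuming the LLaVA-OneVision server from the documentation section above is already serving:

```console
$ python examples/openai_chat_completion_client_for_multimodal.py --chat-type video
```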