diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index 37216a5cfe574..5d515fbfb6716 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -38,11 +38,13 @@ client = OpenAI(
     base_url=openai_api_base,
 )
 
+headers = {"User-Agent": "vLLM Example Client"}
+
 
 def encode_base64_content_from_url(content_url: str) -> str:
     """Encode a content retrieved from a remote url to base64 format."""
 
-    with requests.get(content_url) as response:
+    with requests.get(content_url, headers=headers) as response:
         response.raise_for_status()
         result = base64.b64encode(response.content).decode("utf-8")
 
@@ -50,19 +52,19 @@ def encode_base64_content_from_url(content_url: str) -> str:
 
 
 # Text-only inference
-def run_text_only(model: str) -> None:
+def run_text_only(model: str, max_completion_tokens: int) -> None:
     chat_completion = client.chat.completions.create(
         messages=[{"role": "user", "content": "What's the capital of France?"}],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion.choices[0].message.content
-    print("Chat completion output:", result)
+    print("Chat completion output:\n", result)
 
 
 # Single-image input inference
-def run_single_image(model: str) -> None:
+def run_single_image(model: str, max_completion_tokens: int) -> None:
     ## Use image url in the payload
     image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
     chat_completion_from_url = client.chat.completions.create(
@@ -79,11 +81,11 @@ def run_single_image(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output from image url:", result)
+    print("Chat completion output from image url:\n", result)
 
     ## Use base64 encoded image in the payload
     image_base64 = encode_base64_content_from_url(image_url)
@@ -101,7 +103,7 @@ def run_single_image(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_base64.choices[0].message.content
@@ -109,7 +111,7 @@ def run_single_image(model: str) -> None:
 
 
 # Multi-image input inference
-def run_multi_image(model: str) -> None:
+def run_multi_image(model: str, max_completion_tokens: int) -> None:
     image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
     image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
     chat_completion_from_url = client.chat.completions.create(
@@ -130,15 +132,15 @@ def run_multi_image(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output:", result)
+    print("Chat completion output:\n", result)
 
 
 # Video input inference
-def run_video(model: str) -> None:
+def run_video(model: str, max_completion_tokens: int) -> None:
     video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
     video_base64 = encode_base64_content_from_url(video_url)
 
@@ -157,11 +159,11 @@ def run_video(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output from image url:", result)
+    print("Chat completion output from video url:\n", result)
 
     ## Use base64 encoded video in the payload
     chat_completion_from_base64 = client.chat.completions.create(
@@ -178,15 +180,15 @@ def run_video(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from base64 encoded image:", result)
+    print("Chat completion output from base64 encoded video:\n", result)
 
 
 # Audio input inference
-def run_audio(model: str) -> None:
+def run_audio(model: str, max_completion_tokens: int) -> None:
     from vllm.assets.audio import AudioAsset
 
     audio_url = AudioAsset("winning_call").url
@@ -211,11 +213,11 @@ def run_audio(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from input audio:", result)
+    print("Chat completion output from input audio:\n", result)
 
     # HTTP URL
     chat_completion_from_url = client.chat.completions.create(
@@ -235,11 +237,11 @@ def run_audio(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output from audio url:", result)
+    print("Chat completion output from audio url:\n", result)
 
     # base64 URL
     chat_completion_from_base64 = client.chat.completions.create(
@@ -259,14 +261,14 @@ def run_audio(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
     )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from base64 encoded audio:", result)
+    print("Chat completion output from base64 encoded audio:\n", result)
 
 
-def run_multi_audio(model: str) -> None:
+def run_multi_audio(model: str, max_completion_tokens: int) -> None:
     from vllm.assets.audio import AudioAsset
 
     # Two different audios to showcase batched inference.
@@ -300,11 +302,11 @@ def run_multi_audio(model: str) -> None:
             }
         ],
         model=model,
-        max_completion_tokens=64,
+        max_completion_tokens=max_completion_tokens,
    )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from input audio:", result)
+    print("Chat completion output from input audio:\n", result)
 
 
 example_function_map = {
@@ -330,13 +332,20 @@ def parse_args():
         choices=list(example_function_map.keys()),
         help="Conversation type with multimodal data.",
     )
+    parser.add_argument(
+        "--max-completion-tokens",
+        "-n",
+        type=int,
+        default=128,
+        help="Maximum number of tokens to generate for each completion.",
+    )
     return parser.parse_args()
 
 
 def main(args) -> None:
     chat_type = args.chat_type
     model = get_first_model(client)
-    example_function_map[chat_type](model)
+    example_function_map[chat_type](model, args.max_completion_tokens)
 
 
 if __name__ == "__main__":
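
For reviewers, a minimal standalone sketch of the argument plumbing this diff installs: every example runner now receives a CLI-supplied token budget instead of a hard-coded `max_completion_tokens=64`. The `"text"` map key, the `-c` short flag, and the stub runner below are illustrative assumptions, not part of the change:

```python
# Sketch of the new call path. Names mirror the example file; the stub
# runner stands in for run_text_only and friends.
import argparse


def run_text_only(model: str, max_completion_tokens: int) -> None:
    # Stand-in for the real runner: the diff threads the CLI-supplied
    # budget through every example function instead of hard-coding 64.
    print(f"{model}: up to {max_completion_tokens} completion tokens")


example_function_map = {"text": run_text_only}


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--chat-type",
        "-c",
        default="text",
        choices=list(example_function_map.keys()),
    )
    parser.add_argument(
        "--max-completion-tokens",
        "-n",
        type=int,
        default=128,
        help="Maximum number of tokens to generate for each completion.",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    # e.g. `python sketch.py -c text -n 256`
    example_function_map[args.chat_type]("stub-model", args.max_completion_tokens)
```

Note that the new default of 128 doubles the previously fixed budget of 64, and the `-n` short form makes it easy to vary per run.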