diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index cffd093c983ac..2707d46f46e2a 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""An example showing how to use vLLM to serve multimodal models 
+"""An example showing how to use vLLM to serve multimodal models
 and run online serving with OpenAI client.
 
 Launch the vLLM server with the following command:
@@ -12,12 +12,18 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
     --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
 
 (audio inference with Ultravox)
-vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
+vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
+    --max-model-len 4096 --trust-remote-code
+
+run the script with
+python openai_chat_completion_client_for_multimodal.py --chat-type audio
 """
+
 import base64
 
 import requests
 from openai import OpenAI
+from utils import get_first_model
 
 from vllm.utils import FlexibleArgumentParser
 
@@ -31,9 +37,6 @@ client = OpenAI(
     base_url=openai_api_base,
 )
 
-models = client.models.list()
-model = models.data[0].id
-
 
 def encode_base64_content_from_url(content_url: str) -> str:
     """Encode a content retrieved from a remote url to base64 format."""
@@ -46,7 +49,7 @@ def encode_base64_content_from_url(content_url: str) -> str:
 
 
 # Text-only inference
-def run_text_only() -> None:
+def run_text_only(model: str) -> None:
     chat_completion = client.chat.completions.create(
         messages=[{
             "role": "user",
@@ -61,7 +64,7 @@ def run_text_only() -> None:
 
 
 # Single-image input inference
-def run_single_image() -> None:
+def run_single_image(model: str) -> None:
     ## Use image url in the payload
     image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
@@ -117,7 +120,7 @@ def run_single_image() -> None:
 
 
 # Multi-image input inference
-def run_multi_image() -> None:
+def run_multi_image(model: str) -> None:
     image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
     image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
     chat_completion_from_url = client.chat.completions.create(
@@ -152,7 +155,7 @@ def run_multi_image() -> None:
 
 
 # Video input inference
-def run_video() -> None:
+def run_video(model: str) -> None:
     video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
     video_base64 = encode_base64_content_from_url(video_url)
 
@@ -208,7 +211,7 @@ def run_video() -> None:
 
 
 # Audio input inference
-def run_audio() -> None:
+def run_audio(model: str) -> None:
     from vllm.assets.audio import AudioAsset
 
     audio_url = AudioAsset("winning_call").url
@@ -318,7 +321,8 @@ def parse_args():
 
 def main(args) -> None:
     chat_type = args.chat_type
-    example_function_map[chat_type]()
+    model = get_first_model(client)
+    example_function_map[chat_type](model)
 
 
 if __name__ == "__main__":
diff --git a/examples/online_serving/utils.py b/examples/online_serving/utils.py
new file mode 100644
index 0000000000000..4826e8e205282
--- /dev/null
+++ b/examples/online_serving/utils.py
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: Apache-2.0
+from openai import APIConnectionError, OpenAI
+from openai.pagination import SyncPage
+from openai.types.model import Model
+
+
+def get_first_model(client: OpenAI) -> str:
+    """
+    Get the first model from the vLLM server.
+    """
+    try:
+        models: SyncPage[Model] = client.models.list()
+    except APIConnectionError as e:
+        raise RuntimeError(
+            "Failed to get the list of models from the vLLM server at "
+            f"{client.base_url} with API key {client.api_key}. Check\n"
+            "1. the server is running\n"
+            "2. the server URL is correct\n"
+            "3. the API key is correct") from e
+
+    if len(models.data) == 0:
+        raise RuntimeError(
+            f"No models found on the vLLM server at {client.base_url}")
+
+    return models.data[0].id
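
For context, a minimal usage sketch (not part of the patch) of how an example script is expected to call the new get_first_model helper; the API key and base URL below are assumptions taken from the usual defaults in the vLLM online-serving examples, not values introduced by this diff.

from openai import OpenAI

from utils import get_first_model

# Assumed example defaults: vLLM ignores the API key, and a server launched as
# shown in the module docstring listens on http://localhost:8000/v1.
client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:8000/v1",
)

# Raises a descriptive RuntimeError if the server is unreachable or serves no
# models; otherwise returns the id of the first served model, which the run_*
# helpers then pass as the `model` argument to chat.completions.create().
model = get_first_model(client)
print(f"Serving model: {model}")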