From a21256c46327ec366b7804d22ba66ed04c2ae18b Mon Sep 17 00:00:00 2001
From: Fanli Lin
Date: Tue, 25 Nov 2025 14:03:20 +0800
Subject: [PATCH] Add TP CLI argument to multimodal inference examples (#29301)

Signed-off-by: Lin, Fanli
---
 examples/offline_inference/audio_language.py  | 15 +++++++
 examples/offline_inference/vision_language.py | 15 +++++++
 .../vision_language_multi_image.py            | 40 ++++++++++++++++---
 3 files changed, 65 insertions(+), 5 deletions(-)
 mode change 100644 => 100755 examples/offline_inference/audio_language.py
 mode change 100644 => 100755 examples/offline_inference/vision_language.py
 mode change 100644 => 100755 examples/offline_inference/vision_language_multi_image.py

diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
old mode 100644
new mode 100755
index 04e6f99f8957e..df6e96ca375fc
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -425,6 +425,13 @@ def parse_args():
         default=None,
         help="Set the seed when initializing `vllm.LLM`.",
     )
+    parser.add_argument(
+        "--tensor-parallel-size",
+        "-tp",
+        type=int,
+        default=None,
+        help="Tensor parallel size to override the model's default setting. ",
+    )
     return parser.parse_args()
 
 
@@ -434,6 +441,12 @@ def main(args):
     if model not in model_example_map:
         raise ValueError(f"Model type {model} is not supported.")
+    if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
+        raise ValueError(
+            f"tensor_parallel_size must be a positive integer, "
+            f"got {args.tensor_parallel_size}"
+        )
+
     audio_count = args.num_audios
     req_data = model_example_map[model](
         question_per_audio_count[audio_count], audio_count
     )
@@ -446,6 +459,8 @@ def main(args):
 
     engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    if args.tensor_parallel_size is not None:
+        engine_args["tensor_parallel_size"] = args.tensor_parallel_size
     llm = LLM(**engine_args)
 
     # We set temperature to 0.2 so that outputs can be different
 
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
old mode 100644
new mode 100755
index 65ea4df4a3099..8f72bf6f0b0d1
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -2064,6 +2064,13 @@ def parse_args():
         help="If True, will send all requests in a second batch with empty mm "
         "data to verify cache hits with UUIDs.",
     )
+    parser.add_argument(
+        "--tensor-parallel-size",
+        "-tp",
+        type=int,
+        default=None,
+        help="Tensor parallel size to override the model's default setting. ",
+    )
     return parser.parse_args()
 
 
@@ -2072,6 +2079,12 @@ def main(args):
     if model not in model_example_map:
         raise ValueError(f"Model type {model} is not supported.")
 
+    if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
+        raise ValueError(
+            f"tensor_parallel_size must be a positive integer, "
+            f"got {args.tensor_parallel_size}"
+        )
+
     modality = args.modality
     mm_input = get_multi_modal_input(args)
     data = mm_input["data"]
@@ -2089,6 +2102,8 @@ def main(args):
         "seed": args.seed,
         "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
     }
+    if args.tensor_parallel_size is not None:
+        engine_args["tensor_parallel_size"] = args.tensor_parallel_size
     llm = LLM(**engine_args)
 
     # Don't want to check the flag multiple times, so just hijack `prompts`.
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
old mode 100644
new mode 100755
index 301265d4e17f7..7ba4e64b567de
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1352,10 +1352,18 @@ model_example_map = {
 }
 
 
-def run_generate(model, question: str, image_urls: list[str], seed: int | None):
+def run_generate(
+    model,
+    question: str,
+    image_urls: list[str],
+    seed: int | None,
+    tensor_parallel_size: int | None,
+):
     req_data = model_example_map[model](question, image_urls)
 
-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    if tensor_parallel_size is not None:
+        engine_args["tensor_parallel_size"] = tensor_parallel_size
     llm = LLM(**engine_args)
 
     sampling_params = SamplingParams(
@@ -1378,7 +1386,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None):
     print("-" * 50)
 
 
-def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
+def run_chat(
+    model: str,
+    question: str,
+    image_urls: list[str],
+    seed: int | None,
+    tensor_parallel_size: int | None,
+):
     req_data = model_example_map[model](question, image_urls)
 
     # Disable other modalities to save memory
@@ -1388,6 +1402,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
     )
 
     engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    if tensor_parallel_size is not None:
+        engine_args["tensor_parallel_size"] = tensor_parallel_size
     llm = LLM(**engine_args)
 
     sampling_params = (
@@ -1463,6 +1479,13 @@ def parse_args():
         default=2,
         help="Number of images to use for the demo.",
     )
+    parser.add_argument(
+        "--tensor-parallel-size",
+        "-tp",
+        type=int,
+        default=None,
+        help="Tensor parallel size to override the model's default setting. ",
+    )
     return parser.parse_args()
 
 
@@ -1470,13 +1493,20 @@ def main(args: Namespace):
     model = args.model_type
     method = args.method
     seed = args.seed
+    tensor_parallel_size = args.tensor_parallel_size
+
+    if tensor_parallel_size is not None and tensor_parallel_size < 1:
+        raise ValueError(
+            f"tensor_parallel_size must be a positive integer, "
+            f"got {tensor_parallel_size}"
+        )
 
     image_urls = IMAGE_URLS[: args.num_images]
 
     if method == "generate":
-        run_generate(model, QUESTION, image_urls, seed)
+        run_generate(model, QUESTION, image_urls, seed, tensor_parallel_size)
     elif method == "chat":
-        run_chat(model, QUESTION, image_urls, seed)
+        run_chat(model, QUESTION, image_urls, seed, tensor_parallel_size)
     else:
         raise ValueError(f"Invalid method: {method}")
 
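
A minimal, self-contained sketch of the CLI plumbing this patch adds to all three scripts, runnable with only the Python standard library (no vLLM install or GPU needed). The model name and the parse_args(["-tp", "2"]) call are illustrative stand-ins; in the real scripts, engine_args comes from asdict(req_data.engine_args) and the override is merged before LLM(**engine_args):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--tensor-parallel-size",
    "-tp",
    type=int,
    default=None,
    help="Tensor parallel size to override the model's default setting.",
)
# Simulate invoking one of the example scripts as `script.py -tp 2`.
args = parser.parse_args(["-tp", "2"])

# Reject zero or negative values, as the patch does in each main().
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
    raise ValueError(
        f"tensor_parallel_size must be a positive integer, "
        f"got {args.tensor_parallel_size}"
    )

# Stand-in for asdict(req_data.engine_args) | {"seed": args.seed}.
engine_args = {"model": "some/model", "seed": 0}
# Only override the model's default TP setting when the flag was given.
if args.tensor_parallel_size is not None:
    engine_args["tensor_parallel_size"] = args.tensor_parallel_size
print(engine_args)
# {'model': 'some/model', 'seed': 0, 'tensor_parallel_size': 2}

Because the default is None, omitting -tp leaves the per-model tensor_parallel_size baked into each example's EngineArgs untouched, while e.g. `python examples/offline_inference/audio_language.py -tp 2` (other flags unchanged) forces two-way tensor parallelism.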