diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6753800f19902..186c4354fbc1c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -198,6 +198,7 @@ steps: commands: # split the test to avoid interference - pytest -v -s v1/core + - pytest -v -s v1/entrypoints - pytest -v -s v1/engine - pytest -v -s v1/sample - pytest -v -s v1/worker @@ -225,10 +226,13 @@ steps: - python3 offline_inference/basic/chat.py - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/vision_language.py - - python3 offline_inference/vision_language_multi_image.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_embedding.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py @@ -530,7 +534,7 @@ steps: # TODO: investigate and fix # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" diff --git a/README.md b/README.md index bfab7faf598b6..f61b4218e1824 100644 --- 
a/README.md +++ b/README.md @@ -13,18 +13,9 @@ Easy, fast, and cheap LLM serving for everyone | Documentation | Blog | Paper | Twitter/X | Developer Slack |

---- - -We’re excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**! - -Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend). - -👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion! - ---- - *Latest News* 🔥 +- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). 
diff --git a/benchmarks/README.md b/benchmarks/README.md index c64c24fd3ad05..3225a4b0db3a0 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -82,10 +82,10 @@ Then run the benchmarking script # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" NUM_PROMPTS=10 -BACKEND="openai-chat" +BACKEND="vllm" DATASET_NAME="sharegpt" DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" -python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} +python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} ``` If successful, you will see the following output diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 6a7db920b5b63..09c8e23ebb1c3 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -14,7 +14,8 @@ from tqdm.asyncio import tqdm from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -from vllm.model_executor.model_loader.weight_utils import get_lock +# NOTE(simon): do not import vLLM here so the benchmark script +# can run without vLLM installed. AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -427,6 +428,8 @@ def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download + from vllm.model_executor.model_loader.weight_utils import get_lock + # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. 
with get_lock(pretrained_model_name_or_path): diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1dd01ca968678..47627126b6688 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -684,6 +684,15 @@ def main(args: argparse.Namespace): "Invalid metadata format. Please use KEY=VALUE format." ) + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", "output_lens", "ttfts", "itls", + "generated_texts", "errors" + ]: + if field in result_json: + del result_json[field] + # Traffic result_json["request_rate"] = (args.request_rate if args.request_rate < float("inf") else "inf") @@ -828,6 +837,12 @@ if __name__ == "__main__": action="store_true", help="Specify to save benchmark results to a json file", ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfs, tpots, etc.", + ) parser.add_argument( "--metadata", metavar="KEY=VALUE", diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 90f0b54d2f006..c500d00ea528e 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) { template __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { - union tmpcvt { + [[maybe_unused]] union tmpcvt { uint16_t u; _Float16 f; __hip_bfloat16 b; @@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { template __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1, const _B16x4& inp2) { - union tmpcvt { + [[maybe_unused]] union tmpcvt { uint16_t u; _Float16 f; __hip_bfloat16 b; @@ -1273,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int seq_idx = blockIdx.y; const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, 
PARTITION_SIZE); - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; - const int laneid = threadIdx.x % WARP_SIZE; + [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index 7e3b884c2ab1e..d3e375aec10cb 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -131,6 +131,8 @@ Building from source requires a lot of compilation. If you are building from sou For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. +When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built. + [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. 
::: diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 293b9fddac89e..02dbdcb64232f 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ import os +from dataclasses import asdict +from typing import NamedTuple, Optional from huggingface_hub import snapshot_download from transformers import AutoTokenizer -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.audio import AudioAsset from vllm.lora.request import LoRARequest from vllm.utils import FlexibleArgumentParser @@ -23,21 +25,31 @@ question_per_audio_count = { 2: "What sport and what nursery rhyme are referenced?" } + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompt: str + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # lower-end GPUs. # Unless specified, these settings have been tested to work on a single L4. 
# MiniCPM-O -def run_minicpmo(question: str, audio_count: int): +def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: model_name = "openbmb/MiniCPM-o-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - llm = LLM(model=model_name, - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=5, - limit_mm_per_prompt={"audio": audio_count}) + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] @@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int): tokenize=False, add_generation_prompt=True, chat_template=audio_chat_template) - return llm, prompt, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + stop_token_ids=stop_token_ids, + ) # Phi-4-multimodal-instruct -def run_phi4mm(questions: str, audio_count: int): +def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: """ Phi-4-multimodal-instruct supports both image and audio inputs. Here, we show how to process audio inputs. @@ -67,9 +84,9 @@ def run_phi4mm(questions: str, audio_count: int): speech_lora_path = os.path.join(model_path, "speech-lora") placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)]) - prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>" + prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>" - llm = LLM( + engine_args = EngineArgs( model=model_path, trust_remote_code=True, max_model_len=4096, @@ -79,24 +96,24 @@ def run_phi4mm(questions: str, audio_count: int): lora_extra_vocab_size=0, limit_mm_per_prompt={"audio": audio_count}, ) - lora_request = LoRARequest("speech", 1, speech_lora_path) - # To maintain code compatibility in this script, we add LoRA here. 
- llm.llm_engine.add_lora(lora_request=lora_request) - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) - stop_token_ids = None - return llm, prompts, stop_token_ids + return ModelRequestData( + engine_args=engine_args, + prompt=prompts, + lora_requests=[LoRARequest("speech", 1, speech_lora_path)], + ) # Qwen2-Audio -def run_qwen2_audio(question: str, audio_count: int): +def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: model_name = "Qwen/Qwen2-Audio-7B-Instruct" - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=5, - limit_mm_per_prompt={"audio": audio_count}) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) audio_in_prompt = "".join([ f"Audio {idx+1}: " @@ -107,12 +124,15 @@ def run_qwen2_audio(question: str, audio_count: int): "<|im_start|>user\n" f"{audio_in_prompt}{question}<|im_end|>\n" "<|im_start|>assistant\n") - stop_token_ids = None - return llm, prompt, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) # Ultravox 0.5-1B -def run_ultravox(question: str, audio_count: int): +def run_ultravox(question: str, audio_count: int) -> ModelRequestData: model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -124,29 +144,39 @@ def run_ultravox(question: str, audio_count: int): tokenize=False, add_generation_prompt=True) - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=5, - trust_remote_code=True, - limit_mm_per_prompt={"audio": audio_count}) - stop_token_ids = None - return llm, prompt, stop_token_ids + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + trust_remote_code=True, + limit_mm_per_prompt={"audio": audio_count}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) # Whisper -def run_whisper(question: str, audio_count: 
int): +def run_whisper(question: str, audio_count: int) -> ModelRequestData: assert audio_count == 1, ( "Whisper only support single audio input per prompt") model_name = "openai/whisper-large-v3-turbo" prompt = "<|startoftranscript|>" - llm = LLM(model=model_name, - max_model_len=448, - max_num_seqs=5, - limit_mm_per_prompt={"audio": audio_count}) - stop_token_ids = None - return llm, prompt, stop_token_ids + engine_args = EngineArgs( + model=model_name, + max_model_len=448, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) model_example_map = { @@ -164,14 +194,24 @@ def main(args): raise ValueError(f"Model type {model} is not supported.") audio_count = args.num_audios - llm, prompt, stop_token_ids = model_example_map[model]( - question_per_audio_count[audio_count], audio_count) + req_data = model_example_map[model](question_per_audio_count[audio_count], + audio_count) + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # To maintain code compatibility in this script, we add LoRA here. + # You can also add LoRA using: + # llm.generate(prompts, lora_request=lora_request,...) + if req_data.lora_requests: + for lora_request in req_data.lora_requests: + llm.llm_engine.add_lora(lora_request=lora_request) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
sampling_params = SamplingParams(temperature=0.2, max_tokens=64, - stop_token_ids=stop_token_ids) + stop_token_ids=req_data.stop_token_ids) mm_data = {} if audio_count > 0: @@ -183,7 +223,7 @@ def main(args): } assert args.num_prompts > 0 - inputs = {"prompt": prompt, "multi_modal_data": mm_data} + inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data} if args.num_prompts > 1: # Batch inference inputs = [inputs] * args.num_prompts @@ -214,6 +254,10 @@ if __name__ == "__main__": default=1, choices=[0, 1, 2], help="Number of audio items per prompt.") + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") args = parser.parse_args() main(args) diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index b00519314d8bd..b73770ce382cf 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -76,5 +76,10 @@ if __name__ == "__main__": GPUs_per_dp_rank)) proc.start() procs.append(proc) + exit_code = 0 for proc in procs: proc.join() + if proc.exitcode: + exit_code = proc.exitcode + + exit(exit_code) diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index f44bc423658ec..6d0c3ac1ee09a 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with the explicit/implicit prompt format on enc-dec LMMs for text generation. 
""" import time +from collections.abc import Sequence +from dataclasses import asdict +from typing import NamedTuple -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, PromptType, SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.utils import FlexibleArgumentParser +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: Sequence[PromptType] + + def run_florence2(): - # Create a Florence-2 encoder/decoder model instance - llm = LLM( + engine_args = EngineArgs( model="microsoft/Florence-2-large", tokenizer="facebook/bart-large", max_num_seqs=8, @@ -39,12 +46,15 @@ def run_florence2(): "decoder_prompt": "", }, ] - return llm, prompts + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) def run_mllama(): - # Create a Mllama encoder/decoder model instance - llm = LLM( + engine_args = EngineArgs( model="meta-llama/Llama-3.2-11B-Vision-Instruct", max_model_len=4096, max_num_seqs=2, @@ -69,12 +79,15 @@ def run_mllama(): "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 }, ] - return llm, prompts + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) def run_whisper(): - # Create a Whisper encoder/decoder model instance - llm = LLM( + engine_args = EngineArgs( model="openai/whisper-large-v3-turbo", max_model_len=448, max_num_seqs=16, @@ -99,7 +112,11 @@ def run_whisper(): "decoder_prompt": "<|startoftranscript|>", } ] - return llm, prompts + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) model_example_map = { @@ -114,7 +131,12 @@ def main(args): if model not in model_example_map: raise ValueError(f"Model type {model} is not supported.") - llm, prompts = model_example_map[model]() + req_data = model_example_map[model]() + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + prompts = req_data.prompts # Create a sampling 
params object. sampling_params = SamplingParams( @@ -153,6 +175,10 @@ if __name__ == "__main__": default="mllama", choices=model_example_map.keys(), help='Huggingface "model_type".') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") args = parser.parse_args() main(args) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 432cda5e24396..58fd5e53bf8dc 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -8,122 +8,164 @@ on HuggingFace model repository. """ import os import random +from dataclasses import asdict +from typing import NamedTuple, Optional from huggingface_hub import snapshot_download from transformers import AutoTokenizer -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.lora.request import LoRARequest from vllm.utils import FlexibleArgumentParser + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: list[str] + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # lower-end GPUs. # Unless specified, these settings have been tested to work on a single L4. 
# Aria -def run_aria(questions: list[str], modality: str): +def run_aria(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "rhymes-ai/Aria" # NOTE: Need L40 (or equivalent) to avoid OOM - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=2, - dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + dtype="bfloat16", + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) prompts = [(f"<|im_start|>user\n<|img|>{question}" "<|im_end|>\n<|im_start|>assistant\n") for question in questions] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # BLIP-2 -def run_blip2(questions: list[str], modality: str): +def run_blip2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" # BLIP-2 prompt format is inaccurate on HuggingFace model repository. 
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa prompts = [f"Question: {question} Answer:" for question in questions] - llm = LLM(model="Salesforce/blip2-opt-2.7b", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="Salesforce/blip2-opt-2.7b", + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Chameleon -def run_chameleon(questions: list[str], modality: str): +def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [f"{question}" for question in questions] - llm = LLM(model="facebook/chameleon-7b", - max_model_len=4096, - max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="facebook/chameleon-7b", + max_model_len=4096, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Deepseek-VL2 -def run_deepseek_vl2(questions: list[str], modality: str): +def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "deepseek-ai/deepseek-vl2-tiny" - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, - hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, + ) prompts = [ f"<|User|>: \n{question}\n\n<|Assistant|>:" for question in questions ] - 
stop_token_ids = None - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Florence2 -def run_florence2(question: str, modality: str): +def run_florence2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - llm = LLM(model="microsoft/Florence-2-large", - tokenizer="facebook/bart-large", - max_num_seqs=8, - trust_remote_code=True, - dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + engine_args = EngineArgs( + model="microsoft/Florence-2-large", + tokenizer="facebook/bart-large", + max_num_seqs=8, + trust_remote_code=True, + dtype="bfloat16", + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) - prompt = "" - stop_token_ids = None - return llm, prompt, stop_token_ids + prompts = ["" for _ in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Fuyu -def run_fuyu(questions: list[str], modality: str): +def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [f"{question}\n" for question in questions] - llm = LLM(model="adept/fuyu-8b", - max_model_len=2048, - max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="adept/fuyu-8b", + max_model_len=2048, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Gemma 3 -def run_gemma3(questions: list[str], modality: str): +def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "google/gemma-3-4b-it" - llm = LLM( + engine_args = EngineArgs( model=model_name, max_model_len=2048, max_num_seqs=2, @@ -135,22 +177,27 @@ def run_gemma3(questions: list[str], modality: str): prompts = [("user\n" 
f"{question}\n" "model\n") for question in questions] - stop_token_ids = None - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # GLM-4v -def run_glm4v(questions: list[str], modality: str): +def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "THUDM/glm-4v-9b" - llm = LLM(model=model_name, - max_model_len=2048, - max_num_seqs=2, - trust_remote_code=True, - enforce_eager=True, - hf_overrides={"architectures": ["GLM4VForCausalLM"]}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + engine_args = EngineArgs( + model=model_name, + max_model_len=2048, + max_num_seqs=2, + trust_remote_code=True, + enforce_eager=True, + hf_overrides={"architectures": ["GLM4VForCausalLM"]}, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) prompts = [ f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ @@ -158,16 +205,21 @@ def run_glm4v(questions: list[str], modality: str): ] stop_token_ids = [151329, 151336, 151338] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # H2OVL-Mississippi -def run_h2ovl(questions: list[str], modality: str): +def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "h2oai/h2ovl-mississippi-800m" - llm = LLM( + engine_args = EngineArgs( model=model_name, trust_remote_code=True, max_model_len=8192, @@ -187,15 +239,20 @@ def run_h2ovl(questions: list[str], modality: str): # Stop tokens for H2OVL-Mississippi # https://huggingface.co/h2oai/h2ovl-mississippi-800m stop_token_ids = [tokenizer.eos_token_id] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # Idefics3-8B-Llama3 -def run_idefics3(questions: list[str], modality: str): +def 
run_idefics3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "HuggingFaceM4/Idefics3-8B-Llama3" - llm = LLM( + engine_args = EngineArgs( model=model_name, max_model_len=8192, max_num_seqs=2, @@ -212,17 +269,20 @@ def run_idefics3(questions: list[str], modality: str): prompts = [( f"<|begin_of_text|>User:{question}\nAssistant:" ) for question in questions] - stop_token_ids = None - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # InternVL -def run_internvl(questions: list[str], modality: str): +def run_internvl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "OpenGVLab/InternVL2-2B" - llm = LLM( + engine_args = EngineArgs( model=model_name, trust_remote_code=True, max_model_len=4096, @@ -245,53 +305,75 @@ def run_internvl(questions: list[str], modality: str): # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # LLaVA-1.5 -def run_llava(questions: list[str], modality: str): +def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [ f"USER: \n{question}\nASSISTANT:" for question in questions ] - llm = LLM(model="llava-hf/llava-1.5-7b-hf", - max_model_len=4096, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="llava-hf/llava-1.5-7b-hf", + max_model_len=4096, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # 
LLaVA-1.6/LLaVA-NeXT -def run_llava_next(questions: list[str], modality: str): +def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [f"[INST] \n{question} [/INST]" for question in questions] - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", - max_model_len=8192, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="llava-hf/llava-v1.6-mistral-7b-hf", + max_model_len=8192, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(questions: list[str], modality: str): +def run_llava_next_video(questions: list[str], + modality: str) -> ModelRequestData: assert modality == "video" prompts = [ f"USER: