diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6753800f19902..186c4354fbc1c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -198,6 +198,7 @@ steps:
commands:
# split the test to avoid interference
- pytest -v -s v1/core
+ - pytest -v -s v1/entrypoints
- pytest -v -s v1/engine
- pytest -v -s v1/sample
- pytest -v -s v1/worker
@@ -225,10 +226,13 @@ steps:
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- - python3 offline_inference/vision_language.py
- - python3 offline_inference/vision_language_multi_image.py
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_embedding.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
@@ -530,7 +534,7 @@ steps:
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+ - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
diff --git a/README.md b/README.md
index bfab7faf598b6..f61b4218e1824 100644
--- a/README.md
+++ b/README.md
@@ -13,18 +13,9 @@ Easy, fast, and cheap LLM serving for everyone
| Documentation | Blog | Paper | Twitter/X | Developer Slack |
----
-
-We’re excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!
-
-Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
-
-📢 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!
-
----
-
*Latest News* 🔥
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
diff --git a/benchmarks/README.md b/benchmarks/README.md
index c64c24fd3ad05..3225a4b0db3a0 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -82,10 +82,10 @@ Then run the benchmarking script
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
-BACKEND="openai-chat"
+BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json"
-python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
+python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
```
If successful, you will see the following output
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 6a7db920b5b63..09c8e23ebb1c3 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -14,7 +14,8 @@ from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
-from vllm.model_executor.model_loader.weight_utils import get_lock
+# NOTE(simon): do not import vLLM here so the benchmark script
+# can run without vLLM installed.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -427,6 +428,8 @@ def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
+ from vllm.model_executor.model_loader.weight_utils import get_lock
+
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
with get_lock(pretrained_model_name_or_path):
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 1dd01ca968678..47627126b6688 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -684,6 +684,15 @@ def main(args: argparse.Namespace):
"Invalid metadata format. Please use KEY=VALUE format."
)
+ if not args.save_detailed:
+ # Remove fields with too many data points
+ for field in [
+ "input_lens", "output_lens", "ttfts", "itls",
+ "generated_texts", "errors"
+ ]:
+ if field in result_json:
+ del result_json[field]
+
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
@@ -828,6 +837,12 @@ if __name__ == "__main__":
action="store_true",
help="Specify to save benchmark results to a json file",
)
+ parser.add_argument(
+ "--save-detailed",
+ action="store_true",
+ help="When saving the results, whether to include per request "
+ "information such as response, error, ttfs, tpots, etc.",
+ )
parser.add_argument(
"--metadata",
metavar="KEY=VALUE",
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index 90f0b54d2f006..c500d00ea528e 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) {
template <typename T>
__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
- union tmpcvt {
+ [[maybe_unused]] union tmpcvt {
uint16_t u;
_Float16 f;
__hip_bfloat16 b;
@@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
template <typename T>
__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
const _B16x4& inp2) {
- union tmpcvt {
+ [[maybe_unused]] union tmpcvt {
uint16_t u;
_Float16 f;
__hip_bfloat16 b;
@@ -1273,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const int seq_idx = blockIdx.y;
const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
- constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+ [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE;
- const int laneid = threadIdx.x % WARP_SIZE;
+ [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
__shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS
diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md
index 7e3b884c2ab1e..d3e375aec10cb 100644
--- a/docs/source/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/source/getting_started/installation/gpu/cuda.inc.md
@@ -131,6 +131,8 @@ Building from source requires a lot of compilation. If you are building from sou
For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
+When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built.
+
[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
:::
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 293b9fddac89e..02dbdcb64232f 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
import os
+from dataclasses import asdict
+from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser
@@ -23,21 +25,31 @@ question_per_audio_count = {
2: "What sport and what nursery rhyme are referenced?"
}
+
+class ModelRequestData(NamedTuple):
+ engine_args: EngineArgs
+ prompt: str
+ stop_token_ids: Optional[list[int]] = None
+ lora_requests: Optional[list[LoRARequest]] = None
+
+
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# MiniCPM-O
-def run_minicpmo(question: str, audio_count: int):
+def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
- llm = LLM(model=model_name,
- trust_remote_code=True,
- max_model_len=4096,
- max_num_seqs=5,
- limit_mm_per_prompt={"audio": audio_count})
+ engine_args = EngineArgs(
+ model=model_name,
+ trust_remote_code=True,
+ max_model_len=4096,
+ max_num_seqs=5,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
@@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int):
tokenize=False,
add_generation_prompt=True,
chat_template=audio_chat_template)
- return llm, prompt, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ stop_token_ids=stop_token_ids,
+ )
# Phi-4-multimodal-instruct
-def run_phi4mm(questions: str, audio_count: int):
+def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process audio inputs.
@@ -67,9 +84,9 @@ def run_phi4mm(questions: str, audio_count: int):
speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
- prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>"
+ prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
max_model_len=4096,
@@ -79,24 +96,24 @@ def run_phi4mm(questions: str, audio_count: int):
lora_extra_vocab_size=0,
limit_mm_per_prompt={"audio": audio_count},
)
- lora_request = LoRARequest("speech", 1, speech_lora_path)
- # To maintain code compatibility in this script, we add LoRA here.
- llm.llm_engine.add_lora(lora_request=lora_request)
- # You can also add LoRA using:
- # llm.generate(prompts, lora_request=lora_request,...)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompts,
+ lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
+ )
# Qwen2-Audio
-def run_qwen2_audio(question: str, audio_count: int):
+def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=5,
- limit_mm_per_prompt={"audio": audio_count})
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=5,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
audio_in_prompt = "".join([
f"Audio {idx+1}: "
@@ -107,12 +124,15 @@ def run_qwen2_audio(question: str, audio_count: int):
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ )
# Ultravox 0.5-1B
-def run_ultravox(question: str, audio_count: int):
+def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -124,29 +144,39 @@ def run_ultravox(question: str, audio_count: int):
tokenize=False,
add_generation_prompt=True)
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=5,
- trust_remote_code=True,
- limit_mm_per_prompt={"audio": audio_count})
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=5,
+ trust_remote_code=True,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ )
# Whisper
-def run_whisper(question: str, audio_count: int):
+def run_whisper(question: str, audio_count: int) -> ModelRequestData:
assert audio_count == 1, (
"Whisper only support single audio input per prompt")
model_name = "openai/whisper-large-v3-turbo"
prompt = "<|startoftranscript|>"
- llm = LLM(model=model_name,
- max_model_len=448,
- max_num_seqs=5,
- limit_mm_per_prompt={"audio": audio_count})
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=448,
+ max_num_seqs=5,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ )
model_example_map = {
@@ -164,14 +194,24 @@ def main(args):
raise ValueError(f"Model type {model} is not supported.")
audio_count = args.num_audios
- llm, prompt, stop_token_ids = model_example_map[model](
- question_per_audio_count[audio_count], audio_count)
+ req_data = model_example_map[model](question_per_audio_count[audio_count],
+ audio_count)
+
+ engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+ llm = LLM(**engine_args)
+
+ # To maintain code compatibility in this script, we add LoRA here.
+ # You can also add LoRA using:
+ # llm.generate(prompts, lora_request=lora_request,...)
+ if req_data.lora_requests:
+ for lora_request in req_data.lora_requests:
+ llm.llm_engine.add_lora(lora_request=lora_request)
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2,
max_tokens=64,
- stop_token_ids=stop_token_ids)
+ stop_token_ids=req_data.stop_token_ids)
mm_data = {}
if audio_count > 0:
@@ -183,7 +223,7 @@ def main(args):
}
assert args.num_prompts > 0
- inputs = {"prompt": prompt, "multi_modal_data": mm_data}
+ inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
if args.num_prompts > 1:
# Batch inference
inputs = [inputs] * args.num_prompts
@@ -214,6 +254,10 @@ if __name__ == "__main__":
default=1,
choices=[0, 1, 2],
help="Number of audio items per prompt.")
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args()
main(args)
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index b00519314d8bd..b73770ce382cf 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -76,5 +76,10 @@ if __name__ == "__main__":
GPUs_per_dp_rank))
proc.start()
procs.append(proc)
+ exit_code = 0
for proc in procs:
proc.join()
+ if proc.exitcode:
+ exit_code = proc.exitcode
+
+ exit(exit_code)
diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index f44bc423658ec..6d0c3ac1ee09a 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import time
+from collections.abc import Sequence
+from dataclasses import asdict
+from typing import NamedTuple
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser
+class ModelRequestData(NamedTuple):
+ engine_args: EngineArgs
+ prompts: Sequence[PromptType]
+
+
def run_florence2():
- # Create a Florence-2 encoder/decoder model instance
- llm = LLM(
+ engine_args = EngineArgs(
model="microsoft/Florence-2-large",
tokenizer="facebook/bart-large",
max_num_seqs=8,
@@ -39,12 +46,15 @@ def run_florence2():
"decoder_prompt": "",
},
]
- return llm, prompts
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
def run_mllama():
- # Create a Mllama encoder/decoder model instance
- llm = LLM(
+ engine_args = EngineArgs(
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
max_model_len=4096,
max_num_seqs=2,
@@ -69,12 +79,15 @@ def run_mllama():
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
},
]
- return llm, prompts
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
def run_whisper():
- # Create a Whisper encoder/decoder model instance
- llm = LLM(
+ engine_args = EngineArgs(
model="openai/whisper-large-v3-turbo",
max_model_len=448,
max_num_seqs=16,
@@ -99,7 +112,11 @@ def run_whisper():
"decoder_prompt": "<|startoftranscript|>",
}
]
- return llm, prompts
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
model_example_map = {
@@ -114,7 +131,12 @@ def main(args):
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
- llm, prompts = model_example_map[model]()
+ req_data = model_example_map[model]()
+
+ engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+ llm = LLM(**engine_args)
+
+ prompts = req_data.prompts
# Create a sampling params object.
sampling_params = SamplingParams(
@@ -153,6 +175,10 @@ if __name__ == "__main__":
default="mllama",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args()
main(args)
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 432cda5e24396..58fd5e53bf8dc 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -8,122 +8,164 @@ on HuggingFace model repository.
"""
import os
import random
+from dataclasses import asdict
+from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser
+
+class ModelRequestData(NamedTuple):
+ engine_args: EngineArgs
+ prompts: list[str]
+ stop_token_ids: Optional[list[int]] = None
+ lora_requests: Optional[list[LoRARequest]] = None
+
+
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# Aria
-def run_aria(questions: list[str], modality: str):
+def run_aria(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "rhymes-ai/Aria"
# NOTE: Need L40 (or equivalent) to avoid OOM
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=2,
- dtype="bfloat16",
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=2,
+ dtype="bfloat16",
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
prompts = [(f"<|im_start|>user\n<|img|>{question}"
"<|im_end|>\n<|im_start|>assistant\n")
for question in questions]
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# BLIP-2
-def run_blip2(questions: list[str], modality: str):
+def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions]
- llm = LLM(model="Salesforce/blip2-opt-2.7b",
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="Salesforce/blip2-opt-2.7b",
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Chameleon
-def run_chameleon(questions: list[str], modality: str):
+def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"{question}" for question in questions]
- llm = LLM(model="facebook/chameleon-7b",
- max_model_len=4096,
- max_num_seqs=2,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="facebook/chameleon-7b",
+ max_model_len=4096,
+ max_num_seqs=2,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Deepseek-VL2
-def run_deepseek_vl2(questions: list[str], modality: str):
+def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "deepseek-ai/deepseek-vl2-tiny"
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=2,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
- hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=2,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+ )
prompts = [
f"<|User|>: \n{question}\n\n<|Assistant|>:"
for question in questions
]
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Florence2
-def run_florence2(question: str, modality: str):
+def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
- llm = LLM(model="microsoft/Florence-2-large",
- tokenizer="facebook/bart-large",
- max_num_seqs=8,
- trust_remote_code=True,
- dtype="bfloat16",
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+ engine_args = EngineArgs(
+ model="microsoft/Florence-2-large",
+ tokenizer="facebook/bart-large",
+ max_num_seqs=8,
+ trust_remote_code=True,
+ dtype="bfloat16",
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
- prompt = ""
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+ prompts = ["" for _ in questions]
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Fuyu
-def run_fuyu(questions: list[str], modality: str):
+def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"{question}\n" for question in questions]
- llm = LLM(model="adept/fuyu-8b",
- max_model_len=2048,
- max_num_seqs=2,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="adept/fuyu-8b",
+ max_model_len=2048,
+ max_num_seqs=2,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Gemma 3
-def run_gemma3(questions: list[str], modality: str):
+def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "google/gemma-3-4b-it"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
max_model_len=2048,
max_num_seqs=2,
@@ -135,22 +177,27 @@ def run_gemma3(questions: list[str], modality: str):
prompts = [("user\n"
f"{question}\n"
"model\n") for question in questions]
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# GLM-4v
-def run_glm4v(questions: list[str], modality: str):
+def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "THUDM/glm-4v-9b"
- llm = LLM(model=model_name,
- max_model_len=2048,
- max_num_seqs=2,
- trust_remote_code=True,
- enforce_eager=True,
- hf_overrides={"architectures": ["GLM4VForCausalLM"]},
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=2048,
+ max_num_seqs=2,
+ trust_remote_code=True,
+ enforce_eager=True,
+ hf_overrides={"architectures": ["GLM4VForCausalLM"]},
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
prompts = [
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
@@ -158,16 +205,21 @@ def run_glm4v(questions: list[str], modality: str):
]
stop_token_ids = [151329, 151336, 151338]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# H2OVL-Mississippi
-def run_h2ovl(questions: list[str], modality: str):
+def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "h2oai/h2ovl-mississippi-800m"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
@@ -187,15 +239,20 @@ def run_h2ovl(questions: list[str], modality: str):
# Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-800m
stop_token_ids = [tokenizer.eos_token_id]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# Idefics3-8B-Llama3
-def run_idefics3(questions: list[str], modality: str):
+def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
@@ -212,17 +269,20 @@ def run_idefics3(questions: list[str], modality: str):
prompts = [(
f"<|begin_of_text|>User:{question}\nAssistant:"
) for question in questions]
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# InternVL
-def run_internvl(questions: list[str], modality: str):
+def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "OpenGVLab/InternVL2-2B"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
@@ -245,53 +305,75 @@ def run_internvl(questions: list[str], modality: str):
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# LLaVA-1.5
-def run_llava(questions: list[str], modality: str):
+def run_llava(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [
f"USER: \n{question}\nASSISTANT:" for question in questions
]
- llm = LLM(model="llava-hf/llava-1.5-7b-hf",
- max_model_len=4096,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="llava-hf/llava-1.5-7b-hf",
+ max_model_len=4096,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(questions: list[str], modality: str):
+def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"[INST] \n{question} [/INST]" for question in questions]
- llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
- max_model_len=8192,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="llava-hf/llava-v1.6-mistral-7b-hf",
+ max_model_len=8192,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# LlaVA-NeXT-Video
# Currently only support for video input
-def run_llava_next_video(questions: list[str], modality: str):
+def run_llava_next_video(questions: list[str],
+ modality: str) -> ModelRequestData:
assert modality == "video"
prompts = [
f"USER: