Remove all references to yapf as it's no longer used (#26251)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor 2025-10-05 17:18:11 +01:00 committed by GitHub
parent d6953beb91
commit 4e256cadc2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
78 changed files with 1992 additions and 1717 deletions

View File

@ -12,9 +12,6 @@ from functools import reduce
from typing import Optional, Union
import jinja2
# yapf conflicts with isort for this block
# yapf: disable
from vllm_cutlass_library_extension import (
DataType,
EpilogueScheduleTag,
@ -31,8 +28,6 @@ from vllm_cutlass_library_extension import (
VLLMKernelScheduleTag,
)
# yapf: enable
#
# Generator templating
#

View File

@ -21,8 +21,6 @@ from vllm.utils import FlexibleArgumentParser
logger = logging.getLogger()
# yapf conflicts with isort for this docstring
# yapf: disable
"""
tensorize_vllm_model.py is a script that can be used to serialize and
deserialize vLLM models. These models can be loaded using tensorizer
@ -132,7 +130,8 @@ def get_parser():
"can be loaded using tensorizer directly to the GPU "
"extremely quickly. Tensor encryption and decryption is "
"also supported, although libsodium must be installed to "
"use it.")
"use it."
)
parser = EngineArgs.add_cli_args(parser)
parser.add_argument(
@ -144,13 +143,14 @@ def get_parser():
"along with the model by instantiating a TensorizerConfig object, "
"creating a dict from it with TensorizerConfig.to_serializable(), "
"and passing it to LoRARequest's initializer with the kwarg "
"tensorizer_config_dict."
"tensorizer_config_dict.",
)
subparsers = parser.add_subparsers(dest='command', required=True)
subparsers = parser.add_subparsers(dest="command", required=True)
serialize_parser = subparsers.add_parser(
'serialize', help="Serialize a model to `--serialized-directory`")
"serialize", help="Serialize a model to `--serialized-directory`"
)
serialize_parser.add_argument(
"--suffix",
@ -163,7 +163,9 @@ def get_parser():
"`--suffix` is `v1`, the serialized model tensors will be "
"saved to "
"`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
"If none is provided, a random UUID will be used."))
"If none is provided, a random UUID will be used."
),
)
serialize_parser.add_argument(
"--serialized-directory",
type=str,
@ -175,108 +177,127 @@ def get_parser():
"and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
"be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
"where `suffix` is given by `--suffix` or a random UUID if not "
"provided.")
"provided.",
)
serialize_parser.add_argument(
"--serialization-kwargs",
type=tensorizer_kwargs_arg,
required=False,
help=("A JSON string containing additional keyword arguments to "
"pass to Tensorizer's TensorSerializer during "
"serialization."))
help=(
"A JSON string containing additional keyword arguments to "
"pass to Tensorizer's TensorSerializer during "
"serialization."
),
)
serialize_parser.add_argument(
"--keyfile",
type=str,
required=False,
help=("Encrypt the model weights with a randomly-generated binary key,"
" and save the key at this path"))
help=(
"Encrypt the model weights with a randomly-generated binary key,"
" and save the key at this path"
),
)
deserialize_parser = subparsers.add_parser(
'deserialize',
help=("Deserialize a model from `--path-to-tensors`"
" to verify it can be loaded and used."))
"deserialize",
help=(
"Deserialize a model from `--path-to-tensors`"
" to verify it can be loaded and used."
),
)
deserialize_parser.add_argument(
"--path-to-tensors",
type=str,
required=False,
help="The local path or S3 URI to the model tensors to deserialize. ")
help="The local path or S3 URI to the model tensors to deserialize. ",
)
deserialize_parser.add_argument(
"--serialized-directory",
type=str,
required=False,
help="Directory with model artifacts for loading. Assumes a "
"model.tensors file exists therein. Can supersede "
"--path-to-tensors.")
"model.tensors file exists therein. Can supersede "
"--path-to-tensors.",
)
deserialize_parser.add_argument(
"--keyfile",
type=str,
required=False,
help=("Path to a binary key to use to decrypt the model weights,"
" if the model was serialized with encryption"))
help=(
"Path to a binary key to use to decrypt the model weights,"
" if the model was serialized with encryption"
),
)
deserialize_parser.add_argument(
"--deserialization-kwargs",
type=tensorizer_kwargs_arg,
required=False,
help=("A JSON string containing additional keyword arguments to "
"pass to Tensorizer's `TensorDeserializer` during "
"deserialization."))
help=(
"A JSON string containing additional keyword arguments to "
"pass to Tensorizer's `TensorDeserializer` during "
"deserialization."
),
)
TensorizerArgs.add_cli_args(deserialize_parser)
return parser
def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
cfg: TensorizerConfig):
def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
for k, v in extra_cfg.items():
if hasattr(cfg, k):
setattr(cfg, k, v)
logger.info(
"Updating TensorizerConfig with %s from "
"--model-loader-extra-config provided", k
"--model-loader-extra-config provided",
k,
)
def deserialize(args, tensorizer_config):
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
llm = LLM(model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config,
enable_lora=True,
llm = LLM(
model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config,
enable_lora=True,
)
sampling_params = SamplingParams(
temperature=0,
max_tokens=256,
stop=["[/assistant]"]
temperature=0, max_tokens=256, stop=["[/assistant]"]
)
# Truncating this as the extra text isn't necessary
prompts = [
"[user] Write a SQL query to answer the question based on ..."
]
prompts = ["[user] Write a SQL query to answer the question based on ..."]
# Test LoRA load
print(
llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest("sql-lora",
1,
args.lora_path,
tensorizer_config_dict = tensorizer_config
.to_serializable())
prompts,
sampling_params,
lora_request=LoRARequest(
"sql-lora",
1,
args.lora_path,
tensorizer_config_dict=tensorizer_config.to_serializable(),
),
)
)
else:
llm = LLM(model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config
llm = LLM(
model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config,
)
return llm
@ -285,17 +306,20 @@ def main():
parser = get_parser()
args = parser.parse_args()
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
or os.environ.get("S3_ENDPOINT_URL", None))
s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
"S3_ACCESS_KEY_ID", None
)
s3_secret_access_key = getattr(
args, "s3_secret_access_key", None
) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
"S3_ENDPOINT_URL", None
)
credentials = {
"s3_access_key_id": s3_access_key_id,
"s3_secret_access_key": s3_secret_access_key,
"s3_endpoint": s3_endpoint
"s3_endpoint": s3_endpoint,
}
model_ref = args.model
@ -309,25 +333,25 @@ def main():
if args.model_loader_extra_config:
extra_config = json.loads(args.model_loader_extra_config)
tensorizer_dir = (args.serialized_directory or
extra_config.get("tensorizer_dir"))
tensorizer_uri = (getattr(args, "path_to_tensors", None)
or extra_config.get("tensorizer_uri"))
tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
"tensorizer_uri"
)
if tensorizer_dir and tensorizer_uri:
parser.error("--serialized-directory and --path-to-tensors "
"cannot both be provided")
parser.error(
"--serialized-directory and --path-to-tensors cannot both be provided"
)
if not tensorizer_dir and not tensorizer_uri:
parser.error("Either --serialized-directory or --path-to-tensors "
"must be provided")
parser.error(
"Either --serialized-directory or --path-to-tensors must be provided"
)
if args.command == "serialize":
engine_args = EngineArgs.from_cli_args(args)
input_dir = tensorizer_dir.rstrip('/')
input_dir = tensorizer_dir.rstrip("/")
suffix = args.suffix if args.suffix else uuid.uuid4().hex
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
if engine_args.tensor_parallel_size > 1:
@ -339,15 +363,14 @@ def main():
tensorizer_uri=model_path,
encryption_keyfile=keyfile,
serialization_kwargs=args.serialization_kwargs or {},
**credentials
**credentials,
)
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
tensorize_lora_adapter(args.lora_path, tensorizer_config)
merge_extra_config_with_tensorizer_config(extra_config,
tensorizer_config)
merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
tensorize_vllm_model(engine_args, tensorizer_config)
elif args.command == "deserialize":
@ -356,11 +379,10 @@ def main():
tensorizer_dir=args.serialized_directory,
encryption_keyfile=keyfile,
deserialization_kwargs=args.deserialization_kwargs or {},
**credentials
**credentials,
)
merge_extra_config_with_tensorizer_config(extra_config,
tensorizer_config)
merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
deserialize(args, tensorizer_config)
else:
raise ValueError("Either serialize or deserialize must be specified.")

View File

@ -8,16 +8,11 @@ import torch
import vllm.envs as envs
from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
# yapf conflicts with isort for this block
# yapf: disable
from vllm.compilation.activation_quant_fusion import (
FUSED_OPS,
SILU_MUL_OP,
ActivationQuantFusionPass,
)
# yapf: enable
from vllm.compilation.fusion import QUANT_OPS
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass

View File

@ -107,10 +107,8 @@ class EPTestSettings:
# NOTE: You can adjust tp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
# yapf: disable
TEST_MODELS = {
"deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(
trust_remote_code=True),
"deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(trust_remote_code=True),
"mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4),
}
@ -192,22 +190,24 @@ def _compare_tp(
]
try:
compare_two_settings(model_name,
ep_args,
tp_args,
ep_env,
tp_env,
method=method,
max_wait_seconds=360)
compare_two_settings(
model_name,
ep_args,
tp_args,
ep_env,
tp_env,
method=method,
max_wait_seconds=360,
)
except Exception:
raise
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", "runner",
"test_options"),
("model_name", "parallel_setup", "distributed_backend", "runner", "test_options"),
[
params for model_name, settings in TEST_MODELS.items()
params
for model_name, settings in TEST_MODELS.items()
for params in settings.iter_params(model_name)
],
)
@ -220,10 +220,12 @@ def test_ep(
test_options: EPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
runner,
test_options,
num_gpus_available,
method="generate")
_compare_tp(
model_name,
parallel_setup,
distributed_backend,
runner,
test_options,
num_gpus_available,
method="generate",
)

View File

@ -100,7 +100,6 @@ class PPTestSettings:
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
# yapf: disable
TEXT_GENERATION_MODELS = {
# [Decoder-only]
# Uses Llama
@ -150,7 +149,9 @@ TEXT_GENERATION_MODELS = {
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
multi_node_only=True, load_format="dummy"
), # noqa: E501
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
@ -196,7 +197,6 @@ MULTIMODAL_MODELS = {
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
}
# yapf: enable
# NOTE: You can update this on your local machine to run specific tests
TEST_MODELS = [

View File

@ -287,29 +287,15 @@ def test_prefix_cache_default():
assert not engine_args.enable_prefix_caching
# yapf: disable
@pytest.mark.parametrize(("arg", "expected", "option"), [
(None, None, "mm-processor-kwargs"),
("{}", {}, "mm-processor-kwargs"),
(
'{"num_crops": 4}',
{
"num_crops": 4
},
"mm-processor-kwargs"
),
(
'{"foo": {"bar": "baz"}}',
{
"foo":
{
"bar": "baz"
}
},
"mm-processor-kwargs"
),
])
# yapf: enable
@pytest.mark.parametrize(
("arg", "expected", "option"),
[
(None, None, "mm-processor-kwargs"),
("{}", {}, "mm-processor-kwargs"),
('{"num_crops": 4}', {"num_crops": 4}, "mm-processor-kwargs"),
('{"foo": {"bar": "baz"}}', {"foo": {"bar": "baz"}}, "mm-processor-kwargs"),
],
)
def test_composite_arg_parser(arg, expected, option):
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
if arg is None:
@ -321,8 +307,7 @@ def test_composite_arg_parser(arg, expected, option):
def test_human_readable_model_len():
# `exit_on_error` disabled to test invalid values below
parser = EngineArgs.add_cli_args(
FlexibleArgumentParser(exit_on_error=False))
parser = EngineArgs.add_cli_args(FlexibleArgumentParser(exit_on_error=False))
args = parser.parse_args([])
assert args.max_model_len is None

View File

@ -15,6 +15,7 @@ from vllm.assets.video import VideoAsset
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (
_try_extract_ast,
apply_mistral_chat_template,
load_chat_template,
parse_chat_messages,
parse_chat_messages_futures,
@ -1855,17 +1856,17 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
# NOTE: Qwen2-Audio default chat template is specially defined inside
# processor class instead of using `tokenizer_config.json`
# yapf: disable
@pytest.mark.parametrize(
("model", "expected_format"),
[(PHI3V_MODEL_ID, "string"),
(QWEN2VL_MODEL_ID, "openai"),
(QWEN25VL_MODEL_ID, "openai"),
(ULTRAVOX_MODEL_ID, "string"),
(QWEN2AUDIO_MODEL_ID, "openai"),
(LLAMA_GUARD_MODEL_ID, "openai")],
[
(PHI3V_MODEL_ID, "string"),
(QWEN2VL_MODEL_ID, "openai"),
(QWEN25VL_MODEL_ID, "openai"),
(ULTRAVOX_MODEL_ID, "string"),
(QWEN2AUDIO_MODEL_ID, "openai"),
(LLAMA_GUARD_MODEL_ID, "openai"),
],
)
# yapf: enable
def test_resolve_content_format_hf_defined(model, expected_format):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@ -1879,7 +1880,8 @@ def test_resolve_content_format_hf_defined(model, expected_format):
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
dtype=model_info.dtype,
)
tokenizer = get_tokenizer(
model,
@ -1911,18 +1913,18 @@ def test_resolve_content_format_hf_defined(model, expected_format):
assert resolved_format == expected_format
# yapf: disable
@pytest.mark.parametrize(
("model", "expected_format"),
[("Salesforce/blip2-opt-2.7b", "string"),
("facebook/chameleon-7b", "string"),
("deepseek-ai/deepseek-vl2-tiny", "string"),
("adept/fuyu-8b", "string"),
("google/paligemma-3b-mix-224", "string"),
("Qwen/Qwen-VL", "string"),
("Qwen/Qwen-VL-Chat", "string")],
[
("Salesforce/blip2-opt-2.7b", "string"),
("facebook/chameleon-7b", "string"),
("deepseek-ai/deepseek-vl2-tiny", "string"),
("adept/fuyu-8b", "string"),
("google/paligemma-3b-mix-224", "string"),
("Qwen/Qwen-VL", "string"),
("Qwen/Qwen-VL-Chat", "string"),
],
)
# yapf: enable
def test_resolve_content_format_fallbacks(model, expected_format):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@ -1936,7 +1938,8 @@ def test_resolve_content_format_fallbacks(model, expected_format):
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
dtype=model_info.dtype,
)
tokenizer = get_tokenizer(
model_config.tokenizer,
@ -1968,30 +1971,30 @@ def test_resolve_content_format_fallbacks(model, expected_format):
assert resolved_format == expected_format
# yapf: disable
@pytest.mark.parametrize(
("template_path", "expected_format"),
[("template_alpaca.jinja", "string"),
("template_baichuan.jinja", "string"),
("template_chatglm.jinja", "string"),
("template_chatglm2.jinja", "string"),
("template_chatml.jinja", "string"),
("template_dse_qwen2_vl.jinja", "openai"),
("template_falcon_180b.jinja", "string"),
("template_falcon.jinja", "string"),
("template_inkbot.jinja", "string"),
("template_teleflm.jinja", "string"),
("template_vlm2vec_phi3v.jinja", "openai"),
("template_vlm2vec_qwen2vl.jinja", "openai"),
("tool_chat_template_granite_20b_fc.jinja", "string"),
("tool_chat_template_hermes.jinja", "string"),
("tool_chat_template_internlm2_tool.jinja", "string"),
("tool_chat_template_llama3.1_json.jinja", "openai"),
("tool_chat_template_llama3.2_json.jinja", "openai"),
("tool_chat_template_mistral_parallel.jinja", "string"),
("tool_chat_template_mistral.jinja", "string")],
[
("template_alpaca.jinja", "string"),
("template_baichuan.jinja", "string"),
("template_chatglm.jinja", "string"),
("template_chatglm2.jinja", "string"),
("template_chatml.jinja", "string"),
("template_dse_qwen2_vl.jinja", "openai"),
("template_falcon_180b.jinja", "string"),
("template_falcon.jinja", "string"),
("template_inkbot.jinja", "string"),
("template_teleflm.jinja", "string"),
("template_vlm2vec_phi3v.jinja", "openai"),
("template_vlm2vec_qwen2vl.jinja", "openai"),
("tool_chat_template_granite_20b_fc.jinja", "string"),
("tool_chat_template_hermes.jinja", "string"),
("tool_chat_template_internlm2_tool.jinja", "string"),
("tool_chat_template_llama3.1_json.jinja", "openai"),
("tool_chat_template_llama3.2_json.jinja", "openai"),
("tool_chat_template_mistral_parallel.jinja", "string"),
("tool_chat_template_mistral.jinja", "string"),
],
)
# yapf: enable
def test_resolve_content_format_examples(template_path, expected_format):
model_config = ModelConfig(
PHI3V_MODEL_ID, # Dummy
@ -2024,40 +2027,34 @@ def test_resolve_content_format_examples(template_path, expected_format):
assert resolved_format == expected_format
def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
mistral_tokenizer):
messages = [{
"role":
"system",
"content": [{
"type": "text",
"text": "You are a helpful assistant."
}, {
"type":
"thinking",
"closed":
True,
"thinking":
"Only return the answer when you are confident."
}]
}, {
"role": "user",
"content": "What is 2+2?"
}, {
"role":
"assistant",
"content": [{
"type": "text",
"text": "Let me think about it."
}, {
"type": "thinking",
"closed": True,
"thinking": "2+2 = 4"
}, {
"type": "text",
"text": "The answer is 4.",
}],
}]
def test_parse_chat_messages_include_thinking_chunk(
mistral_model_config, mistral_tokenizer
):
messages = [
{
"role": "system",
"content": [
{"type": "text", "text": "You are a helpful assistant."},
{
"type": "thinking",
"closed": True,
"thinking": "Only return the answer when you are confident.",
},
],
},
{"role": "user", "content": "What is 2+2?"},
{
"role": "assistant",
"content": [
{"type": "text", "text": "Let me think about it."},
{"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
{
"type": "text",
"text": "The answer is 4.",
},
],
},
]
conversation_with_thinking, _, _ = parse_chat_messages(
messages,
@ -2066,122 +2063,105 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
content_format="openai",
)
expected_conversation = [{
"role":
"system",
"content": [{
"type": "text",
"text": "You are a helpful assistant."
}, {
"type": "text",
"text": "Only return the answer when you are confident."
}],
}, {
"role":
"user",
"content": [{
"type": "text",
"text": "What is 2+2?"
}],
}, {
"role":
"assistant",
"content": [
{
"type": "text",
"text": "Let me think about it."
},
{
"type": "text",
"text": "2+2 = 4"
},
{
"type": "text",
"text": "The answer is 4."
},
]
}]
expected_conversation = [
{
"role": "system",
"content": [
{"type": "text", "text": "You are a helpful assistant."},
{
"type": "text",
"text": "Only return the answer when you are confident.",
},
],
},
{
"role": "user",
"content": [{"type": "text", "text": "What is 2+2?"}],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "Let me think about it."},
{"type": "text", "text": "2+2 = 4"},
{"type": "text", "text": "The answer is 4."},
],
},
]
assert conversation_with_thinking == expected_conversation
def test_apply_mistral_chat_template_thinking_chunk():
# Moved import here to avoid yapf and isort conflicts
from vllm.entrypoints.chat_utils import apply_mistral_chat_template
messages = [{
"role":
"system",
"content": [{
"type": "text",
"text": "You are a helpful assistant."
}, {
"type":
"thinking",
"closed":
True,
"thinking":
"Only return the answer when you are confident."
}]
}, {
"role": "user",
"content": "What is 2+2?"
}, {
"role":
"assistant",
"content": [{
"type": "text",
"text": "Let me think about it."
}, {
"type": "thinking",
"closed": True,
"thinking": "2+2 = 4"
}, {
"type": "text",
"text": "The answer is 4.",
}],
}, {
"role": "user",
"content": "Thanks, what is 3+3?"
}]
messages = [
{
"role": "system",
"content": [
{"type": "text", "text": "You are a helpful assistant."},
{
"type": "thinking",
"closed": True,
"thinking": "Only return the answer when you are confident.",
},
],
},
{"role": "user", "content": "What is 2+2?"},
{
"role": "assistant",
"content": [
{"type": "text", "text": "Let me think about it."},
{"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
{
"type": "text",
"text": "The answer is 4.",
},
],
},
{"role": "user", "content": "Thanks, what is 3+3?"},
]
# TODO(Julien): upon model release change to a tokenizer already configured.
# =================================================================
mistral_tokenizer = MistralTokenizer.from_pretrained(
"mistralai/Devstral-Small-2507")
"mistralai/Devstral-Small-2507"
)
assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer)
# Add think special tokens to the tokenizer
mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo(
rank=35, is_control=True, token_str=SpecialTokens.begin_think.value)
rank=35, is_control=True, token_str=SpecialTokens.begin_think.value
)
mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo(
rank=36, is_control=True, token_str=SpecialTokens.end_think.value)
rank=36, is_control=True, token_str=SpecialTokens.end_think.value
)
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = {
k: v
for k, v in
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
for k, v in mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
if v not in {35, 36}
}
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
SpecialTokens.begin_think.value] = 35
SpecialTokens.begin_think.value
] = 35
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
SpecialTokens.end_think.value] = 36
SpecialTokens.end_think.value
] = 36
mistral_tokenizer.instruct.BEGIN_THINK = 35
mistral_tokenizer.instruct.END_THINK = 36
# =================================================================
tokens_ids = apply_mistral_chat_template(mistral_tokenizer,
messages,
chat_template=None,
tools=None)
tokens_ids = apply_mistral_chat_template(
mistral_tokenizer, messages, chat_template=None, tools=None
)
string_tokens = mistral_tokenizer.mistral.decode(
tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP)
tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP
)
expected_tokens = (
r"<s>[SYSTEM_PROMPT]You are a helpful assistant.[THINK]Only return the"
r" answer when you are confident.[/THINK][/SYSTEM_PROMPT]"
r"[INST]What is 2+2?[/INST]"
r"Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4.</s>"
r"[INST]Thanks, what is 3+3?[/INST]")
r"[INST]Thanks, what is 3+3?[/INST]"
)
assert string_tokens == expected_tokens
@ -2192,37 +2172,32 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
):
audio_uuid = "abcd"
conversation, mm_data, mm_uuids = parse_chat_messages(
[{
"role":
"user",
"content": [
{
"type": "input_audio",
"input_audio": {},
"uuid": audio_uuid,
},
{
"type": "text",
"text": "What does the audio say?"
},
],
}],
[
{
"role": "user",
"content": [
{
"type": "input_audio",
"input_audio": {},
"uuid": audio_uuid,
},
{"type": "text", "text": "What does the audio say?"},
],
}
],
qwen2_audio_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
assert conversation == [{
"role":
"user",
"content":
"Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
}]
assert conversation == [
{
"role": "user",
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
}
]
_assert_mm_data_inputs(mm_data, {"audio": 1})
_assert_mm_uuids(mm_uuids,
1,
modality="audio",
expected_uuids=[audio_uuid])
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
@pytest.mark.asyncio
@ -2232,34 +2207,29 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
):
audio_uuid = "abcd"
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
[{
"role":
"user",
"content": [
{
"type": "input_audio",
"input_audio": {},
"uuid": audio_uuid,
},
{
"type": "text",
"text": "What does the audio say?"
},
],
}],
[
{
"role": "user",
"content": [
{
"type": "input_audio",
"input_audio": {},
"uuid": audio_uuid,
},
{"type": "text", "text": "What does the audio say?"},
],
}
],
qwen2_audio_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
assert conversation == [{
"role":
"user",
"content":
"Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
}]
assert conversation == [
{
"role": "user",
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
}
]
_assert_mm_data_inputs(await mm_future, {"audio": 1})
_assert_mm_uuids(mm_uuids,
1,
modality="audio",
expected_uuids=[audio_uuid])
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])

View File

@ -12,9 +12,6 @@ import torch
import torch.nn.functional as F
from vllm.config.lora import LoRAConfig
# yapf conflicts with isort for this block
# yapf: disable
from vllm.lora.layers import (
BaseLayerWithLoRA,
ColumnParallelLinearWithLoRA,
@ -32,8 +29,6 @@ from vllm.lora.layers import (
RowParallelLinearWithShardedLoRA,
VocabParallelEmbeddingWithLoRA,
)
# yapf: enable
from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.punica_wrapper import get_punica_wrapper
from vllm.model_executor.layers.linear import (

View File

@ -17,8 +17,6 @@ import vllm.model_executor.model_loader.tensorizer
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
# yapf: disable
from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig,
TensorSerializer,
@ -29,8 +27,6 @@ from vllm.model_executor.model_loader.tensorizer import (
from vllm.model_executor.model_loader.tensorizer_loader import (
BLACKLISTED_TENSORIZER_ARGS,
)
# yapf: enable
from vllm.utils import PlaceholderModule
from .conftest import DummyExecutor, assert_from_collective_rpc

View File

@ -45,18 +45,17 @@ from .vlm_utils.types import (
if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
# yapf: disable
COMMON_BROADCAST_SETTINGS = {
"test_type": VLMTestType.IMAGE,
"dtype": "half",
"max_tokens": 5,
"tensor_parallel_size": 2,
"hf_model_kwargs": {"device_map": "auto"},
"image_size_factors": [(.25, 0.5, 1.0)],
"image_size_factors": [(0.25, 0.5, 1.0)],
"distributed_executor_backend": (
"ray",
"mp",
)
),
}
### Test configuration for specific models
@ -96,22 +95,20 @@ VLM_TEST_SETTINGS = {
#### Core tests to always run in the CI
"llava": VLMTestInfo(
models=["llava-hf/llava-1.5-7b-hf"],
test_type=(
VLMTestType.EMBEDDING,
VLMTestType.IMAGE,
VLMTestType.CUSTOM_INPUTS
),
test_type=(VLMTestType.EMBEDDING, VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
max_model_len=4096,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
),
limit_mm_per_prompt={"image": 4},
)],
custom_test_opts=[
CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
),
limit_mm_per_prompt={"image": 4},
)
],
# TODO: Revert to "auto" when CPU backend can use torch > 2.6
dtype="bfloat16" if current_platform.is_cpu() else "auto",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
@ -120,27 +117,27 @@ VLM_TEST_SETTINGS = {
models=["google/paligemma-3b-mix-224"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
img_idx_to_prompt = lambda idx: "",
img_idx_to_prompt=lambda idx: "",
# Paligemma uses its own sample prompts because the default one fails
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "caption es",
"cherry_blossom": "What is in the picture?",
}),
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "caption es",
"cherry_blossom": "What is in the picture?",
}
),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="bfloat16",
marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501
marks=[
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
], # noqa: E501
),
"qwen2_5_vl": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO
),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -150,17 +147,13 @@ VLM_TEST_SETTINGS = {
),
"qwen2_5_omni": VLMTestInfo(
models=["Qwen/Qwen2.5-Omni-3B"],
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO
),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
num_logprobs= 6 if current_platform.is_cpu() else 5,
num_logprobs=6 if current_platform.is_cpu() else 5,
auto_cls=AutoModelForTextToWaveform,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
@ -168,9 +161,9 @@ VLM_TEST_SETTINGS = {
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"ultravox": VLMTestInfo(
models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
test_type=VLMTestType.AUDIO,
prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
audio_idx_to_prompt=lambda idx: "<|audio|>",
max_model_len=4096,
max_num_seqs=2,
@ -184,9 +177,11 @@ VLM_TEST_SETTINGS = {
"llava-onevision-transformers": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
@ -201,7 +196,7 @@ VLM_TEST_SETTINGS = {
"idefics3-transformers": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192,
max_num_seqs=2,
@ -217,8 +212,8 @@ VLM_TEST_SETTINGS = {
"qwen2_5_vl-transformers": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -228,23 +223,24 @@ VLM_TEST_SETTINGS = {
"model_impl": "transformers",
},
# FIXME: Investigate mrope issue
marks=[large_gpu_mark(min_gb=32),
pytest.mark.skip(reason="Mrope issue")],
marks=[large_gpu_mark(min_gb=32), pytest.mark.skip(reason="Mrope issue")],
),
#### Extended model tests
"aria": VLMTestInfo(
models=["rhymes-ai/Aria"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<vlm_image>Please describe the image shortly.",
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
}),
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<vlm_image>Please describe the image shortly.",
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
}
),
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
stop_str=["<|im_end|>"],
image_size_factors=[(0.10, 0.15)],
max_tokens=64,
@ -253,11 +249,13 @@ VLM_TEST_SETTINGS = {
"aya_vision": VLMTestInfo(
models=["CohereForAI/aya-vision-8b"],
test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>What is the season?", # noqa: E501
}),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>What is the season?", # noqa: E501
}
),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
@ -267,11 +265,13 @@ VLM_TEST_SETTINGS = {
"aya_vision-multi_image": VLMTestInfo(
models=["CohereForAI/aya-vision-8b"],
test_type=(VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>What is the season?", # noqa: E501
}),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>What is the season?", # noqa: E501
}
),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
@ -297,27 +297,29 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
# For chameleon, we only compare the sequences
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc=lambda hf_output, model: hf_output[:2],
comparator=check_outputs_equal,
max_tokens=8,
dtype="bfloat16",
),
"deepseek_vl_v2": VLMTestInfo(
models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
}),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
}
),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<end▁of▁sentence>", "<begin▁of▁sentence>"], # noqa: E501
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
),
"fuyu": VLMTestInfo(
models=["adept/fuyu-8b"],
@ -336,11 +338,13 @@ VLM_TEST_SETTINGS = {
"gemma3": VLMTestInfo(
models=["google/gemma-3-4b-it"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
}),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
}
),
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
@ -353,10 +357,12 @@ VLM_TEST_SETTINGS = {
models=["zai-org/glm-4v-9b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
}),
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
}
),
max_model_len=2048,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
@ -372,8 +378,8 @@ VLM_TEST_SETTINGS = {
models=["zai-org/GLM-4.1V-9B-Thinking"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
max_model_len=2048,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
@ -390,23 +396,27 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.video_with_metadata_glm4_1v(),
limit_mm_per_prompt={"video": 1},
)],
custom_test_opts=[
CustomTestOptions(
inputs=custom_inputs.video_with_metadata_glm4_1v(),
limit_mm_per_prompt={"video": 1},
)
],
marks=[large_gpu_mark(min_gb=32)],
),
"h2ovl": VLMTestInfo(
models = [
models=[
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nWhat is the season?",
}),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nWhat is the season?",
}
),
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
max_model_len=8192,
use_tokenizer_eos=True,
@ -416,7 +426,7 @@ VLM_TEST_SETTINGS = {
"idefics3": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192,
max_num_seqs=2,
@ -431,11 +441,13 @@ VLM_TEST_SETTINGS = {
# "OpenGVLab/Mono-InternVL-2B",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nWhat is the season?",
}),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nWhat is the season?",
}
),
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
max_model_len=4096,
use_tokenizer_eos=True,
@ -446,7 +458,7 @@ VLM_TEST_SETTINGS = {
"OpenGVLab/InternVL3-1B",
],
test_type=VLMTestType.VIDEO,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
video_idx_to_prompt=lambda idx: "<video>",
max_model_len=8192,
use_tokenizer_eos=True,
@ -459,7 +471,7 @@ VLM_TEST_SETTINGS = {
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO,
),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
video_idx_to_prompt=lambda idx: "<video>",
max_model_len=8192,
@ -469,7 +481,7 @@ VLM_TEST_SETTINGS = {
"kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501
max_model_len=8192,
max_num_seqs=2,
@ -480,11 +492,11 @@ VLM_TEST_SETTINGS = {
),
"llama4": VLMTestInfo(
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
img_idx_to_prompt=lambda _: "<|image|>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
distributed_executor_backend="mp",
image_size_factors=[(.25, 0.5, 1.0)],
image_size_factors=[(0.25, 0.5, 1.0)],
hf_model_kwargs={"device_map": "auto"},
max_model_len=8192,
max_num_seqs=4,
@ -500,28 +512,34 @@ VLM_TEST_SETTINGS = {
max_model_len=10240,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
),
limit_mm_per_prompt={"image": 4},
)],
custom_test_opts=[
CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
),
limit_mm_per_prompt={"image": 4},
)
],
),
"llava_onevision": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
num_video_frames=16,
max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
),
limit_mm_per_prompt={"video": 4},
)],
custom_test_opts=[
CustomTestOptions(
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
),
limit_mm_per_prompt={"video": 4},
)
],
),
"llava_next_video": VLMTestInfo(
models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
@ -563,7 +581,9 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
["<|im_end|>", "<|endoftext|>"]
), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
@ -576,13 +596,15 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
["<|im_end|>", "<|endoftext|>"]
), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
),
"minimax_vl_01": VLMTestInfo(
models=["MiniMaxAI/MiniMax-VL-01"],
prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
img_idx_to_prompt=lambda _: "<image>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
max_model_len=8192,
@ -604,8 +626,8 @@ VLM_TEST_SETTINGS = {
"ovis1_6-gemma2": VLMTestInfo(
models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
dtype="half",
@ -617,8 +639,8 @@ VLM_TEST_SETTINGS = {
"ovis2": VLMTestInfo(
models=["AIDC-AI/Ovis2-1B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
dtype="half",
@ -628,13 +650,9 @@ VLM_TEST_SETTINGS = {
),
"ovis2_5": VLMTestInfo(
models=["AIDC-AI/Ovis2.5-2B"],
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO
),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
video_idx_to_prompt=lambda idx: "<video>\n",
max_model_len=4096,
max_num_seqs=2,
@ -646,7 +664,7 @@ VLM_TEST_SETTINGS = {
"phi3v": VLMTestInfo(
models=["microsoft/Phi-3.5-vision-instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
max_model_len=4096,
max_num_seqs=2,
@ -681,15 +699,11 @@ VLM_TEST_SETTINGS = {
),
"qwen2_vl": VLMTestInfo(
models=["Qwen/Qwen2-VL-2B-Instruct"],
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO
),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -700,11 +714,13 @@ VLM_TEST_SETTINGS = {
"skywork_r1v": VLMTestInfo(
models=["Skywork/Skywork-R1V-38B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<begin▁of▁sentence><User>\n{img_prompt}<Assistant><think>\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nWhat is the season?",
}),
prompt_formatter=lambda img_prompt: f"<begin▁of▁sentence><User>\n{img_prompt}<Assistant><think>\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nWhat is the season?",
}
),
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501
max_model_len=4096,
use_tokenizer_eos=True,
@ -737,9 +753,9 @@ VLM_TEST_SETTINGS = {
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO,
),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -752,11 +768,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc=lambda hf_output, model: hf_output[:2],
comparator=check_outputs_equal,
marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore
**COMMON_BROADCAST_SETTINGS, # type: ignore
),
"llava-broadcast": VLMTestInfo(
models=["llava-hf/llava-1.5-7b-hf"],
@ -765,7 +781,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore
**COMMON_BROADCAST_SETTINGS, # type: ignore
),
"llava_next-broadcast": VLMTestInfo(
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
@ -774,12 +790,12 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore
**COMMON_BROADCAST_SETTINGS, # type: ignore
),
### Custom input edge-cases for specific models
"intern_vl-diff-patches": VLMTestInfo(
models=["OpenGVLab/InternVL2-2B"],
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096,
use_tokenizer_eos=True,
@ -788,7 +804,8 @@ VLM_TEST_SETTINGS = {
CustomTestOptions(
inputs=inp,
limit_mm_per_prompt={"image": 2},
) for inp in custom_inputs.different_patch_input_cases_internvl()
)
for inp in custom_inputs.different_patch_input_cases_internvl()
],
),
"llava_onevision-multiple-images": VLMTestInfo(
@ -797,14 +814,18 @@ VLM_TEST_SETTINGS = {
max_model_len=16384,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
),
limit_mm_per_prompt={"image": 4},
)],
custom_test_opts=[
CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
),
limit_mm_per_prompt={"image": 4},
)
],
),
# regression test for https://github.com/vllm-project/vllm/issues/15122
"qwen2_5_vl-windows-attention": VLMTestInfo(
@ -814,13 +835,14 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
limit_mm_per_prompt={"image": 1},
)],
custom_test_opts=[
CustomTestOptions(
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
limit_mm_per_prompt={"image": 1},
)
],
),
}
# yapf: enable
def _mark_splits(

View File

@ -114,7 +114,6 @@ def get_parametrized_options(
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
# yapf: disable
# Wrap all model cases in a pytest parameter & pass marks through
return [
pytest.param(
@ -122,10 +121,10 @@ def get_parametrized_options(
ExpandableVLMTestArgs(
**{k: v for k, v in zip(iter_kwargs.keys(), case)}
),
marks=test_info.marks if test_info.marks is not None else []
) for case in list(itertools.product(*iter_kwargs.values()))
marks=test_info.marks if test_info.marks is not None else [],
)
for case in list(itertools.product(*iter_kwargs.values()))
]
# yapf: enable
# Get a list per model type, where each entry contains a tuple of all of
# that model type's cases, then flatten them into the top level so that

View File

@ -418,7 +418,6 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
# yapf: disable
from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT,
IMG_END,
@ -426,7 +425,6 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
image_to_pixel_values_h2ovl,
)
# yapf: enable
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_h2ovl(

View File

@ -33,24 +33,26 @@ TEST_IMG_PLACEHOLDER = "<vlm_image>"
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
# yapf: disable
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
})
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts({
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
})
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
}
)
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
{
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
}
)
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
# yapf: enable
class PromptWithMultiModalInput(NamedTuple):

View File

@ -322,80 +322,81 @@ def _test_processing_correctness_one(
)
# yapf: disable
@pytest.mark.parametrize("model_id", [
"rhymes-ai/Aria",
"CohereForAI/aya-vision-8b",
"Salesforce/blip2-opt-2.7b",
"facebook/chameleon-7b",
"CohereLabs/command-a-vision-07-2025",
"deepseek-ai/deepseek-vl2-tiny",
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
"adept/fuyu-8b",
"google/gemma-3-4b-it",
"google/gemma-3n-E2B-it",
"zai-org/glm-4v-9b",
"zai-org/GLM-4.1V-9B-Thinking",
"zai-org/GLM-4.5V",
"ibm-granite/granite-speech-3.3-2b",
"h2oai/h2ovl-mississippi-800m",
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
"HuggingFaceM4/Idefics3-8B-Llama3",
"internlm/Intern-S1",
"OpenGVLab/InternVL2-1B",
"OpenGVLab/InternVL3-1B",
"OpenGVLab/InternVL3_5-1B",
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
"OpenGVLab/InternVL3_5-30B-A3B",
"Kwai-Keye/Keye-VL-8B-Preview",
"Kwai-Keye/Keye-VL-1_5-8B",
"moonshotai/Kimi-VL-A3B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"mispeech/midashenglm-7b",
"openbmb/MiniCPM-Llama3-V-2_5",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
"MiniMaxAI/MiniMax-VL-01",
"allenai/Molmo-7B-D-0924",
"allenai/Molmo-7B-O-0924",
"nvidia/NVLM-D-72B",
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
"AIDC-AI/Ovis1.6-Gemma2-9B",
"AIDC-AI/Ovis1.6-Llama3.2-3B",
"AIDC-AI/Ovis2-1B",
"AIDC-AI/Ovis2.5-2B",
"google/paligemma-3b-mix-224",
"google/paligemma2-3b-ft-docci-448",
"microsoft/Phi-3.5-vision-instruct",
"microsoft/Phi-4-multimodal-instruct",
"mistralai/Pixtral-12B-2409",
"mistral-community/pixtral-12b",
"Qwen/Qwen-VL-Chat",
"Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2.5-VL-3B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct",
"Qwen/Qwen2.5-Omni-3B",
"Qwen/Qwen3-VL-4B-Instruct",
"Qwen/Qwen3-VL-30B-A3B-Instruct",
"YannQi/R-4B",
"Skywork/Skywork-R1V-38B",
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
"stepfun-ai/step3",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
"openai/whisper-large-v3",
"omni-research/Tarsier-7b",
"omni-research/Tarsier2-Recap-7b",
"mistralai/Voxtral-Mini-3B-2507",
])
@pytest.mark.parametrize(
"model_id",
[
"rhymes-ai/Aria",
"CohereForAI/aya-vision-8b",
"Salesforce/blip2-opt-2.7b",
"facebook/chameleon-7b",
"CohereLabs/command-a-vision-07-2025",
"deepseek-ai/deepseek-vl2-tiny",
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
"adept/fuyu-8b",
"google/gemma-3-4b-it",
"google/gemma-3n-E2B-it",
"zai-org/glm-4v-9b",
"zai-org/GLM-4.1V-9B-Thinking",
"zai-org/GLM-4.5V",
"ibm-granite/granite-speech-3.3-2b",
"h2oai/h2ovl-mississippi-800m",
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
"HuggingFaceM4/Idefics3-8B-Llama3",
"internlm/Intern-S1",
"OpenGVLab/InternVL2-1B",
"OpenGVLab/InternVL3-1B",
"OpenGVLab/InternVL3_5-1B",
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
"OpenGVLab/InternVL3_5-30B-A3B",
"Kwai-Keye/Keye-VL-8B-Preview",
"Kwai-Keye/Keye-VL-1_5-8B",
"moonshotai/Kimi-VL-A3B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"mispeech/midashenglm-7b",
"openbmb/MiniCPM-Llama3-V-2_5",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
"MiniMaxAI/MiniMax-VL-01",
"allenai/Molmo-7B-D-0924",
"allenai/Molmo-7B-O-0924",
"nvidia/NVLM-D-72B",
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
"AIDC-AI/Ovis1.6-Gemma2-9B",
"AIDC-AI/Ovis1.6-Llama3.2-3B",
"AIDC-AI/Ovis2-1B",
"AIDC-AI/Ovis2.5-2B",
"google/paligemma-3b-mix-224",
"google/paligemma2-3b-ft-docci-448",
"microsoft/Phi-3.5-vision-instruct",
"microsoft/Phi-4-multimodal-instruct",
"mistralai/Pixtral-12B-2409",
"mistral-community/pixtral-12b",
"Qwen/Qwen-VL-Chat",
"Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2.5-VL-3B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct",
"Qwen/Qwen2.5-Omni-3B",
"Qwen/Qwen3-VL-4B-Instruct",
"Qwen/Qwen3-VL-30B-A3B-Instruct",
"YannQi/R-4B",
"Skywork/Skywork-R1V-38B",
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
"stepfun-ai/step3",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
"openai/whisper-large-v3",
"omni-research/Tarsier-7b",
"omni-research/Tarsier2-Recap-7b",
"mistralai/Voxtral-Mini-3B-2507",
],
)
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness(
model_id: str,
hit_rate: float,

View File

@ -12,7 +12,6 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
@ -20,7 +19,6 @@ from ...utils import build_model_context
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(

View File

@ -11,7 +11,6 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
@ -21,7 +20,6 @@ from ...utils import build_model_context
({}, 757),
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(

View File

@ -11,7 +11,6 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
@ -21,7 +20,6 @@ from ...utils import build_model_context
({}, 9585),
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(

View File

@ -10,7 +10,6 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
[
@ -18,7 +17,6 @@ from ...utils import build_model_context
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(

View File

@ -12,7 +12,6 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
@ -20,7 +19,6 @@ from ...utils import build_model_context
({"max_image_size": {"longest_edge": 768}}, 405),
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(

View File

@ -7,9 +7,7 @@ from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
# yapf: disable
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_multimodal_processor(model_id):
model_config = ModelConfig(
model=model_id,
@ -18,9 +16,9 @@ def test_multimodal_processor(model_id):
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
image_pil = ImageAsset('cherry_blossom').pil_image
image_pil = ImageAsset("cherry_blossom").pil_image
mm_data = {"image": image_pil}
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
str_processed_inputs = mm_processor.apply(
prompt=str_prompt,
mm_data=mm_data,
@ -28,8 +26,23 @@ def test_multimodal_processor(model_id):
)
ids_prompt = [
151644, 872, 220, 151646, 198, 3838, 374, 279, 2213, 315, 419, 2168,
30, 151645, 151644, 77091, 198
151644,
872,
220,
151646,
198,
3838,
374,
279,
2213,
315,
419,
2168,
30,
151645,
151644,
77091,
198,
]
ids_processed_inputs = mm_processor.apply(
prompt=ids_prompt,
@ -37,5 +50,7 @@ def test_multimodal_processor(model_id):
hf_processor_mm_kwargs={},
)
assert (str_processed_inputs["prompt_token_ids"]
== ids_processed_inputs["prompt_token_ids"])
assert (
str_processed_inputs["prompt_token_ids"]
== ids_processed_inputs["prompt_token_ids"]
)

File diff suppressed because it is too large Load Diff

View File

@ -71,25 +71,27 @@ def _dummy_items(
)
# yapf: disable
@pytest.mark.parametrize(
("item", "expected_size"),
[
(_dummy_item("a", {"a1": 100}), 100),
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460), # noqa: E501
(
_dummy_items(
{"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}
).get_data(),
460,
), # noqa: E501
],
)
# yapf: enable
def test_cache_item_size(item, expected_size):
cache = MultiModalCache.get_lru_cache(2048, type(item))
cache[""] = item
assert cache.currsize == expected_size
prompt_update = PromptInsertion("dummy", "target", "insertion") \
.resolve(0)
prompt_update = PromptInsertion("dummy", "target", "insertion").resolve(0)
cache[""] = MultiModalProcessorCacheItem(item, [prompt_update])
assert cache.currsize == expected_size
@ -106,9 +108,9 @@ def _create_vllm_config(
return VllmConfig(
model_config=ModelConfig(
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
mm_processor_cache_gb=mm_processor_cache_gb),
parallel_config=ParallelConfig(
data_parallel_size=1 if enable_ipc else 2),
mm_processor_cache_gb=mm_processor_cache_gb,
),
parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
)
@ -124,11 +126,9 @@ def _compare_caches(
seed: int = 0,
):
cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY)
cache_0_p1 = engine_receiver_cache_from_config(config_0,
MULTIMODAL_REGISTRY)
cache_0_p1 = engine_receiver_cache_from_config(config_0, MULTIMODAL_REGISTRY)
cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY)
cache_1_p1 = engine_receiver_cache_from_config(config_1,
MULTIMODAL_REGISTRY)
cache_1_p1 = engine_receiver_cache_from_config(config_1, MULTIMODAL_REGISTRY)
cache_size_gb = max(
config_0.model_config.multimodal_config.mm_processor_cache_gb,
@ -142,8 +142,7 @@ def _compare_caches(
for _ in range(int(item_capacity / hit_rate))
]
all_hashes = [
MultiModalHasher.hash_kwargs(item=item.get_data())
for item in all_items
MultiModalHasher.hash_kwargs(item=item.get_data()) for item in all_items
]
# Should not be used since there is nothing to convert to text
@ -162,7 +161,8 @@ def _compare_caches(
for _ in range(is_cached_calls_per_iter):
cache_0_p0.is_cached(selected_hashes)
cache_0_p0_out = [
item for item, _ in cache_0_p0.get_and_update(
item
for item, _ in cache_0_p0.get_and_update(
[(item, prompt_update.content) for item in selected_items],
selected_hashes,
)
@ -174,7 +174,8 @@ def _compare_caches(
for _ in range(is_cached_calls_per_iter):
cache_1_p0.is_cached(selected_hashes)
cache_1_p0_out = [
item for item, _ in cache_1_p0.get_and_update(
item
for item, _ in cache_1_p0.get_and_update(
[(item, prompt_update.content) for item in selected_items],
selected_hashes,
)
@ -183,14 +184,12 @@ def _compare_caches(
if cache_0_p1 is None:
cache_0_p1_out = cache_0_p0_out
else:
cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out,
selected_hashes)
cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out, selected_hashes)
if cache_1_p1 is None:
cache_1_p1_out = cache_1_p0_out
else:
cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out,
selected_hashes)
cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out, selected_hashes)
assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}"

View File

@ -9,9 +9,6 @@ import pytest
from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import (
InputProcessingContext,
PlaceholderFeaturesInfo,
@ -24,8 +21,6 @@ from vllm.multimodal.processing import (
iter_token_matches,
replace_token_matches,
)
# yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler
from vllm.transformers_utils.tokenizer import AnyTokenizer
@ -34,7 +29,6 @@ from .utils import random_image
pytestmark = pytest.mark.cpu_test
# yapf: disable
@pytest.mark.parametrize(
("token_ids", "match_ids", "expected"),
[
@ -44,34 +38,34 @@ pytestmark = pytest.mark.cpu_test
[32000, 32000, 32000],
[32000],
[
{ "start_idx": 0, "end_idx": 1 },
{ "start_idx": 1, "end_idx": 2 },
{ "start_idx": 2, "end_idx": 3 },
{"start_idx": 0, "end_idx": 1},
{"start_idx": 1, "end_idx": 2},
{"start_idx": 2, "end_idx": 3},
],
),
(
[32000, 32000, 32000],
[32000, 32000],
[{ "start_idx": 0, "end_idx": 2 }],
[{"start_idx": 0, "end_idx": 2}],
),
(
[32000, 32000, 32000],
[32000, 32000, 32000],
[{ "start_idx": 0, "end_idx": 3 }],
[{"start_idx": 0, "end_idx": 3}],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 32000],
[
{ "start_idx": 1, "end_idx": 3 },
{ "start_idx": 6, "end_idx": 8 },
{"start_idx": 1, "end_idx": 3},
{"start_idx": 6, "end_idx": 8},
],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 32000, 32000, 32000],
[
{ "start_idx": 1, "end_idx": 5 },
{"start_idx": 1, "end_idx": 5},
],
),
(
@ -82,14 +76,13 @@ pytestmark = pytest.mark.cpu_test
],
)
@pytest.mark.parametrize("start_idx", [0, 4, 8])
# yapf: enable
def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
result = list(iter_token_matches(token_ids, match_ids,
start_idx=start_idx))
result = list(iter_token_matches(token_ids, match_ids, start_idx=start_idx))
# Manually constructed results
assert [item._asdict() for item in result
] == [item for item in expected if item["start_idx"] >= start_idx]
assert [item._asdict() for item in result] == [
item for item in expected if item["start_idx"] >= start_idx
]
# Invariants
match_lens = [end - start for start, end in result]
@ -97,7 +90,6 @@ def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
assert all(match_len == len(match_ids) for match_len in match_lens)
# yapf: disable
@pytest.mark.parametrize(
("token_ids", "match_ids", "new_ids", "expected"),
[
@ -141,7 +133,6 @@ def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
),
],
)
# yapf: enable
def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
result = replace_token_matches(token_ids, match_ids, new_ids)
@ -149,7 +140,6 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
assert result == expected
# yapf: disable
@pytest.mark.parametrize(
("prompt", "target_by_key", "expected_by_key"),
[
@ -166,11 +156,11 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
"pattern_1": [],
"pattern_2": [],
"pattern_3": [
{ "start_idx": 0, "end_idx": 0 },
{"start_idx": 0, "end_idx": 0},
],
"pattern_4": [],
"pattern_5": [
{ "start_idx": 0, "end_idx": 0 },
{"start_idx": 0, "end_idx": 0},
],
},
),
@ -186,26 +176,26 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
},
{
"pattern_1": [
{ "start_idx": 0, "end_idx": 1 },
{ "start_idx": 1, "end_idx": 2 },
{ "start_idx": 2, "end_idx": 3 },
{ "start_idx": 3, "end_idx": 4 },
{"start_idx": 0, "end_idx": 1},
{"start_idx": 1, "end_idx": 2},
{"start_idx": 2, "end_idx": 3},
{"start_idx": 3, "end_idx": 4},
],
"pattern_2": [
{ "start_idx": 0, "end_idx": 2 },
{ "start_idx": 2, "end_idx": 4 },
{"start_idx": 0, "end_idx": 2},
{"start_idx": 2, "end_idx": 4},
],
"pattern_3": [
{ "start_idx": 0, "end_idx": 3 },
{"start_idx": 0, "end_idx": 3},
],
"pattern_4": [
{ "start_idx": 0, "end_idx": 0 },
{"start_idx": 0, "end_idx": 0},
],
"pattern_5": [
{ "start_idx": 1, "end_idx": 1 },
{"start_idx": 1, "end_idx": 1},
],
"pattern_6": [
{ "start_idx": 4, "end_idx": 4 },
{"start_idx": 4, "end_idx": 4},
],
},
),
@ -221,26 +211,25 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
},
{
"pattern_1": [
{ "start_idx": 1, "end_idx": 3 },
{ "start_idx": 6, "end_idx": 8 },
{"start_idx": 1, "end_idx": 3},
{"start_idx": 6, "end_idx": 8},
],
"pattern_2": [
{ "start_idx": 1, "end_idx": 5 },
{"start_idx": 1, "end_idx": 5},
],
"pattern_3": [],
"pattern_4": [
{ "start_idx": 0, "end_idx": 0 },
{"start_idx": 0, "end_idx": 0},
],
"pattern_5": [],
"pattern_6": [
{ "start_idx": 10, "end_idx": 10 },
{"start_idx": 10, "end_idx": 10},
],
},
),
],
)
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
# yapf: enable
def test_find_token_matches(
prompt,
target_by_key,
@ -272,7 +261,6 @@ def test_find_token_matches(
} == expected_by_key
# yapf: disable
@pytest.mark.parametrize(
("prompt", "target_by_key", "expected_by_key"),
[
@ -288,16 +276,16 @@ def test_find_token_matches(
"pattern_5": PromptIndexTargets.end(),
},
{
"pattern_1": [{ "start_idx": 0, "end_idx": 0 }],
"pattern_1": [{"start_idx": 0, "end_idx": 0}],
"pattern_2": [],
"pattern_3": [
{ "start_idx": 0, "end_idx": 0 },
{"start_idx": 0, "end_idx": 0},
],
"pattern_4": [],
"pattern_5": [
{ "start_idx": 0, "end_idx": 0 },
{"start_idx": 0, "end_idx": 0},
],
}
},
),
(
"<image><image><image><image>",
@ -311,26 +299,26 @@ def test_find_token_matches(
},
{
"pattern_1": [
{ "start_idx": 0, "end_idx": 7 },
{ "start_idx": 7, "end_idx": 14 },
{ "start_idx": 14, "end_idx": 21 },
{ "start_idx": 21, "end_idx": 28 },
{"start_idx": 0, "end_idx": 7},
{"start_idx": 7, "end_idx": 14},
{"start_idx": 14, "end_idx": 21},
{"start_idx": 21, "end_idx": 28},
],
"pattern_2": [
{ "start_idx": 0, "end_idx": 14 },
{ "start_idx": 14, "end_idx": 28 },
{"start_idx": 0, "end_idx": 14},
{"start_idx": 14, "end_idx": 28},
],
"pattern_3": [
{ "start_idx": 0, "end_idx": 21 },
{"start_idx": 0, "end_idx": 21},
],
"pattern_4": [
{ "start_idx": 0, "end_idx": 0 },
{"start_idx": 0, "end_idx": 0},
],
"pattern_5": [
{ "start_idx": 7, "end_idx": 7 },
{"start_idx": 7, "end_idx": 7},
],
"pattern_6": [
{ "start_idx": 28, "end_idx": 28 },
{"start_idx": 28, "end_idx": 28},
],
},
),
@ -346,21 +334,21 @@ def test_find_token_matches(
},
{
"pattern_1": [
{ "start_idx": 0, "end_idx": 13 },
{ "start_idx": 27, "end_idx": 40 },
{"start_idx": 0, "end_idx": 13},
{"start_idx": 27, "end_idx": 40},
],
"pattern_2": [
{ "start_idx": 0, "end_idx": 27 },
{"start_idx": 0, "end_idx": 27},
],
"pattern_3": [],
"pattern_4": [
{ "start_idx": 0, "end_idx": 0 },
{"start_idx": 0, "end_idx": 0},
],
"pattern_5": [
{ "start_idx": 13, "end_idx": 13 },
{"start_idx": 13, "end_idx": 13},
],
"pattern_6": [
{ "start_idx": 48, "end_idx": 48 },
{"start_idx": 48, "end_idx": 48},
],
},
),
@ -374,22 +362,21 @@ def test_find_token_matches(
},
{
"pattern_1": [
{ "start_idx": 0, "end_idx": 9 },
{ "start_idx": 16, "end_idx": 25 },
{"start_idx": 0, "end_idx": 9},
{"start_idx": 16, "end_idx": 25},
],
"pattern_2": [
{ "start_idx": 0, "end_idx": 16 },
{ "start_idx": 16, "end_idx": 32 },
{"start_idx": 0, "end_idx": 16},
{"start_idx": 16, "end_idx": 32},
],
"pattern_3": [
{ "start_idx": 0, "end_idx": 25 },
{"start_idx": 0, "end_idx": 25},
],
},
),
],
)
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
# yapf: enable
def test_find_text_matches(
prompt,
target_by_key,
@ -421,7 +408,6 @@ def test_find_text_matches(
} == expected_by_key
# yapf: disable
@pytest.mark.parametrize(
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
[
@ -549,9 +535,8 @@ def test_find_text_matches(
},
},
),
]
],
)
# yapf: enable
def test_find_update_text(
prompt,
target_by_key,
@ -562,13 +547,15 @@ def test_find_update_text(
mock_tokenizer = cast(AnyTokenizer, object())
for (
update_type,
expected_by_mm_count,
update_type,
expected_by_mm_count,
) in expected_by_update_type_mm_count.items():
for mm_count, expected in expected_by_mm_count.items():
mm_prompt_updates = {
key: [[update_type(key, target, repl_by_key[key]).resolve(i)]
for i in range(mm_count)]
key: [
[update_type(key, target, repl_by_key[key]).resolve(i)]
for i in range(mm_count)
]
for key, target in target_by_key.items()
}
@ -589,7 +576,6 @@ def test_find_update_text(
assert new_prompt == expected
# yapf: disable
@pytest.mark.parametrize(
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
[
@ -615,8 +601,43 @@ def test_find_update_text(
{
PromptInsertion: {
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
1: [1, 9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550], # noqa: E501
2: [1, 9833, 28747, 32000, 32000, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550, 1550, 918, 1550], # noqa: E501
1: [
1,
9833,
28747,
32000,
32000,
32000,
9833,
28747,
32000,
32000,
918,
1550,
918,
1550,
], # noqa: E501
2: [
1,
9833,
28747,
32000,
32000,
32000,
32000,
32000,
9833,
28747,
32000,
32000,
918,
1550,
918,
1550,
1550,
918,
1550,
], # noqa: E501
},
PromptReplacement: {
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
@ -719,9 +740,8 @@ def test_find_update_text(
},
},
),
]
],
)
# yapf: enable
def test_find_update_tokens(
prompt,
target_by_key,
@ -732,13 +752,15 @@ def test_find_update_tokens(
mock_tokenizer = cast(AnyTokenizer, object())
for (
update_type,
expected_by_mm_count,
update_type,
expected_by_mm_count,
) in expected_by_update_type_mm_count.items():
for mm_count, expected in expected_by_mm_count.items():
mm_prompt_updates = {
key: [[update_type(key, target, repl_by_key[key]).resolve(i)]
for i in range(mm_count)]
key: [
[update_type(key, target, repl_by_key[key]).resolve(i)]
for i in range(mm_count)
]
for key, target in target_by_key.items()
}
@ -759,7 +781,6 @@ def test_find_update_tokens(
assert new_prompt == expected
# yapf: disable
@pytest.mark.parametrize(
"repl_by_key",
[
@ -796,8 +817,7 @@ def test_find_update_tokens(
is_embed=None,
),
],
}
},
),
(
[1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
@ -828,7 +848,7 @@ def test_find_update_tokens(
),
],
# No match for pattern_4 as it has lower priority than pattern_1
}
},
),
(
[1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
@ -867,12 +887,11 @@ def test_find_update_tokens(
is_embed=None,
),
],
}
},
),
]
],
)
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
# yapf: enable
def test_find_mm_placeholders(
repl_by_key,
prompt,
@ -899,8 +918,15 @@ def test_find_mm_placeholders(
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize(
("limit", "num_supported", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
(2, 1, False), (2, 2, True)],
[
(0, 0, True),
(0, 1, True),
(1, 0, False),
(1, 1, True),
(1, 2, True),
(2, 1, False),
(2, 2, True),
],
)
def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
limit_mm_per_prompt = {"image": limit}
@ -930,8 +956,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize(
("num_images", "limit", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
(2, 1, False), (2, 2, True)],
[
(0, 0, True),
(0, 1, True),
(1, 0, False),
(1, 1, True),
(1, 2, True),
(2, 1, False),
(2, 2, True),
],
)
def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
limit_mm_per_prompt = {"image": limit}
@ -966,7 +999,6 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
class DummyProcessor:
def __init__(self, a: int = 0, b: int = 0) -> None:
super().__init__()
@ -982,7 +1014,6 @@ class DummyProcessor:
return dict(a=a, c=c)
# yapf: disable
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
@pytest.mark.parametrize(
("config_kwargs", "inference_kwargs", "expected_kwargs"),
@ -996,7 +1027,6 @@ class DummyProcessor:
({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
],
)
# yapf: enable
def test_hf_processor_init_kwargs(
model_id,
config_kwargs,
@ -1020,7 +1050,6 @@ def test_hf_processor_init_kwargs(
assert getattr(processor, k) == v
# yapf: disable
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
@pytest.mark.parametrize(
("config_kwargs", "inference_kwargs", "expected_kwargs"),
@ -1034,7 +1063,6 @@ def test_hf_processor_init_kwargs(
({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
],
)
# yapf: enable
def test_hf_processor_call_kwargs(
model_id,
config_kwargs,

View File

@ -233,7 +233,6 @@ async def test_fetch_video_http_with_dynamic_loader(
assert metadata_sync["video_backend"] == "opencv_dynamic"
# yapf: disable
@pytest.mark.parametrize(
"case",
[
@ -264,7 +263,6 @@ async def test_fetch_video_http_with_dynamic_loader(
("image", 0),
],
),
# Two modalities
## Internally sorted
dict(
@ -276,7 +274,7 @@ async def test_fetch_video_http_with_dynamic_loader(
"audio": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
]
],
},
expected_modality_idxs=[
("audio", 0),
@ -295,7 +293,7 @@ async def test_fetch_video_http_with_dynamic_loader(
"audio": [
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=11, length=4),
]
],
},
expected_modality_idxs=[
("image", 0),
@ -314,7 +312,7 @@ async def test_fetch_video_http_with_dynamic_loader(
"audio": [
PlaceholderRange(offset=11, length=4),
PlaceholderRange(offset=5, length=2),
]
],
},
expected_modality_idxs=[
("image", 1),
@ -323,7 +321,6 @@ async def test_fetch_video_http_with_dynamic_loader(
("audio", 0),
],
),
# Three modalities
## Internally sorted
dict(
@ -339,7 +336,7 @@ async def test_fetch_video_http_with_dynamic_loader(
PlaceholderRange(offset=3, length=4),
PlaceholderRange(offset=7, length=5),
PlaceholderRange(offset=12, length=6),
]
],
},
expected_modality_idxs=[
("audio", 0),
@ -363,7 +360,7 @@ async def test_fetch_video_http_with_dynamic_loader(
],
"video": [
PlaceholderRange(offset=8, length=5),
]
],
},
expected_modality_idxs=[
("image", 0),
@ -386,7 +383,7 @@ async def test_fetch_video_http_with_dynamic_loader(
],
"video": [
PlaceholderRange(offset=8, length=5),
]
],
},
expected_modality_idxs=[
("image", 0),
@ -398,7 +395,6 @@ async def test_fetch_video_http_with_dynamic_loader(
),
],
)
# yapf: enable
def test_argsort_mm_positions(case):
mm_positions = case["mm_positions"]
expected_modality_idxs = case["expected_modality_idxs"]
@ -413,13 +409,16 @@ def test_argsort_mm_positions(case):
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_allowed_media_domains(video_url: str, num_frames: int):
connector = MediaConnector(
media_io_kwargs={"video": {
"num_frames": num_frames,
}},
media_io_kwargs={
"video": {
"num_frames": num_frames,
}
},
allowed_media_domains=[
"www.bogotobogo.com",
"github.com",
])
],
)
video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(video_url)

View File

@ -59,48 +59,52 @@ def test_parse_raw_single_batch_string_slice(inputs_slice: slice):
)
# yapf: disable
@pytest.mark.parametrize('mm_processor_kwargs,expected_mm_kwargs', [
(None, [{}, {}]),
({}, [{}, {}]),
({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
])
# yapf: enable
@pytest.mark.parametrize(
"mm_processor_kwargs,expected_mm_kwargs",
[
(None, [{}, {}]),
({}, [{}, {}]),
({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
],
)
def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
"""Test mm_processor_kwargs init for zipping enc/dec prompts."""
encoder_prompts = ['An encoder prompt', 'Another encoder prompt']
decoder_prompts = ['A decoder prompt', 'Another decoder prompt']
zipped_prompts = zip_enc_dec_prompts(encoder_prompts, decoder_prompts,
mm_processor_kwargs)
encoder_prompts = ["An encoder prompt", "Another encoder prompt"]
decoder_prompts = ["A decoder prompt", "Another decoder prompt"]
zipped_prompts = zip_enc_dec_prompts(
encoder_prompts, decoder_prompts, mm_processor_kwargs
)
assert len(zipped_prompts) == len(encoder_prompts) == len(decoder_prompts)
for enc, dec, exp_kwargs, zipped in zip(encoder_prompts, decoder_prompts,
expected_mm_kwargs,
zipped_prompts):
for enc, dec, exp_kwargs, zipped in zip(
encoder_prompts, decoder_prompts, expected_mm_kwargs, zipped_prompts
):
assert isinstance(zipped, dict)
assert len(zipped.keys()) == 3
assert zipped['encoder_prompt'] == enc
assert zipped['decoder_prompt'] == dec
assert zipped['mm_processor_kwargs'] == exp_kwargs
assert zipped["encoder_prompt"] == enc
assert zipped["decoder_prompt"] == dec
assert zipped["mm_processor_kwargs"] == exp_kwargs
@pytest.mark.parametrize("model_id", [
"facebook/opt-125m",
])
@pytest.mark.parametrize("prompt", [
{
"prompt": "",
"multi_modal_data": {
"dummy": []
@pytest.mark.parametrize(
"model_id",
[
"facebook/opt-125m",
],
)
@pytest.mark.parametrize(
"prompt",
[
{
"prompt": "",
"multi_modal_data": {"dummy": []},
},
},
{
"prompt_token_ids": [],
"multi_modal_data": {
"dummy": []
{
"prompt_token_ids": [],
"multi_modal_data": {"dummy": []},
},
},
])
],
)
def test_preprocessor_text_no_mm_inputs(model_id, prompt):
model_config = ModelConfig(model=model_id)
tokenizer = init_tokenizer_from_configs(model_config)
@ -110,15 +114,19 @@ def test_preprocessor_text_no_mm_inputs(model_id, prompt):
input_preprocessor.preprocess(prompt)
@pytest.mark.parametrize("model_id", [
"facebook/chameleon-7b",
])
@pytest.mark.parametrize("prompt", [
"",
{
"prompt_token_ids": []
},
])
@pytest.mark.parametrize(
"model_id",
[
"facebook/chameleon-7b",
],
)
@pytest.mark.parametrize(
"prompt",
[
"",
{"prompt_token_ids": []},
],
)
def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id)
tokenizer = init_tokenizer_from_configs(model_config)

View File

@ -9,14 +9,10 @@ import pytest
import torch
import torch_xla
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
fused_moe as torch_moe,
)
# yapf: enable
from vllm.platforms import current_platform
if not current_platform.is_tpu():

View File

@ -388,7 +388,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
assert "-O.level" in caplog_vllm.text
# yapf: enable
@pytest.mark.parametrize(
"callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
[
@ -408,7 +407,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
(lambda foo, **kwargs: None, "foo", True, True, False),
],
)
# yapf: disable
def test_supports_kw(
callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
):
@ -681,7 +679,6 @@ def test_lru_cache():
assert 6 in cache
# yapf: disable
@pytest.mark.parametrize(
("src_dtype", "tgt_dtype", "expected_result"),
[
@ -715,12 +712,10 @@ def test_lru_cache():
(torch.complex64, torch.complex32, False),
],
)
# yapf: enable
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
# yapf: disable
@pytest.mark.parametrize(
("dtypes", "expected_result"),
[
@ -730,7 +725,6 @@ def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
],
)
# yapf: enable
def test_common_broadcastable_dtype(dtypes, expected_result):
assert common_broadcastable_dtype(dtypes) == expected_result
@ -775,7 +769,6 @@ def test_placeholder_module_error_handling():
_ = placeholder_attr.module
# yapf: disable
@pytest.mark.parametrize(
"obj,key1,key2",
[
@ -785,8 +778,8 @@ def test_placeholder_module_error_handling():
({1: "a", 2: "b"}, 1, 3),
# Tests for both keys do not exist
({1: "a", 2: "b"}, 3, 4),
])
# yapf: enable
],
)
def test_swap_dict_values(obj, key1, key2):
original_obj = obj.copy()
swap_dict_values(obj, key1, key2)
@ -800,26 +793,30 @@ def test_swap_dict_values(obj, key1, key2):
assert key1 not in obj
def test_model_specification(parser_with_config, cli_config_file,
cli_config_file_with_model):
def test_model_specification(
parser_with_config, cli_config_file, cli_config_file_with_model
):
# Test model in CLI takes precedence over config
args = parser_with_config.parse_args(
['serve', 'cli-model', '--config', cli_config_file_with_model])
assert args.model_tag == 'cli-model'
assert args.served_model_name == 'mymodel'
["serve", "cli-model", "--config", cli_config_file_with_model]
)
assert args.model_tag == "cli-model"
assert args.served_model_name == "mymodel"
# Test model from config file works
args = parser_with_config.parse_args([
'serve',
'--config',
cli_config_file_with_model,
])
assert args.model == 'config-model'
assert args.served_model_name == 'mymodel'
args = parser_with_config.parse_args(
[
"serve",
"--config",
cli_config_file_with_model,
]
)
assert args.model == "config-model"
assert args.served_model_name == "mymodel"
# Test no model specified anywhere raises error
with pytest.raises(ValueError, match="No model specified!"):
parser_with_config.parse_args(['serve', '--config', cli_config_file])
parser_with_config.parse_args(["serve", "--config", cli_config_file])
# Test using --model option raises error
# with pytest.raises(
@ -833,47 +830,52 @@ def test_model_specification(parser_with_config, cli_config_file,
# Test using --model option back-compatibility
# (when back-compatibility ends, the above test should be uncommented
# and the below test should be removed)
args = parser_with_config.parse_args([
'serve',
'--tensor-parallel-size',
'2',
'--model',
'my-model',
'--trust-remote-code',
'--port',
'8001',
])
args = parser_with_config.parse_args(
[
"serve",
"--tensor-parallel-size",
"2",
"--model",
"my-model",
"--trust-remote-code",
"--port",
"8001",
]
)
assert args.model is None
assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True
assert args.port == 8001
args = parser_with_config.parse_args([
'serve',
'--tensor-parallel-size=2',
'--model=my-model',
'--trust-remote-code',
'--port=8001',
])
args = parser_with_config.parse_args(
[
"serve",
"--tensor-parallel-size=2",
"--model=my-model",
"--trust-remote-code",
"--port=8001",
]
)
assert args.model is None
assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True
assert args.port == 8001
# Test other config values are preserved
args = parser_with_config.parse_args([
'serve',
'cli-model',
'--config',
cli_config_file_with_model,
])
args = parser_with_config.parse_args(
[
"serve",
"cli-model",
"--config",
cli_config_file_with_model,
]
)
assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True
assert args.port == 12312
@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
(None, bool, [1, 2, 3])])
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
digest = sha256(input)
assert digest is not None
@ -887,7 +889,7 @@ def test_sha256(input: tuple):
assert digest == sha256(input)
# hashing different input, returns different value
assert digest != sha256(input + (1, ))
assert digest != sha256(input + (1,))
@pytest.mark.parametrize(
@ -897,7 +899,8 @@ def test_sha256(input: tuple):
("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
("tcp://[::1]:5555", ("tcp", "::1", "5555")), # IPv6 address
("inproc://some_identifier", ("inproc", "some_identifier", "")),
])
],
)
def test_split_zmq_path(path, expected):
assert split_zmq_path(path) == expected
@ -909,7 +912,8 @@ def test_split_zmq_path(path, expected):
"tcp://127.0.0.1", # Missing port
"tcp://[::1]", # Missing port for IPv6
"tcp://:5555", # Missing host
])
],
)
def test_split_zmq_path_invalid(invalid_path):
with pytest.raises(ValueError):
split_zmq_path(invalid_path)
@ -931,8 +935,9 @@ def test_make_zmq_socket_ipv6():
zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
# Verify that the IPV6 option is set
assert zsock.getsockopt(
zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
assert zsock.getsockopt(zmq.IPV6) == 1, (
"IPV6 option should be enabled for IPv6 addresses"
)
# Clean up
zsock.close()
@ -1019,15 +1024,14 @@ def test_convert_ids_list_to_tokens():
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
token_ids = tokenizer.encode("Hello, world!")
# token_ids = [9707, 11, 1879, 0]
assert tokenizer.convert_ids_to_tokens(token_ids) == [
'Hello', ',', 'Ġworld', '!'
]
assert tokenizer.convert_ids_to_tokens(token_ids) == ["Hello", ",", "Ġworld", "!"]
tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
assert tokens == ['Hello', ',', ' world', '!']
assert tokens == ["Hello", ",", " world", "!"]
def test_current_stream_multithread():
import threading
if not torch.cuda.is_available():
pytest.skip("CUDA not available")
@ -1046,13 +1050,18 @@ def test_current_stream_multithread():
child_thread.start()
try:
assert thread_stream_ready.wait(
timeout=5), "Child thread failed to enter stream context in time"
assert thread_stream_ready.wait(timeout=5), (
"Child thread failed to enter stream context in time"
)
main_current_stream = current_stream()
assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread"
assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream"
assert main_current_stream != child_stream, (
"Main thread's current_stream was contaminated by child thread"
)
assert main_current_stream == main_default_stream, (
"Main thread's current_stream is not the default stream"
)
# Notify child thread it can exit
thread_can_exit.set()
@ -1070,7 +1079,7 @@ def test_load_config_file(tmp_path):
"enable-logging": True,
"list-arg": ["item1", "item2"],
"port": 12323,
"tensor-parallel-size": 4
"tensor-parallel-size": 4,
}
# Write the configuration data to a temporary YAML file

View File

@ -16,9 +16,6 @@ from vllm.multimodal.inputs import (
from vllm.sampling_params import SamplingParams
from vllm.utils import GiB_bytes, sha256, sha256_cbor
from vllm.v1.core.kv_cache_manager import KVCacheManager
# disable yapf here as it formats differently than isort such that both fail
# yapf: disable
from vllm.v1.core.kv_cache_utils import (
BlockHash,
FreeKVCacheBlockQueue,
@ -48,8 +45,6 @@ from vllm.v1.kv_cache_interface import (
from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.request import Request
# yapf: enable
pytestmark = pytest.mark.cpu_test

View File

@ -22,8 +22,6 @@ from vllm.config import VllmConfig
from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams
from vllm.utils import is_pin_memory_available
# yapf: disable
from vllm.v1.sample.logits_processor import (
BatchUpdate,
BatchUpdateBuilder,
@ -34,8 +32,6 @@ from vllm.v1.sample.logits_processor import (
MoveDirectionality,
build_logitsprocs,
)
# yapf: enable
from vllm.v1.sample.metadata import SamplingMetadata
PIN_MEMORY_AVAILABLE = is_pin_memory_available()

View File

@ -7,8 +7,6 @@ from typing import Union
import pytest
from tests.utils import create_new_process_for_each_test
# yapf: disable
from tests.v1.logits_processors.utils import (
DUMMY_LOGITPROC_ARG,
DUMMY_LOGITPROC_FQCN,
@ -24,8 +22,6 @@ from tests.v1.logits_processors.utils import (
prompts,
)
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
# yapf: enable
from vllm import LLM, SamplingParams
from vllm.v1.sample.logits_processor import (
STR_POOLING_REJECTS_LOGITSPROCS,

View File

@ -11,8 +11,6 @@ import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_test
# yapf: disable
from tests.v1.logits_processors.utils import (
DUMMY_LOGITPROC_ARG,
DUMMY_LOGITPROC_FQCN,
@ -25,8 +23,6 @@ from tests.v1.logits_processors.utils import (
)
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
# yapf: enable
def _server_with_logitproc_entrypoint(
env_dict: Optional[dict[str, str]],

View File

@ -4,7 +4,6 @@
import importlib
from typing import TYPE_CHECKING, Callable
# yapf: disable
import vllm.envs as envs
from vllm.distributed.kv_transfer.kv_connector.base import (
KVConnectorBase,
@ -13,8 +12,6 @@ from vllm.distributed.kv_transfer.kv_connector.base import (
from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
from vllm.logger import init_logger
# yapf: enable
if TYPE_CHECKING:
from vllm.config import VllmConfig
from vllm.config.kv_transfer import KVTransferConfig

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
import argparse
import copy
import dataclasses
@ -88,8 +87,6 @@ from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor
from vllm.v1.sample.logits_processor import LogitsProcessor
# yapf: enable
if TYPE_CHECKING:
from vllm.executor.executor_base import ExecutorBase
from vllm.model_executor.layers.quantization import QuantizationMethods

View File

@ -17,9 +17,6 @@ import jinja2.nodes
import jinja2.parser
import jinja2.sandbox
import transformers.utils.chat_template_utils as hf_chat_utils
# yapf conflicts with isort for this block
# yapf: disable
from openai.types.chat import (
ChatCompletionAssistantMessageParam,
ChatCompletionContentPartImageParam,
@ -40,8 +37,6 @@ from openai.types.responses import ResponseInputImageParam
from openai_harmony import Message as OpenAIHarmonyMessage
from PIL import Image
from pydantic import BaseModel, ConfigDict, TypeAdapter
# yapf: enable
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin
# pydantic needs the TypedDict from typing_extensions
@ -52,11 +47,7 @@ from vllm.logger import init_logger
from vllm.model_executor.models import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import MediaConnector
# yapf: disable
from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
# yapf: enable
from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import random_uuid, supports_kw
@ -317,11 +308,7 @@ def _is_var_or_elems_access(
):
return _is_var_or_elems_access(node.node, varname, key)
# yapf: disable
return (
_is_attr_access(node, varname, key) if key
else _is_var_access(node, varname)
) # yapf: enable
return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):

View File

@ -39,9 +39,6 @@ from vllm.entrypoints.chat_utils import (
parse_chat_messages,
resolve_chat_template_content_format,
)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.score_utils import (
ScoreContentPartParam,
ScoreMultiModalParam,
@ -50,8 +47,6 @@ from vllm.entrypoints.score_utils import (
compress_token_type_ids,
get_score_prompt,
)
# yapf: enable
from vllm.entrypoints.utils import _validate_truncation_size, log_non_default_args
from vllm.inputs import (
DataPrompt,

View File

@ -49,9 +49,6 @@ from vllm.entrypoints.chat_utils import (
from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
@ -84,8 +81,6 @@ from vllm.entrypoints.openai.protocol import (
TranslationResponse,
UnloadLoRAAdapterRequest,
)
# yapf: enable
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_classification import ServingClassification
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion

View File

@ -11,8 +11,6 @@ from typing import Annotated, Any, ClassVar, Generic, Literal, Optional, TypeVar
import regex as re
import torch
from fastapi import HTTPException, UploadFile
# yapf: disable
from openai.types.chat.chat_completion_audio import (
ChatCompletionAudio as OpenAIChatCompletionAudio,
)
@ -46,8 +44,6 @@ from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreated
from openai.types.responses import (
ResponseInProgressEvent as OpenAIResponseInProgressEvent,
)
# yapf: enable
from openai.types.responses.response_reasoning_item import (
Content as ResponseReasoningTextContent,
)

View File

@ -18,8 +18,6 @@ from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
# yapf: disable
from vllm.entrypoints.openai.protocol import (
BatchRequestInput,
BatchRequestOutput,
@ -30,8 +28,6 @@ from vllm.entrypoints.openai.protocol import (
RerankResponse,
ScoreResponse,
)
# yapf: enable
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels

View File

@ -1733,13 +1733,15 @@ class OpenAIServingChat(OpenAIServing):
is a tool call with arguments.
"""
# yapf: disable
return bool(
# if there is a delta message that includes tool calls which
# include a function that has arguments
output.finish_reason is not None
and self.enable_auto_tools and self.tool_parser and delta_message
and delta_message.tool_calls and delta_message.tool_calls[0]
and self.enable_auto_tools
and self.tool_parser
and delta_message
and delta_message.tool_calls
and delta_message.tool_calls[0]
and delta_message.tool_calls[0].function
and delta_message.tool_calls[0].function.arguments is not None
)

View File

@ -18,8 +18,6 @@ from vllm.entrypoints.openai.protocol import (
ErrorResponse,
UsageInfo,
)
# yapf: enable
from vllm.entrypoints.openai.serving_engine import (
ClassificationServeContext,
OpenAIServing,

View File

@ -13,9 +13,6 @@ from fastapi import Request
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import (
CompletionLogProbs,
CompletionRequest,
@ -29,8 +26,6 @@ from vllm.entrypoints.openai.protocol import (
UsageInfo,
)
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
# yapf: enable
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.entrypoints.utils import get_max_tokens

View File

@ -14,9 +14,6 @@ from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this docstring
# yapf: disable
from vllm.entrypoints.openai.protocol import (
EmbeddingChatRequest,
EmbeddingCompletionRequest,
@ -32,8 +29,6 @@ from vllm.entrypoints.openai.serving_engine import (
ServeContext,
TextTokensPrompt,
)
# yapf: enable
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt

View File

@ -28,9 +28,6 @@ else:
import vllm.envs as envs
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.chat_utils import (
ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
@ -72,8 +69,6 @@ from vllm.entrypoints.openai.protocol import (
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
# yapf: enable
from vllm.inputs.data import PromptType
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.parse import PromptComponents, get_prompt_components

View File

@ -17,8 +17,6 @@ from vllm.config import VllmConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger
# yapf: disable
from vllm.entrypoints.openai.protocol import (
ErrorResponse,
IOProcessorRequest,
@ -30,8 +28,6 @@ from vllm.entrypoints.openai.protocol import (
PoolingResponseData,
UsageInfo,
)
# yapf: enable
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig

View File

@ -14,9 +14,6 @@ from typing import Callable, Final, Optional, Union
import jinja2
from fastapi import Request
# yapf conflicts with isort for this block
# yapf: disable
from openai.types.responses import (
ResponseCodeInterpreterCallCodeDeltaEvent,
ResponseCodeInterpreterCallCodeDoneEvent,
@ -46,8 +43,6 @@ from openai.types.responses import (
response_text_delta_event,
)
from openai.types.responses.response_output_text import Logprob, LogprobTopLogprob
# yapf: enable
from openai.types.responses.response_reasoning_item import (
Content as ResponseReasoningTextContent,
)
@ -78,9 +73,6 @@ from vllm.entrypoints.harmony_utils import (
render_for_completion,
)
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import (
DeltaMessage,
ErrorResponse,
@ -97,8 +89,6 @@ from vllm.entrypoints.openai.protocol import (
ResponseUsage,
StreamingResponsesResponse,
)
# yapf: enable
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.tool_server import ToolServer

View File

@ -24,9 +24,6 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.score_utils import (
ScoreContentPartParam,
ScoreMultiModalParam,
@ -35,8 +32,6 @@ from vllm.entrypoints.score_utils import (
compress_token_type_ids,
get_score_prompt,
)
# yapf: enable
from vllm.entrypoints.utils import _validate_truncation_size
from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger

View File

@ -10,9 +10,6 @@ from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import (
DetokenizeRequest,
DetokenizeResponse,
@ -22,8 +19,6 @@ from vllm.entrypoints.openai.protocol import (
TokenizeResponse,
TokenizerInfoResponse,
)
# yapf: enable
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig

View File

@ -11,7 +11,7 @@ import cloudpickle
import msgspec
import vllm.envs as envs
from vllm.executor.executor_base import DistributedExecutorBase # yapf: disable
from vllm.executor.executor_base import DistributedExecutorBase
from vllm.executor.msgspec_utils import encode_hook
from vllm.executor.ray_utils import RayWorkerWrapper, initialize_ray_cluster, ray
from vllm.logger import init_logger

View File

@ -8,8 +8,6 @@ from transformers import PretrainedConfig
from vllm.config.lora import LoRAConfig
from vllm.distributed.utils import divide
# yapf: disable
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
LinearBase,
@ -23,7 +21,6 @@ from .utils import _get_lora_device
class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
def __init__(self, base_layer: LinearBase):
super().__init__()
self.base_layer = base_layer
@ -50,16 +47,20 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
lora_b_out_size = self.output_size
elif isinstance(self.base_layer, ColumnParallelLinear):
lora_a_out_size = (lora_config.max_lora_rank if
not lora_config.fully_sharded_loras else divide(
lora_config.max_lora_rank, self.tp_size))
lora_a_out_size = (
lora_config.max_lora_rank
if not lora_config.fully_sharded_loras
else divide(lora_config.max_lora_rank, self.tp_size)
)
lora_b_out_size = self.output_size
elif isinstance(self.base_layer, RowParallelLinear):
lora_a_out_size = lora_config.max_lora_rank
lora_b_out_size = (self.output_size if
not lora_config.fully_sharded_loras else divide(
self.output_size, self.tp_size))
lora_b_out_size = (
self.output_size
if not lora_config.fully_sharded_loras
else divide(self.output_size, self.tp_size)
)
else:
raise NotImplementedError
@ -71,7 +72,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
self.input_size,
dtype=lora_config.lora_dtype,
device=self.device,
) for _ in range(self.n_slices))
)
for _ in range(self.n_slices)
)
self.lora_b_stacked = tuple(
torch.zeros(
max_loras,
@ -80,7 +83,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
lora_config.max_lora_rank,
dtype=lora_config.lora_dtype,
device=self.device,
) for _ in range(self.n_slices))
)
for _ in range(self.n_slices)
)
if lora_config.bias_enabled:
lora_bias_out_size = lora_b_out_size
self.lora_bias_stacked = tuple(
@ -90,8 +95,10 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
lora_bias_out_size,
dtype=lora_config.lora_dtype,
device=self.device,
) for _ in range(self.n_slices))
self.output_slices = (self.lora_b_stacked[0].shape[2], )
)
for _ in range(self.n_slices)
)
self.output_slices = (self.lora_b_stacked[0].shape[2],)
def reset_lora(self, index: int):
for s_index in range(self.n_slices):
@ -99,8 +106,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
self.lora_b_stacked[s_index][index] = 0
if self.lora_config.bias_enabled:
# Make mypy happy
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
self.lora_bias_stacked)
self.lora_bias_stacked = cast(
tuple[torch.Tensor, ...], self.lora_bias_stacked
)
self.lora_bias_stacked[s_index][index] = 0
def set_lora(
@ -115,8 +123,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
# MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
# store weights in a tuple of size 1. These two layers will
# override this function.
assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) ==
self.n_slices == 1)
assert (
len(self.lora_a_stacked) == len(self.lora_b_stacked) == self.n_slices == 1
)
self.reset_lora(index)
if self.tp_size > 1:
@ -125,23 +134,24 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
if lora_bias is not None:
lora_bias = self.slice_bias(lora_bias)
self.lora_a_stacked[0][index,
0, :lora_a.shape[0], :lora_a.shape[1]].copy_(
lora_a, non_blocking=True)
self.lora_b_stacked[0][index,
0, :lora_b.shape[0], :lora_b.shape[1]].copy_(
lora_b, non_blocking=True)
self.lora_a_stacked[0][index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
lora_a, non_blocking=True
)
self.lora_b_stacked[0][index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
lora_b, non_blocking=True
)
if lora_bias is not None:
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
self.lora_bias_stacked)
self.lora_bias_stacked = cast(
tuple[torch.Tensor, ...], self.lora_bias_stacked
)
assert len(self.lora_bias_stacked)
self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_(
lora_bias, non_blocking=True)
self.lora_bias_stacked[0][index, 0, : lora_bias.shape[0]].copy_(
lora_bias, non_blocking=True
)
def apply(self,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
def apply(
self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
) -> torch.Tensor:
output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
# In transformers backend, x and output have extra batch dimension like
@ -151,10 +161,15 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
output = output.flatten(0, 1)
x = x.flatten(0, 1)
lora_output: Optional[
torch.Tensor] = self.punica_wrapper.add_lora_linear(
output, x, self.lora_a_stacked, self.lora_b_stacked,
self.lora_bias_stacked, 1.0, self.output_slices)
lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_lora_linear(
output,
x,
self.lora_a_stacked,
self.lora_b_stacked,
self.lora_bias_stacked,
1.0,
self.output_slices,
)
if not current_platform.can_update_inplace():
output = lora_output
@ -162,7 +177,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
@property
def weight(self) -> torch.Tensor:
# unquantizedLinear
if hasattr(self.base_layer, "weight"):
return self.base_layer.weight

View File

@ -12,8 +12,6 @@ from vllm.distributed import (
split_tensor_along_last_dim,
tensor_model_parallel_all_reduce,
)
# yapf: disable
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.platforms import current_platform
@ -22,7 +20,6 @@ from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
def __init__(self, base_layer: RowParallelLinear) -> None:
super().__init__(base_layer)
@ -33,11 +30,10 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
self.n_slices = 1
def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
shard_size = self.input_size
start_idx = self.tp_rank * shard_size
end_idx = (self.tp_rank + 1) * shard_size
lora_a = lora_a[:,start_idx:end_idx]
lora_a = lora_a[:, start_idx:end_idx]
return lora_a
def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
@ -66,7 +62,8 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
else:
# TODO: simplify code below
splitted_input = split_tensor_along_last_dim(
input_, num_partitions=self.tp_size)
input_, num_partitions=self.tp_size
)
input_parallel = splitted_input[self.tp_rank].contiguous()
# Matrix multiply.
@ -77,8 +74,11 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
output_ = output_parallel
if not self.base_layer.skip_bias_add:
output = (output_ + self.base_layer.bias
if self.base_layer.bias is not None else output_)
output = (
output_ + self.base_layer.bias
if self.base_layer.bias is not None
else output_
)
output_bias = None
else:
output = output_
@ -101,11 +101,11 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
return type(source_layer) is RowParallelLinear
# The following layer is based on the tensor parallelism strategy given in
# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
# https://arxiv.org/abs/2311.03285.
class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
"""
Differs from RowParallelLinearWithLoRA by slicing the
@ -120,28 +120,26 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
shard_size = self.lora_b_stacked[0].shape[2]
start_idx = self.tp_rank * shard_size
end_idx = (self.tp_rank + 1) * shard_size
lora_b = lora_b[ start_idx:end_idx,:]
lora_b = lora_b[start_idx:end_idx, :]
return lora_b
def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
if bias is None:
return bias
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
self.lora_bias_stacked)
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked)
shard_size = self.lora_bias_stacked[0].shape[2]
start_idx = self.tp_rank * shard_size
end_idx = (self.tp_rank + 1) * shard_size
bias = bias[start_idx:end_idx]
return bias
def apply(self,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
def apply(
self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
) -> torch.Tensor:
output = self.base_layer.quant_method.apply(self.base_layer, x)
x = x.view(-1, x.shape[-1])
output, out_orig_shape = output.view(-1,
output.shape[-1]), output.shape
output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
buffer = torch.zeros(
(self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
dtype=torch.float32,
@ -149,10 +147,11 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
)
shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink(
buffer, x, self.lora_a_stacked, 1.0)
buffer, x, self.lora_a_stacked, 1.0
)
if not current_platform.can_update_inplace():
buffer = shrunk_buffer
if self.tp_size>1:
if self.tp_size > 1:
buffer = tensor_model_parallel_all_reduce(buffer)
# following S-LoRA, allows the fusing of all_gather and all_reduce

View File

@ -19,8 +19,6 @@ from vllm.config.lora import LoRAConfig
from vllm.logger import init_logger
# being imported for _all_lora_classes below
# yapf conflicts with isort for this block
# yapf: disable
from vllm.lora.layers import (
BaseLayerWithLoRA,
ColumnParallelLinearWithLoRA,
@ -39,8 +37,6 @@ from vllm.lora.layers import (
)
from vllm.model_executor.layers.linear import LinearBase
# yapf: enable
if TYPE_CHECKING:
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead

View File

@ -14,8 +14,6 @@ import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import _custom_ops as ops
from vllm.logger import init_logger
# yapf: disable
from vllm.model_executor.layers.fused_moe.config import (
FUSED_MOE_UNQUANTIZED_CONFIG,
FusedMoEQuantConfig,
@ -25,8 +23,6 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import (
_valid_cutlass_block_scaled_grouped_gemm,
run_cutlass_block_scaled_fused_experts,
)
# yapf: enable
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
_valid_deep_gemm,
deep_gemm_moe_fp8,

View File

@ -24,8 +24,6 @@ from vllm.distributed.eplb.eplb_state import EplbState
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
# yapf: disable
from vllm.model_executor.layers.fused_moe.config import (
FUSED_MOE_UNQUANTIZED_CONFIG,
FusedMoEConfig,
@ -34,8 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import (
biased_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
# yapf: enable
from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEActivationFormat,
FusedMoEModularKernel,

View File

@ -10,7 +10,7 @@ import torch
import vllm.envs as envs
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.utils import ( # yapf: disable
from vllm.model_executor.layers.fused_moe.utils import (
_resize_cache,
count_expert_num_tokens,
)

View File

@ -24,8 +24,6 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizeMethodBase,
)
from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
# yapf: disable
from vllm.model_executor.parameter import (
BasevLLMParameter,
BlockQuantScaleParameter,
@ -35,8 +33,6 @@ from vllm.model_executor.parameter import (
PerTensorScaleParameter,
RowvLLMParameter,
)
# yapf: enable
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.utils import GiB_bytes

View File

@ -17,17 +17,12 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
marlin_repeat_scales_on_all_ranks,
)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.parameter import (
BasevLLMParameter,
ChannelQuantScaleParameter,
GroupQuantScaleParameter,
PackedvLLMParameter,
)
# yapf: enable
from vllm.scalar_type import scalar_types
logger = init_logger(__name__)

View File

@ -17,9 +17,6 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
marlin_repeat_scales_on_all_ranks,
)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.parameter import (
BasevLLMParameter,
ChannelQuantScaleParameter,
@ -28,8 +25,6 @@ from vllm.model_executor.parameter import (
PackedvLLMParameter,
RowvLLMParameter,
)
# yapf: enable
from vllm.scalar_type import scalar_types
logger = init_logger(__name__)

View File

@ -22,8 +22,6 @@ from vllm.distributed import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
# yapf: enable
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import (
@ -51,8 +49,6 @@ from vllm.model_executor.utils import (
)
from vllm.platforms import current_platform
# yapf conflicts with isort for this block
logger = init_logger(__name__)

View File

@ -39,13 +39,10 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
# yapf: disable
from .idefics2_vision_model import Idefics2VisionConfig
from .idefics2_vision_model import (
Idefics2VisionTransformer as Idefics3VisionTransformer,
)
# yapf: enable
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant
from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
from .utils import (

View File

@ -22,8 +22,6 @@ from vllm.multimodal.inputs import (
MultiModalKwargsItems,
)
from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
# yapf: disable
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
BaseProcessingInfo,
@ -35,8 +33,6 @@ from vllm.multimodal.processing import (
PromptUpdateDetails,
replace_token_matches,
)
# yapf: enable
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape

View File

@ -6,14 +6,16 @@ from typing import Annotated, Any, Literal, Optional, Union, cast
import numpy as np
import torch
# yapf: disable
from torch import nn
from transformers import AutoModel, BatchFeature
from transformers.models.gemma3n import (Gemma3nAudioConfig,
Gemma3nAudioFeatureExtractor,
Gemma3nConfig, Gemma3nProcessor,
Gemma3nTextConfig,
Gemma3nVisionConfig)
from transformers.models.gemma3n import (
Gemma3nAudioConfig,
Gemma3nAudioFeatureExtractor,
Gemma3nConfig,
Gemma3nProcessor,
Gemma3nTextConfig,
Gemma3nVisionConfig,
)
from transformers.models.siglip import SiglipImageProcessorFast
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
@ -22,25 +24,32 @@ from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems,
MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo,
MultiModalPromptUpdates,
MultiModalPromptUpdatesApplyResult,
PlaceholderFeaturesInfo,
PromptReplacement, PromptUpdate,
PromptUpdateDetails,
replace_token_matches)
# yapf: enable
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import (
ImageProcessorItems,
MultiModalDataItems,
MultiModalDataParser,
)
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
BaseProcessingInfo,
MultiModalPromptUpdates,
MultiModalPromptUpdatesApplyResult,
PlaceholderFeaturesInfo,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
replace_token_matches,
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape

View File

@ -43,9 +43,6 @@ from vllm.multimodal.inputs import (
MultiModalKwargsItems,
)
from vllm.multimodal.parse import ImageProcessorItems, ImageSize
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
BaseProcessingInfo,
@ -54,18 +51,13 @@ from vllm.multimodal.processing import (
PromptUpdate,
PromptUpdateDetails,
)
# yapf: enable
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
# yapf: disable
from .idefics2_vision_model import (
Idefics2VisionTransformer as Idefics3VisionTransformer,
)
# yapf: enable
from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
from .llama import LlamaModel
from .utils import AutoWeightsLoader, maybe_prefix

View File

@ -45,9 +45,6 @@ from vllm.multimodal.parse import (
ImageSize,
MultiModalDataItems,
)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
BaseProcessingInfo,
@ -57,8 +54,6 @@ from vllm.multimodal.processing import (
PromptUpdate,
ResolvedPromptUpdate,
)
# yapf: enable
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of

View File

@ -52,16 +52,12 @@ from vllm.distributed import utils as dist_utils
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_and_mul_fn
from vllm.model_executor.layers.layernorm import RMSNorm
# yapf: disable
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
MergedColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear,
)
# yapf: enable
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys

View File

@ -37,12 +37,7 @@ from vllm.model_executor.layers.fla.ops import (
fused_recurrent_gated_delta_rule,
)
from vllm.model_executor.layers.fused_moe import FusedMoE
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.layers.layernorm import GemmaRMSNorm as Qwen3NextRMSNorm
# yapf: enable
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
QKVParallelLinear,

View File

@ -54,7 +54,6 @@ from .interfaces_base import (
logger = init_logger(__name__)
# yapf: disable
_TEXT_GENERATION_MODELS = {
# [Decoder-only]
"ApertusForCausalLM": ("apertus", "ApertusForCausalLM"),
@ -106,8 +105,8 @@ _TEXT_GENERATION_MODELS = {
"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
"GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
"GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
"GritLM": ("gritlm", "GritLM"),
"Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
"HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
@ -127,7 +126,7 @@ _TEXT_GENERATION_MODELS = {
"LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"),
"MambaForCausalLM": ("mamba", "MambaForCausalLM"),
"FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
"FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"),
"FalconH1ForCausalLM": ("falcon_h1", "FalconH1ForCausalLM"),
"Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"),
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
"MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
@ -184,7 +183,8 @@ _EMBEDDING_MODELS = {
"LlamaModel": ("llama", "LlamaForCausalLM"),
**{
# Multiple models share the same architecture, so we include them all
k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
k: (mod, arch)
for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
if arch == "LlamaForCausalLM"
},
"MistralModel": ("llama", "LlamaForCausalLM"),
@ -201,7 +201,10 @@ _EMBEDDING_MODELS = {
"XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"),
# [Multimodal]
"CLIPModel": ("clip", "CLIPEmbeddingModel"),
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
"LlavaNextForConditionalGeneration": (
"llava_next",
"LlavaNextForConditionalGeneration",
), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
# Technically Terratorch models work on images, both in
@ -214,79 +217,150 @@ _EMBEDDING_MODELS = {
_CROSS_ENCODER_MODELS = {
"BertForSequenceClassification": ("bert", "BertForSequenceClassification"),
"BertForTokenClassification": ("bert", "BertForTokenClassification"),
"GteNewForSequenceClassification": ("bert_with_rope",
"GteNewForSequenceClassification"),
"ModernBertForSequenceClassification": ("modernbert",
"ModernBertForSequenceClassification"),
"RobertaForSequenceClassification": ("roberta",
"RobertaForSequenceClassification"),
"XLMRobertaForSequenceClassification": ("roberta",
"RobertaForSequenceClassification"),
"GteNewForSequenceClassification": (
"bert_with_rope",
"GteNewForSequenceClassification",
),
"ModernBertForSequenceClassification": (
"modernbert",
"ModernBertForSequenceClassification",
),
"RobertaForSequenceClassification": ("roberta", "RobertaForSequenceClassification"),
"XLMRobertaForSequenceClassification": (
"roberta",
"RobertaForSequenceClassification",
),
# [Auto-converted (see adapters.py)]
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
}
_MULTIMODAL_MODELS = {
# [Decoder-only]
"AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
"AyaVisionForConditionalGeneration": ("aya_vision", "AyaVisionForConditionalGeneration"), # noqa: E501
"AyaVisionForConditionalGeneration": (
"aya_vision",
"AyaVisionForConditionalGeneration",
), # noqa: E501
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501
"Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501
"ChameleonForConditionalGeneration": (
"chameleon",
"ChameleonForConditionalGeneration",
), # noqa: E501
"Cohere2VisionForConditionalGeneration": (
"cohere2_vision",
"Cohere2VisionForConditionalGeneration",
), # noqa: E501
"DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
"DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"),
"Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"), # noqa: E501
"Ernie4_5_VLMoeForConditionalGeneration": (
"ernie45_vl",
"Ernie4_5_VLMoeForConditionalGeneration",
), # noqa: E501
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501
"Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501
"Gemma3nForConditionalGeneration": (
"gemma3n_mm",
"Gemma3nForConditionalGeneration",
), # noqa: E501
"GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
"Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501
"Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"), # noqa: E501
"GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501
"GraniteSpeechForConditionalGeneration": (
"granite_speech",
"GraniteSpeechForConditionalGeneration",
), # noqa: E501
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
"InternS1ForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501
"InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501
"Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
"SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501
"InternS1ForConditionalGeneration": (
"interns1",
"InternS1ForConditionalGeneration",
), # noqa: E501
"InternVLForConditionalGeneration": (
"interns1",
"InternS1ForConditionalGeneration",
), # noqa: E501
"Idefics3ForConditionalGeneration": (
"idefics3",
"Idefics3ForConditionalGeneration",
),
"SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"), # noqa: E501
"KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
"KeyeVL1_5ForConditionalGeneration": ("keye_vl1_5", "KeyeVL1_5ForConditionalGeneration"), # noqa: E501
"KeyeVL1_5ForConditionalGeneration": (
"keye_vl1_5",
"KeyeVL1_5ForConditionalGeneration",
), # noqa: E501
"RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
"Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
"Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501
"LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501
"LlavaNextForConditionalGeneration": (
"llava_next",
"LlavaNextForConditionalGeneration",
), # noqa: E501
"LlavaNextVideoForConditionalGeneration": (
"llava_next_video",
"LlavaNextVideoForConditionalGeneration",
), # noqa: E501
"LlavaOnevisionForConditionalGeneration": (
"llava_onevision",
"LlavaOnevisionForConditionalGeneration",
), # noqa: E501
"MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501
"MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
"MiniMaxVL01ForConditionalGeneration": ("minimax_vl_01", "MiniMaxVL01ForConditionalGeneration"), # noqa: E501
"MiniMaxVL01ForConditionalGeneration": (
"minimax_vl_01",
"MiniMaxVL01ForConditionalGeneration",
), # noqa: E501
"MiniCPMO": ("minicpmo", "MiniCPMO"),
"MiniCPMV": ("minicpmv", "MiniCPMV"),
"Mistral3ForConditionalGeneration": ("mistral3", "Mistral3ForConditionalGeneration"), # noqa: E501
"Mistral3ForConditionalGeneration": (
"mistral3",
"Mistral3ForConditionalGeneration",
), # noqa: E501
"MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
"NVLM_D": ("nvlm_d", "NVLM_D_Model"),
"Ovis": ("ovis", "Ovis"),
"Ovis2_5": ("ovis2_5", "Ovis2_5"),
"PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501
"PaliGemmaForConditionalGeneration": (
"paligemma",
"PaliGemmaForConditionalGeneration",
), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
"Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"), # noqa: E501
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501
"QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501
"Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501
"Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
"Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": (
"qwen2_5_vl",
"Qwen2_5_VLForConditionalGeneration",
), # noqa: E501
"Qwen2AudioForConditionalGeneration": (
"qwen2_audio",
"Qwen2AudioForConditionalGeneration",
), # noqa: E501
"Qwen2_5OmniModel": (
"qwen2_5_omni_thinker",
"Qwen2_5OmniThinkerForConditionalGeneration",
), # noqa: E501
"Qwen2_5OmniForConditionalGeneration": (
"qwen2_5_omni_thinker",
"Qwen2_5OmniThinkerForConditionalGeneration",
), # noqa: E501
"Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"), # noqa: E501
"Qwen3VLMoeForConditionalGeneration": ("qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"), # noqa: E501
"Qwen3VLMoeForConditionalGeneration": (
"qwen3_vl_moe",
"Qwen3VLMoeForConditionalGeneration",
), # noqa: E501
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
"Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
"Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501
"Tarsier2ForConditionalGeneration": (
"qwen2_vl",
"Tarsier2ForConditionalGeneration",
), # noqa: E501
"UltravoxModel": ("ultravox", "UltravoxModel"),
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
# [Encoder-decoder]
@ -324,13 +398,27 @@ _TRANSFORMERS_BACKEND_MODELS = {
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
"TransformersMoEForCausalLM": ("transformers_moe", "TransformersMoEForCausalLM"), # noqa: E501
"TransformersMoEForMultimodalLM": ("transformers_moe", "TransformersMoEForMultimodalLM"), # noqa: E501
"TransformersEmbeddingModel": ("transformers_pooling", "TransformersEmbeddingModel"), # noqa: E501
"TransformersForSequenceClassification": ("transformers_pooling", "TransformersForSequenceClassification"), # noqa: E501
"TransformersMoEForSequenceClassification": ("transformers_pooling", "TransformersMoEForSequenceClassification"), # noqa: E501
"TransformersMoEEmbeddingModel": ("transformers_pooling", "TransformersMoEEmbeddingModel"), # noqa: E501
"TransformersMoEForMultimodalLM": (
"transformers_moe",
"TransformersMoEForMultimodalLM",
), # noqa: E501
"TransformersEmbeddingModel": (
"transformers_pooling",
"TransformersEmbeddingModel",
), # noqa: E501
"TransformersForSequenceClassification": (
"transformers_pooling",
"TransformersForSequenceClassification",
), # noqa: E501
"TransformersMoEForSequenceClassification": (
"transformers_pooling",
"TransformersMoEForSequenceClassification",
), # noqa: E501
"TransformersMoEEmbeddingModel": (
"transformers_pooling",
"TransformersMoEEmbeddingModel",
), # noqa: E501
}
# yapf: enable
_VLLM_MODELS = {
**_TEXT_GENERATION_MODELS,

View File

@ -8,13 +8,10 @@ from transformers import SmolVLMProcessor
from vllm.config import VllmConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
# yapf: disable
from .idefics3 import Idefics3DummyInputsBuilder as SmolVLMDummyInputsBuilder
from .idefics3 import Idefics3ForConditionalGeneration, Idefics3ProcessingInfo
from .idefics3 import Idefics3MultiModalProcessor as SmolVLMMultiModalProcessor
# yapf: enable
class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:

View File

@ -32,11 +32,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models import SupportsPP
from vllm.model_executor.models.module_mapping import MultiModelKeys
# yapf: disable
from vllm.model_executor.models.whisper import WhisperEncoder
# yapf: enable
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,

View File

@ -28,7 +28,6 @@ def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optiona
return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
# yapf: disable
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
"blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
"clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
@ -39,7 +38,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
"paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
"qwen": _get_qwen_chat_template_fallback,
}
# yapf: enable
def register_chat_template_fallback_path(

View File

@ -1,12 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501
# coding=utf-8
# Copied from
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
""" Arctic model configuration"""
"""Arctic model configuration"""
from dataclasses import asdict, dataclass
from typing import Any

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501
# Adapted from
# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
@ -16,7 +15,7 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module
class Nemotron_Nano_VL_Config(PretrainedConfig):
model_type = 'Llama_Nemotron_Nano_VL'
model_type = "Llama_Nemotron_Nano_VL"
is_composition = True
def __init__(
@ -26,17 +25,22 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
force_image_size=None,
downsample_ratio=0.5,
template=None,
ps_version='v1',
ps_version="v1",
image_tag_type="internvl",
projector_hidden_size=4096,
vit_hidden_size=1280,
**kwargs
**kwargs,
):
super().__init__(**kwargs)
if vision_config is not None:
assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
assert (
"auto_map" in vision_config
and "AutoConfig" in vision_config["auto_map"]
)
vision_auto_config = get_class_from_dynamic_module(
*vision_config["auto_map"]["AutoConfig"].split("--")[::-1]
)
self.vision_config = vision_auto_config(**vision_config)
else:
self.vision_config = PretrainedConfig()
@ -51,6 +55,6 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
self.downsample_ratio = downsample_ratio
self.template = template # TODO move out of here and into the tokenizer
self.ps_version = ps_version # Pixel shuffle version
self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
self.projector_hidden_size = projector_hidden_size
self.vit_hidden_size = vit_hidden_size

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
@ -70,34 +69,37 @@ class AIMv2Config(PretrainedConfig):
# Visual Tokenizer Configuration
# ----------------------------------------------------------------------
class BaseVisualTokenizerConfig(PretrainedConfig):
def __init__(self,
vocab_size=16384,
tokenize_function="softmax",
tau=1.0,
depths=None,
drop_cls_token=False,
backbone_config: Optional[Union[PretrainedConfig,
dict]] = None,
hidden_stride: int = 1,
**kwargs):
def __init__(
self,
vocab_size=16384,
tokenize_function="softmax",
tau=1.0,
depths=None,
drop_cls_token=False,
backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
hidden_stride: int = 1,
**kwargs,
):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.tokenize_function = tokenize_function
self.tau = tau
if isinstance(depths, str):
depths = [int(x) for x in depths.split('|')]
depths = [int(x) for x in depths.split("|")]
self.depths = depths
self.backbone_kwargs = dict[str, Any]()
self.drop_cls_token = drop_cls_token
if backbone_config is not None:
assert isinstance(backbone_config, (PretrainedConfig, dict)), \
assert isinstance(backbone_config, (PretrainedConfig, dict)), (
f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
)
if not isinstance(backbone_config, PretrainedConfig):
model_type = backbone_config['model_type']
model_type = backbone_config["model_type"]
if model_type != "aimv2":
backbone_config.pop('model_type')
backbone_config = AutoConfig.for_model(model_type, **backbone_config)
backbone_config.pop("model_type")
backbone_config = AutoConfig.for_model(
model_type, **backbone_config
)
else:
backbone_config = AIMv2Config(**backbone_config)
self.backbone_config = backbone_config
@ -113,7 +115,7 @@ class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
self.drop_cls_token = False
if self.depths:
assert len(self.depths) == 1
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
@ -125,7 +127,7 @@ class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
self.drop_cls_token = False
if self.depths:
assert len(self.depths) == 1
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
@ -138,35 +140,39 @@ AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
class OvisConfig(PretrainedConfig):
model_type = "ovis"
def __init__(self,
llm_config: Optional[Union[PretrainedConfig, dict]] = None,
visual_tokenizer_config: Optional[Union[PretrainedConfig,
dict]] = None,
multimodal_max_length=8192,
hidden_size=None,
conversation_formatter_class=None,
llm_attn_implementation=None,
disable_tie_weight=False,
**kwargs):
def __init__(
self,
llm_config: Optional[Union[PretrainedConfig, dict]] = None,
visual_tokenizer_config: Optional[Union[PretrainedConfig, dict]] = None,
multimodal_max_length=8192,
hidden_size=None,
conversation_formatter_class=None,
llm_attn_implementation=None,
disable_tie_weight=False,
**kwargs,
):
super().__init__(**kwargs)
if llm_config is not None:
assert isinstance(llm_config, (PretrainedConfig, dict)), \
assert isinstance(llm_config, (PretrainedConfig, dict)), (
f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
)
if not isinstance(llm_config, PretrainedConfig):
model_type = llm_config['model_type']
llm_config.pop('model_type')
model_type = llm_config["model_type"]
llm_config.pop("model_type")
llm_config = AutoConfig.for_model(model_type, **llm_config)
# map llm_config to text_config
self.text_config = llm_config
if visual_tokenizer_config is not None:
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
)
if not isinstance(visual_tokenizer_config, PretrainedConfig):
model_type = visual_tokenizer_config['model_type']
visual_tokenizer_config.pop('model_type')
model_type = visual_tokenizer_config["model_type"]
visual_tokenizer_config.pop("model_type")
visual_tokenizer_config = AutoConfig.for_model(
model_type, **visual_tokenizer_config)
model_type, **visual_tokenizer_config
)
self.visual_tokenizer_config = visual_tokenizer_config
self.multimodal_max_length = multimodal_max_length

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
@ -35,11 +34,12 @@ from transformers.processing_utils import ProcessorMixin
class ImageTransform:
def __init__(self,
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
normalize: bool = True):
def __init__(
self,
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
normalize: bool = True,
):
self.mean = mean
self.std = std
self.normalize = normalize
@ -77,7 +77,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
ignore_id: int = -100,
**kwargs,
):
self.candidate_resolutions = candidate_resolutions
self.image_size = candidate_resolutions[0][0]
self.patch_size = patch_size
@ -86,13 +85,15 @@ class DeepseekVLV2Processor(ProcessorMixin):
self.normalize = normalize
self.downsample_ratio = downsample_ratio
self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
self.image_transform = ImageTransform(
mean=image_mean, std=image_std, normalize=normalize
)
self.tokenizer = tokenizer
self.tokenizer.padding_side = 'left' # must set thispadding side with make a difference in batch inference
self.tokenizer.padding_side = "left" # must set thispadding side with make a difference in batch inference
# add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
if tokenizer.pad_token is None:
self.tokenizer.add_special_tokens({'pad_token': pad_token})
self.tokenizer.add_special_tokens({"pad_token": pad_token})
# add image token
image_token_id = self.tokenizer.vocab.get(image_token)
@ -104,7 +105,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
# add five special tokens for grounding-related tasks
# <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
special_tokens_dict = {"additional_special_tokens": special_tokens}
self.tokenizer.add_special_tokens(special_tokens_dict)
@ -134,15 +135,19 @@ class DeepseekVLV2Processor(ProcessorMixin):
for width, height in self.candidate_resolutions:
scale = min(width / original_width, height / original_height)
downscaled_width, downscaled_height = int(
original_width * scale), int(original_height * scale)
effective_resolution = min(downscaled_width * downscaled_height,
original_width * original_height)
downscaled_width, downscaled_height = (
int(original_width * scale),
int(original_height * scale),
)
effective_resolution = min(
downscaled_width * downscaled_height, original_width * original_height
)
wasted_resolution = (width * height) - effective_resolution
if effective_resolution > max_effective_resolution or (
effective_resolution == max_effective_resolution
and wasted_resolution < min_wasted_resolution):
effective_resolution == max_effective_resolution
and wasted_resolution < min_wasted_resolution
):
max_effective_resolution = effective_resolution
min_wasted_resolution = wasted_resolution
best_fit = (width, height)
@ -198,12 +203,20 @@ class DeepseekVLV2Processor(ProcessorMixin):
- num_image_tokens (list[int]): the number of image tokens
"""
assert (prompt is not None and images is not None
), "prompt and images must be used at the same time."
assert prompt is not None and images is not None, (
"prompt and images must be used at the same time."
)
sft_format = prompt
tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
(
tokenized_str,
images_list,
images_seq_mask,
images_spatial_crop,
num_image_tokens,
) = self.tokenize_with_images(
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
)
masked_tokenized_str = []
for token_index in tokenized_str:
if token_index != self.image_token_id:
@ -211,17 +224,21 @@ class DeepseekVLV2Processor(ProcessorMixin):
else:
masked_tokenized_str.append(self.ignore_id)
assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
(f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
assert (
len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
), (
f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
)
input_ids = torch.LongTensor(tokenized_str)
target_ids = torch.LongTensor(masked_tokenized_str)
images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
# set input_ids < 0 | input_ids == self.image_token_id as ignore_id
target_ids[(input_ids < 0) |
(input_ids == self.image_token_id)] = self.ignore_id
target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
self.ignore_id
)
input_ids[input_ids < 0] = self.pad_id
if inference_mode:
@ -311,30 +328,50 @@ class DeepseekVLV2Processor(ProcessorMixin):
best_width, best_height = self.image_size, self.image_size
"""process the global view"""
global_view = ImageOps.pad(image, (self.image_size, self.image_size),
color=tuple(int(x * 255) for x in self.image_transform.mean))
global_view = ImageOps.pad(
image,
(self.image_size, self.image_size),
color=tuple(int(x * 255) for x in self.image_transform.mean),
)
images_list.append(self.image_transform(global_view))
"""process the local views"""
local_view = ImageOps.pad(image, (best_width, best_height),
color=tuple(int(x * 255) for x in self.image_transform.mean))
local_view = ImageOps.pad(
image,
(best_width, best_height),
color=tuple(int(x * 255) for x in self.image_transform.mean),
)
for i in range(0, best_height, self.image_size):
for j in range(0, best_width, self.image_size):
images_list.append(
self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
self.image_transform(
local_view.crop(
(j, i, j + self.image_size, i + self.image_size)
)
)
)
"""record height / width crop num"""
num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
num_width_tiles, num_height_tiles = (
best_width // self.image_size,
best_height // self.image_size,
)
images_spatial_crop.append([num_width_tiles, num_height_tiles])
"""add image tokens"""
h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
h = w = math.ceil(
(self.image_size // self.patch_size) / self.downsample_ratio
)
# global views tokens h * (w + 1), 1 is for line separator
tokenized_image = [self.image_token_id] * h * (w + 1)
# add a separator between global and local views
tokenized_image += [self.image_token_id]
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
tokenized_image += (
[self.image_token_id]
* (num_height_tiles * h)
* (num_width_tiles * w + 1)
)
tokenized_str += tokenized_image
images_seq_mask += [True] * len(tokenized_image)
@ -353,10 +390,17 @@ class DeepseekVLV2Processor(ProcessorMixin):
tokenized_str = tokenized_str + [self.eos_id]
images_seq_mask = images_seq_mask + [False]
assert len(tokenized_str) == len(
images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
assert len(tokenized_str) == len(images_seq_mask), (
f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
)
return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
return (
tokenized_str,
images_list,
images_seq_mask,
images_spatial_crop,
num_image_tokens,
)
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
@ -35,23 +34,24 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from vllm.multimodal.image import convert_image_mode
__all__ = ['OvisProcessor']
__all__ = ["OvisProcessor"]
IGNORE_ID = -100
class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
_defaults = {
"text_kwargs": {
"padding": False,
},
"images_kwargs": {
'max_partition':9,
'covering_threshold':0.9,
'convert_to_rgb':True,
'return_tensors':'pt'},
"max_partition": 9,
"covering_threshold": 0.9,
"convert_to_rgb": True,
"return_tensors": "pt",
},
}
class OvisProcessor(ProcessorMixin):
r"""
Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
@ -97,14 +97,16 @@ class OvisProcessor(ProcessorMixin):
"image_col_sep": -303,
"image_row_sep": -304,
"image_end": -305,
'image_pad': image_pad_token_id,
"image_pad": image_pad_token_id,
}
return extra_special_tokens
def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
text: Union[
TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
] = None,
**kwargs: Unpack[OvisProcessorKwargs],
) -> BatchFeature:
"""
@ -169,7 +171,6 @@ class OvisProcessor(ProcessorMixin):
# Process text input
if text is not None:
if not isinstance(text, list):
text = [text]
@ -178,7 +179,10 @@ class OvisProcessor(ProcessorMixin):
replaced_ids_list = []
idx = 0
for ids_tensor in tokenized_batched_text:
if image_token_id in ids_tensor and "image_placeholders" in image_features:
if (
image_token_id in ids_tensor
and "image_placeholders" in image_features
):
if idx < len(image_features["image_placeholders"]):
# Converts in list for ease of use
ids_list = ids_tensor.tolist()
@ -188,7 +192,9 @@ class OvisProcessor(ProcessorMixin):
# replace placeholders
for i, token_id in enumerate(ids_list):
if token_id == image_token_id:
placeholder_ids = image_features["image_placeholders"][idx]
placeholder_ids = image_features["image_placeholders"][
idx
]
new_ids.extend(placeholder_ids)
idx += 1
else:
@ -198,7 +204,8 @@ class OvisProcessor(ProcessorMixin):
ids_tensor = torch.tensor(new_ids, dtype=torch.long)
else:
raise RuntimeError(
'Mismatch between the images you provided and the number of placeholder present in the text')
"Mismatch between the images you provided and the number of placeholder present in the text"
)
replaced_ids_list.append(ids_tensor)
@ -217,7 +224,7 @@ class OvisProcessor(ProcessorMixin):
# Add image features if present
if image_features:
output["pixel_values"] = processed_images
output['grids'] = grids
output["grids"] = grids
return output
@ -227,8 +234,10 @@ class OvisProcessor(ProcessorMixin):
def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
batch_token_ids = []
for text in text_list:
text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
text.split(self.image_token)]
text_chunks = [
self.tokenizer(chunk, add_special_tokens=False).input_ids
for chunk in text.split(self.image_token)
]
token_ids = []
num_chuck = len(text_chunks)
for i, chunk in enumerate(text_chunks):
@ -240,50 +249,60 @@ class OvisProcessor(ProcessorMixin):
def get_image_size(self):
size = self.image_processor.size
if 'shortest_edge' in size:
width = height = size['shortest_edge']
if "shortest_edge" in size:
width = height = size["shortest_edge"]
elif "height" in size and "width" in size:
width = size['width']
height = size['height']
width = size["width"]
height = size["height"]
else:
raise ValueError( "Can't parse image size from image_processor config.")
raise ValueError("Can't parse image size from image_processor config.")
return height, width
def get_token_value(self, tok):
return self.extra_special_tokens[tok]
def construct_image_indicators(self, grid):
image_placeholders = [self.get_token_value('image_start'),
self.get_token_value('image_atom'),
self.get_token_value('image_prefix')]
image_placeholders = [
self.get_token_value("image_start"),
self.get_token_value("image_atom"),
self.get_token_value("image_prefix"),
]
if grid[0] * grid[1] > 1:
for r in range(grid[0]):
for c in range(grid[1]):
image_placeholders.append(self.get_token_value('image_atom') )
image_placeholders.append(self.get_token_value("image_atom"))
if c < grid[1] - 1:
image_placeholders.append(self.get_token_value('image_col_sep'))
image_placeholders.append(self.get_token_value("image_col_sep"))
if r < grid[0] - 1:
image_placeholders.append(self.get_token_value('image_row_sep'))
image_placeholders.append(self.get_token_value('image_end'))
image_placeholders.append(self.get_token_value("image_row_sep"))
image_placeholders.append(self.get_token_value("image_end"))
return image_placeholders
def construct_image_placeholders(self, grid):
image_placeholders = self.construct_image_indicators(grid)
image_atom_token_id = self.get_token_value('image_atom')
image_atom_token_id = self.get_token_value("image_atom")
# Extract the padding token ID from tokenizer
image_padding_token_id = self.get_token_value('image_pad')
image_padding_token_id = self.get_token_value("image_pad")
# Create a new list with padding tokens inserted
padded_placeholder_tokens = []
for token in image_placeholders:
padded_placeholder_tokens.append(image_padding_token_id)
if token == image_atom_token_id:
padded_placeholder_tokens.extend([image_padding_token_id] * self.image_segment_len)
padded_placeholder_tokens.extend(
[image_padding_token_id] * self.image_segment_len
)
return padded_placeholder_tokens
def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
def preprocess_image(
self,
image: PIL.Image.Image,
max_partition,
covering_threshold,
convert_to_rgb,
return_tensors,
):
def _preprocess(img: PIL.Image.Image, side):
# first resize and preprocess
w, h = img.size
@ -296,19 +315,27 @@ class OvisProcessor(ProcessorMixin):
new_height = side
new_width = int(w / h * new_height)
new_size = dict(height=new_height, width=new_width)
pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
pixel_values = self.image_processor.preprocess(
img, size=new_size, return_tensors=return_tensors
)["pixel_values"]
# then pad to square
square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
square_values = torch.zeros(
[1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
)
new_height, new_width = pixel_values.shape[2:]
if new_height == new_width:
square_values[:, :, :, :] = pixel_values
elif new_height > new_width:
from_index = (side - new_width) // 2
square_values[:, :, :, from_index:from_index + new_width] = pixel_values
square_values[:, :, :, from_index : from_index + new_width] = (
pixel_values
)
else:
from_index = (side - new_height) // 2
square_values[:, :, from_index:from_index + new_height, :] = pixel_values
square_values[:, :, from_index : from_index + new_height, :] = (
pixel_values
)
return square_values
@ -350,7 +377,9 @@ class OvisProcessor(ProcessorMixin):
good_grids = []
for grid in candidate_grids:
partition = _partition(img, grid)
covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
covering_ratio = (
sum([_covering_area(*p, side) for p in partition]) / img_area
)
assert covering_ratio <= 1.0
all_grids.append((grid, covering_ratio))
if covering_ratio > covering_threshold:
@ -358,18 +387,19 @@ class OvisProcessor(ProcessorMixin):
if len(good_grids) > 0:
# pick the good partition with minimum #sub_images and break the tie using covering_ratio
return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
0
]
else:
# pick the partition with maximum covering_ratio and break the tie using #sub_images
return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
if convert_to_rgb:
image = convert_image_mode(image, 'RGB')
image = convert_image_mode(image, "RGB")
sides = self.get_image_size()
if sides[0] != sides[1]:
raise ValueError('get_image_size() returns non-square size')
raise ValueError("get_image_size() returns non-square size")
side = sides[0]
grid = _get_best_grid(image, side)
partition = _partition(image, grid)
@ -405,14 +435,18 @@ class OvisProcessor(ProcessorMixin):
`list[str]`: The decoded text.
"""
return self.tokenizer.batch_decode(
generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
generated_outputs,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
names_from_processor = list(
dict.fromkeys(tokenizer_input_names + image_processor_input_names)
)
return names_from_processor + ["second_per_grid_ts"]

View File

@ -40,9 +40,6 @@ from vllm.utils.flashinfer import (
supports_trtllm_attention,
use_trtllm_attention,
)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.v1.attention.backends.utils import (
AttentionCGSupport,
AttentionMetadataBuilder,
@ -52,8 +49,6 @@ from vllm.v1.attention.backends.utils import (
infer_global_hyperparameters,
split_decodes_and_prefills,
)
# yapf: enable
from vllm.v1.kv_cache_interface import AttentionSpec
FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024

View File

@ -11,9 +11,6 @@ from vllm.attention.backends.abstract import AttentionLayer
from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
from vllm.config import VllmConfig
from vllm.utils import cdiv
# yapf conflicts with isort for this docstring
# yapf: disable
from vllm.v1.attention.backends.mla.common import (
MLACommonBackend,
MLACommonDecodeMetadata,
@ -24,8 +21,6 @@ from vllm.v1.attention.backends.mla.common import (
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import AttentionSpec
# yapf: enable
def is_aiter_mla_enabled() -> bool:
return envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MLA

View File

@ -18,8 +18,6 @@ from msgspec import msgpack
from vllm import envs
from vllm.logger import init_logger
# yapf: disable
from vllm.multimodal.inputs import (
BaseMultiModalField,
MultiModalBatchedField,
@ -32,8 +30,6 @@ from vllm.multimodal.inputs import (
MultiModalSharedField,
NestedTensors,
)
# yapf: enable
from vllm.v1.engine import UtilityResult
logger = init_logger(__name__)

View File

@ -48,9 +48,6 @@ from vllm.model_executor.layers.mamba.abstract import MambaBase
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.models.interfaces import (
SupportsMultiModal,
is_mixture_of_experts,
@ -59,8 +56,6 @@ from vllm.model_executor.models.interfaces import (
supports_multimodal_pruning,
supports_transcription,
)
# yapf: enable
from vllm.model_executor.models.interfaces_base import (
VllmModelForPooling,
is_pooling_model,
@ -101,9 +96,6 @@ from vllm.v1.attention.backends.utils import (
split_attn_metadata,
)
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
# yapf conflicts with isort for this block
# yapf: disable
from vllm.v1.kv_cache_interface import (
AttentionSpec,
ChunkedLocalAttentionSpec,
@ -118,8 +110,6 @@ from vllm.v1.kv_cache_interface import (
SlidingWindowSpec,
UniformTypeKVCacheSpecs,
)
# yapf: enable
from vllm.v1.outputs import (
EMPTY_MODEL_RUNNER_OUTPUT,
AsyncModelRunnerOutput,