Remove all references to yapf as it's no longer used (#26251)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor 2025-10-05 17:18:11 +01:00 committed by GitHub
parent d6953beb91
commit 4e256cadc2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
78 changed files with 1992 additions and 1717 deletions
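The yapf references being removed are the formatter guard comments that bracketed code where yapf and isort disagreed. A minimal sketch of the pattern as it existed before this commit, assembled from the hunks below rather than copied from any single file:

# yapf conflicts with isort for this block
# yapf: disable
from vllm_cutlass_library_extension import (
    DataType,
    EpilogueScheduleTag,
    VLLMKernelScheduleTag,
)
# yapf: enable

With yapf dropped, the guard comments are deleted outright and the previously guarded code is rewritten in the formatting style used elsewhere in the repository (double quotes, trailing commas, one argument per line), which is what most of the reformatting in the hunks below amounts to.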

View File

@@ -12,9 +12,6 @@ from functools import reduce
 from typing import Optional, Union
 import jinja2
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm_cutlass_library_extension import (
     DataType,
     EpilogueScheduleTag,
@@ -31,8 +28,6 @@ from vllm_cutlass_library_extension import (
     VLLMKernelScheduleTag,
 )
-# yapf: enable
 #
 # Generator templating
 #

View File

@@ -21,8 +21,6 @@ from vllm.utils import FlexibleArgumentParser
 logger = logging.getLogger()
-# yapf conflicts with isort for this docstring
-# yapf: disable
 """
 tensorize_vllm_model.py is a script that can be used to serialize and
 deserialize vLLM models. These models can be loaded using tensorizer
@@ -132,7 +130,8 @@ def get_parser():
         "can be loaded using tensorizer directly to the GPU "
         "extremely quickly. Tensor encryption and decryption is "
         "also supported, although libsodium must be installed to "
-        "use it.")
+        "use it."
+    )
     parser = EngineArgs.add_cli_args(parser)
     parser.add_argument(
@@ -144,13 +143,14 @@ def get_parser():
         "along with the model by instantiating a TensorizerConfig object, "
         "creating a dict from it with TensorizerConfig.to_serializable(), "
         "and passing it to LoRARequest's initializer with the kwarg "
-        "tensorizer_config_dict."
+        "tensorizer_config_dict.",
     )
-    subparsers = parser.add_subparsers(dest='command', required=True)
+    subparsers = parser.add_subparsers(dest="command", required=True)
     serialize_parser = subparsers.add_parser(
-        'serialize', help="Serialize a model to `--serialized-directory`")
+        "serialize", help="Serialize a model to `--serialized-directory`"
+    )
     serialize_parser.add_argument(
         "--suffix",
@@ -163,7 +163,9 @@ def get_parser():
             "`--suffix` is `v1`, the serialized model tensors will be "
             "saved to "
             "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
-            "If none is provided, a random UUID will be used."))
+            "If none is provided, a random UUID will be used."
+        ),
+    )
     serialize_parser.add_argument(
         "--serialized-directory",
         type=str,
@@ -175,108 +177,127 @@ def get_parser():
         "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
         "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
         "where `suffix` is given by `--suffix` or a random UUID if not "
-        "provided.")
+        "provided.",
+    )
     serialize_parser.add_argument(
         "--serialization-kwargs",
         type=tensorizer_kwargs_arg,
         required=False,
-        help=("A JSON string containing additional keyword arguments to "
-              "pass to Tensorizer's TensorSerializer during "
-              "serialization."))
+        help=(
+            "A JSON string containing additional keyword arguments to "
+            "pass to Tensorizer's TensorSerializer during "
+            "serialization."
+        ),
+    )
     serialize_parser.add_argument(
         "--keyfile",
         type=str,
         required=False,
-        help=("Encrypt the model weights with a randomly-generated binary key,"
-              " and save the key at this path"))
+        help=(
+            "Encrypt the model weights with a randomly-generated binary key,"
+            " and save the key at this path"
+        ),
+    )
     deserialize_parser = subparsers.add_parser(
-        'deserialize',
-        help=("Deserialize a model from `--path-to-tensors`"
-              " to verify it can be loaded and used."))
+        "deserialize",
+        help=(
+            "Deserialize a model from `--path-to-tensors`"
+            " to verify it can be loaded and used."
+        ),
+    )
     deserialize_parser.add_argument(
         "--path-to-tensors",
         type=str,
         required=False,
-        help="The local path or S3 URI to the model tensors to deserialize. ")
+        help="The local path or S3 URI to the model tensors to deserialize. ",
+    )
     deserialize_parser.add_argument(
         "--serialized-directory",
         type=str,
         required=False,
         help="Directory with model artifacts for loading. Assumes a "
         "model.tensors file exists therein. Can supersede "
-        "--path-to-tensors.")
+        "--path-to-tensors.",
+    )
     deserialize_parser.add_argument(
         "--keyfile",
         type=str,
         required=False,
-        help=("Path to a binary key to use to decrypt the model weights,"
-              " if the model was serialized with encryption"))
+        help=(
+            "Path to a binary key to use to decrypt the model weights,"
+            " if the model was serialized with encryption"
+        ),
+    )
     deserialize_parser.add_argument(
         "--deserialization-kwargs",
         type=tensorizer_kwargs_arg,
         required=False,
-        help=("A JSON string containing additional keyword arguments to "
-              "pass to Tensorizer's `TensorDeserializer` during "
-              "deserialization."))
+        help=(
+            "A JSON string containing additional keyword arguments to "
+            "pass to Tensorizer's `TensorDeserializer` during "
+            "deserialization."
+        ),
+    )
     TensorizerArgs.add_cli_args(deserialize_parser)
     return parser
-def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
-                                              cfg: TensorizerConfig):
+def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
     for k, v in extra_cfg.items():
         if hasattr(cfg, k):
             setattr(cfg, k, v)
             logger.info(
                 "Updating TensorizerConfig with %s from "
-                "--model-loader-extra-config provided", k
+                "--model-loader-extra-config provided",
+                k,
             )
 def deserialize(args, tensorizer_config):
     if args.lora_path:
         tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
-        llm = LLM(model=args.model,
-                  load_format="tensorizer",
-                  tensor_parallel_size=args.tensor_parallel_size,
-                  model_loader_extra_config=tensorizer_config,
-                  enable_lora=True,
+        llm = LLM(
+            model=args.model,
+            load_format="tensorizer",
+            tensor_parallel_size=args.tensor_parallel_size,
+            model_loader_extra_config=tensorizer_config,
+            enable_lora=True,
         )
         sampling_params = SamplingParams(
-            temperature=0,
-            max_tokens=256,
-            stop=["[/assistant]"]
+            temperature=0, max_tokens=256, stop=["[/assistant]"]
         )
         # Truncating this as the extra text isn't necessary
-        prompts = [
-            "[user] Write a SQL query to answer the question based on ..."
-        ]
+        prompts = ["[user] Write a SQL query to answer the question based on ..."]
         # Test LoRA load
         print(
             llm.generate(
                 prompts,
                 sampling_params,
-                lora_request=LoRARequest("sql-lora",
-                                         1,
-                                         args.lora_path,
-                                         tensorizer_config_dict=tensorizer_config
-                                         .to_serializable())
+                lora_request=LoRARequest(
+                    "sql-lora",
+                    1,
+                    args.lora_path,
+                    tensorizer_config_dict=tensorizer_config.to_serializable(),
+                ),
             )
         )
     else:
-        llm = LLM(model=args.model,
-                  load_format="tensorizer",
-                  tensor_parallel_size=args.tensor_parallel_size,
-                  model_loader_extra_config=tensorizer_config
+        llm = LLM(
+            model=args.model,
+            load_format="tensorizer",
+            tensor_parallel_size=args.tensor_parallel_size,
+            model_loader_extra_config=tensorizer_config,
         )
     return llm
@@ -285,17 +306,20 @@ def main():
     parser = get_parser()
     args = parser.parse_args()
-    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
-                        or os.environ.get("S3_ACCESS_KEY_ID", None))
-    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
-                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
-    s3_endpoint = (getattr(args, 's3_endpoint', None)
-                   or os.environ.get("S3_ENDPOINT_URL", None))
+    s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
+        "S3_ACCESS_KEY_ID", None
+    )
+    s3_secret_access_key = getattr(
+        args, "s3_secret_access_key", None
+    ) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
+    s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
+        "S3_ENDPOINT_URL", None
+    )
     credentials = {
         "s3_access_key_id": s3_access_key_id,
         "s3_secret_access_key": s3_secret_access_key,
-        "s3_endpoint": s3_endpoint
+        "s3_endpoint": s3_endpoint,
     }
     model_ref = args.model
@@ -309,25 +333,25 @@ def main():
     if args.model_loader_extra_config:
         extra_config = json.loads(args.model_loader_extra_config)
-    tensorizer_dir = (args.serialized_directory or
-                      extra_config.get("tensorizer_dir"))
-    tensorizer_uri = (getattr(args, "path_to_tensors", None)
-                      or extra_config.get("tensorizer_uri"))
+    tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
+    tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
+        "tensorizer_uri"
+    )
     if tensorizer_dir and tensorizer_uri:
-        parser.error("--serialized-directory and --path-to-tensors "
-                     "cannot both be provided")
+        parser.error(
+            "--serialized-directory and --path-to-tensors cannot both be provided"
+        )
     if not tensorizer_dir and not tensorizer_uri:
-        parser.error("Either --serialized-directory or --path-to-tensors "
-                     "must be provided")
+        parser.error(
+            "Either --serialized-directory or --path-to-tensors must be provided"
+        )
     if args.command == "serialize":
         engine_args = EngineArgs.from_cli_args(args)
-        input_dir = tensorizer_dir.rstrip('/')
+        input_dir = tensorizer_dir.rstrip("/")
         suffix = args.suffix if args.suffix else uuid.uuid4().hex
         base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
         if engine_args.tensor_parallel_size > 1:
@@ -339,15 +363,14 @@ def main():
             tensorizer_uri=model_path,
             encryption_keyfile=keyfile,
             serialization_kwargs=args.serialization_kwargs or {},
-            **credentials
+            **credentials,
         )
         if args.lora_path:
             tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
             tensorize_lora_adapter(args.lora_path, tensorizer_config)
-        merge_extra_config_with_tensorizer_config(extra_config,
-                                                  tensorizer_config)
+        merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
         tensorize_vllm_model(engine_args, tensorizer_config)
     elif args.command == "deserialize":
@@ -356,11 +379,10 @@ def main():
             tensorizer_dir=args.serialized_directory,
             encryption_keyfile=keyfile,
             deserialization_kwargs=args.deserialization_kwargs or {},
-            **credentials
+            **credentials,
         )
-        merge_extra_config_with_tensorizer_config(extra_config,
-                                                  tensorizer_config)
+        merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
         deserialize(args, tensorizer_config)
     else:
         raise ValueError("Either serialize or deserialize must be specified.")
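For readers unfamiliar with the script touched above: the deserialize path it exercises is the same one available to library users. A minimal sketch using only names that appear in this diff; the model ID and tensors URI below are placeholder examples, not values prescribed by the commit:

from vllm import LLM
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

# Placeholder locations taken from the help text above; substitute your own.
tensorizer_config = TensorizerConfig(
    tensorizer_uri="s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors",
)
llm = LLM(
    model="EleutherAI/gpt-j-6B",
    load_format="tensorizer",
    model_loader_extra_config=tensorizer_config,
)

This mirrors the deserialize() helper shown in the hunk above; encryption keyfiles and LoRA loading follow the same pattern via the extra keyword arguments listed there.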

View File

@@ -8,16 +8,11 @@ import torch
 import vllm.envs as envs
 from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.compilation.activation_quant_fusion import (
     FUSED_OPS,
     SILU_MUL_OP,
     ActivationQuantFusionPass,
 )
-# yapf: enable
 from vllm.compilation.fusion import QUANT_OPS
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass

View File

@@ -107,10 +107,8 @@ class EPTestSettings:
 # NOTE: You can adjust tp_base locally to fit the model in GPU
 # The values displayed here are only a rough indicator of the size of the model
-# yapf: disable
 TEST_MODELS = {
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(
-        trust_remote_code=True),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(trust_remote_code=True),
     "mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4),
 }
@@ -192,22 +190,24 @@ def _compare_tp(
     ]
     try:
-        compare_two_settings(model_name,
-                             ep_args,
-                             tp_args,
-                             ep_env,
-                             tp_env,
-                             method=method,
-                             max_wait_seconds=360)
+        compare_two_settings(
+            model_name,
+            ep_args,
+            tp_args,
+            ep_env,
+            tp_env,
+            method=method,
+            max_wait_seconds=360,
+        )
     except Exception:
         raise
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_name", "parallel_setup", "distributed_backend", "runner", "test_options"),
     [
-        params for model_name, settings in TEST_MODELS.items()
+        params
+        for model_name, settings in TEST_MODELS.items()
         for params in settings.iter_params(model_name)
     ],
 )
@@ -220,10 +220,12 @@ def test_ep(
     test_options: EPTestOptions,
     num_gpus_available,
 ):
-    _compare_tp(model_name,
-                parallel_setup,
-                distributed_backend,
-                runner,
-                test_options,
-                num_gpus_available,
-                method="generate")
+    _compare_tp(
+        model_name,
+        parallel_setup,
+        distributed_backend,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="generate",
+    )

View File

@@ -100,7 +100,6 @@ class PPTestSettings:
 # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
 # The values displayed here are only a rough indicator of the size of the model
-# yapf: disable
 TEXT_GENERATION_MODELS = {
     # [Decoder-only]
     # Uses Llama
@@ -150,7 +149,9 @@ TEXT_GENERATION_MODELS = {
     "adept/persimmon-8b-chat": PPTestSettings.fast(),
     "microsoft/phi-2": PPTestSettings.fast(),
     "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
-    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"),  # noqa: E501
+    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
+        multi_node_only=True, load_format="dummy"
+    ),  # noqa: E501
     "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
     "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
     "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
@@ -196,7 +197,6 @@ MULTIMODAL_MODELS = {
     "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
     "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
 }
-# yapf: enable
 # NOTE: You can update this on your local machine to run specific tests
 TEST_MODELS = [

View File

@@ -287,29 +287,15 @@ def test_prefix_cache_default():
     assert not engine_args.enable_prefix_caching
-# yapf: disable
-@pytest.mark.parametrize(("arg", "expected", "option"), [
-    (None, None, "mm-processor-kwargs"),
-    ("{}", {}, "mm-processor-kwargs"),
-    (
-        '{"num_crops": 4}',
-        {
-            "num_crops": 4
-        },
-        "mm-processor-kwargs"
-    ),
-    (
-        '{"foo": {"bar": "baz"}}',
-        {
-            "foo":
-            {
-                "bar": "baz"
-            }
-        },
-        "mm-processor-kwargs"
-    ),
-])
-# yapf: enable
+@pytest.mark.parametrize(
+    ("arg", "expected", "option"),
+    [
+        (None, None, "mm-processor-kwargs"),
+        ("{}", {}, "mm-processor-kwargs"),
+        ('{"num_crops": 4}', {"num_crops": 4}, "mm-processor-kwargs"),
+        ('{"foo": {"bar": "baz"}}', {"foo": {"bar": "baz"}}, "mm-processor-kwargs"),
+    ],
+)
 def test_composite_arg_parser(arg, expected, option):
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
     if arg is None:
@@ -321,8 +307,7 @@ def test_composite_arg_parser(arg, expected, option):
 def test_human_readable_model_len():
     # `exit_on_error` disabled to test invalid values below
-    parser = EngineArgs.add_cli_args(
-        FlexibleArgumentParser(exit_on_error=False))
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser(exit_on_error=False))
     args = parser.parse_args([])
     assert args.max_model_len is None

View File

@@ -15,6 +15,7 @@ from vllm.assets.video import VideoAsset
 from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import (
     _try_extract_ast,
+    apply_mistral_chat_template,
     load_chat_template,
     parse_chat_messages,
     parse_chat_messages_futures,
@@ -1855,17 +1856,17 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
 # NOTE: Qwen2-Audio default chat template is specially defined inside
 # processor class instead of using `tokenizer_config.json`
-# yapf: disable
 @pytest.mark.parametrize(
     ("model", "expected_format"),
-    [(PHI3V_MODEL_ID, "string"),
-     (QWEN2VL_MODEL_ID, "openai"),
-     (QWEN25VL_MODEL_ID, "openai"),
-     (ULTRAVOX_MODEL_ID, "string"),
-     (QWEN2AUDIO_MODEL_ID, "openai"),
-     (LLAMA_GUARD_MODEL_ID, "openai")],
+    [
+        (PHI3V_MODEL_ID, "string"),
+        (QWEN2VL_MODEL_ID, "openai"),
+        (QWEN25VL_MODEL_ID, "openai"),
+        (ULTRAVOX_MODEL_ID, "string"),
+        (QWEN2AUDIO_MODEL_ID, "openai"),
+        (LLAMA_GUARD_MODEL_ID, "openai"),
+    ],
 )
-# yapf: enable
 def test_resolve_content_format_hf_defined(model, expected_format):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
@@ -1879,7 +1880,8 @@ def test_resolve_content_format_hf_defined(model, expected_format):
         hf_overrides=model_info.hf_overrides,
         skip_tokenizer_init=model_info.skip_tokenizer_init,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )
     tokenizer = get_tokenizer(
         model,
@@ -1911,18 +1913,18 @@
     assert resolved_format == expected_format
-# yapf: disable
 @pytest.mark.parametrize(
     ("model", "expected_format"),
-    [("Salesforce/blip2-opt-2.7b", "string"),
-     ("facebook/chameleon-7b", "string"),
-     ("deepseek-ai/deepseek-vl2-tiny", "string"),
-     ("adept/fuyu-8b", "string"),
-     ("google/paligemma-3b-mix-224", "string"),
-     ("Qwen/Qwen-VL", "string"),
-     ("Qwen/Qwen-VL-Chat", "string")],
+    [
+        ("Salesforce/blip2-opt-2.7b", "string"),
+        ("facebook/chameleon-7b", "string"),
+        ("deepseek-ai/deepseek-vl2-tiny", "string"),
+        ("adept/fuyu-8b", "string"),
+        ("google/paligemma-3b-mix-224", "string"),
+        ("Qwen/Qwen-VL", "string"),
+        ("Qwen/Qwen-VL-Chat", "string"),
+    ],
 )
-# yapf: enable
 def test_resolve_content_format_fallbacks(model, expected_format):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
@@ -1936,7 +1938,8 @@ def test_resolve_content_format_fallbacks(model, expected_format):
         hf_overrides=model_info.hf_overrides,
         skip_tokenizer_init=model_info.skip_tokenizer_init,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )
     tokenizer = get_tokenizer(
         model_config.tokenizer,
@@ -1968,30 +1971,30 @@
     assert resolved_format == expected_format
-# yapf: disable
 @pytest.mark.parametrize(
     ("template_path", "expected_format"),
-    [("template_alpaca.jinja", "string"),
-     ("template_baichuan.jinja", "string"),
-     ("template_chatglm.jinja", "string"),
-     ("template_chatglm2.jinja", "string"),
-     ("template_chatml.jinja", "string"),
-     ("template_dse_qwen2_vl.jinja", "openai"),
-     ("template_falcon_180b.jinja", "string"),
-     ("template_falcon.jinja", "string"),
-     ("template_inkbot.jinja", "string"),
-     ("template_teleflm.jinja", "string"),
-     ("template_vlm2vec_phi3v.jinja", "openai"),
-     ("template_vlm2vec_qwen2vl.jinja", "openai"),
-     ("tool_chat_template_granite_20b_fc.jinja", "string"),
-     ("tool_chat_template_hermes.jinja", "string"),
-     ("tool_chat_template_internlm2_tool.jinja", "string"),
-     ("tool_chat_template_llama3.1_json.jinja", "openai"),
-     ("tool_chat_template_llama3.2_json.jinja", "openai"),
-     ("tool_chat_template_mistral_parallel.jinja", "string"),
-     ("tool_chat_template_mistral.jinja", "string")],
+    [
+        ("template_alpaca.jinja", "string"),
+        ("template_baichuan.jinja", "string"),
+        ("template_chatglm.jinja", "string"),
+        ("template_chatglm2.jinja", "string"),
+        ("template_chatml.jinja", "string"),
+        ("template_dse_qwen2_vl.jinja", "openai"),
+        ("template_falcon_180b.jinja", "string"),
+        ("template_falcon.jinja", "string"),
+        ("template_inkbot.jinja", "string"),
+        ("template_teleflm.jinja", "string"),
+        ("template_vlm2vec_phi3v.jinja", "openai"),
+        ("template_vlm2vec_qwen2vl.jinja", "openai"),
+        ("tool_chat_template_granite_20b_fc.jinja", "string"),
+        ("tool_chat_template_hermes.jinja", "string"),
+        ("tool_chat_template_internlm2_tool.jinja", "string"),
+        ("tool_chat_template_llama3.1_json.jinja", "openai"),
+        ("tool_chat_template_llama3.2_json.jinja", "openai"),
+        ("tool_chat_template_mistral_parallel.jinja", "string"),
+        ("tool_chat_template_mistral.jinja", "string"),
+    ],
 )
-# yapf: enable
 def test_resolve_content_format_examples(template_path, expected_format):
     model_config = ModelConfig(
         PHI3V_MODEL_ID,  # Dummy
@@ -2024,40 +2027,34 @@ def test_resolve_content_format_examples(template_path, expected_format):
     assert resolved_format == expected_format
-def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
-                                                    mistral_tokenizer):
-    messages = [{
-        "role":
-        "system",
-        "content": [{
-            "type": "text",
-            "text": "You are a helpful assistant."
-        }, {
-            "type":
-            "thinking",
-            "closed":
-            True,
-            "thinking":
-            "Only return the answer when you are confident."
-        }]
-    }, {
-        "role": "user",
-        "content": "What is 2+2?"
-    }, {
-        "role":
-        "assistant",
-        "content": [{
-            "type": "text",
-            "text": "Let me think about it."
-        }, {
-            "type": "thinking",
-            "closed": True,
-            "thinking": "2+2 = 4"
-        }, {
-            "type": "text",
-            "text": "The answer is 4.",
-        }],
-    }]
+def test_parse_chat_messages_include_thinking_chunk(
+    mistral_model_config, mistral_tokenizer
+):
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are a helpful assistant."},
+                {
+                    "type": "thinking",
+                    "closed": True,
+                    "thinking": "Only return the answer when you are confident.",
+                },
+            ],
+        },
+        {"role": "user", "content": "What is 2+2?"},
+        {
+            "role": "assistant",
+            "content": [
+                {"type": "text", "text": "Let me think about it."},
+                {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
+                {
+                    "type": "text",
+                    "text": "The answer is 4.",
+                },
+            ],
+        },
+    ]
     conversation_with_thinking, _, _ = parse_chat_messages(
         messages,
@@ -2066,122 +2063,105 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
         content_format="openai",
     )
-    expected_conversation = [{
-        "role":
-        "system",
-        "content": [{
-            "type": "text",
-            "text": "You are a helpful assistant."
-        }, {
-            "type": "text",
-            "text": "Only return the answer when you are confident."
-        }],
-    }, {
-        "role":
-        "user",
-        "content": [{
-            "type": "text",
-            "text": "What is 2+2?"
-        }],
-    }, {
-        "role":
-        "assistant",
-        "content": [
-            {
-                "type": "text",
-                "text": "Let me think about it."
-            },
-            {
-                "type": "text",
-                "text": "2+2 = 4"
-            },
-            {
-                "type": "text",
-                "text": "The answer is 4."
-            },
-        ]
-    }]
+    expected_conversation = [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are a helpful assistant."},
+                {
+                    "type": "text",
+                    "text": "Only return the answer when you are confident.",
+                },
+            ],
+        },
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": "What is 2+2?"}],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {"type": "text", "text": "Let me think about it."},
+                {"type": "text", "text": "2+2 = 4"},
+                {"type": "text", "text": "The answer is 4."},
+            ],
+        },
+    ]
     assert conversation_with_thinking == expected_conversation
 def test_apply_mistral_chat_template_thinking_chunk():
-    # Moved import here to avoid yapf and isort conflicts
-    from vllm.entrypoints.chat_utils import apply_mistral_chat_template
-    messages = [{
-        "role":
-        "system",
-        "content": [{
-            "type": "text",
-            "text": "You are a helpful assistant."
-        }, {
-            "type":
-            "thinking",
-            "closed":
-            True,
-            "thinking":
-            "Only return the answer when you are confident."
-        }]
-    }, {
-        "role": "user",
-        "content": "What is 2+2?"
-    }, {
-        "role":
-        "assistant",
-        "content": [{
-            "type": "text",
-            "text": "Let me think about it."
-        }, {
-            "type": "thinking",
-            "closed": True,
-            "thinking": "2+2 = 4"
-        }, {
-            "type": "text",
-            "text": "The answer is 4.",
-        }],
-    }, {
-        "role": "user",
-        "content": "Thanks, what is 3+3?"
-    }]
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are a helpful assistant."},
+                {
+                    "type": "thinking",
+                    "closed": True,
+                    "thinking": "Only return the answer when you are confident.",
+                },
+            ],
+        },
+        {"role": "user", "content": "What is 2+2?"},
+        {
+            "role": "assistant",
+            "content": [
+                {"type": "text", "text": "Let me think about it."},
+                {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
+                {
+                    "type": "text",
+                    "text": "The answer is 4.",
+                },
+            ],
+        },
+        {"role": "user", "content": "Thanks, what is 3+3?"},
+    ]
     # TODO(Julien): upon model release change to a tokenizer already configured.
     # =================================================================
     mistral_tokenizer = MistralTokenizer.from_pretrained(
-        "mistralai/Devstral-Small-2507")
+        "mistralai/Devstral-Small-2507"
+    )
     assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer)
     # Add think special tokens to the tokenizer
     mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo(
-        rank=35, is_control=True, token_str=SpecialTokens.begin_think.value)
+        rank=35, is_control=True, token_str=SpecialTokens.begin_think.value
+    )
     mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo(
-        rank=36, is_control=True, token_str=SpecialTokens.end_think.value)
+        rank=36, is_control=True, token_str=SpecialTokens.end_think.value
+    )
     mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = {
         k: v
-        for k, v in
-        mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
+        for k, v in mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
        if v not in {35, 36}
     }
     mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
-        SpecialTokens.begin_think.value] = 35
+        SpecialTokens.begin_think.value
+    ] = 35
     mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
-        SpecialTokens.end_think.value] = 36
+        SpecialTokens.end_think.value
+    ] = 36
     mistral_tokenizer.instruct.BEGIN_THINK = 35
     mistral_tokenizer.instruct.END_THINK = 36
     # =================================================================
-    tokens_ids = apply_mistral_chat_template(mistral_tokenizer,
-                                             messages,
-                                             chat_template=None,
-                                             tools=None)
+    tokens_ids = apply_mistral_chat_template(
+        mistral_tokenizer, messages, chat_template=None, tools=None
+    )
     string_tokens = mistral_tokenizer.mistral.decode(
-        tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP)
+        tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP
+    )
     expected_tokens = (
         r"<s>[SYSTEM_PROMPT]You are a helpful assistant.[THINK]Only return the"
         r" answer when you are confident.[/THINK][/SYSTEM_PROMPT]"
         r"[INST]What is 2+2?[/INST]"
         r"Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4.</s>"
-        r"[INST]Thanks, what is 3+3?[/INST]")
+        r"[INST]Thanks, what is 3+3?[/INST]"
+    )
     assert string_tokens == expected_tokens
@@ -2192,37 +2172,32 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
 ):
     audio_uuid = "abcd"
     conversation, mm_data, mm_uuids = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "input_audio",
-                    "input_audio": {},
-                    "uuid": audio_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "What does the audio say?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_audio",
+                        "input_audio": {},
+                        "uuid": audio_uuid,
+                    },
+                    {"type": "text", "text": "What does the audio say?"},
+                ],
+            }
+        ],
         qwen2_audio_model_config,
         qwen2_audio_tokenizer,
         content_format="string",
     )
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
+        }
+    ]
     _assert_mm_data_inputs(mm_data, {"audio": 1})
-    _assert_mm_uuids(mm_uuids,
-                     1,
-                     modality="audio",
-                     expected_uuids=[audio_uuid])
+    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
 @pytest.mark.asyncio
@@ -2232,34 +2207,29 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
 ):
     audio_uuid = "abcd"
     conversation, mm_future, mm_uuids = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "input_audio",
-                    "input_audio": {},
-                    "uuid": audio_uuid,
-                },
-                {
-                    "type": "text",
-                    "text": "What does the audio say?"
-                },
-            ],
-        }],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_audio",
+                        "input_audio": {},
+                        "uuid": audio_uuid,
+                    },
+                    {"type": "text", "text": "What does the audio say?"},
+                ],
+            }
+        ],
         qwen2_audio_model_config,
         qwen2_audio_tokenizer,
         content_format="string",
     )
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
-    }]
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
+        }
+    ]
     _assert_mm_data_inputs(await mm_future, {"audio": 1})
-    _assert_mm_uuids(mm_uuids,
-                     1,
-                     modality="audio",
-                     expected_uuids=[audio_uuid])
+    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])

View File

@@ -12,9 +12,6 @@ import torch
 import torch.nn.functional as F
 from vllm.config.lora import LoRAConfig
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.lora.layers import (
     BaseLayerWithLoRA,
     ColumnParallelLinearWithLoRA,
@@ -32,8 +29,6 @@ from vllm.lora.layers import (
     RowParallelLinearWithShardedLoRA,
     VocabParallelEmbeddingWithLoRA,
 )
-# yapf: enable
 from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.punica_wrapper import get_punica_wrapper
 from vllm.model_executor.layers.linear import (

View File

@@ -17,8 +17,6 @@ import vllm.model_executor.model_loader.tensorizer
 from tests.utils import VLLM_PATH, RemoteOpenAIServer
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-# yapf: disable
 from vllm.model_executor.model_loader.tensorizer import (
     TensorizerConfig,
     TensorSerializer,
@@ -29,8 +27,6 @@ from vllm.model_executor.model_loader.tensorizer import (
 from vllm.model_executor.model_loader.tensorizer_loader import (
     BLACKLISTED_TENSORIZER_ARGS,
 )
-# yapf: enable
 from vllm.utils import PlaceholderModule
 from .conftest import DummyExecutor, assert_from_collective_rpc

View File

@@ -45,18 +45,17 @@ from .vlm_utils.types import (
 if current_platform.is_rocm():
     os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
-# yapf: disable
 COMMON_BROADCAST_SETTINGS = {
     "test_type": VLMTestType.IMAGE,
     "dtype": "half",
     "max_tokens": 5,
     "tensor_parallel_size": 2,
     "hf_model_kwargs": {"device_map": "auto"},
-    "image_size_factors": [(.25, 0.5, 1.0)],
+    "image_size_factors": [(0.25, 0.5, 1.0)],
     "distributed_executor_backend": (
         "ray",
         "mp",
-    )
+    ),
 }
 ### Test configuration for specific models
@@ -96,22 +95,20 @@ VLM_TEST_SETTINGS = {
     #### Core tests to always run in the CI
     "llava": VLMTestInfo(
         models=["llava-hf/llava-1.5-7b-hf"],
-        test_type=(
-            VLMTestType.EMBEDDING,
-            VLMTestType.IMAGE,
-            VLMTestType.CUSTOM_INPUTS
-        ),
+        test_type=(VLMTestType.EMBEDDING, VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         convert_assets_to_embeddings=model_utils.get_llava_embeddings,
         max_model_len=4096,
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
-        custom_test_opts=[CustomTestOptions(
-            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
-                formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
-            ),
-            limit_mm_per_prompt={"image": 4},
-        )],
+        custom_test_opts=[
+            CustomTestOptions(
+                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
+                    formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
+                ),
+                limit_mm_per_prompt={"image": 4},
+            )
+        ],
         # TODO: Revert to "auto" when CPU backend can use torch > 2.6
         dtype="bfloat16" if current_platform.is_cpu() else "auto",
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
@@ -120,27 +117,27 @@ VLM_TEST_SETTINGS = {
         models=["google/paligemma-3b-mix-224"],
         test_type=VLMTestType.IMAGE,
         prompt_formatter=identity,
-        img_idx_to_prompt = lambda idx: "",
+        img_idx_to_prompt=lambda idx: "",
         # Paligemma uses its own sample prompts because the default one fails
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "caption es",
-            "cherry_blossom": "What is in the picture?",
-        }),
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "caption es",
+                "cherry_blossom": "What is in the picture?",
+            }
+        ),
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
         dtype="bfloat16",
-        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
+        marks=[
+            pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
+        ],  # noqa: E501
     ),
     "qwen2_5_vl": VLMTestInfo(
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-            VLMTestType.VIDEO
-        ),
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
         video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -150,17 +147,13 @@ VLM_TEST_SETTINGS = {
     ),
     "qwen2_5_omni": VLMTestInfo(
         models=["Qwen/Qwen2.5-Omni-3B"],
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-            VLMTestType.VIDEO
-        ),
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",  # noqa: E501
         video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        num_logprobs= 6 if current_platform.is_cpu() else 5,
+        num_logprobs=6 if current_platform.is_cpu() else 5,
         auto_cls=AutoModelForTextToWaveform,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
         patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
@@ -168,9 +161,9 @@ VLM_TEST_SETTINGS = {
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "ultravox": VLMTestInfo(
-        models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
+        models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
         test_type=VLMTestType.AUDIO,
         prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
         audio_idx_to_prompt=lambda idx: "<|audio|>",
         max_model_len=4096,
         max_num_seqs=2,
@@ -184,9 +177,11 @@ VLM_TEST_SETTINGS = {
     "llava-onevision-transformers": VLMTestInfo(
         models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
         max_model_len=16384,
-        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
+        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
+            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+        ),  # noqa: E501
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
         image_size_factors=[(0.25, 0.5, 1.0)],
@@ -201,7 +196,7 @@ VLM_TEST_SETTINGS = {
     "idefics3-transformers": VLMTestInfo(
         models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
@@ -217,8 +212,8 @@ VLM_TEST_SETTINGS = {
     "qwen2_5_vl-transformers": VLMTestInfo(
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
         test_type=VLMTestType.IMAGE,
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -228,23 +223,24 @@ VLM_TEST_SETTINGS = {
             "model_impl": "transformers",
         },
         # FIXME: Investigate mrope issue
-        marks=[large_gpu_mark(min_gb=32),
-               pytest.mark.skip(reason="Mrope issue")],
+        marks=[large_gpu_mark(min_gb=32), pytest.mark.skip(reason="Mrope issue")],
     ),
     #### Extended model tests
     "aria": VLMTestInfo(
         models=["rhymes-ai/Aria"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<vlm_image>Please describe the image shortly.",
-            "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
-        }),
-        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<vlm_image>Please describe the image shortly.",
+                "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
+            }
+        ),
+        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",  # noqa: E501
         stop_str=["<|im_end|>"],
         image_size_factors=[(0.10, 0.15)],
         max_tokens=64,
@@ -253,11 +249,13 @@ VLM_TEST_SETTINGS = {
     "aya_vision": VLMTestInfo(
         models=["CohereForAI/aya-vision-8b"],
         test_type=(VLMTestType.IMAGE),
         prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<image>What is the season?",  # noqa: E501
-        }),
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<image>What is the season?",  # noqa: E501
+            }
+        ),
         multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
@@ -267,11 +265,13 @@ VLM_TEST_SETTINGS = {
     "aya_vision-multi_image": VLMTestInfo(
         models=["CohereForAI/aya-vision-8b"],
         test_type=(VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<image>What is the season?",  # noqa: E501
-        }),
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<image>What is the season?",  # noqa: E501
+            }
+        ),
         multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
@@ -297,27 +297,29 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         # For chameleon, we only compare the sequences
-        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
-        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
+        vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
+        hf_output_post_proc=lambda hf_output, model: hf_output[:2],
         comparator=check_outputs_equal,
         max_tokens=8,
         dtype="bfloat16",
     ),
     "deepseek_vl_v2": VLMTestInfo(
         models=["Isotr0py/deepseek-vl2-tiny"],  # model repo using dynamic module
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",  # noqa: E501
-        }),
-        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<image>\nPlease infer the season with reason in details.",  # noqa: E501
+            }
+        ),
+        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",  # noqa: E501
         patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
         hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
         stop_str=["<end▁of▁sentence>", "<begin▁of▁sentence>"],  # noqa: E501
-        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
+        image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
     ),
     "fuyu": VLMTestInfo(
         models=["adept/fuyu-8b"],
@@ -336,11 +338,13 @@ VLM_TEST_SETTINGS = {
     "gemma3": VLMTestInfo(
         models=["google/gemma-3-4b-it"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<start_of_image>What is the season?",  # noqa: E501
-        }),
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<start_of_image>What is the season?",  # noqa: E501
+            }
+        ),
         multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
@@ -353,10 +357,12 @@ VLM_TEST_SETTINGS = {
         models=["zai-org/glm-4v-9b"],
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
-        }),
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
+            }
+        ),
         max_model_len=2048,
         max_num_seqs=2,
         get_stop_token_ids=lambda tok: [151329, 151336, 151338],
@@ -372,8 +378,8 @@ VLM_TEST_SETTINGS = {
         models=["zai-org/GLM-4.1V-9B-Thinking"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
-        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",  # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",  # noqa: E501
         max_model_len=2048,
         max_num_seqs=2,
         get_stop_token_ids=lambda tok: [151329, 151336, 151338],
@@ -390,23 +396,27 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
-        custom_test_opts=[CustomTestOptions(
-            inputs=custom_inputs.video_with_metadata_glm4_1v(),
-            limit_mm_per_prompt={"video": 1},
-        )],
+        custom_test_opts=[
+            CustomTestOptions(
+                inputs=custom_inputs.video_with_metadata_glm4_1v(),
+                limit_mm_per_prompt={"video": 1},
+            )
+        ],
         marks=[large_gpu_mark(min_gb=32)],
     ),
     "h2ovl": VLMTestInfo(
-        models = [
+        models=[
             "h2oai/h2ovl-mississippi-800m",
             "h2oai/h2ovl-mississippi-2b",
         ],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",  # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<image>\nWhat is the season?",
-        }),
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<image>\nWhat is the season?",
+            }
+        ),
         multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
         max_model_len=8192,
         use_tokenizer_eos=True,
@@ -416,7 +426,7 @@ VLM_TEST_SETTINGS = {
     "idefics3": VLMTestInfo(
         models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
@@ -431,11 +441,13 @@ VLM_TEST_SETTINGS = {
             # "OpenGVLab/Mono-InternVL-2B",
         ],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
-            "cherry_blossom": "<image>\nWhat is the season?",
-        }),
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<image>\nWhat is the season?",
+            }
+        ),
         multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
         max_model_len=4096,
         use_tokenizer_eos=True,
@ -446,7 +458,7 @@ VLM_TEST_SETTINGS = {
"OpenGVLab/InternVL3-1B", "OpenGVLab/InternVL3-1B",
], ],
test_type=VLMTestType.VIDEO, test_type=VLMTestType.VIDEO,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
video_idx_to_prompt=lambda idx: "<video>", video_idx_to_prompt=lambda idx: "<video>",
max_model_len=8192, max_model_len=8192,
use_tokenizer_eos=True, use_tokenizer_eos=True,
@ -459,7 +471,7 @@ VLM_TEST_SETTINGS = {
VLMTestType.MULTI_IMAGE, VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO, VLMTestType.VIDEO,
), ),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>", img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
video_idx_to_prompt=lambda idx: "<video>", video_idx_to_prompt=lambda idx: "<video>",
max_model_len=8192, max_model_len=8192,
@ -469,7 +481,7 @@ VLM_TEST_SETTINGS = {
"kimi_vl": VLMTestInfo( "kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"], models=["moonshotai/Kimi-VL-A3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501 img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -480,11 +492,11 @@ VLM_TEST_SETTINGS = {
), ),
"llama4": VLMTestInfo( "llama4": VLMTestInfo(
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"], models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
img_idx_to_prompt=lambda _: "<|image|>", img_idx_to_prompt=lambda _: "<|image|>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
distributed_executor_backend="mp", distributed_executor_backend="mp",
image_size_factors=[(.25, 0.5, 1.0)], image_size_factors=[(0.25, 0.5, 1.0)],
hf_model_kwargs={"device_map": "auto"}, hf_model_kwargs={"device_map": "auto"},
max_model_len=8192, max_model_len=8192,
max_num_seqs=4, max_num_seqs=4,
@ -500,28 +512,34 @@ VLM_TEST_SETTINGS = {
max_model_len=10240, max_model_len=10240,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( CustomTestOptions(
formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]" inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
), formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
limit_mm_per_prompt={"image": 4}, ),
)], limit_mm_per_prompt={"image": 4},
)
],
), ),
"llava_onevision": VLMTestInfo( "llava_onevision": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
num_video_frames=16, num_video_frames=16,
max_model_len=16384, max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs( CustomTestOptions(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
), formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
limit_mm_per_prompt={"video": 4}, ),
)], limit_mm_per_prompt={"video": 4},
)
],
), ),
"llava_next_video": VLMTestInfo( "llava_next_video": VLMTestInfo(
models=["llava-hf/LLaVA-NeXT-Video-7B-hf"], models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
@ -563,7 +581,9 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n", img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
["<|im_end|>", "<|endoftext|>"]
), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49 # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
@ -576,13 +596,15 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n", img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
["<|im_end|>", "<|endoftext|>"]
), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
), ),
"minimax_vl_01": VLMTestInfo( "minimax_vl_01": VLMTestInfo(
models=["MiniMaxAI/MiniMax-VL-01"], models=["MiniMaxAI/MiniMax-VL-01"],
prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
img_idx_to_prompt=lambda _: "<image>", img_idx_to_prompt=lambda _: "<image>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
max_model_len=8192, max_model_len=8192,
@ -604,8 +626,8 @@ VLM_TEST_SETTINGS = {
"ovis1_6-gemma2": VLMTestInfo( "ovis1_6-gemma2": VLMTestInfo(
models=["AIDC-AI/Ovis1.6-Gemma2-9B"], models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501 img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="half", dtype="half",
@ -617,8 +639,8 @@ VLM_TEST_SETTINGS = {
"ovis2": VLMTestInfo( "ovis2": VLMTestInfo(
models=["AIDC-AI/Ovis2-1B"], models=["AIDC-AI/Ovis2-1B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501 img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="half", dtype="half",
@ -628,13 +650,9 @@ VLM_TEST_SETTINGS = {
), ),
"ovis2_5": VLMTestInfo( "ovis2_5": VLMTestInfo(
models=["AIDC-AI/Ovis2.5-2B"], models=["AIDC-AI/Ovis2.5-2B"],
test_type=( test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
VLMTestType.MULTI_IMAGE, img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
VLMTestType.VIDEO
),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
video_idx_to_prompt=lambda idx: "<video>\n", video_idx_to_prompt=lambda idx: "<video>\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
@ -646,7 +664,7 @@ VLM_TEST_SETTINGS = {
"phi3v": VLMTestInfo( "phi3v": VLMTestInfo(
models=["microsoft/Phi-3.5-vision-instruct"], models=["microsoft/Phi-3.5-vision-instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n", img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
@ -681,15 +699,11 @@ VLM_TEST_SETTINGS = {
), ),
"qwen2_vl": VLMTestInfo( "qwen2_vl": VLMTestInfo(
models=["Qwen/Qwen2-VL-2B-Instruct"], models=["Qwen/Qwen2-VL-2B-Instruct"],
test_type=( test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
VLMTestType.MULTI_IMAGE, img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
VLMTestType.VIDEO video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
), multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
@ -700,11 +714,13 @@ VLM_TEST_SETTINGS = {
"skywork_r1v": VLMTestInfo( "skywork_r1v": VLMTestInfo(
models=["Skywork/Skywork-R1V-38B"], models=["Skywork/Skywork-R1V-38B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<begin▁of▁sentence><User>\n{img_prompt}<Assistant><think>\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<begin▁of▁sentence><User>\n{img_prompt}<Assistant><think>\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({ single_image_prompts=IMAGE_ASSETS.prompts(
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501 {
"cherry_blossom": "<image>\nWhat is the season?", "stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
}), "cherry_blossom": "<image>\nWhat is the season?",
}
),
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501 multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501
max_model_len=4096, max_model_len=4096,
use_tokenizer_eos=True, use_tokenizer_eos=True,
@ -737,9 +753,9 @@ VLM_TEST_SETTINGS = {
VLMTestType.MULTI_IMAGE, VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO, VLMTestType.VIDEO,
), ),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
@ -752,11 +768,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2], hf_output_post_proc=lambda hf_output, model: hf_output[:2],
comparator=check_outputs_equal, comparator=check_outputs_equal,
marks=multi_gpu_marks(num_gpus=2), marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore **COMMON_BROADCAST_SETTINGS, # type: ignore
), ),
"llava-broadcast": VLMTestInfo( "llava-broadcast": VLMTestInfo(
models=["llava-hf/llava-1.5-7b-hf"], models=["llava-hf/llava-1.5-7b-hf"],
@ -765,7 +781,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2), marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore **COMMON_BROADCAST_SETTINGS, # type: ignore
), ),
"llava_next-broadcast": VLMTestInfo( "llava_next-broadcast": VLMTestInfo(
models=["llava-hf/llava-v1.6-mistral-7b-hf"], models=["llava-hf/llava-v1.6-mistral-7b-hf"],
@ -774,12 +790,12 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2), marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore **COMMON_BROADCAST_SETTINGS, # type: ignore
), ),
### Custom input edge-cases for specific models ### Custom input edge-cases for specific models
"intern_vl-diff-patches": VLMTestInfo( "intern_vl-diff-patches": VLMTestInfo(
models=["OpenGVLab/InternVL2-2B"], models=["OpenGVLab/InternVL2-2B"],
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096, max_model_len=4096,
use_tokenizer_eos=True, use_tokenizer_eos=True,
@ -788,7 +804,8 @@ VLM_TEST_SETTINGS = {
CustomTestOptions( CustomTestOptions(
inputs=inp, inputs=inp,
limit_mm_per_prompt={"image": 2}, limit_mm_per_prompt={"image": 2},
) for inp in custom_inputs.different_patch_input_cases_internvl() )
for inp in custom_inputs.different_patch_input_cases_internvl()
], ],
), ),
"llava_onevision-multiple-images": VLMTestInfo( "llava_onevision-multiple-images": VLMTestInfo(
@ -797,14 +814,18 @@ VLM_TEST_SETTINGS = {
max_model_len=16384, max_model_len=16384,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( CustomTestOptions(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
), formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
limit_mm_per_prompt={"image": 4}, ),
)], limit_mm_per_prompt={"image": 4},
)
],
), ),
# regression test for https://github.com/vllm-project/vllm/issues/15122 # regression test for https://github.com/vllm-project/vllm/issues/15122
"qwen2_5_vl-windows-attention": VLMTestInfo( "qwen2_5_vl-windows-attention": VLMTestInfo(
@ -814,13 +835,14 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(), CustomTestOptions(
limit_mm_per_prompt={"image": 1}, inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
)], limit_mm_per_prompt={"image": 1},
)
],
), ),
} }
# yapf: enable
def _mark_splits( def _mark_splits(


@ -114,7 +114,6 @@ def get_parametrized_options(
raise ValueError("Test has type CUSTOM_INPUTS, but none given") raise ValueError("Test has type CUSTOM_INPUTS, but none given")
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
# yapf: disable
# Wrap all model cases in a pytest parameter & pass marks through # Wrap all model cases in a pytest parameter & pass marks through
return [ return [
pytest.param( pytest.param(
@ -122,10 +121,10 @@ def get_parametrized_options(
ExpandableVLMTestArgs( ExpandableVLMTestArgs(
**{k: v for k, v in zip(iter_kwargs.keys(), case)} **{k: v for k, v in zip(iter_kwargs.keys(), case)}
), ),
marks=test_info.marks if test_info.marks is not None else [] marks=test_info.marks if test_info.marks is not None else [],
) for case in list(itertools.product(*iter_kwargs.values())) )
for case in list(itertools.product(*iter_kwargs.values()))
] ]
# yapf: enable
# Get a list per model type, where each entry contains a tuple of all of # Get a list per model type, where each entry contains a tuple of all of
# that model type's cases, then flatten them into the top level so that # that model type's cases, then flatten them into the top level so that


@ -418,7 +418,6 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.image_size = self.vision_config.image_size self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
# yapf: disable
from vllm.model_executor.models.h2ovl import ( from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT, IMG_CONTEXT,
IMG_END, IMG_END,
@ -426,7 +425,6 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
image_to_pixel_values_h2ovl, image_to_pixel_values_h2ovl,
) )
# yapf: enable
images = [images] if isinstance(images, Image) else images images = [images] if isinstance(images, Image) else images
pixel_values = [ pixel_values = [
image_to_pixel_values_h2ovl( image_to_pixel_values_h2ovl(


@ -33,24 +33,26 @@ TEST_IMG_PLACEHOLDER = "<vlm_image>"
TEST_VIDEO_PLACEHOLDER = "<vlm_video>" TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>" TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
# yapf: disable SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({ {
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?", "stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?", "cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
}) }
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts({ )
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501 SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501 {
}) "mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
}
)
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501 MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?" VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)] IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)] EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]] RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
# yapf: enable
class PromptWithMultiModalInput(NamedTuple): class PromptWithMultiModalInput(NamedTuple):


@ -322,80 +322,81 @@ def _test_processing_correctness_one(
) )
# yapf: disable @pytest.mark.parametrize(
@pytest.mark.parametrize("model_id", [ "model_id",
"rhymes-ai/Aria", [
"CohereForAI/aya-vision-8b", "rhymes-ai/Aria",
"Salesforce/blip2-opt-2.7b", "CohereForAI/aya-vision-8b",
"facebook/chameleon-7b", "Salesforce/blip2-opt-2.7b",
"CohereLabs/command-a-vision-07-2025", "facebook/chameleon-7b",
"deepseek-ai/deepseek-vl2-tiny", "CohereLabs/command-a-vision-07-2025",
"baidu/ERNIE-4.5-VL-28B-A3B-PT", "deepseek-ai/deepseek-vl2-tiny",
"adept/fuyu-8b", "baidu/ERNIE-4.5-VL-28B-A3B-PT",
"google/gemma-3-4b-it", "adept/fuyu-8b",
"google/gemma-3n-E2B-it", "google/gemma-3-4b-it",
"zai-org/glm-4v-9b", "google/gemma-3n-E2B-it",
"zai-org/GLM-4.1V-9B-Thinking", "zai-org/glm-4v-9b",
"zai-org/GLM-4.5V", "zai-org/GLM-4.1V-9B-Thinking",
"ibm-granite/granite-speech-3.3-2b", "zai-org/GLM-4.5V",
"h2oai/h2ovl-mississippi-800m", "ibm-granite/granite-speech-3.3-2b",
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", "h2oai/h2ovl-mississippi-800m",
"HuggingFaceM4/Idefics3-8B-Llama3", "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
"internlm/Intern-S1", "HuggingFaceM4/Idefics3-8B-Llama3",
"OpenGVLab/InternVL2-1B", "internlm/Intern-S1",
"OpenGVLab/InternVL3-1B", "OpenGVLab/InternVL2-1B",
"OpenGVLab/InternVL3_5-1B", "OpenGVLab/InternVL3-1B",
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview", "OpenGVLab/InternVL3_5-1B",
"OpenGVLab/InternVL3_5-30B-A3B", "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
"Kwai-Keye/Keye-VL-8B-Preview", "OpenGVLab/InternVL3_5-30B-A3B",
"Kwai-Keye/Keye-VL-1_5-8B", "Kwai-Keye/Keye-VL-8B-Preview",
"moonshotai/Kimi-VL-A3B-Instruct", "Kwai-Keye/Keye-VL-1_5-8B",
"meta-llama/Llama-4-Scout-17B-16E-Instruct", "moonshotai/Kimi-VL-A3B-Instruct",
"llava-hf/llava-1.5-7b-hf", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/llava-1.5-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf", "llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf",
"TIGER-Lab/Mantis-8B-siglip-llama3", "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"mispeech/midashenglm-7b", "TIGER-Lab/Mantis-8B-siglip-llama3",
"openbmb/MiniCPM-Llama3-V-2_5", "mispeech/midashenglm-7b",
"openbmb/MiniCPM-o-2_6", "openbmb/MiniCPM-Llama3-V-2_5",
"openbmb/MiniCPM-V-2_6", "openbmb/MiniCPM-o-2_6",
"MiniMaxAI/MiniMax-VL-01", "openbmb/MiniCPM-V-2_6",
"allenai/Molmo-7B-D-0924", "MiniMaxAI/MiniMax-VL-01",
"allenai/Molmo-7B-O-0924", "allenai/Molmo-7B-D-0924",
"nvidia/NVLM-D-72B", "allenai/Molmo-7B-O-0924",
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", "nvidia/NVLM-D-72B",
"AIDC-AI/Ovis1.6-Gemma2-9B", "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
"AIDC-AI/Ovis1.6-Llama3.2-3B", "AIDC-AI/Ovis1.6-Gemma2-9B",
"AIDC-AI/Ovis2-1B", "AIDC-AI/Ovis1.6-Llama3.2-3B",
"AIDC-AI/Ovis2.5-2B", "AIDC-AI/Ovis2-1B",
"google/paligemma-3b-mix-224", "AIDC-AI/Ovis2.5-2B",
"google/paligemma2-3b-ft-docci-448", "google/paligemma-3b-mix-224",
"microsoft/Phi-3.5-vision-instruct", "google/paligemma2-3b-ft-docci-448",
"microsoft/Phi-4-multimodal-instruct", "microsoft/Phi-3.5-vision-instruct",
"mistralai/Pixtral-12B-2409", "microsoft/Phi-4-multimodal-instruct",
"mistral-community/pixtral-12b", "mistralai/Pixtral-12B-2409",
"Qwen/Qwen-VL-Chat", "mistral-community/pixtral-12b",
"Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen-VL-Chat",
"Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct",
"Qwen/Qwen2.5-Omni-3B", "Qwen/Qwen2-Audio-7B-Instruct",
"Qwen/Qwen3-VL-4B-Instruct", "Qwen/Qwen2.5-Omni-3B",
"Qwen/Qwen3-VL-30B-A3B-Instruct", "Qwen/Qwen3-VL-4B-Instruct",
"YannQi/R-4B", "Qwen/Qwen3-VL-30B-A3B-Instruct",
"Skywork/Skywork-R1V-38B", "YannQi/R-4B",
"HuggingFaceTB/SmolVLM2-2.2B-Instruct", "Skywork/Skywork-R1V-38B",
"stepfun-ai/step3", "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
"fixie-ai/ultravox-v0_5-llama-3_2-1b", "stepfun-ai/step3",
"openai/whisper-large-v3", "fixie-ai/ultravox-v0_5-llama-3_2-1b",
"omni-research/Tarsier-7b", "openai/whisper-large-v3",
"omni-research/Tarsier2-Recap-7b", "omni-research/Tarsier-7b",
"mistralai/Voxtral-Mini-3B-2507", "omni-research/Tarsier2-Recap-7b",
]) "mistralai/Voxtral-Mini-3B-2507",
],
)
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0]) @pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness( def test_processing_correctness(
model_id: str, model_id: str,
hit_rate: float, hit_rate: float,


@ -12,7 +12,6 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"]) @pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"), ("mm_processor_kwargs", "expected_toks_per_img"),
[ [
@ -20,7 +19,6 @@ from ...utils import build_model_context
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)), ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
], ],
) )
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(


@ -11,7 +11,6 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"), ("mm_processor_kwargs", "expected_toks_per_img"),
[ [
@ -21,7 +20,6 @@ from ...utils import build_model_context
({}, 757), ({}, 757),
], ],
) )
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(


@ -11,7 +11,6 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"]) @pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"), ("mm_processor_kwargs", "expected_toks_per_img"),
[ [
@ -21,7 +20,6 @@ from ...utils import build_model_context
({}, 9585), ({}, 9585),
], ],
) )
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(


@ -10,7 +10,6 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
[ [
@ -18,7 +17,6 @@ from ...utils import build_model_context
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)), ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
], ],
) )
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(


@ -12,7 +12,6 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"]) @pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"), ("mm_processor_kwargs", "expected_toks_per_img"),
[ [
@ -20,7 +19,6 @@ from ...utils import build_model_context
({"max_image_size": {"longest_edge": 768}}, 405), ({"max_image_size": {"longest_edge": 768}}, 405),
], ],
) )
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(


@ -7,9 +7,7 @@ from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
# yapf: disable @pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_multimodal_processor(model_id): def test_multimodal_processor(model_id):
model_config = ModelConfig( model_config = ModelConfig(
model=model_id, model=model_id,
@ -18,9 +16,9 @@ def test_multimodal_processor(model_id):
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config) mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
image_pil = ImageAsset('cherry_blossom').pil_image image_pil = ImageAsset("cherry_blossom").pil_image
mm_data = {"image": image_pil} mm_data = {"image": image_pil}
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501 str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
str_processed_inputs = mm_processor.apply( str_processed_inputs = mm_processor.apply(
prompt=str_prompt, prompt=str_prompt,
mm_data=mm_data, mm_data=mm_data,
@ -28,8 +26,23 @@ def test_multimodal_processor(model_id):
) )
ids_prompt = [ ids_prompt = [
151644, 872, 220, 151646, 198, 3838, 374, 279, 2213, 315, 419, 2168, 151644,
30, 151645, 151644, 77091, 198 872,
220,
151646,
198,
3838,
374,
279,
2213,
315,
419,
2168,
30,
151645,
151644,
77091,
198,
] ]
ids_processed_inputs = mm_processor.apply( ids_processed_inputs = mm_processor.apply(
prompt=ids_prompt, prompt=ids_prompt,
@ -37,5 +50,7 @@ def test_multimodal_processor(model_id):
hf_processor_mm_kwargs={}, hf_processor_mm_kwargs={},
) )
assert (str_processed_inputs["prompt_token_ids"] assert (
== ids_processed_inputs["prompt_token_ids"]) str_processed_inputs["prompt_token_ids"]
== ids_processed_inputs["prompt_token_ids"]
)

File diff suppressed because it is too large

@ -71,25 +71,27 @@ def _dummy_items(
) )
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("item", "expected_size"), ("item", "expected_size"),
[ [
(_dummy_item("a", {"a1": 100}), 100), (_dummy_item("a", {"a1": 100}), 100),
(_dummy_item("a", {"a1": 100, "a2": 110}), 210), (_dummy_item("a", {"a1": 100, "a2": 110}), 210),
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460), # noqa: E501 (
_dummy_items(
{"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}
).get_data(),
460,
), # noqa: E501
], ],
) )
# yapf: enable
def test_cache_item_size(item, expected_size): def test_cache_item_size(item, expected_size):
cache = MultiModalCache.get_lru_cache(2048, type(item)) cache = MultiModalCache.get_lru_cache(2048, type(item))
cache[""] = item cache[""] = item
assert cache.currsize == expected_size assert cache.currsize == expected_size
prompt_update = PromptInsertion("dummy", "target", "insertion") \ prompt_update = PromptInsertion("dummy", "target", "insertion").resolve(0)
.resolve(0)
cache[""] = MultiModalProcessorCacheItem(item, [prompt_update]) cache[""] = MultiModalProcessorCacheItem(item, [prompt_update])
assert cache.currsize == expected_size assert cache.currsize == expected_size
@ -106,9 +108,9 @@ def _create_vllm_config(
return VllmConfig( return VllmConfig(
model_config=ModelConfig( model_config=ModelConfig(
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
mm_processor_cache_gb=mm_processor_cache_gb), mm_processor_cache_gb=mm_processor_cache_gb,
parallel_config=ParallelConfig( ),
data_parallel_size=1 if enable_ipc else 2), parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
) )
@ -124,11 +126,9 @@ def _compare_caches(
seed: int = 0, seed: int = 0,
): ):
cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY) cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY)
cache_0_p1 = engine_receiver_cache_from_config(config_0, cache_0_p1 = engine_receiver_cache_from_config(config_0, MULTIMODAL_REGISTRY)
MULTIMODAL_REGISTRY)
cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY) cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY)
cache_1_p1 = engine_receiver_cache_from_config(config_1, cache_1_p1 = engine_receiver_cache_from_config(config_1, MULTIMODAL_REGISTRY)
MULTIMODAL_REGISTRY)
cache_size_gb = max( cache_size_gb = max(
config_0.model_config.multimodal_config.mm_processor_cache_gb, config_0.model_config.multimodal_config.mm_processor_cache_gb,
@ -142,8 +142,7 @@ def _compare_caches(
for _ in range(int(item_capacity / hit_rate)) for _ in range(int(item_capacity / hit_rate))
] ]
all_hashes = [ all_hashes = [
MultiModalHasher.hash_kwargs(item=item.get_data()) MultiModalHasher.hash_kwargs(item=item.get_data()) for item in all_items
for item in all_items
] ]
# Should not be used since there is nothing to convert to text # Should not be used since there is nothing to convert to text
@ -162,7 +161,8 @@ def _compare_caches(
for _ in range(is_cached_calls_per_iter): for _ in range(is_cached_calls_per_iter):
cache_0_p0.is_cached(selected_hashes) cache_0_p0.is_cached(selected_hashes)
cache_0_p0_out = [ cache_0_p0_out = [
item for item, _ in cache_0_p0.get_and_update( item
for item, _ in cache_0_p0.get_and_update(
[(item, prompt_update.content) for item in selected_items], [(item, prompt_update.content) for item in selected_items],
selected_hashes, selected_hashes,
) )
@ -174,7 +174,8 @@ def _compare_caches(
for _ in range(is_cached_calls_per_iter): for _ in range(is_cached_calls_per_iter):
cache_1_p0.is_cached(selected_hashes) cache_1_p0.is_cached(selected_hashes)
cache_1_p0_out = [ cache_1_p0_out = [
item for item, _ in cache_1_p0.get_and_update( item
for item, _ in cache_1_p0.get_and_update(
[(item, prompt_update.content) for item in selected_items], [(item, prompt_update.content) for item in selected_items],
selected_hashes, selected_hashes,
) )
@ -183,14 +184,12 @@ def _compare_caches(
if cache_0_p1 is None: if cache_0_p1 is None:
cache_0_p1_out = cache_0_p0_out cache_0_p1_out = cache_0_p0_out
else: else:
cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out, cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out, selected_hashes)
selected_hashes)
if cache_1_p1 is None: if cache_1_p1 is None:
cache_1_p1_out = cache_1_p0_out cache_1_p1_out = cache_1_p0_out
else: else:
cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out, cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out, selected_hashes)
selected_hashes)
assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}" assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}"


@ -9,9 +9,6 @@ import pytest
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import ( from vllm.multimodal.processing import (
InputProcessingContext, InputProcessingContext,
PlaceholderFeaturesInfo, PlaceholderFeaturesInfo,
@ -24,8 +21,6 @@ from vllm.multimodal.processing import (
iter_token_matches, iter_token_matches,
replace_token_matches, replace_token_matches,
) )
# yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.profiling import MultiModalProfiler
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
@ -34,7 +29,6 @@ from .utils import random_image
pytestmark = pytest.mark.cpu_test pytestmark = pytest.mark.cpu_test
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("token_ids", "match_ids", "expected"), ("token_ids", "match_ids", "expected"),
[ [
@ -44,34 +38,34 @@ pytestmark = pytest.mark.cpu_test
[32000, 32000, 32000], [32000, 32000, 32000],
[32000], [32000],
[ [
{ "start_idx": 0, "end_idx": 1 }, {"start_idx": 0, "end_idx": 1},
{ "start_idx": 1, "end_idx": 2 }, {"start_idx": 1, "end_idx": 2},
{ "start_idx": 2, "end_idx": 3 }, {"start_idx": 2, "end_idx": 3},
], ],
), ),
( (
[32000, 32000, 32000], [32000, 32000, 32000],
[32000, 32000], [32000, 32000],
[{ "start_idx": 0, "end_idx": 2 }], [{"start_idx": 0, "end_idx": 2}],
), ),
( (
[32000, 32000, 32000], [32000, 32000, 32000],
[32000, 32000, 32000], [32000, 32000, 32000],
[{ "start_idx": 0, "end_idx": 3 }], [{"start_idx": 0, "end_idx": 3}],
), ),
( (
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 32000], [28747, 32000],
[ [
{ "start_idx": 1, "end_idx": 3 }, {"start_idx": 1, "end_idx": 3},
{ "start_idx": 6, "end_idx": 8 }, {"start_idx": 6, "end_idx": 8},
], ],
), ),
( (
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 32000, 32000, 32000], [28747, 32000, 32000, 32000],
[ [
{ "start_idx": 1, "end_idx": 5 }, {"start_idx": 1, "end_idx": 5},
], ],
), ),
( (
@ -82,14 +76,13 @@ pytestmark = pytest.mark.cpu_test
], ],
) )
@pytest.mark.parametrize("start_idx", [0, 4, 8]) @pytest.mark.parametrize("start_idx", [0, 4, 8])
# yapf: enable
def test_iter_token_matches(token_ids, match_ids, expected, start_idx): def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
result = list(iter_token_matches(token_ids, match_ids, result = list(iter_token_matches(token_ids, match_ids, start_idx=start_idx))
start_idx=start_idx))
# Manually constructed results # Manually constructed results
assert [item._asdict() for item in result assert [item._asdict() for item in result] == [
] == [item for item in expected if item["start_idx"] >= start_idx] item for item in expected if item["start_idx"] >= start_idx
]
# Invariants # Invariants
match_lens = [end - start for start, end in result] match_lens = [end - start for start, end in result]
@ -97,7 +90,6 @@ def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
assert all(match_len == len(match_ids) for match_len in match_lens) assert all(match_len == len(match_ids) for match_len in match_lens)
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("token_ids", "match_ids", "new_ids", "expected"), ("token_ids", "match_ids", "new_ids", "expected"),
[ [
@ -141,7 +133,6 @@ def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
), ),
], ],
) )
# yapf: enable
def test_replace_token_matches(token_ids, match_ids, new_ids, expected): def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
result = replace_token_matches(token_ids, match_ids, new_ids) result = replace_token_matches(token_ids, match_ids, new_ids)
@ -149,7 +140,6 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
assert result == expected assert result == expected
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("prompt", "target_by_key", "expected_by_key"), ("prompt", "target_by_key", "expected_by_key"),
[ [
@ -166,11 +156,11 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
"pattern_1": [], "pattern_1": [],
"pattern_2": [], "pattern_2": [],
"pattern_3": [ "pattern_3": [
{ "start_idx": 0, "end_idx": 0 }, {"start_idx": 0, "end_idx": 0},
], ],
"pattern_4": [], "pattern_4": [],
"pattern_5": [ "pattern_5": [
{ "start_idx": 0, "end_idx": 0 }, {"start_idx": 0, "end_idx": 0},
], ],
}, },
), ),
@ -186,26 +176,26 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
}, },
{ {
"pattern_1": [ "pattern_1": [
{ "start_idx": 0, "end_idx": 1 }, {"start_idx": 0, "end_idx": 1},
{ "start_idx": 1, "end_idx": 2 }, {"start_idx": 1, "end_idx": 2},
{ "start_idx": 2, "end_idx": 3 }, {"start_idx": 2, "end_idx": 3},
{ "start_idx": 3, "end_idx": 4 }, {"start_idx": 3, "end_idx": 4},
], ],
"pattern_2": [ "pattern_2": [
{ "start_idx": 0, "end_idx": 2 }, {"start_idx": 0, "end_idx": 2},
{ "start_idx": 2, "end_idx": 4 }, {"start_idx": 2, "end_idx": 4},
], ],
"pattern_3": [ "pattern_3": [
{ "start_idx": 0, "end_idx": 3 }, {"start_idx": 0, "end_idx": 3},
], ],
"pattern_4": [ "pattern_4": [
{ "start_idx": 0, "end_idx": 0 }, {"start_idx": 0, "end_idx": 0},
], ],
"pattern_5": [ "pattern_5": [
{ "start_idx": 1, "end_idx": 1 }, {"start_idx": 1, "end_idx": 1},
], ],
"pattern_6": [ "pattern_6": [
{ "start_idx": 4, "end_idx": 4 }, {"start_idx": 4, "end_idx": 4},
], ],
}, },
), ),
@ -221,26 +211,25 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
}, },
{ {
"pattern_1": [ "pattern_1": [
{ "start_idx": 1, "end_idx": 3 }, {"start_idx": 1, "end_idx": 3},
{ "start_idx": 6, "end_idx": 8 }, {"start_idx": 6, "end_idx": 8},
], ],
"pattern_2": [ "pattern_2": [
{ "start_idx": 1, "end_idx": 5 }, {"start_idx": 1, "end_idx": 5},
], ],
"pattern_3": [], "pattern_3": [],
"pattern_4": [ "pattern_4": [
{ "start_idx": 0, "end_idx": 0 }, {"start_idx": 0, "end_idx": 0},
], ],
"pattern_5": [], "pattern_5": [],
"pattern_6": [ "pattern_6": [
{ "start_idx": 10, "end_idx": 10 }, {"start_idx": 10, "end_idx": 10},
], ],
}, },
), ),
], ],
) )
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement]) @pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
# yapf: enable
def test_find_token_matches( def test_find_token_matches(
prompt, prompt,
target_by_key, target_by_key,
@ -272,7 +261,6 @@ def test_find_token_matches(
} == expected_by_key } == expected_by_key
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("prompt", "target_by_key", "expected_by_key"), ("prompt", "target_by_key", "expected_by_key"),
[ [
@ -288,16 +276,16 @@ def test_find_token_matches(
"pattern_5": PromptIndexTargets.end(), "pattern_5": PromptIndexTargets.end(),
}, },
{ {
"pattern_1": [{ "start_idx": 0, "end_idx": 0 }], "pattern_1": [{"start_idx": 0, "end_idx": 0}],
"pattern_2": [], "pattern_2": [],
"pattern_3": [ "pattern_3": [
{ "start_idx": 0, "end_idx": 0 }, {"start_idx": 0, "end_idx": 0},
], ],
"pattern_4": [], "pattern_4": [],
"pattern_5": [ "pattern_5": [
{ "start_idx": 0, "end_idx": 0 }, {"start_idx": 0, "end_idx": 0},
], ],
} },
), ),
( (
"<image><image><image><image>", "<image><image><image><image>",
@ -311,26 +299,26 @@ def test_find_token_matches(
}, },
{ {
"pattern_1": [ "pattern_1": [
{ "start_idx": 0, "end_idx": 7 }, {"start_idx": 0, "end_idx": 7},
{ "start_idx": 7, "end_idx": 14 }, {"start_idx": 7, "end_idx": 14},
{ "start_idx": 14, "end_idx": 21 }, {"start_idx": 14, "end_idx": 21},
{ "start_idx": 21, "end_idx": 28 }, {"start_idx": 21, "end_idx": 28},
], ],
"pattern_2": [ "pattern_2": [
{ "start_idx": 0, "end_idx": 14 }, {"start_idx": 0, "end_idx": 14},
{ "start_idx": 14, "end_idx": 28 }, {"start_idx": 14, "end_idx": 28},
], ],
"pattern_3": [ "pattern_3": [
{ "start_idx": 0, "end_idx": 21 }, {"start_idx": 0, "end_idx": 21},
], ],
"pattern_4": [ "pattern_4": [
{ "start_idx": 0, "end_idx": 0 }, {"start_idx": 0, "end_idx": 0},
], ],
"pattern_5": [ "pattern_5": [
{ "start_idx": 7, "end_idx": 7 }, {"start_idx": 7, "end_idx": 7},
], ],
"pattern_6": [ "pattern_6": [
{ "start_idx": 28, "end_idx": 28 }, {"start_idx": 28, "end_idx": 28},
], ],
}, },
), ),
@ -346,21 +334,21 @@ def test_find_token_matches(
}, },
{ {
"pattern_1": [ "pattern_1": [
{ "start_idx": 0, "end_idx": 13 }, {"start_idx": 0, "end_idx": 13},
{ "start_idx": 27, "end_idx": 40 }, {"start_idx": 27, "end_idx": 40},
], ],
"pattern_2": [ "pattern_2": [
{ "start_idx": 0, "end_idx": 27 }, {"start_idx": 0, "end_idx": 27},
], ],
"pattern_3": [], "pattern_3": [],
"pattern_4": [ "pattern_4": [
{ "start_idx": 0, "end_idx": 0 }, {"start_idx": 0, "end_idx": 0},
], ],
"pattern_5": [ "pattern_5": [
{ "start_idx": 13, "end_idx": 13 }, {"start_idx": 13, "end_idx": 13},
], ],
"pattern_6": [ "pattern_6": [
{ "start_idx": 48, "end_idx": 48 }, {"start_idx": 48, "end_idx": 48},
], ],
}, },
), ),
@ -374,22 +362,21 @@ def test_find_token_matches(
}, },
{ {
"pattern_1": [ "pattern_1": [
{ "start_idx": 0, "end_idx": 9 }, {"start_idx": 0, "end_idx": 9},
{ "start_idx": 16, "end_idx": 25 }, {"start_idx": 16, "end_idx": 25},
], ],
"pattern_2": [ "pattern_2": [
{ "start_idx": 0, "end_idx": 16 }, {"start_idx": 0, "end_idx": 16},
{ "start_idx": 16, "end_idx": 32 }, {"start_idx": 16, "end_idx": 32},
], ],
"pattern_3": [ "pattern_3": [
{ "start_idx": 0, "end_idx": 25 }, {"start_idx": 0, "end_idx": 25},
], ],
}, },
), ),
], ],
) )
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement]) @pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
# yapf: enable
def test_find_text_matches( def test_find_text_matches(
prompt, prompt,
target_by_key, target_by_key,
@ -421,7 +408,6 @@ def test_find_text_matches(
} == expected_by_key } == expected_by_key
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501 ("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
[ [
@ -549,9 +535,8 @@ def test_find_text_matches(
}, },
}, },
), ),
] ],
) )
# yapf: enable
def test_find_update_text( def test_find_update_text(
prompt, prompt,
target_by_key, target_by_key,
@ -562,13 +547,15 @@ def test_find_update_text(
mock_tokenizer = cast(AnyTokenizer, object()) mock_tokenizer = cast(AnyTokenizer, object())
for ( for (
update_type, update_type,
expected_by_mm_count, expected_by_mm_count,
) in expected_by_update_type_mm_count.items(): ) in expected_by_update_type_mm_count.items():
for mm_count, expected in expected_by_mm_count.items(): for mm_count, expected in expected_by_mm_count.items():
mm_prompt_updates = { mm_prompt_updates = {
key: [[update_type(key, target, repl_by_key[key]).resolve(i)] key: [
for i in range(mm_count)] [update_type(key, target, repl_by_key[key]).resolve(i)]
for i in range(mm_count)
]
for key, target in target_by_key.items() for key, target in target_by_key.items()
} }
@ -589,7 +576,6 @@ def test_find_update_text(
assert new_prompt == expected assert new_prompt == expected
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501 ("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
[ [
@ -615,8 +601,43 @@ def test_find_update_text(
{ {
PromptInsertion: { PromptInsertion: {
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], 0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
1: [1, 9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550], # noqa: E501 1: [
2: [1, 9833, 28747, 32000, 32000, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550, 1550, 918, 1550], # noqa: E501 1,
9833,
28747,
32000,
32000,
32000,
9833,
28747,
32000,
32000,
918,
1550,
918,
1550,
], # noqa: E501
2: [
1,
9833,
28747,
32000,
32000,
32000,
32000,
32000,
9833,
28747,
32000,
32000,
918,
1550,
918,
1550,
1550,
918,
1550,
], # noqa: E501
}, },
PromptReplacement: { PromptReplacement: {
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], 0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
@ -719,9 +740,8 @@ def test_find_update_text(
}, },
}, },
), ),
] ],
) )
# yapf: enable
def test_find_update_tokens( def test_find_update_tokens(
prompt, prompt,
target_by_key, target_by_key,
@ -732,13 +752,15 @@ def test_find_update_tokens(
mock_tokenizer = cast(AnyTokenizer, object()) mock_tokenizer = cast(AnyTokenizer, object())
for ( for (
update_type, update_type,
expected_by_mm_count, expected_by_mm_count,
) in expected_by_update_type_mm_count.items(): ) in expected_by_update_type_mm_count.items():
for mm_count, expected in expected_by_mm_count.items(): for mm_count, expected in expected_by_mm_count.items():
mm_prompt_updates = { mm_prompt_updates = {
key: [[update_type(key, target, repl_by_key[key]).resolve(i)] key: [
for i in range(mm_count)] [update_type(key, target, repl_by_key[key]).resolve(i)]
for i in range(mm_count)
]
for key, target in target_by_key.items() for key, target in target_by_key.items()
} }
@ -759,7 +781,6 @@ def test_find_update_tokens(
assert new_prompt == expected assert new_prompt == expected
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
"repl_by_key", "repl_by_key",
[ [
@ -796,8 +817,7 @@ def test_find_update_tokens(
is_embed=None, is_embed=None,
), ),
], ],
} },
), ),
( (
[1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
@ -828,7 +848,7 @@ def test_find_update_tokens(
), ),
], ],
# No match for pattern_4 as it has lower priority than pattern_1 # No match for pattern_4 as it has lower priority than pattern_1
} },
), ),
( (
[1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
@ -867,12 +887,11 @@ def test_find_update_tokens(
is_embed=None, is_embed=None,
), ),
], ],
} },
), ),
] ],
) )
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement]) @pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
# yapf: enable
def test_find_mm_placeholders( def test_find_mm_placeholders(
repl_by_key, repl_by_key,
prompt, prompt,
@ -899,8 +918,15 @@ def test_find_mm_placeholders(
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
("limit", "num_supported", "is_valid"), ("limit", "num_supported", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), [
(2, 1, False), (2, 2, True)], (0, 0, True),
(0, 1, True),
(1, 0, False),
(1, 1, True),
(1, 2, True),
(2, 1, False),
(2, 2, True),
],
) )
def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
limit_mm_per_prompt = {"image": limit} limit_mm_per_prompt = {"image": limit}
@ -930,8 +956,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
("num_images", "limit", "is_valid"), ("num_images", "limit", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), [
(2, 1, False), (2, 2, True)], (0, 0, True),
(0, 1, True),
(1, 0, False),
(1, 1, True),
(1, 2, True),
(2, 1, False),
(2, 2, True),
],
) )
def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
limit_mm_per_prompt = {"image": limit} limit_mm_per_prompt = {"image": limit}
@ -966,7 +999,6 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
class DummyProcessor: class DummyProcessor:
def __init__(self, a: int = 0, b: int = 0) -> None: def __init__(self, a: int = 0, b: int = 0) -> None:
super().__init__() super().__init__()
@ -982,7 +1014,6 @@ class DummyProcessor:
return dict(a=a, c=c) return dict(a=a, c=c)
# yapf: disable
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
@pytest.mark.parametrize( @pytest.mark.parametrize(
("config_kwargs", "inference_kwargs", "expected_kwargs"), ("config_kwargs", "inference_kwargs", "expected_kwargs"),
@ -996,7 +1027,6 @@ class DummyProcessor:
({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}), ({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
], ],
) )
# yapf: enable
def test_hf_processor_init_kwargs( def test_hf_processor_init_kwargs(
model_id, model_id,
config_kwargs, config_kwargs,
@ -1020,7 +1050,6 @@ def test_hf_processor_init_kwargs(
assert getattr(processor, k) == v assert getattr(processor, k) == v
# yapf: disable
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
@pytest.mark.parametrize( @pytest.mark.parametrize(
("config_kwargs", "inference_kwargs", "expected_kwargs"), ("config_kwargs", "inference_kwargs", "expected_kwargs"),
@ -1034,7 +1063,6 @@ def test_hf_processor_init_kwargs(
({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}), ({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
], ],
) )
# yapf: enable
def test_hf_processor_call_kwargs( def test_hf_processor_call_kwargs(
model_id, model_id,
config_kwargs, config_kwargs,

View File

@ -233,7 +233,6 @@ async def test_fetch_video_http_with_dynamic_loader(
assert metadata_sync["video_backend"] == "opencv_dynamic" assert metadata_sync["video_backend"] == "opencv_dynamic"
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
"case", "case",
[ [
@ -264,7 +263,6 @@ async def test_fetch_video_http_with_dynamic_loader(
("image", 0), ("image", 0),
], ],
), ),
# Two modalities # Two modalities
## Internally sorted ## Internally sorted
dict( dict(
@ -276,7 +274,7 @@ async def test_fetch_video_http_with_dynamic_loader(
"audio": [ "audio": [
PlaceholderRange(offset=0, length=2), PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3), PlaceholderRange(offset=2, length=3),
] ],
}, },
expected_modality_idxs=[ expected_modality_idxs=[
("audio", 0), ("audio", 0),
@ -295,7 +293,7 @@ async def test_fetch_video_http_with_dynamic_loader(
"audio": [ "audio": [
PlaceholderRange(offset=5, length=2), PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=11, length=4), PlaceholderRange(offset=11, length=4),
] ],
}, },
expected_modality_idxs=[ expected_modality_idxs=[
("image", 0), ("image", 0),
@ -314,7 +312,7 @@ async def test_fetch_video_http_with_dynamic_loader(
"audio": [ "audio": [
PlaceholderRange(offset=11, length=4), PlaceholderRange(offset=11, length=4),
PlaceholderRange(offset=5, length=2), PlaceholderRange(offset=5, length=2),
] ],
}, },
expected_modality_idxs=[ expected_modality_idxs=[
("image", 1), ("image", 1),
@ -323,7 +321,6 @@ async def test_fetch_video_http_with_dynamic_loader(
("audio", 0), ("audio", 0),
], ],
), ),
# Three modalities # Three modalities
## Internally sorted ## Internally sorted
dict( dict(
@ -339,7 +336,7 @@ async def test_fetch_video_http_with_dynamic_loader(
PlaceholderRange(offset=3, length=4), PlaceholderRange(offset=3, length=4),
PlaceholderRange(offset=7, length=5), PlaceholderRange(offset=7, length=5),
PlaceholderRange(offset=12, length=6), PlaceholderRange(offset=12, length=6),
] ],
}, },
expected_modality_idxs=[ expected_modality_idxs=[
("audio", 0), ("audio", 0),
@ -363,7 +360,7 @@ async def test_fetch_video_http_with_dynamic_loader(
], ],
"video": [ "video": [
PlaceholderRange(offset=8, length=5), PlaceholderRange(offset=8, length=5),
] ],
}, },
expected_modality_idxs=[ expected_modality_idxs=[
("image", 0), ("image", 0),
@ -386,7 +383,7 @@ async def test_fetch_video_http_with_dynamic_loader(
], ],
"video": [ "video": [
PlaceholderRange(offset=8, length=5), PlaceholderRange(offset=8, length=5),
] ],
}, },
expected_modality_idxs=[ expected_modality_idxs=[
("image", 0), ("image", 0),
@ -398,7 +395,6 @@ async def test_fetch_video_http_with_dynamic_loader(
), ),
], ],
) )
# yapf: enable
def test_argsort_mm_positions(case): def test_argsort_mm_positions(case):
mm_positions = case["mm_positions"] mm_positions = case["mm_positions"]
expected_modality_idxs = case["expected_modality_idxs"] expected_modality_idxs = case["expected_modality_idxs"]
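The cases above all encode the same rule: placeholders from every modality are visited in increasing order of their starting offset, and each is reported as a (modality, index-within-modality) pair. A minimal standalone sketch of that ordering, using a simplified PlaceholderRange stand-in (an illustration of the behaviour the cases expect, not vLLM's implementation):

from dataclasses import dataclass


@dataclass
class _PlaceholderRange:
    offset: int
    length: int


def _argsort_by_offset(mm_positions):
    # Flatten to (modality, index-within-modality, offset), then sort by offset.
    flat = [
        (modality, idx, rng.offset)
        for modality, ranges in mm_positions.items()
        for idx, rng in enumerate(ranges)
    ]
    return [(modality, idx) for modality, idx, _ in sorted(flat, key=lambda t: t[2])]


# Mirrors the shape of the parametrized cases above (toy offsets, not taken
# from the test data).
assert _argsort_by_offset(
    {
        "image": [_PlaceholderRange(offset=7, length=4)],
        "audio": [
            _PlaceholderRange(offset=0, length=2),
            _PlaceholderRange(offset=2, length=3),
        ],
    }
) == [("audio", 0), ("audio", 1), ("image", 0)]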
@ -413,13 +409,16 @@ def test_argsort_mm_positions(case):
@pytest.mark.parametrize("num_frames", [-1, 32, 1800]) @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_allowed_media_domains(video_url: str, num_frames: int): async def test_allowed_media_domains(video_url: str, num_frames: int):
connector = MediaConnector( connector = MediaConnector(
media_io_kwargs={"video": { media_io_kwargs={
"num_frames": num_frames, "video": {
}}, "num_frames": num_frames,
}
},
allowed_media_domains=[ allowed_media_domains=[
"www.bogotobogo.com", "www.bogotobogo.com",
"github.com", "github.com",
]) ],
)
video_sync, metadata_sync = connector.fetch_video(video_url) video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(video_url) video_async, metadata_async = await connector.fetch_video_async(video_url)

View File

@ -59,48 +59,52 @@ def test_parse_raw_single_batch_string_slice(inputs_slice: slice):
) )
# yapf: disable @pytest.mark.parametrize(
@pytest.mark.parametrize('mm_processor_kwargs,expected_mm_kwargs', [ "mm_processor_kwargs,expected_mm_kwargs",
(None, [{}, {}]), [
({}, [{}, {}]), (None, [{}, {}]),
({"foo": 100}, [{"foo": 100}, {"foo": 100}]), ({}, [{}, {}]),
([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]), ({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
]) ([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
# yapf: enable ],
)
def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
"""Test mm_processor_kwargs init for zipping enc/dec prompts.""" """Test mm_processor_kwargs init for zipping enc/dec prompts."""
encoder_prompts = ['An encoder prompt', 'Another encoder prompt'] encoder_prompts = ["An encoder prompt", "Another encoder prompt"]
decoder_prompts = ['A decoder prompt', 'Another decoder prompt'] decoder_prompts = ["A decoder prompt", "Another decoder prompt"]
zipped_prompts = zip_enc_dec_prompts(encoder_prompts, decoder_prompts, zipped_prompts = zip_enc_dec_prompts(
mm_processor_kwargs) encoder_prompts, decoder_prompts, mm_processor_kwargs
)
assert len(zipped_prompts) == len(encoder_prompts) == len(decoder_prompts) assert len(zipped_prompts) == len(encoder_prompts) == len(decoder_prompts)
for enc, dec, exp_kwargs, zipped in zip(encoder_prompts, decoder_prompts, for enc, dec, exp_kwargs, zipped in zip(
expected_mm_kwargs, encoder_prompts, decoder_prompts, expected_mm_kwargs, zipped_prompts
zipped_prompts): ):
assert isinstance(zipped, dict) assert isinstance(zipped, dict)
assert len(zipped.keys()) == 3 assert len(zipped.keys()) == 3
assert zipped['encoder_prompt'] == enc assert zipped["encoder_prompt"] == enc
assert zipped['decoder_prompt'] == dec assert zipped["decoder_prompt"] == dec
assert zipped['mm_processor_kwargs'] == exp_kwargs assert zipped["mm_processor_kwargs"] == exp_kwargs
@pytest.mark.parametrize("model_id", [ @pytest.mark.parametrize(
"facebook/opt-125m", "model_id",
]) [
@pytest.mark.parametrize("prompt", [ "facebook/opt-125m",
{ ],
"prompt": "", )
"multi_modal_data": { @pytest.mark.parametrize(
"dummy": [] "prompt",
[
{
"prompt": "",
"multi_modal_data": {"dummy": []},
}, },
}, {
{ "prompt_token_ids": [],
"prompt_token_ids": [], "multi_modal_data": {"dummy": []},
"multi_modal_data": {
"dummy": []
}, },
}, ],
]) )
def test_preprocessor_text_no_mm_inputs(model_id, prompt): def test_preprocessor_text_no_mm_inputs(model_id, prompt):
model_config = ModelConfig(model=model_id) model_config = ModelConfig(model=model_id)
tokenizer = init_tokenizer_from_configs(model_config) tokenizer = init_tokenizer_from_configs(model_config)
@ -110,15 +114,19 @@ def test_preprocessor_text_no_mm_inputs(model_id, prompt):
input_preprocessor.preprocess(prompt) input_preprocessor.preprocess(prompt)
@pytest.mark.parametrize("model_id", [ @pytest.mark.parametrize(
"facebook/chameleon-7b", "model_id",
]) [
@pytest.mark.parametrize("prompt", [ "facebook/chameleon-7b",
"", ],
{ )
"prompt_token_ids": [] @pytest.mark.parametrize(
}, "prompt",
]) [
"",
{"prompt_token_ids": []},
],
)
def test_preprocessor_always_mm_code_path(model_id, prompt): def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id) model_config = ModelConfig(model=model_id)
tokenizer = init_tokenizer_from_configs(model_config) tokenizer = init_tokenizer_from_configs(model_config)

View File

@ -9,14 +9,10 @@ import pytest
import torch import torch
import torch_xla import torch_xla
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
from vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
fused_moe as torch_moe, fused_moe as torch_moe,
) )
# yapf: enable
from vllm.platforms import current_platform from vllm.platforms import current_platform
if not current_platform.is_tpu(): if not current_platform.is_tpu():

View File

@ -388,7 +388,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
assert "-O.level" in caplog_vllm.text assert "-O.level" in caplog_vllm.text
# yapf: enable
@pytest.mark.parametrize( @pytest.mark.parametrize(
"callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported", "callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
[ [
@ -408,7 +407,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
(lambda foo, **kwargs: None, "foo", True, True, False), (lambda foo, **kwargs: None, "foo", True, True, False),
], ],
) )
# yapf: disable
def test_supports_kw( def test_supports_kw(
callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
): ):
@ -681,7 +679,6 @@ def test_lru_cache():
assert 6 in cache assert 6 in cache
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("src_dtype", "tgt_dtype", "expected_result"), ("src_dtype", "tgt_dtype", "expected_result"),
[ [
@ -715,12 +712,10 @@ def test_lru_cache():
(torch.complex64, torch.complex32, False), (torch.complex64, torch.complex32, False),
], ],
) )
# yapf: enable
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result): def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("dtypes", "expected_result"), ("dtypes", "expected_result"),
[ [
@ -730,7 +725,6 @@ def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501 ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
], ],
) )
# yapf: enable
def test_common_broadcastable_dtype(dtypes, expected_result): def test_common_broadcastable_dtype(dtypes, expected_result):
assert common_broadcastable_dtype(dtypes) == expected_result assert common_broadcastable_dtype(dtypes) == expected_result
@ -775,7 +769,6 @@ def test_placeholder_module_error_handling():
_ = placeholder_attr.module _ = placeholder_attr.module
# yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
"obj,key1,key2", "obj,key1,key2",
[ [
@ -785,8 +778,8 @@ def test_placeholder_module_error_handling():
({1: "a", 2: "b"}, 1, 3), ({1: "a", 2: "b"}, 1, 3),
# Tests for both keys do not exist # Tests for both keys do not exist
({1: "a", 2: "b"}, 3, 4), ({1: "a", 2: "b"}, 3, 4),
]) ],
# yapf: enable )
def test_swap_dict_values(obj, key1, key2): def test_swap_dict_values(obj, key1, key2):
original_obj = obj.copy() original_obj = obj.copy()
swap_dict_values(obj, key1, key2) swap_dict_values(obj, key1, key2)
@ -800,26 +793,30 @@ def test_swap_dict_values(obj, key1, key2):
assert key1 not in obj assert key1 not in obj
def test_model_specification(parser_with_config, cli_config_file, def test_model_specification(
cli_config_file_with_model): parser_with_config, cli_config_file, cli_config_file_with_model
):
# Test model in CLI takes precedence over config # Test model in CLI takes precedence over config
args = parser_with_config.parse_args( args = parser_with_config.parse_args(
['serve', 'cli-model', '--config', cli_config_file_with_model]) ["serve", "cli-model", "--config", cli_config_file_with_model]
assert args.model_tag == 'cli-model' )
assert args.served_model_name == 'mymodel' assert args.model_tag == "cli-model"
assert args.served_model_name == "mymodel"
# Test model from config file works # Test model from config file works
args = parser_with_config.parse_args([ args = parser_with_config.parse_args(
'serve', [
'--config', "serve",
cli_config_file_with_model, "--config",
]) cli_config_file_with_model,
assert args.model == 'config-model' ]
assert args.served_model_name == 'mymodel' )
assert args.model == "config-model"
assert args.served_model_name == "mymodel"
# Test no model specified anywhere raises error # Test no model specified anywhere raises error
with pytest.raises(ValueError, match="No model specified!"): with pytest.raises(ValueError, match="No model specified!"):
parser_with_config.parse_args(['serve', '--config', cli_config_file]) parser_with_config.parse_args(["serve", "--config", cli_config_file])
# Test using --model option raises error # Test using --model option raises error
# with pytest.raises( # with pytest.raises(
@ -833,47 +830,52 @@ def test_model_specification(parser_with_config, cli_config_file,
# Test using --model option back-compatibility # Test using --model option back-compatibility
# (when back-compatibility ends, the above test should be uncommented # (when back-compatibility ends, the above test should be uncommented
# and the below test should be removed) # and the below test should be removed)
args = parser_with_config.parse_args([ args = parser_with_config.parse_args(
'serve', [
'--tensor-parallel-size', "serve",
'2', "--tensor-parallel-size",
'--model', "2",
'my-model', "--model",
'--trust-remote-code', "my-model",
'--port', "--trust-remote-code",
'8001', "--port",
]) "8001",
]
)
assert args.model is None assert args.model is None
assert args.tensor_parallel_size == 2 assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True assert args.trust_remote_code is True
assert args.port == 8001 assert args.port == 8001
args = parser_with_config.parse_args([ args = parser_with_config.parse_args(
'serve', [
'--tensor-parallel-size=2', "serve",
'--model=my-model', "--tensor-parallel-size=2",
'--trust-remote-code', "--model=my-model",
'--port=8001', "--trust-remote-code",
]) "--port=8001",
]
)
assert args.model is None assert args.model is None
assert args.tensor_parallel_size == 2 assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True assert args.trust_remote_code is True
assert args.port == 8001 assert args.port == 8001
# Test other config values are preserved # Test other config values are preserved
args = parser_with_config.parse_args([ args = parser_with_config.parse_args(
'serve', [
'cli-model', "serve",
'--config', "cli-model",
cli_config_file_with_model, "--config",
]) cli_config_file_with_model,
]
)
assert args.tensor_parallel_size == 2 assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True assert args.trust_remote_code is True
assert args.port == 12312 assert args.port == 12312
@pytest.mark.parametrize("input", [(), ("abc", ), (None, ), @pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
(None, bool, [1, 2, 3])])
def test_sha256(input: tuple): def test_sha256(input: tuple):
digest = sha256(input) digest = sha256(input)
assert digest is not None assert digest is not None
@ -887,7 +889,7 @@ def test_sha256(input: tuple):
assert digest == sha256(input) assert digest == sha256(input)
# hashing different input, returns different value # hashing different input, returns different value
assert digest != sha256(input + (1, )) assert digest != sha256(input + (1,))
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -897,7 +899,8 @@ def test_sha256(input: tuple):
("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")), ("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
("tcp://[::1]:5555", ("tcp", "::1", "5555")), # IPv6 address ("tcp://[::1]:5555", ("tcp", "::1", "5555")), # IPv6 address
("inproc://some_identifier", ("inproc", "some_identifier", "")), ("inproc://some_identifier", ("inproc", "some_identifier", "")),
]) ],
)
def test_split_zmq_path(path, expected): def test_split_zmq_path(path, expected):
assert split_zmq_path(path) == expected assert split_zmq_path(path) == expected
@ -909,7 +912,8 @@ def test_split_zmq_path(path, expected):
"tcp://127.0.0.1", # Missing port "tcp://127.0.0.1", # Missing port
"tcp://[::1]", # Missing port for IPv6 "tcp://[::1]", # Missing port for IPv6
"tcp://:5555", # Missing host "tcp://:5555", # Missing host
]) ],
)
def test_split_zmq_path_invalid(invalid_path): def test_split_zmq_path_invalid(invalid_path):
with pytest.raises(ValueError): with pytest.raises(ValueError):
split_zmq_path(invalid_path) split_zmq_path(invalid_path)
@ -931,8 +935,9 @@ def test_make_zmq_socket_ipv6():
zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type) zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
# Verify that the IPV6 option is set # Verify that the IPV6 option is set
assert zsock.getsockopt( assert zsock.getsockopt(zmq.IPV6) == 1, (
zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses" "IPV6 option should be enabled for IPv6 addresses"
)
# Clean up # Clean up
zsock.close() zsock.close()
@ -1019,15 +1024,14 @@ def test_convert_ids_list_to_tokens():
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
token_ids = tokenizer.encode("Hello, world!") token_ids = tokenizer.encode("Hello, world!")
# token_ids = [9707, 11, 1879, 0] # token_ids = [9707, 11, 1879, 0]
assert tokenizer.convert_ids_to_tokens(token_ids) == [ assert tokenizer.convert_ids_to_tokens(token_ids) == ["Hello", ",", "Ġworld", "!"]
'Hello', ',', 'Ġworld', '!'
]
tokens = convert_ids_list_to_tokens(tokenizer, token_ids) tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
assert tokens == ['Hello', ',', ' world', '!'] assert tokens == ["Hello", ",", " world", "!"]
def test_current_stream_multithread(): def test_current_stream_multithread():
import threading import threading
if not torch.cuda.is_available(): if not torch.cuda.is_available():
pytest.skip("CUDA not available") pytest.skip("CUDA not available")
@ -1046,13 +1050,18 @@ def test_current_stream_multithread():
child_thread.start() child_thread.start()
try: try:
assert thread_stream_ready.wait( assert thread_stream_ready.wait(timeout=5), (
timeout=5), "Child thread failed to enter stream context in time" "Child thread failed to enter stream context in time"
)
main_current_stream = current_stream() main_current_stream = current_stream()
assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread" assert main_current_stream != child_stream, (
assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream" "Main thread's current_stream was contaminated by child thread"
)
assert main_current_stream == main_default_stream, (
"Main thread's current_stream is not the default stream"
)
# Notify child thread it can exit # Notify child thread it can exit
thread_can_exit.set() thread_can_exit.set()
@ -1070,7 +1079,7 @@ def test_load_config_file(tmp_path):
"enable-logging": True, "enable-logging": True,
"list-arg": ["item1", "item2"], "list-arg": ["item1", "item2"],
"port": 12323, "port": 12323,
"tensor-parallel-size": 4 "tensor-parallel-size": 4,
} }
# Write the configuration data to a temporary YAML file # Write the configuration data to a temporary YAML file

View File

@ -16,9 +16,6 @@ from vllm.multimodal.inputs import (
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.utils import GiB_bytes, sha256, sha256_cbor from vllm.utils import GiB_bytes, sha256, sha256_cbor
from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager
# disable yapf here as it formats differently than isort such that both fail
# yapf: disable
from vllm.v1.core.kv_cache_utils import ( from vllm.v1.core.kv_cache_utils import (
BlockHash, BlockHash,
FreeKVCacheBlockQueue, FreeKVCacheBlockQueue,
@ -48,8 +45,6 @@ from vllm.v1.kv_cache_interface import (
from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.request import Request from vllm.v1.request import Request
# yapf: enable
pytestmark = pytest.mark.cpu_test pytestmark = pytest.mark.cpu_test

View File

@ -22,8 +22,6 @@ from vllm.config import VllmConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.utils import is_pin_memory_available from vllm.utils import is_pin_memory_available
# yapf: disable
from vllm.v1.sample.logits_processor import ( from vllm.v1.sample.logits_processor import (
BatchUpdate, BatchUpdate,
BatchUpdateBuilder, BatchUpdateBuilder,
@ -34,8 +32,6 @@ from vllm.v1.sample.logits_processor import (
MoveDirectionality, MoveDirectionality,
build_logitsprocs, build_logitsprocs,
) )
# yapf: enable
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
PIN_MEMORY_AVAILABLE = is_pin_memory_available() PIN_MEMORY_AVAILABLE = is_pin_memory_available()

View File

@ -7,8 +7,6 @@ from typing import Union
import pytest import pytest
from tests.utils import create_new_process_for_each_test from tests.utils import create_new_process_for_each_test
# yapf: disable
from tests.v1.logits_processors.utils import ( from tests.v1.logits_processors.utils import (
DUMMY_LOGITPROC_ARG, DUMMY_LOGITPROC_ARG,
DUMMY_LOGITPROC_FQCN, DUMMY_LOGITPROC_FQCN,
@ -24,8 +22,6 @@ from tests.v1.logits_processors.utils import (
prompts, prompts,
) )
from tests.v1.logits_processors.utils import entry_points as fake_entry_points from tests.v1.logits_processors.utils import entry_points as fake_entry_points
# yapf: enable
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.v1.sample.logits_processor import ( from vllm.v1.sample.logits_processor import (
STR_POOLING_REJECTS_LOGITSPROCS, STR_POOLING_REJECTS_LOGITSPROCS,

View File

@ -11,8 +11,6 @@ import pytest
import pytest_asyncio import pytest_asyncio
from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_test from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_test
# yapf: disable
from tests.v1.logits_processors.utils import ( from tests.v1.logits_processors.utils import (
DUMMY_LOGITPROC_ARG, DUMMY_LOGITPROC_ARG,
DUMMY_LOGITPROC_FQCN, DUMMY_LOGITPROC_FQCN,
@ -25,8 +23,6 @@ from tests.v1.logits_processors.utils import (
) )
from tests.v1.logits_processors.utils import entry_points as fake_entry_points from tests.v1.logits_processors.utils import entry_points as fake_entry_points
# yapf: enable
def _server_with_logitproc_entrypoint( def _server_with_logitproc_entrypoint(
env_dict: Optional[dict[str, str]], env_dict: Optional[dict[str, str]],

View File

@ -4,7 +4,6 @@
import importlib import importlib
from typing import TYPE_CHECKING, Callable from typing import TYPE_CHECKING, Callable
# yapf: disable
import vllm.envs as envs import vllm.envs as envs
from vllm.distributed.kv_transfer.kv_connector.base import ( from vllm.distributed.kv_transfer.kv_connector.base import (
KVConnectorBase, KVConnectorBase,
@ -13,8 +12,6 @@ from vllm.distributed.kv_transfer.kv_connector.base import (
from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
from vllm.logger import init_logger from vllm.logger import init_logger
# yapf: enable
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.kv_transfer import KVTransferConfig from vllm.config.kv_transfer import KVTransferConfig

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
import argparse import argparse
import copy import copy
import dataclasses import dataclasses
@ -88,8 +87,6 @@ from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor from vllm.utils import FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor
from vllm.v1.sample.logits_processor import LogitsProcessor from vllm.v1.sample.logits_processor import LogitsProcessor
# yapf: enable
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.executor.executor_base import ExecutorBase from vllm.executor.executor_base import ExecutorBase
from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization import QuantizationMethods

View File

@ -17,9 +17,6 @@ import jinja2.nodes
import jinja2.parser import jinja2.parser
import jinja2.sandbox import jinja2.sandbox
import transformers.utils.chat_template_utils as hf_chat_utils import transformers.utils.chat_template_utils as hf_chat_utils
# yapf conflicts with isort for this block
# yapf: disable
from openai.types.chat import ( from openai.types.chat import (
ChatCompletionAssistantMessageParam, ChatCompletionAssistantMessageParam,
ChatCompletionContentPartImageParam, ChatCompletionContentPartImageParam,
@ -40,8 +37,6 @@ from openai.types.responses import ResponseInputImageParam
from openai_harmony import Message as OpenAIHarmonyMessage from openai_harmony import Message as OpenAIHarmonyMessage
from PIL import Image from PIL import Image
from pydantic import BaseModel, ConfigDict, TypeAdapter from pydantic import BaseModel, ConfigDict, TypeAdapter
# yapf: enable
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin
# pydantic needs the TypedDict from typing_extensions # pydantic needs the TypedDict from typing_extensions
@ -52,11 +47,7 @@ from vllm.logger import init_logger
from vllm.model_executor.models import SupportsMultiModal from vllm.model_executor.models import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import MediaConnector from vllm.multimodal.utils import MediaConnector
# yapf: disable
from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
# yapf: enable
from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import random_uuid, supports_kw from vllm.utils import random_uuid, supports_kw
@ -317,11 +308,7 @@ def _is_var_or_elems_access(
): ):
return _is_var_or_elems_access(node.node, varname, key) return _is_var_or_elems_access(node.node, varname, key)
# yapf: disable return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
return (
_is_attr_access(node, varname, key) if key
else _is_var_access(node, varname)
) # yapf: enable
def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str): def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):

View File

@ -39,9 +39,6 @@ from vllm.entrypoints.chat_utils import (
parse_chat_messages, parse_chat_messages,
resolve_chat_template_content_format, resolve_chat_template_content_format,
) )
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.score_utils import ( from vllm.entrypoints.score_utils import (
ScoreContentPartParam, ScoreContentPartParam,
ScoreMultiModalParam, ScoreMultiModalParam,
@ -50,8 +47,6 @@ from vllm.entrypoints.score_utils import (
compress_token_type_ids, compress_token_type_ids,
get_score_prompt, get_score_prompt,
) )
# yapf: enable
from vllm.entrypoints.utils import _validate_truncation_size, log_non_default_args from vllm.entrypoints.utils import _validate_truncation_size, log_non_default_args
from vllm.inputs import ( from vllm.inputs import (
DataPrompt, DataPrompt,

View File

@ -49,9 +49,6 @@ from vllm.entrypoints.chat_utils import (
from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
ChatCompletionResponse, ChatCompletionResponse,
@ -84,8 +81,6 @@ from vllm.entrypoints.openai.protocol import (
TranslationResponse, TranslationResponse,
UnloadLoRAAdapterRequest, UnloadLoRAAdapterRequest,
) )
# yapf: enable
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_classification import ServingClassification from vllm.entrypoints.openai.serving_classification import ServingClassification
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion

View File

@ -11,8 +11,6 @@ from typing import Annotated, Any, ClassVar, Generic, Literal, Optional, TypeVar
import regex as re import regex as re
import torch import torch
from fastapi import HTTPException, UploadFile from fastapi import HTTPException, UploadFile
# yapf: disable
from openai.types.chat.chat_completion_audio import ( from openai.types.chat.chat_completion_audio import (
ChatCompletionAudio as OpenAIChatCompletionAudio, ChatCompletionAudio as OpenAIChatCompletionAudio,
) )
@ -46,8 +44,6 @@ from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreated
from openai.types.responses import ( from openai.types.responses import (
ResponseInProgressEvent as OpenAIResponseInProgressEvent, ResponseInProgressEvent as OpenAIResponseInProgressEvent,
) )
# yapf: enable
from openai.types.responses.response_reasoning_item import ( from openai.types.responses.response_reasoning_item import (
Content as ResponseReasoningTextContent, Content as ResponseReasoningTextContent,
) )

View File

@ -18,8 +18,6 @@ from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
# yapf: disable
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
BatchRequestInput, BatchRequestInput,
BatchRequestOutput, BatchRequestOutput,
@ -30,8 +28,6 @@ from vllm.entrypoints.openai.protocol import (
RerankResponse, RerankResponse,
ScoreResponse, ScoreResponse,
) )
# yapf: enable
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels

View File

@ -1733,13 +1733,15 @@ class OpenAIServingChat(OpenAIServing):
is a tool call with arguments. is a tool call with arguments.
""" """
# yapf: disable
return bool( return bool(
# if there is a delta message that includes tool calls which # if there is a delta message that includes tool calls which
# include a function that has arguments # include a function that has arguments
output.finish_reason is not None output.finish_reason is not None
and self.enable_auto_tools and self.tool_parser and delta_message and self.enable_auto_tools
and delta_message.tool_calls and delta_message.tool_calls[0] and self.tool_parser
and delta_message
and delta_message.tool_calls
and delta_message.tool_calls[0]
and delta_message.tool_calls[0].function and delta_message.tool_calls[0].function
and delta_message.tool_calls[0].function.arguments is not None and delta_message.tool_calls[0].function.arguments is not None
) )

View File

@ -18,8 +18,6 @@ from vllm.entrypoints.openai.protocol import (
ErrorResponse, ErrorResponse,
UsageInfo, UsageInfo,
) )
# yapf: enable
from vllm.entrypoints.openai.serving_engine import ( from vllm.entrypoints.openai.serving_engine import (
ClassificationServeContext, ClassificationServeContext,
OpenAIServing, OpenAIServing,

View File

@ -13,9 +13,6 @@ from fastapi import Request
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
CompletionLogProbs, CompletionLogProbs,
CompletionRequest, CompletionRequest,
@ -29,8 +26,6 @@ from vllm.entrypoints.openai.protocol import (
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
# yapf: enable
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig from vllm.entrypoints.renderer import RenderConfig
from vllm.entrypoints.utils import get_max_tokens from vllm.entrypoints.utils import get_max_tokens

View File

@ -14,9 +14,6 @@ from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this docstring
# yapf: disable
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
EmbeddingChatRequest, EmbeddingChatRequest,
EmbeddingCompletionRequest, EmbeddingCompletionRequest,
@ -32,8 +29,6 @@ from vllm.entrypoints.openai.serving_engine import (
ServeContext, ServeContext,
TextTokensPrompt, TextTokensPrompt,
) )
# yapf: enable
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig from vllm.entrypoints.renderer import RenderConfig
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.inputs.data import TokensPrompt as EngineTokensPrompt

View File

@ -28,9 +28,6 @@ else:
import vllm.envs as envs import vllm.envs as envs
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.chat_utils import ( from vllm.entrypoints.chat_utils import (
ChatCompletionMessageParam, ChatCompletionMessageParam,
ChatTemplateContentFormatOption, ChatTemplateContentFormatOption,
@ -72,8 +69,6 @@ from vllm.entrypoints.openai.protocol import (
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
# yapf: enable
from vllm.inputs.data import PromptType from vllm.inputs.data import PromptType
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.parse import PromptComponents, get_prompt_components from vllm.inputs.parse import PromptComponents, get_prompt_components

View File

@ -17,8 +17,6 @@ from vllm.config import VllmConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
# yapf: disable
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
ErrorResponse, ErrorResponse,
IOProcessorRequest, IOProcessorRequest,
@ -30,8 +28,6 @@ from vllm.entrypoints.openai.protocol import (
PoolingResponseData, PoolingResponseData,
UsageInfo, UsageInfo,
) )
# yapf: enable
from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig from vllm.entrypoints.renderer import RenderConfig

View File

@ -14,9 +14,6 @@ from typing import Callable, Final, Optional, Union
import jinja2 import jinja2
from fastapi import Request from fastapi import Request
# yapf conflicts with isort for this block
# yapf: disable
from openai.types.responses import ( from openai.types.responses import (
ResponseCodeInterpreterCallCodeDeltaEvent, ResponseCodeInterpreterCallCodeDeltaEvent,
ResponseCodeInterpreterCallCodeDoneEvent, ResponseCodeInterpreterCallCodeDoneEvent,
@ -46,8 +43,6 @@ from openai.types.responses import (
response_text_delta_event, response_text_delta_event,
) )
from openai.types.responses.response_output_text import Logprob, LogprobTopLogprob from openai.types.responses.response_output_text import Logprob, LogprobTopLogprob
# yapf: enable
from openai.types.responses.response_reasoning_item import ( from openai.types.responses.response_reasoning_item import (
Content as ResponseReasoningTextContent, Content as ResponseReasoningTextContent,
) )
@ -78,9 +73,6 @@ from vllm.entrypoints.harmony_utils import (
render_for_completion, render_for_completion,
) )
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
DeltaMessage, DeltaMessage,
ErrorResponse, ErrorResponse,
@ -97,8 +89,6 @@ from vllm.entrypoints.openai.protocol import (
ResponseUsage, ResponseUsage,
StreamingResponsesResponse, StreamingResponsesResponse,
) )
# yapf: enable
from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.tool_server import ToolServer from vllm.entrypoints.tool_server import ToolServer

View File

@ -24,9 +24,6 @@ from vllm.entrypoints.openai.protocol import (
) )
from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.score_utils import ( from vllm.entrypoints.score_utils import (
ScoreContentPartParam, ScoreContentPartParam,
ScoreMultiModalParam, ScoreMultiModalParam,
@ -35,8 +32,6 @@ from vllm.entrypoints.score_utils import (
compress_token_type_ids, compress_token_type_ids,
get_score_prompt, get_score_prompt,
) )
# yapf: enable
from vllm.entrypoints.utils import _validate_truncation_size from vllm.entrypoints.utils import _validate_truncation_size
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger from vllm.logger import init_logger

View File

@ -10,9 +10,6 @@ from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
DetokenizeRequest, DetokenizeRequest,
DetokenizeResponse, DetokenizeResponse,
@ -22,8 +19,6 @@ from vllm.entrypoints.openai.protocol import (
TokenizeResponse, TokenizeResponse,
TokenizerInfoResponse, TokenizerInfoResponse,
) )
# yapf: enable
from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig from vllm.entrypoints.renderer import RenderConfig

View File

@ -11,7 +11,7 @@ import cloudpickle
import msgspec import msgspec
import vllm.envs as envs import vllm.envs as envs
from vllm.executor.executor_base import DistributedExecutorBase # yapf: disable from vllm.executor.executor_base import DistributedExecutorBase
from vllm.executor.msgspec_utils import encode_hook from vllm.executor.msgspec_utils import encode_hook
from vllm.executor.ray_utils import RayWorkerWrapper, initialize_ray_cluster, ray from vllm.executor.ray_utils import RayWorkerWrapper, initialize_ray_cluster, ray
from vllm.logger import init_logger from vllm.logger import init_logger

View File

@ -8,8 +8,6 @@ from transformers import PretrainedConfig
from vllm.config.lora import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.distributed.utils import divide from vllm.distributed.utils import divide
# yapf: disable
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
ColumnParallelLinear, ColumnParallelLinear,
LinearBase, LinearBase,
@ -23,7 +21,6 @@ from .utils import _get_lora_device
class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
def __init__(self, base_layer: LinearBase): def __init__(self, base_layer: LinearBase):
super().__init__() super().__init__()
self.base_layer = base_layer self.base_layer = base_layer
@ -50,16 +47,20 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
lora_b_out_size = self.output_size lora_b_out_size = self.output_size
elif isinstance(self.base_layer, ColumnParallelLinear): elif isinstance(self.base_layer, ColumnParallelLinear):
lora_a_out_size = (lora_config.max_lora_rank if lora_a_out_size = (
not lora_config.fully_sharded_loras else divide( lora_config.max_lora_rank
lora_config.max_lora_rank, self.tp_size)) if not lora_config.fully_sharded_loras
else divide(lora_config.max_lora_rank, self.tp_size)
)
lora_b_out_size = self.output_size lora_b_out_size = self.output_size
elif isinstance(self.base_layer, RowParallelLinear): elif isinstance(self.base_layer, RowParallelLinear):
lora_a_out_size = lora_config.max_lora_rank lora_a_out_size = lora_config.max_lora_rank
lora_b_out_size = (self.output_size if lora_b_out_size = (
not lora_config.fully_sharded_loras else divide( self.output_size
self.output_size, self.tp_size)) if not lora_config.fully_sharded_loras
else divide(self.output_size, self.tp_size)
)
else: else:
raise NotImplementedError raise NotImplementedError
@ -71,7 +72,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
self.input_size, self.input_size,
dtype=lora_config.lora_dtype, dtype=lora_config.lora_dtype,
device=self.device, device=self.device,
) for _ in range(self.n_slices)) )
for _ in range(self.n_slices)
)
self.lora_b_stacked = tuple( self.lora_b_stacked = tuple(
torch.zeros( torch.zeros(
max_loras, max_loras,
@ -80,7 +83,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
lora_config.max_lora_rank, lora_config.max_lora_rank,
dtype=lora_config.lora_dtype, dtype=lora_config.lora_dtype,
device=self.device, device=self.device,
) for _ in range(self.n_slices)) )
for _ in range(self.n_slices)
)
if lora_config.bias_enabled: if lora_config.bias_enabled:
lora_bias_out_size = lora_b_out_size lora_bias_out_size = lora_b_out_size
self.lora_bias_stacked = tuple( self.lora_bias_stacked = tuple(
@ -90,8 +95,10 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
lora_bias_out_size, lora_bias_out_size,
dtype=lora_config.lora_dtype, dtype=lora_config.lora_dtype,
device=self.device, device=self.device,
) for _ in range(self.n_slices)) )
self.output_slices = (self.lora_b_stacked[0].shape[2], ) for _ in range(self.n_slices)
)
self.output_slices = (self.lora_b_stacked[0].shape[2],)
def reset_lora(self, index: int): def reset_lora(self, index: int):
for s_index in range(self.n_slices): for s_index in range(self.n_slices):
@ -99,8 +106,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
self.lora_b_stacked[s_index][index] = 0 self.lora_b_stacked[s_index][index] = 0
if self.lora_config.bias_enabled: if self.lora_config.bias_enabled:
# Make mypy happy # Make mypy happy
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked = cast(
self.lora_bias_stacked) tuple[torch.Tensor, ...], self.lora_bias_stacked
)
self.lora_bias_stacked[s_index][index] = 0 self.lora_bias_stacked[s_index][index] = 0
def set_lora( def set_lora(
@ -115,8 +123,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
# MergedColumnParallelLinearWithLoRA, all other linear LoRA layers # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
# store weights in a tuple of size 1. These two layers will # store weights in a tuple of size 1. These two layers will
# override this function. # override this function.
assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == assert (
self.n_slices == 1) len(self.lora_a_stacked) == len(self.lora_b_stacked) == self.n_slices == 1
)
self.reset_lora(index) self.reset_lora(index)
if self.tp_size > 1: if self.tp_size > 1:
@ -125,23 +134,24 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
if lora_bias is not None: if lora_bias is not None:
lora_bias = self.slice_bias(lora_bias) lora_bias = self.slice_bias(lora_bias)
self.lora_a_stacked[0][index, self.lora_a_stacked[0][index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
0, :lora_a.shape[0], :lora_a.shape[1]].copy_( lora_a, non_blocking=True
lora_a, non_blocking=True) )
self.lora_b_stacked[0][index, self.lora_b_stacked[0][index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
0, :lora_b.shape[0], :lora_b.shape[1]].copy_( lora_b, non_blocking=True
lora_b, non_blocking=True) )
if lora_bias is not None: if lora_bias is not None:
self.lora_bias_stacked = cast(
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], tuple[torch.Tensor, ...], self.lora_bias_stacked
self.lora_bias_stacked) )
assert len(self.lora_bias_stacked) assert len(self.lora_bias_stacked)
self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( self.lora_bias_stacked[0][index, 0, : lora_bias.shape[0]].copy_(
lora_bias, non_blocking=True) lora_bias, non_blocking=True
)
def apply(self, def apply(
x: torch.Tensor, self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
bias: Optional[torch.Tensor] = None) -> torch.Tensor: ) -> torch.Tensor:
output = self.base_layer.quant_method.apply(self.base_layer, x, bias) output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
# In transformers backend, x and output have extra batch dimension like # In transformers backend, x and output have extra batch dimension like
@ -151,10 +161,15 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
output = output.flatten(0, 1) output = output.flatten(0, 1)
x = x.flatten(0, 1) x = x.flatten(0, 1)
lora_output: Optional[ lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_lora_linear(
torch.Tensor] = self.punica_wrapper.add_lora_linear( output,
output, x, self.lora_a_stacked, self.lora_b_stacked, x,
self.lora_bias_stacked, 1.0, self.output_slices) self.lora_a_stacked,
self.lora_b_stacked,
self.lora_bias_stacked,
1.0,
self.output_slices,
)
if not current_platform.can_update_inplace(): if not current_platform.can_update_inplace():
output = lora_output output = lora_output
@ -162,7 +177,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
@property @property
def weight(self) -> torch.Tensor: def weight(self) -> torch.Tensor:
# unquantizedLinear # unquantizedLinear
if hasattr(self.base_layer, "weight"): if hasattr(self.base_layer, "weight"):
return self.base_layer.weight return self.base_layer.weight

View File

@ -12,8 +12,6 @@ from vllm.distributed import (
split_tensor_along_last_dim, split_tensor_along_last_dim,
tensor_model_parallel_all_reduce, tensor_model_parallel_all_reduce,
) )
# yapf: disable
from vllm.model_executor.layers.linear import RowParallelLinear from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.platforms import current_platform from vllm.platforms import current_platform
@ -22,7 +20,6 @@ from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
def __init__(self, base_layer: RowParallelLinear) -> None: def __init__(self, base_layer: RowParallelLinear) -> None:
super().__init__(base_layer) super().__init__(base_layer)
@ -33,11 +30,10 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
self.n_slices = 1 self.n_slices = 1
def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
shard_size = self.input_size shard_size = self.input_size
start_idx = self.tp_rank * shard_size start_idx = self.tp_rank * shard_size
end_idx = (self.tp_rank + 1) * shard_size end_idx = (self.tp_rank + 1) * shard_size
lora_a = lora_a[:,start_idx:end_idx] lora_a = lora_a[:, start_idx:end_idx]
return lora_a return lora_a
def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
@ -66,7 +62,8 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
else: else:
# TODO: simplify code below # TODO: simplify code below
splitted_input = split_tensor_along_last_dim( splitted_input = split_tensor_along_last_dim(
input_, num_partitions=self.tp_size) input_, num_partitions=self.tp_size
)
input_parallel = splitted_input[self.tp_rank].contiguous() input_parallel = splitted_input[self.tp_rank].contiguous()
# Matrix multiply. # Matrix multiply.
@ -77,8 +74,11 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
output_ = output_parallel output_ = output_parallel
if not self.base_layer.skip_bias_add: if not self.base_layer.skip_bias_add:
output = (output_ + self.base_layer.bias output = (
if self.base_layer.bias is not None else output_) output_ + self.base_layer.bias
if self.base_layer.bias is not None
else output_
)
output_bias = None output_bias = None
else: else:
output = output_ output = output_
@ -101,11 +101,11 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
return type(source_layer) is RowParallelLinear return type(source_layer) is RowParallelLinear
# The following layer is based on the tensor parallelism strategy given in # The following layer is based on the tensor parallelism strategy given in
# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023, # Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
# https://arxiv.org/abs/2311.03285. # https://arxiv.org/abs/2311.03285.
class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
""" """
Differs from RowParallelLinearWithLoRA by slicing the Differs from RowParallelLinearWithLoRA by slicing the
@ -120,28 +120,26 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
shard_size = self.lora_b_stacked[0].shape[2] shard_size = self.lora_b_stacked[0].shape[2]
start_idx = self.tp_rank * shard_size start_idx = self.tp_rank * shard_size
end_idx = (self.tp_rank + 1) * shard_size end_idx = (self.tp_rank + 1) * shard_size
lora_b = lora_b[ start_idx:end_idx,:] lora_b = lora_b[start_idx:end_idx, :]
return lora_b return lora_b
def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
if bias is None: if bias is None:
return bias return bias
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked)
self.lora_bias_stacked)
shard_size = self.lora_bias_stacked[0].shape[2] shard_size = self.lora_bias_stacked[0].shape[2]
start_idx = self.tp_rank * shard_size start_idx = self.tp_rank * shard_size
end_idx = (self.tp_rank + 1) * shard_size end_idx = (self.tp_rank + 1) * shard_size
bias = bias[start_idx:end_idx] bias = bias[start_idx:end_idx]
return bias return bias
def apply(self, def apply(
x: torch.Tensor, self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
bias: Optional[torch.Tensor] = None) -> torch.Tensor: ) -> torch.Tensor:
output = self.base_layer.quant_method.apply(self.base_layer, x) output = self.base_layer.quant_method.apply(self.base_layer, x)
x = x.view(-1, x.shape[-1]) x = x.view(-1, x.shape[-1])
output, out_orig_shape = output.view(-1, output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
output.shape[-1]), output.shape
buffer = torch.zeros( buffer = torch.zeros(
(self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
dtype=torch.float32, dtype=torch.float32,
@ -149,10 +147,11 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
) )
shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink( shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink(
buffer, x, self.lora_a_stacked, 1.0) buffer, x, self.lora_a_stacked, 1.0
)
if not current_platform.can_update_inplace(): if not current_platform.can_update_inplace():
buffer = shrunk_buffer buffer = shrunk_buffer
if self.tp_size>1: if self.tp_size > 1:
buffer = tensor_model_parallel_all_reduce(buffer) buffer = tensor_model_parallel_all_reduce(buffer)
# following S-LoRA, allows the fusing of all_gather and all_reduce # following S-LoRA, allows the fusing of all_gather and all_reduce
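Note on the sharding logic shown in this file: slice_lora_a keeps only the input-dimension slice owned by the current tensor-parallel rank, and the later all-reduce sums the per-rank partial shrink results before the expand with lora_b. A minimal, self-contained sketch of that arithmetic (shapes, names, and tp_size are illustrative assumptions, not values taken from this diff):

import torch

def lora_delta_row_parallel(x, lora_a, lora_b, tp_size):
    # x: (tokens, input_size); lora_a: (rank, input_size); lora_b: (out, rank).
    # Each TP rank would hold one column slice of lora_a; the loop stands in
    # for the per-rank shrink followed by tensor_model_parallel_all_reduce.
    shard = x.shape[-1] // tp_size
    partial = torch.zeros(x.shape[0], lora_a.shape[0])
    for tp_rank in range(tp_size):
        xs = x[:, tp_rank * shard : (tp_rank + 1) * shard]
        a_slice = lora_a[:, tp_rank * shard : (tp_rank + 1) * shard]
        partial += xs @ a_slice.T      # shrink on this rank, summed across ranks
    return partial @ lora_b.T          # expand with the full lora_b

x = torch.randn(4, 16)
a, b = torch.randn(8, 16), torch.randn(32, 8)
assert torch.allclose(lora_delta_row_parallel(x, a, b, tp_size=4), x @ a.T @ b.T, atol=1e-4)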


@ -19,8 +19,6 @@ from vllm.config.lora import LoRAConfig
from vllm.logger import init_logger from vllm.logger import init_logger
# being imported for _all_lora_classes below # being imported for _all_lora_classes below
# yapf conflicts with isort for this block
# yapf: disable
from vllm.lora.layers import ( from vllm.lora.layers import (
BaseLayerWithLoRA, BaseLayerWithLoRA,
ColumnParallelLinearWithLoRA, ColumnParallelLinearWithLoRA,
@ -39,8 +37,6 @@ from vllm.lora.layers import (
) )
from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.layers.linear import LinearBase
# yapf: enable
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead


@ -14,8 +14,6 @@ import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.logger import init_logger from vllm.logger import init_logger
# yapf: disable
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FUSED_MOE_UNQUANTIZED_CONFIG, FUSED_MOE_UNQUANTIZED_CONFIG,
FusedMoEQuantConfig, FusedMoEQuantConfig,
@ -25,8 +23,6 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import (
_valid_cutlass_block_scaled_grouped_gemm, _valid_cutlass_block_scaled_grouped_gemm,
run_cutlass_block_scaled_fused_experts, run_cutlass_block_scaled_fused_experts,
) )
# yapf: enable
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
_valid_deep_gemm, _valid_deep_gemm,
deep_gemm_moe_fp8, deep_gemm_moe_fp8,


@ -24,8 +24,6 @@ from vllm.distributed.eplb.eplb_state import EplbState
from vllm.forward_context import ForwardContext, get_forward_context from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import CustomOp
# yapf: disable
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FUSED_MOE_UNQUANTIZED_CONFIG, FUSED_MOE_UNQUANTIZED_CONFIG,
FusedMoEConfig, FusedMoEConfig,
@ -34,8 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import (
biased_moe_quant_config, biased_moe_quant_config,
) )
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
# yapf: enable
from vllm.model_executor.layers.fused_moe.modular_kernel import ( from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEActivationFormat, FusedMoEActivationFormat,
FusedMoEModularKernel, FusedMoEModularKernel,


@ -10,7 +10,7 @@ import torch
import vllm.envs as envs import vllm.envs as envs
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.utils import ( # yapf: disable from vllm.model_executor.layers.fused_moe.utils import (
_resize_cache, _resize_cache,
count_expert_num_tokens, count_expert_num_tokens,
) )


@ -24,8 +24,6 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizeMethodBase, QuantizeMethodBase,
) )
from vllm.model_executor.layers.utils import dispatch_unquantized_gemm from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
# yapf: disable
from vllm.model_executor.parameter import ( from vllm.model_executor.parameter import (
BasevLLMParameter, BasevLLMParameter,
BlockQuantScaleParameter, BlockQuantScaleParameter,
@ -35,8 +33,6 @@ from vllm.model_executor.parameter import (
PerTensorScaleParameter, PerTensorScaleParameter,
RowvLLMParameter, RowvLLMParameter,
) )
# yapf: enable
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import GiB_bytes from vllm.utils import GiB_bytes


@ -17,17 +17,12 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
from vllm.model_executor.layers.quantization.utils.marlin_utils import ( from vllm.model_executor.layers.quantization.utils.marlin_utils import (
marlin_repeat_scales_on_all_ranks, marlin_repeat_scales_on_all_ranks,
) )
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.parameter import ( from vllm.model_executor.parameter import (
BasevLLMParameter, BasevLLMParameter,
ChannelQuantScaleParameter, ChannelQuantScaleParameter,
GroupQuantScaleParameter, GroupQuantScaleParameter,
PackedvLLMParameter, PackedvLLMParameter,
) )
# yapf: enable
from vllm.scalar_type import scalar_types from vllm.scalar_type import scalar_types
logger = init_logger(__name__) logger = init_logger(__name__)


@ -17,9 +17,6 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
from vllm.model_executor.layers.quantization.utils.marlin_utils import ( from vllm.model_executor.layers.quantization.utils.marlin_utils import (
marlin_repeat_scales_on_all_ranks, marlin_repeat_scales_on_all_ranks,
) )
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.parameter import ( from vllm.model_executor.parameter import (
BasevLLMParameter, BasevLLMParameter,
ChannelQuantScaleParameter, ChannelQuantScaleParameter,
@ -28,8 +25,6 @@ from vllm.model_executor.parameter import (
PackedvLLMParameter, PackedvLLMParameter,
RowvLLMParameter, RowvLLMParameter,
) )
# yapf: enable
from vllm.scalar_type import scalar_types from vllm.scalar_type import scalar_types
logger = init_logger(__name__) logger = init_logger(__name__)


@ -22,8 +22,6 @@ from vllm.distributed import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
) )
# yapf: enable
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
@ -51,8 +49,6 @@ from vllm.model_executor.utils import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
# yapf conflicts with isort for this block
logger = init_logger(__name__) logger = init_logger(__name__)


@ -39,13 +39,10 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
# yapf: disable
from .idefics2_vision_model import Idefics2VisionConfig from .idefics2_vision_model import Idefics2VisionConfig
from .idefics2_vision_model import ( from .idefics2_vision_model import (
Idefics2VisionTransformer as Idefics3VisionTransformer, Idefics2VisionTransformer as Idefics3VisionTransformer,
) )
# yapf: enable
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant
from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
from .utils import ( from .utils import (


@ -22,8 +22,6 @@ from vllm.multimodal.inputs import (
MultiModalKwargsItems, MultiModalKwargsItems,
) )
from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
# yapf: disable
from vllm.multimodal.processing import ( from vllm.multimodal.processing import (
BaseMultiModalProcessor, BaseMultiModalProcessor,
BaseProcessingInfo, BaseProcessingInfo,
@ -35,8 +33,6 @@ from vllm.multimodal.processing import (
PromptUpdateDetails, PromptUpdateDetails,
replace_token_matches, replace_token_matches,
) )
# yapf: enable
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape


@ -6,14 +6,16 @@ from typing import Annotated, Any, Literal, Optional, Union, cast
import numpy as np import numpy as np
import torch import torch
# yapf: disable
from torch import nn from torch import nn
from transformers import AutoModel, BatchFeature from transformers import AutoModel, BatchFeature
from transformers.models.gemma3n import (Gemma3nAudioConfig, from transformers.models.gemma3n import (
Gemma3nAudioFeatureExtractor, Gemma3nAudioConfig,
Gemma3nConfig, Gemma3nProcessor, Gemma3nAudioFeatureExtractor,
Gemma3nTextConfig, Gemma3nConfig,
Gemma3nVisionConfig) Gemma3nProcessor,
Gemma3nTextConfig,
Gemma3nVisionConfig,
)
from transformers.models.siglip import SiglipImageProcessorFast from transformers.models.siglip import SiglipImageProcessorFast
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
@ -22,25 +24,32 @@ from vllm.inputs.data import PromptType
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import RowParallelLinear from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
VocabParallelEmbedding)
from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (
MultiModalKwargsItems) MultiModalDataDict,
from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems, MultiModalFieldConfig,
MultiModalDataParser) MultiModalKwargsItems,
from vllm.multimodal.processing import (BaseMultiModalProcessor, )
BaseProcessingInfo, from vllm.multimodal.parse import (
MultiModalPromptUpdates, ImageProcessorItems,
MultiModalPromptUpdatesApplyResult, MultiModalDataItems,
PlaceholderFeaturesInfo, MultiModalDataParser,
PromptReplacement, PromptUpdate, )
PromptUpdateDetails, from vllm.multimodal.processing import (
replace_token_matches) BaseMultiModalProcessor,
# yapf: enable BaseProcessingInfo,
MultiModalPromptUpdates,
MultiModalPromptUpdatesApplyResult,
PlaceholderFeaturesInfo,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
replace_token_matches,
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape


@ -43,9 +43,6 @@ from vllm.multimodal.inputs import (
MultiModalKwargsItems, MultiModalKwargsItems,
) )
from vllm.multimodal.parse import ImageProcessorItems, ImageSize from vllm.multimodal.parse import ImageProcessorItems, ImageSize
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import ( from vllm.multimodal.processing import (
BaseMultiModalProcessor, BaseMultiModalProcessor,
BaseProcessingInfo, BaseProcessingInfo,
@ -54,18 +51,13 @@ from vllm.multimodal.processing import (
PromptUpdate, PromptUpdate,
PromptUpdateDetails, PromptUpdateDetails,
) )
# yapf: enable
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
# yapf: disable
from .idefics2_vision_model import ( from .idefics2_vision_model import (
Idefics2VisionTransformer as Idefics3VisionTransformer, Idefics2VisionTransformer as Idefics3VisionTransformer,
) )
# yapf: enable
from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
from .llama import LlamaModel from .llama import LlamaModel
from .utils import AutoWeightsLoader, maybe_prefix from .utils import AutoWeightsLoader, maybe_prefix


@ -45,9 +45,6 @@ from vllm.multimodal.parse import (
ImageSize, ImageSize,
MultiModalDataItems, MultiModalDataItems,
) )
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import ( from vllm.multimodal.processing import (
BaseMultiModalProcessor, BaseMultiModalProcessor,
BaseProcessingInfo, BaseProcessingInfo,
@ -57,8 +54,6 @@ from vllm.multimodal.processing import (
PromptUpdate, PromptUpdate,
ResolvedPromptUpdate, ResolvedPromptUpdate,
) )
# yapf: enable
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of from vllm.utils import is_list_of


@ -52,16 +52,12 @@ from vllm.distributed import utils as dist_utils
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.activation import get_act_and_mul_fn
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
# yapf: disable
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
ColumnParallelLinear, ColumnParallelLinear,
MergedColumnParallelLinear, MergedColumnParallelLinear,
QKVParallelLinear, QKVParallelLinear,
RowParallelLinear, RowParallelLinear,
) )
# yapf: enable
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys


@ -37,12 +37,7 @@ from vllm.model_executor.layers.fla.ops import (
fused_recurrent_gated_delta_rule, fused_recurrent_gated_delta_rule,
) )
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import FusedMoE
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.layers.layernorm import GemmaRMSNorm as Qwen3NextRMSNorm from vllm.model_executor.layers.layernorm import GemmaRMSNorm as Qwen3NextRMSNorm
# yapf: enable
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
ColumnParallelLinear, ColumnParallelLinear,
QKVParallelLinear, QKVParallelLinear,


@ -54,7 +54,6 @@ from .interfaces_base import (
logger = init_logger(__name__) logger = init_logger(__name__)
# yapf: disable
_TEXT_GENERATION_MODELS = { _TEXT_GENERATION_MODELS = {
# [Decoder-only] # [Decoder-only]
"ApertusForCausalLM": ("apertus", "ApertusForCausalLM"), "ApertusForCausalLM": ("apertus", "ApertusForCausalLM"),
@ -106,8 +105,8 @@ _TEXT_GENERATION_MODELS = {
"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
"GraniteForCausalLM": ("granite", "GraniteForCausalLM"), "GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
"GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"), "GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501 "GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501 "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
"GritLM": ("gritlm", "GritLM"), "GritLM": ("gritlm", "GritLM"),
"Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"), "Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
"HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"), "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
@ -127,7 +126,7 @@ _TEXT_GENERATION_MODELS = {
"LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"), "LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"),
"MambaForCausalLM": ("mamba", "MambaForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"),
"FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"), "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
"FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"), "FalconH1ForCausalLM": ("falcon_h1", "FalconH1ForCausalLM"),
"Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"), "Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"),
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
"MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
@ -184,7 +183,8 @@ _EMBEDDING_MODELS = {
"LlamaModel": ("llama", "LlamaForCausalLM"), "LlamaModel": ("llama", "LlamaForCausalLM"),
**{ **{
# Multiple models share the same architecture, so we include them all # Multiple models share the same architecture, so we include them all
k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items() k: (mod, arch)
for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
if arch == "LlamaForCausalLM" if arch == "LlamaForCausalLM"
}, },
"MistralModel": ("llama", "LlamaForCausalLM"), "MistralModel": ("llama", "LlamaForCausalLM"),
@ -201,7 +201,10 @@ _EMBEDDING_MODELS = {
"XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"),
# [Multimodal] # [Multimodal]
"CLIPModel": ("clip", "CLIPEmbeddingModel"), "CLIPModel": ("clip", "CLIPEmbeddingModel"),
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextForConditionalGeneration": (
"llava_next",
"LlavaNextForConditionalGeneration",
), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
# Technically Terratorch models work on images, both in # Technically Terratorch models work on images, both in
@ -214,79 +217,150 @@ _EMBEDDING_MODELS = {
_CROSS_ENCODER_MODELS = { _CROSS_ENCODER_MODELS = {
"BertForSequenceClassification": ("bert", "BertForSequenceClassification"), "BertForSequenceClassification": ("bert", "BertForSequenceClassification"),
"BertForTokenClassification": ("bert", "BertForTokenClassification"), "BertForTokenClassification": ("bert", "BertForTokenClassification"),
"GteNewForSequenceClassification": ("bert_with_rope", "GteNewForSequenceClassification": (
"GteNewForSequenceClassification"), "bert_with_rope",
"ModernBertForSequenceClassification": ("modernbert", "GteNewForSequenceClassification",
"ModernBertForSequenceClassification"), ),
"RobertaForSequenceClassification": ("roberta", "ModernBertForSequenceClassification": (
"RobertaForSequenceClassification"), "modernbert",
"XLMRobertaForSequenceClassification": ("roberta", "ModernBertForSequenceClassification",
"RobertaForSequenceClassification"), ),
"RobertaForSequenceClassification": ("roberta", "RobertaForSequenceClassification"),
"XLMRobertaForSequenceClassification": (
"roberta",
"RobertaForSequenceClassification",
),
# [Auto-converted (see adapters.py)] # [Auto-converted (see adapters.py)]
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501, "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
} }
_MULTIMODAL_MODELS = { _MULTIMODAL_MODELS = {
# [Decoder-only] # [Decoder-only]
"AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"), "AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
"AyaVisionForConditionalGeneration": ("aya_vision", "AyaVisionForConditionalGeneration"), # noqa: E501 "AyaVisionForConditionalGeneration": (
"aya_vision",
"AyaVisionForConditionalGeneration",
), # noqa: E501
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "ChameleonForConditionalGeneration": (
"Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501 "chameleon",
"ChameleonForConditionalGeneration",
), # noqa: E501
"Cohere2VisionForConditionalGeneration": (
"cohere2_vision",
"Cohere2VisionForConditionalGeneration",
), # noqa: E501
"DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
"DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"), "DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"),
"Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"), # noqa: E501 "Ernie4_5_VLMoeForConditionalGeneration": (
"ernie45_vl",
"Ernie4_5_VLMoeForConditionalGeneration",
), # noqa: E501
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501
"Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501 "Gemma3nForConditionalGeneration": (
"gemma3n_mm",
"Gemma3nForConditionalGeneration",
), # noqa: E501
"GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
"Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501
"Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"), # noqa: E501 "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"), # noqa: E501
"GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501 "GraniteSpeechForConditionalGeneration": (
"granite_speech",
"GraniteSpeechForConditionalGeneration",
), # noqa: E501
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
"InternVLChatModel": ("internvl", "InternVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"),
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"), "NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
"InternS1ForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501 "InternS1ForConditionalGeneration": (
"InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501 "interns1",
"Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "InternS1ForConditionalGeneration",
"SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501 ), # noqa: E501
"InternVLForConditionalGeneration": (
"interns1",
"InternS1ForConditionalGeneration",
), # noqa: E501
"Idefics3ForConditionalGeneration": (
"idefics3",
"Idefics3ForConditionalGeneration",
),
"SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"), # noqa: E501
"KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"), "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
"KeyeVL1_5ForConditionalGeneration": ("keye_vl1_5", "KeyeVL1_5ForConditionalGeneration"), # noqa: E501 "KeyeVL1_5ForConditionalGeneration": (
"keye_vl1_5",
"KeyeVL1_5ForConditionalGeneration",
), # noqa: E501
"RForConditionalGeneration": ("rvl", "RForConditionalGeneration"), "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
"Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"), "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
"Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501 "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextForConditionalGeneration": (
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 "llava_next",
"LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 "LlavaNextForConditionalGeneration",
), # noqa: E501
"LlavaNextVideoForConditionalGeneration": (
"llava_next_video",
"LlavaNextVideoForConditionalGeneration",
), # noqa: E501
"LlavaOnevisionForConditionalGeneration": (
"llava_onevision",
"LlavaOnevisionForConditionalGeneration",
), # noqa: E501
"MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501 "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501
"MiDashengLMModel": ("midashenglm", "MiDashengLMModel"), "MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
"MiniMaxVL01ForConditionalGeneration": ("minimax_vl_01", "MiniMaxVL01ForConditionalGeneration"), # noqa: E501 "MiniMaxVL01ForConditionalGeneration": (
"minimax_vl_01",
"MiniMaxVL01ForConditionalGeneration",
), # noqa: E501
"MiniCPMO": ("minicpmo", "MiniCPMO"), "MiniCPMO": ("minicpmo", "MiniCPMO"),
"MiniCPMV": ("minicpmv", "MiniCPMV"), "MiniCPMV": ("minicpmv", "MiniCPMV"),
"Mistral3ForConditionalGeneration": ("mistral3", "Mistral3ForConditionalGeneration"), # noqa: E501 "Mistral3ForConditionalGeneration": (
"mistral3",
"Mistral3ForConditionalGeneration",
), # noqa: E501
"MolmoForCausalLM": ("molmo", "MolmoForCausalLM"), "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
"NVLM_D": ("nvlm_d", "NVLM_D_Model"), "NVLM_D": ("nvlm_d", "NVLM_D_Model"),
"Ovis": ("ovis", "Ovis"), "Ovis": ("ovis", "Ovis"),
"Ovis2_5": ("ovis2_5", "Ovis2_5"), "Ovis2_5": ("ovis2_5", "Ovis2_5"),
"PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501 "PaliGemmaForConditionalGeneration": (
"paligemma",
"PaliGemmaForConditionalGeneration",
), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"), "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
"Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"), # noqa: E501 "Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"), # noqa: E501
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501
"QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501 "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": (
"Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501 "qwen2_5_vl",
"Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration",
"Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 ), # noqa: E501
"Qwen2AudioForConditionalGeneration": (
"qwen2_audio",
"Qwen2AudioForConditionalGeneration",
), # noqa: E501
"Qwen2_5OmniModel": (
"qwen2_5_omni_thinker",
"Qwen2_5OmniThinkerForConditionalGeneration",
), # noqa: E501
"Qwen2_5OmniForConditionalGeneration": (
"qwen2_5_omni_thinker",
"Qwen2_5OmniThinkerForConditionalGeneration",
), # noqa: E501
"Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"), # noqa: E501 "Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"), # noqa: E501
"Qwen3VLMoeForConditionalGeneration": ("qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"), # noqa: E501 "Qwen3VLMoeForConditionalGeneration": (
"qwen3_vl_moe",
"Qwen3VLMoeForConditionalGeneration",
), # noqa: E501
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"), "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
"Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501 "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501 "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
"Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501 "Tarsier2ForConditionalGeneration": (
"qwen2_vl",
"Tarsier2ForConditionalGeneration",
), # noqa: E501
"UltravoxModel": ("ultravox", "UltravoxModel"), "UltravoxModel": ("ultravox", "UltravoxModel"),
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
# [Encoder-decoder] # [Encoder-decoder]
@ -324,13 +398,27 @@ _TRANSFORMERS_BACKEND_MODELS = {
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501 "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
"TransformersMoEForCausalLM": ("transformers_moe", "TransformersMoEForCausalLM"), # noqa: E501 "TransformersMoEForCausalLM": ("transformers_moe", "TransformersMoEForCausalLM"), # noqa: E501
"TransformersMoEForMultimodalLM": ("transformers_moe", "TransformersMoEForMultimodalLM"), # noqa: E501 "TransformersMoEForMultimodalLM": (
"TransformersEmbeddingModel": ("transformers_pooling", "TransformersEmbeddingModel"), # noqa: E501 "transformers_moe",
"TransformersForSequenceClassification": ("transformers_pooling", "TransformersForSequenceClassification"), # noqa: E501 "TransformersMoEForMultimodalLM",
"TransformersMoEForSequenceClassification": ("transformers_pooling", "TransformersMoEForSequenceClassification"), # noqa: E501 ), # noqa: E501
"TransformersMoEEmbeddingModel": ("transformers_pooling", "TransformersMoEEmbeddingModel"), # noqa: E501 "TransformersEmbeddingModel": (
"transformers_pooling",
"TransformersEmbeddingModel",
), # noqa: E501
"TransformersForSequenceClassification": (
"transformers_pooling",
"TransformersForSequenceClassification",
), # noqa: E501
"TransformersMoEForSequenceClassification": (
"transformers_pooling",
"TransformersMoEForSequenceClassification",
), # noqa: E501
"TransformersMoEEmbeddingModel": (
"transformers_pooling",
"TransformersMoEEmbeddingModel",
), # noqa: E501
} }
# yapf: enable
_VLLM_MODELS = { _VLLM_MODELS = {
**_TEXT_GENERATION_MODELS, **_TEXT_GENERATION_MODELS,
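For context on the tables in this file: each entry maps an architecture name to a (module, class name) pair so the heavy model modules are imported only when that architecture is actually requested. A minimal sketch of that lazy-lookup pattern (the helper name and the default package path are assumptions for illustration; vLLM's real resolution goes through its registry machinery):

import importlib

_MODELS = {
    "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
}

def resolve_model_class(arch: str, package: str = "vllm.model_executor.models"):
    mod_name, cls_name = _MODELS[arch]
    # The module is imported only at lookup time, keeping the registry itself cheap.
    module = importlib.import_module(f"{package}.{mod_name}")
    return getattr(module, cls_name)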


@ -8,13 +8,10 @@ from transformers import SmolVLMProcessor
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
# yapf: disable
from .idefics3 import Idefics3DummyInputsBuilder as SmolVLMDummyInputsBuilder from .idefics3 import Idefics3DummyInputsBuilder as SmolVLMDummyInputsBuilder
from .idefics3 import Idefics3ForConditionalGeneration, Idefics3ProcessingInfo from .idefics3 import Idefics3ForConditionalGeneration, Idefics3ProcessingInfo
from .idefics3 import Idefics3MultiModalProcessor as SmolVLMMultiModalProcessor from .idefics3 import Idefics3MultiModalProcessor as SmolVLMMultiModalProcessor
# yapf: enable
class SmolVLMProcessingInfo(Idefics3ProcessingInfo): class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor: def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:


@ -32,11 +32,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models import SupportsPP from vllm.model_executor.models import SupportsPP
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
# yapf: disable
from vllm.model_executor.models.whisper import WhisperEncoder from vllm.model_executor.models.whisper import WhisperEncoder
# yapf: enable
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict, MultiModalDataDict,


@ -28,7 +28,6 @@ def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optiona
return CHAT_TEMPLATES_DIR / "template_chatml.jinja" return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
# yapf: disable
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = { _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
"blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja", "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
"clip": CHAT_TEMPLATES_DIR / "template_basic.jinja", "clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
@ -39,7 +38,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
"paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja", "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
"qwen": _get_qwen_chat_template_fallback, "qwen": _get_qwen_chat_template_fallback,
} }
# yapf: enable
def register_chat_template_fallback_path( def register_chat_template_fallback_path(


@ -1,12 +1,11 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501 # ruff: noqa: E501
# coding=utf-8 # coding=utf-8
# Copied from # Copied from
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py # https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
""" Arctic model configuration""" """Arctic model configuration"""
from dataclasses import asdict, dataclass from dataclasses import asdict, dataclass
from typing import Any from typing import Any


@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501 # ruff: noqa: E501
# Adapted from # Adapted from
# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py # https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
@ -16,7 +15,7 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module
class Nemotron_Nano_VL_Config(PretrainedConfig): class Nemotron_Nano_VL_Config(PretrainedConfig):
model_type = 'Llama_Nemotron_Nano_VL' model_type = "Llama_Nemotron_Nano_VL"
is_composition = True is_composition = True
def __init__( def __init__(
@ -26,17 +25,22 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
force_image_size=None, force_image_size=None,
downsample_ratio=0.5, downsample_ratio=0.5,
template=None, template=None,
ps_version='v1', ps_version="v1",
image_tag_type="internvl", image_tag_type="internvl",
projector_hidden_size=4096, projector_hidden_size=4096,
vit_hidden_size=1280, vit_hidden_size=1280,
**kwargs **kwargs,
): ):
super().__init__(**kwargs) super().__init__(**kwargs)
if vision_config is not None: if vision_config is not None:
assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"] assert (
vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1]) "auto_map" in vision_config
and "AutoConfig" in vision_config["auto_map"]
)
vision_auto_config = get_class_from_dynamic_module(
*vision_config["auto_map"]["AutoConfig"].split("--")[::-1]
)
self.vision_config = vision_auto_config(**vision_config) self.vision_config = vision_auto_config(**vision_config)
else: else:
self.vision_config = PretrainedConfig() self.vision_config = PretrainedConfig()
@ -51,6 +55,6 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
self.downsample_ratio = downsample_ratio self.downsample_ratio = downsample_ratio
self.template = template # TODO move out of here and into the tokenizer self.template = template # TODO move out of here and into the tokenizer
self.ps_version = ps_version # Pixel shuffle version self.ps_version = ps_version # Pixel shuffle version
self.image_tag_type = image_tag_type # TODO: into the tokenizer too? self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
self.projector_hidden_size = projector_hidden_size self.projector_hidden_size = projector_hidden_size
self.vit_hidden_size = vit_hidden_size self.vit_hidden_size = vit_hidden_size


@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501 # ruff: noqa: E501
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py # adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py # and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
@ -70,34 +69,37 @@ class AIMv2Config(PretrainedConfig):
# Visual Tokenizer Configuration # Visual Tokenizer Configuration
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
class BaseVisualTokenizerConfig(PretrainedConfig): class BaseVisualTokenizerConfig(PretrainedConfig):
def __init__(
def __init__(self, self,
vocab_size=16384, vocab_size=16384,
tokenize_function="softmax", tokenize_function="softmax",
tau=1.0, tau=1.0,
depths=None, depths=None,
drop_cls_token=False, drop_cls_token=False,
backbone_config: Optional[Union[PretrainedConfig, backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
dict]] = None, hidden_stride: int = 1,
hidden_stride: int = 1, **kwargs,
**kwargs): ):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.tokenize_function = tokenize_function self.tokenize_function = tokenize_function
self.tau = tau self.tau = tau
if isinstance(depths, str): if isinstance(depths, str):
depths = [int(x) for x in depths.split('|')] depths = [int(x) for x in depths.split("|")]
self.depths = depths self.depths = depths
self.backbone_kwargs = dict[str, Any]() self.backbone_kwargs = dict[str, Any]()
self.drop_cls_token = drop_cls_token self.drop_cls_token = drop_cls_token
if backbone_config is not None: if backbone_config is not None:
assert isinstance(backbone_config, (PretrainedConfig, dict)), \ assert isinstance(backbone_config, (PretrainedConfig, dict)), (
f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type" f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
)
if not isinstance(backbone_config, PretrainedConfig): if not isinstance(backbone_config, PretrainedConfig):
model_type = backbone_config['model_type'] model_type = backbone_config["model_type"]
if model_type != "aimv2": if model_type != "aimv2":
backbone_config.pop('model_type') backbone_config.pop("model_type")
backbone_config = AutoConfig.for_model(model_type, **backbone_config) backbone_config = AutoConfig.for_model(
model_type, **backbone_config
)
else: else:
backbone_config = AIMv2Config(**backbone_config) backbone_config = AIMv2Config(**backbone_config)
self.backbone_config = backbone_config self.backbone_config = backbone_config
@ -113,7 +115,7 @@ class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
self.drop_cls_token = False self.drop_cls_token = False
if self.depths: if self.depths:
assert len(self.depths) == 1 assert len(self.depths) == 1
self.backbone_kwargs['num_hidden_layers'] = self.depths[0] self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig): class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
@ -125,7 +127,7 @@ class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
self.drop_cls_token = False self.drop_cls_token = False
if self.depths: if self.depths:
assert len(self.depths) == 1 assert len(self.depths) == 1
self.backbone_kwargs['num_hidden_layers'] = self.depths[0] self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig) AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
@ -138,35 +140,39 @@ AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
class OvisConfig(PretrainedConfig): class OvisConfig(PretrainedConfig):
model_type = "ovis" model_type = "ovis"
def __init__(self, def __init__(
llm_config: Optional[Union[PretrainedConfig, dict]] = None, self,
visual_tokenizer_config: Optional[Union[PretrainedConfig, llm_config: Optional[Union[PretrainedConfig, dict]] = None,
dict]] = None, visual_tokenizer_config: Optional[Union[PretrainedConfig, dict]] = None,
multimodal_max_length=8192, multimodal_max_length=8192,
hidden_size=None, hidden_size=None,
conversation_formatter_class=None, conversation_formatter_class=None,
llm_attn_implementation=None, llm_attn_implementation=None,
disable_tie_weight=False, disable_tie_weight=False,
**kwargs): **kwargs,
):
super().__init__(**kwargs) super().__init__(**kwargs)
if llm_config is not None: if llm_config is not None:
assert isinstance(llm_config, (PretrainedConfig, dict)), \ assert isinstance(llm_config, (PretrainedConfig, dict)), (
f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type" f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
)
if not isinstance(llm_config, PretrainedConfig): if not isinstance(llm_config, PretrainedConfig):
model_type = llm_config['model_type'] model_type = llm_config["model_type"]
llm_config.pop('model_type') llm_config.pop("model_type")
llm_config = AutoConfig.for_model(model_type, **llm_config) llm_config = AutoConfig.for_model(model_type, **llm_config)
# map llm_config to text_config # map llm_config to text_config
self.text_config = llm_config self.text_config = llm_config
if visual_tokenizer_config is not None: if visual_tokenizer_config is not None:
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \ assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type" f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
)
if not isinstance(visual_tokenizer_config, PretrainedConfig): if not isinstance(visual_tokenizer_config, PretrainedConfig):
model_type = visual_tokenizer_config['model_type'] model_type = visual_tokenizer_config["model_type"]
visual_tokenizer_config.pop('model_type') visual_tokenizer_config.pop("model_type")
visual_tokenizer_config = AutoConfig.for_model( visual_tokenizer_config = AutoConfig.for_model(
model_type, **visual_tokenizer_config) model_type, **visual_tokenizer_config
)
self.visual_tokenizer_config = visual_tokenizer_config self.visual_tokenizer_config = visual_tokenizer_config
self.multimodal_max_length = multimodal_max_length self.multimodal_max_length = multimodal_max_length
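The pattern in this config file, where a nested sub-config may arrive either as a ready PretrainedConfig or as a plain dict carrying a model_type, can be summarized with a small hedged sketch (the helper name is an assumption; the real code above also special-cases the aimv2 backbone type):

from typing import Optional, Union

from transformers import AutoConfig, PretrainedConfig

def materialize_sub_config(cfg: Optional[Union[PretrainedConfig, dict]]) -> PretrainedConfig:
    if cfg is None:
        return PretrainedConfig()
    if isinstance(cfg, PretrainedConfig):
        return cfg
    cfg = dict(cfg)                       # copy so the caller's dict is untouched
    model_type = cfg.pop("model_type")    # e.g. "llama" or "siglip_vision_model"
    return AutoConfig.for_model(model_type, **cfg)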


@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501 # ruff: noqa: E501
# coding=utf-8 # coding=utf-8
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
@ -35,11 +34,12 @@ from transformers.processing_utils import ProcessorMixin
class ImageTransform: class ImageTransform:
def __init__(
def __init__(self, self,
mean: tuple[float, float, float] = (0.5, 0.5, 0.5), mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
std: tuple[float, float, float] = (0.5, 0.5, 0.5), std: tuple[float, float, float] = (0.5, 0.5, 0.5),
normalize: bool = True): normalize: bool = True,
):
self.mean = mean self.mean = mean
self.std = std self.std = std
self.normalize = normalize self.normalize = normalize
@ -77,7 +77,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
ignore_id: int = -100, ignore_id: int = -100,
**kwargs, **kwargs,
): ):
self.candidate_resolutions = candidate_resolutions self.candidate_resolutions = candidate_resolutions
self.image_size = candidate_resolutions[0][0] self.image_size = candidate_resolutions[0][0]
self.patch_size = patch_size self.patch_size = patch_size
@ -86,13 +85,15 @@ class DeepseekVLV2Processor(ProcessorMixin):
self.normalize = normalize self.normalize = normalize
self.downsample_ratio = downsample_ratio self.downsample_ratio = downsample_ratio
self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize) self.image_transform = ImageTransform(
mean=image_mean, std=image_std, normalize=normalize
)
self.tokenizer = tokenizer self.tokenizer = tokenizer
self.tokenizer.padding_side = 'left' # must set thispadding side with make a difference in batch inference self.tokenizer.padding_side = "left" # must set thispadding side with make a difference in batch inference
# add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id' # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
if tokenizer.pad_token is None: if tokenizer.pad_token is None:
self.tokenizer.add_special_tokens({'pad_token': pad_token}) self.tokenizer.add_special_tokens({"pad_token": pad_token})
# add image token # add image token
image_token_id = self.tokenizer.vocab.get(image_token) image_token_id = self.tokenizer.vocab.get(image_token)
@ -104,7 +105,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
# add five special tokens for grounding-related tasks # add five special tokens for grounding-related tasks
# <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|> # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>'] special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
special_tokens_dict = {"additional_special_tokens": special_tokens} special_tokens_dict = {"additional_special_tokens": special_tokens}
self.tokenizer.add_special_tokens(special_tokens_dict) self.tokenizer.add_special_tokens(special_tokens_dict)
@ -134,15 +135,19 @@ class DeepseekVLV2Processor(ProcessorMixin):
for width, height in self.candidate_resolutions: for width, height in self.candidate_resolutions:
scale = min(width / original_width, height / original_height) scale = min(width / original_width, height / original_height)
downscaled_width, downscaled_height = int( downscaled_width, downscaled_height = (
original_width * scale), int(original_height * scale) int(original_width * scale),
effective_resolution = min(downscaled_width * downscaled_height, int(original_height * scale),
original_width * original_height) )
effective_resolution = min(
downscaled_width * downscaled_height, original_width * original_height
)
wasted_resolution = (width * height) - effective_resolution wasted_resolution = (width * height) - effective_resolution
if effective_resolution > max_effective_resolution or ( if effective_resolution > max_effective_resolution or (
effective_resolution == max_effective_resolution effective_resolution == max_effective_resolution
and wasted_resolution < min_wasted_resolution): and wasted_resolution < min_wasted_resolution
):
max_effective_resolution = effective_resolution max_effective_resolution = effective_resolution
min_wasted_resolution = wasted_resolution min_wasted_resolution = wasted_resolution
best_fit = (width, height) best_fit = (width, height)
@ -198,12 +203,20 @@ class DeepseekVLV2Processor(ProcessorMixin):
- num_image_tokens (list[int]): the number of image tokens - num_image_tokens (list[int]): the number of image tokens
""" """
assert (prompt is not None and images is not None assert prompt is not None and images is not None, (
), "prompt and images must be used at the same time." "prompt and images must be used at the same time."
)
sft_format = prompt sft_format = prompt
tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images( (
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2) tokenized_str,
images_list,
images_seq_mask,
images_spatial_crop,
num_image_tokens,
) = self.tokenize_with_images(
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
)
masked_tokenized_str = [] masked_tokenized_str = []
for token_index in tokenized_str: for token_index in tokenized_str:
if token_index != self.image_token_id: if token_index != self.image_token_id:
@ -211,17 +224,21 @@ class DeepseekVLV2Processor(ProcessorMixin):
else: else:
masked_tokenized_str.append(self.ignore_id) masked_tokenized_str.append(self.ignore_id)
assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \ assert (
(f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, " len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal") ), (
f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
)
input_ids = torch.LongTensor(tokenized_str) input_ids = torch.LongTensor(tokenized_str)
target_ids = torch.LongTensor(masked_tokenized_str) target_ids = torch.LongTensor(masked_tokenized_str)
images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool) images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
# set input_ids < 0 | input_ids == self.image_token_id as ignore_id # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
target_ids[(input_ids < 0) | target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
(input_ids == self.image_token_id)] = self.ignore_id self.ignore_id
)
input_ids[input_ids < 0] = self.pad_id input_ids[input_ids < 0] = self.pad_id
if inference_mode: if inference_mode:
@ -311,30 +328,50 @@ class DeepseekVLV2Processor(ProcessorMixin):
best_width, best_height = self.image_size, self.image_size best_width, best_height = self.image_size, self.image_size
"""process the global view""" """process the global view"""
global_view = ImageOps.pad(image, (self.image_size, self.image_size), global_view = ImageOps.pad(
color=tuple(int(x * 255) for x in self.image_transform.mean)) image,
(self.image_size, self.image_size),
color=tuple(int(x * 255) for x in self.image_transform.mean),
)
images_list.append(self.image_transform(global_view)) images_list.append(self.image_transform(global_view))
"""process the local views""" """process the local views"""
local_view = ImageOps.pad(image, (best_width, best_height), local_view = ImageOps.pad(
color=tuple(int(x * 255) for x in self.image_transform.mean)) image,
(best_width, best_height),
color=tuple(int(x * 255) for x in self.image_transform.mean),
)
for i in range(0, best_height, self.image_size): for i in range(0, best_height, self.image_size):
for j in range(0, best_width, self.image_size): for j in range(0, best_width, self.image_size):
images_list.append( images_list.append(
self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size)))) self.image_transform(
local_view.crop(
(j, i, j + self.image_size, i + self.image_size)
)
)
)
"""record height / width crop num""" """record height / width crop num"""
num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size num_width_tiles, num_height_tiles = (
best_width // self.image_size,
best_height // self.image_size,
)
images_spatial_crop.append([num_width_tiles, num_height_tiles]) images_spatial_crop.append([num_width_tiles, num_height_tiles])
"""add image tokens""" """add image tokens"""
h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio) h = w = math.ceil(
(self.image_size // self.patch_size) / self.downsample_ratio
)
# global views tokens h * (w + 1), 1 is for line separator # global views tokens h * (w + 1), 1 is for line separator
tokenized_image = [self.image_token_id] * h * (w + 1) tokenized_image = [self.image_token_id] * h * (w + 1)
# add a separator between global and local views # add a separator between global and local views
tokenized_image += [self.image_token_id] tokenized_image += [self.image_token_id]
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1) # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1) tokenized_image += (
[self.image_token_id]
* (num_height_tiles * h)
* (num_width_tiles * w + 1)
)
tokenized_str += tokenized_image tokenized_str += tokenized_image
images_seq_mask += [True] * len(tokenized_image) images_seq_mask += [True] * len(tokenized_image)
@ -353,10 +390,17 @@ class DeepseekVLV2Processor(ProcessorMixin):
tokenized_str = tokenized_str + [self.eos_id] tokenized_str = tokenized_str + [self.eos_id]
images_seq_mask = images_seq_mask + [False] images_seq_mask = images_seq_mask + [False]
assert len(tokenized_str) == len( assert len(tokenized_str) == len(images_seq_mask), (
images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}" f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
)
return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens return (
tokenized_str,
images_list,
images_seq_mask,
images_spatial_crop,
num_image_tokens,
)
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor) AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
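A minimal sketch of the image-token accounting used in tokenize_with_images above; the config and crop-grid values here are hypothetical, chosen only to illustrate the arithmetic shown in the hunk.

import math

# hypothetical example values, not taken from any real DeepSeek-VL2 config
image_size, patch_size, downsample_ratio = 384, 16, 2
num_width_tiles, num_height_tiles = 2, 1  # example crop grid for one image

h = w = math.ceil((image_size // patch_size) / downsample_ratio)
global_tokens = h * (w + 1)  # global view: h rows of w tokens plus a line separator each
separator = 1                # single separator between global and local views
local_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)
total_image_tokens = global_tokens + separator + local_tokens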
View File
@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501 # ruff: noqa: E501
# coding=utf-8 # coding=utf-8
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py # adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
@ -35,23 +34,24 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from vllm.multimodal.image import convert_image_mode from vllm.multimodal.image import convert_image_mode
__all__ = ['OvisProcessor'] __all__ = ["OvisProcessor"]
IGNORE_ID = -100 IGNORE_ID = -100
class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
_defaults = { _defaults = {
"text_kwargs": { "text_kwargs": {
"padding": False, "padding": False,
}, },
"images_kwargs": { "images_kwargs": {
'max_partition':9, "max_partition": 9,
'covering_threshold':0.9, "covering_threshold": 0.9,
'convert_to_rgb':True, "convert_to_rgb": True,
'return_tensors':'pt'}, "return_tensors": "pt",
},
} }
class OvisProcessor(ProcessorMixin): class OvisProcessor(ProcessorMixin):
r""" r"""
Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor. Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
@ -97,14 +97,16 @@ class OvisProcessor(ProcessorMixin):
"image_col_sep": -303, "image_col_sep": -303,
"image_row_sep": -304, "image_row_sep": -304,
"image_end": -305, "image_end": -305,
'image_pad': image_pad_token_id, "image_pad": image_pad_token_id,
} }
return extra_special_tokens return extra_special_tokens
def __call__( def __call__(
self, self,
images: ImageInput = None, images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, text: Union[
TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
] = None,
**kwargs: Unpack[OvisProcessorKwargs], **kwargs: Unpack[OvisProcessorKwargs],
) -> BatchFeature: ) -> BatchFeature:
""" """
@ -169,7 +171,6 @@ class OvisProcessor(ProcessorMixin):
# Process text input # Process text input
if text is not None: if text is not None:
if not isinstance(text, list): if not isinstance(text, list):
text = [text] text = [text]
@ -178,7 +179,10 @@ class OvisProcessor(ProcessorMixin):
replaced_ids_list = [] replaced_ids_list = []
idx = 0 idx = 0
for ids_tensor in tokenized_batched_text: for ids_tensor in tokenized_batched_text:
if image_token_id in ids_tensor and "image_placeholders" in image_features: if (
image_token_id in ids_tensor
and "image_placeholders" in image_features
):
if idx < len(image_features["image_placeholders"]): if idx < len(image_features["image_placeholders"]):
# Converts in list for ease of use # Converts in list for ease of use
ids_list = ids_tensor.tolist() ids_list = ids_tensor.tolist()
@ -188,7 +192,9 @@ class OvisProcessor(ProcessorMixin):
# replace placeholders # replace placeholders
for i, token_id in enumerate(ids_list): for i, token_id in enumerate(ids_list):
if token_id == image_token_id: if token_id == image_token_id:
placeholder_ids = image_features["image_placeholders"][idx] placeholder_ids = image_features["image_placeholders"][
idx
]
new_ids.extend(placeholder_ids) new_ids.extend(placeholder_ids)
idx += 1 idx += 1
else: else:
@ -198,7 +204,8 @@ class OvisProcessor(ProcessorMixin):
ids_tensor = torch.tensor(new_ids, dtype=torch.long) ids_tensor = torch.tensor(new_ids, dtype=torch.long)
else: else:
raise RuntimeError( raise RuntimeError(
'Mismatch between the images you provided and the number of placeholder present in the text') "Mismatch between the images you provided and the number of placeholder present in the text"
)
replaced_ids_list.append(ids_tensor) replaced_ids_list.append(ids_tensor)
@ -217,7 +224,7 @@ class OvisProcessor(ProcessorMixin):
# Add image features if present # Add image features if present
if image_features: if image_features:
output["pixel_values"] = processed_images output["pixel_values"] = processed_images
output['grids'] = grids output["grids"] = grids
return output return output
@ -227,8 +234,10 @@ class OvisProcessor(ProcessorMixin):
def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor: def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
batch_token_ids = [] batch_token_ids = []
for text in text_list: for text in text_list:
text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in text_chunks = [
text.split(self.image_token)] self.tokenizer(chunk, add_special_tokens=False).input_ids
for chunk in text.split(self.image_token)
]
token_ids = [] token_ids = []
num_chuck = len(text_chunks) num_chuck = len(text_chunks)
for i, chunk in enumerate(text_chunks): for i, chunk in enumerate(text_chunks):
@ -240,50 +249,60 @@ class OvisProcessor(ProcessorMixin):
def get_image_size(self): def get_image_size(self):
size = self.image_processor.size size = self.image_processor.size
if 'shortest_edge' in size: if "shortest_edge" in size:
width = height = size['shortest_edge'] width = height = size["shortest_edge"]
elif "height" in size and "width" in size: elif "height" in size and "width" in size:
width = size['width'] width = size["width"]
height = size['height'] height = size["height"]
else: else:
raise ValueError( "Can't parse image size from image_processor config.") raise ValueError("Can't parse image size from image_processor config.")
return height, width return height, width
def get_token_value(self, tok): def get_token_value(self, tok):
return self.extra_special_tokens[tok] return self.extra_special_tokens[tok]
def construct_image_indicators(self, grid): def construct_image_indicators(self, grid):
image_placeholders = [self.get_token_value('image_start'), image_placeholders = [
self.get_token_value('image_atom'), self.get_token_value("image_start"),
self.get_token_value('image_prefix')] self.get_token_value("image_atom"),
self.get_token_value("image_prefix"),
]
if grid[0] * grid[1] > 1: if grid[0] * grid[1] > 1:
for r in range(grid[0]): for r in range(grid[0]):
for c in range(grid[1]): for c in range(grid[1]):
image_placeholders.append(self.get_token_value('image_atom') ) image_placeholders.append(self.get_token_value("image_atom"))
if c < grid[1] - 1: if c < grid[1] - 1:
image_placeholders.append(self.get_token_value('image_col_sep')) image_placeholders.append(self.get_token_value("image_col_sep"))
if r < grid[0] - 1: if r < grid[0] - 1:
image_placeholders.append(self.get_token_value('image_row_sep')) image_placeholders.append(self.get_token_value("image_row_sep"))
image_placeholders.append(self.get_token_value('image_end')) image_placeholders.append(self.get_token_value("image_end"))
return image_placeholders return image_placeholders
def construct_image_placeholders(self, grid): def construct_image_placeholders(self, grid):
image_placeholders = self.construct_image_indicators(grid) image_placeholders = self.construct_image_indicators(grid)
image_atom_token_id = self.get_token_value('image_atom') image_atom_token_id = self.get_token_value("image_atom")
# Extract the padding token ID from tokenizer # Extract the padding token ID from tokenizer
image_padding_token_id = self.get_token_value('image_pad') image_padding_token_id = self.get_token_value("image_pad")
# Create a new list with padding tokens inserted # Create a new list with padding tokens inserted
padded_placeholder_tokens = [] padded_placeholder_tokens = []
for token in image_placeholders: for token in image_placeholders:
padded_placeholder_tokens.append(image_padding_token_id) padded_placeholder_tokens.append(image_padding_token_id)
if token == image_atom_token_id: if token == image_atom_token_id:
padded_placeholder_tokens.extend([image_padding_token_id] * self.image_segment_len) padded_placeholder_tokens.extend(
[image_padding_token_id] * self.image_segment_len
)
return padded_placeholder_tokens return padded_placeholder_tokens
def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors): def preprocess_image(
self,
image: PIL.Image.Image,
max_partition,
covering_threshold,
convert_to_rgb,
return_tensors,
):
def _preprocess(img: PIL.Image.Image, side): def _preprocess(img: PIL.Image.Image, side):
# first resize and preprocess # first resize and preprocess
w, h = img.size w, h = img.size
@ -296,19 +315,27 @@ class OvisProcessor(ProcessorMixin):
new_height = side new_height = side
new_width = int(w / h * new_height) new_width = int(w / h * new_height)
new_size = dict(height=new_height, width=new_width) new_size = dict(height=new_height, width=new_width)
pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values'] pixel_values = self.image_processor.preprocess(
img, size=new_size, return_tensors=return_tensors
)["pixel_values"]
# then pad to square # then pad to square
square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device) square_values = torch.zeros(
[1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
)
new_height, new_width = pixel_values.shape[2:] new_height, new_width = pixel_values.shape[2:]
if new_height == new_width: if new_height == new_width:
square_values[:, :, :, :] = pixel_values square_values[:, :, :, :] = pixel_values
elif new_height > new_width: elif new_height > new_width:
from_index = (side - new_width) // 2 from_index = (side - new_width) // 2
square_values[:, :, :, from_index:from_index + new_width] = pixel_values square_values[:, :, :, from_index : from_index + new_width] = (
pixel_values
)
else: else:
from_index = (side - new_height) // 2 from_index = (side - new_height) // 2
square_values[:, :, from_index:from_index + new_height, :] = pixel_values square_values[:, :, from_index : from_index + new_height, :] = (
pixel_values
)
return square_values return square_values
@ -350,7 +377,9 @@ class OvisProcessor(ProcessorMixin):
good_grids = [] good_grids = []
for grid in candidate_grids: for grid in candidate_grids:
partition = _partition(img, grid) partition = _partition(img, grid)
covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area covering_ratio = (
sum([_covering_area(*p, side) for p in partition]) / img_area
)
assert covering_ratio <= 1.0 assert covering_ratio <= 1.0
all_grids.append((grid, covering_ratio)) all_grids.append((grid, covering_ratio))
if covering_ratio > covering_threshold: if covering_ratio > covering_threshold:
@ -358,18 +387,19 @@ class OvisProcessor(ProcessorMixin):
if len(good_grids) > 0: if len(good_grids) > 0:
# pick the good partition with minimum #sub_images and break the tie using covering_ratio # pick the good partition with minimum #sub_images and break the tie using covering_ratio
return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0] return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
0
]
else: else:
# pick the partition with maximum covering_ratio and break the tie using #sub_images # pick the partition with maximum covering_ratio and break the tie using #sub_images
return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0] return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
if convert_to_rgb: if convert_to_rgb:
image = convert_image_mode(image, 'RGB') image = convert_image_mode(image, "RGB")
sides = self.get_image_size() sides = self.get_image_size()
if sides[0] != sides[1]: if sides[0] != sides[1]:
raise ValueError('get_image_size() returns non-square size') raise ValueError("get_image_size() returns non-square size")
side = sides[0] side = sides[0]
grid = _get_best_grid(image, side) grid = _get_best_grid(image, side)
partition = _partition(image, grid) partition = _partition(image, grid)
@ -405,14 +435,18 @@ class OvisProcessor(ProcessorMixin):
`list[str]`: The decoded text. `list[str]`: The decoded text.
""" """
return self.tokenizer.batch_decode( return self.tokenizer.batch_decode(
generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False generated_outputs,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
) )
@property @property
def model_input_names(self): def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names image_processor_input_names = self.image_processor.model_input_names
names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) names_from_processor = list(
dict.fromkeys(tokenizer_input_names + image_processor_input_names)
)
return names_from_processor + ["second_per_grid_ts"] return names_from_processor + ["second_per_grid_ts"]
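A minimal sketch of the indicator sequence that construct_image_indicators yields for a multi-tile partition; the grid value is an illustrative assumption, not taken from the diff.

# For a hypothetical grid of (2, 2) (rows x cols), the loop above produces:
#   [image_start, image_atom, image_prefix,
#    image_atom, image_col_sep, image_atom, image_row_sep,
#    image_atom, image_col_sep, image_atom,
#    image_end]
# construct_image_placeholders then inserts an image_pad before every token and
# image_segment_len extra pads after each image_atom.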
View File
@ -40,9 +40,6 @@ from vllm.utils.flashinfer import (
supports_trtllm_attention, supports_trtllm_attention,
use_trtllm_attention, use_trtllm_attention,
) )
# yapf conflicts with isort for this block
# yapf: disable
from vllm.v1.attention.backends.utils import ( from vllm.v1.attention.backends.utils import (
AttentionCGSupport, AttentionCGSupport,
AttentionMetadataBuilder, AttentionMetadataBuilder,
@ -52,8 +49,6 @@ from vllm.v1.attention.backends.utils import (
infer_global_hyperparameters, infer_global_hyperparameters,
split_decodes_and_prefills, split_decodes_and_prefills,
) )
# yapf: enable
from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.kv_cache_interface import AttentionSpec
FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024 FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024
View File
@ -11,9 +11,6 @@ from vllm.attention.backends.abstract import AttentionLayer
from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.utils import cdiv from vllm.utils import cdiv
# yapf conflicts with isort for this docstring
# yapf: disable
from vllm.v1.attention.backends.mla.common import ( from vllm.v1.attention.backends.mla.common import (
MLACommonBackend, MLACommonBackend,
MLACommonDecodeMetadata, MLACommonDecodeMetadata,
@ -24,8 +21,6 @@ from vllm.v1.attention.backends.mla.common import (
from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.kv_cache_interface import AttentionSpec
# yapf: enable
def is_aiter_mla_enabled() -> bool: def is_aiter_mla_enabled() -> bool:
return envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MLA return envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MLA
View File
@ -18,8 +18,6 @@ from msgspec import msgpack
from vllm import envs from vllm import envs
from vllm.logger import init_logger from vllm.logger import init_logger
# yapf: disable
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
BaseMultiModalField, BaseMultiModalField,
MultiModalBatchedField, MultiModalBatchedField,
@ -32,8 +30,6 @@ from vllm.multimodal.inputs import (
MultiModalSharedField, MultiModalSharedField,
NestedTensors, NestedTensors,
) )
# yapf: enable
from vllm.v1.engine import UtilityResult from vllm.v1.engine import UtilityResult
logger = init_logger(__name__) logger = init_logger(__name__)
View File
@ -48,9 +48,6 @@ from vllm.model_executor.layers.mamba.abstract import MambaBase
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.models.interfaces import ( from vllm.model_executor.models.interfaces import (
SupportsMultiModal, SupportsMultiModal,
is_mixture_of_experts, is_mixture_of_experts,
@ -59,8 +56,6 @@ from vllm.model_executor.models.interfaces import (
supports_multimodal_pruning, supports_multimodal_pruning,
supports_transcription, supports_transcription,
) )
# yapf: enable
from vllm.model_executor.models.interfaces_base import ( from vllm.model_executor.models.interfaces_base import (
VllmModelForPooling, VllmModelForPooling,
is_pooling_model, is_pooling_model,
@ -101,9 +96,6 @@ from vllm.v1.attention.backends.utils import (
split_attn_metadata, split_attn_metadata,
) )
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
# yapf conflicts with isort for this block
# yapf: disable
from vllm.v1.kv_cache_interface import ( from vllm.v1.kv_cache_interface import (
AttentionSpec, AttentionSpec,
ChunkedLocalAttentionSpec, ChunkedLocalAttentionSpec,
@ -118,8 +110,6 @@ from vllm.v1.kv_cache_interface import (
SlidingWindowSpec, SlidingWindowSpec,
UniformTypeKVCacheSpecs, UniformTypeKVCacheSpecs,
) )
# yapf: enable
from vllm.v1.outputs import ( from vllm.v1.outputs import (
EMPTY_MODEL_RUNNER_OUTPUT, EMPTY_MODEL_RUNNER_OUTPUT,
AsyncModelRunnerOutput, AsyncModelRunnerOutput,