mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 11:37:12 +08:00
Remove all references to yapf as it's no longer used (#26251)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
d6953beb91
commit
4e256cadc2
@ -12,9 +12,6 @@ from functools import reduce
|
||||
from typing import Optional, Union
|
||||
|
||||
import jinja2
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm_cutlass_library_extension import (
|
||||
DataType,
|
||||
EpilogueScheduleTag,
|
||||
@ -31,8 +28,6 @@ from vllm_cutlass_library_extension import (
|
||||
VLLMKernelScheduleTag,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
|
||||
#
|
||||
# Generator templating
|
||||
#
|
||||
|
||||
@ -21,8 +21,6 @@ from vllm.utils import FlexibleArgumentParser
|
||||
logger = logging.getLogger()
|
||||
|
||||
|
||||
# yapf conflicts with isort for this docstring
|
||||
# yapf: disable
|
||||
"""
|
||||
tensorize_vllm_model.py is a script that can be used to serialize and
|
||||
deserialize vLLM models. These models can be loaded using tensorizer
|
||||
@ -132,7 +130,8 @@ def get_parser():
|
||||
"can be loaded using tensorizer directly to the GPU "
|
||||
"extremely quickly. Tensor encryption and decryption is "
|
||||
"also supported, although libsodium must be installed to "
|
||||
"use it.")
|
||||
"use it."
|
||||
)
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
|
||||
parser.add_argument(
|
||||
@ -144,13 +143,14 @@ def get_parser():
|
||||
"along with the model by instantiating a TensorizerConfig object, "
|
||||
"creating a dict from it with TensorizerConfig.to_serializable(), "
|
||||
"and passing it to LoRARequest's initializer with the kwarg "
|
||||
"tensorizer_config_dict."
|
||||
"tensorizer_config_dict.",
|
||||
)
|
||||
|
||||
subparsers = parser.add_subparsers(dest='command', required=True)
|
||||
subparsers = parser.add_subparsers(dest="command", required=True)
|
||||
|
||||
serialize_parser = subparsers.add_parser(
|
||||
'serialize', help="Serialize a model to `--serialized-directory`")
|
||||
"serialize", help="Serialize a model to `--serialized-directory`"
|
||||
)
|
||||
|
||||
serialize_parser.add_argument(
|
||||
"--suffix",
|
||||
@ -163,7 +163,9 @@ def get_parser():
|
||||
"`--suffix` is `v1`, the serialized model tensors will be "
|
||||
"saved to "
|
||||
"`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
|
||||
"If none is provided, a random UUID will be used."))
|
||||
"If none is provided, a random UUID will be used."
|
||||
),
|
||||
)
|
||||
serialize_parser.add_argument(
|
||||
"--serialized-directory",
|
||||
type=str,
|
||||
@ -175,108 +177,127 @@ def get_parser():
|
||||
"and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
|
||||
"be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
|
||||
"where `suffix` is given by `--suffix` or a random UUID if not "
|
||||
"provided.")
|
||||
"provided.",
|
||||
)
|
||||
|
||||
serialize_parser.add_argument(
|
||||
"--serialization-kwargs",
|
||||
type=tensorizer_kwargs_arg,
|
||||
required=False,
|
||||
help=("A JSON string containing additional keyword arguments to "
|
||||
"pass to Tensorizer's TensorSerializer during "
|
||||
"serialization."))
|
||||
help=(
|
||||
"A JSON string containing additional keyword arguments to "
|
||||
"pass to Tensorizer's TensorSerializer during "
|
||||
"serialization."
|
||||
),
|
||||
)
|
||||
|
||||
serialize_parser.add_argument(
|
||||
"--keyfile",
|
||||
type=str,
|
||||
required=False,
|
||||
help=("Encrypt the model weights with a randomly-generated binary key,"
|
||||
" and save the key at this path"))
|
||||
help=(
|
||||
"Encrypt the model weights with a randomly-generated binary key,"
|
||||
" and save the key at this path"
|
||||
),
|
||||
)
|
||||
|
||||
deserialize_parser = subparsers.add_parser(
|
||||
'deserialize',
|
||||
help=("Deserialize a model from `--path-to-tensors`"
|
||||
" to verify it can be loaded and used."))
|
||||
"deserialize",
|
||||
help=(
|
||||
"Deserialize a model from `--path-to-tensors`"
|
||||
" to verify it can be loaded and used."
|
||||
),
|
||||
)
|
||||
|
||||
deserialize_parser.add_argument(
|
||||
"--path-to-tensors",
|
||||
type=str,
|
||||
required=False,
|
||||
help="The local path or S3 URI to the model tensors to deserialize. ")
|
||||
help="The local path or S3 URI to the model tensors to deserialize. ",
|
||||
)
|
||||
|
||||
deserialize_parser.add_argument(
|
||||
"--serialized-directory",
|
||||
type=str,
|
||||
required=False,
|
||||
help="Directory with model artifacts for loading. Assumes a "
|
||||
"model.tensors file exists therein. Can supersede "
|
||||
"--path-to-tensors.")
|
||||
"model.tensors file exists therein. Can supersede "
|
||||
"--path-to-tensors.",
|
||||
)
|
||||
|
||||
deserialize_parser.add_argument(
|
||||
"--keyfile",
|
||||
type=str,
|
||||
required=False,
|
||||
help=("Path to a binary key to use to decrypt the model weights,"
|
||||
" if the model was serialized with encryption"))
|
||||
help=(
|
||||
"Path to a binary key to use to decrypt the model weights,"
|
||||
" if the model was serialized with encryption"
|
||||
),
|
||||
)
|
||||
|
||||
deserialize_parser.add_argument(
|
||||
"--deserialization-kwargs",
|
||||
type=tensorizer_kwargs_arg,
|
||||
required=False,
|
||||
help=("A JSON string containing additional keyword arguments to "
|
||||
"pass to Tensorizer's `TensorDeserializer` during "
|
||||
"deserialization."))
|
||||
help=(
|
||||
"A JSON string containing additional keyword arguments to "
|
||||
"pass to Tensorizer's `TensorDeserializer` during "
|
||||
"deserialization."
|
||||
),
|
||||
)
|
||||
|
||||
TensorizerArgs.add_cli_args(deserialize_parser)
|
||||
|
||||
return parser
|
||||
|
||||
def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
|
||||
cfg: TensorizerConfig):
|
||||
|
||||
def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
|
||||
for k, v in extra_cfg.items():
|
||||
if hasattr(cfg, k):
|
||||
setattr(cfg, k, v)
|
||||
logger.info(
|
||||
"Updating TensorizerConfig with %s from "
|
||||
"--model-loader-extra-config provided", k
|
||||
"--model-loader-extra-config provided",
|
||||
k,
|
||||
)
|
||||
|
||||
|
||||
def deserialize(args, tensorizer_config):
|
||||
if args.lora_path:
|
||||
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
|
||||
llm = LLM(model=args.model,
|
||||
load_format="tensorizer",
|
||||
tensor_parallel_size=args.tensor_parallel_size,
|
||||
model_loader_extra_config=tensorizer_config,
|
||||
enable_lora=True,
|
||||
llm = LLM(
|
||||
model=args.model,
|
||||
load_format="tensorizer",
|
||||
tensor_parallel_size=args.tensor_parallel_size,
|
||||
model_loader_extra_config=tensorizer_config,
|
||||
enable_lora=True,
|
||||
)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=256,
|
||||
stop=["[/assistant]"]
|
||||
temperature=0, max_tokens=256, stop=["[/assistant]"]
|
||||
)
|
||||
|
||||
# Truncating this as the extra text isn't necessary
|
||||
prompts = [
|
||||
"[user] Write a SQL query to answer the question based on ..."
|
||||
]
|
||||
prompts = ["[user] Write a SQL query to answer the question based on ..."]
|
||||
|
||||
# Test LoRA load
|
||||
print(
|
||||
llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest("sql-lora",
|
||||
1,
|
||||
args.lora_path,
|
||||
tensorizer_config_dict = tensorizer_config
|
||||
.to_serializable())
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(
|
||||
"sql-lora",
|
||||
1,
|
||||
args.lora_path,
|
||||
tensorizer_config_dict=tensorizer_config.to_serializable(),
|
||||
),
|
||||
)
|
||||
)
|
||||
else:
|
||||
llm = LLM(model=args.model,
|
||||
load_format="tensorizer",
|
||||
tensor_parallel_size=args.tensor_parallel_size,
|
||||
model_loader_extra_config=tensorizer_config
|
||||
llm = LLM(
|
||||
model=args.model,
|
||||
load_format="tensorizer",
|
||||
tensor_parallel_size=args.tensor_parallel_size,
|
||||
model_loader_extra_config=tensorizer_config,
|
||||
)
|
||||
return llm
|
||||
|
||||
@ -285,17 +306,20 @@ def main():
|
||||
parser = get_parser()
|
||||
args = parser.parse_args()
|
||||
|
||||
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
|
||||
or os.environ.get("S3_ACCESS_KEY_ID", None))
|
||||
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
|
||||
or os.environ.get("S3_SECRET_ACCESS_KEY", None))
|
||||
s3_endpoint = (getattr(args, 's3_endpoint', None)
|
||||
or os.environ.get("S3_ENDPOINT_URL", None))
|
||||
s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
|
||||
"S3_ACCESS_KEY_ID", None
|
||||
)
|
||||
s3_secret_access_key = getattr(
|
||||
args, "s3_secret_access_key", None
|
||||
) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
|
||||
s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
|
||||
"S3_ENDPOINT_URL", None
|
||||
)
|
||||
|
||||
credentials = {
|
||||
"s3_access_key_id": s3_access_key_id,
|
||||
"s3_secret_access_key": s3_secret_access_key,
|
||||
"s3_endpoint": s3_endpoint
|
||||
"s3_endpoint": s3_endpoint,
|
||||
}
|
||||
|
||||
model_ref = args.model
|
||||
@ -309,25 +333,25 @@ def main():
|
||||
if args.model_loader_extra_config:
|
||||
extra_config = json.loads(args.model_loader_extra_config)
|
||||
|
||||
|
||||
tensorizer_dir = (args.serialized_directory or
|
||||
extra_config.get("tensorizer_dir"))
|
||||
tensorizer_uri = (getattr(args, "path_to_tensors", None)
|
||||
or extra_config.get("tensorizer_uri"))
|
||||
tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
|
||||
tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
|
||||
"tensorizer_uri"
|
||||
)
|
||||
|
||||
if tensorizer_dir and tensorizer_uri:
|
||||
parser.error("--serialized-directory and --path-to-tensors "
|
||||
"cannot both be provided")
|
||||
parser.error(
|
||||
"--serialized-directory and --path-to-tensors cannot both be provided"
|
||||
)
|
||||
|
||||
if not tensorizer_dir and not tensorizer_uri:
|
||||
parser.error("Either --serialized-directory or --path-to-tensors "
|
||||
"must be provided")
|
||||
|
||||
parser.error(
|
||||
"Either --serialized-directory or --path-to-tensors must be provided"
|
||||
)
|
||||
|
||||
if args.command == "serialize":
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
input_dir = tensorizer_dir.rstrip('/')
|
||||
input_dir = tensorizer_dir.rstrip("/")
|
||||
suffix = args.suffix if args.suffix else uuid.uuid4().hex
|
||||
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
|
||||
if engine_args.tensor_parallel_size > 1:
|
||||
@ -339,15 +363,14 @@ def main():
|
||||
tensorizer_uri=model_path,
|
||||
encryption_keyfile=keyfile,
|
||||
serialization_kwargs=args.serialization_kwargs or {},
|
||||
**credentials
|
||||
**credentials,
|
||||
)
|
||||
|
||||
if args.lora_path:
|
||||
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
|
||||
tensorize_lora_adapter(args.lora_path, tensorizer_config)
|
||||
|
||||
merge_extra_config_with_tensorizer_config(extra_config,
|
||||
tensorizer_config)
|
||||
merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
|
||||
tensorize_vllm_model(engine_args, tensorizer_config)
|
||||
|
||||
elif args.command == "deserialize":
|
||||
@ -356,11 +379,10 @@ def main():
|
||||
tensorizer_dir=args.serialized_directory,
|
||||
encryption_keyfile=keyfile,
|
||||
deserialization_kwargs=args.deserialization_kwargs or {},
|
||||
**credentials
|
||||
**credentials,
|
||||
)
|
||||
|
||||
merge_extra_config_with_tensorizer_config(extra_config,
|
||||
tensorizer_config)
|
||||
merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
|
||||
deserialize(args, tensorizer_config)
|
||||
else:
|
||||
raise ValueError("Either serialize or deserialize must be specified.")
|
||||
|
||||
@ -8,16 +8,11 @@ import torch
|
||||
import vllm.envs as envs
|
||||
from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
|
||||
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.compilation.activation_quant_fusion import (
|
||||
FUSED_OPS,
|
||||
SILU_MUL_OP,
|
||||
ActivationQuantFusionPass,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.compilation.fusion import QUANT_OPS
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
|
||||
@ -107,10 +107,8 @@ class EPTestSettings:
|
||||
# NOTE: You can adjust tp_base locally to fit the model in GPU
|
||||
# The values displayed here are only a rough indicator of the size of the model
|
||||
|
||||
# yapf: disable
|
||||
TEST_MODELS = {
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(
|
||||
trust_remote_code=True),
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(trust_remote_code=True),
|
||||
"mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4),
|
||||
}
|
||||
|
||||
@ -192,22 +190,24 @@ def _compare_tp(
|
||||
]
|
||||
|
||||
try:
|
||||
compare_two_settings(model_name,
|
||||
ep_args,
|
||||
tp_args,
|
||||
ep_env,
|
||||
tp_env,
|
||||
method=method,
|
||||
max_wait_seconds=360)
|
||||
compare_two_settings(
|
||||
model_name,
|
||||
ep_args,
|
||||
tp_args,
|
||||
ep_env,
|
||||
tp_env,
|
||||
method=method,
|
||||
max_wait_seconds=360,
|
||||
)
|
||||
except Exception:
|
||||
raise
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "runner",
|
||||
"test_options"),
|
||||
("model_name", "parallel_setup", "distributed_backend", "runner", "test_options"),
|
||||
[
|
||||
params for model_name, settings in TEST_MODELS.items()
|
||||
params
|
||||
for model_name, settings in TEST_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
],
|
||||
)
|
||||
@ -220,10 +220,12 @@ def test_ep(
|
||||
test_options: EPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate")
|
||||
_compare_tp(
|
||||
model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
)
|
||||
|
||||
@ -100,7 +100,6 @@ class PPTestSettings:
|
||||
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
|
||||
# The values displayed here are only a rough indicator of the size of the model
|
||||
|
||||
# yapf: disable
|
||||
TEXT_GENERATION_MODELS = {
|
||||
# [Decoder-only]
|
||||
# Uses Llama
|
||||
@ -150,7 +149,9 @@ TEXT_GENERATION_MODELS = {
|
||||
"adept/persimmon-8b-chat": PPTestSettings.fast(),
|
||||
"microsoft/phi-2": PPTestSettings.fast(),
|
||||
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
|
||||
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501
|
||||
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
|
||||
multi_node_only=True, load_format="dummy"
|
||||
), # noqa: E501
|
||||
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
|
||||
"Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
|
||||
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
|
||||
@ -196,7 +197,6 @@ MULTIMODAL_MODELS = {
|
||||
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
|
||||
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
# NOTE: You can update this on your local machine to run specific tests
|
||||
TEST_MODELS = [
|
||||
|
||||
@ -287,29 +287,15 @@ def test_prefix_cache_default():
|
||||
assert not engine_args.enable_prefix_caching
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(("arg", "expected", "option"), [
|
||||
(None, None, "mm-processor-kwargs"),
|
||||
("{}", {}, "mm-processor-kwargs"),
|
||||
(
|
||||
'{"num_crops": 4}',
|
||||
{
|
||||
"num_crops": 4
|
||||
},
|
||||
"mm-processor-kwargs"
|
||||
),
|
||||
(
|
||||
'{"foo": {"bar": "baz"}}',
|
||||
{
|
||||
"foo":
|
||||
{
|
||||
"bar": "baz"
|
||||
}
|
||||
},
|
||||
"mm-processor-kwargs"
|
||||
),
|
||||
])
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize(
|
||||
("arg", "expected", "option"),
|
||||
[
|
||||
(None, None, "mm-processor-kwargs"),
|
||||
("{}", {}, "mm-processor-kwargs"),
|
||||
('{"num_crops": 4}', {"num_crops": 4}, "mm-processor-kwargs"),
|
||||
('{"foo": {"bar": "baz"}}', {"foo": {"bar": "baz"}}, "mm-processor-kwargs"),
|
||||
],
|
||||
)
|
||||
def test_composite_arg_parser(arg, expected, option):
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
if arg is None:
|
||||
@ -321,8 +307,7 @@ def test_composite_arg_parser(arg, expected, option):
|
||||
|
||||
def test_human_readable_model_len():
|
||||
# `exit_on_error` disabled to test invalid values below
|
||||
parser = EngineArgs.add_cli_args(
|
||||
FlexibleArgumentParser(exit_on_error=False))
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser(exit_on_error=False))
|
||||
|
||||
args = parser.parse_args([])
|
||||
assert args.max_model_len is None
|
||||
|
||||
@ -15,6 +15,7 @@ from vllm.assets.video import VideoAsset
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
_try_extract_ast,
|
||||
apply_mistral_chat_template,
|
||||
load_chat_template,
|
||||
parse_chat_messages,
|
||||
parse_chat_messages_futures,
|
||||
@ -1855,17 +1856,17 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
|
||||
|
||||
# NOTE: Qwen2-Audio default chat template is specially defined inside
|
||||
# processor class instead of using `tokenizer_config.json`
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("model", "expected_format"),
|
||||
[(PHI3V_MODEL_ID, "string"),
|
||||
(QWEN2VL_MODEL_ID, "openai"),
|
||||
(QWEN25VL_MODEL_ID, "openai"),
|
||||
(ULTRAVOX_MODEL_ID, "string"),
|
||||
(QWEN2AUDIO_MODEL_ID, "openai"),
|
||||
(LLAMA_GUARD_MODEL_ID, "openai")],
|
||||
[
|
||||
(PHI3V_MODEL_ID, "string"),
|
||||
(QWEN2VL_MODEL_ID, "openai"),
|
||||
(QWEN25VL_MODEL_ID, "openai"),
|
||||
(ULTRAVOX_MODEL_ID, "string"),
|
||||
(QWEN2AUDIO_MODEL_ID, "openai"),
|
||||
(LLAMA_GUARD_MODEL_ID, "openai"),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@ -1879,7 +1880,8 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.skip_tokenizer_init,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype)
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
|
||||
tokenizer = get_tokenizer(
|
||||
model,
|
||||
@ -1911,18 +1913,18 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
assert resolved_format == expected_format
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("model", "expected_format"),
|
||||
[("Salesforce/blip2-opt-2.7b", "string"),
|
||||
("facebook/chameleon-7b", "string"),
|
||||
("deepseek-ai/deepseek-vl2-tiny", "string"),
|
||||
("adept/fuyu-8b", "string"),
|
||||
("google/paligemma-3b-mix-224", "string"),
|
||||
("Qwen/Qwen-VL", "string"),
|
||||
("Qwen/Qwen-VL-Chat", "string")],
|
||||
[
|
||||
("Salesforce/blip2-opt-2.7b", "string"),
|
||||
("facebook/chameleon-7b", "string"),
|
||||
("deepseek-ai/deepseek-vl2-tiny", "string"),
|
||||
("adept/fuyu-8b", "string"),
|
||||
("google/paligemma-3b-mix-224", "string"),
|
||||
("Qwen/Qwen-VL", "string"),
|
||||
("Qwen/Qwen-VL-Chat", "string"),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_resolve_content_format_fallbacks(model, expected_format):
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@ -1936,7 +1938,8 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.skip_tokenizer_init,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype)
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
|
||||
tokenizer = get_tokenizer(
|
||||
model_config.tokenizer,
|
||||
@ -1968,30 +1971,30 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
||||
assert resolved_format == expected_format
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("template_path", "expected_format"),
|
||||
[("template_alpaca.jinja", "string"),
|
||||
("template_baichuan.jinja", "string"),
|
||||
("template_chatglm.jinja", "string"),
|
||||
("template_chatglm2.jinja", "string"),
|
||||
("template_chatml.jinja", "string"),
|
||||
("template_dse_qwen2_vl.jinja", "openai"),
|
||||
("template_falcon_180b.jinja", "string"),
|
||||
("template_falcon.jinja", "string"),
|
||||
("template_inkbot.jinja", "string"),
|
||||
("template_teleflm.jinja", "string"),
|
||||
("template_vlm2vec_phi3v.jinja", "openai"),
|
||||
("template_vlm2vec_qwen2vl.jinja", "openai"),
|
||||
("tool_chat_template_granite_20b_fc.jinja", "string"),
|
||||
("tool_chat_template_hermes.jinja", "string"),
|
||||
("tool_chat_template_internlm2_tool.jinja", "string"),
|
||||
("tool_chat_template_llama3.1_json.jinja", "openai"),
|
||||
("tool_chat_template_llama3.2_json.jinja", "openai"),
|
||||
("tool_chat_template_mistral_parallel.jinja", "string"),
|
||||
("tool_chat_template_mistral.jinja", "string")],
|
||||
[
|
||||
("template_alpaca.jinja", "string"),
|
||||
("template_baichuan.jinja", "string"),
|
||||
("template_chatglm.jinja", "string"),
|
||||
("template_chatglm2.jinja", "string"),
|
||||
("template_chatml.jinja", "string"),
|
||||
("template_dse_qwen2_vl.jinja", "openai"),
|
||||
("template_falcon_180b.jinja", "string"),
|
||||
("template_falcon.jinja", "string"),
|
||||
("template_inkbot.jinja", "string"),
|
||||
("template_teleflm.jinja", "string"),
|
||||
("template_vlm2vec_phi3v.jinja", "openai"),
|
||||
("template_vlm2vec_qwen2vl.jinja", "openai"),
|
||||
("tool_chat_template_granite_20b_fc.jinja", "string"),
|
||||
("tool_chat_template_hermes.jinja", "string"),
|
||||
("tool_chat_template_internlm2_tool.jinja", "string"),
|
||||
("tool_chat_template_llama3.1_json.jinja", "openai"),
|
||||
("tool_chat_template_llama3.2_json.jinja", "openai"),
|
||||
("tool_chat_template_mistral_parallel.jinja", "string"),
|
||||
("tool_chat_template_mistral.jinja", "string"),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_resolve_content_format_examples(template_path, expected_format):
|
||||
model_config = ModelConfig(
|
||||
PHI3V_MODEL_ID, # Dummy
|
||||
@ -2024,40 +2027,34 @@ def test_resolve_content_format_examples(template_path, expected_format):
|
||||
assert resolved_format == expected_format
|
||||
|
||||
|
||||
def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
|
||||
mistral_tokenizer):
|
||||
messages = [{
|
||||
"role":
|
||||
"system",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": "You are a helpful assistant."
|
||||
}, {
|
||||
"type":
|
||||
"thinking",
|
||||
"closed":
|
||||
True,
|
||||
"thinking":
|
||||
"Only return the answer when you are confident."
|
||||
}]
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "What is 2+2?"
|
||||
}, {
|
||||
"role":
|
||||
"assistant",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": "Let me think about it."
|
||||
}, {
|
||||
"type": "thinking",
|
||||
"closed": True,
|
||||
"thinking": "2+2 = 4"
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "The answer is 4.",
|
||||
}],
|
||||
}]
|
||||
def test_parse_chat_messages_include_thinking_chunk(
|
||||
mistral_model_config, mistral_tokenizer
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{"type": "text", "text": "You are a helpful assistant."},
|
||||
{
|
||||
"type": "thinking",
|
||||
"closed": True,
|
||||
"thinking": "Only return the answer when you are confident.",
|
||||
},
|
||||
],
|
||||
},
|
||||
{"role": "user", "content": "What is 2+2?"},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{"type": "text", "text": "Let me think about it."},
|
||||
{"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The answer is 4.",
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
conversation_with_thinking, _, _ = parse_chat_messages(
|
||||
messages,
|
||||
@ -2066,122 +2063,105 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
|
||||
content_format="openai",
|
||||
)
|
||||
|
||||
expected_conversation = [{
|
||||
"role":
|
||||
"system",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": "You are a helpful assistant."
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "Only return the answer when you are confident."
|
||||
}],
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": "What is 2+2?"
|
||||
}],
|
||||
}, {
|
||||
"role":
|
||||
"assistant",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Let me think about it."
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "2+2 = 4"
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The answer is 4."
|
||||
},
|
||||
]
|
||||
}]
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{"type": "text", "text": "You are a helpful assistant."},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Only return the answer when you are confident.",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": "What is 2+2?"}],
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{"type": "text", "text": "Let me think about it."},
|
||||
{"type": "text", "text": "2+2 = 4"},
|
||||
{"type": "text", "text": "The answer is 4."},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
assert conversation_with_thinking == expected_conversation
|
||||
|
||||
|
||||
def test_apply_mistral_chat_template_thinking_chunk():
|
||||
# Moved import here to avoid yapf and isort conflicts
|
||||
from vllm.entrypoints.chat_utils import apply_mistral_chat_template
|
||||
messages = [{
|
||||
"role":
|
||||
"system",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": "You are a helpful assistant."
|
||||
}, {
|
||||
"type":
|
||||
"thinking",
|
||||
"closed":
|
||||
True,
|
||||
"thinking":
|
||||
"Only return the answer when you are confident."
|
||||
}]
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "What is 2+2?"
|
||||
}, {
|
||||
"role":
|
||||
"assistant",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": "Let me think about it."
|
||||
}, {
|
||||
"type": "thinking",
|
||||
"closed": True,
|
||||
"thinking": "2+2 = 4"
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "The answer is 4.",
|
||||
}],
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "Thanks, what is 3+3?"
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{"type": "text", "text": "You are a helpful assistant."},
|
||||
{
|
||||
"type": "thinking",
|
||||
"closed": True,
|
||||
"thinking": "Only return the answer when you are confident.",
|
||||
},
|
||||
],
|
||||
},
|
||||
{"role": "user", "content": "What is 2+2?"},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{"type": "text", "text": "Let me think about it."},
|
||||
{"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The answer is 4.",
|
||||
},
|
||||
],
|
||||
},
|
||||
{"role": "user", "content": "Thanks, what is 3+3?"},
|
||||
]
|
||||
|
||||
# TODO(Julien): upon model release change to a tokenizer already configured.
|
||||
# =================================================================
|
||||
mistral_tokenizer = MistralTokenizer.from_pretrained(
|
||||
"mistralai/Devstral-Small-2507")
|
||||
"mistralai/Devstral-Small-2507"
|
||||
)
|
||||
assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer)
|
||||
# Add think special tokens to the tokenizer
|
||||
mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo(
|
||||
rank=35, is_control=True, token_str=SpecialTokens.begin_think.value)
|
||||
rank=35, is_control=True, token_str=SpecialTokens.begin_think.value
|
||||
)
|
||||
mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo(
|
||||
rank=36, is_control=True, token_str=SpecialTokens.end_think.value)
|
||||
rank=36, is_control=True, token_str=SpecialTokens.end_think.value
|
||||
)
|
||||
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = {
|
||||
k: v
|
||||
for k, v in
|
||||
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
|
||||
for k, v in mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
|
||||
if v not in {35, 36}
|
||||
}
|
||||
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
|
||||
SpecialTokens.begin_think.value] = 35
|
||||
SpecialTokens.begin_think.value
|
||||
] = 35
|
||||
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
|
||||
SpecialTokens.end_think.value] = 36
|
||||
SpecialTokens.end_think.value
|
||||
] = 36
|
||||
mistral_tokenizer.instruct.BEGIN_THINK = 35
|
||||
mistral_tokenizer.instruct.END_THINK = 36
|
||||
# =================================================================
|
||||
|
||||
tokens_ids = apply_mistral_chat_template(mistral_tokenizer,
|
||||
messages,
|
||||
chat_template=None,
|
||||
tools=None)
|
||||
tokens_ids = apply_mistral_chat_template(
|
||||
mistral_tokenizer, messages, chat_template=None, tools=None
|
||||
)
|
||||
|
||||
string_tokens = mistral_tokenizer.mistral.decode(
|
||||
tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP)
|
||||
tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP
|
||||
)
|
||||
|
||||
expected_tokens = (
|
||||
r"<s>[SYSTEM_PROMPT]You are a helpful assistant.[THINK]Only return the"
|
||||
r" answer when you are confident.[/THINK][/SYSTEM_PROMPT]"
|
||||
r"[INST]What is 2+2?[/INST]"
|
||||
r"Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4.</s>"
|
||||
r"[INST]Thanks, what is 3+3?[/INST]")
|
||||
r"[INST]Thanks, what is 3+3?[/INST]"
|
||||
)
|
||||
|
||||
assert string_tokens == expected_tokens
|
||||
|
||||
@ -2192,37 +2172,32 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
|
||||
):
|
||||
audio_uuid = "abcd"
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {},
|
||||
"uuid": audio_uuid,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What does the audio say?"
|
||||
},
|
||||
],
|
||||
}],
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {},
|
||||
"uuid": audio_uuid,
|
||||
},
|
||||
{"type": "text", "text": "What does the audio say?"},
|
||||
],
|
||||
}
|
||||
],
|
||||
qwen2_audio_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
|
||||
}]
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
|
||||
}
|
||||
]
|
||||
_assert_mm_data_inputs(mm_data, {"audio": 1})
|
||||
_assert_mm_uuids(mm_uuids,
|
||||
1,
|
||||
modality="audio",
|
||||
expected_uuids=[audio_uuid])
|
||||
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -2232,34 +2207,29 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
|
||||
):
|
||||
audio_uuid = "abcd"
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {},
|
||||
"uuid": audio_uuid,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What does the audio say?"
|
||||
},
|
||||
],
|
||||
}],
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {},
|
||||
"uuid": audio_uuid,
|
||||
},
|
||||
{"type": "text", "text": "What does the audio say?"},
|
||||
],
|
||||
}
|
||||
],
|
||||
qwen2_audio_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
|
||||
}]
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
|
||||
}
|
||||
]
|
||||
_assert_mm_data_inputs(await mm_future, {"audio": 1})
|
||||
_assert_mm_uuids(mm_uuids,
|
||||
1,
|
||||
modality="audio",
|
||||
expected_uuids=[audio_uuid])
|
||||
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
|
||||
|
||||
@ -12,9 +12,6 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.config.lora import LoRAConfig
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.lora.layers import (
|
||||
BaseLayerWithLoRA,
|
||||
ColumnParallelLinearWithLoRA,
|
||||
@ -32,8 +29,6 @@ from vllm.lora.layers import (
|
||||
RowParallelLinearWithShardedLoRA,
|
||||
VocabParallelEmbeddingWithLoRA,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.punica_wrapper import get_punica_wrapper
|
||||
from vllm.model_executor.layers.linear import (
|
||||
|
||||
@ -17,8 +17,6 @@ import vllm.model_executor.model_loader.tensorizer
|
||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
|
||||
# yapf: disable
|
||||
from vllm.model_executor.model_loader.tensorizer import (
|
||||
TensorizerConfig,
|
||||
TensorSerializer,
|
||||
@ -29,8 +27,6 @@ from vllm.model_executor.model_loader.tensorizer import (
|
||||
from vllm.model_executor.model_loader.tensorizer_loader import (
|
||||
BLACKLISTED_TENSORIZER_ARGS,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.utils import PlaceholderModule
|
||||
|
||||
from .conftest import DummyExecutor, assert_from_collective_rpc
|
||||
|
||||
@ -45,18 +45,17 @@ from .vlm_utils.types import (
|
||||
if current_platform.is_rocm():
|
||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||
|
||||
# yapf: disable
|
||||
COMMON_BROADCAST_SETTINGS = {
|
||||
"test_type": VLMTestType.IMAGE,
|
||||
"dtype": "half",
|
||||
"max_tokens": 5,
|
||||
"tensor_parallel_size": 2,
|
||||
"hf_model_kwargs": {"device_map": "auto"},
|
||||
"image_size_factors": [(.25, 0.5, 1.0)],
|
||||
"image_size_factors": [(0.25, 0.5, 1.0)],
|
||||
"distributed_executor_backend": (
|
||||
"ray",
|
||||
"mp",
|
||||
)
|
||||
),
|
||||
}
|
||||
|
||||
### Test configuration for specific models
|
||||
@ -96,22 +95,20 @@ VLM_TEST_SETTINGS = {
|
||||
#### Core tests to always run in the CI
|
||||
"llava": VLMTestInfo(
|
||||
models=["llava-hf/llava-1.5-7b-hf"],
|
||||
test_type=(
|
||||
VLMTestType.EMBEDDING,
|
||||
VLMTestType.IMAGE,
|
||||
VLMTestType.CUSTOM_INPUTS
|
||||
),
|
||||
test_type=(VLMTestType.EMBEDDING, VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
|
||||
),
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)],
|
||||
custom_test_opts=[
|
||||
CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
|
||||
),
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)
|
||||
],
|
||||
# TODO: Revert to "auto" when CPU backend can use torch > 2.6
|
||||
dtype="bfloat16" if current_platform.is_cpu() else "auto",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
@ -120,27 +117,27 @@ VLM_TEST_SETTINGS = {
|
||||
models=["google/paligemma-3b-mix-224"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=identity,
|
||||
img_idx_to_prompt = lambda idx: "",
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
# Paligemma uses its own sample prompts because the default one fails
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "caption es",
|
||||
"cherry_blossom": "What is in the picture?",
|
||||
}),
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "caption es",
|
||||
"cherry_blossom": "What is in the picture?",
|
||||
}
|
||||
),
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
||||
dtype="bfloat16",
|
||||
marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501
|
||||
marks=[
|
||||
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
|
||||
], # noqa: E501
|
||||
),
|
||||
"qwen2_5_vl": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
test_type=(
|
||||
VLMTestType.IMAGE,
|
||||
VLMTestType.MULTI_IMAGE,
|
||||
VLMTestType.VIDEO
|
||||
),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@ -150,17 +147,13 @@ VLM_TEST_SETTINGS = {
|
||||
),
|
||||
"qwen2_5_omni": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-Omni-3B"],
|
||||
test_type=(
|
||||
VLMTestType.IMAGE,
|
||||
VLMTestType.MULTI_IMAGE,
|
||||
VLMTestType.VIDEO
|
||||
),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
num_logprobs= 6 if current_platform.is_cpu() else 5,
|
||||
num_logprobs=6 if current_platform.is_cpu() else 5,
|
||||
auto_cls=AutoModelForTextToWaveform,
|
||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
|
||||
@ -168,9 +161,9 @@ VLM_TEST_SETTINGS = {
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
"ultravox": VLMTestInfo(
|
||||
models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
|
||||
models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
|
||||
test_type=VLMTestType.AUDIO,
|
||||
prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||
prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||
audio_idx_to_prompt=lambda idx: "<|audio|>",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -184,9 +177,11 @@ VLM_TEST_SETTINGS = {
|
||||
"llava-onevision-transformers": VLMTestInfo(
|
||||
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
max_model_len=16384,
|
||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
|
||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||
), # noqa: E501
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
@ -201,7 +196,7 @@ VLM_TEST_SETTINGS = {
|
||||
"idefics3-transformers": VLMTestInfo(
|
||||
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
@ -217,8 +212,8 @@ VLM_TEST_SETTINGS = {
|
||||
"qwen2_5_vl-transformers": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@ -228,23 +223,24 @@ VLM_TEST_SETTINGS = {
|
||||
"model_impl": "transformers",
|
||||
},
|
||||
# FIXME: Investigate mrope issue
|
||||
marks=[large_gpu_mark(min_gb=32),
|
||||
pytest.mark.skip(reason="Mrope issue")],
|
||||
marks=[large_gpu_mark(min_gb=32), pytest.mark.skip(reason="Mrope issue")],
|
||||
),
|
||||
#### Extended model tests
|
||||
"aria": VLMTestInfo(
|
||||
models=["rhymes-ai/Aria"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<vlm_image>Please describe the image shortly.",
|
||||
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
|
||||
}),
|
||||
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<vlm_image>Please describe the image shortly.",
|
||||
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
|
||||
stop_str=["<|im_end|>"],
|
||||
image_size_factors=[(0.10, 0.15)],
|
||||
max_tokens=64,
|
||||
@ -253,11 +249,13 @@ VLM_TEST_SETTINGS = {
|
||||
"aya_vision": VLMTestInfo(
|
||||
models=["CohereForAI/aya-vision-8b"],
|
||||
test_type=(VLMTestType.IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||
}),
|
||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -267,11 +265,13 @@ VLM_TEST_SETTINGS = {
|
||||
"aya_vision-multi_image": VLMTestInfo(
|
||||
models=["CohereForAI/aya-vision-8b"],
|
||||
test_type=(VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||
}),
|
||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -297,27 +297,29 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
# For chameleon, we only compare the sequences
|
||||
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
||||
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
||||
vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
|
||||
hf_output_post_proc=lambda hf_output, model: hf_output[:2],
|
||||
comparator=check_outputs_equal,
|
||||
max_tokens=8,
|
||||
dtype="bfloat16",
|
||||
),
|
||||
"deepseek_vl_v2": VLMTestInfo(
|
||||
models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
|
||||
models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
|
||||
}),
|
||||
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
|
||||
}
|
||||
),
|
||||
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
|
||||
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
|
||||
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
|
||||
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
|
||||
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
|
||||
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
|
||||
),
|
||||
"fuyu": VLMTestInfo(
|
||||
models=["adept/fuyu-8b"],
|
||||
@ -336,11 +338,13 @@ VLM_TEST_SETTINGS = {
|
||||
"gemma3": VLMTestInfo(
|
||||
models=["google/gemma-3-4b-it"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
|
||||
}),
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -353,10 +357,12 @@ VLM_TEST_SETTINGS = {
|
||||
models=["zai-org/glm-4v-9b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
|
||||
}),
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
|
||||
}
|
||||
),
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||
@ -372,8 +378,8 @@ VLM_TEST_SETTINGS = {
|
||||
models=["zai-org/GLM-4.1V-9B-Thinking"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||
@ -390,23 +396,27 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.video_with_metadata_glm4_1v(),
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
)],
|
||||
custom_test_opts=[
|
||||
CustomTestOptions(
|
||||
inputs=custom_inputs.video_with_metadata_glm4_1v(),
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
)
|
||||
],
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"h2ovl": VLMTestInfo(
|
||||
models = [
|
||||
models=[
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
"h2oai/h2ovl-mississippi-2b",
|
||||
],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}),
|
||||
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||
max_model_len=8192,
|
||||
use_tokenizer_eos=True,
|
||||
@ -416,7 +426,7 @@ VLM_TEST_SETTINGS = {
|
||||
"idefics3": VLMTestInfo(
|
||||
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
@ -431,11 +441,13 @@ VLM_TEST_SETTINGS = {
|
||||
# "OpenGVLab/Mono-InternVL-2B",
|
||||
],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
use_tokenizer_eos=True,
|
||||
@ -446,7 +458,7 @@ VLM_TEST_SETTINGS = {
|
||||
"OpenGVLab/InternVL3-1B",
|
||||
],
|
||||
test_type=VLMTestType.VIDEO,
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<video>",
|
||||
max_model_len=8192,
|
||||
use_tokenizer_eos=True,
|
||||
@ -459,7 +471,7 @@ VLM_TEST_SETTINGS = {
|
||||
VLMTestType.MULTI_IMAGE,
|
||||
VLMTestType.VIDEO,
|
||||
),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
|
||||
video_idx_to_prompt=lambda idx: "<video>",
|
||||
max_model_len=8192,
|
||||
@ -469,7 +481,7 @@ VLM_TEST_SETTINGS = {
|
||||
"kimi_vl": VLMTestInfo(
|
||||
models=["moonshotai/Kimi-VL-A3B-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
|
||||
img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
@ -480,11 +492,11 @@ VLM_TEST_SETTINGS = {
|
||||
),
|
||||
"llama4": VLMTestInfo(
|
||||
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda _: "<|image|>",
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
distributed_executor_backend="mp",
|
||||
image_size_factors=[(.25, 0.5, 1.0)],
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
hf_model_kwargs={"device_map": "auto"},
|
||||
max_model_len=8192,
|
||||
max_num_seqs=4,
|
||||
@ -500,28 +512,34 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=10240,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
|
||||
),
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)],
|
||||
custom_test_opts=[
|
||||
CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
|
||||
),
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)
|
||||
],
|
||||
),
|
||||
"llava_onevision": VLMTestInfo(
|
||||
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
num_video_frames=16,
|
||||
max_model_len=16384,
|
||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
|
||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||
), # noqa: E501
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
|
||||
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
),
|
||||
limit_mm_per_prompt={"video": 4},
|
||||
)],
|
||||
custom_test_opts=[
|
||||
CustomTestOptions(
|
||||
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
|
||||
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
),
|
||||
limit_mm_per_prompt={"video": 4},
|
||||
)
|
||||
],
|
||||
),
|
||||
"llava_next_video": VLMTestInfo(
|
||||
models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
|
||||
@ -563,7 +581,9 @@ VLM_TEST_SETTINGS = {
|
||||
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
|
||||
["<|im_end|>", "<|endoftext|>"]
|
||||
), # noqa: E501
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
||||
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
|
||||
@ -576,13 +596,15 @@ VLM_TEST_SETTINGS = {
|
||||
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
|
||||
["<|im_end|>", "<|endoftext|>"]
|
||||
), # noqa: E501
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
||||
),
|
||||
"minimax_vl_01": VLMTestInfo(
|
||||
models=["MiniMaxAI/MiniMax-VL-01"],
|
||||
prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
|
||||
img_idx_to_prompt=lambda _: "<image>",
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
max_model_len=8192,
|
||||
@ -604,8 +626,8 @@ VLM_TEST_SETTINGS = {
|
||||
"ovis1_6-gemma2": VLMTestInfo(
|
||||
models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="half",
|
||||
@ -617,8 +639,8 @@ VLM_TEST_SETTINGS = {
|
||||
"ovis2": VLMTestInfo(
|
||||
models=["AIDC-AI/Ovis2-1B"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="half",
|
||||
@ -628,13 +650,9 @@ VLM_TEST_SETTINGS = {
|
||||
),
|
||||
"ovis2_5": VLMTestInfo(
|
||||
models=["AIDC-AI/Ovis2.5-2B"],
|
||||
test_type=(
|
||||
VLMTestType.IMAGE,
|
||||
VLMTestType.MULTI_IMAGE,
|
||||
VLMTestType.VIDEO
|
||||
),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<video>\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -646,7 +664,7 @@ VLM_TEST_SETTINGS = {
|
||||
"phi3v": VLMTestInfo(
|
||||
models=["microsoft/Phi-3.5-vision-instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -681,15 +699,11 @@ VLM_TEST_SETTINGS = {
|
||||
),
|
||||
"qwen2_vl": VLMTestInfo(
|
||||
models=["Qwen/Qwen2-VL-2B-Instruct"],
|
||||
test_type=(
|
||||
VLMTestType.IMAGE,
|
||||
VLMTestType.MULTI_IMAGE,
|
||||
VLMTestType.VIDEO
|
||||
),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@ -700,11 +714,13 @@ VLM_TEST_SETTINGS = {
|
||||
"skywork_r1v": VLMTestInfo(
|
||||
models=["Skywork/Skywork-R1V-38B"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}),
|
||||
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
use_tokenizer_eos=True,
|
||||
@ -737,9 +753,9 @@ VLM_TEST_SETTINGS = {
|
||||
VLMTestType.MULTI_IMAGE,
|
||||
VLMTestType.VIDEO,
|
||||
),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@ -752,11 +768,11 @@ VLM_TEST_SETTINGS = {
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
||||
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
||||
vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
|
||||
hf_output_post_proc=lambda hf_output, model: hf_output[:2],
|
||||
comparator=check_outputs_equal,
|
||||
marks=multi_gpu_marks(num_gpus=2),
|
||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||
**COMMON_BROADCAST_SETTINGS, # type: ignore
|
||||
),
|
||||
"llava-broadcast": VLMTestInfo(
|
||||
models=["llava-hf/llava-1.5-7b-hf"],
|
||||
@ -765,7 +781,7 @@ VLM_TEST_SETTINGS = {
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
marks=multi_gpu_marks(num_gpus=2),
|
||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||
**COMMON_BROADCAST_SETTINGS, # type: ignore
|
||||
),
|
||||
"llava_next-broadcast": VLMTestInfo(
|
||||
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||
@ -774,12 +790,12 @@ VLM_TEST_SETTINGS = {
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
marks=multi_gpu_marks(num_gpus=2),
|
||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||
**COMMON_BROADCAST_SETTINGS, # type: ignore
|
||||
),
|
||||
### Custom input edge-cases for specific models
|
||||
"intern_vl-diff-patches": VLMTestInfo(
|
||||
models=["OpenGVLab/InternVL2-2B"],
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
max_model_len=4096,
|
||||
use_tokenizer_eos=True,
|
||||
@ -788,7 +804,8 @@ VLM_TEST_SETTINGS = {
|
||||
CustomTestOptions(
|
||||
inputs=inp,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
) for inp in custom_inputs.different_patch_input_cases_internvl()
|
||||
)
|
||||
for inp in custom_inputs.different_patch_input_cases_internvl()
|
||||
],
|
||||
),
|
||||
"llava_onevision-multiple-images": VLMTestInfo(
|
||||
@ -797,14 +814,18 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=16384,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
|
||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||
), # noqa: E501
|
||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
),
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)],
|
||||
custom_test_opts=[
|
||||
CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
),
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)
|
||||
],
|
||||
),
|
||||
# regression test for https://github.com/vllm-project/vllm/issues/15122
|
||||
"qwen2_5_vl-windows-attention": VLMTestInfo(
|
||||
@ -814,13 +835,14 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)],
|
||||
custom_test_opts=[
|
||||
CustomTestOptions(
|
||||
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
],
|
||||
),
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def _mark_splits(
|
||||
|
||||
@ -114,7 +114,6 @@ def get_parametrized_options(
|
||||
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
|
||||
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
|
||||
|
||||
# yapf: disable
|
||||
# Wrap all model cases in a pytest parameter & pass marks through
|
||||
return [
|
||||
pytest.param(
|
||||
@ -122,10 +121,10 @@ def get_parametrized_options(
|
||||
ExpandableVLMTestArgs(
|
||||
**{k: v for k, v in zip(iter_kwargs.keys(), case)}
|
||||
),
|
||||
marks=test_info.marks if test_info.marks is not None else []
|
||||
) for case in list(itertools.product(*iter_kwargs.values()))
|
||||
marks=test_info.marks if test_info.marks is not None else [],
|
||||
)
|
||||
for case in list(itertools.product(*iter_kwargs.values()))
|
||||
]
|
||||
# yapf: enable
|
||||
|
||||
# Get a list per model type, where each entry contains a tuple of all of
|
||||
# that model type's cases, then flatten them into the top level so that
|
||||
|
||||
@ -418,7 +418,6 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
@ -426,7 +425,6 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
image_to_pixel_values_h2ovl,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values_h2ovl(
|
||||
|
||||
@ -33,24 +33,26 @@ TEST_IMG_PLACEHOLDER = "<vlm_image>"
|
||||
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
|
||||
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
|
||||
|
||||
# yapf: disable
|
||||
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
|
||||
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
|
||||
})
|
||||
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts({
|
||||
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
|
||||
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
|
||||
})
|
||||
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
|
||||
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
|
||||
}
|
||||
)
|
||||
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
|
||||
{
|
||||
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
|
||||
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
|
||||
}
|
||||
)
|
||||
|
||||
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
|
||||
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
|
||||
|
||||
|
||||
IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
|
||||
EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
|
||||
IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
|
||||
EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
|
||||
RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
|
||||
# yapf: enable
|
||||
|
||||
|
||||
class PromptWithMultiModalInput(NamedTuple):
|
||||
|
||||
@ -322,80 +322,81 @@ def _test_processing_correctness_one(
|
||||
)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize("model_id", [
|
||||
"rhymes-ai/Aria",
|
||||
"CohereForAI/aya-vision-8b",
|
||||
"Salesforce/blip2-opt-2.7b",
|
||||
"facebook/chameleon-7b",
|
||||
"CohereLabs/command-a-vision-07-2025",
|
||||
"deepseek-ai/deepseek-vl2-tiny",
|
||||
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
|
||||
"adept/fuyu-8b",
|
||||
"google/gemma-3-4b-it",
|
||||
"google/gemma-3n-E2B-it",
|
||||
"zai-org/glm-4v-9b",
|
||||
"zai-org/GLM-4.1V-9B-Thinking",
|
||||
"zai-org/GLM-4.5V",
|
||||
"ibm-granite/granite-speech-3.3-2b",
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
|
||||
"HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
"internlm/Intern-S1",
|
||||
"OpenGVLab/InternVL2-1B",
|
||||
"OpenGVLab/InternVL3-1B",
|
||||
"OpenGVLab/InternVL3_5-1B",
|
||||
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
|
||||
"OpenGVLab/InternVL3_5-30B-A3B",
|
||||
"Kwai-Keye/Keye-VL-8B-Preview",
|
||||
"Kwai-Keye/Keye-VL-1_5-8B",
|
||||
"moonshotai/Kimi-VL-A3B-Instruct",
|
||||
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"llava-hf/llava-1.5-7b-hf",
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
"llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||
"TIGER-Lab/Mantis-8B-siglip-llama3",
|
||||
"mispeech/midashenglm-7b",
|
||||
"openbmb/MiniCPM-Llama3-V-2_5",
|
||||
"openbmb/MiniCPM-o-2_6",
|
||||
"openbmb/MiniCPM-V-2_6",
|
||||
"MiniMaxAI/MiniMax-VL-01",
|
||||
"allenai/Molmo-7B-D-0924",
|
||||
"allenai/Molmo-7B-O-0924",
|
||||
"nvidia/NVLM-D-72B",
|
||||
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
|
||||
"AIDC-AI/Ovis1.6-Gemma2-9B",
|
||||
"AIDC-AI/Ovis1.6-Llama3.2-3B",
|
||||
"AIDC-AI/Ovis2-1B",
|
||||
"AIDC-AI/Ovis2.5-2B",
|
||||
"google/paligemma-3b-mix-224",
|
||||
"google/paligemma2-3b-ft-docci-448",
|
||||
"microsoft/Phi-3.5-vision-instruct",
|
||||
"microsoft/Phi-4-multimodal-instruct",
|
||||
"mistralai/Pixtral-12B-2409",
|
||||
"mistral-community/pixtral-12b",
|
||||
"Qwen/Qwen-VL-Chat",
|
||||
"Qwen/Qwen2-VL-2B-Instruct",
|
||||
"Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
"Qwen/Qwen2-Audio-7B-Instruct",
|
||||
"Qwen/Qwen2.5-Omni-3B",
|
||||
"Qwen/Qwen3-VL-4B-Instruct",
|
||||
"Qwen/Qwen3-VL-30B-A3B-Instruct",
|
||||
"YannQi/R-4B",
|
||||
"Skywork/Skywork-R1V-38B",
|
||||
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
||||
"stepfun-ai/step3",
|
||||
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
|
||||
"openai/whisper-large-v3",
|
||||
"omni-research/Tarsier-7b",
|
||||
"omni-research/Tarsier2-Recap-7b",
|
||||
"mistralai/Voxtral-Mini-3B-2507",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"model_id",
|
||||
[
|
||||
"rhymes-ai/Aria",
|
||||
"CohereForAI/aya-vision-8b",
|
||||
"Salesforce/blip2-opt-2.7b",
|
||||
"facebook/chameleon-7b",
|
||||
"CohereLabs/command-a-vision-07-2025",
|
||||
"deepseek-ai/deepseek-vl2-tiny",
|
||||
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
|
||||
"adept/fuyu-8b",
|
||||
"google/gemma-3-4b-it",
|
||||
"google/gemma-3n-E2B-it",
|
||||
"zai-org/glm-4v-9b",
|
||||
"zai-org/GLM-4.1V-9B-Thinking",
|
||||
"zai-org/GLM-4.5V",
|
||||
"ibm-granite/granite-speech-3.3-2b",
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
|
||||
"HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
"internlm/Intern-S1",
|
||||
"OpenGVLab/InternVL2-1B",
|
||||
"OpenGVLab/InternVL3-1B",
|
||||
"OpenGVLab/InternVL3_5-1B",
|
||||
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
|
||||
"OpenGVLab/InternVL3_5-30B-A3B",
|
||||
"Kwai-Keye/Keye-VL-8B-Preview",
|
||||
"Kwai-Keye/Keye-VL-1_5-8B",
|
||||
"moonshotai/Kimi-VL-A3B-Instruct",
|
||||
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"llava-hf/llava-1.5-7b-hf",
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
"llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||
"TIGER-Lab/Mantis-8B-siglip-llama3",
|
||||
"mispeech/midashenglm-7b",
|
||||
"openbmb/MiniCPM-Llama3-V-2_5",
|
||||
"openbmb/MiniCPM-o-2_6",
|
||||
"openbmb/MiniCPM-V-2_6",
|
||||
"MiniMaxAI/MiniMax-VL-01",
|
||||
"allenai/Molmo-7B-D-0924",
|
||||
"allenai/Molmo-7B-O-0924",
|
||||
"nvidia/NVLM-D-72B",
|
||||
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
|
||||
"AIDC-AI/Ovis1.6-Gemma2-9B",
|
||||
"AIDC-AI/Ovis1.6-Llama3.2-3B",
|
||||
"AIDC-AI/Ovis2-1B",
|
||||
"AIDC-AI/Ovis2.5-2B",
|
||||
"google/paligemma-3b-mix-224",
|
||||
"google/paligemma2-3b-ft-docci-448",
|
||||
"microsoft/Phi-3.5-vision-instruct",
|
||||
"microsoft/Phi-4-multimodal-instruct",
|
||||
"mistralai/Pixtral-12B-2409",
|
||||
"mistral-community/pixtral-12b",
|
||||
"Qwen/Qwen-VL-Chat",
|
||||
"Qwen/Qwen2-VL-2B-Instruct",
|
||||
"Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
"Qwen/Qwen2-Audio-7B-Instruct",
|
||||
"Qwen/Qwen2.5-Omni-3B",
|
||||
"Qwen/Qwen3-VL-4B-Instruct",
|
||||
"Qwen/Qwen3-VL-30B-A3B-Instruct",
|
||||
"YannQi/R-4B",
|
||||
"Skywork/Skywork-R1V-38B",
|
||||
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
||||
"stepfun-ai/step3",
|
||||
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
|
||||
"openai/whisper-large-v3",
|
||||
"omni-research/Tarsier-7b",
|
||||
"omni-research/Tarsier2-Recap-7b",
|
||||
"mistralai/Voxtral-Mini-3B-2507",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
||||
@pytest.mark.parametrize("num_batches", [32])
|
||||
@pytest.mark.parametrize("simplify_rate", [1.0])
|
||||
# yapf: enable
|
||||
def test_processing_correctness(
|
||||
model_id: str,
|
||||
hit_rate: float,
|
||||
|
||||
@ -12,7 +12,6 @@ from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("mm_processor_kwargs", "expected_toks_per_img"),
|
||||
[
|
||||
@ -20,7 +19,6 @@ from ...utils import build_model_context
|
||||
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
|
||||
@ -11,7 +11,6 @@ from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("mm_processor_kwargs", "expected_toks_per_img"),
|
||||
[
|
||||
@ -21,7 +20,6 @@ from ...utils import build_model_context
|
||||
({}, 757),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
|
||||
@ -11,7 +11,6 @@ from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("mm_processor_kwargs", "expected_toks_per_img"),
|
||||
[
|
||||
@ -21,7 +20,6 @@ from ...utils import build_model_context
|
||||
({}, 9585),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
|
||||
@ -10,7 +10,6 @@ from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
|
||||
[
|
||||
@ -18,7 +17,6 @@ from ...utils import build_model_context
|
||||
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
|
||||
@ -12,7 +12,6 @@ from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("mm_processor_kwargs", "expected_toks_per_img"),
|
||||
[
|
||||
@ -20,7 +19,6 @@ from ...utils import build_model_context
|
||||
({"max_image_size": {"longest_edge": 768}}, 405),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
|
||||
@ -7,9 +7,7 @@ from vllm.config import ModelConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
def test_multimodal_processor(model_id):
|
||||
model_config = ModelConfig(
|
||||
model=model_id,
|
||||
@ -18,9 +16,9 @@ def test_multimodal_processor(model_id):
|
||||
|
||||
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||
|
||||
image_pil = ImageAsset('cherry_blossom').pil_image
|
||||
image_pil = ImageAsset("cherry_blossom").pil_image
|
||||
mm_data = {"image": image_pil}
|
||||
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
str_processed_inputs = mm_processor.apply(
|
||||
prompt=str_prompt,
|
||||
mm_data=mm_data,
|
||||
@ -28,8 +26,23 @@ def test_multimodal_processor(model_id):
|
||||
)
|
||||
|
||||
ids_prompt = [
|
||||
151644, 872, 220, 151646, 198, 3838, 374, 279, 2213, 315, 419, 2168,
|
||||
30, 151645, 151644, 77091, 198
|
||||
151644,
|
||||
872,
|
||||
220,
|
||||
151646,
|
||||
198,
|
||||
3838,
|
||||
374,
|
||||
279,
|
||||
2213,
|
||||
315,
|
||||
419,
|
||||
2168,
|
||||
30,
|
||||
151645,
|
||||
151644,
|
||||
77091,
|
||||
198,
|
||||
]
|
||||
ids_processed_inputs = mm_processor.apply(
|
||||
prompt=ids_prompt,
|
||||
@ -37,5 +50,7 @@ def test_multimodal_processor(model_id):
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
|
||||
assert (str_processed_inputs["prompt_token_ids"]
|
||||
== ids_processed_inputs["prompt_token_ids"])
|
||||
assert (
|
||||
str_processed_inputs["prompt_token_ids"]
|
||||
== ids_processed_inputs["prompt_token_ids"]
|
||||
)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -71,25 +71,27 @@ def _dummy_items(
|
||||
)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("item", "expected_size"),
|
||||
[
|
||||
(_dummy_item("a", {"a1": 100}), 100),
|
||||
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
|
||||
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
|
||||
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460), # noqa: E501
|
||||
(
|
||||
_dummy_items(
|
||||
{"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}
|
||||
).get_data(),
|
||||
460,
|
||||
), # noqa: E501
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_cache_item_size(item, expected_size):
|
||||
cache = MultiModalCache.get_lru_cache(2048, type(item))
|
||||
|
||||
cache[""] = item
|
||||
assert cache.currsize == expected_size
|
||||
|
||||
prompt_update = PromptInsertion("dummy", "target", "insertion") \
|
||||
.resolve(0)
|
||||
prompt_update = PromptInsertion("dummy", "target", "insertion").resolve(0)
|
||||
|
||||
cache[""] = MultiModalProcessorCacheItem(item, [prompt_update])
|
||||
assert cache.currsize == expected_size
|
||||
@ -106,9 +108,9 @@ def _create_vllm_config(
|
||||
return VllmConfig(
|
||||
model_config=ModelConfig(
|
||||
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||
mm_processor_cache_gb=mm_processor_cache_gb),
|
||||
parallel_config=ParallelConfig(
|
||||
data_parallel_size=1 if enable_ipc else 2),
|
||||
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||
),
|
||||
parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
|
||||
)
|
||||
|
||||
|
||||
@ -124,11 +126,9 @@ def _compare_caches(
|
||||
seed: int = 0,
|
||||
):
|
||||
cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY)
|
||||
cache_0_p1 = engine_receiver_cache_from_config(config_0,
|
||||
MULTIMODAL_REGISTRY)
|
||||
cache_0_p1 = engine_receiver_cache_from_config(config_0, MULTIMODAL_REGISTRY)
|
||||
cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY)
|
||||
cache_1_p1 = engine_receiver_cache_from_config(config_1,
|
||||
MULTIMODAL_REGISTRY)
|
||||
cache_1_p1 = engine_receiver_cache_from_config(config_1, MULTIMODAL_REGISTRY)
|
||||
|
||||
cache_size_gb = max(
|
||||
config_0.model_config.multimodal_config.mm_processor_cache_gb,
|
||||
@ -142,8 +142,7 @@ def _compare_caches(
|
||||
for _ in range(int(item_capacity / hit_rate))
|
||||
]
|
||||
all_hashes = [
|
||||
MultiModalHasher.hash_kwargs(item=item.get_data())
|
||||
for item in all_items
|
||||
MultiModalHasher.hash_kwargs(item=item.get_data()) for item in all_items
|
||||
]
|
||||
|
||||
# Should not be used since there is nothing to convert to text
|
||||
@ -162,7 +161,8 @@ def _compare_caches(
|
||||
for _ in range(is_cached_calls_per_iter):
|
||||
cache_0_p0.is_cached(selected_hashes)
|
||||
cache_0_p0_out = [
|
||||
item for item, _ in cache_0_p0.get_and_update(
|
||||
item
|
||||
for item, _ in cache_0_p0.get_and_update(
|
||||
[(item, prompt_update.content) for item in selected_items],
|
||||
selected_hashes,
|
||||
)
|
||||
@ -174,7 +174,8 @@ def _compare_caches(
|
||||
for _ in range(is_cached_calls_per_iter):
|
||||
cache_1_p0.is_cached(selected_hashes)
|
||||
cache_1_p0_out = [
|
||||
item for item, _ in cache_1_p0.get_and_update(
|
||||
item
|
||||
for item, _ in cache_1_p0.get_and_update(
|
||||
[(item, prompt_update.content) for item in selected_items],
|
||||
selected_hashes,
|
||||
)
|
||||
@ -183,14 +184,12 @@ def _compare_caches(
|
||||
if cache_0_p1 is None:
|
||||
cache_0_p1_out = cache_0_p0_out
|
||||
else:
|
||||
cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out,
|
||||
selected_hashes)
|
||||
cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out, selected_hashes)
|
||||
|
||||
if cache_1_p1 is None:
|
||||
cache_1_p1_out = cache_1_p0_out
|
||||
else:
|
||||
cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out,
|
||||
selected_hashes)
|
||||
cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out, selected_hashes)
|
||||
|
||||
assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}"
|
||||
|
||||
|
||||
@ -9,9 +9,6 @@ import pytest
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.multimodal.processing import (
|
||||
InputProcessingContext,
|
||||
PlaceholderFeaturesInfo,
|
||||
@ -24,8 +21,6 @@ from vllm.multimodal.processing import (
|
||||
iter_token_matches,
|
||||
replace_token_matches,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.multimodal.profiling import MultiModalProfiler
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
@ -34,7 +29,6 @@ from .utils import random_image
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("token_ids", "match_ids", "expected"),
|
||||
[
|
||||
@ -44,34 +38,34 @@ pytestmark = pytest.mark.cpu_test
|
||||
[32000, 32000, 32000],
|
||||
[32000],
|
||||
[
|
||||
{ "start_idx": 0, "end_idx": 1 },
|
||||
{ "start_idx": 1, "end_idx": 2 },
|
||||
{ "start_idx": 2, "end_idx": 3 },
|
||||
{"start_idx": 0, "end_idx": 1},
|
||||
{"start_idx": 1, "end_idx": 2},
|
||||
{"start_idx": 2, "end_idx": 3},
|
||||
],
|
||||
),
|
||||
(
|
||||
[32000, 32000, 32000],
|
||||
[32000, 32000],
|
||||
[{ "start_idx": 0, "end_idx": 2 }],
|
||||
[{"start_idx": 0, "end_idx": 2}],
|
||||
),
|
||||
(
|
||||
[32000, 32000, 32000],
|
||||
[32000, 32000, 32000],
|
||||
[{ "start_idx": 0, "end_idx": 3 }],
|
||||
[{"start_idx": 0, "end_idx": 3}],
|
||||
),
|
||||
(
|
||||
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||
[28747, 32000],
|
||||
[
|
||||
{ "start_idx": 1, "end_idx": 3 },
|
||||
{ "start_idx": 6, "end_idx": 8 },
|
||||
{"start_idx": 1, "end_idx": 3},
|
||||
{"start_idx": 6, "end_idx": 8},
|
||||
],
|
||||
),
|
||||
(
|
||||
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||
[28747, 32000, 32000, 32000],
|
||||
[
|
||||
{ "start_idx": 1, "end_idx": 5 },
|
||||
{"start_idx": 1, "end_idx": 5},
|
||||
],
|
||||
),
|
||||
(
|
||||
@ -82,14 +76,13 @@ pytestmark = pytest.mark.cpu_test
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("start_idx", [0, 4, 8])
|
||||
# yapf: enable
|
||||
def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
|
||||
result = list(iter_token_matches(token_ids, match_ids,
|
||||
start_idx=start_idx))
|
||||
result = list(iter_token_matches(token_ids, match_ids, start_idx=start_idx))
|
||||
|
||||
# Manually constructed results
|
||||
assert [item._asdict() for item in result
|
||||
] == [item for item in expected if item["start_idx"] >= start_idx]
|
||||
assert [item._asdict() for item in result] == [
|
||||
item for item in expected if item["start_idx"] >= start_idx
|
||||
]
|
||||
|
||||
# Invariants
|
||||
match_lens = [end - start for start, end in result]
|
||||
@ -97,7 +90,6 @@ def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
|
||||
assert all(match_len == len(match_ids) for match_len in match_lens)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("token_ids", "match_ids", "new_ids", "expected"),
|
||||
[
|
||||
@ -141,7 +133,6 @@ def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
|
||||
),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
||||
result = replace_token_matches(token_ids, match_ids, new_ids)
|
||||
|
||||
@ -149,7 +140,6 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
||||
assert result == expected
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("prompt", "target_by_key", "expected_by_key"),
|
||||
[
|
||||
@ -166,11 +156,11 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
||||
"pattern_1": [],
|
||||
"pattern_2": [],
|
||||
"pattern_3": [
|
||||
{ "start_idx": 0, "end_idx": 0 },
|
||||
{"start_idx": 0, "end_idx": 0},
|
||||
],
|
||||
"pattern_4": [],
|
||||
"pattern_5": [
|
||||
{ "start_idx": 0, "end_idx": 0 },
|
||||
{"start_idx": 0, "end_idx": 0},
|
||||
],
|
||||
},
|
||||
),
|
||||
@ -186,26 +176,26 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
||||
},
|
||||
{
|
||||
"pattern_1": [
|
||||
{ "start_idx": 0, "end_idx": 1 },
|
||||
{ "start_idx": 1, "end_idx": 2 },
|
||||
{ "start_idx": 2, "end_idx": 3 },
|
||||
{ "start_idx": 3, "end_idx": 4 },
|
||||
{"start_idx": 0, "end_idx": 1},
|
||||
{"start_idx": 1, "end_idx": 2},
|
||||
{"start_idx": 2, "end_idx": 3},
|
||||
{"start_idx": 3, "end_idx": 4},
|
||||
],
|
||||
"pattern_2": [
|
||||
{ "start_idx": 0, "end_idx": 2 },
|
||||
{ "start_idx": 2, "end_idx": 4 },
|
||||
{"start_idx": 0, "end_idx": 2},
|
||||
{"start_idx": 2, "end_idx": 4},
|
||||
],
|
||||
"pattern_3": [
|
||||
{ "start_idx": 0, "end_idx": 3 },
|
||||
{"start_idx": 0, "end_idx": 3},
|
||||
],
|
||||
"pattern_4": [
|
||||
{ "start_idx": 0, "end_idx": 0 },
|
||||
{"start_idx": 0, "end_idx": 0},
|
||||
],
|
||||
"pattern_5": [
|
||||
{ "start_idx": 1, "end_idx": 1 },
|
||||
{"start_idx": 1, "end_idx": 1},
|
||||
],
|
||||
"pattern_6": [
|
||||
{ "start_idx": 4, "end_idx": 4 },
|
||||
{"start_idx": 4, "end_idx": 4},
|
||||
],
|
||||
},
|
||||
),
|
||||
@ -221,26 +211,25 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
||||
},
|
||||
{
|
||||
"pattern_1": [
|
||||
{ "start_idx": 1, "end_idx": 3 },
|
||||
{ "start_idx": 6, "end_idx": 8 },
|
||||
{"start_idx": 1, "end_idx": 3},
|
||||
{"start_idx": 6, "end_idx": 8},
|
||||
],
|
||||
"pattern_2": [
|
||||
{ "start_idx": 1, "end_idx": 5 },
|
||||
{"start_idx": 1, "end_idx": 5},
|
||||
],
|
||||
"pattern_3": [],
|
||||
"pattern_4": [
|
||||
{ "start_idx": 0, "end_idx": 0 },
|
||||
{"start_idx": 0, "end_idx": 0},
|
||||
],
|
||||
"pattern_5": [],
|
||||
"pattern_6": [
|
||||
{ "start_idx": 10, "end_idx": 10 },
|
||||
{"start_idx": 10, "end_idx": 10},
|
||||
],
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
|
||||
# yapf: enable
|
||||
def test_find_token_matches(
|
||||
prompt,
|
||||
target_by_key,
|
||||
@ -272,7 +261,6 @@ def test_find_token_matches(
|
||||
} == expected_by_key
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("prompt", "target_by_key", "expected_by_key"),
|
||||
[
|
||||
@ -288,16 +276,16 @@ def test_find_token_matches(
|
||||
"pattern_5": PromptIndexTargets.end(),
|
||||
},
|
||||
{
|
||||
"pattern_1": [{ "start_idx": 0, "end_idx": 0 }],
|
||||
"pattern_1": [{"start_idx": 0, "end_idx": 0}],
|
||||
"pattern_2": [],
|
||||
"pattern_3": [
|
||||
{ "start_idx": 0, "end_idx": 0 },
|
||||
{"start_idx": 0, "end_idx": 0},
|
||||
],
|
||||
"pattern_4": [],
|
||||
"pattern_5": [
|
||||
{ "start_idx": 0, "end_idx": 0 },
|
||||
{"start_idx": 0, "end_idx": 0},
|
||||
],
|
||||
}
|
||||
},
|
||||
),
|
||||
(
|
||||
"<image><image><image><image>",
|
||||
@ -311,26 +299,26 @@ def test_find_token_matches(
|
||||
},
|
||||
{
|
||||
"pattern_1": [
|
||||
{ "start_idx": 0, "end_idx": 7 },
|
||||
{ "start_idx": 7, "end_idx": 14 },
|
||||
{ "start_idx": 14, "end_idx": 21 },
|
||||
{ "start_idx": 21, "end_idx": 28 },
|
||||
{"start_idx": 0, "end_idx": 7},
|
||||
{"start_idx": 7, "end_idx": 14},
|
||||
{"start_idx": 14, "end_idx": 21},
|
||||
{"start_idx": 21, "end_idx": 28},
|
||||
],
|
||||
"pattern_2": [
|
||||
{ "start_idx": 0, "end_idx": 14 },
|
||||
{ "start_idx": 14, "end_idx": 28 },
|
||||
{"start_idx": 0, "end_idx": 14},
|
||||
{"start_idx": 14, "end_idx": 28},
|
||||
],
|
||||
"pattern_3": [
|
||||
{ "start_idx": 0, "end_idx": 21 },
|
||||
{"start_idx": 0, "end_idx": 21},
|
||||
],
|
||||
"pattern_4": [
|
||||
{ "start_idx": 0, "end_idx": 0 },
|
||||
{"start_idx": 0, "end_idx": 0},
|
||||
],
|
||||
"pattern_5": [
|
||||
{ "start_idx": 7, "end_idx": 7 },
|
||||
{"start_idx": 7, "end_idx": 7},
|
||||
],
|
||||
"pattern_6": [
|
||||
{ "start_idx": 28, "end_idx": 28 },
|
||||
{"start_idx": 28, "end_idx": 28},
|
||||
],
|
||||
},
|
||||
),
|
||||
@ -346,21 +334,21 @@ def test_find_token_matches(
|
||||
},
|
||||
{
|
||||
"pattern_1": [
|
||||
{ "start_idx": 0, "end_idx": 13 },
|
||||
{ "start_idx": 27, "end_idx": 40 },
|
||||
{"start_idx": 0, "end_idx": 13},
|
||||
{"start_idx": 27, "end_idx": 40},
|
||||
],
|
||||
"pattern_2": [
|
||||
{ "start_idx": 0, "end_idx": 27 },
|
||||
{"start_idx": 0, "end_idx": 27},
|
||||
],
|
||||
"pattern_3": [],
|
||||
"pattern_4": [
|
||||
{ "start_idx": 0, "end_idx": 0 },
|
||||
{"start_idx": 0, "end_idx": 0},
|
||||
],
|
||||
"pattern_5": [
|
||||
{ "start_idx": 13, "end_idx": 13 },
|
||||
{"start_idx": 13, "end_idx": 13},
|
||||
],
|
||||
"pattern_6": [
|
||||
{ "start_idx": 48, "end_idx": 48 },
|
||||
{"start_idx": 48, "end_idx": 48},
|
||||
],
|
||||
},
|
||||
),
|
||||
@ -374,22 +362,21 @@ def test_find_token_matches(
|
||||
},
|
||||
{
|
||||
"pattern_1": [
|
||||
{ "start_idx": 0, "end_idx": 9 },
|
||||
{ "start_idx": 16, "end_idx": 25 },
|
||||
{"start_idx": 0, "end_idx": 9},
|
||||
{"start_idx": 16, "end_idx": 25},
|
||||
],
|
||||
"pattern_2": [
|
||||
{ "start_idx": 0, "end_idx": 16 },
|
||||
{ "start_idx": 16, "end_idx": 32 },
|
||||
{"start_idx": 0, "end_idx": 16},
|
||||
{"start_idx": 16, "end_idx": 32},
|
||||
],
|
||||
"pattern_3": [
|
||||
{ "start_idx": 0, "end_idx": 25 },
|
||||
{"start_idx": 0, "end_idx": 25},
|
||||
],
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
|
||||
# yapf: enable
|
||||
def test_find_text_matches(
|
||||
prompt,
|
||||
target_by_key,
|
||||
@ -421,7 +408,6 @@ def test_find_text_matches(
|
||||
} == expected_by_key
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
|
||||
[
|
||||
@ -549,9 +535,8 @@ def test_find_text_matches(
|
||||
},
|
||||
},
|
||||
),
|
||||
]
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_find_update_text(
|
||||
prompt,
|
||||
target_by_key,
|
||||
@ -562,13 +547,15 @@ def test_find_update_text(
|
||||
mock_tokenizer = cast(AnyTokenizer, object())
|
||||
|
||||
for (
|
||||
update_type,
|
||||
expected_by_mm_count,
|
||||
update_type,
|
||||
expected_by_mm_count,
|
||||
) in expected_by_update_type_mm_count.items():
|
||||
for mm_count, expected in expected_by_mm_count.items():
|
||||
mm_prompt_updates = {
|
||||
key: [[update_type(key, target, repl_by_key[key]).resolve(i)]
|
||||
for i in range(mm_count)]
|
||||
key: [
|
||||
[update_type(key, target, repl_by_key[key]).resolve(i)]
|
||||
for i in range(mm_count)
|
||||
]
|
||||
for key, target in target_by_key.items()
|
||||
}
|
||||
|
||||
@ -589,7 +576,6 @@ def test_find_update_text(
|
||||
assert new_prompt == expected
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
|
||||
[
|
||||
@ -615,8 +601,43 @@ def test_find_update_text(
|
||||
{
|
||||
PromptInsertion: {
|
||||
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
|
||||
1: [1, 9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550], # noqa: E501
|
||||
2: [1, 9833, 28747, 32000, 32000, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550, 1550, 918, 1550], # noqa: E501
|
||||
1: [
|
||||
1,
|
||||
9833,
|
||||
28747,
|
||||
32000,
|
||||
32000,
|
||||
32000,
|
||||
9833,
|
||||
28747,
|
||||
32000,
|
||||
32000,
|
||||
918,
|
||||
1550,
|
||||
918,
|
||||
1550,
|
||||
], # noqa: E501
|
||||
2: [
|
||||
1,
|
||||
9833,
|
||||
28747,
|
||||
32000,
|
||||
32000,
|
||||
32000,
|
||||
32000,
|
||||
32000,
|
||||
9833,
|
||||
28747,
|
||||
32000,
|
||||
32000,
|
||||
918,
|
||||
1550,
|
||||
918,
|
||||
1550,
|
||||
1550,
|
||||
918,
|
||||
1550,
|
||||
], # noqa: E501
|
||||
},
|
||||
PromptReplacement: {
|
||||
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
|
||||
@ -719,9 +740,8 @@ def test_find_update_text(
|
||||
},
|
||||
},
|
||||
),
|
||||
]
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_find_update_tokens(
|
||||
prompt,
|
||||
target_by_key,
|
||||
@ -732,13 +752,15 @@ def test_find_update_tokens(
|
||||
mock_tokenizer = cast(AnyTokenizer, object())
|
||||
|
||||
for (
|
||||
update_type,
|
||||
expected_by_mm_count,
|
||||
update_type,
|
||||
expected_by_mm_count,
|
||||
) in expected_by_update_type_mm_count.items():
|
||||
for mm_count, expected in expected_by_mm_count.items():
|
||||
mm_prompt_updates = {
|
||||
key: [[update_type(key, target, repl_by_key[key]).resolve(i)]
|
||||
for i in range(mm_count)]
|
||||
key: [
|
||||
[update_type(key, target, repl_by_key[key]).resolve(i)]
|
||||
for i in range(mm_count)
|
||||
]
|
||||
for key, target in target_by_key.items()
|
||||
}
|
||||
|
||||
@ -759,7 +781,6 @@ def test_find_update_tokens(
|
||||
assert new_prompt == expected
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
"repl_by_key",
|
||||
[
|
||||
@ -796,8 +817,7 @@ def test_find_update_tokens(
|
||||
is_embed=None,
|
||||
),
|
||||
],
|
||||
}
|
||||
|
||||
},
|
||||
),
|
||||
(
|
||||
[1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
|
||||
@ -828,7 +848,7 @@ def test_find_update_tokens(
|
||||
),
|
||||
],
|
||||
# No match for pattern_4 as it has lower priority than pattern_1
|
||||
}
|
||||
},
|
||||
),
|
||||
(
|
||||
[1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
|
||||
@ -867,12 +887,11 @@ def test_find_update_tokens(
|
||||
is_embed=None,
|
||||
),
|
||||
],
|
||||
}
|
||||
},
|
||||
),
|
||||
]
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
|
||||
# yapf: enable
|
||||
def test_find_mm_placeholders(
|
||||
repl_by_key,
|
||||
prompt,
|
||||
@ -899,8 +918,15 @@ def test_find_mm_placeholders(
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||
@pytest.mark.parametrize(
|
||||
("limit", "num_supported", "is_valid"),
|
||||
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
|
||||
(2, 1, False), (2, 2, True)],
|
||||
[
|
||||
(0, 0, True),
|
||||
(0, 1, True),
|
||||
(1, 0, False),
|
||||
(1, 1, True),
|
||||
(1, 2, True),
|
||||
(2, 1, False),
|
||||
(2, 2, True),
|
||||
],
|
||||
)
|
||||
def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
||||
limit_mm_per_prompt = {"image": limit}
|
||||
@ -930,8 +956,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||
@pytest.mark.parametrize(
|
||||
("num_images", "limit", "is_valid"),
|
||||
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
|
||||
(2, 1, False), (2, 2, True)],
|
||||
[
|
||||
(0, 0, True),
|
||||
(0, 1, True),
|
||||
(1, 0, False),
|
||||
(1, 1, True),
|
||||
(1, 2, True),
|
||||
(2, 1, False),
|
||||
(2, 2, True),
|
||||
],
|
||||
)
|
||||
def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
||||
limit_mm_per_prompt = {"image": limit}
|
||||
@ -966,7 +999,6 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
||||
|
||||
|
||||
class DummyProcessor:
|
||||
|
||||
def __init__(self, a: int = 0, b: int = 0) -> None:
|
||||
super().__init__()
|
||||
|
||||
@ -982,7 +1014,6 @@ class DummyProcessor:
|
||||
return dict(a=a, c=c)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
|
||||
@pytest.mark.parametrize(
|
||||
("config_kwargs", "inference_kwargs", "expected_kwargs"),
|
||||
@ -996,7 +1027,6 @@ class DummyProcessor:
|
||||
({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_hf_processor_init_kwargs(
|
||||
model_id,
|
||||
config_kwargs,
|
||||
@ -1020,7 +1050,6 @@ def test_hf_processor_init_kwargs(
|
||||
assert getattr(processor, k) == v
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
|
||||
@pytest.mark.parametrize(
|
||||
("config_kwargs", "inference_kwargs", "expected_kwargs"),
|
||||
@ -1034,7 +1063,6 @@ def test_hf_processor_init_kwargs(
|
||||
({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_hf_processor_call_kwargs(
|
||||
model_id,
|
||||
config_kwargs,
|
||||
|
||||
@ -233,7 +233,6 @@ async def test_fetch_video_http_with_dynamic_loader(
|
||||
assert metadata_sync["video_backend"] == "opencv_dynamic"
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
"case",
|
||||
[
|
||||
@ -264,7 +263,6 @@ async def test_fetch_video_http_with_dynamic_loader(
|
||||
("image", 0),
|
||||
],
|
||||
),
|
||||
|
||||
# Two modalities
|
||||
## Internally sorted
|
||||
dict(
|
||||
@ -276,7 +274,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
||||
"audio": [
|
||||
PlaceholderRange(offset=0, length=2),
|
||||
PlaceholderRange(offset=2, length=3),
|
||||
]
|
||||
],
|
||||
},
|
||||
expected_modality_idxs=[
|
||||
("audio", 0),
|
||||
@ -295,7 +293,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
||||
"audio": [
|
||||
PlaceholderRange(offset=5, length=2),
|
||||
PlaceholderRange(offset=11, length=4),
|
||||
]
|
||||
],
|
||||
},
|
||||
expected_modality_idxs=[
|
||||
("image", 0),
|
||||
@ -314,7 +312,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
||||
"audio": [
|
||||
PlaceholderRange(offset=11, length=4),
|
||||
PlaceholderRange(offset=5, length=2),
|
||||
]
|
||||
],
|
||||
},
|
||||
expected_modality_idxs=[
|
||||
("image", 1),
|
||||
@ -323,7 +321,6 @@ async def test_fetch_video_http_with_dynamic_loader(
|
||||
("audio", 0),
|
||||
],
|
||||
),
|
||||
|
||||
# Three modalities
|
||||
## Internally sorted
|
||||
dict(
|
||||
@ -339,7 +336,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
||||
PlaceholderRange(offset=3, length=4),
|
||||
PlaceholderRange(offset=7, length=5),
|
||||
PlaceholderRange(offset=12, length=6),
|
||||
]
|
||||
],
|
||||
},
|
||||
expected_modality_idxs=[
|
||||
("audio", 0),
|
||||
@ -363,7 +360,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
||||
],
|
||||
"video": [
|
||||
PlaceholderRange(offset=8, length=5),
|
||||
]
|
||||
],
|
||||
},
|
||||
expected_modality_idxs=[
|
||||
("image", 0),
|
||||
@ -386,7 +383,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
||||
],
|
||||
"video": [
|
||||
PlaceholderRange(offset=8, length=5),
|
||||
]
|
||||
],
|
||||
},
|
||||
expected_modality_idxs=[
|
||||
("image", 0),
|
||||
@ -398,7 +395,6 @@ async def test_fetch_video_http_with_dynamic_loader(
|
||||
),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_argsort_mm_positions(case):
|
||||
mm_positions = case["mm_positions"]
|
||||
expected_modality_idxs = case["expected_modality_idxs"]
|
||||
@ -413,13 +409,16 @@ def test_argsort_mm_positions(case):
|
||||
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
|
||||
async def test_allowed_media_domains(video_url: str, num_frames: int):
|
||||
connector = MediaConnector(
|
||||
media_io_kwargs={"video": {
|
||||
"num_frames": num_frames,
|
||||
}},
|
||||
media_io_kwargs={
|
||||
"video": {
|
||||
"num_frames": num_frames,
|
||||
}
|
||||
},
|
||||
allowed_media_domains=[
|
||||
"www.bogotobogo.com",
|
||||
"github.com",
|
||||
])
|
||||
],
|
||||
)
|
||||
|
||||
video_sync, metadata_sync = connector.fetch_video(video_url)
|
||||
video_async, metadata_async = await connector.fetch_video_async(video_url)
|
||||
|
||||
@ -59,48 +59,52 @@ def test_parse_raw_single_batch_string_slice(inputs_slice: slice):
|
||||
)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize('mm_processor_kwargs,expected_mm_kwargs', [
|
||||
(None, [{}, {}]),
|
||||
({}, [{}, {}]),
|
||||
({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
|
||||
([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
|
||||
])
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize(
|
||||
"mm_processor_kwargs,expected_mm_kwargs",
|
||||
[
|
||||
(None, [{}, {}]),
|
||||
({}, [{}, {}]),
|
||||
({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
|
||||
([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
|
||||
],
|
||||
)
|
||||
def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
|
||||
"""Test mm_processor_kwargs init for zipping enc/dec prompts."""
|
||||
encoder_prompts = ['An encoder prompt', 'Another encoder prompt']
|
||||
decoder_prompts = ['A decoder prompt', 'Another decoder prompt']
|
||||
zipped_prompts = zip_enc_dec_prompts(encoder_prompts, decoder_prompts,
|
||||
mm_processor_kwargs)
|
||||
encoder_prompts = ["An encoder prompt", "Another encoder prompt"]
|
||||
decoder_prompts = ["A decoder prompt", "Another decoder prompt"]
|
||||
zipped_prompts = zip_enc_dec_prompts(
|
||||
encoder_prompts, decoder_prompts, mm_processor_kwargs
|
||||
)
|
||||
assert len(zipped_prompts) == len(encoder_prompts) == len(decoder_prompts)
|
||||
for enc, dec, exp_kwargs, zipped in zip(encoder_prompts, decoder_prompts,
|
||||
expected_mm_kwargs,
|
||||
zipped_prompts):
|
||||
for enc, dec, exp_kwargs, zipped in zip(
|
||||
encoder_prompts, decoder_prompts, expected_mm_kwargs, zipped_prompts
|
||||
):
|
||||
assert isinstance(zipped, dict)
|
||||
assert len(zipped.keys()) == 3
|
||||
assert zipped['encoder_prompt'] == enc
|
||||
assert zipped['decoder_prompt'] == dec
|
||||
assert zipped['mm_processor_kwargs'] == exp_kwargs
|
||||
assert zipped["encoder_prompt"] == enc
|
||||
assert zipped["decoder_prompt"] == dec
|
||||
assert zipped["mm_processor_kwargs"] == exp_kwargs
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", [
|
||||
"facebook/opt-125m",
|
||||
])
|
||||
@pytest.mark.parametrize("prompt", [
|
||||
{
|
||||
"prompt": "",
|
||||
"multi_modal_data": {
|
||||
"dummy": []
|
||||
@pytest.mark.parametrize(
|
||||
"model_id",
|
||||
[
|
||||
"facebook/opt-125m",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"prompt",
|
||||
[
|
||||
{
|
||||
"prompt": "",
|
||||
"multi_modal_data": {"dummy": []},
|
||||
},
|
||||
},
|
||||
{
|
||||
"prompt_token_ids": [],
|
||||
"multi_modal_data": {
|
||||
"dummy": []
|
||||
{
|
||||
"prompt_token_ids": [],
|
||||
"multi_modal_data": {"dummy": []},
|
||||
},
|
||||
},
|
||||
])
|
||||
],
|
||||
)
|
||||
def test_preprocessor_text_no_mm_inputs(model_id, prompt):
|
||||
model_config = ModelConfig(model=model_id)
|
||||
tokenizer = init_tokenizer_from_configs(model_config)
|
||||
@ -110,15 +114,19 @@ def test_preprocessor_text_no_mm_inputs(model_id, prompt):
|
||||
input_preprocessor.preprocess(prompt)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", [
|
||||
"facebook/chameleon-7b",
|
||||
])
|
||||
@pytest.mark.parametrize("prompt", [
|
||||
"",
|
||||
{
|
||||
"prompt_token_ids": []
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"model_id",
|
||||
[
|
||||
"facebook/chameleon-7b",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"prompt",
|
||||
[
|
||||
"",
|
||||
{"prompt_token_ids": []},
|
||||
],
|
||||
)
|
||||
def test_preprocessor_always_mm_code_path(model_id, prompt):
|
||||
model_config = ModelConfig(model=model_id)
|
||||
tokenizer = init_tokenizer_from_configs(model_config)
|
||||
|
||||
@ -9,14 +9,10 @@ import pytest
|
||||
import torch
|
||||
import torch_xla
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
|
||||
from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
|
||||
fused_moe as torch_moe,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if not current_platform.is_tpu():
|
||||
|
||||
@ -388,7 +388,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
|
||||
assert "-O.level" in caplog_vllm.text
|
||||
|
||||
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize(
|
||||
"callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
|
||||
[
|
||||
@ -408,7 +407,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
|
||||
(lambda foo, **kwargs: None, "foo", True, True, False),
|
||||
],
|
||||
)
|
||||
# yapf: disable
|
||||
def test_supports_kw(
|
||||
callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
|
||||
):
|
||||
@ -681,7 +679,6 @@ def test_lru_cache():
|
||||
assert 6 in cache
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("src_dtype", "tgt_dtype", "expected_result"),
|
||||
[
|
||||
@ -715,12 +712,10 @@ def test_lru_cache():
|
||||
(torch.complex64, torch.complex32, False),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
|
||||
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("dtypes", "expected_result"),
|
||||
[
|
||||
@ -730,7 +725,6 @@ def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
|
||||
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_common_broadcastable_dtype(dtypes, expected_result):
|
||||
assert common_broadcastable_dtype(dtypes) == expected_result
|
||||
|
||||
@ -775,7 +769,6 @@ def test_placeholder_module_error_handling():
|
||||
_ = placeholder_attr.module
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
"obj,key1,key2",
|
||||
[
|
||||
@ -785,8 +778,8 @@ def test_placeholder_module_error_handling():
|
||||
({1: "a", 2: "b"}, 1, 3),
|
||||
# Tests for both keys do not exist
|
||||
({1: "a", 2: "b"}, 3, 4),
|
||||
])
|
||||
# yapf: enable
|
||||
],
|
||||
)
|
||||
def test_swap_dict_values(obj, key1, key2):
|
||||
original_obj = obj.copy()
|
||||
swap_dict_values(obj, key1, key2)
|
||||
@ -800,26 +793,30 @@ def test_swap_dict_values(obj, key1, key2):
|
||||
assert key1 not in obj
|
||||
|
||||
|
||||
def test_model_specification(parser_with_config, cli_config_file,
|
||||
cli_config_file_with_model):
|
||||
def test_model_specification(
|
||||
parser_with_config, cli_config_file, cli_config_file_with_model
|
||||
):
|
||||
# Test model in CLI takes precedence over config
|
||||
args = parser_with_config.parse_args(
|
||||
['serve', 'cli-model', '--config', cli_config_file_with_model])
|
||||
assert args.model_tag == 'cli-model'
|
||||
assert args.served_model_name == 'mymodel'
|
||||
["serve", "cli-model", "--config", cli_config_file_with_model]
|
||||
)
|
||||
assert args.model_tag == "cli-model"
|
||||
assert args.served_model_name == "mymodel"
|
||||
|
||||
# Test model from config file works
|
||||
args = parser_with_config.parse_args([
|
||||
'serve',
|
||||
'--config',
|
||||
cli_config_file_with_model,
|
||||
])
|
||||
assert args.model == 'config-model'
|
||||
assert args.served_model_name == 'mymodel'
|
||||
args = parser_with_config.parse_args(
|
||||
[
|
||||
"serve",
|
||||
"--config",
|
||||
cli_config_file_with_model,
|
||||
]
|
||||
)
|
||||
assert args.model == "config-model"
|
||||
assert args.served_model_name == "mymodel"
|
||||
|
||||
# Test no model specified anywhere raises error
|
||||
with pytest.raises(ValueError, match="No model specified!"):
|
||||
parser_with_config.parse_args(['serve', '--config', cli_config_file])
|
||||
parser_with_config.parse_args(["serve", "--config", cli_config_file])
|
||||
|
||||
# Test using --model option raises error
|
||||
# with pytest.raises(
|
||||
@ -833,47 +830,52 @@ def test_model_specification(parser_with_config, cli_config_file,
|
||||
# Test using --model option back-compatibility
|
||||
# (when back-compatibility ends, the above test should be uncommented
|
||||
# and the below test should be removed)
|
||||
args = parser_with_config.parse_args([
|
||||
'serve',
|
||||
'--tensor-parallel-size',
|
||||
'2',
|
||||
'--model',
|
||||
'my-model',
|
||||
'--trust-remote-code',
|
||||
'--port',
|
||||
'8001',
|
||||
])
|
||||
args = parser_with_config.parse_args(
|
||||
[
|
||||
"serve",
|
||||
"--tensor-parallel-size",
|
||||
"2",
|
||||
"--model",
|
||||
"my-model",
|
||||
"--trust-remote-code",
|
||||
"--port",
|
||||
"8001",
|
||||
]
|
||||
)
|
||||
assert args.model is None
|
||||
assert args.tensor_parallel_size == 2
|
||||
assert args.trust_remote_code is True
|
||||
assert args.port == 8001
|
||||
|
||||
args = parser_with_config.parse_args([
|
||||
'serve',
|
||||
'--tensor-parallel-size=2',
|
||||
'--model=my-model',
|
||||
'--trust-remote-code',
|
||||
'--port=8001',
|
||||
])
|
||||
args = parser_with_config.parse_args(
|
||||
[
|
||||
"serve",
|
||||
"--tensor-parallel-size=2",
|
||||
"--model=my-model",
|
||||
"--trust-remote-code",
|
||||
"--port=8001",
|
||||
]
|
||||
)
|
||||
assert args.model is None
|
||||
assert args.tensor_parallel_size == 2
|
||||
assert args.trust_remote_code is True
|
||||
assert args.port == 8001
|
||||
|
||||
# Test other config values are preserved
|
||||
args = parser_with_config.parse_args([
|
||||
'serve',
|
||||
'cli-model',
|
||||
'--config',
|
||||
cli_config_file_with_model,
|
||||
])
|
||||
args = parser_with_config.parse_args(
|
||||
[
|
||||
"serve",
|
||||
"cli-model",
|
||||
"--config",
|
||||
cli_config_file_with_model,
|
||||
]
|
||||
)
|
||||
assert args.tensor_parallel_size == 2
|
||||
assert args.trust_remote_code is True
|
||||
assert args.port == 12312
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
|
||||
(None, bool, [1, 2, 3])])
|
||||
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
|
||||
def test_sha256(input: tuple):
|
||||
digest = sha256(input)
|
||||
assert digest is not None
|
||||
@ -887,7 +889,7 @@ def test_sha256(input: tuple):
|
||||
assert digest == sha256(input)
|
||||
|
||||
# hashing different input, returns different value
|
||||
assert digest != sha256(input + (1, ))
|
||||
assert digest != sha256(input + (1,))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -897,7 +899,8 @@ def test_sha256(input: tuple):
|
||||
("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
|
||||
("tcp://[::1]:5555", ("tcp", "::1", "5555")), # IPv6 address
|
||||
("inproc://some_identifier", ("inproc", "some_identifier", "")),
|
||||
])
|
||||
],
|
||||
)
|
||||
def test_split_zmq_path(path, expected):
|
||||
assert split_zmq_path(path) == expected
|
||||
|
||||
@ -909,7 +912,8 @@ def test_split_zmq_path(path, expected):
|
||||
"tcp://127.0.0.1", # Missing port
|
||||
"tcp://[::1]", # Missing port for IPv6
|
||||
"tcp://:5555", # Missing host
|
||||
])
|
||||
],
|
||||
)
|
||||
def test_split_zmq_path_invalid(invalid_path):
|
||||
with pytest.raises(ValueError):
|
||||
split_zmq_path(invalid_path)
|
||||
@ -931,8 +935,9 @@ def test_make_zmq_socket_ipv6():
|
||||
zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
|
||||
|
||||
# Verify that the IPV6 option is set
|
||||
assert zsock.getsockopt(
|
||||
zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
|
||||
assert zsock.getsockopt(zmq.IPV6) == 1, (
|
||||
"IPV6 option should be enabled for IPv6 addresses"
|
||||
)
|
||||
|
||||
# Clean up
|
||||
zsock.close()
|
||||
@ -1019,15 +1024,14 @@ def test_convert_ids_list_to_tokens():
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
|
||||
token_ids = tokenizer.encode("Hello, world!")
|
||||
# token_ids = [9707, 11, 1879, 0]
|
||||
assert tokenizer.convert_ids_to_tokens(token_ids) == [
|
||||
'Hello', ',', 'Ġworld', '!'
|
||||
]
|
||||
assert tokenizer.convert_ids_to_tokens(token_ids) == ["Hello", ",", "Ġworld", "!"]
|
||||
tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
|
||||
assert tokens == ['Hello', ',', ' world', '!']
|
||||
assert tokens == ["Hello", ",", " world", "!"]
|
||||
|
||||
|
||||
def test_current_stream_multithread():
|
||||
import threading
|
||||
|
||||
if not torch.cuda.is_available():
|
||||
pytest.skip("CUDA not available")
|
||||
|
||||
@ -1046,13 +1050,18 @@ def test_current_stream_multithread():
|
||||
child_thread.start()
|
||||
|
||||
try:
|
||||
assert thread_stream_ready.wait(
|
||||
timeout=5), "Child thread failed to enter stream context in time"
|
||||
assert thread_stream_ready.wait(timeout=5), (
|
||||
"Child thread failed to enter stream context in time"
|
||||
)
|
||||
|
||||
main_current_stream = current_stream()
|
||||
|
||||
assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread"
|
||||
assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream"
|
||||
assert main_current_stream != child_stream, (
|
||||
"Main thread's current_stream was contaminated by child thread"
|
||||
)
|
||||
assert main_current_stream == main_default_stream, (
|
||||
"Main thread's current_stream is not the default stream"
|
||||
)
|
||||
|
||||
# Notify child thread it can exit
|
||||
thread_can_exit.set()
|
||||
@ -1070,7 +1079,7 @@ def test_load_config_file(tmp_path):
|
||||
"enable-logging": True,
|
||||
"list-arg": ["item1", "item2"],
|
||||
"port": 12323,
|
||||
"tensor-parallel-size": 4
|
||||
"tensor-parallel-size": 4,
|
||||
}
|
||||
|
||||
# Write the configuration data to a temporary YAML file
|
||||
|
||||
@ -16,9 +16,6 @@ from vllm.multimodal.inputs import (
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import GiB_bytes, sha256, sha256_cbor
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheManager
|
||||
|
||||
# disable yapf here as it formats differently than isort such that both fail
|
||||
# yapf: disable
|
||||
from vllm.v1.core.kv_cache_utils import (
|
||||
BlockHash,
|
||||
FreeKVCacheBlockQueue,
|
||||
@ -48,8 +45,6 @@ from vllm.v1.kv_cache_interface import (
|
||||
from vllm.v1.metrics.stats import PrefixCacheStats
|
||||
from vllm.v1.request import Request
|
||||
|
||||
# yapf: enable
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
|
||||
@ -22,8 +22,6 @@ from vllm.config import VllmConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import is_pin_memory_available
|
||||
|
||||
# yapf: disable
|
||||
from vllm.v1.sample.logits_processor import (
|
||||
BatchUpdate,
|
||||
BatchUpdateBuilder,
|
||||
@ -34,8 +32,6 @@ from vllm.v1.sample.logits_processor import (
|
||||
MoveDirectionality,
|
||||
build_logitsprocs,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
|
||||
PIN_MEMORY_AVAILABLE = is_pin_memory_available()
|
||||
|
||||
@ -7,8 +7,6 @@ from typing import Union
|
||||
import pytest
|
||||
|
||||
from tests.utils import create_new_process_for_each_test
|
||||
|
||||
# yapf: disable
|
||||
from tests.v1.logits_processors.utils import (
|
||||
DUMMY_LOGITPROC_ARG,
|
||||
DUMMY_LOGITPROC_FQCN,
|
||||
@ -24,8 +22,6 @@ from tests.v1.logits_processors.utils import (
|
||||
prompts,
|
||||
)
|
||||
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
|
||||
|
||||
# yapf: enable
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.v1.sample.logits_processor import (
|
||||
STR_POOLING_REJECTS_LOGITSPROCS,
|
||||
|
||||
@ -11,8 +11,6 @@ import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_test
|
||||
|
||||
# yapf: disable
|
||||
from tests.v1.logits_processors.utils import (
|
||||
DUMMY_LOGITPROC_ARG,
|
||||
DUMMY_LOGITPROC_FQCN,
|
||||
@ -25,8 +23,6 @@ from tests.v1.logits_processors.utils import (
|
||||
)
|
||||
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
|
||||
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def _server_with_logitproc_entrypoint(
|
||||
env_dict: Optional[dict[str, str]],
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
import importlib
|
||||
from typing import TYPE_CHECKING, Callable
|
||||
|
||||
# yapf: disable
|
||||
import vllm.envs as envs
|
||||
from vllm.distributed.kv_transfer.kv_connector.base import (
|
||||
KVConnectorBase,
|
||||
@ -13,8 +12,6 @@ from vllm.distributed.kv_transfer.kv_connector.base import (
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
|
||||
from vllm.logger import init_logger
|
||||
|
||||
# yapf: enable
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.kv_transfer import KVTransferConfig
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
import argparse
|
||||
import copy
|
||||
import dataclasses
|
||||
@ -88,8 +87,6 @@ from vllm.transformers_utils.utils import check_gguf_file
|
||||
from vllm.utils import FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor
|
||||
from vllm.v1.sample.logits_processor import LogitsProcessor
|
||||
|
||||
# yapf: enable
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||
|
||||
@ -17,9 +17,6 @@ import jinja2.nodes
|
||||
import jinja2.parser
|
||||
import jinja2.sandbox
|
||||
import transformers.utils.chat_template_utils as hf_chat_utils
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from openai.types.chat import (
|
||||
ChatCompletionAssistantMessageParam,
|
||||
ChatCompletionContentPartImageParam,
|
||||
@ -40,8 +37,6 @@ from openai.types.responses import ResponseInputImageParam
|
||||
from openai_harmony import Message as OpenAIHarmonyMessage
|
||||
from PIL import Image
|
||||
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
||||
|
||||
# yapf: enable
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin
|
||||
|
||||
# pydantic needs the TypedDict from typing_extensions
|
||||
@ -52,11 +47,7 @@ from vllm.logger import init_logger
|
||||
from vllm.model_executor.models import SupportsMultiModal
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
|
||||
from vllm.multimodal.utils import MediaConnector
|
||||
|
||||
# yapf: disable
|
||||
from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
|
||||
|
||||
# yapf: enable
|
||||
from vllm.transformers_utils.processor import cached_get_processor
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
||||
from vllm.utils import random_uuid, supports_kw
|
||||
@ -317,11 +308,7 @@ def _is_var_or_elems_access(
|
||||
):
|
||||
return _is_var_or_elems_access(node.node, varname, key)
|
||||
|
||||
# yapf: disable
|
||||
return (
|
||||
_is_attr_access(node, varname, key) if key
|
||||
else _is_var_access(node, varname)
|
||||
) # yapf: enable
|
||||
return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
|
||||
|
||||
|
||||
def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):
|
||||
|
||||
@ -39,9 +39,6 @@ from vllm.entrypoints.chat_utils import (
|
||||
parse_chat_messages,
|
||||
resolve_chat_template_content_format,
|
||||
)
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.entrypoints.score_utils import (
|
||||
ScoreContentPartParam,
|
||||
ScoreMultiModalParam,
|
||||
@ -50,8 +47,6 @@ from vllm.entrypoints.score_utils import (
|
||||
compress_token_type_ids,
|
||||
get_score_prompt,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.utils import _validate_truncation_size, log_non_default_args
|
||||
from vllm.inputs import (
|
||||
DataPrompt,
|
||||
|
||||
@ -49,9 +49,6 @@ from vllm.entrypoints.chat_utils import (
|
||||
from vllm.entrypoints.launcher import serve_http
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
@ -84,8 +81,6 @@ from vllm.entrypoints.openai.protocol import (
|
||||
TranslationResponse,
|
||||
UnloadLoRAAdapterRequest,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_classification import ServingClassification
|
||||
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||
|
||||
@ -11,8 +11,6 @@ from typing import Annotated, Any, ClassVar, Generic, Literal, Optional, TypeVar
|
||||
import regex as re
|
||||
import torch
|
||||
from fastapi import HTTPException, UploadFile
|
||||
|
||||
# yapf: disable
|
||||
from openai.types.chat.chat_completion_audio import (
|
||||
ChatCompletionAudio as OpenAIChatCompletionAudio,
|
||||
)
|
||||
@ -46,8 +44,6 @@ from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreated
|
||||
from openai.types.responses import (
|
||||
ResponseInProgressEvent as OpenAIResponseInProgressEvent,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from openai.types.responses.response_reasoning_item import (
|
||||
Content as ResponseReasoningTextContent,
|
||||
)
|
||||
|
||||
@ -18,8 +18,6 @@ from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
|
||||
# yapf: disable
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
BatchRequestInput,
|
||||
BatchRequestOutput,
|
||||
@ -30,8 +28,6 @@ from vllm.entrypoints.openai.protocol import (
|
||||
RerankResponse,
|
||||
ScoreResponse,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
|
||||
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||
|
||||
@ -1733,13 +1733,15 @@ class OpenAIServingChat(OpenAIServing):
|
||||
is a tool call with arguments.
|
||||
"""
|
||||
|
||||
# yapf: disable
|
||||
return bool(
|
||||
# if there is a delta message that includes tool calls which
|
||||
# include a function that has arguments
|
||||
output.finish_reason is not None
|
||||
and self.enable_auto_tools and self.tool_parser and delta_message
|
||||
and delta_message.tool_calls and delta_message.tool_calls[0]
|
||||
and self.enable_auto_tools
|
||||
and self.tool_parser
|
||||
and delta_message
|
||||
and delta_message.tool_calls
|
||||
and delta_message.tool_calls[0]
|
||||
and delta_message.tool_calls[0].function
|
||||
and delta_message.tool_calls[0].function.arguments is not None
|
||||
)
|
||||
|
||||
@ -18,8 +18,6 @@ from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
UsageInfo,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_engine import (
|
||||
ClassificationServeContext,
|
||||
OpenAIServing,
|
||||
|
||||
@ -13,9 +13,6 @@ from fastapi import Request
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
CompletionLogProbs,
|
||||
CompletionRequest,
|
||||
@ -29,8 +26,6 @@ from vllm.entrypoints.openai.protocol import (
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
|
||||
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.renderer import RenderConfig
|
||||
from vllm.entrypoints.utils import get_max_tokens
|
||||
|
||||
@ -14,9 +14,6 @@ from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
|
||||
# yapf conflicts with isort for this docstring
|
||||
# yapf: disable
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
EmbeddingChatRequest,
|
||||
EmbeddingCompletionRequest,
|
||||
@ -32,8 +29,6 @@ from vllm.entrypoints.openai.serving_engine import (
|
||||
ServeContext,
|
||||
TextTokensPrompt,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.renderer import RenderConfig
|
||||
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
|
||||
|
||||
@ -28,9 +28,6 @@ else:
|
||||
import vllm.envs as envs
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
ChatCompletionMessageParam,
|
||||
ChatTemplateContentFormatOption,
|
||||
@ -72,8 +69,6 @@ from vllm.entrypoints.openai.protocol import (
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.openai.tool_parsers import ToolParser
|
||||
from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
|
||||
|
||||
# yapf: enable
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
|
||||
from vllm.inputs.parse import PromptComponents, get_prompt_components
|
||||
|
||||
@ -17,8 +17,6 @@ from vllm.config import VllmConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
|
||||
# yapf: disable
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
IOProcessorRequest,
|
||||
@ -30,8 +28,6 @@ from vllm.entrypoints.openai.protocol import (
|
||||
PoolingResponseData,
|
||||
UsageInfo,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.renderer import RenderConfig
|
||||
|
||||
@ -14,9 +14,6 @@ from typing import Callable, Final, Optional, Union
|
||||
|
||||
import jinja2
|
||||
from fastapi import Request
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from openai.types.responses import (
|
||||
ResponseCodeInterpreterCallCodeDeltaEvent,
|
||||
ResponseCodeInterpreterCallCodeDoneEvent,
|
||||
@ -46,8 +43,6 @@ from openai.types.responses import (
|
||||
response_text_delta_event,
|
||||
)
|
||||
from openai.types.responses.response_output_text import Logprob, LogprobTopLogprob
|
||||
|
||||
# yapf: enable
|
||||
from openai.types.responses.response_reasoning_item import (
|
||||
Content as ResponseReasoningTextContent,
|
||||
)
|
||||
@ -78,9 +73,6 @@ from vllm.entrypoints.harmony_utils import (
|
||||
render_for_completion,
|
||||
)
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
DeltaMessage,
|
||||
ErrorResponse,
|
||||
@ -97,8 +89,6 @@ from vllm.entrypoints.openai.protocol import (
|
||||
ResponseUsage,
|
||||
StreamingResponsesResponse,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.tool_server import ToolServer
|
||||
|
||||
@ -24,9 +24,6 @@ from vllm.entrypoints.openai.protocol import (
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.entrypoints.score_utils import (
|
||||
ScoreContentPartParam,
|
||||
ScoreMultiModalParam,
|
||||
@ -35,8 +32,6 @@ from vllm.entrypoints.score_utils import (
|
||||
compress_token_type_ids,
|
||||
get_score_prompt,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.utils import _validate_truncation_size
|
||||
from vllm.inputs.data import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@ -10,9 +10,6 @@ from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
DetokenizeRequest,
|
||||
DetokenizeResponse,
|
||||
@ -22,8 +19,6 @@ from vllm.entrypoints.openai.protocol import (
|
||||
TokenizeResponse,
|
||||
TokenizerInfoResponse,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.renderer import RenderConfig
|
||||
|
||||
@ -11,7 +11,7 @@ import cloudpickle
|
||||
import msgspec
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.executor.executor_base import DistributedExecutorBase # yapf: disable
|
||||
from vllm.executor.executor_base import DistributedExecutorBase
|
||||
from vllm.executor.msgspec_utils import encode_hook
|
||||
from vllm.executor.ray_utils import RayWorkerWrapper, initialize_ray_cluster, ray
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@ -8,8 +8,6 @@ from transformers import PretrainedConfig
|
||||
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.distributed.utils import divide
|
||||
|
||||
# yapf: disable
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
LinearBase,
|
||||
@ -23,7 +21,6 @@ from .utils import _get_lora_device
|
||||
|
||||
|
||||
class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
|
||||
def __init__(self, base_layer: LinearBase):
|
||||
super().__init__()
|
||||
self.base_layer = base_layer
|
||||
@ -50,16 +47,20 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
lora_b_out_size = self.output_size
|
||||
|
||||
elif isinstance(self.base_layer, ColumnParallelLinear):
|
||||
lora_a_out_size = (lora_config.max_lora_rank if
|
||||
not lora_config.fully_sharded_loras else divide(
|
||||
lora_config.max_lora_rank, self.tp_size))
|
||||
lora_a_out_size = (
|
||||
lora_config.max_lora_rank
|
||||
if not lora_config.fully_sharded_loras
|
||||
else divide(lora_config.max_lora_rank, self.tp_size)
|
||||
)
|
||||
lora_b_out_size = self.output_size
|
||||
|
||||
elif isinstance(self.base_layer, RowParallelLinear):
|
||||
lora_a_out_size = lora_config.max_lora_rank
|
||||
lora_b_out_size = (self.output_size if
|
||||
not lora_config.fully_sharded_loras else divide(
|
||||
self.output_size, self.tp_size))
|
||||
lora_b_out_size = (
|
||||
self.output_size
|
||||
if not lora_config.fully_sharded_loras
|
||||
else divide(self.output_size, self.tp_size)
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
@ -71,7 +72,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
self.input_size,
|
||||
dtype=lora_config.lora_dtype,
|
||||
device=self.device,
|
||||
) for _ in range(self.n_slices))
|
||||
)
|
||||
for _ in range(self.n_slices)
|
||||
)
|
||||
self.lora_b_stacked = tuple(
|
||||
torch.zeros(
|
||||
max_loras,
|
||||
@ -80,7 +83,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
lora_config.max_lora_rank,
|
||||
dtype=lora_config.lora_dtype,
|
||||
device=self.device,
|
||||
) for _ in range(self.n_slices))
|
||||
)
|
||||
for _ in range(self.n_slices)
|
||||
)
|
||||
if lora_config.bias_enabled:
|
||||
lora_bias_out_size = lora_b_out_size
|
||||
self.lora_bias_stacked = tuple(
|
||||
@ -90,8 +95,10 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
lora_bias_out_size,
|
||||
dtype=lora_config.lora_dtype,
|
||||
device=self.device,
|
||||
) for _ in range(self.n_slices))
|
||||
self.output_slices = (self.lora_b_stacked[0].shape[2], )
|
||||
)
|
||||
for _ in range(self.n_slices)
|
||||
)
|
||||
self.output_slices = (self.lora_b_stacked[0].shape[2],)
|
||||
|
||||
def reset_lora(self, index: int):
|
||||
for s_index in range(self.n_slices):
|
||||
@ -99,8 +106,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
self.lora_b_stacked[s_index][index] = 0
|
||||
if self.lora_config.bias_enabled:
|
||||
# Make mypy happy
|
||||
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
|
||||
self.lora_bias_stacked)
|
||||
self.lora_bias_stacked = cast(
|
||||
tuple[torch.Tensor, ...], self.lora_bias_stacked
|
||||
)
|
||||
self.lora_bias_stacked[s_index][index] = 0
|
||||
|
||||
def set_lora(
|
||||
@ -115,8 +123,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
# MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
|
||||
# store weights in a tuple of size 1. These two layers will
|
||||
# override this function.
|
||||
assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) ==
|
||||
self.n_slices == 1)
|
||||
assert (
|
||||
len(self.lora_a_stacked) == len(self.lora_b_stacked) == self.n_slices == 1
|
||||
)
|
||||
|
||||
self.reset_lora(index)
|
||||
if self.tp_size > 1:
|
||||
@ -125,23 +134,24 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
if lora_bias is not None:
|
||||
lora_bias = self.slice_bias(lora_bias)
|
||||
|
||||
self.lora_a_stacked[0][index,
|
||||
0, :lora_a.shape[0], :lora_a.shape[1]].copy_(
|
||||
lora_a, non_blocking=True)
|
||||
self.lora_b_stacked[0][index,
|
||||
0, :lora_b.shape[0], :lora_b.shape[1]].copy_(
|
||||
lora_b, non_blocking=True)
|
||||
self.lora_a_stacked[0][index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
|
||||
lora_a, non_blocking=True
|
||||
)
|
||||
self.lora_b_stacked[0][index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
|
||||
lora_b, non_blocking=True
|
||||
)
|
||||
if lora_bias is not None:
|
||||
|
||||
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
|
||||
self.lora_bias_stacked)
|
||||
self.lora_bias_stacked = cast(
|
||||
tuple[torch.Tensor, ...], self.lora_bias_stacked
|
||||
)
|
||||
assert len(self.lora_bias_stacked)
|
||||
self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_(
|
||||
lora_bias, non_blocking=True)
|
||||
self.lora_bias_stacked[0][index, 0, : lora_bias.shape[0]].copy_(
|
||||
lora_bias, non_blocking=True
|
||||
)
|
||||
|
||||
def apply(self,
|
||||
x: torch.Tensor,
|
||||
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||
def apply(
|
||||
self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
|
||||
) -> torch.Tensor:
|
||||
output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
|
||||
|
||||
# In transformers backend, x and output have extra batch dimension like
|
||||
@ -151,10 +161,15 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
output = output.flatten(0, 1)
|
||||
x = x.flatten(0, 1)
|
||||
|
||||
lora_output: Optional[
|
||||
torch.Tensor] = self.punica_wrapper.add_lora_linear(
|
||||
output, x, self.lora_a_stacked, self.lora_b_stacked,
|
||||
self.lora_bias_stacked, 1.0, self.output_slices)
|
||||
lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_lora_linear(
|
||||
output,
|
||||
x,
|
||||
self.lora_a_stacked,
|
||||
self.lora_b_stacked,
|
||||
self.lora_bias_stacked,
|
||||
1.0,
|
||||
self.output_slices,
|
||||
)
|
||||
if not current_platform.can_update_inplace():
|
||||
output = lora_output
|
||||
|
||||
@ -162,7 +177,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
|
||||
@property
|
||||
def weight(self) -> torch.Tensor:
|
||||
|
||||
# unquantizedLinear
|
||||
if hasattr(self.base_layer, "weight"):
|
||||
return self.base_layer.weight
|
||||
|
||||
@ -12,8 +12,6 @@ from vllm.distributed import (
|
||||
split_tensor_along_last_dim,
|
||||
tensor_model_parallel_all_reduce,
|
||||
)
|
||||
|
||||
# yapf: disable
|
||||
from vllm.model_executor.layers.linear import RowParallelLinear
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@ -22,7 +20,6 @@ from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
|
||||
|
||||
|
||||
class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
||||
|
||||
def __init__(self, base_layer: RowParallelLinear) -> None:
|
||||
super().__init__(base_layer)
|
||||
|
||||
@ -33,11 +30,10 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
||||
self.n_slices = 1
|
||||
|
||||
def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
|
||||
|
||||
shard_size = self.input_size
|
||||
start_idx = self.tp_rank * shard_size
|
||||
end_idx = (self.tp_rank + 1) * shard_size
|
||||
lora_a = lora_a[:,start_idx:end_idx]
|
||||
lora_a = lora_a[:, start_idx:end_idx]
|
||||
return lora_a
|
||||
|
||||
def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
|
||||
@ -66,7 +62,8 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
||||
else:
|
||||
# TODO: simplify code below
|
||||
splitted_input = split_tensor_along_last_dim(
|
||||
input_, num_partitions=self.tp_size)
|
||||
input_, num_partitions=self.tp_size
|
||||
)
|
||||
input_parallel = splitted_input[self.tp_rank].contiguous()
|
||||
|
||||
# Matrix multiply.
|
||||
@ -77,8 +74,11 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
||||
output_ = output_parallel
|
||||
|
||||
if not self.base_layer.skip_bias_add:
|
||||
output = (output_ + self.base_layer.bias
|
||||
if self.base_layer.bias is not None else output_)
|
||||
output = (
|
||||
output_ + self.base_layer.bias
|
||||
if self.base_layer.bias is not None
|
||||
else output_
|
||||
)
|
||||
output_bias = None
|
||||
else:
|
||||
output = output_
|
||||
@ -101,11 +101,11 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
||||
return type(source_layer) is RowParallelLinear
|
||||
|
||||
|
||||
|
||||
# The following layer is based on the tensor parallelism strategy given in
|
||||
# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
|
||||
# https://arxiv.org/abs/2311.03285.
|
||||
|
||||
|
||||
class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
|
||||
"""
|
||||
Differs from RowParallelLinearWithLoRA by slicing the
|
||||
@ -120,28 +120,26 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
|
||||
shard_size = self.lora_b_stacked[0].shape[2]
|
||||
start_idx = self.tp_rank * shard_size
|
||||
end_idx = (self.tp_rank + 1) * shard_size
|
||||
lora_b = lora_b[ start_idx:end_idx,:]
|
||||
lora_b = lora_b[start_idx:end_idx, :]
|
||||
return lora_b
|
||||
|
||||
def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
|
||||
if bias is None:
|
||||
return bias
|
||||
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
|
||||
self.lora_bias_stacked)
|
||||
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked)
|
||||
shard_size = self.lora_bias_stacked[0].shape[2]
|
||||
start_idx = self.tp_rank * shard_size
|
||||
end_idx = (self.tp_rank + 1) * shard_size
|
||||
bias = bias[start_idx:end_idx]
|
||||
return bias
|
||||
|
||||
def apply(self,
|
||||
x: torch.Tensor,
|
||||
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||
def apply(
|
||||
self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
|
||||
) -> torch.Tensor:
|
||||
output = self.base_layer.quant_method.apply(self.base_layer, x)
|
||||
|
||||
x = x.view(-1, x.shape[-1])
|
||||
output, out_orig_shape = output.view(-1,
|
||||
output.shape[-1]), output.shape
|
||||
output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
|
||||
buffer = torch.zeros(
|
||||
(self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
|
||||
dtype=torch.float32,
|
||||
@ -149,10 +147,11 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
|
||||
)
|
||||
|
||||
shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink(
|
||||
buffer, x, self.lora_a_stacked, 1.0)
|
||||
buffer, x, self.lora_a_stacked, 1.0
|
||||
)
|
||||
if not current_platform.can_update_inplace():
|
||||
buffer = shrunk_buffer
|
||||
if self.tp_size>1:
|
||||
if self.tp_size > 1:
|
||||
buffer = tensor_model_parallel_all_reduce(buffer)
|
||||
|
||||
# following S-LoRA, allows the fusing of all_gather and all_reduce
|
||||
|
||||
@ -19,8 +19,6 @@ from vllm.config.lora import LoRAConfig
|
||||
from vllm.logger import init_logger
|
||||
|
||||
# being imported for _all_lora_classes below
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.lora.layers import (
|
||||
BaseLayerWithLoRA,
|
||||
ColumnParallelLinearWithLoRA,
|
||||
@ -39,8 +37,6 @@ from vllm.lora.layers import (
|
||||
)
|
||||
from vllm.model_executor.layers.linear import LinearBase
|
||||
|
||||
# yapf: enable
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
|
||||
@ -14,8 +14,6 @@ import vllm.envs as envs
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.logger import init_logger
|
||||
|
||||
# yapf: disable
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FUSED_MOE_UNQUANTIZED_CONFIG,
|
||||
FusedMoEQuantConfig,
|
||||
@ -25,8 +23,6 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import (
|
||||
_valid_cutlass_block_scaled_grouped_gemm,
|
||||
run_cutlass_block_scaled_fused_experts,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
|
||||
_valid_deep_gemm,
|
||||
deep_gemm_moe_fp8,
|
||||
|
||||
@ -24,8 +24,6 @@ from vllm.distributed.eplb.eplb_state import EplbState
|
||||
from vllm.forward_context import ForwardContext, get_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
|
||||
# yapf: disable
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FUSED_MOE_UNQUANTIZED_CONFIG,
|
||||
FusedMoEConfig,
|
||||
@ -34,8 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import (
|
||||
biased_moe_quant_config,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
|
||||
|
||||
# yapf: enable
|
||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
FusedMoEActivationFormat,
|
||||
FusedMoEModularKernel,
|
||||
|
||||
@ -10,7 +10,7 @@ import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
||||
from vllm.model_executor.layers.fused_moe.utils import ( # yapf: disable
|
||||
from vllm.model_executor.layers.fused_moe.utils import (
|
||||
_resize_cache,
|
||||
count_expert_num_tokens,
|
||||
)
|
||||
|
||||
@ -24,8 +24,6 @@ from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizeMethodBase,
|
||||
)
|
||||
from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
|
||||
|
||||
# yapf: disable
|
||||
from vllm.model_executor.parameter import (
|
||||
BasevLLMParameter,
|
||||
BlockQuantScaleParameter,
|
||||
@ -35,8 +33,6 @@ from vllm.model_executor.parameter import (
|
||||
PerTensorScaleParameter,
|
||||
RowvLLMParameter,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.model_executor.utils import set_weight_attrs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import GiB_bytes
|
||||
|
||||
@ -17,17 +17,12 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||
marlin_repeat_scales_on_all_ranks,
|
||||
)
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.model_executor.parameter import (
|
||||
BasevLLMParameter,
|
||||
ChannelQuantScaleParameter,
|
||||
GroupQuantScaleParameter,
|
||||
PackedvLLMParameter,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.scalar_type import scalar_types
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -17,9 +17,6 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||
marlin_repeat_scales_on_all_ranks,
|
||||
)
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.model_executor.parameter import (
|
||||
BasevLLMParameter,
|
||||
ChannelQuantScaleParameter,
|
||||
@ -28,8 +25,6 @@ from vllm.model_executor.parameter import (
|
||||
PackedvLLMParameter,
|
||||
RowvLLMParameter,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.scalar_type import scalar_types
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -22,8 +22,6 @@ from vllm.distributed import (
|
||||
get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.linear import (
|
||||
@ -51,8 +49,6 @@ from vllm.model_executor.utils import (
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
|
||||
@ -39,13 +39,10 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
# yapf: disable
|
||||
from .idefics2_vision_model import Idefics2VisionConfig
|
||||
from .idefics2_vision_model import (
|
||||
Idefics2VisionTransformer as Idefics3VisionTransformer,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant
|
||||
from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
|
||||
from .utils import (
|
||||
|
||||
@ -22,8 +22,6 @@ from vllm.multimodal.inputs import (
|
||||
MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
|
||||
|
||||
# yapf: disable
|
||||
from vllm.multimodal.processing import (
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
@ -35,8 +33,6 @@ from vllm.multimodal.processing import (
|
||||
PromptUpdateDetails,
|
||||
replace_token_matches,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
@ -6,14 +6,16 @@ from typing import Annotated, Any, Literal, Optional, Union, cast
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
# yapf: disable
|
||||
from torch import nn
|
||||
from transformers import AutoModel, BatchFeature
|
||||
from transformers.models.gemma3n import (Gemma3nAudioConfig,
|
||||
Gemma3nAudioFeatureExtractor,
|
||||
Gemma3nConfig, Gemma3nProcessor,
|
||||
Gemma3nTextConfig,
|
||||
Gemma3nVisionConfig)
|
||||
from transformers.models.gemma3n import (
|
||||
Gemma3nAudioConfig,
|
||||
Gemma3nAudioFeatureExtractor,
|
||||
Gemma3nConfig,
|
||||
Gemma3nProcessor,
|
||||
Gemma3nTextConfig,
|
||||
Gemma3nVisionConfig,
|
||||
)
|
||||
from transformers.models.siglip import SiglipImageProcessorFast
|
||||
|
||||
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
|
||||
@ -22,25 +24,32 @@ from vllm.inputs.data import PromptType
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.linear import RowParallelLinear
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
VocabParallelEmbedding)
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
|
||||
from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||
MultiModalKwargsItems)
|
||||
from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems,
|
||||
MultiModalDataParser)
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
MultiModalPromptUpdates,
|
||||
MultiModalPromptUpdatesApplyResult,
|
||||
PlaceholderFeaturesInfo,
|
||||
PromptReplacement, PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
replace_token_matches)
|
||||
# yapf: enable
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFieldConfig,
|
||||
MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import (
|
||||
ImageProcessorItems,
|
||||
MultiModalDataItems,
|
||||
MultiModalDataParser,
|
||||
)
|
||||
from vllm.multimodal.processing import (
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
MultiModalPromptUpdates,
|
||||
MultiModalPromptUpdatesApplyResult,
|
||||
PlaceholderFeaturesInfo,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
replace_token_matches,
|
||||
)
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
@ -43,9 +43,6 @@ from vllm.multimodal.inputs import (
|
||||
MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import ImageProcessorItems, ImageSize
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.multimodal.processing import (
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
@ -54,18 +51,13 @@ from vllm.multimodal.processing import (
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
# yapf: disable
|
||||
from .idefics2_vision_model import (
|
||||
Idefics2VisionTransformer as Idefics3VisionTransformer,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
|
||||
from .llama import LlamaModel
|
||||
from .utils import AutoWeightsLoader, maybe_prefix
|
||||
|
||||
@ -45,9 +45,6 @@ from vllm.multimodal.parse import (
|
||||
ImageSize,
|
||||
MultiModalDataItems,
|
||||
)
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.multimodal.processing import (
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
@ -57,8 +54,6 @@ from vllm.multimodal.processing import (
|
||||
PromptUpdate,
|
||||
ResolvedPromptUpdate,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
@ -52,16 +52,12 @@ from vllm.distributed import utils as dist_utils
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.activation import get_act_and_mul_fn
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
|
||||
# yapf: disable
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
RowParallelLinear,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
|
||||
@ -37,12 +37,7 @@ from vllm.model_executor.layers.fla.ops import (
|
||||
fused_recurrent_gated_delta_rule,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.model_executor.layers.layernorm import GemmaRMSNorm as Qwen3NextRMSNorm
|
||||
|
||||
# yapf: enable
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
|
||||
@ -54,7 +54,6 @@ from .interfaces_base import (
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# yapf: disable
|
||||
_TEXT_GENERATION_MODELS = {
|
||||
# [Decoder-only]
|
||||
"ApertusForCausalLM": ("apertus", "ApertusForCausalLM"),
|
||||
@ -106,8 +105,8 @@ _TEXT_GENERATION_MODELS = {
|
||||
"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
|
||||
"GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
|
||||
"GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
|
||||
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501
|
||||
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
|
||||
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501
|
||||
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
|
||||
"GritLM": ("gritlm", "GritLM"),
|
||||
"Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
|
||||
"HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
|
||||
@ -127,7 +126,7 @@ _TEXT_GENERATION_MODELS = {
|
||||
"LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"),
|
||||
"MambaForCausalLM": ("mamba", "MambaForCausalLM"),
|
||||
"FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
|
||||
"FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"),
|
||||
"FalconH1ForCausalLM": ("falcon_h1", "FalconH1ForCausalLM"),
|
||||
"Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"),
|
||||
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
|
||||
"MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
|
||||
@ -184,7 +183,8 @@ _EMBEDDING_MODELS = {
|
||||
"LlamaModel": ("llama", "LlamaForCausalLM"),
|
||||
**{
|
||||
# Multiple models share the same architecture, so we include them all
|
||||
k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
|
||||
k: (mod, arch)
|
||||
for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
|
||||
if arch == "LlamaForCausalLM"
|
||||
},
|
||||
"MistralModel": ("llama", "LlamaForCausalLM"),
|
||||
@ -201,7 +201,10 @@ _EMBEDDING_MODELS = {
|
||||
"XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"),
|
||||
# [Multimodal]
|
||||
"CLIPModel": ("clip", "CLIPEmbeddingModel"),
|
||||
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
|
||||
"LlavaNextForConditionalGeneration": (
|
||||
"llava_next",
|
||||
"LlavaNextForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
|
||||
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
|
||||
# Technically Terratorch models work on images, both in
|
||||
@ -214,79 +217,150 @@ _EMBEDDING_MODELS = {
|
||||
_CROSS_ENCODER_MODELS = {
|
||||
"BertForSequenceClassification": ("bert", "BertForSequenceClassification"),
|
||||
"BertForTokenClassification": ("bert", "BertForTokenClassification"),
|
||||
"GteNewForSequenceClassification": ("bert_with_rope",
|
||||
"GteNewForSequenceClassification"),
|
||||
"ModernBertForSequenceClassification": ("modernbert",
|
||||
"ModernBertForSequenceClassification"),
|
||||
"RobertaForSequenceClassification": ("roberta",
|
||||
"RobertaForSequenceClassification"),
|
||||
"XLMRobertaForSequenceClassification": ("roberta",
|
||||
"RobertaForSequenceClassification"),
|
||||
"GteNewForSequenceClassification": (
|
||||
"bert_with_rope",
|
||||
"GteNewForSequenceClassification",
|
||||
),
|
||||
"ModernBertForSequenceClassification": (
|
||||
"modernbert",
|
||||
"ModernBertForSequenceClassification",
|
||||
),
|
||||
"RobertaForSequenceClassification": ("roberta", "RobertaForSequenceClassification"),
|
||||
"XLMRobertaForSequenceClassification": (
|
||||
"roberta",
|
||||
"RobertaForSequenceClassification",
|
||||
),
|
||||
# [Auto-converted (see adapters.py)]
|
||||
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
|
||||
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
|
||||
}
|
||||
|
||||
_MULTIMODAL_MODELS = {
|
||||
# [Decoder-only]
|
||||
"AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
|
||||
"AyaVisionForConditionalGeneration": ("aya_vision", "AyaVisionForConditionalGeneration"), # noqa: E501
|
||||
"AyaVisionForConditionalGeneration": (
|
||||
"aya_vision",
|
||||
"AyaVisionForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
|
||||
"ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501
|
||||
"Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501
|
||||
"ChameleonForConditionalGeneration": (
|
||||
"chameleon",
|
||||
"ChameleonForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"Cohere2VisionForConditionalGeneration": (
|
||||
"cohere2_vision",
|
||||
"Cohere2VisionForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
|
||||
"DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"),
|
||||
"Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"), # noqa: E501
|
||||
"Ernie4_5_VLMoeForConditionalGeneration": (
|
||||
"ernie45_vl",
|
||||
"Ernie4_5_VLMoeForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
|
||||
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501
|
||||
"Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501
|
||||
"Gemma3nForConditionalGeneration": (
|
||||
"gemma3n_mm",
|
||||
"Gemma3nForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
|
||||
"Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501
|
||||
"Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"), # noqa: E501
|
||||
"GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501
|
||||
"GraniteSpeechForConditionalGeneration": (
|
||||
"granite_speech",
|
||||
"GraniteSpeechForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
|
||||
"InternVLChatModel": ("internvl", "InternVLChatModel"),
|
||||
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
|
||||
"InternS1ForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501
|
||||
"InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501
|
||||
"Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
|
||||
"SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501
|
||||
"InternS1ForConditionalGeneration": (
|
||||
"interns1",
|
||||
"InternS1ForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"InternVLForConditionalGeneration": (
|
||||
"interns1",
|
||||
"InternS1ForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"Idefics3ForConditionalGeneration": (
|
||||
"idefics3",
|
||||
"Idefics3ForConditionalGeneration",
|
||||
),
|
||||
"SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"), # noqa: E501
|
||||
"KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
|
||||
"KeyeVL1_5ForConditionalGeneration": ("keye_vl1_5", "KeyeVL1_5ForConditionalGeneration"), # noqa: E501
|
||||
"KeyeVL1_5ForConditionalGeneration": (
|
||||
"keye_vl1_5",
|
||||
"KeyeVL1_5ForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
|
||||
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
|
||||
"Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
|
||||
"Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501
|
||||
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
|
||||
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
|
||||
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501
|
||||
"LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501
|
||||
"LlavaNextForConditionalGeneration": (
|
||||
"llava_next",
|
||||
"LlavaNextForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"LlavaNextVideoForConditionalGeneration": (
|
||||
"llava_next_video",
|
||||
"LlavaNextVideoForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"LlavaOnevisionForConditionalGeneration": (
|
||||
"llava_onevision",
|
||||
"LlavaOnevisionForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501
|
||||
"MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
|
||||
"MiniMaxVL01ForConditionalGeneration": ("minimax_vl_01", "MiniMaxVL01ForConditionalGeneration"), # noqa: E501
|
||||
"MiniMaxVL01ForConditionalGeneration": (
|
||||
"minimax_vl_01",
|
||||
"MiniMaxVL01ForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"MiniCPMO": ("minicpmo", "MiniCPMO"),
|
||||
"MiniCPMV": ("minicpmv", "MiniCPMV"),
|
||||
"Mistral3ForConditionalGeneration": ("mistral3", "Mistral3ForConditionalGeneration"), # noqa: E501
|
||||
"Mistral3ForConditionalGeneration": (
|
||||
"mistral3",
|
||||
"Mistral3ForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
|
||||
"NVLM_D": ("nvlm_d", "NVLM_D_Model"),
|
||||
"Ovis": ("ovis", "Ovis"),
|
||||
"Ovis2_5": ("ovis2_5", "Ovis2_5"),
|
||||
"PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501
|
||||
"PaliGemmaForConditionalGeneration": (
|
||||
"paligemma",
|
||||
"PaliGemmaForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
|
||||
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
|
||||
"Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"), # noqa: E501
|
||||
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501
|
||||
"QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501
|
||||
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
|
||||
"Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501
|
||||
"Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501
|
||||
"Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
|
||||
"Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
|
||||
"Qwen2_5_VLForConditionalGeneration": (
|
||||
"qwen2_5_vl",
|
||||
"Qwen2_5_VLForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"Qwen2AudioForConditionalGeneration": (
|
||||
"qwen2_audio",
|
||||
"Qwen2AudioForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"Qwen2_5OmniModel": (
|
||||
"qwen2_5_omni_thinker",
|
||||
"Qwen2_5OmniThinkerForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"Qwen2_5OmniForConditionalGeneration": (
|
||||
"qwen2_5_omni_thinker",
|
||||
"Qwen2_5OmniThinkerForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"), # noqa: E501
|
||||
"Qwen3VLMoeForConditionalGeneration": ("qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"), # noqa: E501
|
||||
"Qwen3VLMoeForConditionalGeneration": (
|
||||
"qwen3_vl_moe",
|
||||
"Qwen3VLMoeForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
|
||||
"Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501
|
||||
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
|
||||
"Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501
|
||||
"Tarsier2ForConditionalGeneration": (
|
||||
"qwen2_vl",
|
||||
"Tarsier2ForConditionalGeneration",
|
||||
), # noqa: E501
|
||||
"UltravoxModel": ("ultravox", "UltravoxModel"),
|
||||
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
|
||||
# [Encoder-decoder]
|
||||
@ -324,13 +398,27 @@ _TRANSFORMERS_BACKEND_MODELS = {
|
||||
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
|
||||
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
|
||||
"TransformersMoEForCausalLM": ("transformers_moe", "TransformersMoEForCausalLM"), # noqa: E501
|
||||
"TransformersMoEForMultimodalLM": ("transformers_moe", "TransformersMoEForMultimodalLM"), # noqa: E501
|
||||
"TransformersEmbeddingModel": ("transformers_pooling", "TransformersEmbeddingModel"), # noqa: E501
|
||||
"TransformersForSequenceClassification": ("transformers_pooling", "TransformersForSequenceClassification"), # noqa: E501
|
||||
"TransformersMoEForSequenceClassification": ("transformers_pooling", "TransformersMoEForSequenceClassification"), # noqa: E501
|
||||
"TransformersMoEEmbeddingModel": ("transformers_pooling", "TransformersMoEEmbeddingModel"), # noqa: E501
|
||||
"TransformersMoEForMultimodalLM": (
|
||||
"transformers_moe",
|
||||
"TransformersMoEForMultimodalLM",
|
||||
), # noqa: E501
|
||||
"TransformersEmbeddingModel": (
|
||||
"transformers_pooling",
|
||||
"TransformersEmbeddingModel",
|
||||
), # noqa: E501
|
||||
"TransformersForSequenceClassification": (
|
||||
"transformers_pooling",
|
||||
"TransformersForSequenceClassification",
|
||||
), # noqa: E501
|
||||
"TransformersMoEForSequenceClassification": (
|
||||
"transformers_pooling",
|
||||
"TransformersMoEForSequenceClassification",
|
||||
), # noqa: E501
|
||||
"TransformersMoEEmbeddingModel": (
|
||||
"transformers_pooling",
|
||||
"TransformersMoEEmbeddingModel",
|
||||
), # noqa: E501
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
_VLLM_MODELS = {
|
||||
**_TEXT_GENERATION_MODELS,
|
||||
|
||||
@ -8,13 +8,10 @@ from transformers import SmolVLMProcessor
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
# yapf: disable
|
||||
from .idefics3 import Idefics3DummyInputsBuilder as SmolVLMDummyInputsBuilder
|
||||
from .idefics3 import Idefics3ForConditionalGeneration, Idefics3ProcessingInfo
|
||||
from .idefics3 import Idefics3MultiModalProcessor as SmolVLMMultiModalProcessor
|
||||
|
||||
# yapf: enable
|
||||
|
||||
|
||||
class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
|
||||
def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:
|
||||
|
||||
@ -32,11 +32,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.models import SupportsPP
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.whisper import WhisperEncoder
|
||||
|
||||
# yapf: enable
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
|
||||
@ -28,7 +28,6 @@ def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optiona
|
||||
return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
|
||||
|
||||
|
||||
# yapf: disable
|
||||
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
|
||||
"blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
|
||||
"clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
|
||||
@ -39,7 +38,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
|
||||
"paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
|
||||
"qwen": _get_qwen_chat_template_fallback,
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def register_chat_template_fallback_path(
|
||||
|
||||
@ -1,12 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
|
||||
""" Arctic model configuration"""
|
||||
"""Arctic model configuration"""
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# Adapted from
|
||||
# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
|
||||
@ -16,7 +15,7 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module
|
||||
|
||||
|
||||
class Nemotron_Nano_VL_Config(PretrainedConfig):
|
||||
model_type = 'Llama_Nemotron_Nano_VL'
|
||||
model_type = "Llama_Nemotron_Nano_VL"
|
||||
is_composition = True
|
||||
|
||||
def __init__(
|
||||
@ -26,17 +25,22 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
|
||||
force_image_size=None,
|
||||
downsample_ratio=0.5,
|
||||
template=None,
|
||||
ps_version='v1',
|
||||
ps_version="v1",
|
||||
image_tag_type="internvl",
|
||||
projector_hidden_size=4096,
|
||||
vit_hidden_size=1280,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if vision_config is not None:
|
||||
assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
|
||||
vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
|
||||
assert (
|
||||
"auto_map" in vision_config
|
||||
and "AutoConfig" in vision_config["auto_map"]
|
||||
)
|
||||
vision_auto_config = get_class_from_dynamic_module(
|
||||
*vision_config["auto_map"]["AutoConfig"].split("--")[::-1]
|
||||
)
|
||||
self.vision_config = vision_auto_config(**vision_config)
|
||||
else:
|
||||
self.vision_config = PretrainedConfig()
|
||||
@ -51,6 +55,6 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
|
||||
self.downsample_ratio = downsample_ratio
|
||||
self.template = template # TODO move out of here and into the tokenizer
|
||||
self.ps_version = ps_version # Pixel shuffle version
|
||||
self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
|
||||
self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
|
||||
self.projector_hidden_size = projector_hidden_size
|
||||
self.vit_hidden_size = vit_hidden_size
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
|
||||
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
|
||||
@ -70,34 +69,37 @@ class AIMv2Config(PretrainedConfig):
|
||||
# Visual Tokenizer Configuration
|
||||
# ----------------------------------------------------------------------
|
||||
class BaseVisualTokenizerConfig(PretrainedConfig):
|
||||
|
||||
def __init__(self,
|
||||
vocab_size=16384,
|
||||
tokenize_function="softmax",
|
||||
tau=1.0,
|
||||
depths=None,
|
||||
drop_cls_token=False,
|
||||
backbone_config: Optional[Union[PretrainedConfig,
|
||||
dict]] = None,
|
||||
hidden_stride: int = 1,
|
||||
**kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=16384,
|
||||
tokenize_function="softmax",
|
||||
tau=1.0,
|
||||
depths=None,
|
||||
drop_cls_token=False,
|
||||
backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
|
||||
hidden_stride: int = 1,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.tokenize_function = tokenize_function
|
||||
self.tau = tau
|
||||
if isinstance(depths, str):
|
||||
depths = [int(x) for x in depths.split('|')]
|
||||
depths = [int(x) for x in depths.split("|")]
|
||||
self.depths = depths
|
||||
self.backbone_kwargs = dict[str, Any]()
|
||||
self.drop_cls_token = drop_cls_token
|
||||
if backbone_config is not None:
|
||||
assert isinstance(backbone_config, (PretrainedConfig, dict)), \
|
||||
assert isinstance(backbone_config, (PretrainedConfig, dict)), (
|
||||
f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
|
||||
)
|
||||
if not isinstance(backbone_config, PretrainedConfig):
|
||||
model_type = backbone_config['model_type']
|
||||
model_type = backbone_config["model_type"]
|
||||
if model_type != "aimv2":
|
||||
backbone_config.pop('model_type')
|
||||
backbone_config = AutoConfig.for_model(model_type, **backbone_config)
|
||||
backbone_config.pop("model_type")
|
||||
backbone_config = AutoConfig.for_model(
|
||||
model_type, **backbone_config
|
||||
)
|
||||
else:
|
||||
backbone_config = AIMv2Config(**backbone_config)
|
||||
self.backbone_config = backbone_config
|
||||
@ -113,7 +115,7 @@ class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
|
||||
self.drop_cls_token = False
|
||||
if self.depths:
|
||||
assert len(self.depths) == 1
|
||||
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
|
||||
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
|
||||
|
||||
|
||||
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
|
||||
@ -125,7 +127,7 @@ class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
|
||||
self.drop_cls_token = False
|
||||
if self.depths:
|
||||
assert len(self.depths) == 1
|
||||
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
|
||||
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
|
||||
|
||||
|
||||
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
|
||||
@ -138,35 +140,39 @@ AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
|
||||
class OvisConfig(PretrainedConfig):
|
||||
model_type = "ovis"
|
||||
|
||||
def __init__(self,
|
||||
llm_config: Optional[Union[PretrainedConfig, dict]] = None,
|
||||
visual_tokenizer_config: Optional[Union[PretrainedConfig,
|
||||
dict]] = None,
|
||||
multimodal_max_length=8192,
|
||||
hidden_size=None,
|
||||
conversation_formatter_class=None,
|
||||
llm_attn_implementation=None,
|
||||
disable_tie_weight=False,
|
||||
**kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
llm_config: Optional[Union[PretrainedConfig, dict]] = None,
|
||||
visual_tokenizer_config: Optional[Union[PretrainedConfig, dict]] = None,
|
||||
multimodal_max_length=8192,
|
||||
hidden_size=None,
|
||||
conversation_formatter_class=None,
|
||||
llm_attn_implementation=None,
|
||||
disable_tie_weight=False,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
if llm_config is not None:
|
||||
assert isinstance(llm_config, (PretrainedConfig, dict)), \
|
||||
assert isinstance(llm_config, (PretrainedConfig, dict)), (
|
||||
f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
|
||||
)
|
||||
if not isinstance(llm_config, PretrainedConfig):
|
||||
model_type = llm_config['model_type']
|
||||
llm_config.pop('model_type')
|
||||
model_type = llm_config["model_type"]
|
||||
llm_config.pop("model_type")
|
||||
llm_config = AutoConfig.for_model(model_type, **llm_config)
|
||||
|
||||
# map llm_config to text_config
|
||||
self.text_config = llm_config
|
||||
if visual_tokenizer_config is not None:
|
||||
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
|
||||
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
|
||||
f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
|
||||
)
|
||||
if not isinstance(visual_tokenizer_config, PretrainedConfig):
|
||||
model_type = visual_tokenizer_config['model_type']
|
||||
visual_tokenizer_config.pop('model_type')
|
||||
model_type = visual_tokenizer_config["model_type"]
|
||||
visual_tokenizer_config.pop("model_type")
|
||||
visual_tokenizer_config = AutoConfig.for_model(
|
||||
model_type, **visual_tokenizer_config)
|
||||
model_type, **visual_tokenizer_config
|
||||
)
|
||||
|
||||
self.visual_tokenizer_config = visual_tokenizer_config
|
||||
self.multimodal_max_length = multimodal_max_length
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
|
||||
@ -35,11 +34,12 @@ from transformers.processing_utils import ProcessorMixin
|
||||
|
||||
|
||||
class ImageTransform:
|
||||
|
||||
def __init__(self,
|
||||
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
normalize: bool = True):
|
||||
def __init__(
|
||||
self,
|
||||
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
normalize: bool = True,
|
||||
):
|
||||
self.mean = mean
|
||||
self.std = std
|
||||
self.normalize = normalize
|
||||
@ -77,7 +77,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
||||
ignore_id: int = -100,
|
||||
**kwargs,
|
||||
):
|
||||
|
||||
self.candidate_resolutions = candidate_resolutions
|
||||
self.image_size = candidate_resolutions[0][0]
|
||||
self.patch_size = patch_size
|
||||
@ -86,13 +85,15 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
||||
self.normalize = normalize
|
||||
self.downsample_ratio = downsample_ratio
|
||||
|
||||
self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
|
||||
self.image_transform = ImageTransform(
|
||||
mean=image_mean, std=image_std, normalize=normalize
|
||||
)
|
||||
self.tokenizer = tokenizer
|
||||
self.tokenizer.padding_side = 'left' # must set this,padding side with make a difference in batch inference
|
||||
self.tokenizer.padding_side = "left" # must set this,padding side with make a difference in batch inference
|
||||
|
||||
# add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
|
||||
if tokenizer.pad_token is None:
|
||||
self.tokenizer.add_special_tokens({'pad_token': pad_token})
|
||||
self.tokenizer.add_special_tokens({"pad_token": pad_token})
|
||||
|
||||
# add image token
|
||||
image_token_id = self.tokenizer.vocab.get(image_token)
|
||||
@ -104,7 +105,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
||||
|
||||
# add five special tokens for grounding-related tasks
|
||||
# <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
|
||||
special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
|
||||
special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
|
||||
special_tokens_dict = {"additional_special_tokens": special_tokens}
|
||||
self.tokenizer.add_special_tokens(special_tokens_dict)
|
||||
|
||||
@ -134,15 +135,19 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
||||
|
||||
for width, height in self.candidate_resolutions:
|
||||
scale = min(width / original_width, height / original_height)
|
||||
downscaled_width, downscaled_height = int(
|
||||
original_width * scale), int(original_height * scale)
|
||||
effective_resolution = min(downscaled_width * downscaled_height,
|
||||
original_width * original_height)
|
||||
downscaled_width, downscaled_height = (
|
||||
int(original_width * scale),
|
||||
int(original_height * scale),
|
||||
)
|
||||
effective_resolution = min(
|
||||
downscaled_width * downscaled_height, original_width * original_height
|
||||
)
|
||||
wasted_resolution = (width * height) - effective_resolution
|
||||
|
||||
if effective_resolution > max_effective_resolution or (
|
||||
effective_resolution == max_effective_resolution
|
||||
and wasted_resolution < min_wasted_resolution):
|
||||
effective_resolution == max_effective_resolution
|
||||
and wasted_resolution < min_wasted_resolution
|
||||
):
|
||||
max_effective_resolution = effective_resolution
|
||||
min_wasted_resolution = wasted_resolution
|
||||
best_fit = (width, height)
|
||||
@ -198,12 +203,20 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
||||
- num_image_tokens (list[int]): the number of image tokens
|
||||
"""
|
||||
|
||||
assert (prompt is not None and images is not None
|
||||
), "prompt and images must be used at the same time."
|
||||
assert prompt is not None and images is not None, (
|
||||
"prompt and images must be used at the same time."
|
||||
)
|
||||
|
||||
sft_format = prompt
|
||||
tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
|
||||
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
|
||||
(
|
||||
tokenized_str,
|
||||
images_list,
|
||||
images_seq_mask,
|
||||
images_spatial_crop,
|
||||
num_image_tokens,
|
||||
) = self.tokenize_with_images(
|
||||
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
|
||||
)
|
||||
masked_tokenized_str = []
|
||||
for token_index in tokenized_str:
|
||||
if token_index != self.image_token_id:
|
||||
@ -211,17 +224,21 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
||||
else:
|
||||
masked_tokenized_str.append(self.ignore_id)
|
||||
|
||||
assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
|
||||
(f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
|
||||
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
|
||||
assert (
|
||||
len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
|
||||
), (
|
||||
f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
|
||||
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
|
||||
)
|
||||
|
||||
input_ids = torch.LongTensor(tokenized_str)
|
||||
target_ids = torch.LongTensor(masked_tokenized_str)
|
||||
images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
|
||||
|
||||
# set input_ids < 0 | input_ids == self.image_token_id as ignore_id
|
||||
target_ids[(input_ids < 0) |
|
||||
(input_ids == self.image_token_id)] = self.ignore_id
|
||||
target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
|
||||
self.ignore_id
|
||||
)
|
||||
input_ids[input_ids < 0] = self.pad_id
|
||||
|
||||
if inference_mode:
|
||||
@ -311,30 +328,50 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
||||
best_width, best_height = self.image_size, self.image_size
|
||||
|
||||
"""process the global view"""
|
||||
global_view = ImageOps.pad(image, (self.image_size, self.image_size),
|
||||
color=tuple(int(x * 255) for x in self.image_transform.mean))
|
||||
global_view = ImageOps.pad(
|
||||
image,
|
||||
(self.image_size, self.image_size),
|
||||
color=tuple(int(x * 255) for x in self.image_transform.mean),
|
||||
)
|
||||
images_list.append(self.image_transform(global_view))
|
||||
|
||||
"""process the local views"""
|
||||
local_view = ImageOps.pad(image, (best_width, best_height),
|
||||
color=tuple(int(x * 255) for x in self.image_transform.mean))
|
||||
local_view = ImageOps.pad(
|
||||
image,
|
||||
(best_width, best_height),
|
||||
color=tuple(int(x * 255) for x in self.image_transform.mean),
|
||||
)
|
||||
for i in range(0, best_height, self.image_size):
|
||||
for j in range(0, best_width, self.image_size):
|
||||
images_list.append(
|
||||
self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
|
||||
self.image_transform(
|
||||
local_view.crop(
|
||||
(j, i, j + self.image_size, i + self.image_size)
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
"""record height / width crop num"""
|
||||
num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
|
||||
num_width_tiles, num_height_tiles = (
|
||||
best_width // self.image_size,
|
||||
best_height // self.image_size,
|
||||
)
|
||||
images_spatial_crop.append([num_width_tiles, num_height_tiles])
|
||||
|
||||
"""add image tokens"""
|
||||
h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
|
||||
h = w = math.ceil(
|
||||
(self.image_size // self.patch_size) / self.downsample_ratio
|
||||
)
|
||||
# global views tokens h * (w + 1), 1 is for line separator
|
||||
tokenized_image = [self.image_token_id] * h * (w + 1)
|
||||
# add a separator between global and local views
|
||||
tokenized_image += [self.image_token_id]
|
||||
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
|
||||
tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
|
||||
tokenized_image += (
|
||||
[self.image_token_id]
|
||||
* (num_height_tiles * h)
|
||||
* (num_width_tiles * w + 1)
|
||||
)
|
||||
|
||||
tokenized_str += tokenized_image
|
||||
images_seq_mask += [True] * len(tokenized_image)
|
||||
@ -353,10 +390,17 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
||||
tokenized_str = tokenized_str + [self.eos_id]
|
||||
images_seq_mask = images_seq_mask + [False]
|
||||
|
||||
assert len(tokenized_str) == len(
|
||||
images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
|
||||
assert len(tokenized_str) == len(images_seq_mask), (
|
||||
f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
|
||||
)
|
||||
|
||||
return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
|
||||
return (
|
||||
tokenized_str,
|
||||
images_list,
|
||||
images_seq_mask,
|
||||
images_spatial_crop,
|
||||
num_image_tokens,
|
||||
)
|
||||
|
||||
|
||||
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
|
||||
@ -35,23 +34,24 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
|
||||
__all__ = ['OvisProcessor']
|
||||
__all__ = ["OvisProcessor"]
|
||||
IGNORE_ID = -100
|
||||
|
||||
class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
|
||||
|
||||
class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
},
|
||||
"images_kwargs": {
|
||||
'max_partition':9,
|
||||
'covering_threshold':0.9,
|
||||
'convert_to_rgb':True,
|
||||
'return_tensors':'pt'},
|
||||
"max_partition": 9,
|
||||
"covering_threshold": 0.9,
|
||||
"convert_to_rgb": True,
|
||||
"return_tensors": "pt",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
class OvisProcessor(ProcessorMixin):
|
||||
r"""
|
||||
Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
|
||||
@ -97,14 +97,16 @@ class OvisProcessor(ProcessorMixin):
|
||||
"image_col_sep": -303,
|
||||
"image_row_sep": -304,
|
||||
"image_end": -305,
|
||||
'image_pad': image_pad_token_id,
|
||||
"image_pad": image_pad_token_id,
|
||||
}
|
||||
return extra_special_tokens
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
images: ImageInput = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
text: Union[
|
||||
TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
|
||||
] = None,
|
||||
**kwargs: Unpack[OvisProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
@ -169,7 +171,6 @@ class OvisProcessor(ProcessorMixin):
|
||||
|
||||
# Process text input
|
||||
if text is not None:
|
||||
|
||||
if not isinstance(text, list):
|
||||
text = [text]
|
||||
|
||||
@ -178,7 +179,10 @@ class OvisProcessor(ProcessorMixin):
|
||||
replaced_ids_list = []
|
||||
idx = 0
|
||||
for ids_tensor in tokenized_batched_text:
|
||||
if image_token_id in ids_tensor and "image_placeholders" in image_features:
|
||||
if (
|
||||
image_token_id in ids_tensor
|
||||
and "image_placeholders" in image_features
|
||||
):
|
||||
if idx < len(image_features["image_placeholders"]):
|
||||
# Converts in list for ease of use
|
||||
ids_list = ids_tensor.tolist()
|
||||
@ -188,7 +192,9 @@ class OvisProcessor(ProcessorMixin):
|
||||
# replace placeholders
|
||||
for i, token_id in enumerate(ids_list):
|
||||
if token_id == image_token_id:
|
||||
placeholder_ids = image_features["image_placeholders"][idx]
|
||||
placeholder_ids = image_features["image_placeholders"][
|
||||
idx
|
||||
]
|
||||
new_ids.extend(placeholder_ids)
|
||||
idx += 1
|
||||
else:
|
||||
@ -198,7 +204,8 @@ class OvisProcessor(ProcessorMixin):
|
||||
ids_tensor = torch.tensor(new_ids, dtype=torch.long)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
'Mismatch between the images you provided and the number of placeholder present in the text')
|
||||
"Mismatch between the images you provided and the number of placeholder present in the text"
|
||||
)
|
||||
|
||||
replaced_ids_list.append(ids_tensor)
|
||||
|
||||
@ -217,7 +224,7 @@ class OvisProcessor(ProcessorMixin):
|
||||
# Add image features if present
|
||||
if image_features:
|
||||
output["pixel_values"] = processed_images
|
||||
output['grids'] = grids
|
||||
output["grids"] = grids
|
||||
|
||||
return output
|
||||
|
||||
@ -227,8 +234,10 @@ class OvisProcessor(ProcessorMixin):
|
||||
def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
|
||||
batch_token_ids = []
|
||||
for text in text_list:
|
||||
text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
|
||||
text.split(self.image_token)]
|
||||
text_chunks = [
|
||||
self.tokenizer(chunk, add_special_tokens=False).input_ids
|
||||
for chunk in text.split(self.image_token)
|
||||
]
|
||||
token_ids = []
|
||||
num_chuck = len(text_chunks)
|
||||
for i, chunk in enumerate(text_chunks):
|
||||
@ -240,50 +249,60 @@ class OvisProcessor(ProcessorMixin):
|
||||
|
||||
def get_image_size(self):
|
||||
size = self.image_processor.size
|
||||
if 'shortest_edge' in size:
|
||||
width = height = size['shortest_edge']
|
||||
if "shortest_edge" in size:
|
||||
width = height = size["shortest_edge"]
|
||||
elif "height" in size and "width" in size:
|
||||
width = size['width']
|
||||
height = size['height']
|
||||
width = size["width"]
|
||||
height = size["height"]
|
||||
else:
|
||||
raise ValueError( "Can't parse image size from image_processor config.")
|
||||
raise ValueError("Can't parse image size from image_processor config.")
|
||||
return height, width
|
||||
|
||||
def get_token_value(self, tok):
|
||||
return self.extra_special_tokens[tok]
|
||||
|
||||
def construct_image_indicators(self, grid):
|
||||
image_placeholders = [self.get_token_value('image_start'),
|
||||
self.get_token_value('image_atom'),
|
||||
self.get_token_value('image_prefix')]
|
||||
image_placeholders = [
|
||||
self.get_token_value("image_start"),
|
||||
self.get_token_value("image_atom"),
|
||||
self.get_token_value("image_prefix"),
|
||||
]
|
||||
if grid[0] * grid[1] > 1:
|
||||
for r in range(grid[0]):
|
||||
for c in range(grid[1]):
|
||||
image_placeholders.append(self.get_token_value('image_atom') )
|
||||
image_placeholders.append(self.get_token_value("image_atom"))
|
||||
if c < grid[1] - 1:
|
||||
image_placeholders.append(self.get_token_value('image_col_sep'))
|
||||
image_placeholders.append(self.get_token_value("image_col_sep"))
|
||||
if r < grid[0] - 1:
|
||||
image_placeholders.append(self.get_token_value('image_row_sep'))
|
||||
image_placeholders.append(self.get_token_value('image_end'))
|
||||
image_placeholders.append(self.get_token_value("image_row_sep"))
|
||||
image_placeholders.append(self.get_token_value("image_end"))
|
||||
return image_placeholders
|
||||
|
||||
def construct_image_placeholders(self, grid):
|
||||
|
||||
image_placeholders = self.construct_image_indicators(grid)
|
||||
|
||||
image_atom_token_id = self.get_token_value('image_atom')
|
||||
image_atom_token_id = self.get_token_value("image_atom")
|
||||
# Extract the padding token ID from tokenizer
|
||||
image_padding_token_id = self.get_token_value('image_pad')
|
||||
image_padding_token_id = self.get_token_value("image_pad")
|
||||
|
||||
# Create a new list with padding tokens inserted
|
||||
padded_placeholder_tokens = []
|
||||
for token in image_placeholders:
|
||||
padded_placeholder_tokens.append(image_padding_token_id)
|
||||
if token == image_atom_token_id:
|
||||
padded_placeholder_tokens.extend([image_padding_token_id] * self.image_segment_len)
|
||||
padded_placeholder_tokens.extend(
|
||||
[image_padding_token_id] * self.image_segment_len
|
||||
)
|
||||
return padded_placeholder_tokens
|
||||
|
||||
def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
|
||||
def preprocess_image(
|
||||
self,
|
||||
image: PIL.Image.Image,
|
||||
max_partition,
|
||||
covering_threshold,
|
||||
convert_to_rgb,
|
||||
return_tensors,
|
||||
):
|
||||
def _preprocess(img: PIL.Image.Image, side):
|
||||
# first resize and preprocess
|
||||
w, h = img.size
|
||||
@ -296,19 +315,27 @@ class OvisProcessor(ProcessorMixin):
|
||||
new_height = side
|
||||
new_width = int(w / h * new_height)
|
||||
new_size = dict(height=new_height, width=new_width)
|
||||
pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
|
||||
pixel_values = self.image_processor.preprocess(
|
||||
img, size=new_size, return_tensors=return_tensors
|
||||
)["pixel_values"]
|
||||
|
||||
# then pad to square
|
||||
square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
|
||||
square_values = torch.zeros(
|
||||
[1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
|
||||
)
|
||||
new_height, new_width = pixel_values.shape[2:]
|
||||
if new_height == new_width:
|
||||
square_values[:, :, :, :] = pixel_values
|
||||
elif new_height > new_width:
|
||||
from_index = (side - new_width) // 2
|
||||
square_values[:, :, :, from_index:from_index + new_width] = pixel_values
|
||||
square_values[:, :, :, from_index : from_index + new_width] = (
|
||||
pixel_values
|
||||
)
|
||||
else:
|
||||
from_index = (side - new_height) // 2
|
||||
square_values[:, :, from_index:from_index + new_height, :] = pixel_values
|
||||
square_values[:, :, from_index : from_index + new_height, :] = (
|
||||
pixel_values
|
||||
)
|
||||
|
||||
return square_values
|
||||
|
||||
@ -350,7 +377,9 @@ class OvisProcessor(ProcessorMixin):
|
||||
good_grids = []
|
||||
for grid in candidate_grids:
|
||||
partition = _partition(img, grid)
|
||||
covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
|
||||
covering_ratio = (
|
||||
sum([_covering_area(*p, side) for p in partition]) / img_area
|
||||
)
|
||||
assert covering_ratio <= 1.0
|
||||
all_grids.append((grid, covering_ratio))
|
||||
if covering_ratio > covering_threshold:
|
||||
@ -358,18 +387,19 @@ class OvisProcessor(ProcessorMixin):
|
||||
|
||||
if len(good_grids) > 0:
|
||||
# pick the good partition with minimum #sub_images and break the tie using covering_ratio
|
||||
return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
|
||||
return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
|
||||
0
|
||||
]
|
||||
else:
|
||||
# pick the partition with maximum covering_ratio and break the tie using #sub_images
|
||||
return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
|
||||
|
||||
if convert_to_rgb:
|
||||
image = convert_image_mode(image, 'RGB')
|
||||
|
||||
image = convert_image_mode(image, "RGB")
|
||||
|
||||
sides = self.get_image_size()
|
||||
if sides[0] != sides[1]:
|
||||
raise ValueError('get_image_size() returns non-square size')
|
||||
raise ValueError("get_image_size() returns non-square size")
|
||||
side = sides[0]
|
||||
grid = _get_best_grid(image, side)
|
||||
partition = _partition(image, grid)
|
||||
@ -405,14 +435,18 @@ class OvisProcessor(ProcessorMixin):
|
||||
`list[str]`: The decoded text.
|
||||
"""
|
||||
return self.tokenizer.batch_decode(
|
||||
generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
||||
generated_outputs,
|
||||
skip_special_tokens=True,
|
||||
clean_up_tokenization_spaces=False,
|
||||
)
|
||||
|
||||
@property
|
||||
def model_input_names(self):
|
||||
tokenizer_input_names = self.tokenizer.model_input_names
|
||||
image_processor_input_names = self.image_processor.model_input_names
|
||||
names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||
names_from_processor = list(
|
||||
dict.fromkeys(tokenizer_input_names + image_processor_input_names)
|
||||
)
|
||||
return names_from_processor + ["second_per_grid_ts"]
|
||||
|
||||
|
||||
|
||||
@ -40,9 +40,6 @@ from vllm.utils.flashinfer import (
|
||||
supports_trtllm_attention,
|
||||
use_trtllm_attention,
|
||||
)
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.v1.attention.backends.utils import (
|
||||
AttentionCGSupport,
|
||||
AttentionMetadataBuilder,
|
||||
@ -52,8 +49,6 @@ from vllm.v1.attention.backends.utils import (
|
||||
infer_global_hyperparameters,
|
||||
split_decodes_and_prefills,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||
|
||||
FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024
|
||||
|
||||
@ -11,9 +11,6 @@ from vllm.attention.backends.abstract import AttentionLayer
|
||||
from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.utils import cdiv
|
||||
|
||||
# yapf conflicts with isort for this docstring
|
||||
# yapf: disable
|
||||
from vllm.v1.attention.backends.mla.common import (
|
||||
MLACommonBackend,
|
||||
MLACommonDecodeMetadata,
|
||||
@ -24,8 +21,6 @@ from vllm.v1.attention.backends.mla.common import (
|
||||
from vllm.v1.attention.backends.utils import AttentionCGSupport
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def is_aiter_mla_enabled() -> bool:
|
||||
return envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MLA
|
||||
|
||||
@ -18,8 +18,6 @@ from msgspec import msgpack
|
||||
|
||||
from vllm import envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
# yapf: disable
|
||||
from vllm.multimodal.inputs import (
|
||||
BaseMultiModalField,
|
||||
MultiModalBatchedField,
|
||||
@ -32,8 +30,6 @@ from vllm.multimodal.inputs import (
|
||||
MultiModalSharedField,
|
||||
NestedTensors,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.v1.engine import UtilityResult
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -48,9 +48,6 @@ from vllm.model_executor.layers.mamba.abstract import MambaBase
|
||||
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
||||
from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
|
||||
from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.interfaces import (
|
||||
SupportsMultiModal,
|
||||
is_mixture_of_experts,
|
||||
@ -59,8 +56,6 @@ from vllm.model_executor.models.interfaces import (
|
||||
supports_multimodal_pruning,
|
||||
supports_transcription,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.model_executor.models.interfaces_base import (
|
||||
VllmModelForPooling,
|
||||
is_pooling_model,
|
||||
@ -101,9 +96,6 @@ from vllm.v1.attention.backends.utils import (
|
||||
split_attn_metadata,
|
||||
)
|
||||
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
|
||||
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.v1.kv_cache_interface import (
|
||||
AttentionSpec,
|
||||
ChunkedLocalAttentionSpec,
|
||||
@ -118,8 +110,6 @@ from vllm.v1.kv_cache_interface import (
|
||||
SlidingWindowSpec,
|
||||
UniformTypeKVCacheSpecs,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
from vllm.v1.outputs import (
|
||||
EMPTY_MODEL_RUNNER_OUTPUT,
|
||||
AsyncModelRunnerOutput,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user