mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 14:17:16 +08:00
Remove all references to yapf as it's no longer used (#26251)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
d6953beb91
commit
4e256cadc2
@ -12,9 +12,6 @@ from functools import reduce
|
|||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
import jinja2
|
import jinja2
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm_cutlass_library_extension import (
|
from vllm_cutlass_library_extension import (
|
||||||
DataType,
|
DataType,
|
||||||
EpilogueScheduleTag,
|
EpilogueScheduleTag,
|
||||||
@ -31,8 +28,6 @@ from vllm_cutlass_library_extension import (
|
|||||||
VLLMKernelScheduleTag,
|
VLLMKernelScheduleTag,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Generator templating
|
# Generator templating
|
||||||
#
|
#
|
||||||
|
|||||||
@ -21,8 +21,6 @@ from vllm.utils import FlexibleArgumentParser
|
|||||||
logger = logging.getLogger()
|
logger = logging.getLogger()
|
||||||
|
|
||||||
|
|
||||||
# yapf conflicts with isort for this docstring
|
|
||||||
# yapf: disable
|
|
||||||
"""
|
"""
|
||||||
tensorize_vllm_model.py is a script that can be used to serialize and
|
tensorize_vllm_model.py is a script that can be used to serialize and
|
||||||
deserialize vLLM models. These models can be loaded using tensorizer
|
deserialize vLLM models. These models can be loaded using tensorizer
|
||||||
@ -132,7 +130,8 @@ def get_parser():
|
|||||||
"can be loaded using tensorizer directly to the GPU "
|
"can be loaded using tensorizer directly to the GPU "
|
||||||
"extremely quickly. Tensor encryption and decryption is "
|
"extremely quickly. Tensor encryption and decryption is "
|
||||||
"also supported, although libsodium must be installed to "
|
"also supported, although libsodium must be installed to "
|
||||||
"use it.")
|
"use it."
|
||||||
|
)
|
||||||
parser = EngineArgs.add_cli_args(parser)
|
parser = EngineArgs.add_cli_args(parser)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -144,13 +143,14 @@ def get_parser():
|
|||||||
"along with the model by instantiating a TensorizerConfig object, "
|
"along with the model by instantiating a TensorizerConfig object, "
|
||||||
"creating a dict from it with TensorizerConfig.to_serializable(), "
|
"creating a dict from it with TensorizerConfig.to_serializable(), "
|
||||||
"and passing it to LoRARequest's initializer with the kwarg "
|
"and passing it to LoRARequest's initializer with the kwarg "
|
||||||
"tensorizer_config_dict."
|
"tensorizer_config_dict.",
|
||||||
)
|
)
|
||||||
|
|
||||||
subparsers = parser.add_subparsers(dest='command', required=True)
|
subparsers = parser.add_subparsers(dest="command", required=True)
|
||||||
|
|
||||||
serialize_parser = subparsers.add_parser(
|
serialize_parser = subparsers.add_parser(
|
||||||
'serialize', help="Serialize a model to `--serialized-directory`")
|
"serialize", help="Serialize a model to `--serialized-directory`"
|
||||||
|
)
|
||||||
|
|
||||||
serialize_parser.add_argument(
|
serialize_parser.add_argument(
|
||||||
"--suffix",
|
"--suffix",
|
||||||
@ -163,7 +163,9 @@ def get_parser():
|
|||||||
"`--suffix` is `v1`, the serialized model tensors will be "
|
"`--suffix` is `v1`, the serialized model tensors will be "
|
||||||
"saved to "
|
"saved to "
|
||||||
"`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
|
"`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
|
||||||
"If none is provided, a random UUID will be used."))
|
"If none is provided, a random UUID will be used."
|
||||||
|
),
|
||||||
|
)
|
||||||
serialize_parser.add_argument(
|
serialize_parser.add_argument(
|
||||||
"--serialized-directory",
|
"--serialized-directory",
|
||||||
type=str,
|
type=str,
|
||||||
@ -175,108 +177,127 @@ def get_parser():
|
|||||||
"and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
|
"and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
|
||||||
"be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
|
"be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
|
||||||
"where `suffix` is given by `--suffix` or a random UUID if not "
|
"where `suffix` is given by `--suffix` or a random UUID if not "
|
||||||
"provided.")
|
"provided.",
|
||||||
|
)
|
||||||
|
|
||||||
serialize_parser.add_argument(
|
serialize_parser.add_argument(
|
||||||
"--serialization-kwargs",
|
"--serialization-kwargs",
|
||||||
type=tensorizer_kwargs_arg,
|
type=tensorizer_kwargs_arg,
|
||||||
required=False,
|
required=False,
|
||||||
help=("A JSON string containing additional keyword arguments to "
|
help=(
|
||||||
"pass to Tensorizer's TensorSerializer during "
|
"A JSON string containing additional keyword arguments to "
|
||||||
"serialization."))
|
"pass to Tensorizer's TensorSerializer during "
|
||||||
|
"serialization."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
serialize_parser.add_argument(
|
serialize_parser.add_argument(
|
||||||
"--keyfile",
|
"--keyfile",
|
||||||
type=str,
|
type=str,
|
||||||
required=False,
|
required=False,
|
||||||
help=("Encrypt the model weights with a randomly-generated binary key,"
|
help=(
|
||||||
" and save the key at this path"))
|
"Encrypt the model weights with a randomly-generated binary key,"
|
||||||
|
" and save the key at this path"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
deserialize_parser = subparsers.add_parser(
|
deserialize_parser = subparsers.add_parser(
|
||||||
'deserialize',
|
"deserialize",
|
||||||
help=("Deserialize a model from `--path-to-tensors`"
|
help=(
|
||||||
" to verify it can be loaded and used."))
|
"Deserialize a model from `--path-to-tensors`"
|
||||||
|
" to verify it can be loaded and used."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
deserialize_parser.add_argument(
|
deserialize_parser.add_argument(
|
||||||
"--path-to-tensors",
|
"--path-to-tensors",
|
||||||
type=str,
|
type=str,
|
||||||
required=False,
|
required=False,
|
||||||
help="The local path or S3 URI to the model tensors to deserialize. ")
|
help="The local path or S3 URI to the model tensors to deserialize. ",
|
||||||
|
)
|
||||||
|
|
||||||
deserialize_parser.add_argument(
|
deserialize_parser.add_argument(
|
||||||
"--serialized-directory",
|
"--serialized-directory",
|
||||||
type=str,
|
type=str,
|
||||||
required=False,
|
required=False,
|
||||||
help="Directory with model artifacts for loading. Assumes a "
|
help="Directory with model artifacts for loading. Assumes a "
|
||||||
"model.tensors file exists therein. Can supersede "
|
"model.tensors file exists therein. Can supersede "
|
||||||
"--path-to-tensors.")
|
"--path-to-tensors.",
|
||||||
|
)
|
||||||
|
|
||||||
deserialize_parser.add_argument(
|
deserialize_parser.add_argument(
|
||||||
"--keyfile",
|
"--keyfile",
|
||||||
type=str,
|
type=str,
|
||||||
required=False,
|
required=False,
|
||||||
help=("Path to a binary key to use to decrypt the model weights,"
|
help=(
|
||||||
" if the model was serialized with encryption"))
|
"Path to a binary key to use to decrypt the model weights,"
|
||||||
|
" if the model was serialized with encryption"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
deserialize_parser.add_argument(
|
deserialize_parser.add_argument(
|
||||||
"--deserialization-kwargs",
|
"--deserialization-kwargs",
|
||||||
type=tensorizer_kwargs_arg,
|
type=tensorizer_kwargs_arg,
|
||||||
required=False,
|
required=False,
|
||||||
help=("A JSON string containing additional keyword arguments to "
|
help=(
|
||||||
"pass to Tensorizer's `TensorDeserializer` during "
|
"A JSON string containing additional keyword arguments to "
|
||||||
"deserialization."))
|
"pass to Tensorizer's `TensorDeserializer` during "
|
||||||
|
"deserialization."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
TensorizerArgs.add_cli_args(deserialize_parser)
|
TensorizerArgs.add_cli_args(deserialize_parser)
|
||||||
|
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
|
|
||||||
cfg: TensorizerConfig):
|
def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
|
||||||
for k, v in extra_cfg.items():
|
for k, v in extra_cfg.items():
|
||||||
if hasattr(cfg, k):
|
if hasattr(cfg, k):
|
||||||
setattr(cfg, k, v)
|
setattr(cfg, k, v)
|
||||||
logger.info(
|
logger.info(
|
||||||
"Updating TensorizerConfig with %s from "
|
"Updating TensorizerConfig with %s from "
|
||||||
"--model-loader-extra-config provided", k
|
"--model-loader-extra-config provided",
|
||||||
|
k,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def deserialize(args, tensorizer_config):
|
def deserialize(args, tensorizer_config):
|
||||||
if args.lora_path:
|
if args.lora_path:
|
||||||
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
|
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
|
||||||
llm = LLM(model=args.model,
|
llm = LLM(
|
||||||
load_format="tensorizer",
|
model=args.model,
|
||||||
tensor_parallel_size=args.tensor_parallel_size,
|
load_format="tensorizer",
|
||||||
model_loader_extra_config=tensorizer_config,
|
tensor_parallel_size=args.tensor_parallel_size,
|
||||||
enable_lora=True,
|
model_loader_extra_config=tensorizer_config,
|
||||||
|
enable_lora=True,
|
||||||
)
|
)
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
temperature=0,
|
temperature=0, max_tokens=256, stop=["[/assistant]"]
|
||||||
max_tokens=256,
|
|
||||||
stop=["[/assistant]"]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Truncating this as the extra text isn't necessary
|
# Truncating this as the extra text isn't necessary
|
||||||
prompts = [
|
prompts = ["[user] Write a SQL query to answer the question based on ..."]
|
||||||
"[user] Write a SQL query to answer the question based on ..."
|
|
||||||
]
|
|
||||||
|
|
||||||
# Test LoRA load
|
# Test LoRA load
|
||||||
print(
|
print(
|
||||||
llm.generate(
|
llm.generate(
|
||||||
prompts,
|
prompts,
|
||||||
sampling_params,
|
sampling_params,
|
||||||
lora_request=LoRARequest("sql-lora",
|
lora_request=LoRARequest(
|
||||||
1,
|
"sql-lora",
|
||||||
args.lora_path,
|
1,
|
||||||
tensorizer_config_dict = tensorizer_config
|
args.lora_path,
|
||||||
.to_serializable())
|
tensorizer_config_dict=tensorizer_config.to_serializable(),
|
||||||
|
),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
llm = LLM(model=args.model,
|
llm = LLM(
|
||||||
load_format="tensorizer",
|
model=args.model,
|
||||||
tensor_parallel_size=args.tensor_parallel_size,
|
load_format="tensorizer",
|
||||||
model_loader_extra_config=tensorizer_config
|
tensor_parallel_size=args.tensor_parallel_size,
|
||||||
|
model_loader_extra_config=tensorizer_config,
|
||||||
)
|
)
|
||||||
return llm
|
return llm
|
||||||
|
|
||||||
@ -285,17 +306,20 @@ def main():
|
|||||||
parser = get_parser()
|
parser = get_parser()
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
|
s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
|
||||||
or os.environ.get("S3_ACCESS_KEY_ID", None))
|
"S3_ACCESS_KEY_ID", None
|
||||||
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
|
)
|
||||||
or os.environ.get("S3_SECRET_ACCESS_KEY", None))
|
s3_secret_access_key = getattr(
|
||||||
s3_endpoint = (getattr(args, 's3_endpoint', None)
|
args, "s3_secret_access_key", None
|
||||||
or os.environ.get("S3_ENDPOINT_URL", None))
|
) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
|
||||||
|
s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
|
||||||
|
"S3_ENDPOINT_URL", None
|
||||||
|
)
|
||||||
|
|
||||||
credentials = {
|
credentials = {
|
||||||
"s3_access_key_id": s3_access_key_id,
|
"s3_access_key_id": s3_access_key_id,
|
||||||
"s3_secret_access_key": s3_secret_access_key,
|
"s3_secret_access_key": s3_secret_access_key,
|
||||||
"s3_endpoint": s3_endpoint
|
"s3_endpoint": s3_endpoint,
|
||||||
}
|
}
|
||||||
|
|
||||||
model_ref = args.model
|
model_ref = args.model
|
||||||
@ -309,25 +333,25 @@ def main():
|
|||||||
if args.model_loader_extra_config:
|
if args.model_loader_extra_config:
|
||||||
extra_config = json.loads(args.model_loader_extra_config)
|
extra_config = json.loads(args.model_loader_extra_config)
|
||||||
|
|
||||||
|
tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
|
||||||
tensorizer_dir = (args.serialized_directory or
|
tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
|
||||||
extra_config.get("tensorizer_dir"))
|
"tensorizer_uri"
|
||||||
tensorizer_uri = (getattr(args, "path_to_tensors", None)
|
)
|
||||||
or extra_config.get("tensorizer_uri"))
|
|
||||||
|
|
||||||
if tensorizer_dir and tensorizer_uri:
|
if tensorizer_dir and tensorizer_uri:
|
||||||
parser.error("--serialized-directory and --path-to-tensors "
|
parser.error(
|
||||||
"cannot both be provided")
|
"--serialized-directory and --path-to-tensors cannot both be provided"
|
||||||
|
)
|
||||||
|
|
||||||
if not tensorizer_dir and not tensorizer_uri:
|
if not tensorizer_dir and not tensorizer_uri:
|
||||||
parser.error("Either --serialized-directory or --path-to-tensors "
|
parser.error(
|
||||||
"must be provided")
|
"Either --serialized-directory or --path-to-tensors must be provided"
|
||||||
|
)
|
||||||
|
|
||||||
if args.command == "serialize":
|
if args.command == "serialize":
|
||||||
engine_args = EngineArgs.from_cli_args(args)
|
engine_args = EngineArgs.from_cli_args(args)
|
||||||
|
|
||||||
input_dir = tensorizer_dir.rstrip('/')
|
input_dir = tensorizer_dir.rstrip("/")
|
||||||
suffix = args.suffix if args.suffix else uuid.uuid4().hex
|
suffix = args.suffix if args.suffix else uuid.uuid4().hex
|
||||||
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
|
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
|
||||||
if engine_args.tensor_parallel_size > 1:
|
if engine_args.tensor_parallel_size > 1:
|
||||||
@ -339,15 +363,14 @@ def main():
|
|||||||
tensorizer_uri=model_path,
|
tensorizer_uri=model_path,
|
||||||
encryption_keyfile=keyfile,
|
encryption_keyfile=keyfile,
|
||||||
serialization_kwargs=args.serialization_kwargs or {},
|
serialization_kwargs=args.serialization_kwargs or {},
|
||||||
**credentials
|
**credentials,
|
||||||
)
|
)
|
||||||
|
|
||||||
if args.lora_path:
|
if args.lora_path:
|
||||||
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
|
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
|
||||||
tensorize_lora_adapter(args.lora_path, tensorizer_config)
|
tensorize_lora_adapter(args.lora_path, tensorizer_config)
|
||||||
|
|
||||||
merge_extra_config_with_tensorizer_config(extra_config,
|
merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
|
||||||
tensorizer_config)
|
|
||||||
tensorize_vllm_model(engine_args, tensorizer_config)
|
tensorize_vllm_model(engine_args, tensorizer_config)
|
||||||
|
|
||||||
elif args.command == "deserialize":
|
elif args.command == "deserialize":
|
||||||
@ -356,11 +379,10 @@ def main():
|
|||||||
tensorizer_dir=args.serialized_directory,
|
tensorizer_dir=args.serialized_directory,
|
||||||
encryption_keyfile=keyfile,
|
encryption_keyfile=keyfile,
|
||||||
deserialization_kwargs=args.deserialization_kwargs or {},
|
deserialization_kwargs=args.deserialization_kwargs or {},
|
||||||
**credentials
|
**credentials,
|
||||||
)
|
)
|
||||||
|
|
||||||
merge_extra_config_with_tensorizer_config(extra_config,
|
merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
|
||||||
tensorizer_config)
|
|
||||||
deserialize(args, tensorizer_config)
|
deserialize(args, tensorizer_config)
|
||||||
else:
|
else:
|
||||||
raise ValueError("Either serialize or deserialize must be specified.")
|
raise ValueError("Either serialize or deserialize must be specified.")
|
||||||
|
|||||||
@ -8,16 +8,11 @@ import torch
|
|||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
|
from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
|
||||||
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.compilation.activation_quant_fusion import (
|
from vllm.compilation.activation_quant_fusion import (
|
||||||
FUSED_OPS,
|
FUSED_OPS,
|
||||||
SILU_MUL_OP,
|
SILU_MUL_OP,
|
||||||
ActivationQuantFusionPass,
|
ActivationQuantFusionPass,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.compilation.fusion import QUANT_OPS
|
from vllm.compilation.fusion import QUANT_OPS
|
||||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||||
|
|||||||
@ -107,10 +107,8 @@ class EPTestSettings:
|
|||||||
# NOTE: You can adjust tp_base locally to fit the model in GPU
|
# NOTE: You can adjust tp_base locally to fit the model in GPU
|
||||||
# The values displayed here are only a rough indicator of the size of the model
|
# The values displayed here are only a rough indicator of the size of the model
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
TEST_MODELS = {
|
TEST_MODELS = {
|
||||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(
|
"deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(trust_remote_code=True),
|
||||||
trust_remote_code=True),
|
|
||||||
"mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4),
|
"mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -192,22 +190,24 @@ def _compare_tp(
|
|||||||
]
|
]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
compare_two_settings(model_name,
|
compare_two_settings(
|
||||||
ep_args,
|
model_name,
|
||||||
tp_args,
|
ep_args,
|
||||||
ep_env,
|
tp_args,
|
||||||
tp_env,
|
ep_env,
|
||||||
method=method,
|
tp_env,
|
||||||
max_wait_seconds=360)
|
method=method,
|
||||||
|
max_wait_seconds=360,
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("model_name", "parallel_setup", "distributed_backend", "runner",
|
("model_name", "parallel_setup", "distributed_backend", "runner", "test_options"),
|
||||||
"test_options"),
|
|
||||||
[
|
[
|
||||||
params for model_name, settings in TEST_MODELS.items()
|
params
|
||||||
|
for model_name, settings in TEST_MODELS.items()
|
||||||
for params in settings.iter_params(model_name)
|
for params in settings.iter_params(model_name)
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@ -220,10 +220,12 @@ def test_ep(
|
|||||||
test_options: EPTestOptions,
|
test_options: EPTestOptions,
|
||||||
num_gpus_available,
|
num_gpus_available,
|
||||||
):
|
):
|
||||||
_compare_tp(model_name,
|
_compare_tp(
|
||||||
parallel_setup,
|
model_name,
|
||||||
distributed_backend,
|
parallel_setup,
|
||||||
runner,
|
distributed_backend,
|
||||||
test_options,
|
runner,
|
||||||
num_gpus_available,
|
test_options,
|
||||||
method="generate")
|
num_gpus_available,
|
||||||
|
method="generate",
|
||||||
|
)
|
||||||
|
|||||||
@ -100,7 +100,6 @@ class PPTestSettings:
|
|||||||
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
|
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
|
||||||
# The values displayed here are only a rough indicator of the size of the model
|
# The values displayed here are only a rough indicator of the size of the model
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
TEXT_GENERATION_MODELS = {
|
TEXT_GENERATION_MODELS = {
|
||||||
# [Decoder-only]
|
# [Decoder-only]
|
||||||
# Uses Llama
|
# Uses Llama
|
||||||
@ -150,7 +149,9 @@ TEXT_GENERATION_MODELS = {
|
|||||||
"adept/persimmon-8b-chat": PPTestSettings.fast(),
|
"adept/persimmon-8b-chat": PPTestSettings.fast(),
|
||||||
"microsoft/phi-2": PPTestSettings.fast(),
|
"microsoft/phi-2": PPTestSettings.fast(),
|
||||||
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
|
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
|
||||||
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501
|
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
|
||||||
|
multi_node_only=True, load_format="dummy"
|
||||||
|
), # noqa: E501
|
||||||
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
|
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
|
||||||
"Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
|
"Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
|
||||||
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
|
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
|
||||||
@ -196,7 +197,6 @@ MULTIMODAL_MODELS = {
|
|||||||
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
|
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
|
||||||
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
|
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
|
||||||
}
|
}
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
# NOTE: You can update this on your local machine to run specific tests
|
# NOTE: You can update this on your local machine to run specific tests
|
||||||
TEST_MODELS = [
|
TEST_MODELS = [
|
||||||
|
|||||||
@ -287,29 +287,15 @@ def test_prefix_cache_default():
|
|||||||
assert not engine_args.enable_prefix_caching
|
assert not engine_args.enable_prefix_caching
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
@pytest.mark.parametrize(
|
||||||
@pytest.mark.parametrize(("arg", "expected", "option"), [
|
("arg", "expected", "option"),
|
||||||
(None, None, "mm-processor-kwargs"),
|
[
|
||||||
("{}", {}, "mm-processor-kwargs"),
|
(None, None, "mm-processor-kwargs"),
|
||||||
(
|
("{}", {}, "mm-processor-kwargs"),
|
||||||
'{"num_crops": 4}',
|
('{"num_crops": 4}', {"num_crops": 4}, "mm-processor-kwargs"),
|
||||||
{
|
('{"foo": {"bar": "baz"}}', {"foo": {"bar": "baz"}}, "mm-processor-kwargs"),
|
||||||
"num_crops": 4
|
],
|
||||||
},
|
)
|
||||||
"mm-processor-kwargs"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
'{"foo": {"bar": "baz"}}',
|
|
||||||
{
|
|
||||||
"foo":
|
|
||||||
{
|
|
||||||
"bar": "baz"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"mm-processor-kwargs"
|
|
||||||
),
|
|
||||||
])
|
|
||||||
# yapf: enable
|
|
||||||
def test_composite_arg_parser(arg, expected, option):
|
def test_composite_arg_parser(arg, expected, option):
|
||||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||||
if arg is None:
|
if arg is None:
|
||||||
@ -321,8 +307,7 @@ def test_composite_arg_parser(arg, expected, option):
|
|||||||
|
|
||||||
def test_human_readable_model_len():
|
def test_human_readable_model_len():
|
||||||
# `exit_on_error` disabled to test invalid values below
|
# `exit_on_error` disabled to test invalid values below
|
||||||
parser = EngineArgs.add_cli_args(
|
parser = EngineArgs.add_cli_args(FlexibleArgumentParser(exit_on_error=False))
|
||||||
FlexibleArgumentParser(exit_on_error=False))
|
|
||||||
|
|
||||||
args = parser.parse_args([])
|
args = parser.parse_args([])
|
||||||
assert args.max_model_len is None
|
assert args.max_model_len is None
|
||||||
|
|||||||
@ -15,6 +15,7 @@ from vllm.assets.video import VideoAsset
|
|||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.entrypoints.chat_utils import (
|
from vllm.entrypoints.chat_utils import (
|
||||||
_try_extract_ast,
|
_try_extract_ast,
|
||||||
|
apply_mistral_chat_template,
|
||||||
load_chat_template,
|
load_chat_template,
|
||||||
parse_chat_messages,
|
parse_chat_messages,
|
||||||
parse_chat_messages_futures,
|
parse_chat_messages_futures,
|
||||||
@ -1855,17 +1856,17 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
|
|||||||
|
|
||||||
# NOTE: Qwen2-Audio default chat template is specially defined inside
|
# NOTE: Qwen2-Audio default chat template is specially defined inside
|
||||||
# processor class instead of using `tokenizer_config.json`
|
# processor class instead of using `tokenizer_config.json`
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("model", "expected_format"),
|
("model", "expected_format"),
|
||||||
[(PHI3V_MODEL_ID, "string"),
|
[
|
||||||
(QWEN2VL_MODEL_ID, "openai"),
|
(PHI3V_MODEL_ID, "string"),
|
||||||
(QWEN25VL_MODEL_ID, "openai"),
|
(QWEN2VL_MODEL_ID, "openai"),
|
||||||
(ULTRAVOX_MODEL_ID, "string"),
|
(QWEN25VL_MODEL_ID, "openai"),
|
||||||
(QWEN2AUDIO_MODEL_ID, "openai"),
|
(ULTRAVOX_MODEL_ID, "string"),
|
||||||
(LLAMA_GUARD_MODEL_ID, "openai")],
|
(QWEN2AUDIO_MODEL_ID, "openai"),
|
||||||
|
(LLAMA_GUARD_MODEL_ID, "openai"),
|
||||||
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_resolve_content_format_hf_defined(model, expected_format):
|
def test_resolve_content_format_hf_defined(model, expected_format):
|
||||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||||
model_info.check_available_online(on_fail="skip")
|
model_info.check_available_online(on_fail="skip")
|
||||||
@ -1879,7 +1880,8 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
|||||||
hf_overrides=model_info.hf_overrides,
|
hf_overrides=model_info.hf_overrides,
|
||||||
skip_tokenizer_init=model_info.skip_tokenizer_init,
|
skip_tokenizer_init=model_info.skip_tokenizer_init,
|
||||||
enforce_eager=model_info.enforce_eager,
|
enforce_eager=model_info.enforce_eager,
|
||||||
dtype=model_info.dtype)
|
dtype=model_info.dtype,
|
||||||
|
)
|
||||||
|
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
model,
|
model,
|
||||||
@ -1911,18 +1913,18 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
|||||||
assert resolved_format == expected_format
|
assert resolved_format == expected_format
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("model", "expected_format"),
|
("model", "expected_format"),
|
||||||
[("Salesforce/blip2-opt-2.7b", "string"),
|
[
|
||||||
("facebook/chameleon-7b", "string"),
|
("Salesforce/blip2-opt-2.7b", "string"),
|
||||||
("deepseek-ai/deepseek-vl2-tiny", "string"),
|
("facebook/chameleon-7b", "string"),
|
||||||
("adept/fuyu-8b", "string"),
|
("deepseek-ai/deepseek-vl2-tiny", "string"),
|
||||||
("google/paligemma-3b-mix-224", "string"),
|
("adept/fuyu-8b", "string"),
|
||||||
("Qwen/Qwen-VL", "string"),
|
("google/paligemma-3b-mix-224", "string"),
|
||||||
("Qwen/Qwen-VL-Chat", "string")],
|
("Qwen/Qwen-VL", "string"),
|
||||||
|
("Qwen/Qwen-VL-Chat", "string"),
|
||||||
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_resolve_content_format_fallbacks(model, expected_format):
|
def test_resolve_content_format_fallbacks(model, expected_format):
|
||||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||||
model_info.check_available_online(on_fail="skip")
|
model_info.check_available_online(on_fail="skip")
|
||||||
@ -1936,7 +1938,8 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
|||||||
hf_overrides=model_info.hf_overrides,
|
hf_overrides=model_info.hf_overrides,
|
||||||
skip_tokenizer_init=model_info.skip_tokenizer_init,
|
skip_tokenizer_init=model_info.skip_tokenizer_init,
|
||||||
enforce_eager=model_info.enforce_eager,
|
enforce_eager=model_info.enforce_eager,
|
||||||
dtype=model_info.dtype)
|
dtype=model_info.dtype,
|
||||||
|
)
|
||||||
|
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
model_config.tokenizer,
|
model_config.tokenizer,
|
||||||
@ -1968,30 +1971,30 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
|||||||
assert resolved_format == expected_format
|
assert resolved_format == expected_format
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("template_path", "expected_format"),
|
("template_path", "expected_format"),
|
||||||
[("template_alpaca.jinja", "string"),
|
[
|
||||||
("template_baichuan.jinja", "string"),
|
("template_alpaca.jinja", "string"),
|
||||||
("template_chatglm.jinja", "string"),
|
("template_baichuan.jinja", "string"),
|
||||||
("template_chatglm2.jinja", "string"),
|
("template_chatglm.jinja", "string"),
|
||||||
("template_chatml.jinja", "string"),
|
("template_chatglm2.jinja", "string"),
|
||||||
("template_dse_qwen2_vl.jinja", "openai"),
|
("template_chatml.jinja", "string"),
|
||||||
("template_falcon_180b.jinja", "string"),
|
("template_dse_qwen2_vl.jinja", "openai"),
|
||||||
("template_falcon.jinja", "string"),
|
("template_falcon_180b.jinja", "string"),
|
||||||
("template_inkbot.jinja", "string"),
|
("template_falcon.jinja", "string"),
|
||||||
("template_teleflm.jinja", "string"),
|
("template_inkbot.jinja", "string"),
|
||||||
("template_vlm2vec_phi3v.jinja", "openai"),
|
("template_teleflm.jinja", "string"),
|
||||||
("template_vlm2vec_qwen2vl.jinja", "openai"),
|
("template_vlm2vec_phi3v.jinja", "openai"),
|
||||||
("tool_chat_template_granite_20b_fc.jinja", "string"),
|
("template_vlm2vec_qwen2vl.jinja", "openai"),
|
||||||
("tool_chat_template_hermes.jinja", "string"),
|
("tool_chat_template_granite_20b_fc.jinja", "string"),
|
||||||
("tool_chat_template_internlm2_tool.jinja", "string"),
|
("tool_chat_template_hermes.jinja", "string"),
|
||||||
("tool_chat_template_llama3.1_json.jinja", "openai"),
|
("tool_chat_template_internlm2_tool.jinja", "string"),
|
||||||
("tool_chat_template_llama3.2_json.jinja", "openai"),
|
("tool_chat_template_llama3.1_json.jinja", "openai"),
|
||||||
("tool_chat_template_mistral_parallel.jinja", "string"),
|
("tool_chat_template_llama3.2_json.jinja", "openai"),
|
||||||
("tool_chat_template_mistral.jinja", "string")],
|
("tool_chat_template_mistral_parallel.jinja", "string"),
|
||||||
|
("tool_chat_template_mistral.jinja", "string"),
|
||||||
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_resolve_content_format_examples(template_path, expected_format):
|
def test_resolve_content_format_examples(template_path, expected_format):
|
||||||
model_config = ModelConfig(
|
model_config = ModelConfig(
|
||||||
PHI3V_MODEL_ID, # Dummy
|
PHI3V_MODEL_ID, # Dummy
|
||||||
@ -2024,40 +2027,34 @@ def test_resolve_content_format_examples(template_path, expected_format):
|
|||||||
assert resolved_format == expected_format
|
assert resolved_format == expected_format
|
||||||
|
|
||||||
|
|
||||||
def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
|
def test_parse_chat_messages_include_thinking_chunk(
|
||||||
mistral_tokenizer):
|
mistral_model_config, mistral_tokenizer
|
||||||
messages = [{
|
):
|
||||||
"role":
|
messages = [
|
||||||
"system",
|
{
|
||||||
"content": [{
|
"role": "system",
|
||||||
"type": "text",
|
"content": [
|
||||||
"text": "You are a helpful assistant."
|
{"type": "text", "text": "You are a helpful assistant."},
|
||||||
}, {
|
{
|
||||||
"type":
|
"type": "thinking",
|
||||||
"thinking",
|
"closed": True,
|
||||||
"closed":
|
"thinking": "Only return the answer when you are confident.",
|
||||||
True,
|
},
|
||||||
"thinking":
|
],
|
||||||
"Only return the answer when you are confident."
|
},
|
||||||
}]
|
{"role": "user", "content": "What is 2+2?"},
|
||||||
}, {
|
{
|
||||||
"role": "user",
|
"role": "assistant",
|
||||||
"content": "What is 2+2?"
|
"content": [
|
||||||
}, {
|
{"type": "text", "text": "Let me think about it."},
|
||||||
"role":
|
{"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
|
||||||
"assistant",
|
{
|
||||||
"content": [{
|
"type": "text",
|
||||||
"type": "text",
|
"text": "The answer is 4.",
|
||||||
"text": "Let me think about it."
|
},
|
||||||
}, {
|
],
|
||||||
"type": "thinking",
|
},
|
||||||
"closed": True,
|
]
|
||||||
"thinking": "2+2 = 4"
|
|
||||||
}, {
|
|
||||||
"type": "text",
|
|
||||||
"text": "The answer is 4.",
|
|
||||||
}],
|
|
||||||
}]
|
|
||||||
|
|
||||||
conversation_with_thinking, _, _ = parse_chat_messages(
|
conversation_with_thinking, _, _ = parse_chat_messages(
|
||||||
messages,
|
messages,
|
||||||
@ -2066,122 +2063,105 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
|
|||||||
content_format="openai",
|
content_format="openai",
|
||||||
)
|
)
|
||||||
|
|
||||||
expected_conversation = [{
|
expected_conversation = [
|
||||||
"role":
|
{
|
||||||
"system",
|
"role": "system",
|
||||||
"content": [{
|
"content": [
|
||||||
"type": "text",
|
{"type": "text", "text": "You are a helpful assistant."},
|
||||||
"text": "You are a helpful assistant."
|
{
|
||||||
}, {
|
"type": "text",
|
||||||
"type": "text",
|
"text": "Only return the answer when you are confident.",
|
||||||
"text": "Only return the answer when you are confident."
|
},
|
||||||
}],
|
],
|
||||||
}, {
|
},
|
||||||
"role":
|
{
|
||||||
"user",
|
"role": "user",
|
||||||
"content": [{
|
"content": [{"type": "text", "text": "What is 2+2?"}],
|
||||||
"type": "text",
|
},
|
||||||
"text": "What is 2+2?"
|
{
|
||||||
}],
|
"role": "assistant",
|
||||||
}, {
|
"content": [
|
||||||
"role":
|
{"type": "text", "text": "Let me think about it."},
|
||||||
"assistant",
|
{"type": "text", "text": "2+2 = 4"},
|
||||||
"content": [
|
{"type": "text", "text": "The answer is 4."},
|
||||||
{
|
],
|
||||||
"type": "text",
|
},
|
||||||
"text": "Let me think about it."
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "text",
|
|
||||||
"text": "2+2 = 4"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "text",
|
|
||||||
"text": "The answer is 4."
|
|
||||||
},
|
|
||||||
]
|
|
||||||
}]
|
|
||||||
|
|
||||||
assert conversation_with_thinking == expected_conversation
|
assert conversation_with_thinking == expected_conversation
|
||||||
|
|
||||||
|
|
||||||
def test_apply_mistral_chat_template_thinking_chunk():
|
def test_apply_mistral_chat_template_thinking_chunk():
|
||||||
# Moved import here to avoid yapf and isort conflicts
|
messages = [
|
||||||
from vllm.entrypoints.chat_utils import apply_mistral_chat_template
|
{
|
||||||
messages = [{
|
"role": "system",
|
||||||
"role":
|
"content": [
|
||||||
"system",
|
{"type": "text", "text": "You are a helpful assistant."},
|
||||||
"content": [{
|
{
|
||||||
"type": "text",
|
"type": "thinking",
|
||||||
"text": "You are a helpful assistant."
|
"closed": True,
|
||||||
}, {
|
"thinking": "Only return the answer when you are confident.",
|
||||||
"type":
|
},
|
||||||
"thinking",
|
],
|
||||||
"closed":
|
},
|
||||||
True,
|
{"role": "user", "content": "What is 2+2?"},
|
||||||
"thinking":
|
{
|
||||||
"Only return the answer when you are confident."
|
"role": "assistant",
|
||||||
}]
|
"content": [
|
||||||
}, {
|
{"type": "text", "text": "Let me think about it."},
|
||||||
"role": "user",
|
{"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
|
||||||
"content": "What is 2+2?"
|
{
|
||||||
}, {
|
"type": "text",
|
||||||
"role":
|
"text": "The answer is 4.",
|
||||||
"assistant",
|
},
|
||||||
"content": [{
|
],
|
||||||
"type": "text",
|
},
|
||||||
"text": "Let me think about it."
|
{"role": "user", "content": "Thanks, what is 3+3?"},
|
||||||
}, {
|
]
|
||||||
"type": "thinking",
|
|
||||||
"closed": True,
|
|
||||||
"thinking": "2+2 = 4"
|
|
||||||
}, {
|
|
||||||
"type": "text",
|
|
||||||
"text": "The answer is 4.",
|
|
||||||
}],
|
|
||||||
}, {
|
|
||||||
"role": "user",
|
|
||||||
"content": "Thanks, what is 3+3?"
|
|
||||||
}]
|
|
||||||
|
|
||||||
# TODO(Julien): upon model release change to a tokenizer already configured.
|
# TODO(Julien): upon model release change to a tokenizer already configured.
|
||||||
# =================================================================
|
# =================================================================
|
||||||
mistral_tokenizer = MistralTokenizer.from_pretrained(
|
mistral_tokenizer = MistralTokenizer.from_pretrained(
|
||||||
"mistralai/Devstral-Small-2507")
|
"mistralai/Devstral-Small-2507"
|
||||||
|
)
|
||||||
assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer)
|
assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer)
|
||||||
# Add think special tokens to the tokenizer
|
# Add think special tokens to the tokenizer
|
||||||
mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo(
|
mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo(
|
||||||
rank=35, is_control=True, token_str=SpecialTokens.begin_think.value)
|
rank=35, is_control=True, token_str=SpecialTokens.begin_think.value
|
||||||
|
)
|
||||||
mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo(
|
mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo(
|
||||||
rank=36, is_control=True, token_str=SpecialTokens.end_think.value)
|
rank=36, is_control=True, token_str=SpecialTokens.end_think.value
|
||||||
|
)
|
||||||
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = {
|
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = {
|
||||||
k: v
|
k: v
|
||||||
for k, v in
|
for k, v in mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
|
||||||
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
|
|
||||||
if v not in {35, 36}
|
if v not in {35, 36}
|
||||||
}
|
}
|
||||||
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
|
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
|
||||||
SpecialTokens.begin_think.value] = 35
|
SpecialTokens.begin_think.value
|
||||||
|
] = 35
|
||||||
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
|
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
|
||||||
SpecialTokens.end_think.value] = 36
|
SpecialTokens.end_think.value
|
||||||
|
] = 36
|
||||||
mistral_tokenizer.instruct.BEGIN_THINK = 35
|
mistral_tokenizer.instruct.BEGIN_THINK = 35
|
||||||
mistral_tokenizer.instruct.END_THINK = 36
|
mistral_tokenizer.instruct.END_THINK = 36
|
||||||
# =================================================================
|
# =================================================================
|
||||||
|
|
||||||
tokens_ids = apply_mistral_chat_template(mistral_tokenizer,
|
tokens_ids = apply_mistral_chat_template(
|
||||||
messages,
|
mistral_tokenizer, messages, chat_template=None, tools=None
|
||||||
chat_template=None,
|
)
|
||||||
tools=None)
|
|
||||||
|
|
||||||
string_tokens = mistral_tokenizer.mistral.decode(
|
string_tokens = mistral_tokenizer.mistral.decode(
|
||||||
tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP)
|
tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP
|
||||||
|
)
|
||||||
|
|
||||||
expected_tokens = (
|
expected_tokens = (
|
||||||
r"<s>[SYSTEM_PROMPT]You are a helpful assistant.[THINK]Only return the"
|
r"<s>[SYSTEM_PROMPT]You are a helpful assistant.[THINK]Only return the"
|
||||||
r" answer when you are confident.[/THINK][/SYSTEM_PROMPT]"
|
r" answer when you are confident.[/THINK][/SYSTEM_PROMPT]"
|
||||||
r"[INST]What is 2+2?[/INST]"
|
r"[INST]What is 2+2?[/INST]"
|
||||||
r"Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4.</s>"
|
r"Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4.</s>"
|
||||||
r"[INST]Thanks, what is 3+3?[/INST]")
|
r"[INST]Thanks, what is 3+3?[/INST]"
|
||||||
|
)
|
||||||
|
|
||||||
assert string_tokens == expected_tokens
|
assert string_tokens == expected_tokens
|
||||||
|
|
||||||
@ -2192,37 +2172,32 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
|
|||||||
):
|
):
|
||||||
audio_uuid = "abcd"
|
audio_uuid = "abcd"
|
||||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||||
[{
|
[
|
||||||
"role":
|
{
|
||||||
"user",
|
"role": "user",
|
||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "input_audio",
|
"type": "input_audio",
|
||||||
"input_audio": {},
|
"input_audio": {},
|
||||||
"uuid": audio_uuid,
|
"uuid": audio_uuid,
|
||||||
},
|
},
|
||||||
{
|
{"type": "text", "text": "What does the audio say?"},
|
||||||
"type": "text",
|
],
|
||||||
"text": "What does the audio say?"
|
}
|
||||||
},
|
],
|
||||||
],
|
|
||||||
}],
|
|
||||||
qwen2_audio_model_config,
|
qwen2_audio_model_config,
|
||||||
qwen2_audio_tokenizer,
|
qwen2_audio_tokenizer,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
assert conversation == [{
|
assert conversation == [
|
||||||
"role":
|
{
|
||||||
"user",
|
"role": "user",
|
||||||
"content":
|
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
|
||||||
"Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
|
}
|
||||||
}]
|
]
|
||||||
_assert_mm_data_inputs(mm_data, {"audio": 1})
|
_assert_mm_data_inputs(mm_data, {"audio": 1})
|
||||||
_assert_mm_uuids(mm_uuids,
|
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
|
||||||
1,
|
|
||||||
modality="audio",
|
|
||||||
expected_uuids=[audio_uuid])
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@ -2232,34 +2207,29 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
|
|||||||
):
|
):
|
||||||
audio_uuid = "abcd"
|
audio_uuid = "abcd"
|
||||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||||
[{
|
[
|
||||||
"role":
|
{
|
||||||
"user",
|
"role": "user",
|
||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "input_audio",
|
"type": "input_audio",
|
||||||
"input_audio": {},
|
"input_audio": {},
|
||||||
"uuid": audio_uuid,
|
"uuid": audio_uuid,
|
||||||
},
|
},
|
||||||
{
|
{"type": "text", "text": "What does the audio say?"},
|
||||||
"type": "text",
|
],
|
||||||
"text": "What does the audio say?"
|
}
|
||||||
},
|
],
|
||||||
],
|
|
||||||
}],
|
|
||||||
qwen2_audio_model_config,
|
qwen2_audio_model_config,
|
||||||
qwen2_audio_tokenizer,
|
qwen2_audio_tokenizer,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
assert conversation == [{
|
assert conversation == [
|
||||||
"role":
|
{
|
||||||
"user",
|
"role": "user",
|
||||||
"content":
|
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
|
||||||
"Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
|
}
|
||||||
}]
|
]
|
||||||
_assert_mm_data_inputs(await mm_future, {"audio": 1})
|
_assert_mm_data_inputs(await mm_future, {"audio": 1})
|
||||||
_assert_mm_uuids(mm_uuids,
|
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
|
||||||
1,
|
|
||||||
modality="audio",
|
|
||||||
expected_uuids=[audio_uuid])
|
|
||||||
|
|||||||
@ -12,9 +12,6 @@ import torch
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
|
||||||
from vllm.config.lora import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.lora.layers import (
|
from vllm.lora.layers import (
|
||||||
BaseLayerWithLoRA,
|
BaseLayerWithLoRA,
|
||||||
ColumnParallelLinearWithLoRA,
|
ColumnParallelLinearWithLoRA,
|
||||||
@ -32,8 +29,6 @@ from vllm.lora.layers import (
|
|||||||
RowParallelLinearWithShardedLoRA,
|
RowParallelLinearWithShardedLoRA,
|
||||||
VocabParallelEmbeddingWithLoRA,
|
VocabParallelEmbeddingWithLoRA,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
|
from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
|
||||||
from vllm.lora.punica_wrapper import get_punica_wrapper
|
from vllm.lora.punica_wrapper import get_punica_wrapper
|
||||||
from vllm.model_executor.layers.linear import (
|
from vllm.model_executor.layers.linear import (
|
||||||
|
|||||||
@ -17,8 +17,6 @@ import vllm.model_executor.model_loader.tensorizer
|
|||||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.model_loader.tensorizer import (
|
from vllm.model_executor.model_loader.tensorizer import (
|
||||||
TensorizerConfig,
|
TensorizerConfig,
|
||||||
TensorSerializer,
|
TensorSerializer,
|
||||||
@ -29,8 +27,6 @@ from vllm.model_executor.model_loader.tensorizer import (
|
|||||||
from vllm.model_executor.model_loader.tensorizer_loader import (
|
from vllm.model_executor.model_loader.tensorizer_loader import (
|
||||||
BLACKLISTED_TENSORIZER_ARGS,
|
BLACKLISTED_TENSORIZER_ARGS,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.utils import PlaceholderModule
|
from vllm.utils import PlaceholderModule
|
||||||
|
|
||||||
from .conftest import DummyExecutor, assert_from_collective_rpc
|
from .conftest import DummyExecutor, assert_from_collective_rpc
|
||||||
|
|||||||
@ -45,18 +45,17 @@ from .vlm_utils.types import (
|
|||||||
if current_platform.is_rocm():
|
if current_platform.is_rocm():
|
||||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
COMMON_BROADCAST_SETTINGS = {
|
COMMON_BROADCAST_SETTINGS = {
|
||||||
"test_type": VLMTestType.IMAGE,
|
"test_type": VLMTestType.IMAGE,
|
||||||
"dtype": "half",
|
"dtype": "half",
|
||||||
"max_tokens": 5,
|
"max_tokens": 5,
|
||||||
"tensor_parallel_size": 2,
|
"tensor_parallel_size": 2,
|
||||||
"hf_model_kwargs": {"device_map": "auto"},
|
"hf_model_kwargs": {"device_map": "auto"},
|
||||||
"image_size_factors": [(.25, 0.5, 1.0)],
|
"image_size_factors": [(0.25, 0.5, 1.0)],
|
||||||
"distributed_executor_backend": (
|
"distributed_executor_backend": (
|
||||||
"ray",
|
"ray",
|
||||||
"mp",
|
"mp",
|
||||||
)
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
### Test configuration for specific models
|
### Test configuration for specific models
|
||||||
@ -96,22 +95,20 @@ VLM_TEST_SETTINGS = {
|
|||||||
#### Core tests to always run in the CI
|
#### Core tests to always run in the CI
|
||||||
"llava": VLMTestInfo(
|
"llava": VLMTestInfo(
|
||||||
models=["llava-hf/llava-1.5-7b-hf"],
|
models=["llava-hf/llava-1.5-7b-hf"],
|
||||||
test_type=(
|
test_type=(VLMTestType.EMBEDDING, VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
|
||||||
VLMTestType.EMBEDDING,
|
|
||||||
VLMTestType.IMAGE,
|
|
||||||
VLMTestType.CUSTOM_INPUTS
|
|
||||||
),
|
|
||||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||||
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
|
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
custom_test_opts=[CustomTestOptions(
|
custom_test_opts=[
|
||||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
CustomTestOptions(
|
||||||
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
|
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||||
),
|
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
|
||||||
limit_mm_per_prompt={"image": 4},
|
),
|
||||||
)],
|
limit_mm_per_prompt={"image": 4},
|
||||||
|
)
|
||||||
|
],
|
||||||
# TODO: Revert to "auto" when CPU backend can use torch > 2.6
|
# TODO: Revert to "auto" when CPU backend can use torch > 2.6
|
||||||
dtype="bfloat16" if current_platform.is_cpu() else "auto",
|
dtype="bfloat16" if current_platform.is_cpu() else "auto",
|
||||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||||
@ -120,27 +117,27 @@ VLM_TEST_SETTINGS = {
|
|||||||
models=["google/paligemma-3b-mix-224"],
|
models=["google/paligemma-3b-mix-224"],
|
||||||
test_type=VLMTestType.IMAGE,
|
test_type=VLMTestType.IMAGE,
|
||||||
prompt_formatter=identity,
|
prompt_formatter=identity,
|
||||||
img_idx_to_prompt = lambda idx: "",
|
img_idx_to_prompt=lambda idx: "",
|
||||||
# Paligemma uses its own sample prompts because the default one fails
|
# Paligemma uses its own sample prompts because the default one fails
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
"stop_sign": "caption es",
|
{
|
||||||
"cherry_blossom": "What is in the picture?",
|
"stop_sign": "caption es",
|
||||||
}),
|
"cherry_blossom": "What is in the picture?",
|
||||||
|
}
|
||||||
|
),
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501
|
marks=[
|
||||||
|
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
|
||||||
|
], # noqa: E501
|
||||||
),
|
),
|
||||||
"qwen2_5_vl": VLMTestInfo(
|
"qwen2_5_vl": VLMTestInfo(
|
||||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||||
test_type=(
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||||
VLMTestType.IMAGE,
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
VLMTestType.MULTI_IMAGE,
|
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||||
VLMTestType.VIDEO
|
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||||
),
|
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
|
||||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
|
||||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
@ -150,17 +147,13 @@ VLM_TEST_SETTINGS = {
|
|||||||
),
|
),
|
||||||
"qwen2_5_omni": VLMTestInfo(
|
"qwen2_5_omni": VLMTestInfo(
|
||||||
models=["Qwen/Qwen2.5-Omni-3B"],
|
models=["Qwen/Qwen2.5-Omni-3B"],
|
||||||
test_type=(
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||||
VLMTestType.IMAGE,
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
VLMTestType.MULTI_IMAGE,
|
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
|
||||||
VLMTestType.VIDEO
|
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
|
||||||
),
|
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
|
||||||
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
|
|
||||||
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
|
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
num_logprobs= 6 if current_platform.is_cpu() else 5,
|
num_logprobs=6 if current_platform.is_cpu() else 5,
|
||||||
auto_cls=AutoModelForTextToWaveform,
|
auto_cls=AutoModelForTextToWaveform,
|
||||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||||
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
|
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
|
||||||
@ -168,9 +161,9 @@ VLM_TEST_SETTINGS = {
|
|||||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||||
),
|
),
|
||||||
"ultravox": VLMTestInfo(
|
"ultravox": VLMTestInfo(
|
||||||
models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
|
models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
|
||||||
test_type=VLMTestType.AUDIO,
|
test_type=VLMTestType.AUDIO,
|
||||||
prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||||
audio_idx_to_prompt=lambda idx: "<|audio|>",
|
audio_idx_to_prompt=lambda idx: "<|audio|>",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -184,9 +177,11 @@ VLM_TEST_SETTINGS = {
|
|||||||
"llava-onevision-transformers": VLMTestInfo(
|
"llava-onevision-transformers": VLMTestInfo(
|
||||||
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
||||||
test_type=VLMTestType.IMAGE,
|
test_type=VLMTestType.IMAGE,
|
||||||
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
max_model_len=16384,
|
max_model_len=16384,
|
||||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
|
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||||
|
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||||
|
), # noqa: E501
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||||
@ -201,7 +196,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
"idefics3-transformers": VLMTestInfo(
|
"idefics3-transformers": VLMTestInfo(
|
||||||
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
|
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<image>",
|
img_idx_to_prompt=lambda idx: "<image>",
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -217,8 +212,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
"qwen2_5_vl-transformers": VLMTestInfo(
|
"qwen2_5_vl-transformers": VLMTestInfo(
|
||||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||||
test_type=VLMTestType.IMAGE,
|
test_type=VLMTestType.IMAGE,
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
@ -228,23 +223,24 @@ VLM_TEST_SETTINGS = {
|
|||||||
"model_impl": "transformers",
|
"model_impl": "transformers",
|
||||||
},
|
},
|
||||||
# FIXME: Investigate mrope issue
|
# FIXME: Investigate mrope issue
|
||||||
marks=[large_gpu_mark(min_gb=32),
|
marks=[large_gpu_mark(min_gb=32), pytest.mark.skip(reason="Mrope issue")],
|
||||||
pytest.mark.skip(reason="Mrope issue")],
|
|
||||||
),
|
),
|
||||||
#### Extended model tests
|
#### Extended model tests
|
||||||
"aria": VLMTestInfo(
|
"aria": VLMTestInfo(
|
||||||
models=["rhymes-ai/Aria"],
|
models=["rhymes-ai/Aria"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
|
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
"stop_sign": "<vlm_image>Please describe the image shortly.",
|
{
|
||||||
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
|
"stop_sign": "<vlm_image>Please describe the image shortly.",
|
||||||
}),
|
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
|
||||||
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
|
}
|
||||||
|
),
|
||||||
|
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
|
||||||
stop_str=["<|im_end|>"],
|
stop_str=["<|im_end|>"],
|
||||||
image_size_factors=[(0.10, 0.15)],
|
image_size_factors=[(0.10, 0.15)],
|
||||||
max_tokens=64,
|
max_tokens=64,
|
||||||
@ -253,11 +249,13 @@ VLM_TEST_SETTINGS = {
|
|||||||
"aya_vision": VLMTestInfo(
|
"aya_vision": VLMTestInfo(
|
||||||
models=["CohereForAI/aya-vision-8b"],
|
models=["CohereForAI/aya-vision-8b"],
|
||||||
test_type=(VLMTestType.IMAGE),
|
test_type=(VLMTestType.IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
{
|
||||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||||
}),
|
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||||
|
}
|
||||||
|
),
|
||||||
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -267,11 +265,13 @@ VLM_TEST_SETTINGS = {
|
|||||||
"aya_vision-multi_image": VLMTestInfo(
|
"aya_vision-multi_image": VLMTestInfo(
|
||||||
models=["CohereForAI/aya-vision-8b"],
|
models=["CohereForAI/aya-vision-8b"],
|
||||||
test_type=(VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
{
|
||||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||||
}),
|
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||||
|
}
|
||||||
|
),
|
||||||
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -297,27 +297,29 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
# For chameleon, we only compare the sequences
|
# For chameleon, we only compare the sequences
|
||||||
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
|
||||||
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
hf_output_post_proc=lambda hf_output, model: hf_output[:2],
|
||||||
comparator=check_outputs_equal,
|
comparator=check_outputs_equal,
|
||||||
max_tokens=8,
|
max_tokens=8,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
),
|
),
|
||||||
"deepseek_vl_v2": VLMTestInfo(
|
"deepseek_vl_v2": VLMTestInfo(
|
||||||
models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
|
models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
{
|
||||||
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
|
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||||
}),
|
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
|
||||||
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
|
}
|
||||||
|
),
|
||||||
|
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
|
||||||
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
|
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
|
||||||
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
|
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
|
||||||
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
|
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
|
||||||
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
|
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
|
||||||
),
|
),
|
||||||
"fuyu": VLMTestInfo(
|
"fuyu": VLMTestInfo(
|
||||||
models=["adept/fuyu-8b"],
|
models=["adept/fuyu-8b"],
|
||||||
@ -336,11 +338,13 @@ VLM_TEST_SETTINGS = {
|
|||||||
"gemma3": VLMTestInfo(
|
"gemma3": VLMTestInfo(
|
||||||
models=["google/gemma-3-4b-it"],
|
models=["google/gemma-3-4b-it"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
{
|
||||||
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
|
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
||||||
}),
|
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
|
||||||
|
}
|
||||||
|
),
|
||||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -353,10 +357,12 @@ VLM_TEST_SETTINGS = {
|
|||||||
models=["zai-org/glm-4v-9b"],
|
models=["zai-org/glm-4v-9b"],
|
||||||
test_type=VLMTestType.IMAGE,
|
test_type=VLMTestType.IMAGE,
|
||||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
|
{
|
||||||
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
|
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
|
||||||
}),
|
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
|
||||||
|
}
|
||||||
|
),
|
||||||
max_model_len=2048,
|
max_model_len=2048,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||||
@ -372,8 +378,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
models=["zai-org/GLM-4.1V-9B-Thinking"],
|
models=["zai-org/GLM-4.1V-9B-Thinking"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
|
||||||
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
|
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
|
||||||
max_model_len=2048,
|
max_model_len=2048,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||||
@ -390,23 +396,27 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
|
patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
|
||||||
custom_test_opts=[CustomTestOptions(
|
custom_test_opts=[
|
||||||
inputs=custom_inputs.video_with_metadata_glm4_1v(),
|
CustomTestOptions(
|
||||||
limit_mm_per_prompt={"video": 1},
|
inputs=custom_inputs.video_with_metadata_glm4_1v(),
|
||||||
)],
|
limit_mm_per_prompt={"video": 1},
|
||||||
|
)
|
||||||
|
],
|
||||||
marks=[large_gpu_mark(min_gb=32)],
|
marks=[large_gpu_mark(min_gb=32)],
|
||||||
),
|
),
|
||||||
"h2ovl": VLMTestInfo(
|
"h2ovl": VLMTestInfo(
|
||||||
models = [
|
models=[
|
||||||
"h2oai/h2ovl-mississippi-800m",
|
"h2oai/h2ovl-mississippi-800m",
|
||||||
"h2oai/h2ovl-mississippi-2b",
|
"h2oai/h2ovl-mississippi-2b",
|
||||||
],
|
],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
{
|
||||||
"cherry_blossom": "<image>\nWhat is the season?",
|
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||||
}),
|
"cherry_blossom": "<image>\nWhat is the season?",
|
||||||
|
}
|
||||||
|
),
|
||||||
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
use_tokenizer_eos=True,
|
use_tokenizer_eos=True,
|
||||||
@ -416,7 +426,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
"idefics3": VLMTestInfo(
|
"idefics3": VLMTestInfo(
|
||||||
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
|
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<image>",
|
img_idx_to_prompt=lambda idx: "<image>",
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -431,11 +441,13 @@ VLM_TEST_SETTINGS = {
|
|||||||
# "OpenGVLab/Mono-InternVL-2B",
|
# "OpenGVLab/Mono-InternVL-2B",
|
||||||
],
|
],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
{
|
||||||
"cherry_blossom": "<image>\nWhat is the season?",
|
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||||
}),
|
"cherry_blossom": "<image>\nWhat is the season?",
|
||||||
|
}
|
||||||
|
),
|
||||||
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
use_tokenizer_eos=True,
|
use_tokenizer_eos=True,
|
||||||
@ -446,7 +458,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
"OpenGVLab/InternVL3-1B",
|
"OpenGVLab/InternVL3-1B",
|
||||||
],
|
],
|
||||||
test_type=VLMTestType.VIDEO,
|
test_type=VLMTestType.VIDEO,
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||||
video_idx_to_prompt=lambda idx: "<video>",
|
video_idx_to_prompt=lambda idx: "<video>",
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
use_tokenizer_eos=True,
|
use_tokenizer_eos=True,
|
||||||
@ -459,7 +471,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
VLMTestType.MULTI_IMAGE,
|
VLMTestType.MULTI_IMAGE,
|
||||||
VLMTestType.VIDEO,
|
VLMTestType.VIDEO,
|
||||||
),
|
),
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
|
img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
|
||||||
video_idx_to_prompt=lambda idx: "<video>",
|
video_idx_to_prompt=lambda idx: "<video>",
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
@ -469,7 +481,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
"kimi_vl": VLMTestInfo(
|
"kimi_vl": VLMTestInfo(
|
||||||
models=["moonshotai/Kimi-VL-A3B-Instruct"],
|
models=["moonshotai/Kimi-VL-A3B-Instruct"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
|
||||||
img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501
|
img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -480,11 +492,11 @@ VLM_TEST_SETTINGS = {
|
|||||||
),
|
),
|
||||||
"llama4": VLMTestInfo(
|
"llama4": VLMTestInfo(
|
||||||
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
|
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
|
||||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda _: "<|image|>",
|
img_idx_to_prompt=lambda _: "<|image|>",
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
distributed_executor_backend="mp",
|
distributed_executor_backend="mp",
|
||||||
image_size_factors=[(.25, 0.5, 1.0)],
|
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||||
hf_model_kwargs={"device_map": "auto"},
|
hf_model_kwargs={"device_map": "auto"},
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=4,
|
max_num_seqs=4,
|
||||||
@ -500,28 +512,34 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_model_len=10240,
|
max_model_len=10240,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
custom_test_opts=[CustomTestOptions(
|
custom_test_opts=[
|
||||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
CustomTestOptions(
|
||||||
formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
|
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||||
),
|
formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
|
||||||
limit_mm_per_prompt={"image": 4},
|
),
|
||||||
)],
|
limit_mm_per_prompt={"image": 4},
|
||||||
|
)
|
||||||
|
],
|
||||||
),
|
),
|
||||||
"llava_onevision": VLMTestInfo(
|
"llava_onevision": VLMTestInfo(
|
||||||
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
||||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||||
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
num_video_frames=16,
|
num_video_frames=16,
|
||||||
max_model_len=16384,
|
max_model_len=16384,
|
||||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
|
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||||
|
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||||
|
), # noqa: E501
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||||
custom_test_opts=[CustomTestOptions(
|
custom_test_opts=[
|
||||||
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
|
CustomTestOptions(
|
||||||
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
|
||||||
),
|
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
limit_mm_per_prompt={"video": 4},
|
),
|
||||||
)],
|
limit_mm_per_prompt={"video": 4},
|
||||||
|
)
|
||||||
|
],
|
||||||
),
|
),
|
||||||
"llava_next_video": VLMTestInfo(
|
"llava_next_video": VLMTestInfo(
|
||||||
models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
|
models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
|
||||||
@ -563,7 +581,9 @@ VLM_TEST_SETTINGS = {
|
|||||||
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
|
||||||
|
["<|im_end|>", "<|endoftext|>"]
|
||||||
|
), # noqa: E501
|
||||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||||
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
||||||
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
|
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
|
||||||
@ -576,13 +596,15 @@ VLM_TEST_SETTINGS = {
|
|||||||
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
|
||||||
|
["<|im_end|>", "<|endoftext|>"]
|
||||||
|
), # noqa: E501
|
||||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||||
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
||||||
),
|
),
|
||||||
"minimax_vl_01": VLMTestInfo(
|
"minimax_vl_01": VLMTestInfo(
|
||||||
models=["MiniMaxAI/MiniMax-VL-01"],
|
models=["MiniMaxAI/MiniMax-VL-01"],
|
||||||
prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
|
||||||
img_idx_to_prompt=lambda _: "<image>",
|
img_idx_to_prompt=lambda _: "<image>",
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
@ -604,8 +626,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
"ovis1_6-gemma2": VLMTestInfo(
|
"ovis1_6-gemma2": VLMTestInfo(
|
||||||
models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
|
models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
dtype="half",
|
dtype="half",
|
||||||
@ -617,8 +639,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
"ovis2": VLMTestInfo(
|
"ovis2": VLMTestInfo(
|
||||||
models=["AIDC-AI/Ovis2-1B"],
|
models=["AIDC-AI/Ovis2-1B"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
dtype="half",
|
dtype="half",
|
||||||
@ -628,13 +650,9 @@ VLM_TEST_SETTINGS = {
|
|||||||
),
|
),
|
||||||
"ovis2_5": VLMTestInfo(
|
"ovis2_5": VLMTestInfo(
|
||||||
models=["AIDC-AI/Ovis2.5-2B"],
|
models=["AIDC-AI/Ovis2.5-2B"],
|
||||||
test_type=(
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||||
VLMTestType.IMAGE,
|
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
VLMTestType.MULTI_IMAGE,
|
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||||
VLMTestType.VIDEO
|
|
||||||
),
|
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
|
||||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
|
||||||
video_idx_to_prompt=lambda idx: "<video>\n",
|
video_idx_to_prompt=lambda idx: "<video>\n",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -646,7 +664,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
"phi3v": VLMTestInfo(
|
"phi3v": VLMTestInfo(
|
||||||
models=["microsoft/Phi-3.5-vision-instruct"],
|
models=["microsoft/Phi-3.5-vision-instruct"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
|
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -681,15 +699,11 @@ VLM_TEST_SETTINGS = {
|
|||||||
),
|
),
|
||||||
"qwen2_vl": VLMTestInfo(
|
"qwen2_vl": VLMTestInfo(
|
||||||
models=["Qwen/Qwen2-VL-2B-Instruct"],
|
models=["Qwen/Qwen2-VL-2B-Instruct"],
|
||||||
test_type=(
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||||
VLMTestType.IMAGE,
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
VLMTestType.MULTI_IMAGE,
|
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||||
VLMTestType.VIDEO
|
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||||
),
|
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
|
||||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
|
||||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
|
||||||
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
|
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
@ -700,11 +714,13 @@ VLM_TEST_SETTINGS = {
|
|||||||
"skywork_r1v": VLMTestInfo(
|
"skywork_r1v": VLMTestInfo(
|
||||||
models=["Skywork/Skywork-R1V-38B"],
|
models=["Skywork/Skywork-R1V-38B"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
{
|
||||||
"cherry_blossom": "<image>\nWhat is the season?",
|
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||||
}),
|
"cherry_blossom": "<image>\nWhat is the season?",
|
||||||
|
}
|
||||||
|
),
|
||||||
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501
|
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
use_tokenizer_eos=True,
|
use_tokenizer_eos=True,
|
||||||
@ -737,9 +753,9 @@ VLM_TEST_SETTINGS = {
|
|||||||
VLMTestType.MULTI_IMAGE,
|
VLMTestType.MULTI_IMAGE,
|
||||||
VLMTestType.VIDEO,
|
VLMTestType.VIDEO,
|
||||||
),
|
),
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
@ -752,11 +768,11 @@ VLM_TEST_SETTINGS = {
|
|||||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
|
||||||
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
hf_output_post_proc=lambda hf_output, model: hf_output[:2],
|
||||||
comparator=check_outputs_equal,
|
comparator=check_outputs_equal,
|
||||||
marks=multi_gpu_marks(num_gpus=2),
|
marks=multi_gpu_marks(num_gpus=2),
|
||||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
**COMMON_BROADCAST_SETTINGS, # type: ignore
|
||||||
),
|
),
|
||||||
"llava-broadcast": VLMTestInfo(
|
"llava-broadcast": VLMTestInfo(
|
||||||
models=["llava-hf/llava-1.5-7b-hf"],
|
models=["llava-hf/llava-1.5-7b-hf"],
|
||||||
@ -765,7 +781,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
marks=multi_gpu_marks(num_gpus=2),
|
marks=multi_gpu_marks(num_gpus=2),
|
||||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
**COMMON_BROADCAST_SETTINGS, # type: ignore
|
||||||
),
|
),
|
||||||
"llava_next-broadcast": VLMTestInfo(
|
"llava_next-broadcast": VLMTestInfo(
|
||||||
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||||
@ -774,12 +790,12 @@ VLM_TEST_SETTINGS = {
|
|||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
marks=multi_gpu_marks(num_gpus=2),
|
marks=multi_gpu_marks(num_gpus=2),
|
||||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
**COMMON_BROADCAST_SETTINGS, # type: ignore
|
||||||
),
|
),
|
||||||
### Custom input edge-cases for specific models
|
### Custom input edge-cases for specific models
|
||||||
"intern_vl-diff-patches": VLMTestInfo(
|
"intern_vl-diff-patches": VLMTestInfo(
|
||||||
models=["OpenGVLab/InternVL2-2B"],
|
models=["OpenGVLab/InternVL2-2B"],
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
use_tokenizer_eos=True,
|
use_tokenizer_eos=True,
|
||||||
@ -788,7 +804,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
CustomTestOptions(
|
CustomTestOptions(
|
||||||
inputs=inp,
|
inputs=inp,
|
||||||
limit_mm_per_prompt={"image": 2},
|
limit_mm_per_prompt={"image": 2},
|
||||||
) for inp in custom_inputs.different_patch_input_cases_internvl()
|
)
|
||||||
|
for inp in custom_inputs.different_patch_input_cases_internvl()
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
"llava_onevision-multiple-images": VLMTestInfo(
|
"llava_onevision-multiple-images": VLMTestInfo(
|
||||||
@ -797,14 +814,18 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_model_len=16384,
|
max_model_len=16384,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
|
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||||
|
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||||
|
), # noqa: E501
|
||||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||||
custom_test_opts=[CustomTestOptions(
|
custom_test_opts=[
|
||||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
CustomTestOptions(
|
||||||
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||||
),
|
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
limit_mm_per_prompt={"image": 4},
|
),
|
||||||
)],
|
limit_mm_per_prompt={"image": 4},
|
||||||
|
)
|
||||||
|
],
|
||||||
),
|
),
|
||||||
# regression test for https://github.com/vllm-project/vllm/issues/15122
|
# regression test for https://github.com/vllm-project/vllm/issues/15122
|
||||||
"qwen2_5_vl-windows-attention": VLMTestInfo(
|
"qwen2_5_vl-windows-attention": VLMTestInfo(
|
||||||
@ -814,13 +835,14 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||||
custom_test_opts=[CustomTestOptions(
|
custom_test_opts=[
|
||||||
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
|
CustomTestOptions(
|
||||||
limit_mm_per_prompt={"image": 1},
|
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
|
||||||
)],
|
limit_mm_per_prompt={"image": 1},
|
||||||
|
)
|
||||||
|
],
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
|
|
||||||
def _mark_splits(
|
def _mark_splits(
|
||||||
|
|||||||
@ -114,7 +114,6 @@ def get_parametrized_options(
|
|||||||
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
|
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
|
||||||
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
|
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
# Wrap all model cases in a pytest parameter & pass marks through
|
# Wrap all model cases in a pytest parameter & pass marks through
|
||||||
return [
|
return [
|
||||||
pytest.param(
|
pytest.param(
|
||||||
@ -122,10 +121,10 @@ def get_parametrized_options(
|
|||||||
ExpandableVLMTestArgs(
|
ExpandableVLMTestArgs(
|
||||||
**{k: v for k, v in zip(iter_kwargs.keys(), case)}
|
**{k: v for k, v in zip(iter_kwargs.keys(), case)}
|
||||||
),
|
),
|
||||||
marks=test_info.marks if test_info.marks is not None else []
|
marks=test_info.marks if test_info.marks is not None else [],
|
||||||
) for case in list(itertools.product(*iter_kwargs.values()))
|
)
|
||||||
|
for case in list(itertools.product(*iter_kwargs.values()))
|
||||||
]
|
]
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
# Get a list per model type, where each entry contains a tuple of all of
|
# Get a list per model type, where each entry contains a tuple of all of
|
||||||
# that model type's cases, then flatten them into the top level so that
|
# that model type's cases, then flatten them into the top level so that
|
||||||
|
|||||||
@ -418,7 +418,6 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
|||||||
self.image_size = self.vision_config.image_size
|
self.image_size = self.vision_config.image_size
|
||||||
|
|
||||||
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.models.h2ovl import (
|
from vllm.model_executor.models.h2ovl import (
|
||||||
IMG_CONTEXT,
|
IMG_CONTEXT,
|
||||||
IMG_END,
|
IMG_END,
|
||||||
@ -426,7 +425,6 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
|||||||
image_to_pixel_values_h2ovl,
|
image_to_pixel_values_h2ovl,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
images = [images] if isinstance(images, Image) else images
|
images = [images] if isinstance(images, Image) else images
|
||||||
pixel_values = [
|
pixel_values = [
|
||||||
image_to_pixel_values_h2ovl(
|
image_to_pixel_values_h2ovl(
|
||||||
|
|||||||
@ -33,24 +33,26 @@ TEST_IMG_PLACEHOLDER = "<vlm_image>"
|
|||||||
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
|
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
|
||||||
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
|
TEST_AUDIO_PLACEHOLDER = "<lmm_audio>"
|
||||||
|
|
||||||
# yapf: disable
|
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||||
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
|
{
|
||||||
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
|
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
|
||||||
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
|
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
|
||||||
})
|
}
|
||||||
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts({
|
)
|
||||||
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
|
SINGLE_AUDIO_BASE_PROMPT = AUDIO_ASSETS.prompts(
|
||||||
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
|
{
|
||||||
})
|
"mary_had_lamb": f"{TEST_AUDIO_PLACEHOLDER}Transcribe this audio into English.", # noqa: E501
|
||||||
|
"winning_call": f"{TEST_AUDIO_PLACEHOLDER}What is happening in this audio clip?", # noqa: E501
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
|
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
|
||||||
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
|
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
|
||||||
|
|
||||||
|
|
||||||
IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
|
IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
|
||||||
EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
|
EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
|
||||||
RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
|
RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
|
|
||||||
class PromptWithMultiModalInput(NamedTuple):
|
class PromptWithMultiModalInput(NamedTuple):
|
||||||
|
|||||||
@ -322,80 +322,81 @@ def _test_processing_correctness_one(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
@pytest.mark.parametrize(
|
||||||
@pytest.mark.parametrize("model_id", [
|
"model_id",
|
||||||
"rhymes-ai/Aria",
|
[
|
||||||
"CohereForAI/aya-vision-8b",
|
"rhymes-ai/Aria",
|
||||||
"Salesforce/blip2-opt-2.7b",
|
"CohereForAI/aya-vision-8b",
|
||||||
"facebook/chameleon-7b",
|
"Salesforce/blip2-opt-2.7b",
|
||||||
"CohereLabs/command-a-vision-07-2025",
|
"facebook/chameleon-7b",
|
||||||
"deepseek-ai/deepseek-vl2-tiny",
|
"CohereLabs/command-a-vision-07-2025",
|
||||||
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
|
"deepseek-ai/deepseek-vl2-tiny",
|
||||||
"adept/fuyu-8b",
|
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
|
||||||
"google/gemma-3-4b-it",
|
"adept/fuyu-8b",
|
||||||
"google/gemma-3n-E2B-it",
|
"google/gemma-3-4b-it",
|
||||||
"zai-org/glm-4v-9b",
|
"google/gemma-3n-E2B-it",
|
||||||
"zai-org/GLM-4.1V-9B-Thinking",
|
"zai-org/glm-4v-9b",
|
||||||
"zai-org/GLM-4.5V",
|
"zai-org/GLM-4.1V-9B-Thinking",
|
||||||
"ibm-granite/granite-speech-3.3-2b",
|
"zai-org/GLM-4.5V",
|
||||||
"h2oai/h2ovl-mississippi-800m",
|
"ibm-granite/granite-speech-3.3-2b",
|
||||||
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
|
"h2oai/h2ovl-mississippi-800m",
|
||||||
"HuggingFaceM4/Idefics3-8B-Llama3",
|
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
|
||||||
"internlm/Intern-S1",
|
"HuggingFaceM4/Idefics3-8B-Llama3",
|
||||||
"OpenGVLab/InternVL2-1B",
|
"internlm/Intern-S1",
|
||||||
"OpenGVLab/InternVL3-1B",
|
"OpenGVLab/InternVL2-1B",
|
||||||
"OpenGVLab/InternVL3_5-1B",
|
"OpenGVLab/InternVL3-1B",
|
||||||
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
|
"OpenGVLab/InternVL3_5-1B",
|
||||||
"OpenGVLab/InternVL3_5-30B-A3B",
|
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
|
||||||
"Kwai-Keye/Keye-VL-8B-Preview",
|
"OpenGVLab/InternVL3_5-30B-A3B",
|
||||||
"Kwai-Keye/Keye-VL-1_5-8B",
|
"Kwai-Keye/Keye-VL-8B-Preview",
|
||||||
"moonshotai/Kimi-VL-A3B-Instruct",
|
"Kwai-Keye/Keye-VL-1_5-8B",
|
||||||
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
"moonshotai/Kimi-VL-A3B-Instruct",
|
||||||
"llava-hf/llava-1.5-7b-hf",
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
"llava-hf/llava-1.5-7b-hf",
|
||||||
"llava-hf/LLaVA-NeXT-Video-7B-hf",
|
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
"llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||||
"TIGER-Lab/Mantis-8B-siglip-llama3",
|
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||||
"mispeech/midashenglm-7b",
|
"TIGER-Lab/Mantis-8B-siglip-llama3",
|
||||||
"openbmb/MiniCPM-Llama3-V-2_5",
|
"mispeech/midashenglm-7b",
|
||||||
"openbmb/MiniCPM-o-2_6",
|
"openbmb/MiniCPM-Llama3-V-2_5",
|
||||||
"openbmb/MiniCPM-V-2_6",
|
"openbmb/MiniCPM-o-2_6",
|
||||||
"MiniMaxAI/MiniMax-VL-01",
|
"openbmb/MiniCPM-V-2_6",
|
||||||
"allenai/Molmo-7B-D-0924",
|
"MiniMaxAI/MiniMax-VL-01",
|
||||||
"allenai/Molmo-7B-O-0924",
|
"allenai/Molmo-7B-D-0924",
|
||||||
"nvidia/NVLM-D-72B",
|
"allenai/Molmo-7B-O-0924",
|
||||||
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
|
"nvidia/NVLM-D-72B",
|
||||||
"AIDC-AI/Ovis1.6-Gemma2-9B",
|
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
|
||||||
"AIDC-AI/Ovis1.6-Llama3.2-3B",
|
"AIDC-AI/Ovis1.6-Gemma2-9B",
|
||||||
"AIDC-AI/Ovis2-1B",
|
"AIDC-AI/Ovis1.6-Llama3.2-3B",
|
||||||
"AIDC-AI/Ovis2.5-2B",
|
"AIDC-AI/Ovis2-1B",
|
||||||
"google/paligemma-3b-mix-224",
|
"AIDC-AI/Ovis2.5-2B",
|
||||||
"google/paligemma2-3b-ft-docci-448",
|
"google/paligemma-3b-mix-224",
|
||||||
"microsoft/Phi-3.5-vision-instruct",
|
"google/paligemma2-3b-ft-docci-448",
|
||||||
"microsoft/Phi-4-multimodal-instruct",
|
"microsoft/Phi-3.5-vision-instruct",
|
||||||
"mistralai/Pixtral-12B-2409",
|
"microsoft/Phi-4-multimodal-instruct",
|
||||||
"mistral-community/pixtral-12b",
|
"mistralai/Pixtral-12B-2409",
|
||||||
"Qwen/Qwen-VL-Chat",
|
"mistral-community/pixtral-12b",
|
||||||
"Qwen/Qwen2-VL-2B-Instruct",
|
"Qwen/Qwen-VL-Chat",
|
||||||
"Qwen/Qwen2.5-VL-3B-Instruct",
|
"Qwen/Qwen2-VL-2B-Instruct",
|
||||||
"Qwen/Qwen2-Audio-7B-Instruct",
|
"Qwen/Qwen2.5-VL-3B-Instruct",
|
||||||
"Qwen/Qwen2.5-Omni-3B",
|
"Qwen/Qwen2-Audio-7B-Instruct",
|
||||||
"Qwen/Qwen3-VL-4B-Instruct",
|
"Qwen/Qwen2.5-Omni-3B",
|
||||||
"Qwen/Qwen3-VL-30B-A3B-Instruct",
|
"Qwen/Qwen3-VL-4B-Instruct",
|
||||||
"YannQi/R-4B",
|
"Qwen/Qwen3-VL-30B-A3B-Instruct",
|
||||||
"Skywork/Skywork-R1V-38B",
|
"YannQi/R-4B",
|
||||||
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
"Skywork/Skywork-R1V-38B",
|
||||||
"stepfun-ai/step3",
|
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
||||||
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
|
"stepfun-ai/step3",
|
||||||
"openai/whisper-large-v3",
|
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
|
||||||
"omni-research/Tarsier-7b",
|
"openai/whisper-large-v3",
|
||||||
"omni-research/Tarsier2-Recap-7b",
|
"omni-research/Tarsier-7b",
|
||||||
"mistralai/Voxtral-Mini-3B-2507",
|
"omni-research/Tarsier2-Recap-7b",
|
||||||
])
|
"mistralai/Voxtral-Mini-3B-2507",
|
||||||
|
],
|
||||||
|
)
|
||||||
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
||||||
@pytest.mark.parametrize("num_batches", [32])
|
@pytest.mark.parametrize("num_batches", [32])
|
||||||
@pytest.mark.parametrize("simplify_rate", [1.0])
|
@pytest.mark.parametrize("simplify_rate", [1.0])
|
||||||
# yapf: enable
|
|
||||||
def test_processing_correctness(
|
def test_processing_correctness(
|
||||||
model_id: str,
|
model_id: str,
|
||||||
hit_rate: float,
|
hit_rate: float,
|
||||||
|
|||||||
@ -12,7 +12,6 @@ from ...utils import build_model_context
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
|
@pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("mm_processor_kwargs", "expected_toks_per_img"),
|
("mm_processor_kwargs", "expected_toks_per_img"),
|
||||||
[
|
[
|
||||||
@ -20,7 +19,6 @@ from ...utils import build_model_context
|
|||||||
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
|
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||||
def test_processor_override(
|
def test_processor_override(
|
||||||
|
|||||||
@ -11,7 +11,6 @@ from ...utils import build_model_context
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
|
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("mm_processor_kwargs", "expected_toks_per_img"),
|
("mm_processor_kwargs", "expected_toks_per_img"),
|
||||||
[
|
[
|
||||||
@ -21,7 +20,6 @@ from ...utils import build_model_context
|
|||||||
({}, 757),
|
({}, 757),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||||
def test_processor_override(
|
def test_processor_override(
|
||||||
|
|||||||
@ -11,7 +11,6 @@ from ...utils import build_model_context
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
|
@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("mm_processor_kwargs", "expected_toks_per_img"),
|
("mm_processor_kwargs", "expected_toks_per_img"),
|
||||||
[
|
[
|
||||||
@ -21,7 +20,6 @@ from ...utils import build_model_context
|
|||||||
({}, 9585),
|
({}, 9585),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||||
def test_processor_override(
|
def test_processor_override(
|
||||||
|
|||||||
@ -10,7 +10,6 @@ from ...utils import build_model_context
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
|
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
|
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
|
||||||
[
|
[
|
||||||
@ -18,7 +17,6 @@ from ...utils import build_model_context
|
|||||||
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
|
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||||
def test_processor_override(
|
def test_processor_override(
|
||||||
|
|||||||
@ -12,7 +12,6 @@ from ...utils import build_model_context
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
|
@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("mm_processor_kwargs", "expected_toks_per_img"),
|
("mm_processor_kwargs", "expected_toks_per_img"),
|
||||||
[
|
[
|
||||||
@ -20,7 +19,6 @@ from ...utils import build_model_context
|
|||||||
({"max_image_size": {"longest_edge": 768}}, 405),
|
({"max_image_size": {"longest_edge": 768}}, 405),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||||
def test_processor_override(
|
def test_processor_override(
|
||||||
|
|||||||
@ -7,9 +7,7 @@ from vllm.config import ModelConfig
|
|||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||||
@pytest.mark.parametrize("model_id",
|
|
||||||
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
|
||||||
def test_multimodal_processor(model_id):
|
def test_multimodal_processor(model_id):
|
||||||
model_config = ModelConfig(
|
model_config = ModelConfig(
|
||||||
model=model_id,
|
model=model_id,
|
||||||
@ -18,9 +16,9 @@ def test_multimodal_processor(model_id):
|
|||||||
|
|
||||||
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||||
|
|
||||||
image_pil = ImageAsset('cherry_blossom').pil_image
|
image_pil = ImageAsset("cherry_blossom").pil_image
|
||||||
mm_data = {"image": image_pil}
|
mm_data = {"image": image_pil}
|
||||||
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
|
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||||
str_processed_inputs = mm_processor.apply(
|
str_processed_inputs = mm_processor.apply(
|
||||||
prompt=str_prompt,
|
prompt=str_prompt,
|
||||||
mm_data=mm_data,
|
mm_data=mm_data,
|
||||||
@ -28,8 +26,23 @@ def test_multimodal_processor(model_id):
|
|||||||
)
|
)
|
||||||
|
|
||||||
ids_prompt = [
|
ids_prompt = [
|
||||||
151644, 872, 220, 151646, 198, 3838, 374, 279, 2213, 315, 419, 2168,
|
151644,
|
||||||
30, 151645, 151644, 77091, 198
|
872,
|
||||||
|
220,
|
||||||
|
151646,
|
||||||
|
198,
|
||||||
|
3838,
|
||||||
|
374,
|
||||||
|
279,
|
||||||
|
2213,
|
||||||
|
315,
|
||||||
|
419,
|
||||||
|
2168,
|
||||||
|
30,
|
||||||
|
151645,
|
||||||
|
151644,
|
||||||
|
77091,
|
||||||
|
198,
|
||||||
]
|
]
|
||||||
ids_processed_inputs = mm_processor.apply(
|
ids_processed_inputs = mm_processor.apply(
|
||||||
prompt=ids_prompt,
|
prompt=ids_prompt,
|
||||||
@ -37,5 +50,7 @@ def test_multimodal_processor(model_id):
|
|||||||
hf_processor_mm_kwargs={},
|
hf_processor_mm_kwargs={},
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (str_processed_inputs["prompt_token_ids"]
|
assert (
|
||||||
== ids_processed_inputs["prompt_token_ids"])
|
str_processed_inputs["prompt_token_ids"]
|
||||||
|
== ids_processed_inputs["prompt_token_ids"]
|
||||||
|
)
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -71,25 +71,27 @@ def _dummy_items(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("item", "expected_size"),
|
("item", "expected_size"),
|
||||||
[
|
[
|
||||||
(_dummy_item("a", {"a1": 100}), 100),
|
(_dummy_item("a", {"a1": 100}), 100),
|
||||||
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
|
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
|
||||||
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
|
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
|
||||||
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460), # noqa: E501
|
(
|
||||||
|
_dummy_items(
|
||||||
|
{"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}
|
||||||
|
).get_data(),
|
||||||
|
460,
|
||||||
|
), # noqa: E501
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_cache_item_size(item, expected_size):
|
def test_cache_item_size(item, expected_size):
|
||||||
cache = MultiModalCache.get_lru_cache(2048, type(item))
|
cache = MultiModalCache.get_lru_cache(2048, type(item))
|
||||||
|
|
||||||
cache[""] = item
|
cache[""] = item
|
||||||
assert cache.currsize == expected_size
|
assert cache.currsize == expected_size
|
||||||
|
|
||||||
prompt_update = PromptInsertion("dummy", "target", "insertion") \
|
prompt_update = PromptInsertion("dummy", "target", "insertion").resolve(0)
|
||||||
.resolve(0)
|
|
||||||
|
|
||||||
cache[""] = MultiModalProcessorCacheItem(item, [prompt_update])
|
cache[""] = MultiModalProcessorCacheItem(item, [prompt_update])
|
||||||
assert cache.currsize == expected_size
|
assert cache.currsize == expected_size
|
||||||
@ -106,9 +108,9 @@ def _create_vllm_config(
|
|||||||
return VllmConfig(
|
return VllmConfig(
|
||||||
model_config=ModelConfig(
|
model_config=ModelConfig(
|
||||||
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||||
mm_processor_cache_gb=mm_processor_cache_gb),
|
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||||
parallel_config=ParallelConfig(
|
),
|
||||||
data_parallel_size=1 if enable_ipc else 2),
|
parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -124,11 +126,9 @@ def _compare_caches(
|
|||||||
seed: int = 0,
|
seed: int = 0,
|
||||||
):
|
):
|
||||||
cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY)
|
cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY)
|
||||||
cache_0_p1 = engine_receiver_cache_from_config(config_0,
|
cache_0_p1 = engine_receiver_cache_from_config(config_0, MULTIMODAL_REGISTRY)
|
||||||
MULTIMODAL_REGISTRY)
|
|
||||||
cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY)
|
cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY)
|
||||||
cache_1_p1 = engine_receiver_cache_from_config(config_1,
|
cache_1_p1 = engine_receiver_cache_from_config(config_1, MULTIMODAL_REGISTRY)
|
||||||
MULTIMODAL_REGISTRY)
|
|
||||||
|
|
||||||
cache_size_gb = max(
|
cache_size_gb = max(
|
||||||
config_0.model_config.multimodal_config.mm_processor_cache_gb,
|
config_0.model_config.multimodal_config.mm_processor_cache_gb,
|
||||||
@ -142,8 +142,7 @@ def _compare_caches(
|
|||||||
for _ in range(int(item_capacity / hit_rate))
|
for _ in range(int(item_capacity / hit_rate))
|
||||||
]
|
]
|
||||||
all_hashes = [
|
all_hashes = [
|
||||||
MultiModalHasher.hash_kwargs(item=item.get_data())
|
MultiModalHasher.hash_kwargs(item=item.get_data()) for item in all_items
|
||||||
for item in all_items
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Should not be used since there is nothing to convert to text
|
# Should not be used since there is nothing to convert to text
|
||||||
@ -162,7 +161,8 @@ def _compare_caches(
|
|||||||
for _ in range(is_cached_calls_per_iter):
|
for _ in range(is_cached_calls_per_iter):
|
||||||
cache_0_p0.is_cached(selected_hashes)
|
cache_0_p0.is_cached(selected_hashes)
|
||||||
cache_0_p0_out = [
|
cache_0_p0_out = [
|
||||||
item for item, _ in cache_0_p0.get_and_update(
|
item
|
||||||
|
for item, _ in cache_0_p0.get_and_update(
|
||||||
[(item, prompt_update.content) for item in selected_items],
|
[(item, prompt_update.content) for item in selected_items],
|
||||||
selected_hashes,
|
selected_hashes,
|
||||||
)
|
)
|
||||||
@ -174,7 +174,8 @@ def _compare_caches(
|
|||||||
for _ in range(is_cached_calls_per_iter):
|
for _ in range(is_cached_calls_per_iter):
|
||||||
cache_1_p0.is_cached(selected_hashes)
|
cache_1_p0.is_cached(selected_hashes)
|
||||||
cache_1_p0_out = [
|
cache_1_p0_out = [
|
||||||
item for item, _ in cache_1_p0.get_and_update(
|
item
|
||||||
|
for item, _ in cache_1_p0.get_and_update(
|
||||||
[(item, prompt_update.content) for item in selected_items],
|
[(item, prompt_update.content) for item in selected_items],
|
||||||
selected_hashes,
|
selected_hashes,
|
||||||
)
|
)
|
||||||
@ -183,14 +184,12 @@ def _compare_caches(
|
|||||||
if cache_0_p1 is None:
|
if cache_0_p1 is None:
|
||||||
cache_0_p1_out = cache_0_p0_out
|
cache_0_p1_out = cache_0_p0_out
|
||||||
else:
|
else:
|
||||||
cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out,
|
cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out, selected_hashes)
|
||||||
selected_hashes)
|
|
||||||
|
|
||||||
if cache_1_p1 is None:
|
if cache_1_p1 is None:
|
||||||
cache_1_p1_out = cache_1_p0_out
|
cache_1_p1_out = cache_1_p0_out
|
||||||
else:
|
else:
|
||||||
cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out,
|
cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out, selected_hashes)
|
||||||
selected_hashes)
|
|
||||||
|
|
||||||
assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}"
|
assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}"
|
||||||
|
|
||||||
|
|||||||
@ -9,9 +9,6 @@ import pytest
|
|||||||
|
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.multimodal.processing import (
|
from vllm.multimodal.processing import (
|
||||||
InputProcessingContext,
|
InputProcessingContext,
|
||||||
PlaceholderFeaturesInfo,
|
PlaceholderFeaturesInfo,
|
||||||
@ -24,8 +21,6 @@ from vllm.multimodal.processing import (
|
|||||||
iter_token_matches,
|
iter_token_matches,
|
||||||
replace_token_matches,
|
replace_token_matches,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.multimodal.profiling import MultiModalProfiler
|
from vllm.multimodal.profiling import MultiModalProfiler
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
|
|
||||||
@ -34,7 +29,6 @@ from .utils import random_image
|
|||||||
pytestmark = pytest.mark.cpu_test
|
pytestmark = pytest.mark.cpu_test
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("token_ids", "match_ids", "expected"),
|
("token_ids", "match_ids", "expected"),
|
||||||
[
|
[
|
||||||
@ -44,34 +38,34 @@ pytestmark = pytest.mark.cpu_test
|
|||||||
[32000, 32000, 32000],
|
[32000, 32000, 32000],
|
||||||
[32000],
|
[32000],
|
||||||
[
|
[
|
||||||
{ "start_idx": 0, "end_idx": 1 },
|
{"start_idx": 0, "end_idx": 1},
|
||||||
{ "start_idx": 1, "end_idx": 2 },
|
{"start_idx": 1, "end_idx": 2},
|
||||||
{ "start_idx": 2, "end_idx": 3 },
|
{"start_idx": 2, "end_idx": 3},
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
[32000, 32000, 32000],
|
[32000, 32000, 32000],
|
||||||
[32000, 32000],
|
[32000, 32000],
|
||||||
[{ "start_idx": 0, "end_idx": 2 }],
|
[{"start_idx": 0, "end_idx": 2}],
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
[32000, 32000, 32000],
|
[32000, 32000, 32000],
|
||||||
[32000, 32000, 32000],
|
[32000, 32000, 32000],
|
||||||
[{ "start_idx": 0, "end_idx": 3 }],
|
[{"start_idx": 0, "end_idx": 3}],
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||||
[28747, 32000],
|
[28747, 32000],
|
||||||
[
|
[
|
||||||
{ "start_idx": 1, "end_idx": 3 },
|
{"start_idx": 1, "end_idx": 3},
|
||||||
{ "start_idx": 6, "end_idx": 8 },
|
{"start_idx": 6, "end_idx": 8},
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||||
[28747, 32000, 32000, 32000],
|
[28747, 32000, 32000, 32000],
|
||||||
[
|
[
|
||||||
{ "start_idx": 1, "end_idx": 5 },
|
{"start_idx": 1, "end_idx": 5},
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
@ -82,14 +76,13 @@ pytestmark = pytest.mark.cpu_test
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("start_idx", [0, 4, 8])
|
@pytest.mark.parametrize("start_idx", [0, 4, 8])
|
||||||
# yapf: enable
|
|
||||||
def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
|
def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
|
||||||
result = list(iter_token_matches(token_ids, match_ids,
|
result = list(iter_token_matches(token_ids, match_ids, start_idx=start_idx))
|
||||||
start_idx=start_idx))
|
|
||||||
|
|
||||||
# Manually constructed results
|
# Manually constructed results
|
||||||
assert [item._asdict() for item in result
|
assert [item._asdict() for item in result] == [
|
||||||
] == [item for item in expected if item["start_idx"] >= start_idx]
|
item for item in expected if item["start_idx"] >= start_idx
|
||||||
|
]
|
||||||
|
|
||||||
# Invariants
|
# Invariants
|
||||||
match_lens = [end - start for start, end in result]
|
match_lens = [end - start for start, end in result]
|
||||||
@ -97,7 +90,6 @@ def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
|
|||||||
assert all(match_len == len(match_ids) for match_len in match_lens)
|
assert all(match_len == len(match_ids) for match_len in match_lens)
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("token_ids", "match_ids", "new_ids", "expected"),
|
("token_ids", "match_ids", "new_ids", "expected"),
|
||||||
[
|
[
|
||||||
@ -141,7 +133,6 @@ def test_iter_token_matches(token_ids, match_ids, expected, start_idx):
|
|||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
||||||
result = replace_token_matches(token_ids, match_ids, new_ids)
|
result = replace_token_matches(token_ids, match_ids, new_ids)
|
||||||
|
|
||||||
@ -149,7 +140,6 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
|||||||
assert result == expected
|
assert result == expected
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("prompt", "target_by_key", "expected_by_key"),
|
("prompt", "target_by_key", "expected_by_key"),
|
||||||
[
|
[
|
||||||
@ -166,11 +156,11 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
|||||||
"pattern_1": [],
|
"pattern_1": [],
|
||||||
"pattern_2": [],
|
"pattern_2": [],
|
||||||
"pattern_3": [
|
"pattern_3": [
|
||||||
{ "start_idx": 0, "end_idx": 0 },
|
{"start_idx": 0, "end_idx": 0},
|
||||||
],
|
],
|
||||||
"pattern_4": [],
|
"pattern_4": [],
|
||||||
"pattern_5": [
|
"pattern_5": [
|
||||||
{ "start_idx": 0, "end_idx": 0 },
|
{"start_idx": 0, "end_idx": 0},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
@ -186,26 +176,26 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"pattern_1": [
|
"pattern_1": [
|
||||||
{ "start_idx": 0, "end_idx": 1 },
|
{"start_idx": 0, "end_idx": 1},
|
||||||
{ "start_idx": 1, "end_idx": 2 },
|
{"start_idx": 1, "end_idx": 2},
|
||||||
{ "start_idx": 2, "end_idx": 3 },
|
{"start_idx": 2, "end_idx": 3},
|
||||||
{ "start_idx": 3, "end_idx": 4 },
|
{"start_idx": 3, "end_idx": 4},
|
||||||
],
|
],
|
||||||
"pattern_2": [
|
"pattern_2": [
|
||||||
{ "start_idx": 0, "end_idx": 2 },
|
{"start_idx": 0, "end_idx": 2},
|
||||||
{ "start_idx": 2, "end_idx": 4 },
|
{"start_idx": 2, "end_idx": 4},
|
||||||
],
|
],
|
||||||
"pattern_3": [
|
"pattern_3": [
|
||||||
{ "start_idx": 0, "end_idx": 3 },
|
{"start_idx": 0, "end_idx": 3},
|
||||||
],
|
],
|
||||||
"pattern_4": [
|
"pattern_4": [
|
||||||
{ "start_idx": 0, "end_idx": 0 },
|
{"start_idx": 0, "end_idx": 0},
|
||||||
],
|
],
|
||||||
"pattern_5": [
|
"pattern_5": [
|
||||||
{ "start_idx": 1, "end_idx": 1 },
|
{"start_idx": 1, "end_idx": 1},
|
||||||
],
|
],
|
||||||
"pattern_6": [
|
"pattern_6": [
|
||||||
{ "start_idx": 4, "end_idx": 4 },
|
{"start_idx": 4, "end_idx": 4},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
@ -221,26 +211,25 @@ def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"pattern_1": [
|
"pattern_1": [
|
||||||
{ "start_idx": 1, "end_idx": 3 },
|
{"start_idx": 1, "end_idx": 3},
|
||||||
{ "start_idx": 6, "end_idx": 8 },
|
{"start_idx": 6, "end_idx": 8},
|
||||||
],
|
],
|
||||||
"pattern_2": [
|
"pattern_2": [
|
||||||
{ "start_idx": 1, "end_idx": 5 },
|
{"start_idx": 1, "end_idx": 5},
|
||||||
],
|
],
|
||||||
"pattern_3": [],
|
"pattern_3": [],
|
||||||
"pattern_4": [
|
"pattern_4": [
|
||||||
{ "start_idx": 0, "end_idx": 0 },
|
{"start_idx": 0, "end_idx": 0},
|
||||||
],
|
],
|
||||||
"pattern_5": [],
|
"pattern_5": [],
|
||||||
"pattern_6": [
|
"pattern_6": [
|
||||||
{ "start_idx": 10, "end_idx": 10 },
|
{"start_idx": 10, "end_idx": 10},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
|
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
|
||||||
# yapf: enable
|
|
||||||
def test_find_token_matches(
|
def test_find_token_matches(
|
||||||
prompt,
|
prompt,
|
||||||
target_by_key,
|
target_by_key,
|
||||||
@ -272,7 +261,6 @@ def test_find_token_matches(
|
|||||||
} == expected_by_key
|
} == expected_by_key
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("prompt", "target_by_key", "expected_by_key"),
|
("prompt", "target_by_key", "expected_by_key"),
|
||||||
[
|
[
|
||||||
@ -288,16 +276,16 @@ def test_find_token_matches(
|
|||||||
"pattern_5": PromptIndexTargets.end(),
|
"pattern_5": PromptIndexTargets.end(),
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"pattern_1": [{ "start_idx": 0, "end_idx": 0 }],
|
"pattern_1": [{"start_idx": 0, "end_idx": 0}],
|
||||||
"pattern_2": [],
|
"pattern_2": [],
|
||||||
"pattern_3": [
|
"pattern_3": [
|
||||||
{ "start_idx": 0, "end_idx": 0 },
|
{"start_idx": 0, "end_idx": 0},
|
||||||
],
|
],
|
||||||
"pattern_4": [],
|
"pattern_4": [],
|
||||||
"pattern_5": [
|
"pattern_5": [
|
||||||
{ "start_idx": 0, "end_idx": 0 },
|
{"start_idx": 0, "end_idx": 0},
|
||||||
],
|
],
|
||||||
}
|
},
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"<image><image><image><image>",
|
"<image><image><image><image>",
|
||||||
@ -311,26 +299,26 @@ def test_find_token_matches(
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"pattern_1": [
|
"pattern_1": [
|
||||||
{ "start_idx": 0, "end_idx": 7 },
|
{"start_idx": 0, "end_idx": 7},
|
||||||
{ "start_idx": 7, "end_idx": 14 },
|
{"start_idx": 7, "end_idx": 14},
|
||||||
{ "start_idx": 14, "end_idx": 21 },
|
{"start_idx": 14, "end_idx": 21},
|
||||||
{ "start_idx": 21, "end_idx": 28 },
|
{"start_idx": 21, "end_idx": 28},
|
||||||
],
|
],
|
||||||
"pattern_2": [
|
"pattern_2": [
|
||||||
{ "start_idx": 0, "end_idx": 14 },
|
{"start_idx": 0, "end_idx": 14},
|
||||||
{ "start_idx": 14, "end_idx": 28 },
|
{"start_idx": 14, "end_idx": 28},
|
||||||
],
|
],
|
||||||
"pattern_3": [
|
"pattern_3": [
|
||||||
{ "start_idx": 0, "end_idx": 21 },
|
{"start_idx": 0, "end_idx": 21},
|
||||||
],
|
],
|
||||||
"pattern_4": [
|
"pattern_4": [
|
||||||
{ "start_idx": 0, "end_idx": 0 },
|
{"start_idx": 0, "end_idx": 0},
|
||||||
],
|
],
|
||||||
"pattern_5": [
|
"pattern_5": [
|
||||||
{ "start_idx": 7, "end_idx": 7 },
|
{"start_idx": 7, "end_idx": 7},
|
||||||
],
|
],
|
||||||
"pattern_6": [
|
"pattern_6": [
|
||||||
{ "start_idx": 28, "end_idx": 28 },
|
{"start_idx": 28, "end_idx": 28},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
@ -346,21 +334,21 @@ def test_find_token_matches(
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"pattern_1": [
|
"pattern_1": [
|
||||||
{ "start_idx": 0, "end_idx": 13 },
|
{"start_idx": 0, "end_idx": 13},
|
||||||
{ "start_idx": 27, "end_idx": 40 },
|
{"start_idx": 27, "end_idx": 40},
|
||||||
],
|
],
|
||||||
"pattern_2": [
|
"pattern_2": [
|
||||||
{ "start_idx": 0, "end_idx": 27 },
|
{"start_idx": 0, "end_idx": 27},
|
||||||
],
|
],
|
||||||
"pattern_3": [],
|
"pattern_3": [],
|
||||||
"pattern_4": [
|
"pattern_4": [
|
||||||
{ "start_idx": 0, "end_idx": 0 },
|
{"start_idx": 0, "end_idx": 0},
|
||||||
],
|
],
|
||||||
"pattern_5": [
|
"pattern_5": [
|
||||||
{ "start_idx": 13, "end_idx": 13 },
|
{"start_idx": 13, "end_idx": 13},
|
||||||
],
|
],
|
||||||
"pattern_6": [
|
"pattern_6": [
|
||||||
{ "start_idx": 48, "end_idx": 48 },
|
{"start_idx": 48, "end_idx": 48},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
@ -374,22 +362,21 @@ def test_find_token_matches(
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"pattern_1": [
|
"pattern_1": [
|
||||||
{ "start_idx": 0, "end_idx": 9 },
|
{"start_idx": 0, "end_idx": 9},
|
||||||
{ "start_idx": 16, "end_idx": 25 },
|
{"start_idx": 16, "end_idx": 25},
|
||||||
],
|
],
|
||||||
"pattern_2": [
|
"pattern_2": [
|
||||||
{ "start_idx": 0, "end_idx": 16 },
|
{"start_idx": 0, "end_idx": 16},
|
||||||
{ "start_idx": 16, "end_idx": 32 },
|
{"start_idx": 16, "end_idx": 32},
|
||||||
],
|
],
|
||||||
"pattern_3": [
|
"pattern_3": [
|
||||||
{ "start_idx": 0, "end_idx": 25 },
|
{"start_idx": 0, "end_idx": 25},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
|
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
|
||||||
# yapf: enable
|
|
||||||
def test_find_text_matches(
|
def test_find_text_matches(
|
||||||
prompt,
|
prompt,
|
||||||
target_by_key,
|
target_by_key,
|
||||||
@ -421,7 +408,6 @@ def test_find_text_matches(
|
|||||||
} == expected_by_key
|
} == expected_by_key
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
|
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
|
||||||
[
|
[
|
||||||
@ -549,9 +535,8 @@ def test_find_text_matches(
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
]
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_find_update_text(
|
def test_find_update_text(
|
||||||
prompt,
|
prompt,
|
||||||
target_by_key,
|
target_by_key,
|
||||||
@ -562,13 +547,15 @@ def test_find_update_text(
|
|||||||
mock_tokenizer = cast(AnyTokenizer, object())
|
mock_tokenizer = cast(AnyTokenizer, object())
|
||||||
|
|
||||||
for (
|
for (
|
||||||
update_type,
|
update_type,
|
||||||
expected_by_mm_count,
|
expected_by_mm_count,
|
||||||
) in expected_by_update_type_mm_count.items():
|
) in expected_by_update_type_mm_count.items():
|
||||||
for mm_count, expected in expected_by_mm_count.items():
|
for mm_count, expected in expected_by_mm_count.items():
|
||||||
mm_prompt_updates = {
|
mm_prompt_updates = {
|
||||||
key: [[update_type(key, target, repl_by_key[key]).resolve(i)]
|
key: [
|
||||||
for i in range(mm_count)]
|
[update_type(key, target, repl_by_key[key]).resolve(i)]
|
||||||
|
for i in range(mm_count)
|
||||||
|
]
|
||||||
for key, target in target_by_key.items()
|
for key, target in target_by_key.items()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -589,7 +576,6 @@ def test_find_update_text(
|
|||||||
assert new_prompt == expected
|
assert new_prompt == expected
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
|
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
|
||||||
[
|
[
|
||||||
@ -615,8 +601,43 @@ def test_find_update_text(
|
|||||||
{
|
{
|
||||||
PromptInsertion: {
|
PromptInsertion: {
|
||||||
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
|
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
|
||||||
1: [1, 9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550], # noqa: E501
|
1: [
|
||||||
2: [1, 9833, 28747, 32000, 32000, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550, 1550, 918, 1550], # noqa: E501
|
1,
|
||||||
|
9833,
|
||||||
|
28747,
|
||||||
|
32000,
|
||||||
|
32000,
|
||||||
|
32000,
|
||||||
|
9833,
|
||||||
|
28747,
|
||||||
|
32000,
|
||||||
|
32000,
|
||||||
|
918,
|
||||||
|
1550,
|
||||||
|
918,
|
||||||
|
1550,
|
||||||
|
], # noqa: E501
|
||||||
|
2: [
|
||||||
|
1,
|
||||||
|
9833,
|
||||||
|
28747,
|
||||||
|
32000,
|
||||||
|
32000,
|
||||||
|
32000,
|
||||||
|
32000,
|
||||||
|
32000,
|
||||||
|
9833,
|
||||||
|
28747,
|
||||||
|
32000,
|
||||||
|
32000,
|
||||||
|
918,
|
||||||
|
1550,
|
||||||
|
918,
|
||||||
|
1550,
|
||||||
|
1550,
|
||||||
|
918,
|
||||||
|
1550,
|
||||||
|
], # noqa: E501
|
||||||
},
|
},
|
||||||
PromptReplacement: {
|
PromptReplacement: {
|
||||||
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
|
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
|
||||||
@ -719,9 +740,8 @@ def test_find_update_text(
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
]
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_find_update_tokens(
|
def test_find_update_tokens(
|
||||||
prompt,
|
prompt,
|
||||||
target_by_key,
|
target_by_key,
|
||||||
@ -732,13 +752,15 @@ def test_find_update_tokens(
|
|||||||
mock_tokenizer = cast(AnyTokenizer, object())
|
mock_tokenizer = cast(AnyTokenizer, object())
|
||||||
|
|
||||||
for (
|
for (
|
||||||
update_type,
|
update_type,
|
||||||
expected_by_mm_count,
|
expected_by_mm_count,
|
||||||
) in expected_by_update_type_mm_count.items():
|
) in expected_by_update_type_mm_count.items():
|
||||||
for mm_count, expected in expected_by_mm_count.items():
|
for mm_count, expected in expected_by_mm_count.items():
|
||||||
mm_prompt_updates = {
|
mm_prompt_updates = {
|
||||||
key: [[update_type(key, target, repl_by_key[key]).resolve(i)]
|
key: [
|
||||||
for i in range(mm_count)]
|
[update_type(key, target, repl_by_key[key]).resolve(i)]
|
||||||
|
for i in range(mm_count)
|
||||||
|
]
|
||||||
for key, target in target_by_key.items()
|
for key, target in target_by_key.items()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -759,7 +781,6 @@ def test_find_update_tokens(
|
|||||||
assert new_prompt == expected
|
assert new_prompt == expected
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"repl_by_key",
|
"repl_by_key",
|
||||||
[
|
[
|
||||||
@ -796,8 +817,7 @@ def test_find_update_tokens(
|
|||||||
is_embed=None,
|
is_embed=None,
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
}
|
},
|
||||||
|
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
[1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
|
[1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
|
||||||
@ -828,7 +848,7 @@ def test_find_update_tokens(
|
|||||||
),
|
),
|
||||||
],
|
],
|
||||||
# No match for pattern_4 as it has lower priority than pattern_1
|
# No match for pattern_4 as it has lower priority than pattern_1
|
||||||
}
|
},
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
[1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
|
[1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
|
||||||
@ -867,12 +887,11 @@ def test_find_update_tokens(
|
|||||||
is_embed=None,
|
is_embed=None,
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
}
|
},
|
||||||
),
|
),
|
||||||
]
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
|
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
|
||||||
# yapf: enable
|
|
||||||
def test_find_mm_placeholders(
|
def test_find_mm_placeholders(
|
||||||
repl_by_key,
|
repl_by_key,
|
||||||
prompt,
|
prompt,
|
||||||
@ -899,8 +918,15 @@ def test_find_mm_placeholders(
|
|||||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("limit", "num_supported", "is_valid"),
|
("limit", "num_supported", "is_valid"),
|
||||||
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
|
[
|
||||||
(2, 1, False), (2, 2, True)],
|
(0, 0, True),
|
||||||
|
(0, 1, True),
|
||||||
|
(1, 0, False),
|
||||||
|
(1, 1, True),
|
||||||
|
(1, 2, True),
|
||||||
|
(2, 1, False),
|
||||||
|
(2, 2, True),
|
||||||
|
],
|
||||||
)
|
)
|
||||||
def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
||||||
limit_mm_per_prompt = {"image": limit}
|
limit_mm_per_prompt = {"image": limit}
|
||||||
@ -930,8 +956,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
|||||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("num_images", "limit", "is_valid"),
|
("num_images", "limit", "is_valid"),
|
||||||
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
|
[
|
||||||
(2, 1, False), (2, 2, True)],
|
(0, 0, True),
|
||||||
|
(0, 1, True),
|
||||||
|
(1, 0, False),
|
||||||
|
(1, 1, True),
|
||||||
|
(1, 2, True),
|
||||||
|
(2, 1, False),
|
||||||
|
(2, 2, True),
|
||||||
|
],
|
||||||
)
|
)
|
||||||
def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
||||||
limit_mm_per_prompt = {"image": limit}
|
limit_mm_per_prompt = {"image": limit}
|
||||||
@ -966,7 +999,6 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
|||||||
|
|
||||||
|
|
||||||
class DummyProcessor:
|
class DummyProcessor:
|
||||||
|
|
||||||
def __init__(self, a: int = 0, b: int = 0) -> None:
|
def __init__(self, a: int = 0, b: int = 0) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
@ -982,7 +1014,6 @@ class DummyProcessor:
|
|||||||
return dict(a=a, c=c)
|
return dict(a=a, c=c)
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
|
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("config_kwargs", "inference_kwargs", "expected_kwargs"),
|
("config_kwargs", "inference_kwargs", "expected_kwargs"),
|
||||||
@ -996,7 +1027,6 @@ class DummyProcessor:
|
|||||||
({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
|
({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_hf_processor_init_kwargs(
|
def test_hf_processor_init_kwargs(
|
||||||
model_id,
|
model_id,
|
||||||
config_kwargs,
|
config_kwargs,
|
||||||
@ -1020,7 +1050,6 @@ def test_hf_processor_init_kwargs(
|
|||||||
assert getattr(processor, k) == v
|
assert getattr(processor, k) == v
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
|
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("config_kwargs", "inference_kwargs", "expected_kwargs"),
|
("config_kwargs", "inference_kwargs", "expected_kwargs"),
|
||||||
@ -1034,7 +1063,6 @@ def test_hf_processor_init_kwargs(
|
|||||||
({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
|
({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_hf_processor_call_kwargs(
|
def test_hf_processor_call_kwargs(
|
||||||
model_id,
|
model_id,
|
||||||
config_kwargs,
|
config_kwargs,
|
||||||
|
|||||||
@ -233,7 +233,6 @@ async def test_fetch_video_http_with_dynamic_loader(
|
|||||||
assert metadata_sync["video_backend"] == "opencv_dynamic"
|
assert metadata_sync["video_backend"] == "opencv_dynamic"
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"case",
|
"case",
|
||||||
[
|
[
|
||||||
@ -264,7 +263,6 @@ async def test_fetch_video_http_with_dynamic_loader(
|
|||||||
("image", 0),
|
("image", 0),
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
|
|
||||||
# Two modalities
|
# Two modalities
|
||||||
## Internally sorted
|
## Internally sorted
|
||||||
dict(
|
dict(
|
||||||
@ -276,7 +274,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
|||||||
"audio": [
|
"audio": [
|
||||||
PlaceholderRange(offset=0, length=2),
|
PlaceholderRange(offset=0, length=2),
|
||||||
PlaceholderRange(offset=2, length=3),
|
PlaceholderRange(offset=2, length=3),
|
||||||
]
|
],
|
||||||
},
|
},
|
||||||
expected_modality_idxs=[
|
expected_modality_idxs=[
|
||||||
("audio", 0),
|
("audio", 0),
|
||||||
@ -295,7 +293,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
|||||||
"audio": [
|
"audio": [
|
||||||
PlaceholderRange(offset=5, length=2),
|
PlaceholderRange(offset=5, length=2),
|
||||||
PlaceholderRange(offset=11, length=4),
|
PlaceholderRange(offset=11, length=4),
|
||||||
]
|
],
|
||||||
},
|
},
|
||||||
expected_modality_idxs=[
|
expected_modality_idxs=[
|
||||||
("image", 0),
|
("image", 0),
|
||||||
@ -314,7 +312,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
|||||||
"audio": [
|
"audio": [
|
||||||
PlaceholderRange(offset=11, length=4),
|
PlaceholderRange(offset=11, length=4),
|
||||||
PlaceholderRange(offset=5, length=2),
|
PlaceholderRange(offset=5, length=2),
|
||||||
]
|
],
|
||||||
},
|
},
|
||||||
expected_modality_idxs=[
|
expected_modality_idxs=[
|
||||||
("image", 1),
|
("image", 1),
|
||||||
@ -323,7 +321,6 @@ async def test_fetch_video_http_with_dynamic_loader(
|
|||||||
("audio", 0),
|
("audio", 0),
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
|
|
||||||
# Three modalities
|
# Three modalities
|
||||||
## Internally sorted
|
## Internally sorted
|
||||||
dict(
|
dict(
|
||||||
@ -339,7 +336,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
|||||||
PlaceholderRange(offset=3, length=4),
|
PlaceholderRange(offset=3, length=4),
|
||||||
PlaceholderRange(offset=7, length=5),
|
PlaceholderRange(offset=7, length=5),
|
||||||
PlaceholderRange(offset=12, length=6),
|
PlaceholderRange(offset=12, length=6),
|
||||||
]
|
],
|
||||||
},
|
},
|
||||||
expected_modality_idxs=[
|
expected_modality_idxs=[
|
||||||
("audio", 0),
|
("audio", 0),
|
||||||
@ -363,7 +360,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
|||||||
],
|
],
|
||||||
"video": [
|
"video": [
|
||||||
PlaceholderRange(offset=8, length=5),
|
PlaceholderRange(offset=8, length=5),
|
||||||
]
|
],
|
||||||
},
|
},
|
||||||
expected_modality_idxs=[
|
expected_modality_idxs=[
|
||||||
("image", 0),
|
("image", 0),
|
||||||
@ -386,7 +383,7 @@ async def test_fetch_video_http_with_dynamic_loader(
|
|||||||
],
|
],
|
||||||
"video": [
|
"video": [
|
||||||
PlaceholderRange(offset=8, length=5),
|
PlaceholderRange(offset=8, length=5),
|
||||||
]
|
],
|
||||||
},
|
},
|
||||||
expected_modality_idxs=[
|
expected_modality_idxs=[
|
||||||
("image", 0),
|
("image", 0),
|
||||||
@ -398,7 +395,6 @@ async def test_fetch_video_http_with_dynamic_loader(
|
|||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_argsort_mm_positions(case):
|
def test_argsort_mm_positions(case):
|
||||||
mm_positions = case["mm_positions"]
|
mm_positions = case["mm_positions"]
|
||||||
expected_modality_idxs = case["expected_modality_idxs"]
|
expected_modality_idxs = case["expected_modality_idxs"]
|
||||||
@ -413,13 +409,16 @@ def test_argsort_mm_positions(case):
|
|||||||
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
|
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
|
||||||
async def test_allowed_media_domains(video_url: str, num_frames: int):
|
async def test_allowed_media_domains(video_url: str, num_frames: int):
|
||||||
connector = MediaConnector(
|
connector = MediaConnector(
|
||||||
media_io_kwargs={"video": {
|
media_io_kwargs={
|
||||||
"num_frames": num_frames,
|
"video": {
|
||||||
}},
|
"num_frames": num_frames,
|
||||||
|
}
|
||||||
|
},
|
||||||
allowed_media_domains=[
|
allowed_media_domains=[
|
||||||
"www.bogotobogo.com",
|
"www.bogotobogo.com",
|
||||||
"github.com",
|
"github.com",
|
||||||
])
|
],
|
||||||
|
)
|
||||||
|
|
||||||
video_sync, metadata_sync = connector.fetch_video(video_url)
|
video_sync, metadata_sync = connector.fetch_video(video_url)
|
||||||
video_async, metadata_async = await connector.fetch_video_async(video_url)
|
video_async, metadata_async = await connector.fetch_video_async(video_url)
|
||||||
|
|||||||
@ -59,48 +59,52 @@ def test_parse_raw_single_batch_string_slice(inputs_slice: slice):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
@pytest.mark.parametrize(
|
||||||
@pytest.mark.parametrize('mm_processor_kwargs,expected_mm_kwargs', [
|
"mm_processor_kwargs,expected_mm_kwargs",
|
||||||
(None, [{}, {}]),
|
[
|
||||||
({}, [{}, {}]),
|
(None, [{}, {}]),
|
||||||
({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
|
({}, [{}, {}]),
|
||||||
([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
|
({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
|
||||||
])
|
([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
|
||||||
# yapf: enable
|
],
|
||||||
|
)
|
||||||
def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
|
def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
|
||||||
"""Test mm_processor_kwargs init for zipping enc/dec prompts."""
|
"""Test mm_processor_kwargs init for zipping enc/dec prompts."""
|
||||||
encoder_prompts = ['An encoder prompt', 'Another encoder prompt']
|
encoder_prompts = ["An encoder prompt", "Another encoder prompt"]
|
||||||
decoder_prompts = ['A decoder prompt', 'Another decoder prompt']
|
decoder_prompts = ["A decoder prompt", "Another decoder prompt"]
|
||||||
zipped_prompts = zip_enc_dec_prompts(encoder_prompts, decoder_prompts,
|
zipped_prompts = zip_enc_dec_prompts(
|
||||||
mm_processor_kwargs)
|
encoder_prompts, decoder_prompts, mm_processor_kwargs
|
||||||
|
)
|
||||||
assert len(zipped_prompts) == len(encoder_prompts) == len(decoder_prompts)
|
assert len(zipped_prompts) == len(encoder_prompts) == len(decoder_prompts)
|
||||||
for enc, dec, exp_kwargs, zipped in zip(encoder_prompts, decoder_prompts,
|
for enc, dec, exp_kwargs, zipped in zip(
|
||||||
expected_mm_kwargs,
|
encoder_prompts, decoder_prompts, expected_mm_kwargs, zipped_prompts
|
||||||
zipped_prompts):
|
):
|
||||||
assert isinstance(zipped, dict)
|
assert isinstance(zipped, dict)
|
||||||
assert len(zipped.keys()) == 3
|
assert len(zipped.keys()) == 3
|
||||||
assert zipped['encoder_prompt'] == enc
|
assert zipped["encoder_prompt"] == enc
|
||||||
assert zipped['decoder_prompt'] == dec
|
assert zipped["decoder_prompt"] == dec
|
||||||
assert zipped['mm_processor_kwargs'] == exp_kwargs
|
assert zipped["mm_processor_kwargs"] == exp_kwargs
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_id", [
|
@pytest.mark.parametrize(
|
||||||
"facebook/opt-125m",
|
"model_id",
|
||||||
])
|
[
|
||||||
@pytest.mark.parametrize("prompt", [
|
"facebook/opt-125m",
|
||||||
{
|
],
|
||||||
"prompt": "",
|
)
|
||||||
"multi_modal_data": {
|
@pytest.mark.parametrize(
|
||||||
"dummy": []
|
"prompt",
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"prompt": "",
|
||||||
|
"multi_modal_data": {"dummy": []},
|
||||||
},
|
},
|
||||||
},
|
{
|
||||||
{
|
"prompt_token_ids": [],
|
||||||
"prompt_token_ids": [],
|
"multi_modal_data": {"dummy": []},
|
||||||
"multi_modal_data": {
|
|
||||||
"dummy": []
|
|
||||||
},
|
},
|
||||||
},
|
],
|
||||||
])
|
)
|
||||||
def test_preprocessor_text_no_mm_inputs(model_id, prompt):
|
def test_preprocessor_text_no_mm_inputs(model_id, prompt):
|
||||||
model_config = ModelConfig(model=model_id)
|
model_config = ModelConfig(model=model_id)
|
||||||
tokenizer = init_tokenizer_from_configs(model_config)
|
tokenizer = init_tokenizer_from_configs(model_config)
|
||||||
@ -110,15 +114,19 @@ def test_preprocessor_text_no_mm_inputs(model_id, prompt):
|
|||||||
input_preprocessor.preprocess(prompt)
|
input_preprocessor.preprocess(prompt)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_id", [
|
@pytest.mark.parametrize(
|
||||||
"facebook/chameleon-7b",
|
"model_id",
|
||||||
])
|
[
|
||||||
@pytest.mark.parametrize("prompt", [
|
"facebook/chameleon-7b",
|
||||||
"",
|
],
|
||||||
{
|
)
|
||||||
"prompt_token_ids": []
|
@pytest.mark.parametrize(
|
||||||
},
|
"prompt",
|
||||||
])
|
[
|
||||||
|
"",
|
||||||
|
{"prompt_token_ids": []},
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_preprocessor_always_mm_code_path(model_id, prompt):
|
def test_preprocessor_always_mm_code_path(model_id, prompt):
|
||||||
model_config = ModelConfig(model=model_id)
|
model_config = ModelConfig(model=model_id)
|
||||||
tokenizer = init_tokenizer_from_configs(model_config)
|
tokenizer = init_tokenizer_from_configs(model_config)
|
||||||
|
|||||||
@ -9,14 +9,10 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
import torch_xla
|
import torch_xla
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
|
from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
|
||||||
from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
|
from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
|
||||||
fused_moe as torch_moe,
|
fused_moe as torch_moe,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if not current_platform.is_tpu():
|
if not current_platform.is_tpu():
|
||||||
|
|||||||
@ -388,7 +388,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
|
|||||||
assert "-O.level" in caplog_vllm.text
|
assert "-O.level" in caplog_vllm.text
|
||||||
|
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
|
"callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
|
||||||
[
|
[
|
||||||
@ -408,7 +407,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
|
|||||||
(lambda foo, **kwargs: None, "foo", True, True, False),
|
(lambda foo, **kwargs: None, "foo", True, True, False),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: disable
|
|
||||||
def test_supports_kw(
|
def test_supports_kw(
|
||||||
callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
|
callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
|
||||||
):
|
):
|
||||||
@ -681,7 +679,6 @@ def test_lru_cache():
|
|||||||
assert 6 in cache
|
assert 6 in cache
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("src_dtype", "tgt_dtype", "expected_result"),
|
("src_dtype", "tgt_dtype", "expected_result"),
|
||||||
[
|
[
|
||||||
@ -715,12 +712,10 @@ def test_lru_cache():
|
|||||||
(torch.complex64, torch.complex32, False),
|
(torch.complex64, torch.complex32, False),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
|
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
|
||||||
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
|
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("dtypes", "expected_result"),
|
("dtypes", "expected_result"),
|
||||||
[
|
[
|
||||||
@ -730,7 +725,6 @@ def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
|
|||||||
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
|
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# yapf: enable
|
|
||||||
def test_common_broadcastable_dtype(dtypes, expected_result):
|
def test_common_broadcastable_dtype(dtypes, expected_result):
|
||||||
assert common_broadcastable_dtype(dtypes) == expected_result
|
assert common_broadcastable_dtype(dtypes) == expected_result
|
||||||
|
|
||||||
@ -775,7 +769,6 @@ def test_placeholder_module_error_handling():
|
|||||||
_ = placeholder_attr.module
|
_ = placeholder_attr.module
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"obj,key1,key2",
|
"obj,key1,key2",
|
||||||
[
|
[
|
||||||
@ -785,8 +778,8 @@ def test_placeholder_module_error_handling():
|
|||||||
({1: "a", 2: "b"}, 1, 3),
|
({1: "a", 2: "b"}, 1, 3),
|
||||||
# Tests for both keys do not exist
|
# Tests for both keys do not exist
|
||||||
({1: "a", 2: "b"}, 3, 4),
|
({1: "a", 2: "b"}, 3, 4),
|
||||||
])
|
],
|
||||||
# yapf: enable
|
)
|
||||||
def test_swap_dict_values(obj, key1, key2):
|
def test_swap_dict_values(obj, key1, key2):
|
||||||
original_obj = obj.copy()
|
original_obj = obj.copy()
|
||||||
swap_dict_values(obj, key1, key2)
|
swap_dict_values(obj, key1, key2)
|
||||||
@ -800,26 +793,30 @@ def test_swap_dict_values(obj, key1, key2):
|
|||||||
assert key1 not in obj
|
assert key1 not in obj
|
||||||
|
|
||||||
|
|
||||||
def test_model_specification(parser_with_config, cli_config_file,
|
def test_model_specification(
|
||||||
cli_config_file_with_model):
|
parser_with_config, cli_config_file, cli_config_file_with_model
|
||||||
|
):
|
||||||
# Test model in CLI takes precedence over config
|
# Test model in CLI takes precedence over config
|
||||||
args = parser_with_config.parse_args(
|
args = parser_with_config.parse_args(
|
||||||
['serve', 'cli-model', '--config', cli_config_file_with_model])
|
["serve", "cli-model", "--config", cli_config_file_with_model]
|
||||||
assert args.model_tag == 'cli-model'
|
)
|
||||||
assert args.served_model_name == 'mymodel'
|
assert args.model_tag == "cli-model"
|
||||||
|
assert args.served_model_name == "mymodel"
|
||||||
|
|
||||||
# Test model from config file works
|
# Test model from config file works
|
||||||
args = parser_with_config.parse_args([
|
args = parser_with_config.parse_args(
|
||||||
'serve',
|
[
|
||||||
'--config',
|
"serve",
|
||||||
cli_config_file_with_model,
|
"--config",
|
||||||
])
|
cli_config_file_with_model,
|
||||||
assert args.model == 'config-model'
|
]
|
||||||
assert args.served_model_name == 'mymodel'
|
)
|
||||||
|
assert args.model == "config-model"
|
||||||
|
assert args.served_model_name == "mymodel"
|
||||||
|
|
||||||
# Test no model specified anywhere raises error
|
# Test no model specified anywhere raises error
|
||||||
with pytest.raises(ValueError, match="No model specified!"):
|
with pytest.raises(ValueError, match="No model specified!"):
|
||||||
parser_with_config.parse_args(['serve', '--config', cli_config_file])
|
parser_with_config.parse_args(["serve", "--config", cli_config_file])
|
||||||
|
|
||||||
# Test using --model option raises error
|
# Test using --model option raises error
|
||||||
# with pytest.raises(
|
# with pytest.raises(
|
||||||
@ -833,47 +830,52 @@ def test_model_specification(parser_with_config, cli_config_file,
|
|||||||
# Test using --model option back-compatibility
|
# Test using --model option back-compatibility
|
||||||
# (when back-compatibility ends, the above test should be uncommented
|
# (when back-compatibility ends, the above test should be uncommented
|
||||||
# and the below test should be removed)
|
# and the below test should be removed)
|
||||||
args = parser_with_config.parse_args([
|
args = parser_with_config.parse_args(
|
||||||
'serve',
|
[
|
||||||
'--tensor-parallel-size',
|
"serve",
|
||||||
'2',
|
"--tensor-parallel-size",
|
||||||
'--model',
|
"2",
|
||||||
'my-model',
|
"--model",
|
||||||
'--trust-remote-code',
|
"my-model",
|
||||||
'--port',
|
"--trust-remote-code",
|
||||||
'8001',
|
"--port",
|
||||||
])
|
"8001",
|
||||||
|
]
|
||||||
|
)
|
||||||
assert args.model is None
|
assert args.model is None
|
||||||
assert args.tensor_parallel_size == 2
|
assert args.tensor_parallel_size == 2
|
||||||
assert args.trust_remote_code is True
|
assert args.trust_remote_code is True
|
||||||
assert args.port == 8001
|
assert args.port == 8001
|
||||||
|
|
||||||
args = parser_with_config.parse_args([
|
args = parser_with_config.parse_args(
|
||||||
'serve',
|
[
|
||||||
'--tensor-parallel-size=2',
|
"serve",
|
||||||
'--model=my-model',
|
"--tensor-parallel-size=2",
|
||||||
'--trust-remote-code',
|
"--model=my-model",
|
||||||
'--port=8001',
|
"--trust-remote-code",
|
||||||
])
|
"--port=8001",
|
||||||
|
]
|
||||||
|
)
|
||||||
assert args.model is None
|
assert args.model is None
|
||||||
assert args.tensor_parallel_size == 2
|
assert args.tensor_parallel_size == 2
|
||||||
assert args.trust_remote_code is True
|
assert args.trust_remote_code is True
|
||||||
assert args.port == 8001
|
assert args.port == 8001
|
||||||
|
|
||||||
# Test other config values are preserved
|
# Test other config values are preserved
|
||||||
args = parser_with_config.parse_args([
|
args = parser_with_config.parse_args(
|
||||||
'serve',
|
[
|
||||||
'cli-model',
|
"serve",
|
||||||
'--config',
|
"cli-model",
|
||||||
cli_config_file_with_model,
|
"--config",
|
||||||
])
|
cli_config_file_with_model,
|
||||||
|
]
|
||||||
|
)
|
||||||
assert args.tensor_parallel_size == 2
|
assert args.tensor_parallel_size == 2
|
||||||
assert args.trust_remote_code is True
|
assert args.trust_remote_code is True
|
||||||
assert args.port == 12312
|
assert args.port == 12312
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
|
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
|
||||||
(None, bool, [1, 2, 3])])
|
|
||||||
def test_sha256(input: tuple):
|
def test_sha256(input: tuple):
|
||||||
digest = sha256(input)
|
digest = sha256(input)
|
||||||
assert digest is not None
|
assert digest is not None
|
||||||
@ -887,7 +889,7 @@ def test_sha256(input: tuple):
|
|||||||
assert digest == sha256(input)
|
assert digest == sha256(input)
|
||||||
|
|
||||||
# hashing different input, returns different value
|
# hashing different input, returns different value
|
||||||
assert digest != sha256(input + (1, ))
|
assert digest != sha256(input + (1,))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -897,7 +899,8 @@ def test_sha256(input: tuple):
|
|||||||
("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
|
("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
|
||||||
("tcp://[::1]:5555", ("tcp", "::1", "5555")), # IPv6 address
|
("tcp://[::1]:5555", ("tcp", "::1", "5555")), # IPv6 address
|
||||||
("inproc://some_identifier", ("inproc", "some_identifier", "")),
|
("inproc://some_identifier", ("inproc", "some_identifier", "")),
|
||||||
])
|
],
|
||||||
|
)
|
||||||
def test_split_zmq_path(path, expected):
|
def test_split_zmq_path(path, expected):
|
||||||
assert split_zmq_path(path) == expected
|
assert split_zmq_path(path) == expected
|
||||||
|
|
||||||
@ -909,7 +912,8 @@ def test_split_zmq_path(path, expected):
|
|||||||
"tcp://127.0.0.1", # Missing port
|
"tcp://127.0.0.1", # Missing port
|
||||||
"tcp://[::1]", # Missing port for IPv6
|
"tcp://[::1]", # Missing port for IPv6
|
||||||
"tcp://:5555", # Missing host
|
"tcp://:5555", # Missing host
|
||||||
])
|
],
|
||||||
|
)
|
||||||
def test_split_zmq_path_invalid(invalid_path):
|
def test_split_zmq_path_invalid(invalid_path):
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
split_zmq_path(invalid_path)
|
split_zmq_path(invalid_path)
|
||||||
@ -931,8 +935,9 @@ def test_make_zmq_socket_ipv6():
|
|||||||
zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
|
zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
|
||||||
|
|
||||||
# Verify that the IPV6 option is set
|
# Verify that the IPV6 option is set
|
||||||
assert zsock.getsockopt(
|
assert zsock.getsockopt(zmq.IPV6) == 1, (
|
||||||
zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
|
"IPV6 option should be enabled for IPv6 addresses"
|
||||||
|
)
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
zsock.close()
|
zsock.close()
|
||||||
@ -1019,15 +1024,14 @@ def test_convert_ids_list_to_tokens():
|
|||||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
|
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
|
||||||
token_ids = tokenizer.encode("Hello, world!")
|
token_ids = tokenizer.encode("Hello, world!")
|
||||||
# token_ids = [9707, 11, 1879, 0]
|
# token_ids = [9707, 11, 1879, 0]
|
||||||
assert tokenizer.convert_ids_to_tokens(token_ids) == [
|
assert tokenizer.convert_ids_to_tokens(token_ids) == ["Hello", ",", "Ġworld", "!"]
|
||||||
'Hello', ',', 'Ġworld', '!'
|
|
||||||
]
|
|
||||||
tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
|
tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
|
||||||
assert tokens == ['Hello', ',', ' world', '!']
|
assert tokens == ["Hello", ",", " world", "!"]
|
||||||
|
|
||||||
|
|
||||||
def test_current_stream_multithread():
|
def test_current_stream_multithread():
|
||||||
import threading
|
import threading
|
||||||
|
|
||||||
if not torch.cuda.is_available():
|
if not torch.cuda.is_available():
|
||||||
pytest.skip("CUDA not available")
|
pytest.skip("CUDA not available")
|
||||||
|
|
||||||
@ -1046,13 +1050,18 @@ def test_current_stream_multithread():
|
|||||||
child_thread.start()
|
child_thread.start()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
assert thread_stream_ready.wait(
|
assert thread_stream_ready.wait(timeout=5), (
|
||||||
timeout=5), "Child thread failed to enter stream context in time"
|
"Child thread failed to enter stream context in time"
|
||||||
|
)
|
||||||
|
|
||||||
main_current_stream = current_stream()
|
main_current_stream = current_stream()
|
||||||
|
|
||||||
assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread"
|
assert main_current_stream != child_stream, (
|
||||||
assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream"
|
"Main thread's current_stream was contaminated by child thread"
|
||||||
|
)
|
||||||
|
assert main_current_stream == main_default_stream, (
|
||||||
|
"Main thread's current_stream is not the default stream"
|
||||||
|
)
|
||||||
|
|
||||||
# Notify child thread it can exit
|
# Notify child thread it can exit
|
||||||
thread_can_exit.set()
|
thread_can_exit.set()
|
||||||
@ -1070,7 +1079,7 @@ def test_load_config_file(tmp_path):
|
|||||||
"enable-logging": True,
|
"enable-logging": True,
|
||||||
"list-arg": ["item1", "item2"],
|
"list-arg": ["item1", "item2"],
|
||||||
"port": 12323,
|
"port": 12323,
|
||||||
"tensor-parallel-size": 4
|
"tensor-parallel-size": 4,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Write the configuration data to a temporary YAML file
|
# Write the configuration data to a temporary YAML file
|
||||||
|
|||||||
@ -16,9 +16,6 @@ from vllm.multimodal.inputs import (
|
|||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.utils import GiB_bytes, sha256, sha256_cbor
|
from vllm.utils import GiB_bytes, sha256, sha256_cbor
|
||||||
from vllm.v1.core.kv_cache_manager import KVCacheManager
|
from vllm.v1.core.kv_cache_manager import KVCacheManager
|
||||||
|
|
||||||
# disable yapf here as it formats differently than isort such that both fail
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.v1.core.kv_cache_utils import (
|
from vllm.v1.core.kv_cache_utils import (
|
||||||
BlockHash,
|
BlockHash,
|
||||||
FreeKVCacheBlockQueue,
|
FreeKVCacheBlockQueue,
|
||||||
@ -48,8 +45,6 @@ from vllm.v1.kv_cache_interface import (
|
|||||||
from vllm.v1.metrics.stats import PrefixCacheStats
|
from vllm.v1.metrics.stats import PrefixCacheStats
|
||||||
from vllm.v1.request import Request
|
from vllm.v1.request import Request
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.cpu_test
|
pytestmark = pytest.mark.cpu_test
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -22,8 +22,6 @@ from vllm.config import VllmConfig
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.utils import is_pin_memory_available
|
from vllm.utils import is_pin_memory_available
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.v1.sample.logits_processor import (
|
from vllm.v1.sample.logits_processor import (
|
||||||
BatchUpdate,
|
BatchUpdate,
|
||||||
BatchUpdateBuilder,
|
BatchUpdateBuilder,
|
||||||
@ -34,8 +32,6 @@ from vllm.v1.sample.logits_processor import (
|
|||||||
MoveDirectionality,
|
MoveDirectionality,
|
||||||
build_logitsprocs,
|
build_logitsprocs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.v1.sample.metadata import SamplingMetadata
|
from vllm.v1.sample.metadata import SamplingMetadata
|
||||||
|
|
||||||
PIN_MEMORY_AVAILABLE = is_pin_memory_available()
|
PIN_MEMORY_AVAILABLE = is_pin_memory_available()
|
||||||
|
|||||||
@ -7,8 +7,6 @@ from typing import Union
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.utils import create_new_process_for_each_test
|
from tests.utils import create_new_process_for_each_test
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from tests.v1.logits_processors.utils import (
|
from tests.v1.logits_processors.utils import (
|
||||||
DUMMY_LOGITPROC_ARG,
|
DUMMY_LOGITPROC_ARG,
|
||||||
DUMMY_LOGITPROC_FQCN,
|
DUMMY_LOGITPROC_FQCN,
|
||||||
@ -24,8 +22,6 @@ from tests.v1.logits_processors.utils import (
|
|||||||
prompts,
|
prompts,
|
||||||
)
|
)
|
||||||
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
|
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.v1.sample.logits_processor import (
|
from vllm.v1.sample.logits_processor import (
|
||||||
STR_POOLING_REJECTS_LOGITSPROCS,
|
STR_POOLING_REJECTS_LOGITSPROCS,
|
||||||
|
|||||||
@ -11,8 +11,6 @@ import pytest
|
|||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
|
|
||||||
from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_test
|
from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_test
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from tests.v1.logits_processors.utils import (
|
from tests.v1.logits_processors.utils import (
|
||||||
DUMMY_LOGITPROC_ARG,
|
DUMMY_LOGITPROC_ARG,
|
||||||
DUMMY_LOGITPROC_FQCN,
|
DUMMY_LOGITPROC_FQCN,
|
||||||
@ -25,8 +23,6 @@ from tests.v1.logits_processors.utils import (
|
|||||||
)
|
)
|
||||||
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
|
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
|
|
||||||
def _server_with_logitproc_entrypoint(
|
def _server_with_logitproc_entrypoint(
|
||||||
env_dict: Optional[dict[str, str]],
|
env_dict: Optional[dict[str, str]],
|
||||||
|
|||||||
@ -4,7 +4,6 @@
|
|||||||
import importlib
|
import importlib
|
||||||
from typing import TYPE_CHECKING, Callable
|
from typing import TYPE_CHECKING, Callable
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.distributed.kv_transfer.kv_connector.base import (
|
from vllm.distributed.kv_transfer.kv_connector.base import (
|
||||||
KVConnectorBase,
|
KVConnectorBase,
|
||||||
@ -13,8 +12,6 @@ from vllm.distributed.kv_transfer.kv_connector.base import (
|
|||||||
from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
|
from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.config.kv_transfer import KVTransferConfig
|
from vllm.config.kv_transfer import KVTransferConfig
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
import argparse
|
import argparse
|
||||||
import copy
|
import copy
|
||||||
import dataclasses
|
import dataclasses
|
||||||
@ -88,8 +87,6 @@ from vllm.transformers_utils.utils import check_gguf_file
|
|||||||
from vllm.utils import FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor
|
from vllm.utils import FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor
|
||||||
from vllm.v1.sample.logits_processor import LogitsProcessor
|
from vllm.v1.sample.logits_processor import LogitsProcessor
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.executor.executor_base import ExecutorBase
|
from vllm.executor.executor_base import ExecutorBase
|
||||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||||
|
|||||||
@ -17,9 +17,6 @@ import jinja2.nodes
|
|||||||
import jinja2.parser
|
import jinja2.parser
|
||||||
import jinja2.sandbox
|
import jinja2.sandbox
|
||||||
import transformers.utils.chat_template_utils as hf_chat_utils
|
import transformers.utils.chat_template_utils as hf_chat_utils
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from openai.types.chat import (
|
from openai.types.chat import (
|
||||||
ChatCompletionAssistantMessageParam,
|
ChatCompletionAssistantMessageParam,
|
||||||
ChatCompletionContentPartImageParam,
|
ChatCompletionContentPartImageParam,
|
||||||
@ -40,8 +37,6 @@ from openai.types.responses import ResponseInputImageParam
|
|||||||
from openai_harmony import Message as OpenAIHarmonyMessage
|
from openai_harmony import Message as OpenAIHarmonyMessage
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin
|
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin
|
||||||
|
|
||||||
# pydantic needs the TypedDict from typing_extensions
|
# pydantic needs the TypedDict from typing_extensions
|
||||||
@ -52,11 +47,7 @@ from vllm.logger import init_logger
|
|||||||
from vllm.model_executor.models import SupportsMultiModal
|
from vllm.model_executor.models import SupportsMultiModal
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
|
||||||
from vllm.multimodal.utils import MediaConnector
|
from vllm.multimodal.utils import MediaConnector
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
|
from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.transformers_utils.processor import cached_get_processor
|
from vllm.transformers_utils.processor import cached_get_processor
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
||||||
from vllm.utils import random_uuid, supports_kw
|
from vllm.utils import random_uuid, supports_kw
|
||||||
@ -317,11 +308,7 @@ def _is_var_or_elems_access(
|
|||||||
):
|
):
|
||||||
return _is_var_or_elems_access(node.node, varname, key)
|
return _is_var_or_elems_access(node.node, varname, key)
|
||||||
|
|
||||||
# yapf: disable
|
return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
|
||||||
return (
|
|
||||||
_is_attr_access(node, varname, key) if key
|
|
||||||
else _is_var_access(node, varname)
|
|
||||||
) # yapf: enable
|
|
||||||
|
|
||||||
|
|
||||||
def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):
|
def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):
|
||||||
|
|||||||
@ -39,9 +39,6 @@ from vllm.entrypoints.chat_utils import (
|
|||||||
parse_chat_messages,
|
parse_chat_messages,
|
||||||
resolve_chat_template_content_format,
|
resolve_chat_template_content_format,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.entrypoints.score_utils import (
|
from vllm.entrypoints.score_utils import (
|
||||||
ScoreContentPartParam,
|
ScoreContentPartParam,
|
||||||
ScoreMultiModalParam,
|
ScoreMultiModalParam,
|
||||||
@ -50,8 +47,6 @@ from vllm.entrypoints.score_utils import (
|
|||||||
compress_token_type_ids,
|
compress_token_type_ids,
|
||||||
get_score_prompt,
|
get_score_prompt,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.entrypoints.utils import _validate_truncation_size, log_non_default_args
|
from vllm.entrypoints.utils import _validate_truncation_size, log_non_default_args
|
||||||
from vllm.inputs import (
|
from vllm.inputs import (
|
||||||
DataPrompt,
|
DataPrompt,
|
||||||
|
|||||||
@ -49,9 +49,6 @@ from vllm.entrypoints.chat_utils import (
|
|||||||
from vllm.entrypoints.launcher import serve_http
|
from vllm.entrypoints.launcher import serve_http
|
||||||
from vllm.entrypoints.logger import RequestLogger
|
from vllm.entrypoints.logger import RequestLogger
|
||||||
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
|
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.entrypoints.openai.protocol import (
|
from vllm.entrypoints.openai.protocol import (
|
||||||
ChatCompletionRequest,
|
ChatCompletionRequest,
|
||||||
ChatCompletionResponse,
|
ChatCompletionResponse,
|
||||||
@ -84,8 +81,6 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
TranslationResponse,
|
TranslationResponse,
|
||||||
UnloadLoRAAdapterRequest,
|
UnloadLoRAAdapterRequest,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||||
from vllm.entrypoints.openai.serving_classification import ServingClassification
|
from vllm.entrypoints.openai.serving_classification import ServingClassification
|
||||||
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||||
|
|||||||
@ -11,8 +11,6 @@ from typing import Annotated, Any, ClassVar, Generic, Literal, Optional, TypeVar
|
|||||||
import regex as re
|
import regex as re
|
||||||
import torch
|
import torch
|
||||||
from fastapi import HTTPException, UploadFile
|
from fastapi import HTTPException, UploadFile
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from openai.types.chat.chat_completion_audio import (
|
from openai.types.chat.chat_completion_audio import (
|
||||||
ChatCompletionAudio as OpenAIChatCompletionAudio,
|
ChatCompletionAudio as OpenAIChatCompletionAudio,
|
||||||
)
|
)
|
||||||
@ -46,8 +44,6 @@ from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreated
|
|||||||
from openai.types.responses import (
|
from openai.types.responses import (
|
||||||
ResponseInProgressEvent as OpenAIResponseInProgressEvent,
|
ResponseInProgressEvent as OpenAIResponseInProgressEvent,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from openai.types.responses.response_reasoning_item import (
|
from openai.types.responses.response_reasoning_item import (
|
||||||
Content as ResponseReasoningTextContent,
|
Content as ResponseReasoningTextContent,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -18,8 +18,6 @@ from vllm.config import VllmConfig
|
|||||||
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
|
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
|
||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
from vllm.entrypoints.logger import RequestLogger
|
from vllm.entrypoints.logger import RequestLogger
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.entrypoints.openai.protocol import (
|
from vllm.entrypoints.openai.protocol import (
|
||||||
BatchRequestInput,
|
BatchRequestInput,
|
||||||
BatchRequestOutput,
|
BatchRequestOutput,
|
||||||
@ -30,8 +28,6 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
RerankResponse,
|
RerankResponse,
|
||||||
ScoreResponse,
|
ScoreResponse,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||||
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
|
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
|
||||||
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||||
|
|||||||
@ -1733,13 +1733,15 @@ class OpenAIServingChat(OpenAIServing):
|
|||||||
is a tool call with arguments.
|
is a tool call with arguments.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
return bool(
|
return bool(
|
||||||
# if there is a delta message that includes tool calls which
|
# if there is a delta message that includes tool calls which
|
||||||
# include a function that has arguments
|
# include a function that has arguments
|
||||||
output.finish_reason is not None
|
output.finish_reason is not None
|
||||||
and self.enable_auto_tools and self.tool_parser and delta_message
|
and self.enable_auto_tools
|
||||||
and delta_message.tool_calls and delta_message.tool_calls[0]
|
and self.tool_parser
|
||||||
|
and delta_message
|
||||||
|
and delta_message.tool_calls
|
||||||
|
and delta_message.tool_calls[0]
|
||||||
and delta_message.tool_calls[0].function
|
and delta_message.tool_calls[0].function
|
||||||
and delta_message.tool_calls[0].function.arguments is not None
|
and delta_message.tool_calls[0].function.arguments is not None
|
||||||
)
|
)
|
||||||
|
|||||||
@ -18,8 +18,6 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
ErrorResponse,
|
ErrorResponse,
|
||||||
UsageInfo,
|
UsageInfo,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.entrypoints.openai.serving_engine import (
|
from vllm.entrypoints.openai.serving_engine import (
|
||||||
ClassificationServeContext,
|
ClassificationServeContext,
|
||||||
OpenAIServing,
|
OpenAIServing,
|
||||||
|
|||||||
@ -13,9 +13,6 @@ from fastapi import Request
|
|||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
from vllm.entrypoints.logger import RequestLogger
|
from vllm.entrypoints.logger import RequestLogger
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.entrypoints.openai.protocol import (
|
from vllm.entrypoints.openai.protocol import (
|
||||||
CompletionLogProbs,
|
CompletionLogProbs,
|
||||||
CompletionRequest,
|
CompletionRequest,
|
||||||
@ -29,8 +26,6 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
UsageInfo,
|
UsageInfo,
|
||||||
)
|
)
|
||||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
|
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.entrypoints.renderer import RenderConfig
|
from vllm.entrypoints.renderer import RenderConfig
|
||||||
from vllm.entrypoints.utils import get_max_tokens
|
from vllm.entrypoints.utils import get_max_tokens
|
||||||
|
|||||||
@ -14,9 +14,6 @@ from vllm.config import ModelConfig
|
|||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||||
from vllm.entrypoints.logger import RequestLogger
|
from vllm.entrypoints.logger import RequestLogger
|
||||||
|
|
||||||
# yapf conflicts with isort for this docstring
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.entrypoints.openai.protocol import (
|
from vllm.entrypoints.openai.protocol import (
|
||||||
EmbeddingChatRequest,
|
EmbeddingChatRequest,
|
||||||
EmbeddingCompletionRequest,
|
EmbeddingCompletionRequest,
|
||||||
@ -32,8 +29,6 @@ from vllm.entrypoints.openai.serving_engine import (
|
|||||||
ServeContext,
|
ServeContext,
|
||||||
TextTokensPrompt,
|
TextTokensPrompt,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.entrypoints.renderer import RenderConfig
|
from vllm.entrypoints.renderer import RenderConfig
|
||||||
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
|
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
|
||||||
|
|||||||
@ -28,9 +28,6 @@ else:
|
|||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.entrypoints.chat_utils import (
|
from vllm.entrypoints.chat_utils import (
|
||||||
ChatCompletionMessageParam,
|
ChatCompletionMessageParam,
|
||||||
ChatTemplateContentFormatOption,
|
ChatTemplateContentFormatOption,
|
||||||
@ -72,8 +69,6 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.entrypoints.openai.tool_parsers import ToolParser
|
from vllm.entrypoints.openai.tool_parsers import ToolParser
|
||||||
from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
|
from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.inputs.data import PromptType
|
from vllm.inputs.data import PromptType
|
||||||
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
|
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
|
||||||
from vllm.inputs.parse import PromptComponents, get_prompt_components
|
from vllm.inputs.parse import PromptComponents, get_prompt_components
|
||||||
|
|||||||
@ -17,8 +17,6 @@ from vllm.config import VllmConfig
|
|||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||||
from vllm.entrypoints.logger import RequestLogger
|
from vllm.entrypoints.logger import RequestLogger
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.entrypoints.openai.protocol import (
|
from vllm.entrypoints.openai.protocol import (
|
||||||
ErrorResponse,
|
ErrorResponse,
|
||||||
IOProcessorRequest,
|
IOProcessorRequest,
|
||||||
@ -30,8 +28,6 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
PoolingResponseData,
|
PoolingResponseData,
|
||||||
UsageInfo,
|
UsageInfo,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.entrypoints.renderer import RenderConfig
|
from vllm.entrypoints.renderer import RenderConfig
|
||||||
|
|||||||
@ -14,9 +14,6 @@ from typing import Callable, Final, Optional, Union
|
|||||||
|
|
||||||
import jinja2
|
import jinja2
|
||||||
from fastapi import Request
|
from fastapi import Request
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from openai.types.responses import (
|
from openai.types.responses import (
|
||||||
ResponseCodeInterpreterCallCodeDeltaEvent,
|
ResponseCodeInterpreterCallCodeDeltaEvent,
|
||||||
ResponseCodeInterpreterCallCodeDoneEvent,
|
ResponseCodeInterpreterCallCodeDoneEvent,
|
||||||
@ -46,8 +43,6 @@ from openai.types.responses import (
|
|||||||
response_text_delta_event,
|
response_text_delta_event,
|
||||||
)
|
)
|
||||||
from openai.types.responses.response_output_text import Logprob, LogprobTopLogprob
|
from openai.types.responses.response_output_text import Logprob, LogprobTopLogprob
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from openai.types.responses.response_reasoning_item import (
|
from openai.types.responses.response_reasoning_item import (
|
||||||
Content as ResponseReasoningTextContent,
|
Content as ResponseReasoningTextContent,
|
||||||
)
|
)
|
||||||
@ -78,9 +73,6 @@ from vllm.entrypoints.harmony_utils import (
|
|||||||
render_for_completion,
|
render_for_completion,
|
||||||
)
|
)
|
||||||
from vllm.entrypoints.logger import RequestLogger
|
from vllm.entrypoints.logger import RequestLogger
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.entrypoints.openai.protocol import (
|
from vllm.entrypoints.openai.protocol import (
|
||||||
DeltaMessage,
|
DeltaMessage,
|
||||||
ErrorResponse,
|
ErrorResponse,
|
||||||
@ -97,8 +89,6 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
ResponseUsage,
|
ResponseUsage,
|
||||||
StreamingResponsesResponse,
|
StreamingResponsesResponse,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.entrypoints.tool_server import ToolServer
|
from vllm.entrypoints.tool_server import ToolServer
|
||||||
|
|||||||
@ -24,9 +24,6 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
)
|
)
|
||||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.entrypoints.score_utils import (
|
from vllm.entrypoints.score_utils import (
|
||||||
ScoreContentPartParam,
|
ScoreContentPartParam,
|
||||||
ScoreMultiModalParam,
|
ScoreMultiModalParam,
|
||||||
@ -35,8 +32,6 @@ from vllm.entrypoints.score_utils import (
|
|||||||
compress_token_type_ids,
|
compress_token_type_ids,
|
||||||
get_score_prompt,
|
get_score_prompt,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.entrypoints.utils import _validate_truncation_size
|
from vllm.entrypoints.utils import _validate_truncation_size
|
||||||
from vllm.inputs.data import TokensPrompt
|
from vllm.inputs.data import TokensPrompt
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|||||||
@ -10,9 +10,6 @@ from vllm.config import ModelConfig
|
|||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||||
from vllm.entrypoints.logger import RequestLogger
|
from vllm.entrypoints.logger import RequestLogger
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.entrypoints.openai.protocol import (
|
from vllm.entrypoints.openai.protocol import (
|
||||||
DetokenizeRequest,
|
DetokenizeRequest,
|
||||||
DetokenizeResponse,
|
DetokenizeResponse,
|
||||||
@ -22,8 +19,6 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
TokenizeResponse,
|
TokenizeResponse,
|
||||||
TokenizerInfoResponse,
|
TokenizerInfoResponse,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.entrypoints.renderer import RenderConfig
|
from vllm.entrypoints.renderer import RenderConfig
|
||||||
|
|||||||
@ -11,7 +11,7 @@ import cloudpickle
|
|||||||
import msgspec
|
import msgspec
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.executor.executor_base import DistributedExecutorBase # yapf: disable
|
from vllm.executor.executor_base import DistributedExecutorBase
|
||||||
from vllm.executor.msgspec_utils import encode_hook
|
from vllm.executor.msgspec_utils import encode_hook
|
||||||
from vllm.executor.ray_utils import RayWorkerWrapper, initialize_ray_cluster, ray
|
from vllm.executor.ray_utils import RayWorkerWrapper, initialize_ray_cluster, ray
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|||||||
@ -8,8 +8,6 @@ from transformers import PretrainedConfig
|
|||||||
|
|
||||||
from vllm.config.lora import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.distributed.utils import divide
|
from vllm.distributed.utils import divide
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.layers.linear import (
|
from vllm.model_executor.layers.linear import (
|
||||||
ColumnParallelLinear,
|
ColumnParallelLinear,
|
||||||
LinearBase,
|
LinearBase,
|
||||||
@ -23,7 +21,6 @@ from .utils import _get_lora_device
|
|||||||
|
|
||||||
|
|
||||||
class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||||
|
|
||||||
def __init__(self, base_layer: LinearBase):
|
def __init__(self, base_layer: LinearBase):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.base_layer = base_layer
|
self.base_layer = base_layer
|
||||||
@ -50,16 +47,20 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
|||||||
lora_b_out_size = self.output_size
|
lora_b_out_size = self.output_size
|
||||||
|
|
||||||
elif isinstance(self.base_layer, ColumnParallelLinear):
|
elif isinstance(self.base_layer, ColumnParallelLinear):
|
||||||
lora_a_out_size = (lora_config.max_lora_rank if
|
lora_a_out_size = (
|
||||||
not lora_config.fully_sharded_loras else divide(
|
lora_config.max_lora_rank
|
||||||
lora_config.max_lora_rank, self.tp_size))
|
if not lora_config.fully_sharded_loras
|
||||||
|
else divide(lora_config.max_lora_rank, self.tp_size)
|
||||||
|
)
|
||||||
lora_b_out_size = self.output_size
|
lora_b_out_size = self.output_size
|
||||||
|
|
||||||
elif isinstance(self.base_layer, RowParallelLinear):
|
elif isinstance(self.base_layer, RowParallelLinear):
|
||||||
lora_a_out_size = lora_config.max_lora_rank
|
lora_a_out_size = lora_config.max_lora_rank
|
||||||
lora_b_out_size = (self.output_size if
|
lora_b_out_size = (
|
||||||
not lora_config.fully_sharded_loras else divide(
|
self.output_size
|
||||||
self.output_size, self.tp_size))
|
if not lora_config.fully_sharded_loras
|
||||||
|
else divide(self.output_size, self.tp_size)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@ -71,7 +72,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
|||||||
self.input_size,
|
self.input_size,
|
||||||
dtype=lora_config.lora_dtype,
|
dtype=lora_config.lora_dtype,
|
||||||
device=self.device,
|
device=self.device,
|
||||||
) for _ in range(self.n_slices))
|
)
|
||||||
|
for _ in range(self.n_slices)
|
||||||
|
)
|
||||||
self.lora_b_stacked = tuple(
|
self.lora_b_stacked = tuple(
|
||||||
torch.zeros(
|
torch.zeros(
|
||||||
max_loras,
|
max_loras,
|
||||||
@ -80,7 +83,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
|||||||
lora_config.max_lora_rank,
|
lora_config.max_lora_rank,
|
||||||
dtype=lora_config.lora_dtype,
|
dtype=lora_config.lora_dtype,
|
||||||
device=self.device,
|
device=self.device,
|
||||||
) for _ in range(self.n_slices))
|
)
|
||||||
|
for _ in range(self.n_slices)
|
||||||
|
)
|
||||||
if lora_config.bias_enabled:
|
if lora_config.bias_enabled:
|
||||||
lora_bias_out_size = lora_b_out_size
|
lora_bias_out_size = lora_b_out_size
|
||||||
self.lora_bias_stacked = tuple(
|
self.lora_bias_stacked = tuple(
|
||||||
@ -90,8 +95,10 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
|||||||
lora_bias_out_size,
|
lora_bias_out_size,
|
||||||
dtype=lora_config.lora_dtype,
|
dtype=lora_config.lora_dtype,
|
||||||
device=self.device,
|
device=self.device,
|
||||||
) for _ in range(self.n_slices))
|
)
|
||||||
self.output_slices = (self.lora_b_stacked[0].shape[2], )
|
for _ in range(self.n_slices)
|
||||||
|
)
|
||||||
|
self.output_slices = (self.lora_b_stacked[0].shape[2],)
|
||||||
|
|
||||||
def reset_lora(self, index: int):
|
def reset_lora(self, index: int):
|
||||||
for s_index in range(self.n_slices):
|
for s_index in range(self.n_slices):
|
||||||
@ -99,8 +106,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
|||||||
self.lora_b_stacked[s_index][index] = 0
|
self.lora_b_stacked[s_index][index] = 0
|
||||||
if self.lora_config.bias_enabled:
|
if self.lora_config.bias_enabled:
|
||||||
# Make mypy happy
|
# Make mypy happy
|
||||||
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
|
self.lora_bias_stacked = cast(
|
||||||
self.lora_bias_stacked)
|
tuple[torch.Tensor, ...], self.lora_bias_stacked
|
||||||
|
)
|
||||||
self.lora_bias_stacked[s_index][index] = 0
|
self.lora_bias_stacked[s_index][index] = 0
|
||||||
|
|
||||||
def set_lora(
|
def set_lora(
|
||||||
@ -115,8 +123,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
|||||||
# MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
|
# MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
|
||||||
# store weights in a tuple of size 1. These two layers will
|
# store weights in a tuple of size 1. These two layers will
|
||||||
# override this function.
|
# override this function.
|
||||||
assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) ==
|
assert (
|
||||||
self.n_slices == 1)
|
len(self.lora_a_stacked) == len(self.lora_b_stacked) == self.n_slices == 1
|
||||||
|
)
|
||||||
|
|
||||||
self.reset_lora(index)
|
self.reset_lora(index)
|
||||||
if self.tp_size > 1:
|
if self.tp_size > 1:
|
||||||
@ -125,23 +134,24 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
|||||||
if lora_bias is not None:
|
if lora_bias is not None:
|
||||||
lora_bias = self.slice_bias(lora_bias)
|
lora_bias = self.slice_bias(lora_bias)
|
||||||
|
|
||||||
self.lora_a_stacked[0][index,
|
self.lora_a_stacked[0][index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
|
||||||
0, :lora_a.shape[0], :lora_a.shape[1]].copy_(
|
lora_a, non_blocking=True
|
||||||
lora_a, non_blocking=True)
|
)
|
||||||
self.lora_b_stacked[0][index,
|
self.lora_b_stacked[0][index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
|
||||||
0, :lora_b.shape[0], :lora_b.shape[1]].copy_(
|
lora_b, non_blocking=True
|
||||||
lora_b, non_blocking=True)
|
)
|
||||||
if lora_bias is not None:
|
if lora_bias is not None:
|
||||||
|
self.lora_bias_stacked = cast(
|
||||||
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
|
tuple[torch.Tensor, ...], self.lora_bias_stacked
|
||||||
self.lora_bias_stacked)
|
)
|
||||||
assert len(self.lora_bias_stacked)
|
assert len(self.lora_bias_stacked)
|
||||||
self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_(
|
self.lora_bias_stacked[0][index, 0, : lora_bias.shape[0]].copy_(
|
||||||
lora_bias, non_blocking=True)
|
lora_bias, non_blocking=True
|
||||||
|
)
|
||||||
|
|
||||||
def apply(self,
|
def apply(
|
||||||
x: torch.Tensor,
|
self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
|
||||||
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
|
output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
|
||||||
|
|
||||||
# In transformers backend, x and output have extra batch dimension like
|
# In transformers backend, x and output have extra batch dimension like
|
||||||
@ -151,10 +161,15 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
|||||||
output = output.flatten(0, 1)
|
output = output.flatten(0, 1)
|
||||||
x = x.flatten(0, 1)
|
x = x.flatten(0, 1)
|
||||||
|
|
||||||
lora_output: Optional[
|
lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_lora_linear(
|
||||||
torch.Tensor] = self.punica_wrapper.add_lora_linear(
|
output,
|
||||||
output, x, self.lora_a_stacked, self.lora_b_stacked,
|
x,
|
||||||
self.lora_bias_stacked, 1.0, self.output_slices)
|
self.lora_a_stacked,
|
||||||
|
self.lora_b_stacked,
|
||||||
|
self.lora_bias_stacked,
|
||||||
|
1.0,
|
||||||
|
self.output_slices,
|
||||||
|
)
|
||||||
if not current_platform.can_update_inplace():
|
if not current_platform.can_update_inplace():
|
||||||
output = lora_output
|
output = lora_output
|
||||||
|
|
||||||
@ -162,7 +177,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def weight(self) -> torch.Tensor:
|
def weight(self) -> torch.Tensor:
|
||||||
|
|
||||||
# unquantizedLinear
|
# unquantizedLinear
|
||||||
if hasattr(self.base_layer, "weight"):
|
if hasattr(self.base_layer, "weight"):
|
||||||
return self.base_layer.weight
|
return self.base_layer.weight
|
||||||
|
|||||||
@ -12,8 +12,6 @@ from vllm.distributed import (
|
|||||||
split_tensor_along_last_dim,
|
split_tensor_along_last_dim,
|
||||||
tensor_model_parallel_all_reduce,
|
tensor_model_parallel_all_reduce,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.layers.linear import RowParallelLinear
|
from vllm.model_executor.layers.linear import RowParallelLinear
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
@ -22,7 +20,6 @@ from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
|
|||||||
|
|
||||||
|
|
||||||
class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
||||||
|
|
||||||
def __init__(self, base_layer: RowParallelLinear) -> None:
|
def __init__(self, base_layer: RowParallelLinear) -> None:
|
||||||
super().__init__(base_layer)
|
super().__init__(base_layer)
|
||||||
|
|
||||||
@ -33,11 +30,10 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
|||||||
self.n_slices = 1
|
self.n_slices = 1
|
||||||
|
|
||||||
def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
|
def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
|
||||||
|
|
||||||
shard_size = self.input_size
|
shard_size = self.input_size
|
||||||
start_idx = self.tp_rank * shard_size
|
start_idx = self.tp_rank * shard_size
|
||||||
end_idx = (self.tp_rank + 1) * shard_size
|
end_idx = (self.tp_rank + 1) * shard_size
|
||||||
lora_a = lora_a[:,start_idx:end_idx]
|
lora_a = lora_a[:, start_idx:end_idx]
|
||||||
return lora_a
|
return lora_a
|
||||||
|
|
||||||
def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
|
def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
|
||||||
@ -66,7 +62,8 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
|||||||
else:
|
else:
|
||||||
# TODO: simplify code below
|
# TODO: simplify code below
|
||||||
splitted_input = split_tensor_along_last_dim(
|
splitted_input = split_tensor_along_last_dim(
|
||||||
input_, num_partitions=self.tp_size)
|
input_, num_partitions=self.tp_size
|
||||||
|
)
|
||||||
input_parallel = splitted_input[self.tp_rank].contiguous()
|
input_parallel = splitted_input[self.tp_rank].contiguous()
|
||||||
|
|
||||||
# Matrix multiply.
|
# Matrix multiply.
|
||||||
@ -77,8 +74,11 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
|||||||
output_ = output_parallel
|
output_ = output_parallel
|
||||||
|
|
||||||
if not self.base_layer.skip_bias_add:
|
if not self.base_layer.skip_bias_add:
|
||||||
output = (output_ + self.base_layer.bias
|
output = (
|
||||||
if self.base_layer.bias is not None else output_)
|
output_ + self.base_layer.bias
|
||||||
|
if self.base_layer.bias is not None
|
||||||
|
else output_
|
||||||
|
)
|
||||||
output_bias = None
|
output_bias = None
|
||||||
else:
|
else:
|
||||||
output = output_
|
output = output_
|
||||||
@ -101,11 +101,11 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
|||||||
return type(source_layer) is RowParallelLinear
|
return type(source_layer) is RowParallelLinear
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# The following layer is based on the tensor parallelism strategy given in
|
# The following layer is based on the tensor parallelism strategy given in
|
||||||
# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
|
# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
|
||||||
# https://arxiv.org/abs/2311.03285.
|
# https://arxiv.org/abs/2311.03285.
|
||||||
|
|
||||||
|
|
||||||
class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
|
class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
|
||||||
"""
|
"""
|
||||||
Differs from RowParallelLinearWithLoRA by slicing the
|
Differs from RowParallelLinearWithLoRA by slicing the
|
||||||
@ -120,28 +120,26 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
|
|||||||
shard_size = self.lora_b_stacked[0].shape[2]
|
shard_size = self.lora_b_stacked[0].shape[2]
|
||||||
start_idx = self.tp_rank * shard_size
|
start_idx = self.tp_rank * shard_size
|
||||||
end_idx = (self.tp_rank + 1) * shard_size
|
end_idx = (self.tp_rank + 1) * shard_size
|
||||||
lora_b = lora_b[ start_idx:end_idx,:]
|
lora_b = lora_b[start_idx:end_idx, :]
|
||||||
return lora_b
|
return lora_b
|
||||||
|
|
||||||
def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
|
def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
|
||||||
if bias is None:
|
if bias is None:
|
||||||
return bias
|
return bias
|
||||||
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
|
self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked)
|
||||||
self.lora_bias_stacked)
|
|
||||||
shard_size = self.lora_bias_stacked[0].shape[2]
|
shard_size = self.lora_bias_stacked[0].shape[2]
|
||||||
start_idx = self.tp_rank * shard_size
|
start_idx = self.tp_rank * shard_size
|
||||||
end_idx = (self.tp_rank + 1) * shard_size
|
end_idx = (self.tp_rank + 1) * shard_size
|
||||||
bias = bias[start_idx:end_idx]
|
bias = bias[start_idx:end_idx]
|
||||||
return bias
|
return bias
|
||||||
|
|
||||||
def apply(self,
|
def apply(
|
||||||
x: torch.Tensor,
|
self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
|
||||||
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
output = self.base_layer.quant_method.apply(self.base_layer, x)
|
output = self.base_layer.quant_method.apply(self.base_layer, x)
|
||||||
|
|
||||||
x = x.view(-1, x.shape[-1])
|
x = x.view(-1, x.shape[-1])
|
||||||
output, out_orig_shape = output.view(-1,
|
output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
|
||||||
output.shape[-1]), output.shape
|
|
||||||
buffer = torch.zeros(
|
buffer = torch.zeros(
|
||||||
(self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
|
(self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
|
||||||
dtype=torch.float32,
|
dtype=torch.float32,
|
||||||
@ -149,10 +147,11 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
|
|||||||
)
|
)
|
||||||
|
|
||||||
shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink(
|
shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink(
|
||||||
buffer, x, self.lora_a_stacked, 1.0)
|
buffer, x, self.lora_a_stacked, 1.0
|
||||||
|
)
|
||||||
if not current_platform.can_update_inplace():
|
if not current_platform.can_update_inplace():
|
||||||
buffer = shrunk_buffer
|
buffer = shrunk_buffer
|
||||||
if self.tp_size>1:
|
if self.tp_size > 1:
|
||||||
buffer = tensor_model_parallel_all_reduce(buffer)
|
buffer = tensor_model_parallel_all_reduce(buffer)
|
||||||
|
|
||||||
# following S-LoRA, allows the fusing of all_gather and all_reduce
|
# following S-LoRA, allows the fusing of all_gather and all_reduce
|
||||||
|
|||||||
@ -19,8 +19,6 @@ from vllm.config.lora import LoRAConfig
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
# being imported for _all_lora_classes below
|
# being imported for _all_lora_classes below
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.lora.layers import (
|
from vllm.lora.layers import (
|
||||||
BaseLayerWithLoRA,
|
BaseLayerWithLoRA,
|
||||||
ColumnParallelLinearWithLoRA,
|
ColumnParallelLinearWithLoRA,
|
||||||
@ -39,8 +37,6 @@ from vllm.lora.layers import (
|
|||||||
)
|
)
|
||||||
from vllm.model_executor.layers.linear import LinearBase
|
from vllm.model_executor.layers.linear import LinearBase
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||||
|
|||||||
@ -14,8 +14,6 @@ import vllm.envs as envs
|
|||||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.layers.fused_moe.config import (
|
from vllm.model_executor.layers.fused_moe.config import (
|
||||||
FUSED_MOE_UNQUANTIZED_CONFIG,
|
FUSED_MOE_UNQUANTIZED_CONFIG,
|
||||||
FusedMoEQuantConfig,
|
FusedMoEQuantConfig,
|
||||||
@ -25,8 +23,6 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import (
|
|||||||
_valid_cutlass_block_scaled_grouped_gemm,
|
_valid_cutlass_block_scaled_grouped_gemm,
|
||||||
run_cutlass_block_scaled_fused_experts,
|
run_cutlass_block_scaled_fused_experts,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
|
||||||
_valid_deep_gemm,
|
_valid_deep_gemm,
|
||||||
deep_gemm_moe_fp8,
|
deep_gemm_moe_fp8,
|
||||||
|
|||||||
@ -24,8 +24,6 @@ from vllm.distributed.eplb.eplb_state import EplbState
|
|||||||
from vllm.forward_context import ForwardContext, get_forward_context
|
from vllm.forward_context import ForwardContext, get_forward_context
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.custom_op import CustomOp
|
from vllm.model_executor.custom_op import CustomOp
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.layers.fused_moe.config import (
|
from vllm.model_executor.layers.fused_moe.config import (
|
||||||
FUSED_MOE_UNQUANTIZED_CONFIG,
|
FUSED_MOE_UNQUANTIZED_CONFIG,
|
||||||
FusedMoEConfig,
|
FusedMoEConfig,
|
||||||
@ -34,8 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import (
|
|||||||
biased_moe_quant_config,
|
biased_moe_quant_config,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
|
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||||
FusedMoEActivationFormat,
|
FusedMoEActivationFormat,
|
||||||
FusedMoEModularKernel,
|
FusedMoEModularKernel,
|
||||||
|
|||||||
@ -10,7 +10,7 @@ import torch
|
|||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
||||||
from vllm.model_executor.layers.fused_moe.utils import ( # yapf: disable
|
from vllm.model_executor.layers.fused_moe.utils import (
|
||||||
_resize_cache,
|
_resize_cache,
|
||||||
count_expert_num_tokens,
|
count_expert_num_tokens,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -24,8 +24,6 @@ from vllm.model_executor.layers.quantization.base_config import (
|
|||||||
QuantizeMethodBase,
|
QuantizeMethodBase,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
|
from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.parameter import (
|
from vllm.model_executor.parameter import (
|
||||||
BasevLLMParameter,
|
BasevLLMParameter,
|
||||||
BlockQuantScaleParameter,
|
BlockQuantScaleParameter,
|
||||||
@ -35,8 +33,6 @@ from vllm.model_executor.parameter import (
|
|||||||
PerTensorScaleParameter,
|
PerTensorScaleParameter,
|
||||||
RowvLLMParameter,
|
RowvLLMParameter,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import GiB_bytes
|
from vllm.utils import GiB_bytes
|
||||||
|
|||||||
@ -17,17 +17,12 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
|
|||||||
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||||
marlin_repeat_scales_on_all_ranks,
|
marlin_repeat_scales_on_all_ranks,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.parameter import (
|
from vllm.model_executor.parameter import (
|
||||||
BasevLLMParameter,
|
BasevLLMParameter,
|
||||||
ChannelQuantScaleParameter,
|
ChannelQuantScaleParameter,
|
||||||
GroupQuantScaleParameter,
|
GroupQuantScaleParameter,
|
||||||
PackedvLLMParameter,
|
PackedvLLMParameter,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.scalar_type import scalar_types
|
from vllm.scalar_type import scalar_types
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|||||||
@ -17,9 +17,6 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
|
|||||||
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||||
marlin_repeat_scales_on_all_ranks,
|
marlin_repeat_scales_on_all_ranks,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.parameter import (
|
from vllm.model_executor.parameter import (
|
||||||
BasevLLMParameter,
|
BasevLLMParameter,
|
||||||
ChannelQuantScaleParameter,
|
ChannelQuantScaleParameter,
|
||||||
@ -28,8 +25,6 @@ from vllm.model_executor.parameter import (
|
|||||||
PackedvLLMParameter,
|
PackedvLLMParameter,
|
||||||
RowvLLMParameter,
|
RowvLLMParameter,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.scalar_type import scalar_types
|
from vllm.scalar_type import scalar_types
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|||||||
@ -22,8 +22,6 @@ from vllm.distributed import (
|
|||||||
get_tensor_model_parallel_rank,
|
get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||||
from vllm.model_executor.layers.linear import (
|
from vllm.model_executor.layers.linear import (
|
||||||
@ -51,8 +49,6 @@ from vllm.model_executor.utils import (
|
|||||||
)
|
)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -39,13 +39,10 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
|||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from .idefics2_vision_model import Idefics2VisionConfig
|
from .idefics2_vision_model import Idefics2VisionConfig
|
||||||
from .idefics2_vision_model import (
|
from .idefics2_vision_model import (
|
||||||
Idefics2VisionTransformer as Idefics3VisionTransformer,
|
Idefics2VisionTransformer as Idefics3VisionTransformer,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant
|
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant
|
||||||
from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
|
from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
|
||||||
from .utils import (
|
from .utils import (
|
||||||
|
|||||||
@ -22,8 +22,6 @@ from vllm.multimodal.inputs import (
|
|||||||
MultiModalKwargsItems,
|
MultiModalKwargsItems,
|
||||||
)
|
)
|
||||||
from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
|
from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.multimodal.processing import (
|
from vllm.multimodal.processing import (
|
||||||
BaseMultiModalProcessor,
|
BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo,
|
BaseProcessingInfo,
|
||||||
@ -35,8 +33,6 @@ from vllm.multimodal.processing import (
|
|||||||
PromptUpdateDetails,
|
PromptUpdateDetails,
|
||||||
replace_token_matches,
|
replace_token_matches,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
|
|||||||
@ -6,14 +6,16 @@ from typing import Annotated, Any, Literal, Optional, Union, cast
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import AutoModel, BatchFeature
|
from transformers import AutoModel, BatchFeature
|
||||||
from transformers.models.gemma3n import (Gemma3nAudioConfig,
|
from transformers.models.gemma3n import (
|
||||||
Gemma3nAudioFeatureExtractor,
|
Gemma3nAudioConfig,
|
||||||
Gemma3nConfig, Gemma3nProcessor,
|
Gemma3nAudioFeatureExtractor,
|
||||||
Gemma3nTextConfig,
|
Gemma3nConfig,
|
||||||
Gemma3nVisionConfig)
|
Gemma3nProcessor,
|
||||||
|
Gemma3nTextConfig,
|
||||||
|
Gemma3nVisionConfig,
|
||||||
|
)
|
||||||
from transformers.models.siglip import SiglipImageProcessorFast
|
from transformers.models.siglip import SiglipImageProcessorFast
|
||||||
|
|
||||||
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
|
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
|
||||||
@ -22,25 +24,32 @@ from vllm.inputs.data import PromptType
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
from vllm.model_executor.layers.linear import RowParallelLinear
|
from vllm.model_executor.layers.linear import RowParallelLinear
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
|
||||||
VocabParallelEmbedding)
|
|
||||||
from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
|
from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
|
||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
|
from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
from vllm.multimodal.inputs import (
|
||||||
MultiModalKwargsItems)
|
MultiModalDataDict,
|
||||||
from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems,
|
MultiModalFieldConfig,
|
||||||
MultiModalDataParser)
|
MultiModalKwargsItems,
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
)
|
||||||
BaseProcessingInfo,
|
from vllm.multimodal.parse import (
|
||||||
MultiModalPromptUpdates,
|
ImageProcessorItems,
|
||||||
MultiModalPromptUpdatesApplyResult,
|
MultiModalDataItems,
|
||||||
PlaceholderFeaturesInfo,
|
MultiModalDataParser,
|
||||||
PromptReplacement, PromptUpdate,
|
)
|
||||||
PromptUpdateDetails,
|
from vllm.multimodal.processing import (
|
||||||
replace_token_matches)
|
BaseMultiModalProcessor,
|
||||||
# yapf: enable
|
BaseProcessingInfo,
|
||||||
|
MultiModalPromptUpdates,
|
||||||
|
MultiModalPromptUpdatesApplyResult,
|
||||||
|
PlaceholderFeaturesInfo,
|
||||||
|
PromptReplacement,
|
||||||
|
PromptUpdate,
|
||||||
|
PromptUpdateDetails,
|
||||||
|
replace_token_matches,
|
||||||
|
)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
|
|||||||
@ -43,9 +43,6 @@ from vllm.multimodal.inputs import (
|
|||||||
MultiModalKwargsItems,
|
MultiModalKwargsItems,
|
||||||
)
|
)
|
||||||
from vllm.multimodal.parse import ImageProcessorItems, ImageSize
|
from vllm.multimodal.parse import ImageProcessorItems, ImageSize
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.multimodal.processing import (
|
from vllm.multimodal.processing import (
|
||||||
BaseMultiModalProcessor,
|
BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo,
|
BaseProcessingInfo,
|
||||||
@ -54,18 +51,13 @@ from vllm.multimodal.processing import (
|
|||||||
PromptUpdate,
|
PromptUpdate,
|
||||||
PromptUpdateDetails,
|
PromptUpdateDetails,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from .idefics2_vision_model import (
|
from .idefics2_vision_model import (
|
||||||
Idefics2VisionTransformer as Idefics3VisionTransformer,
|
Idefics2VisionTransformer as Idefics3VisionTransformer,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
|
from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
|
||||||
from .llama import LlamaModel
|
from .llama import LlamaModel
|
||||||
from .utils import AutoWeightsLoader, maybe_prefix
|
from .utils import AutoWeightsLoader, maybe_prefix
|
||||||
|
|||||||
@ -45,9 +45,6 @@ from vllm.multimodal.parse import (
|
|||||||
ImageSize,
|
ImageSize,
|
||||||
MultiModalDataItems,
|
MultiModalDataItems,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.multimodal.processing import (
|
from vllm.multimodal.processing import (
|
||||||
BaseMultiModalProcessor,
|
BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo,
|
BaseProcessingInfo,
|
||||||
@ -57,8 +54,6 @@ from vllm.multimodal.processing import (
|
|||||||
PromptUpdate,
|
PromptUpdate,
|
||||||
ResolvedPromptUpdate,
|
ResolvedPromptUpdate,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils import is_list_of
|
from vllm.utils import is_list_of
|
||||||
|
|||||||
@ -52,16 +52,12 @@ from vllm.distributed import utils as dist_utils
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.activation import get_act_and_mul_fn
|
from vllm.model_executor.layers.activation import get_act_and_mul_fn
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.layers.linear import (
|
from vllm.model_executor.layers.linear import (
|
||||||
ColumnParallelLinear,
|
ColumnParallelLinear,
|
||||||
MergedColumnParallelLinear,
|
MergedColumnParallelLinear,
|
||||||
QKVParallelLinear,
|
QKVParallelLinear,
|
||||||
RowParallelLinear,
|
RowParallelLinear,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
|
|||||||
@ -37,12 +37,7 @@ from vllm.model_executor.layers.fla.ops import (
|
|||||||
fused_recurrent_gated_delta_rule,
|
fused_recurrent_gated_delta_rule,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.layers.layernorm import GemmaRMSNorm as Qwen3NextRMSNorm
|
from vllm.model_executor.layers.layernorm import GemmaRMSNorm as Qwen3NextRMSNorm
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.model_executor.layers.linear import (
|
from vllm.model_executor.layers.linear import (
|
||||||
ColumnParallelLinear,
|
ColumnParallelLinear,
|
||||||
QKVParallelLinear,
|
QKVParallelLinear,
|
||||||
|
|||||||
@ -54,7 +54,6 @@ from .interfaces_base import (
|
|||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
_TEXT_GENERATION_MODELS = {
|
_TEXT_GENERATION_MODELS = {
|
||||||
# [Decoder-only]
|
# [Decoder-only]
|
||||||
"ApertusForCausalLM": ("apertus", "ApertusForCausalLM"),
|
"ApertusForCausalLM": ("apertus", "ApertusForCausalLM"),
|
||||||
@ -106,8 +105,8 @@ _TEXT_GENERATION_MODELS = {
|
|||||||
"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
|
"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
|
||||||
"GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
|
"GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
|
||||||
"GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
|
"GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
|
||||||
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501
|
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501
|
||||||
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
|
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
|
||||||
"GritLM": ("gritlm", "GritLM"),
|
"GritLM": ("gritlm", "GritLM"),
|
||||||
"Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
|
"Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
|
||||||
"HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
|
"HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
|
||||||
@ -127,7 +126,7 @@ _TEXT_GENERATION_MODELS = {
|
|||||||
"LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"),
|
"LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"),
|
||||||
"MambaForCausalLM": ("mamba", "MambaForCausalLM"),
|
"MambaForCausalLM": ("mamba", "MambaForCausalLM"),
|
||||||
"FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
|
"FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
|
||||||
"FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"),
|
"FalconH1ForCausalLM": ("falcon_h1", "FalconH1ForCausalLM"),
|
||||||
"Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"),
|
"Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"),
|
||||||
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
|
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
|
||||||
"MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
|
"MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
|
||||||
@ -184,7 +183,8 @@ _EMBEDDING_MODELS = {
|
|||||||
"LlamaModel": ("llama", "LlamaForCausalLM"),
|
"LlamaModel": ("llama", "LlamaForCausalLM"),
|
||||||
**{
|
**{
|
||||||
# Multiple models share the same architecture, so we include them all
|
# Multiple models share the same architecture, so we include them all
|
||||||
k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
|
k: (mod, arch)
|
||||||
|
for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
|
||||||
if arch == "LlamaForCausalLM"
|
if arch == "LlamaForCausalLM"
|
||||||
},
|
},
|
||||||
"MistralModel": ("llama", "LlamaForCausalLM"),
|
"MistralModel": ("llama", "LlamaForCausalLM"),
|
||||||
@ -201,7 +201,10 @@ _EMBEDDING_MODELS = {
|
|||||||
"XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"),
|
"XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"),
|
||||||
# [Multimodal]
|
# [Multimodal]
|
||||||
"CLIPModel": ("clip", "CLIPEmbeddingModel"),
|
"CLIPModel": ("clip", "CLIPEmbeddingModel"),
|
||||||
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
|
"LlavaNextForConditionalGeneration": (
|
||||||
|
"llava_next",
|
||||||
|
"LlavaNextForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
|
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
|
||||||
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
|
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
|
||||||
# Technically Terratorch models work on images, both in
|
# Technically Terratorch models work on images, both in
|
||||||
@ -214,79 +217,150 @@ _EMBEDDING_MODELS = {
|
|||||||
_CROSS_ENCODER_MODELS = {
|
_CROSS_ENCODER_MODELS = {
|
||||||
"BertForSequenceClassification": ("bert", "BertForSequenceClassification"),
|
"BertForSequenceClassification": ("bert", "BertForSequenceClassification"),
|
||||||
"BertForTokenClassification": ("bert", "BertForTokenClassification"),
|
"BertForTokenClassification": ("bert", "BertForTokenClassification"),
|
||||||
"GteNewForSequenceClassification": ("bert_with_rope",
|
"GteNewForSequenceClassification": (
|
||||||
"GteNewForSequenceClassification"),
|
"bert_with_rope",
|
||||||
"ModernBertForSequenceClassification": ("modernbert",
|
"GteNewForSequenceClassification",
|
||||||
"ModernBertForSequenceClassification"),
|
),
|
||||||
"RobertaForSequenceClassification": ("roberta",
|
"ModernBertForSequenceClassification": (
|
||||||
"RobertaForSequenceClassification"),
|
"modernbert",
|
||||||
"XLMRobertaForSequenceClassification": ("roberta",
|
"ModernBertForSequenceClassification",
|
||||||
"RobertaForSequenceClassification"),
|
),
|
||||||
|
"RobertaForSequenceClassification": ("roberta", "RobertaForSequenceClassification"),
|
||||||
|
"XLMRobertaForSequenceClassification": (
|
||||||
|
"roberta",
|
||||||
|
"RobertaForSequenceClassification",
|
||||||
|
),
|
||||||
# [Auto-converted (see adapters.py)]
|
# [Auto-converted (see adapters.py)]
|
||||||
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
|
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
|
||||||
}
|
}
|
||||||
|
|
||||||
_MULTIMODAL_MODELS = {
|
_MULTIMODAL_MODELS = {
|
||||||
# [Decoder-only]
|
# [Decoder-only]
|
||||||
"AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
|
"AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
|
||||||
"AyaVisionForConditionalGeneration": ("aya_vision", "AyaVisionForConditionalGeneration"), # noqa: E501
|
"AyaVisionForConditionalGeneration": (
|
||||||
|
"aya_vision",
|
||||||
|
"AyaVisionForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
|
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
|
||||||
"ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501
|
"ChameleonForConditionalGeneration": (
|
||||||
"Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501
|
"chameleon",
|
||||||
|
"ChameleonForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
|
"Cohere2VisionForConditionalGeneration": (
|
||||||
|
"cohere2_vision",
|
||||||
|
"Cohere2VisionForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
|
"DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
|
||||||
"DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"),
|
"DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"),
|
||||||
"Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"), # noqa: E501
|
"Ernie4_5_VLMoeForConditionalGeneration": (
|
||||||
|
"ernie45_vl",
|
||||||
|
"Ernie4_5_VLMoeForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
|
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
|
||||||
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501
|
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501
|
||||||
"Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501
|
"Gemma3nForConditionalGeneration": (
|
||||||
|
"gemma3n_mm",
|
||||||
|
"Gemma3nForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
|
"GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
|
||||||
"Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501
|
"Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501
|
||||||
"Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"), # noqa: E501
|
"Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"), # noqa: E501
|
||||||
"GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501
|
"GraniteSpeechForConditionalGeneration": (
|
||||||
|
"granite_speech",
|
||||||
|
"GraniteSpeechForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
|
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
|
||||||
"InternVLChatModel": ("internvl", "InternVLChatModel"),
|
"InternVLChatModel": ("internvl", "InternVLChatModel"),
|
||||||
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
|
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
|
||||||
"InternS1ForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501
|
"InternS1ForConditionalGeneration": (
|
||||||
"InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501
|
"interns1",
|
||||||
"Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
|
"InternS1ForConditionalGeneration",
|
||||||
"SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501
|
), # noqa: E501
|
||||||
|
"InternVLForConditionalGeneration": (
|
||||||
|
"interns1",
|
||||||
|
"InternS1ForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
|
"Idefics3ForConditionalGeneration": (
|
||||||
|
"idefics3",
|
||||||
|
"Idefics3ForConditionalGeneration",
|
||||||
|
),
|
||||||
|
"SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"), # noqa: E501
|
||||||
"KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
|
"KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
|
||||||
"KeyeVL1_5ForConditionalGeneration": ("keye_vl1_5", "KeyeVL1_5ForConditionalGeneration"), # noqa: E501
|
"KeyeVL1_5ForConditionalGeneration": (
|
||||||
|
"keye_vl1_5",
|
||||||
|
"KeyeVL1_5ForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
|
"RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
|
||||||
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
|
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
|
||||||
"Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
|
"Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
|
||||||
"Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501
|
"Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501
|
||||||
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
|
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
|
||||||
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
|
"LlavaNextForConditionalGeneration": (
|
||||||
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501
|
"llava_next",
|
||||||
"LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501
|
"LlavaNextForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
|
"LlavaNextVideoForConditionalGeneration": (
|
||||||
|
"llava_next_video",
|
||||||
|
"LlavaNextVideoForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
|
"LlavaOnevisionForConditionalGeneration": (
|
||||||
|
"llava_onevision",
|
||||||
|
"LlavaOnevisionForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501
|
"MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501
|
||||||
"MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
|
"MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
|
||||||
"MiniMaxVL01ForConditionalGeneration": ("minimax_vl_01", "MiniMaxVL01ForConditionalGeneration"), # noqa: E501
|
"MiniMaxVL01ForConditionalGeneration": (
|
||||||
|
"minimax_vl_01",
|
||||||
|
"MiniMaxVL01ForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"MiniCPMO": ("minicpmo", "MiniCPMO"),
|
"MiniCPMO": ("minicpmo", "MiniCPMO"),
|
||||||
"MiniCPMV": ("minicpmv", "MiniCPMV"),
|
"MiniCPMV": ("minicpmv", "MiniCPMV"),
|
||||||
"Mistral3ForConditionalGeneration": ("mistral3", "Mistral3ForConditionalGeneration"), # noqa: E501
|
"Mistral3ForConditionalGeneration": (
|
||||||
|
"mistral3",
|
||||||
|
"Mistral3ForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
|
"MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
|
||||||
"NVLM_D": ("nvlm_d", "NVLM_D_Model"),
|
"NVLM_D": ("nvlm_d", "NVLM_D_Model"),
|
||||||
"Ovis": ("ovis", "Ovis"),
|
"Ovis": ("ovis", "Ovis"),
|
||||||
"Ovis2_5": ("ovis2_5", "Ovis2_5"),
|
"Ovis2_5": ("ovis2_5", "Ovis2_5"),
|
||||||
"PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501
|
"PaliGemmaForConditionalGeneration": (
|
||||||
|
"paligemma",
|
||||||
|
"PaliGemmaForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
|
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
|
||||||
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
|
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
|
||||||
"Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"), # noqa: E501
|
"Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"), # noqa: E501
|
||||||
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501
|
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501
|
||||||
"QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501
|
"QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501
|
||||||
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
|
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
|
||||||
"Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501
|
"Qwen2_5_VLForConditionalGeneration": (
|
||||||
"Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501
|
"qwen2_5_vl",
|
||||||
"Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
|
"Qwen2_5_VLForConditionalGeneration",
|
||||||
"Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
|
), # noqa: E501
|
||||||
|
"Qwen2AudioForConditionalGeneration": (
|
||||||
|
"qwen2_audio",
|
||||||
|
"Qwen2AudioForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
|
"Qwen2_5OmniModel": (
|
||||||
|
"qwen2_5_omni_thinker",
|
||||||
|
"Qwen2_5OmniThinkerForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
|
"Qwen2_5OmniForConditionalGeneration": (
|
||||||
|
"qwen2_5_omni_thinker",
|
||||||
|
"Qwen2_5OmniThinkerForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"), # noqa: E501
|
"Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"), # noqa: E501
|
||||||
"Qwen3VLMoeForConditionalGeneration": ("qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"), # noqa: E501
|
"Qwen3VLMoeForConditionalGeneration": (
|
||||||
|
"qwen3_vl_moe",
|
||||||
|
"Qwen3VLMoeForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
|
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
|
||||||
"Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501
|
"Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501
|
||||||
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
|
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
|
||||||
"Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501
|
"Tarsier2ForConditionalGeneration": (
|
||||||
|
"qwen2_vl",
|
||||||
|
"Tarsier2ForConditionalGeneration",
|
||||||
|
), # noqa: E501
|
||||||
"UltravoxModel": ("ultravox", "UltravoxModel"),
|
"UltravoxModel": ("ultravox", "UltravoxModel"),
|
||||||
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
|
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
|
||||||
# [Encoder-decoder]
|
# [Encoder-decoder]
|
||||||
@ -324,13 +398,27 @@ _TRANSFORMERS_BACKEND_MODELS = {
|
|||||||
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
|
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
|
||||||
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
|
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
|
||||||
"TransformersMoEForCausalLM": ("transformers_moe", "TransformersMoEForCausalLM"), # noqa: E501
|
"TransformersMoEForCausalLM": ("transformers_moe", "TransformersMoEForCausalLM"), # noqa: E501
|
||||||
"TransformersMoEForMultimodalLM": ("transformers_moe", "TransformersMoEForMultimodalLM"), # noqa: E501
|
"TransformersMoEForMultimodalLM": (
|
||||||
"TransformersEmbeddingModel": ("transformers_pooling", "TransformersEmbeddingModel"), # noqa: E501
|
"transformers_moe",
|
||||||
"TransformersForSequenceClassification": ("transformers_pooling", "TransformersForSequenceClassification"), # noqa: E501
|
"TransformersMoEForMultimodalLM",
|
||||||
"TransformersMoEForSequenceClassification": ("transformers_pooling", "TransformersMoEForSequenceClassification"), # noqa: E501
|
), # noqa: E501
|
||||||
"TransformersMoEEmbeddingModel": ("transformers_pooling", "TransformersMoEEmbeddingModel"), # noqa: E501
|
"TransformersEmbeddingModel": (
|
||||||
|
"transformers_pooling",
|
||||||
|
"TransformersEmbeddingModel",
|
||||||
|
), # noqa: E501
|
||||||
|
"TransformersForSequenceClassification": (
|
||||||
|
"transformers_pooling",
|
||||||
|
"TransformersForSequenceClassification",
|
||||||
|
), # noqa: E501
|
||||||
|
"TransformersMoEForSequenceClassification": (
|
||||||
|
"transformers_pooling",
|
||||||
|
"TransformersMoEForSequenceClassification",
|
||||||
|
), # noqa: E501
|
||||||
|
"TransformersMoEEmbeddingModel": (
|
||||||
|
"transformers_pooling",
|
||||||
|
"TransformersMoEEmbeddingModel",
|
||||||
|
), # noqa: E501
|
||||||
}
|
}
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
_VLLM_MODELS = {
|
_VLLM_MODELS = {
|
||||||
**_TEXT_GENERATION_MODELS,
|
**_TEXT_GENERATION_MODELS,
|
||||||
|
|||||||
@ -8,13 +8,10 @@ from transformers import SmolVLMProcessor
|
|||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from .idefics3 import Idefics3DummyInputsBuilder as SmolVLMDummyInputsBuilder
|
from .idefics3 import Idefics3DummyInputsBuilder as SmolVLMDummyInputsBuilder
|
||||||
from .idefics3 import Idefics3ForConditionalGeneration, Idefics3ProcessingInfo
|
from .idefics3 import Idefics3ForConditionalGeneration, Idefics3ProcessingInfo
|
||||||
from .idefics3 import Idefics3MultiModalProcessor as SmolVLMMultiModalProcessor
|
from .idefics3 import Idefics3MultiModalProcessor as SmolVLMMultiModalProcessor
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
|
|
||||||
class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
|
class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
|
||||||
def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:
|
def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:
|
||||||
|
|||||||
@ -32,11 +32,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
|
|||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
from vllm.model_executor.models import SupportsPP
|
from vllm.model_executor.models import SupportsPP
|
||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.models.whisper import WhisperEncoder
|
from vllm.model_executor.models.whisper import WhisperEncoder
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (
|
from vllm.multimodal.inputs import (
|
||||||
MultiModalDataDict,
|
MultiModalDataDict,
|
||||||
|
|||||||
@ -28,7 +28,6 @@ def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optiona
|
|||||||
return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
|
return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
|
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
|
||||||
"blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
|
"blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
|
||||||
"clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
|
"clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
|
||||||
@ -39,7 +38,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
|
|||||||
"paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
|
"paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
|
||||||
"qwen": _get_qwen_chat_template_fallback,
|
"qwen": _get_qwen_chat_template_fallback,
|
||||||
}
|
}
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
|
|
||||||
def register_chat_template_fallback_path(
|
def register_chat_template_fallback_path(
|
||||||
|
|||||||
@ -1,12 +1,11 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
# Copied from
|
# Copied from
|
||||||
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
|
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
|
||||||
""" Arctic model configuration"""
|
"""Arctic model configuration"""
|
||||||
|
|
||||||
from dataclasses import asdict, dataclass
|
from dataclasses import asdict, dataclass
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
# Adapted from
|
# Adapted from
|
||||||
# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
|
# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
|
||||||
@ -16,7 +15,7 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module
|
|||||||
|
|
||||||
|
|
||||||
class Nemotron_Nano_VL_Config(PretrainedConfig):
|
class Nemotron_Nano_VL_Config(PretrainedConfig):
|
||||||
model_type = 'Llama_Nemotron_Nano_VL'
|
model_type = "Llama_Nemotron_Nano_VL"
|
||||||
is_composition = True
|
is_composition = True
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -26,17 +25,22 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
|
|||||||
force_image_size=None,
|
force_image_size=None,
|
||||||
downsample_ratio=0.5,
|
downsample_ratio=0.5,
|
||||||
template=None,
|
template=None,
|
||||||
ps_version='v1',
|
ps_version="v1",
|
||||||
image_tag_type="internvl",
|
image_tag_type="internvl",
|
||||||
projector_hidden_size=4096,
|
projector_hidden_size=4096,
|
||||||
vit_hidden_size=1280,
|
vit_hidden_size=1280,
|
||||||
**kwargs
|
**kwargs,
|
||||||
):
|
):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
if vision_config is not None:
|
if vision_config is not None:
|
||||||
assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
|
assert (
|
||||||
vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
|
"auto_map" in vision_config
|
||||||
|
and "AutoConfig" in vision_config["auto_map"]
|
||||||
|
)
|
||||||
|
vision_auto_config = get_class_from_dynamic_module(
|
||||||
|
*vision_config["auto_map"]["AutoConfig"].split("--")[::-1]
|
||||||
|
)
|
||||||
self.vision_config = vision_auto_config(**vision_config)
|
self.vision_config = vision_auto_config(**vision_config)
|
||||||
else:
|
else:
|
||||||
self.vision_config = PretrainedConfig()
|
self.vision_config = PretrainedConfig()
|
||||||
@ -51,6 +55,6 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
|
|||||||
self.downsample_ratio = downsample_ratio
|
self.downsample_ratio = downsample_ratio
|
||||||
self.template = template # TODO move out of here and into the tokenizer
|
self.template = template # TODO move out of here and into the tokenizer
|
||||||
self.ps_version = ps_version # Pixel shuffle version
|
self.ps_version = ps_version # Pixel shuffle version
|
||||||
self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
|
self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
|
||||||
self.projector_hidden_size = projector_hidden_size
|
self.projector_hidden_size = projector_hidden_size
|
||||||
self.vit_hidden_size = vit_hidden_size
|
self.vit_hidden_size = vit_hidden_size
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
|
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
|
||||||
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
|
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
|
||||||
@ -70,34 +69,37 @@ class AIMv2Config(PretrainedConfig):
|
|||||||
# Visual Tokenizer Configuration
|
# Visual Tokenizer Configuration
|
||||||
# ----------------------------------------------------------------------
|
# ----------------------------------------------------------------------
|
||||||
class BaseVisualTokenizerConfig(PretrainedConfig):
|
class BaseVisualTokenizerConfig(PretrainedConfig):
|
||||||
|
def __init__(
|
||||||
def __init__(self,
|
self,
|
||||||
vocab_size=16384,
|
vocab_size=16384,
|
||||||
tokenize_function="softmax",
|
tokenize_function="softmax",
|
||||||
tau=1.0,
|
tau=1.0,
|
||||||
depths=None,
|
depths=None,
|
||||||
drop_cls_token=False,
|
drop_cls_token=False,
|
||||||
backbone_config: Optional[Union[PretrainedConfig,
|
backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
|
||||||
dict]] = None,
|
hidden_stride: int = 1,
|
||||||
hidden_stride: int = 1,
|
**kwargs,
|
||||||
**kwargs):
|
):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.tokenize_function = tokenize_function
|
self.tokenize_function = tokenize_function
|
||||||
self.tau = tau
|
self.tau = tau
|
||||||
if isinstance(depths, str):
|
if isinstance(depths, str):
|
||||||
depths = [int(x) for x in depths.split('|')]
|
depths = [int(x) for x in depths.split("|")]
|
||||||
self.depths = depths
|
self.depths = depths
|
||||||
self.backbone_kwargs = dict[str, Any]()
|
self.backbone_kwargs = dict[str, Any]()
|
||||||
self.drop_cls_token = drop_cls_token
|
self.drop_cls_token = drop_cls_token
|
||||||
if backbone_config is not None:
|
if backbone_config is not None:
|
||||||
assert isinstance(backbone_config, (PretrainedConfig, dict)), \
|
assert isinstance(backbone_config, (PretrainedConfig, dict)), (
|
||||||
f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
|
f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
|
||||||
|
)
|
||||||
if not isinstance(backbone_config, PretrainedConfig):
|
if not isinstance(backbone_config, PretrainedConfig):
|
||||||
model_type = backbone_config['model_type']
|
model_type = backbone_config["model_type"]
|
||||||
if model_type != "aimv2":
|
if model_type != "aimv2":
|
||||||
backbone_config.pop('model_type')
|
backbone_config.pop("model_type")
|
||||||
backbone_config = AutoConfig.for_model(model_type, **backbone_config)
|
backbone_config = AutoConfig.for_model(
|
||||||
|
model_type, **backbone_config
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
backbone_config = AIMv2Config(**backbone_config)
|
backbone_config = AIMv2Config(**backbone_config)
|
||||||
self.backbone_config = backbone_config
|
self.backbone_config = backbone_config
|
||||||
@ -113,7 +115,7 @@ class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
|
|||||||
self.drop_cls_token = False
|
self.drop_cls_token = False
|
||||||
if self.depths:
|
if self.depths:
|
||||||
assert len(self.depths) == 1
|
assert len(self.depths) == 1
|
||||||
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
|
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
|
||||||
|
|
||||||
|
|
||||||
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
|
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
|
||||||
@ -125,7 +127,7 @@ class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
|
|||||||
self.drop_cls_token = False
|
self.drop_cls_token = False
|
||||||
if self.depths:
|
if self.depths:
|
||||||
assert len(self.depths) == 1
|
assert len(self.depths) == 1
|
||||||
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
|
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
|
||||||
|
|
||||||
|
|
||||||
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
|
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
|
||||||
@ -138,35 +140,39 @@ AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
|
|||||||
class OvisConfig(PretrainedConfig):
|
class OvisConfig(PretrainedConfig):
|
||||||
model_type = "ovis"
|
model_type = "ovis"
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(
|
||||||
llm_config: Optional[Union[PretrainedConfig, dict]] = None,
|
self,
|
||||||
visual_tokenizer_config: Optional[Union[PretrainedConfig,
|
llm_config: Optional[Union[PretrainedConfig, dict]] = None,
|
||||||
dict]] = None,
|
visual_tokenizer_config: Optional[Union[PretrainedConfig, dict]] = None,
|
||||||
multimodal_max_length=8192,
|
multimodal_max_length=8192,
|
||||||
hidden_size=None,
|
hidden_size=None,
|
||||||
conversation_formatter_class=None,
|
conversation_formatter_class=None,
|
||||||
llm_attn_implementation=None,
|
llm_attn_implementation=None,
|
||||||
disable_tie_weight=False,
|
disable_tie_weight=False,
|
||||||
**kwargs):
|
**kwargs,
|
||||||
|
):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
if llm_config is not None:
|
if llm_config is not None:
|
||||||
assert isinstance(llm_config, (PretrainedConfig, dict)), \
|
assert isinstance(llm_config, (PretrainedConfig, dict)), (
|
||||||
f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
|
f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
|
||||||
|
)
|
||||||
if not isinstance(llm_config, PretrainedConfig):
|
if not isinstance(llm_config, PretrainedConfig):
|
||||||
model_type = llm_config['model_type']
|
model_type = llm_config["model_type"]
|
||||||
llm_config.pop('model_type')
|
llm_config.pop("model_type")
|
||||||
llm_config = AutoConfig.for_model(model_type, **llm_config)
|
llm_config = AutoConfig.for_model(model_type, **llm_config)
|
||||||
|
|
||||||
# map llm_config to text_config
|
# map llm_config to text_config
|
||||||
self.text_config = llm_config
|
self.text_config = llm_config
|
||||||
if visual_tokenizer_config is not None:
|
if visual_tokenizer_config is not None:
|
||||||
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
|
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
|
||||||
f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
|
f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
|
||||||
|
)
|
||||||
if not isinstance(visual_tokenizer_config, PretrainedConfig):
|
if not isinstance(visual_tokenizer_config, PretrainedConfig):
|
||||||
model_type = visual_tokenizer_config['model_type']
|
model_type = visual_tokenizer_config["model_type"]
|
||||||
visual_tokenizer_config.pop('model_type')
|
visual_tokenizer_config.pop("model_type")
|
||||||
visual_tokenizer_config = AutoConfig.for_model(
|
visual_tokenizer_config = AutoConfig.for_model(
|
||||||
model_type, **visual_tokenizer_config)
|
model_type, **visual_tokenizer_config
|
||||||
|
)
|
||||||
|
|
||||||
self.visual_tokenizer_config = visual_tokenizer_config
|
self.visual_tokenizer_config = visual_tokenizer_config
|
||||||
self.multimodal_max_length = multimodal_max_length
|
self.multimodal_max_length = multimodal_max_length
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
|
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
|
||||||
@ -35,11 +34,12 @@ from transformers.processing_utils import ProcessorMixin
|
|||||||
|
|
||||||
|
|
||||||
class ImageTransform:
|
class ImageTransform:
|
||||||
|
def __init__(
|
||||||
def __init__(self,
|
self,
|
||||||
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||||
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||||
normalize: bool = True):
|
normalize: bool = True,
|
||||||
|
):
|
||||||
self.mean = mean
|
self.mean = mean
|
||||||
self.std = std
|
self.std = std
|
||||||
self.normalize = normalize
|
self.normalize = normalize
|
||||||
@ -77,7 +77,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
|||||||
ignore_id: int = -100,
|
ignore_id: int = -100,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
|
|
||||||
self.candidate_resolutions = candidate_resolutions
|
self.candidate_resolutions = candidate_resolutions
|
||||||
self.image_size = candidate_resolutions[0][0]
|
self.image_size = candidate_resolutions[0][0]
|
||||||
self.patch_size = patch_size
|
self.patch_size = patch_size
|
||||||
@ -86,13 +85,15 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
|||||||
self.normalize = normalize
|
self.normalize = normalize
|
||||||
self.downsample_ratio = downsample_ratio
|
self.downsample_ratio = downsample_ratio
|
||||||
|
|
||||||
self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
|
self.image_transform = ImageTransform(
|
||||||
|
mean=image_mean, std=image_std, normalize=normalize
|
||||||
|
)
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.tokenizer.padding_side = 'left' # must set this,padding side with make a difference in batch inference
|
self.tokenizer.padding_side = "left" # must set this,padding side with make a difference in batch inference
|
||||||
|
|
||||||
# add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
|
# add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
|
||||||
if tokenizer.pad_token is None:
|
if tokenizer.pad_token is None:
|
||||||
self.tokenizer.add_special_tokens({'pad_token': pad_token})
|
self.tokenizer.add_special_tokens({"pad_token": pad_token})
|
||||||
|
|
||||||
# add image token
|
# add image token
|
||||||
image_token_id = self.tokenizer.vocab.get(image_token)
|
image_token_id = self.tokenizer.vocab.get(image_token)
|
||||||
@ -104,7 +105,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
|||||||
|
|
||||||
# add five special tokens for grounding-related tasks
|
# add five special tokens for grounding-related tasks
|
||||||
# <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
|
# <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
|
||||||
special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
|
special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
|
||||||
special_tokens_dict = {"additional_special_tokens": special_tokens}
|
special_tokens_dict = {"additional_special_tokens": special_tokens}
|
||||||
self.tokenizer.add_special_tokens(special_tokens_dict)
|
self.tokenizer.add_special_tokens(special_tokens_dict)
|
||||||
|
|
||||||
@ -134,15 +135,19 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
|||||||
|
|
||||||
for width, height in self.candidate_resolutions:
|
for width, height in self.candidate_resolutions:
|
||||||
scale = min(width / original_width, height / original_height)
|
scale = min(width / original_width, height / original_height)
|
||||||
downscaled_width, downscaled_height = int(
|
downscaled_width, downscaled_height = (
|
||||||
original_width * scale), int(original_height * scale)
|
int(original_width * scale),
|
||||||
effective_resolution = min(downscaled_width * downscaled_height,
|
int(original_height * scale),
|
||||||
original_width * original_height)
|
)
|
||||||
|
effective_resolution = min(
|
||||||
|
downscaled_width * downscaled_height, original_width * original_height
|
||||||
|
)
|
||||||
wasted_resolution = (width * height) - effective_resolution
|
wasted_resolution = (width * height) - effective_resolution
|
||||||
|
|
||||||
if effective_resolution > max_effective_resolution or (
|
if effective_resolution > max_effective_resolution or (
|
||||||
effective_resolution == max_effective_resolution
|
effective_resolution == max_effective_resolution
|
||||||
and wasted_resolution < min_wasted_resolution):
|
and wasted_resolution < min_wasted_resolution
|
||||||
|
):
|
||||||
max_effective_resolution = effective_resolution
|
max_effective_resolution = effective_resolution
|
||||||
min_wasted_resolution = wasted_resolution
|
min_wasted_resolution = wasted_resolution
|
||||||
best_fit = (width, height)
|
best_fit = (width, height)
|
||||||
@ -198,12 +203,20 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
|||||||
- num_image_tokens (list[int]): the number of image tokens
|
- num_image_tokens (list[int]): the number of image tokens
|
||||||
"""
|
"""
|
||||||
|
|
||||||
assert (prompt is not None and images is not None
|
assert prompt is not None and images is not None, (
|
||||||
), "prompt and images must be used at the same time."
|
"prompt and images must be used at the same time."
|
||||||
|
)
|
||||||
|
|
||||||
sft_format = prompt
|
sft_format = prompt
|
||||||
tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
|
(
|
||||||
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
|
tokenized_str,
|
||||||
|
images_list,
|
||||||
|
images_seq_mask,
|
||||||
|
images_spatial_crop,
|
||||||
|
num_image_tokens,
|
||||||
|
) = self.tokenize_with_images(
|
||||||
|
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
|
||||||
|
)
|
||||||
masked_tokenized_str = []
|
masked_tokenized_str = []
|
||||||
for token_index in tokenized_str:
|
for token_index in tokenized_str:
|
||||||
if token_index != self.image_token_id:
|
if token_index != self.image_token_id:
|
||||||
@ -211,17 +224,21 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
|||||||
else:
|
else:
|
||||||
masked_tokenized_str.append(self.ignore_id)
|
masked_tokenized_str.append(self.ignore_id)
|
||||||
|
|
||||||
assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
|
assert (
|
||||||
(f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
|
len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
|
||||||
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
|
), (
|
||||||
|
f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
|
||||||
|
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
|
||||||
|
)
|
||||||
|
|
||||||
input_ids = torch.LongTensor(tokenized_str)
|
input_ids = torch.LongTensor(tokenized_str)
|
||||||
target_ids = torch.LongTensor(masked_tokenized_str)
|
target_ids = torch.LongTensor(masked_tokenized_str)
|
||||||
images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
|
images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
|
||||||
|
|
||||||
# set input_ids < 0 | input_ids == self.image_token_id as ignore_id
|
# set input_ids < 0 | input_ids == self.image_token_id as ignore_id
|
||||||
target_ids[(input_ids < 0) |
|
target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
|
||||||
(input_ids == self.image_token_id)] = self.ignore_id
|
self.ignore_id
|
||||||
|
)
|
||||||
input_ids[input_ids < 0] = self.pad_id
|
input_ids[input_ids < 0] = self.pad_id
|
||||||
|
|
||||||
if inference_mode:
|
if inference_mode:
|
||||||
@ -311,30 +328,50 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
|||||||
best_width, best_height = self.image_size, self.image_size
|
best_width, best_height = self.image_size, self.image_size
|
||||||
|
|
||||||
"""process the global view"""
|
"""process the global view"""
|
||||||
global_view = ImageOps.pad(image, (self.image_size, self.image_size),
|
global_view = ImageOps.pad(
|
||||||
color=tuple(int(x * 255) for x in self.image_transform.mean))
|
image,
|
||||||
|
(self.image_size, self.image_size),
|
||||||
|
color=tuple(int(x * 255) for x in self.image_transform.mean),
|
||||||
|
)
|
||||||
images_list.append(self.image_transform(global_view))
|
images_list.append(self.image_transform(global_view))
|
||||||
|
|
||||||
"""process the local views"""
|
"""process the local views"""
|
||||||
local_view = ImageOps.pad(image, (best_width, best_height),
|
local_view = ImageOps.pad(
|
||||||
color=tuple(int(x * 255) for x in self.image_transform.mean))
|
image,
|
||||||
|
(best_width, best_height),
|
||||||
|
color=tuple(int(x * 255) for x in self.image_transform.mean),
|
||||||
|
)
|
||||||
for i in range(0, best_height, self.image_size):
|
for i in range(0, best_height, self.image_size):
|
||||||
for j in range(0, best_width, self.image_size):
|
for j in range(0, best_width, self.image_size):
|
||||||
images_list.append(
|
images_list.append(
|
||||||
self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
|
self.image_transform(
|
||||||
|
local_view.crop(
|
||||||
|
(j, i, j + self.image_size, i + self.image_size)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
"""record height / width crop num"""
|
"""record height / width crop num"""
|
||||||
num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
|
num_width_tiles, num_height_tiles = (
|
||||||
|
best_width // self.image_size,
|
||||||
|
best_height // self.image_size,
|
||||||
|
)
|
||||||
images_spatial_crop.append([num_width_tiles, num_height_tiles])
|
images_spatial_crop.append([num_width_tiles, num_height_tiles])
|
||||||
|
|
||||||
"""add image tokens"""
|
"""add image tokens"""
|
||||||
h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
|
h = w = math.ceil(
|
||||||
|
(self.image_size // self.patch_size) / self.downsample_ratio
|
||||||
|
)
|
||||||
# global views tokens h * (w + 1), 1 is for line separator
|
# global views tokens h * (w + 1), 1 is for line separator
|
||||||
tokenized_image = [self.image_token_id] * h * (w + 1)
|
tokenized_image = [self.image_token_id] * h * (w + 1)
|
||||||
# add a separator between global and local views
|
# add a separator between global and local views
|
||||||
tokenized_image += [self.image_token_id]
|
tokenized_image += [self.image_token_id]
|
||||||
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
|
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
|
||||||
tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
|
tokenized_image += (
|
||||||
|
[self.image_token_id]
|
||||||
|
* (num_height_tiles * h)
|
||||||
|
* (num_width_tiles * w + 1)
|
||||||
|
)
|
||||||
|
|
||||||
tokenized_str += tokenized_image
|
tokenized_str += tokenized_image
|
||||||
images_seq_mask += [True] * len(tokenized_image)
|
images_seq_mask += [True] * len(tokenized_image)
|
||||||
@ -353,10 +390,17 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
|||||||
tokenized_str = tokenized_str + [self.eos_id]
|
tokenized_str = tokenized_str + [self.eos_id]
|
||||||
images_seq_mask = images_seq_mask + [False]
|
images_seq_mask = images_seq_mask + [False]
|
||||||
|
|
||||||
assert len(tokenized_str) == len(
|
assert len(tokenized_str) == len(images_seq_mask), (
|
||||||
images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
|
f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
|
||||||
|
)
|
||||||
|
|
||||||
return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
|
return (
|
||||||
|
tokenized_str,
|
||||||
|
images_list,
|
||||||
|
images_seq_mask,
|
||||||
|
images_spatial_crop,
|
||||||
|
num_image_tokens,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
|
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
|
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
|
||||||
@ -35,23 +34,24 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
|||||||
|
|
||||||
from vllm.multimodal.image import convert_image_mode
|
from vllm.multimodal.image import convert_image_mode
|
||||||
|
|
||||||
__all__ = ['OvisProcessor']
|
__all__ = ["OvisProcessor"]
|
||||||
IGNORE_ID = -100
|
IGNORE_ID = -100
|
||||||
|
|
||||||
class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
|
|
||||||
|
class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
|
||||||
_defaults = {
|
_defaults = {
|
||||||
"text_kwargs": {
|
"text_kwargs": {
|
||||||
"padding": False,
|
"padding": False,
|
||||||
},
|
},
|
||||||
"images_kwargs": {
|
"images_kwargs": {
|
||||||
'max_partition':9,
|
"max_partition": 9,
|
||||||
'covering_threshold':0.9,
|
"covering_threshold": 0.9,
|
||||||
'convert_to_rgb':True,
|
"convert_to_rgb": True,
|
||||||
'return_tensors':'pt'},
|
"return_tensors": "pt",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class OvisProcessor(ProcessorMixin):
|
class OvisProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
|
Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
|
||||||
@ -97,14 +97,16 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
"image_col_sep": -303,
|
"image_col_sep": -303,
|
||||||
"image_row_sep": -304,
|
"image_row_sep": -304,
|
||||||
"image_end": -305,
|
"image_end": -305,
|
||||||
'image_pad': image_pad_token_id,
|
"image_pad": image_pad_token_id,
|
||||||
}
|
}
|
||||||
return extra_special_tokens
|
return extra_special_tokens
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
images: ImageInput = None,
|
images: ImageInput = None,
|
||||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
text: Union[
|
||||||
|
TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
|
||||||
|
] = None,
|
||||||
**kwargs: Unpack[OvisProcessorKwargs],
|
**kwargs: Unpack[OvisProcessorKwargs],
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""
|
"""
|
||||||
@ -169,7 +171,6 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
# Process text input
|
# Process text input
|
||||||
if text is not None:
|
if text is not None:
|
||||||
|
|
||||||
if not isinstance(text, list):
|
if not isinstance(text, list):
|
||||||
text = [text]
|
text = [text]
|
||||||
|
|
||||||
@ -178,7 +179,10 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
replaced_ids_list = []
|
replaced_ids_list = []
|
||||||
idx = 0
|
idx = 0
|
||||||
for ids_tensor in tokenized_batched_text:
|
for ids_tensor in tokenized_batched_text:
|
||||||
if image_token_id in ids_tensor and "image_placeholders" in image_features:
|
if (
|
||||||
|
image_token_id in ids_tensor
|
||||||
|
and "image_placeholders" in image_features
|
||||||
|
):
|
||||||
if idx < len(image_features["image_placeholders"]):
|
if idx < len(image_features["image_placeholders"]):
|
||||||
# Converts in list for ease of use
|
# Converts in list for ease of use
|
||||||
ids_list = ids_tensor.tolist()
|
ids_list = ids_tensor.tolist()
|
||||||
@ -188,7 +192,9 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
# replace placeholders
|
# replace placeholders
|
||||||
for i, token_id in enumerate(ids_list):
|
for i, token_id in enumerate(ids_list):
|
||||||
if token_id == image_token_id:
|
if token_id == image_token_id:
|
||||||
placeholder_ids = image_features["image_placeholders"][idx]
|
placeholder_ids = image_features["image_placeholders"][
|
||||||
|
idx
|
||||||
|
]
|
||||||
new_ids.extend(placeholder_ids)
|
new_ids.extend(placeholder_ids)
|
||||||
idx += 1
|
idx += 1
|
||||||
else:
|
else:
|
||||||
@ -198,7 +204,8 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
ids_tensor = torch.tensor(new_ids, dtype=torch.long)
|
ids_tensor = torch.tensor(new_ids, dtype=torch.long)
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
'Mismatch between the images you provided and the number of placeholder present in the text')
|
"Mismatch between the images you provided and the number of placeholder present in the text"
|
||||||
|
)
|
||||||
|
|
||||||
replaced_ids_list.append(ids_tensor)
|
replaced_ids_list.append(ids_tensor)
|
||||||
|
|
||||||
@ -217,7 +224,7 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
# Add image features if present
|
# Add image features if present
|
||||||
if image_features:
|
if image_features:
|
||||||
output["pixel_values"] = processed_images
|
output["pixel_values"] = processed_images
|
||||||
output['grids'] = grids
|
output["grids"] = grids
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
@ -227,8 +234,10 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
|
def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
|
||||||
batch_token_ids = []
|
batch_token_ids = []
|
||||||
for text in text_list:
|
for text in text_list:
|
||||||
text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
|
text_chunks = [
|
||||||
text.split(self.image_token)]
|
self.tokenizer(chunk, add_special_tokens=False).input_ids
|
||||||
|
for chunk in text.split(self.image_token)
|
||||||
|
]
|
||||||
token_ids = []
|
token_ids = []
|
||||||
num_chuck = len(text_chunks)
|
num_chuck = len(text_chunks)
|
||||||
for i, chunk in enumerate(text_chunks):
|
for i, chunk in enumerate(text_chunks):
|
||||||
@ -240,50 +249,60 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
def get_image_size(self):
|
def get_image_size(self):
|
||||||
size = self.image_processor.size
|
size = self.image_processor.size
|
||||||
if 'shortest_edge' in size:
|
if "shortest_edge" in size:
|
||||||
width = height = size['shortest_edge']
|
width = height = size["shortest_edge"]
|
||||||
elif "height" in size and "width" in size:
|
elif "height" in size and "width" in size:
|
||||||
width = size['width']
|
width = size["width"]
|
||||||
height = size['height']
|
height = size["height"]
|
||||||
else:
|
else:
|
||||||
raise ValueError( "Can't parse image size from image_processor config.")
|
raise ValueError("Can't parse image size from image_processor config.")
|
||||||
return height, width
|
return height, width
|
||||||
|
|
||||||
def get_token_value(self, tok):
|
def get_token_value(self, tok):
|
||||||
return self.extra_special_tokens[tok]
|
return self.extra_special_tokens[tok]
|
||||||
|
|
||||||
def construct_image_indicators(self, grid):
|
def construct_image_indicators(self, grid):
|
||||||
image_placeholders = [self.get_token_value('image_start'),
|
image_placeholders = [
|
||||||
self.get_token_value('image_atom'),
|
self.get_token_value("image_start"),
|
||||||
self.get_token_value('image_prefix')]
|
self.get_token_value("image_atom"),
|
||||||
|
self.get_token_value("image_prefix"),
|
||||||
|
]
|
||||||
if grid[0] * grid[1] > 1:
|
if grid[0] * grid[1] > 1:
|
||||||
for r in range(grid[0]):
|
for r in range(grid[0]):
|
||||||
for c in range(grid[1]):
|
for c in range(grid[1]):
|
||||||
image_placeholders.append(self.get_token_value('image_atom') )
|
image_placeholders.append(self.get_token_value("image_atom"))
|
||||||
if c < grid[1] - 1:
|
if c < grid[1] - 1:
|
||||||
image_placeholders.append(self.get_token_value('image_col_sep'))
|
image_placeholders.append(self.get_token_value("image_col_sep"))
|
||||||
if r < grid[0] - 1:
|
if r < grid[0] - 1:
|
||||||
image_placeholders.append(self.get_token_value('image_row_sep'))
|
image_placeholders.append(self.get_token_value("image_row_sep"))
|
||||||
image_placeholders.append(self.get_token_value('image_end'))
|
image_placeholders.append(self.get_token_value("image_end"))
|
||||||
return image_placeholders
|
return image_placeholders
|
||||||
|
|
||||||
def construct_image_placeholders(self, grid):
|
def construct_image_placeholders(self, grid):
|
||||||
|
|
||||||
image_placeholders = self.construct_image_indicators(grid)
|
image_placeholders = self.construct_image_indicators(grid)
|
||||||
|
|
||||||
image_atom_token_id = self.get_token_value('image_atom')
|
image_atom_token_id = self.get_token_value("image_atom")
|
||||||
# Extract the padding token ID from tokenizer
|
# Extract the padding token ID from tokenizer
|
||||||
image_padding_token_id = self.get_token_value('image_pad')
|
image_padding_token_id = self.get_token_value("image_pad")
|
||||||
|
|
||||||
# Create a new list with padding tokens inserted
|
# Create a new list with padding tokens inserted
|
||||||
padded_placeholder_tokens = []
|
padded_placeholder_tokens = []
|
||||||
for token in image_placeholders:
|
for token in image_placeholders:
|
||||||
padded_placeholder_tokens.append(image_padding_token_id)
|
padded_placeholder_tokens.append(image_padding_token_id)
|
||||||
if token == image_atom_token_id:
|
if token == image_atom_token_id:
|
||||||
padded_placeholder_tokens.extend([image_padding_token_id] * self.image_segment_len)
|
padded_placeholder_tokens.extend(
|
||||||
|
[image_padding_token_id] * self.image_segment_len
|
||||||
|
)
|
||||||
return padded_placeholder_tokens
|
return padded_placeholder_tokens
|
||||||
|
|
||||||
def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
|
def preprocess_image(
|
||||||
|
self,
|
||||||
|
image: PIL.Image.Image,
|
||||||
|
max_partition,
|
||||||
|
covering_threshold,
|
||||||
|
convert_to_rgb,
|
||||||
|
return_tensors,
|
||||||
|
):
|
||||||
def _preprocess(img: PIL.Image.Image, side):
|
def _preprocess(img: PIL.Image.Image, side):
|
||||||
# first resize and preprocess
|
# first resize and preprocess
|
||||||
w, h = img.size
|
w, h = img.size
|
||||||
@ -296,19 +315,27 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
new_height = side
|
new_height = side
|
||||||
new_width = int(w / h * new_height)
|
new_width = int(w / h * new_height)
|
||||||
new_size = dict(height=new_height, width=new_width)
|
new_size = dict(height=new_height, width=new_width)
|
||||||
pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
|
pixel_values = self.image_processor.preprocess(
|
||||||
|
img, size=new_size, return_tensors=return_tensors
|
||||||
|
)["pixel_values"]
|
||||||
|
|
||||||
# then pad to square
|
# then pad to square
|
||||||
square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
|
square_values = torch.zeros(
|
||||||
|
[1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
|
||||||
|
)
|
||||||
new_height, new_width = pixel_values.shape[2:]
|
new_height, new_width = pixel_values.shape[2:]
|
||||||
if new_height == new_width:
|
if new_height == new_width:
|
||||||
square_values[:, :, :, :] = pixel_values
|
square_values[:, :, :, :] = pixel_values
|
||||||
elif new_height > new_width:
|
elif new_height > new_width:
|
||||||
from_index = (side - new_width) // 2
|
from_index = (side - new_width) // 2
|
||||||
square_values[:, :, :, from_index:from_index + new_width] = pixel_values
|
square_values[:, :, :, from_index : from_index + new_width] = (
|
||||||
|
pixel_values
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
from_index = (side - new_height) // 2
|
from_index = (side - new_height) // 2
|
||||||
square_values[:, :, from_index:from_index + new_height, :] = pixel_values
|
square_values[:, :, from_index : from_index + new_height, :] = (
|
||||||
|
pixel_values
|
||||||
|
)
|
||||||
|
|
||||||
return square_values
|
return square_values
|
||||||
|
|
||||||
@ -350,7 +377,9 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
good_grids = []
|
good_grids = []
|
||||||
for grid in candidate_grids:
|
for grid in candidate_grids:
|
||||||
partition = _partition(img, grid)
|
partition = _partition(img, grid)
|
||||||
covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
|
covering_ratio = (
|
||||||
|
sum([_covering_area(*p, side) for p in partition]) / img_area
|
||||||
|
)
|
||||||
assert covering_ratio <= 1.0
|
assert covering_ratio <= 1.0
|
||||||
all_grids.append((grid, covering_ratio))
|
all_grids.append((grid, covering_ratio))
|
||||||
if covering_ratio > covering_threshold:
|
if covering_ratio > covering_threshold:
|
||||||
@ -358,18 +387,19 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
if len(good_grids) > 0:
|
if len(good_grids) > 0:
|
||||||
# pick the good partition with minimum #sub_images and break the tie using covering_ratio
|
# pick the good partition with minimum #sub_images and break the tie using covering_ratio
|
||||||
return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
|
return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
|
||||||
|
0
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
# pick the partition with maximum covering_ratio and break the tie using #sub_images
|
# pick the partition with maximum covering_ratio and break the tie using #sub_images
|
||||||
return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
|
return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
|
||||||
|
|
||||||
if convert_to_rgb:
|
if convert_to_rgb:
|
||||||
image = convert_image_mode(image, 'RGB')
|
image = convert_image_mode(image, "RGB")
|
||||||
|
|
||||||
|
|
||||||
sides = self.get_image_size()
|
sides = self.get_image_size()
|
||||||
if sides[0] != sides[1]:
|
if sides[0] != sides[1]:
|
||||||
raise ValueError('get_image_size() returns non-square size')
|
raise ValueError("get_image_size() returns non-square size")
|
||||||
side = sides[0]
|
side = sides[0]
|
||||||
grid = _get_best_grid(image, side)
|
grid = _get_best_grid(image, side)
|
||||||
partition = _partition(image, grid)
|
partition = _partition(image, grid)
|
||||||
@ -405,14 +435,18 @@ class OvisProcessor(ProcessorMixin):
|
|||||||
`list[str]`: The decoded text.
|
`list[str]`: The decoded text.
|
||||||
"""
|
"""
|
||||||
return self.tokenizer.batch_decode(
|
return self.tokenizer.batch_decode(
|
||||||
generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
generated_outputs,
|
||||||
|
skip_special_tokens=True,
|
||||||
|
clean_up_tokenization_spaces=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def model_input_names(self):
|
def model_input_names(self):
|
||||||
tokenizer_input_names = self.tokenizer.model_input_names
|
tokenizer_input_names = self.tokenizer.model_input_names
|
||||||
image_processor_input_names = self.image_processor.model_input_names
|
image_processor_input_names = self.image_processor.model_input_names
|
||||||
names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
names_from_processor = list(
|
||||||
|
dict.fromkeys(tokenizer_input_names + image_processor_input_names)
|
||||||
|
)
|
||||||
return names_from_processor + ["second_per_grid_ts"]
|
return names_from_processor + ["second_per_grid_ts"]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -40,9 +40,6 @@ from vllm.utils.flashinfer import (
|
|||||||
supports_trtllm_attention,
|
supports_trtllm_attention,
|
||||||
use_trtllm_attention,
|
use_trtllm_attention,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.v1.attention.backends.utils import (
|
from vllm.v1.attention.backends.utils import (
|
||||||
AttentionCGSupport,
|
AttentionCGSupport,
|
||||||
AttentionMetadataBuilder,
|
AttentionMetadataBuilder,
|
||||||
@ -52,8 +49,6 @@ from vllm.v1.attention.backends.utils import (
|
|||||||
infer_global_hyperparameters,
|
infer_global_hyperparameters,
|
||||||
split_decodes_and_prefills,
|
split_decodes_and_prefills,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||||
|
|
||||||
FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024
|
FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024
|
||||||
|
|||||||
@ -11,9 +11,6 @@ from vllm.attention.backends.abstract import AttentionLayer
|
|||||||
from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
|
from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.utils import cdiv
|
from vllm.utils import cdiv
|
||||||
|
|
||||||
# yapf conflicts with isort for this docstring
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.v1.attention.backends.mla.common import (
|
from vllm.v1.attention.backends.mla.common import (
|
||||||
MLACommonBackend,
|
MLACommonBackend,
|
||||||
MLACommonDecodeMetadata,
|
MLACommonDecodeMetadata,
|
||||||
@ -24,8 +21,6 @@ from vllm.v1.attention.backends.mla.common import (
|
|||||||
from vllm.v1.attention.backends.utils import AttentionCGSupport
|
from vllm.v1.attention.backends.utils import AttentionCGSupport
|
||||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
|
|
||||||
|
|
||||||
def is_aiter_mla_enabled() -> bool:
|
def is_aiter_mla_enabled() -> bool:
|
||||||
return envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MLA
|
return envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MLA
|
||||||
|
|||||||
@ -18,8 +18,6 @@ from msgspec import msgpack
|
|||||||
|
|
||||||
from vllm import envs
|
from vllm import envs
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.multimodal.inputs import (
|
from vllm.multimodal.inputs import (
|
||||||
BaseMultiModalField,
|
BaseMultiModalField,
|
||||||
MultiModalBatchedField,
|
MultiModalBatchedField,
|
||||||
@ -32,8 +30,6 @@ from vllm.multimodal.inputs import (
|
|||||||
MultiModalSharedField,
|
MultiModalSharedField,
|
||||||
NestedTensors,
|
NestedTensors,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.v1.engine import UtilityResult
|
from vllm.v1.engine import UtilityResult
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|||||||
@ -48,9 +48,6 @@ from vllm.model_executor.layers.mamba.abstract import MambaBase
|
|||||||
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
||||||
from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
|
from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
|
||||||
from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
|
from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.models.interfaces import (
|
from vllm.model_executor.models.interfaces import (
|
||||||
SupportsMultiModal,
|
SupportsMultiModal,
|
||||||
is_mixture_of_experts,
|
is_mixture_of_experts,
|
||||||
@ -59,8 +56,6 @@ from vllm.model_executor.models.interfaces import (
|
|||||||
supports_multimodal_pruning,
|
supports_multimodal_pruning,
|
||||||
supports_transcription,
|
supports_transcription,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.model_executor.models.interfaces_base import (
|
from vllm.model_executor.models.interfaces_base import (
|
||||||
VllmModelForPooling,
|
VllmModelForPooling,
|
||||||
is_pooling_model,
|
is_pooling_model,
|
||||||
@ -101,9 +96,6 @@ from vllm.v1.attention.backends.utils import (
|
|||||||
split_attn_metadata,
|
split_attn_metadata,
|
||||||
)
|
)
|
||||||
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
|
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
|
||||||
|
|
||||||
# yapf conflicts with isort for this block
|
|
||||||
# yapf: disable
|
|
||||||
from vllm.v1.kv_cache_interface import (
|
from vllm.v1.kv_cache_interface import (
|
||||||
AttentionSpec,
|
AttentionSpec,
|
||||||
ChunkedLocalAttentionSpec,
|
ChunkedLocalAttentionSpec,
|
||||||
@ -118,8 +110,6 @@ from vllm.v1.kv_cache_interface import (
|
|||||||
SlidingWindowSpec,
|
SlidingWindowSpec,
|
||||||
UniformTypeKVCacheSpecs,
|
UniformTypeKVCacheSpecs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.v1.outputs import (
|
from vllm.v1.outputs import (
|
||||||
EMPTY_MODEL_RUNNER_OUTPUT,
|
EMPTY_MODEL_RUNNER_OUTPUT,
|
||||||
AsyncModelRunnerOutput,
|
AsyncModelRunnerOutput,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user