[New model support]Support Tarsier2 (#19887)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
parent 6f170f11dd
commit c3bf9bad11
@@ -562,6 +562,7 @@ Specified using `--task generate`.
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-research/Tarsier-7b`, `omni-research/Tarsier-34b` | | ✅︎ | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |

<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
    • For example, to use DeepSeek-VL2 series models:
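The Tarsier2 rows above rely on the same mechanism: the checkpoint's own architecture name differs from the class registered in vLLM, so it has to be overridden. A minimal offline-inference sketch (illustrative only; the override value comes from the examples in this commit, while the `LLM`-level usage is an assumption rather than part of the diff):

from vllm import LLM

# Hypothetical usage: map the checkpoint's HF architecture name onto the
# Tarsier2ForConditionalGeneration class that this commit registers in vLLM.
llm = LLM(
    model="omni-research/Tarsier2-Recap-7b",
    max_model_len=4096,
    hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
)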
@@ -1040,6 +1040,37 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
    )


def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "omni-research/Tarsier2-Recap-7b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

@@ -1112,6 +1143,7 @@ model_example_map = {
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
    "tarsier": run_tarsier,
    "tarsier2": run_tarsier2,
}
@@ -828,6 +828,32 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    )


def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "omni-research/Tarsier2-Recap-7b"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=32768,
        limit_mm_per_prompt={"image": len(image_urls)},
        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
    )

    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n<|vision_start|>{'<|image_pad|>' * len(image_urls)}"
        f"<|vision_end|>{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    image_data = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )


model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,

@@ -853,6 +879,7 @@ model_example_map = {
    "qwen2_5_vl": load_qwen2_5_vl,
    "smolvlm": load_smolvlm,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
}
@@ -284,6 +284,7 @@ def _test_processing_correctness_one(
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    "openai/whisper-large-v3",
    "omni-research/Tarsier-7b",
    "omni-research/Tarsier2-Recap-7b"
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@@ -398,6 +398,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                               trust_remote_code=True),
    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b",  # noqa: E501
                                                       hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
    "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b",  # noqa: E501
                                                        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}),  # noqa: E501
    # [Encoder-decoder]
    # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
    # Therefore, we borrow the BartTokenizer from the original Bart model
@@ -32,12 +32,14 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from transformers import BatchFeature
from transformers import AutoConfig, BatchFeature
from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
                                          Qwen2VLProcessor)
from transformers.models.qwen2_vl.configuration_qwen2_vl import (
    Qwen2VLConfig, Qwen2VLVisionConfig)
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.models.qwen2_vl.video_processing_qwen2_vl import (
    Qwen2VLVideoProcessor)

from vllm.config import VllmConfig
from vllm.distributed import parallel_state, tensor_model_parallel_all_gather

@@ -69,6 +71,7 @@ from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import uses_mrope
from vllm.transformers_utils.processor import (
    cached_image_processor_from_config)
from vllm.transformers_utils.tokenizer import AnyTokenizer

from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                         SupportsMultiModal, SupportsPP)
@@ -1405,3 +1408,87 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
            connector="visual.merger.",
            tower_model="visual.",
        )


class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor):
    pass


class Tarsier2ImageProcessor(Qwen2VLImageProcessor):

    def __init__(
        self,
        size: Optional[dict[str, int]] = None,
        **kwargs,
    ) -> None:
        if size is not None and "min_pixels" in size and "max_pixels" in size:
            # Remap if Tarsier2-specific format is provided
            remapped_size = {
                "shortest_edge": size["min_pixels"],
                "longest_edge": size["max_pixels"]
            }
            super().__init__(size=remapped_size, **kwargs)
        else:
            super().__init__(size=size, **kwargs)


class Tarsier2Processor(Qwen2VLProcessor):

    def __init__(
        self,
        vision_config: dict,
        tokenizer: AnyTokenizer,
        **kwargs,
    ):
        self.image_processor = Tarsier2ImageProcessor(**vision_config)
        super().__init__(image_processor=self.image_processor,
                         tokenizer=tokenizer,
                         video_processor=Qwen2VLVideoProcessor(),
                         chat_template=None,
                         **kwargs)


class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):

    def get_hf_config(self) -> Qwen2VLConfig:
        model_path = self.ctx.model_config.model
        original_config = AutoConfig.from_pretrained(model_path)
        config_dict = original_config.to_dict()
        correct_config = Qwen2VLConfig.from_dict(config_dict)

        return correct_config

    def get_hf_processor(self, **kwargs: object) -> Tarsier2Processor:
        return Tarsier2Processor(
            vision_config=self.ctx.get_hf_image_processor_config(),
            tokenizer=self.get_tokenizer(),
            **kwargs,
        )

    def get_image_processor(self) -> Tarsier2ImageProcessor:
        return Tarsier2ImageProcessor(
            **self.ctx.get_hf_image_processor_config())


@MULTIMODAL_REGISTRY.register_processor(Tarsier2MultiModalProcessor,
                                        info=Tarsier2ProcessingInfo,
                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration):
    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
        "vision_tower.": "visual.",
    })

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        # Tarsier2 uses llava as its model_type, which creates a Qwen2VLConfig
        # as text_config; we need to reconstruct the Qwen2VLConfig from the
        # LlavaConfig.
        config = vllm_config.model_config.hf_config
        qwen2vl_config = config.text_config
        qwen2vl_config.architectures = config.architectures
        vllm_config.model_config.hf_config = qwen2vl_config
        super().__init__(vllm_config=vllm_config, prefix=prefix)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
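The `hf_to_vllm_mapper` above captures the weight-name conversion Tarsier2 needs on top of Qwen2-VL: checkpoint parameters stored under the `vision_tower.` prefix are routed to the reused vision tower, which lives under `visual.`. A standalone sketch of the renaming applied during `load_weights` (the helper and the example parameter name are made up for illustration; this is not vLLM's `WeightsMapper` implementation):

def _rename_tarsier2_weight(name: str) -> str:
    # The same prefix substitution declared in hf_to_vllm_mapper.
    prefix_map = {"vision_tower.": "visual."}
    for old, new in prefix_map.items():
        if name.startswith(old):
            return new + name[len(old):]
    return name


# Hypothetical checkpoint tensor name, shown only to illustrate the remap.
print(_rename_tarsier2_weight("vision_tower.patch_embed.proj.weight"))
# -> visual.patch_embed.proj.weight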
@@ -217,6 +217,7 @@ _MULTIMODAL_MODELS = {
    "UltravoxModel": ("ultravox", "UltravoxModel"),
    "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
    "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
    "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"),  # noqa: E501
    # [Encoder-decoder]
    "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
    "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501