Mirror of https://git.datalinker.icu/vllm-project/vllm.git, synced 2025-12-10 00:15:51 +08:00
[Bugfix] Fix nightly transformers CI failure (#21427)
Signed-off-by: Isotr0py <2037008807@qq.com>
parent 107111a859
commit 4ecedd1806
tests/models/registry.py

@@ -443,6 +443,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                          hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
     "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b",  # noqa: E501
                          hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}),  # noqa: E501
+    "VoxtralForConditionalGeneration": _HfExamplesInfo(
+        "mistralai/Voxtral-Mini-3B-2507",
+        min_transformers_version="4.54",
+        # disable this temporarily until we support HF format
+        is_available_online=False,
+    ),
     # [Encoder-decoder]
     # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
     # Therefore, we borrow the BartTokenizer from the original Bart model
@@ -450,13 +456,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                            tokenizer="Isotr0py/Florence-2-tokenizer",  # noqa: E501
                                            trust_remote_code=True),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
-    "VoxtralForConditionalGeneration": _HfExamplesInfo(
-        "mistralai/Voxtral-Mini-3B-2507",
-        tokenizer_mode="mistral",
-        min_transformers_version="4.54"
-    ),
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
-
     # [Cross-encoder]
     "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),  # noqa: E501
 }
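Note: `_HfExamplesInfo` is defined elsewhere in tests/models/registry.py, so its exact shape is not part of this diff. As a rough sketch of how fields such as `min_transformers_version` and `is_available_online` could gate the nightly CI (the class name and logic below are illustrative assumptions, not vLLM's implementation):

# Illustrative sketch only -- not vLLM's actual _HfExamplesInfo definition.
from dataclasses import dataclass, field
from typing import Optional

import pytest
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION


@dataclass
class HfExamplesInfoSketch:
    default: str  # example checkpoint, e.g. "mistralai/Voxtral-Mini-3B-2507"
    min_transformers_version: Optional[str] = None
    is_available_online: bool = True  # False: skip tests needing the HF repo
    hf_overrides: dict = field(default_factory=dict)

    def check_available(self) -> None:
        # Mirror the gating implied by the registry entries above: a
        # too-old transformers or an offline checkpoint skips the test.
        if (self.min_transformers_version is not None
                and Version(TRANSFORMERS_VERSION)
                < Version(self.min_transformers_version)):
            pytest.skip(f"requires transformers>={self.min_transformers_version}")
        if not self.is_available_online:
            pytest.skip("checkpoint disabled until the HF format is supported")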
vllm/model_executor/models/tarsier.py

@@ -13,8 +13,7 @@ from transformers import LlavaConfig as HfLlavaConfig
 from transformers import PretrainedConfig, SiglipVisionConfig
 from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
 from transformers.models.llava import LlavaProcessor
-from transformers.processing_utils import (ProcessingKwargs, Unpack,
-                                           _validate_images_text_input_order)
+from transformers.processing_utils import ProcessingKwargs, Unpack
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 
 from vllm.config import VllmConfig
@@ -94,9 +93,6 @@ class TarsierProcessor(LlavaProcessor):
             raise ValueError(
                 "You have to specify at least one of `images` or `text`.")
 
-        # check if images and text inputs are reversed for BC
-        images, text = _validate_images_text_input_order(images, text)
-
         output_kwargs = self._merge_kwargs(
             TarsierProcessorKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
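Note: `_validate_images_text_input_order` is a private transformers helper, and the nightly transformers build no longer exports it from `transformers.processing_utils`, which is what broke the CI import; after this change the processor simply assumes callers pass `(images, text)` in the modern order. If the back-compat swap were ever needed again, a self-contained stand-in might look like this (hypothetical helper, not part of the commit):

# Hypothetical local stand-in for the removed private transformers helper.
# The real helper validated/reordered (images, text) for backward
# compatibility; this sketch only illustrates the idea.
def _swap_if_reversed(images, text):
    def looks_like_text(x):
        return isinstance(x, str) or (
            isinstance(x, (list, tuple)) and x and isinstance(x[0], str))

    # If `images` looks like text and `text` does not, assume the caller
    # used the legacy (text, images) order and swap the two arguments.
    if looks_like_text(images) and not looks_like_text(text):
        return text, images
    return images, text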
vllm/transformers_utils/config.py

@@ -37,6 +37,7 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
                                              MiniMaxText01Config,
                                              MiniMaxVL01Config, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
+                                             Nemotron_Nano_VL_Config,
                                              NemotronConfig, NVLM_D_Config,
                                              OvisConfig, RWConfig,
                                              SkyworkR1VChatConfig, SolarConfig,
@@ -80,6 +81,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
     "dbrx": DbrxConfig,
     "deepseek_vl_v2": DeepseekVLV2Config,
     "kimi_vl": KimiVLConfig,
+    "Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config,
     "mpt": MPTConfig,
     "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
     "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
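Note: registering the class under its `model_type` key means a checkpoint whose config.json declares `"model_type": "Llama_Nemotron_Nano_VL"` resolves to `Nemotron_Nano_VL_Config` without `trust_remote_code`. A minimal sketch of that dispatch (vLLM's real config loading also handles hub downloads, overrides, and an AutoConfig fallback; the function below is an illustrative assumption):

# Sketch of the model_type -> config-class dispatch that _CONFIG_REGISTRY
# enables; not vLLM's actual loading code.
from transformers import PretrainedConfig

from vllm.transformers_utils.configs import Nemotron_Nano_VL_Config

_REGISTRY_SKETCH: dict[str, type[PretrainedConfig]] = {
    "Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config,
}


def config_class_for(config_dict: dict) -> type[PretrainedConfig]:
    # config_dict is the parsed config.json of a checkpoint; unknown model
    # types would fall through to AutoConfig in the real code.
    return _REGISTRY_SKETCH.get(config_dict["model_type"], PretrainedConfig)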
vllm/transformers_utils/configs/__init__.py

@@ -23,6 +23,7 @@ from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 from vllm.transformers_utils.configs.mpt import MPTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
+from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
 from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
 from vllm.transformers_utils.configs.ovis import OvisConfig
 from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig
@@ -50,6 +51,7 @@ __all__ = [
     "KimiVLConfig",
     "NemotronConfig",
     "NemotronHConfig",
+    "Nemotron_Nano_VL_Config",
     "NVLM_D_Config",
     "OvisConfig",
     "SkyworkR1VChatConfig",
vllm/transformers_utils/configs/nemotron_vl.py (new file, 56 lines)

@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# yapf: disable
+# ruff: noqa: E501
+# Adapted from
+# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
+# --------------------------------------------------------
+# Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License
+# LICENSE is in incl_licenses directory.
+# --------------------------------------------------------
+
+from transformers import LlamaConfig
+from transformers.configuration_utils import PretrainedConfig
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
+
+
+class Nemotron_Nano_VL_Config(PretrainedConfig):
+    model_type = 'Llama_Nemotron_Nano_VL'
+    is_composition = True
+
+    def __init__(
+        self,
+        vision_config=None,
+        llm_config=None,
+        force_image_size=None,
+        downsample_ratio=0.5,
+        template=None,
+        ps_version='v1',
+        image_tag_type="internvl",
+        projector_hidden_size=4096,
+        vit_hidden_size=1280,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        if vision_config is not None:
+            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
+            vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
+            self.vision_config = vision_auto_config(**vision_config)
+        else:
+            self.vision_config = PretrainedConfig()
+
+        if llm_config is None:
+            self.text_config = LlamaConfig()
+        else:
+            self.text_config = LlamaConfig(**llm_config)
+
+        # Assign configuration values
+        self.force_image_size = force_image_size
+        self.downsample_ratio = downsample_ratio
+        self.template = template  # TODO move out of here and into the tokenizer
+        self.ps_version = ps_version  # Pixel shuffle version
+        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
+        self.projector_hidden_size = projector_hidden_size
+        self.vit_hidden_size = vit_hidden_size
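Note: the new config composes a text and a vision side: `llm_config` is forwarded to `LlamaConfig` and exposed as `text_config`, while `vision_config` is only materialized when it carries `auto_map` metadata pointing at a dynamic `AutoConfig` class. A usage sketch (field values below are illustrative, not from a real checkpoint):

# Usage sketch for the new config; values are illustrative only.
from transformers import PretrainedConfig

from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config

cfg = Nemotron_Nano_VL_Config(
    # llm_config is forwarded to LlamaConfig and exposed as cfg.text_config
    llm_config={"hidden_size": 2048, "num_hidden_layers": 16},
    force_image_size=512,
)
assert cfg.model_type == "Llama_Nemotron_Nano_VL"
assert cfg.text_config.hidden_size == 2048
# With no vision_config["auto_map"], the vision side stays a bare placeholder:
assert type(cfg.vision_config) is PretrainedConfig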