mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-08 06:07:02 +08:00
[Model] Officially support Emu3 with Transformers backend (#21319)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
61b8cea3b4
commit
13abd0eaf9
@ -623,6 +623,12 @@ Specified using `--task generate`.
|
||||
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-research/Tarsier-7b`, `omni-research/Tarsier-34b` | | ✅︎ | ✅︎ |
|
||||
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
|
||||
|
||||
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
|
||||
| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ |
|
||||
|
||||
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
|
||||
• For example, to use DeepSeek-VL2 series models:
|
||||
`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
|
||||
|
||||
@ -23,18 +23,14 @@ def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
|
||||
return ((name, torch.empty(0)) for name in weight_names)
|
||||
|
||||
|
||||
def create_model_dummy_weights(
|
||||
repo: str,
|
||||
model_arch: str,
|
||||
) -> Iterable[tuple[str, torch.Tensor]]:
|
||||
def create_dummy_model(repo: str, model_arch: str) -> PreTrainedModel:
|
||||
"""
|
||||
Create weights from a dummy meta deserialized hf model with name conversion
|
||||
"""
|
||||
model_cls: PreTrainedModel = getattr(transformers, model_arch)
|
||||
config = AutoConfig.from_pretrained(repo)
|
||||
with torch.device("meta"):
|
||||
model: PreTrainedModel = model_cls._from_config(config)
|
||||
return model.named_parameters()
|
||||
return model_cls._from_config(config)
|
||||
|
||||
|
||||
def model_architectures_for_test() -> list[str]:
|
||||
@ -70,14 +66,21 @@ def test_hf_model_weights_mapper(model_arch: str):
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
|
||||
original_weights = create_repo_dummy_weights(model_id)
|
||||
hf_converted_weights = create_model_dummy_weights(model_id, model_arch)
|
||||
hf_dummy_model = create_dummy_model(model_id, model_arch)
|
||||
hf_converted_weights = hf_dummy_model.named_parameters()
|
||||
hf_converted_buffers = hf_dummy_model.named_buffers()
|
||||
mapper: WeightsMapper = model_cls.hf_to_vllm_mapper
|
||||
|
||||
mapped_original_weights = mapper.apply(original_weights)
|
||||
mapped_hf_converted_weights = mapper.apply(hf_converted_weights)
|
||||
mapped_hf_converted_buffers = mapper.apply(hf_converted_buffers)
|
||||
|
||||
ref_weight_names = set(map(lambda x: x[0], mapped_original_weights))
|
||||
weight_names = set(map(lambda x: x[0], mapped_hf_converted_weights))
|
||||
buffer_names = set(map(lambda x: x[0], mapped_hf_converted_buffers))
|
||||
|
||||
# Some checkpoints may have buffers, we ignore them for this test
|
||||
ref_weight_names -= buffer_names
|
||||
|
||||
weights_missing = ref_weight_names - weight_names
|
||||
weights_unmapped = weight_names - ref_weight_names
|
||||
|
||||
@ -357,6 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
max_transformers_version="4.48", # noqa: E501
|
||||
transformers_version_reason="HF model is not compatible.", # noqa: E501
|
||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
|
||||
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
|
||||
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
|
||||
"Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
|
||||
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"), # noqa: E501
|
||||
@ -501,7 +502,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
speculative_model="XiaomiMiMo/MiMo-7B-RL")
|
||||
}
|
||||
|
||||
_TRANSFORMERS_MODELS = {
|
||||
_TRANSFORMERS_BACKEND_MODELS = {
|
||||
"TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
|
||||
"TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),
|
||||
}
|
||||
@ -512,7 +513,7 @@ _EXAMPLE_MODELS = {
|
||||
**_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS,
|
||||
**_MULTIMODAL_EXAMPLE_MODELS,
|
||||
**_SPECULATIVE_DECODING_EXAMPLE_MODELS,
|
||||
**_TRANSFORMERS_MODELS,
|
||||
**_TRANSFORMERS_BACKEND_MODELS,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -26,7 +26,7 @@ from vllm.model_executor.models.adapters import (as_embedding_model,
|
||||
as_seq_cls_model)
|
||||
from vllm.model_executor.models.interfaces import SupportsQuant
|
||||
from vllm.model_executor.models.registry import (_PREVIOUSLY_SUPPORTED_MODELS,
|
||||
_TRANSFORMERS_MODELS)
|
||||
_TRANSFORMERS_BACKEND_MODELS)
|
||||
from vllm.utils import is_pin_memory_available
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -178,7 +178,7 @@ def resolve_transformers_arch(model_config: ModelConfig,
|
||||
"happen.")
|
||||
|
||||
for i, arch in enumerate(architectures):
|
||||
if arch in _TRANSFORMERS_MODELS:
|
||||
if arch in _TRANSFORMERS_BACKEND_MODELS:
|
||||
continue
|
||||
|
||||
if model_config.model_impl == ModelImpl.AUTO:
|
||||
@ -241,7 +241,7 @@ def get_model_architecture(
|
||||
|
||||
vllm_supported_archs = ModelRegistry.get_supported_archs()
|
||||
is_supported = lambda arch: (arch in vllm_supported_archs and arch not in
|
||||
_TRANSFORMERS_MODELS)
|
||||
_TRANSFORMERS_BACKEND_MODELS)
|
||||
vllm_not_supported = not any(is_supported(arch) for arch in architectures)
|
||||
|
||||
if vllm_not_supported:
|
||||
|
||||
@ -254,7 +254,11 @@ _SPECULATIVE_DECODING_MODELS = {
|
||||
# "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
|
||||
}
|
||||
|
||||
_TRANSFORMERS_MODELS = {
|
||||
_TRANSFORMERS_SUPPORTED_MODELS = {
|
||||
"Emu3ForConditionalGeneration": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
|
||||
}
|
||||
|
||||
_TRANSFORMERS_BACKEND_MODELS = {
|
||||
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
|
||||
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
|
||||
}
|
||||
@ -266,7 +270,8 @@ _VLLM_MODELS = {
|
||||
**_CROSS_ENCODER_MODELS,
|
||||
**_MULTIMODAL_MODELS,
|
||||
**_SPECULATIVE_DECODING_MODELS,
|
||||
**_TRANSFORMERS_MODELS,
|
||||
**_TRANSFORMERS_SUPPORTED_MODELS,
|
||||
**_TRANSFORMERS_BACKEND_MODELS,
|
||||
}
|
||||
|
||||
# This variable is used as the args for subprocess.run(). We
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user