[Model] Officially support Emu3 with Transformers backend (#21319)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor 2025-07-24 11:22:12 +01:00 committed by GitHub
parent 61b8cea3b4
commit 13abd0eaf9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 29 additions and 14 deletions

View File

@@ -623,6 +623,12 @@ Specified using `--task generate`.
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-research/Tarsier-7b`, `omni-research/Tarsier-34b` | | ✅︎ | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ |
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`

View File

@@ -23,18 +23,14 @@ def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
return ((name, torch.empty(0)) for name in weight_names)
def create_model_dummy_weights(
repo: str,
model_arch: str,
) -> Iterable[tuple[str, torch.Tensor]]:
def create_dummy_model(repo: str, model_arch: str) -> PreTrainedModel:
"""
Create weights from a dummy meta deserialized hf model with name conversion
"""
model_cls: PreTrainedModel = getattr(transformers, model_arch)
config = AutoConfig.from_pretrained(repo)
with torch.device("meta"):
model: PreTrainedModel = model_cls._from_config(config)
return model.named_parameters()
return model_cls._from_config(config)
def model_architectures_for_test() -> list[str]:
@@ -70,14 +66,21 @@ def test_hf_model_weights_mapper(model_arch: str):
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
original_weights = create_repo_dummy_weights(model_id)
hf_converted_weights = create_model_dummy_weights(model_id, model_arch)
hf_dummy_model = create_dummy_model(model_id, model_arch)
hf_converted_weights = hf_dummy_model.named_parameters()
hf_converted_buffers = hf_dummy_model.named_buffers()
mapper: WeightsMapper = model_cls.hf_to_vllm_mapper
mapped_original_weights = mapper.apply(original_weights)
mapped_hf_converted_weights = mapper.apply(hf_converted_weights)
mapped_hf_converted_buffers = mapper.apply(hf_converted_buffers)
ref_weight_names = set(map(lambda x: x[0], mapped_original_weights))
weight_names = set(map(lambda x: x[0], mapped_hf_converted_weights))
buffer_names = set(map(lambda x: x[0], mapped_hf_converted_buffers))
# Some checkpoints may have buffers, we ignore them for this test
ref_weight_names -= buffer_names
weights_missing = ref_weight_names - weight_names
weights_unmapped = weight_names - ref_weight_names

View File

@@ -357,6 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
max_transformers_version="4.48", # noqa: E501
transformers_version_reason="HF model is not compatible.", # noqa: E501
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"), # noqa: E501
@@ -501,7 +502,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
speculative_model="XiaomiMiMo/MiMo-7B-RL")
}
_TRANSFORMERS_MODELS = {
_TRANSFORMERS_BACKEND_MODELS = {
"TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
"TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),
}
@@ -512,7 +513,7 @@ _EXAMPLE_MODELS = {
**_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS,
**_MULTIMODAL_EXAMPLE_MODELS,
**_SPECULATIVE_DECODING_EXAMPLE_MODELS,
**_TRANSFORMERS_MODELS,
**_TRANSFORMERS_BACKEND_MODELS,
}

View File

@@ -26,7 +26,7 @@ from vllm.model_executor.models.adapters import (as_embedding_model,
as_seq_cls_model)
from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.model_executor.models.registry import (_PREVIOUSLY_SUPPORTED_MODELS,
_TRANSFORMERS_MODELS)
_TRANSFORMERS_BACKEND_MODELS)
from vllm.utils import is_pin_memory_available
logger = init_logger(__name__)
@@ -178,7 +178,7 @@ def resolve_transformers_arch(model_config: ModelConfig,
"happen.")
for i, arch in enumerate(architectures):
if arch in _TRANSFORMERS_MODELS:
if arch in _TRANSFORMERS_BACKEND_MODELS:
continue
if model_config.model_impl == ModelImpl.AUTO:
@@ -241,7 +241,7 @@ def get_model_architecture(
vllm_supported_archs = ModelRegistry.get_supported_archs()
is_supported = lambda arch: (arch in vllm_supported_archs and arch not in
_TRANSFORMERS_MODELS)
_TRANSFORMERS_BACKEND_MODELS)
vllm_not_supported = not any(is_supported(arch) for arch in architectures)
if vllm_not_supported:

View File

@@ -254,7 +254,11 @@ _SPECULATIVE_DECODING_MODELS = {
# "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
}
_TRANSFORMERS_MODELS = {
_TRANSFORMERS_SUPPORTED_MODELS = {
"Emu3ForConditionalGeneration": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
}
_TRANSFORMERS_BACKEND_MODELS = {
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
}
@@ -266,7 +270,8 @@ _VLLM_MODELS = {
**_CROSS_ENCODER_MODELS,
**_MULTIMODAL_MODELS,
**_SPECULATIVE_DECODING_MODELS,
**_TRANSFORMERS_MODELS,
**_TRANSFORMERS_SUPPORTED_MODELS,
**_TRANSFORMERS_BACKEND_MODELS,
}
# This variable is used as the args for subprocess.run(). We