mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-08 06:07:02 +08:00
[Model] Officially support Emu3 with Transformers backend (#21319)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
61b8cea3b4
commit
13abd0eaf9
@ -623,6 +623,12 @@ Specified using `--task generate`.
|
||||
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-research/Tarsier-7b`, `omni-research/Tarsier-34b` | | ✅︎ | ✅︎ |
|
||||
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
|
||||
|
||||
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
|
||||
| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ |
|
||||
|
||||
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
|
||||
• For example, to use DeepSeek-VL2 series models:
|
||||
`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
|
||||
|
||||
@ -23,18 +23,14 @@ def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
|
||||
return ((name, torch.empty(0)) for name in weight_names)
|
||||
|
||||
|
||||
def create_model_dummy_weights(
|
||||
repo: str,
|
||||
model_arch: str,
|
||||
) -> Iterable[tuple[str, torch.Tensor]]:
|
||||
def create_dummy_model(repo: str, model_arch: str) -> PreTrainedModel:
|
||||
"""
|
||||
Create weights from a dummy meta deserialized hf model with name conversion
|
||||
"""
|
||||
model_cls: PreTrainedModel = getattr(transformers, model_arch)
|
||||
config = AutoConfig.from_pretrained(repo)
|
||||
with torch.device("meta"):
|
||||
model: PreTrainedModel = model_cls._from_config(config)
|
||||
return model.named_parameters()
|
||||
return model_cls._from_config(config)
|
||||
|
||||
|
||||
def model_architectures_for_test() -> list[str]:
|
||||
@ -70,14 +66,21 @@ def test_hf_model_weights_mapper(model_arch: str):
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
|
||||
original_weights = create_repo_dummy_weights(model_id)
|
||||
hf_converted_weights = create_model_dummy_weights(model_id, model_arch)
|
||||
hf_dummy_model = create_dummy_model(model_id, model_arch)
|
||||
hf_converted_weights = hf_dummy_model.named_parameters()
|
||||
hf_converted_buffers = hf_dummy_model.named_buffers()
|
||||
mapper: WeightsMapper = model_cls.hf_to_vllm_mapper
|
||||
|
||||
mapped_original_weights = mapper.apply(original_weights)
|
||||
mapped_hf_converted_weights = mapper.apply(hf_converted_weights)
|
||||
mapped_hf_converted_buffers = mapper.apply(hf_converted_buffers)
|
||||
|
||||
ref_weight_names = set(map(lambda x: x[0], mapped_original_weights))
|
||||
weight_names = set(map(lambda x: x[0], mapped_hf_converted_weights))
|
||||
buffer_names = set(map(lambda x: x[0], mapped_hf_converted_buffers))
|
||||
|
||||
# Some checkpoints may have buffers, we ignore them for this test
|
||||
ref_weight_names -= buffer_names
|
||||
|
||||
weights_missing = ref_weight_names - weight_names
|
||||
weights_unmapped = weight_names - ref_weight_names
|
||||
|
||||
@ -357,6 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
max_transformers_version="4.48", # noqa: E501
|
||||
transformers_version_reason="HF model is not compatible.", # noqa: E501
|
||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
|
||||
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
|
||||
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
|
||||
"Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
|
||||
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"), # noqa: E501
|
||||
@ -501,7 +502,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
speculative_model="XiaomiMiMo/MiMo-7B-RL")
|
||||
}
|
||||
|
||||
_TRANSFORMERS_MODELS = {
|
||||
_TRANSFORMERS_BACKEND_MODELS = {
|
||||
"TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
|
||||
"TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),
|
||||
}
|
||||
@ -512,7 +513,7 @@ _EXAMPLE_MODELS = {
|
||||
**_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS,
|
||||
**_MULTIMODAL_EXAMPLE_MODELS,
|
||||
**_SPECULATIVE_DECODING_EXAMPLE_MODELS,
|
||||
**_TRANSFORMERS_MODELS,
|
||||
**_TRANSFORMERS_BACKEND_MODELS,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -26,7 +26,7 @@ from vllm.model_executor.models.adapters import (as_embedding_model,
|
||||
as_seq_cls_model)
|
||||
from vllm.model_executor.models.interfaces import SupportsQuant
|
||||
from vllm.model_executor.models.registry import (_PREVIOUSLY_SUPPORTED_MODELS,
|
||||
_TRANSFORMERS_MODELS)
|
||||
_TRANSFORMERS_BACKEND_MODELS)
|
||||
from vllm.utils import is_pin_memory_available
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -178,7 +178,7 @@ def resolve_transformers_arch(model_config: ModelConfig,
|
||||
"happen.")
|
||||
|
||||
for i, arch in enumerate(architectures):
|
||||
if arch in _TRANSFORMERS_MODELS:
|
||||
if arch in _TRANSFORMERS_BACKEND_MODELS:
|
||||
continue
|
||||
|
||||
if model_config.model_impl == ModelImpl.AUTO:
|
||||
@ -241,7 +241,7 @@ def get_model_architecture(
|
||||
|
||||
vllm_supported_archs = ModelRegistry.get_supported_archs()
|
||||
is_supported = lambda arch: (arch in vllm_supported_archs and arch not in
|
||||
_TRANSFORMERS_MODELS)
|
||||
_TRANSFORMERS_BACKEND_MODELS)
|
||||
vllm_not_supported = not any(is_supported(arch) for arch in architectures)
|
||||
|
||||
if vllm_not_supported:
|
||||
|
||||
@ -254,7 +254,11 @@ _SPECULATIVE_DECODING_MODELS = {
|
||||
# "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
|
||||
}
|
||||
|
||||
_TRANSFORMERS_MODELS = {
|
||||
_TRANSFORMERS_SUPPORTED_MODELS = {
|
||||
"Emu3ForConditionalGeneration": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
|
||||
}
|
||||
|
||||
_TRANSFORMERS_BACKEND_MODELS = {
|
||||
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
|
||||
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
|
||||
}
|
||||
@ -266,7 +270,8 @@ _VLLM_MODELS = {
|
||||
**_CROSS_ENCODER_MODELS,
|
||||
**_MULTIMODAL_MODELS,
|
||||
**_SPECULATIVE_DECODING_MODELS,
|
||||
**_TRANSFORMERS_MODELS,
|
||||
**_TRANSFORMERS_SUPPORTED_MODELS,
|
||||
**_TRANSFORMERS_BACKEND_MODELS,
|
||||
}
|
||||
|
||||
# This variable is used as the args for subprocess.run(). We
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user