From 2c8ed8ee485c13d88d7aa4fc4b8e1ac4b4301a78 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 24 Apr 2025 03:54:03 +0100
Subject: [PATCH] More informative error when using Transformers backend
 (#16988)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/source/models/supported_models.md    | 48 ++++++++++++-----------
 vllm/model_executor/model_loader/utils.py | 24 ++++++------
 2 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 0fdffbeefd03..0fad1b1ef276 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -40,33 +40,37 @@ You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"`
 vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.
 :::
 
-#### Supported features
+#### Custom models
 
-The Transformers modeling backend explicitly supports the following features:
+If a model is neither natively supported by vLLM nor Transformers, it can still be used in vLLM!
 
-- (except GGUF)
--
--
+For a model to be compatible with the Transformers backend for vLLM it must:
 
-#### Remote Code
+- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
+  * The model directory must have the correct structure (e.g. `config.json` is present).
+  * `config.json` must contain `auto_map.AutoModel`.
+- be a Transformers backend for vLLM compatible model (see [Writing custom models](#writing-custom-models) below):
+  * Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
 
-If your model is neither supported natively by vLLM or Transformers, you can still run it in vLLM!
+If the compatible model is:
 
-Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers.
-Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM!
+- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for offline inference or `--trust-remote-code` for the OpenAI-compatible server.
+- in a local directory, simply pass the directory path to `model=` for offline inference or to `vllm serve` for the OpenAI-compatible server.
 
-:::{tip}
-If you have not yet created your custom model, you can follow this guide on [customising models in Transformers](https://huggingface.co/docs/transformers/en/custom_models).
-:::
+This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
 
-```python
-from vllm import LLM
-llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model
-llm.apply_model(lambda model: print(model.__class__))
-```
+(writing-custom-models)=
+
+#### Writing custom models
+
+This section details the modifications needed to make a Transformers compatible custom model work with the Transformers backend for vLLM (we assume that such a custom model has already been created; see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). To make your model compatible with the Transformers backend:
+
+1. `kwargs` must be passed down through all modules from `MyModel` to `MyAttention`.
+2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
+3. `MyModel` must contain `_supports_attention_backend = True`.
+
 ```{code-block} python
 :caption: modeling_my_model.py
 
@@ -75,7 +79,7 @@ from torch import nn
 
 class MyAttention(nn.Module):
 
-    def forward(self, hidden_states, **kwargs): # <- kwargs are required
+    def forward(self, hidden_states, **kwargs):
         ...
         attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
         attn_output, attn_weights = attention_interface(
@@ -91,11 +95,11 @@ class MyModel(PreTrainedModel):
     _supports_attention_backend = True
 ```
 
-Here is what happens in the background:
+Here is what happens in the background when this model is loaded:
 
-1. The config is loaded
-2. `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
-3. The `TransformersForCausalLM` backend is used. See , which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`.
+1. The config is loaded.
+2. The `MyModel` Python class is loaded from the `auto_map` in the config, and we check that `is_backend_compatible()` returns `True`.
+3. `MyModel` is loaded into `TransformersForCausalLM` (see ), which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
 
 That's it!
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index af4f2e95a9af..0ca6b6fd88b6 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -30,15 +30,6 @@ def set_default_torch_dtype(dtype: torch.dtype):
     torch.set_default_dtype(old_dtype)
 
 
-def is_transformers_impl_compatible(
-    arch: str,
-    module: Optional["transformers.PreTrainedModel"] = None) -> bool:
-    mod = module or getattr(transformers, arch, None)
-    if mod is None:
-        return False
-    return mod.is_backend_compatible()
-
-
 def resolve_transformers_arch(model_config: ModelConfig,
                               architectures: list[str]):
     for i, arch in enumerate(architectures):
@@ -61,17 +52,26 @@ def resolve_transformers_arch(model_config: ModelConfig,
                                  revision=model_config.revision)
             for name, module in sorted(auto_map.items(), key=lambda x: x[0])
         }
-        custom_model_module = auto_modules.get("AutoModel")
+        model_module = getattr(transformers, arch, None)
+        if model_module is None:
+            if "AutoModel" not in auto_map:
+                raise ValueError(
+                    f"Cannot find model module. '{arch}' is not a registered "
+                    "model in the Transformers library (only relevant if the "
+                    "model is meant to be in Transformers) and 'AutoModel' is "
+                    "not present in the model config's 'auto_map' (relevant "
+                    "if the model is custom).")
+            model_module = auto_modules["AutoModel"]
         # TODO(Isotr0py): Further clean up these raises.
         # perhaps handled them in _ModelRegistry._raise_for_unsupported?
         if model_config.model_impl == ModelImpl.TRANSFORMERS:
-            if not is_transformers_impl_compatible(arch, custom_model_module):
+            if not model_module.is_backend_compatible():
                 raise ValueError(
                     f"The Transformers implementation of {arch} is not "
                     "compatible with vLLM.")
             architectures[i] = "TransformersForCausalLM"
         if model_config.model_impl == ModelImpl.AUTO:
-            if not is_transformers_impl_compatible(arch, custom_model_module):
+            if not model_module.is_backend_compatible():
                 raise ValueError(
                     f"{arch} has no vLLM implementation and the Transformers "
                     "implementation is not compatible with vLLM. Try setting "
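As a usage sketch to go with the documentation change above: the snippet below shows how a custom model prepared as described could be loaded through the Transformers backend for vLLM. This is a minimal sketch, not part of the patch; the local path `path/to/my_custom_model` is a hypothetical placeholder, while `task="generate"`, `trust_remote_code=True`, `model_impl="transformers"`, and `apply_model` come from the surrounding documentation.

```python
# Minimal sketch: loading a Transformers-compatible custom model via the
# Transformers backend for vLLM. "path/to/my_custom_model" is a hypothetical
# local directory containing config.json (with auto_map.AutoModel) and the
# custom modeling code (e.g. modeling_my_model.py).
from vllm import LLM

llm = LLM(
    model="path/to/my_custom_model",  # or a Hugging Face Hub repo id
    task="generate",
    model_impl="transformers",        # force the Transformers backend
    trust_remote_code=True,           # needed to load custom modeling code
)

# Print the model class that was actually loaded; with the Transformers
# backend this should be TransformersForCausalLM.
llm.apply_model(lambda model: print(model.__class__))
```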