[Doc] Move guide for multimodal model and other improvements (#6168)

commit 9389380015 (parent 175c43eca4)
@@ -5,10 +5,10 @@ Input Processing

.. currentmodule:: vllm.inputs

vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
in :class:`~vllm.LLMEngine` before they are passed to model executors.
Each model can override parts of vLLM's :ref:`input processing pipeline <input_processing_pipeline>` via
:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.

- Currently, this mechanism is only utilized in :ref:`multi-modal models <multi_modality>` for preprocessing multi-modal input
+ Currently, this mechanism is only utilized in :ref:`multi-modal <multi_modality>` models for preprocessing multi-modal input
data in addition to input prompt, but it can be extended to text-only language models when needed.

Guides
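Not part of the diff: the override mechanism this hunk describes is normally wired up with a decorator on the model class. Below is a minimal sketch, under the assumption that the registry and type names (``INPUT_REGISTRY``, ``InputContext``, ``LLMInputs``) match the vLLM version this commit targets; the model class and processor are hypothetical.

.. code-block:: python

    import torch.nn as nn

    from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs


    def my_input_processor(ctx: InputContext, llm_inputs: LLMInputs) -> LLMInputs:
        # Hypothetical processor: rewrite the prompt or token ids before they reach
        # the model, e.g. expand an image placeholder into the right number of
        # feature tokens. Here it simply passes the inputs through unchanged.
        return llm_inputs


    @INPUT_REGISTRY.register_input_processor(my_input_processor)
    class MyModelForCausalLM(nn.Module):  # hypothetical model class
        ...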
@@ -7,25 +7,17 @@ Multi-Modality

vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.

- :class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
- which allows you to pass in multi-modal input alongside text and token prompts.
+ Multi-modal input can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
+ via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptStrictInputs`.

.. note::
    ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
-     :class:`vllm.multimodal.MULTIMODAL_REGISTRY`.
+     the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.

- By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, please follow :ref:`the guide for adding a new multimodal model. <adding_a_new_multimodal_model>`.
+ To implement a new multi-modal model in vLLM, please follow :ref:`this guide <enabling_multimodal_inputs>`.

- # TODO: Add more instructions on how to do that once embeddings is in.
-
- Guides
- ++++++
-
- .. toctree::
-     :maxdepth: 1
-
-     adding_multimodal_model
+ ..
+     TODO: Add more instructions on how to add new plugins once embeddings is in.

Module Contents
+++++++++++++++
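For orientation (not part of the commit): at the call site, the ``multi_modal_data`` field described above is passed together with the text prompt. The model name, prompt template, and especially the value format under ``multi_modal_data`` are assumptions that vary across vLLM releases (older releases used wrapper classes, later ones a modality-keyed dict), so treat this purely as a sketch.

.. code-block:: python

    from PIL import Image

    from vllm import LLM, SamplingParams

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")  # hypothetical vision-language model
    image = Image.open("example.jpg")

    outputs = llm.generate(
        {
            "prompt": "USER: <image>\nWhat is in this image?\nASSISTANT:",
            # Assumed format: a modality-keyed dict; check your vLLM version's docs.
            "multi_modal_data": {"image": image},
        },
        SamplingParams(max_tokens=64),
    )
    print(outputs[0].outputs[0].text)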
@@ -92,6 +92,7 @@ Documentation

   models/supported_models
   models/adding_model
+  models/enabling_multimodal_inputs
   models/engine_args
   models/lora
   models/vlm
@@ -116,6 +117,7 @@ Documentation

   automatic_prefix_caching/details

.. toctree::
   :maxdepth: 2
   :caption: Developer Documentation

   dev/sampling_params
@@ -10,6 +10,10 @@ This document provides a high-level guide on integrating a `HuggingFace Transfor

The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.

+ .. note::
+     By default, vLLM models do not support multi-modal inputs. To enable multi-modal support,
+     please follow :ref:`this guide <enabling_multimodal_inputs>` after implementing the model here.

.. tip::
    If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
    We will be happy to help you out!
@@ -44,23 +48,23 @@ Next, you need to rewrite the :meth:`~torch.nn.Module.forward` method of your mo

.. code-block:: diff

    def forward(
        self,
        input_ids: torch.Tensor,
    -   attention_mask: Optional[torch.Tensor] = None,
    -   position_ids: Optional[torch.LongTensor] = None,
    -   past_key_values: Optional[List[torch.FloatTensor]] = None,
    -   inputs_embeds: Optional[torch.FloatTensor] = None,
    -   labels: Optional[torch.LongTensor] = None,
    -   use_cache: Optional[bool] = None,
    -   output_attentions: Optional[bool] = None,
    -   output_hidden_states: Optional[bool] = None,
    -   return_dict: Optional[bool] = None,
    - ) -> Union[Tuple, CausalLMOutputWithPast]:
    +   positions: torch.Tensor,
    +   kv_caches: List[torch.Tensor],
    +   attn_metadata: AttentionMetadata,
    + ) -> Optional[SamplerOutput]:

1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
2. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture.
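As a rough illustration of what the rewritten method ends up looking like, here is a toy attention module in the style of vLLM model code. The module, sizes, and tensor shapes are hypothetical; note also that newer vLLM versions expose paged attention through the unified ``Attention`` layer rather than the ``PagedAttention*`` classes named in the hunk, so the import and call below are an approximation for the version you are targeting.

.. code-block:: python

    import torch
    import torch.nn as nn

    from vllm.attention import Attention, AttentionMetadata


    class ToySelfAttention(nn.Module):
        # Hypothetical self-attention block adapted to vLLM's calling convention:
        # hidden_states is flattened to [num_tokens, hidden_size], while the KV cache
        # for this layer and the attention metadata are supplied by the model runner.

        def __init__(self, hidden_size: int = 256, num_heads: int = 4):
            super().__init__()
            head_size = hidden_size // num_heads
            self.qkv_proj = nn.Linear(hidden_size, 3 * hidden_size)
            self.o_proj = nn.Linear(hidden_size, hidden_size)
            self.attn = Attention(num_heads=num_heads,
                                  head_size=head_size,
                                  scale=head_size ** -0.5)

        def forward(
            self,
            hidden_states: torch.Tensor,
            kv_cache: torch.Tensor,
            attn_metadata: AttentionMetadata,
        ) -> torch.Tensor:
            q, k, v = self.qkv_proj(hidden_states).chunk(3, dim=-1)
            attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
            return self.o_proj(attn_output)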
@@ -1,26 +1,21 @@

- .. _adding_a_new_multimodal_model:
+ .. _enabling_multimodal_inputs:

- Adding a New Multimodal Model
- =============================
+ Enabling Multimodal Inputs
+ ==========================

- This document provides a high-level guide on integrating a :ref:`multi-modal model <multi_modality>` into vLLM.
+ This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal <multi_modality>` inputs.

- .. note::
-     The complexity of adding a new model depends heavily on the model's architecture.
-     The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
-     However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
-
- .. tip::
-     If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
-     We will be happy to help you out!
+ .. seealso::
+     :ref:`adding_a_new_model`


- 1. Set up the base vLLM model
+ 1. Update the base vLLM model
-----------------------------

- As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model in vLLM, but note the following:
+ It is assumed that you have already implemented the model in vLLM according to :ref:`these steps <adding_a_new_model>`.
+ Further update the model as follows:

- - You should additionally implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.
+ - Implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.

  .. code-block:: diff
@@ -33,19 +28,19 @@ As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model

    The model class does not have to be named :code:`*ForCausalLM`.
    Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.

- - While implementing the :meth:`~torch.nn.Module.forward` method, reserve a keyword parameter
+ - If you haven't already done so, reserve a keyword parameter in :meth:`~torch.nn.Module.forward`
  for each input tensor that corresponds to a multi-modal input, as shown in the following example:

  .. code-block:: diff

      def forward(
          self,
          input_ids: torch.Tensor,
          positions: torch.Tensor,
          kv_caches: List[torch.Tensor],
          attn_metadata: AttentionMetadata,
      +   pixel_values: torch.Tensor,
      ) -> SamplerOutput:


2. Register input mappers
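To make the two steps above concrete, here is a minimal sketch of a model class that implements the interface and reserves a keyword parameter for image inputs. The class name, constructor signature, and the way ``pixel_values`` is consumed are hypothetical; only the interface and the parameter list mirror the hunks above.

.. code-block:: python

    from typing import List, Optional

    import torch
    import torch.nn as nn

    from vllm.attention import AttentionMetadata
    from vllm.model_executor.models.interfaces import SupportsVision


    class MyModelForConditionalGeneration(nn.Module, SupportsVision):  # hypothetical model
        def __init__(self, config, multimodal_config, cache_config=None, quant_config=None):
            super().__init__()
            self.multimodal_config = multimodal_config
            # ... vision encoder, projector and language model would be built here ...

        def forward(
            self,
            input_ids: torch.Tensor,
            positions: torch.Tensor,
            kv_caches: List[torch.Tensor],
            attn_metadata: AttentionMetadata,
            pixel_values: Optional[torch.Tensor] = None,  # keyword reserved for image inputs
        ) -> torch.Tensor:
            # When pixel_values is provided, its embeddings would be merged into the
            # token embeddings before the language model runs; omitted in this sketch.
            ...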
@@ -68,8 +63,8 @@ A default mapper is available for each modality in the core vLLM library. This i

    :ref:`input_processing_pipeline`


- 3. Register maximum number of multimodal tokens
- ----------------------------------------------------------
+ 3. Register maximum number of multi-modal tokens
+ ------------------------------------------------

For each modality type that the model accepts as input, calculate the maximum possible number of tokens
and register it via :meth:`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_max_multimodal_tokens>`.
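For a ViT-style image encoder, the "maximum possible number of tokens" is usually just the patch-grid size. The numbers below correspond to a CLIP ViT-L/14 tower at 336x336 (the LLaVA-1.5 setup); the registration shown in the comment is hypothetical, since the exact registry method can differ between vLLM versions.

.. code-block:: python

    # Worked example: maximum number of image tokens for a 336x336 input with 14x14 patches.
    image_size = 336
    patch_size = 14
    max_image_tokens = (image_size // patch_size) ** 2  # 24 * 24 = 576
    print(max_image_tokens)

    # Hypothetical registration on the model class (method name assumed; check your
    # vLLM version for the exact multimodal-registry API):
    #
    # @MULTIMODAL_REGISTRY.register_max_image_tokens(max_image_tokens)
    # class MyModelForConditionalGeneration(nn.Module, SupportsVision): ...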
@@ -192,7 +192,7 @@ Vision Language Models

      -

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
- Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Adding a New Multimodal Model <adding_a_new_multimodal_model>`
+ Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>`
for instructions on how to implement support for your model.
Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ project.
@@ -141,7 +141,7 @@ class InputRegistry:

        The model is identified by ``model_config``.

        See also:
-             :ref:`adding_a_new_multimodal_model`
+             :ref:`enabling_multimodal_inputs`
        """
        # Avoid circular import
        from vllm.model_executor.model_loader import get_model_architecture
@@ -162,8 +162,8 @@ class MultiModalPlugin(ABC):

        If `None` is provided, then the default input mapper is used instead.

        See also:
-             :ref:`input_processing_pipeline`
-             :ref:`adding_a_new_multimodal_model`
+             - :ref:`input_processing_pipeline`
+             - :ref:`enabling_multimodal_inputs`
        """

        def wrapper(model_cls: N) -> N:
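Since this docstring belongs to the input-mapper registration hook, here is a minimal sketch of how a custom mapper is typically attached to a model class. It is illustrative only: the mapper body and model class are hypothetical, and the assumption that the image-specific ``register_image_input_mapper`` wrapper accepts a custom callable should be checked against your vLLM version.

.. code-block:: python

    import torch.nn as nn

    from vllm.inputs import InputContext
    from vllm.multimodal import MULTIMODAL_REGISTRY


    def my_image_input_mapper(ctx: InputContext, data):
        # Hypothetical mapper: convert raw image data (e.g. a PIL image) into the
        # keyword tensors the model's forward() expects, such as pixel_values.
        raise NotImplementedError("sketch only")


    @MULTIMODAL_REGISTRY.register_image_input_mapper(my_image_input_mapper)
    class MyModelForConditionalGeneration(nn.Module):  # hypothetical model class
        ...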
@@ -192,7 +192,8 @@ class MultiModalPlugin(ABC):

            TypeError: If the data type is not supported.

        See also:
-             :ref:`adding_a_new_multimodal_model`
+             - :ref:`input_processing_pipeline`
+             - :ref:`enabling_multimodal_inputs`
        """
        # Avoid circular import
        from vllm.model_executor.model_loader import get_model_architecture
@@ -230,7 +231,7 @@ class MultiModalPlugin(ABC):

        If `None` is provided, then the default calculation is used instead.

        See also:
-             :ref:`adding_a_new_multimodal_model`
+             :ref:`enabling_multimodal_inputs`
        """

        def wrapper(model_cls: N) -> N:
@@ -260,7 +261,7 @@ class MultiModalPlugin(ABC):

        The model is identified by ``model_config``.

        See also:
-             :ref:`adding_a_new_multimodal_model`
+             :ref:`enabling_multimodal_inputs`
        """
        # Avoid circular import
        from vllm.model_executor.model_loader import get_model_architecture