[Doc] Update LLaVA docs (#5437)
Co-authored-by: Roger Wang <ywang@roblox.com>
parent 39873476f8
commit 0ce7b952f8
@@ -20,9 +20,9 @@ The following :ref:`engine arguments <engine_args>` are specific to VLMs:
 Currently, the support for vision language models on vLLM has the following limitations:
 
 * Only single image input is supported per text prompt.
-* Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means model output might not exactly match the HuggingFace implementation.
+* Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means our LLaVA-NeXT output may not exactly match the huggingface implementation.
 
-We are continuously improving user & developer experience for VLMs. Please raise an issue on GitHub if you have any feedback or feature requests.
+We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
 
 Offline Batched Inference
 -------------------------
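The static ``image_input_shape`` limitation above comes from the preprocessing side: LLaVA-1.5 uses a CLIP ViT-L/14 vision tower at 336px, so every image is normalized to a fixed 336x336 before it reaches the model. A minimal sketch with the HuggingFace image processor (the checkpoint name and sample resolution are illustrative, not taken from the commit):

    from PIL import Image
    from transformers import CLIPImageProcessor

    # LLaVA-1.5's vision tower is CLIP ViT-L/14 at 336px, so the processor
    # resizes/crops every input to a fixed 336x336 resolution.
    processor = CLIPImageProcessor.from_pretrained(
        "openai/clip-vit-large-patch14-336")
    image = Image.new("RGB", (1920, 1080))  # any input resolution
    pixel_values = processor(image, return_tensors="pt")["pixel_values"]
    print(pixel_values.shape)  # torch.Size([1, 3, 336, 336])

For LLaVA-NeXT, which natively picks a variable grid of 336x336 tiles per image, pinning the shape this way is exactly why output can drift from the HuggingFace implementation, as the updated note says.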
@@ -227,7 +227,7 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
         attn_metadata: AttentionMetadata,
         **kwargs: object,
     ) -> SamplerOutput:
-        """Run forward pass for Llava 1.5.
+        """Run forward pass for LLaVA-1.5.
 
         One key thing to understand is the `input_ids` already accounts for the
         positions of the to-be-inserted image embeddings.
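To make "the `input_ids` already accounts for the positions of the to-be-inserted image embeddings" concrete: at this point in vLLM's multi-modal support the prompt itself carried one ``<image>`` placeholder token per image-feature position, i.e. 576 of them for LLaVA-1.5 (a 336x336 image split into 24x24 patches). A rough sketch with the HuggingFace tokenizer; the checkpoint name and the repeated-placeholder prompt format are assumptions based on the vLLM docs of this era, not part of the commit:

    from transformers import AutoTokenizer

    # 576 = (336 // 14) ** 2 patch positions for a 336x336 image with a
    # ViT-L/14 tower; each position gets an <image> placeholder (id 32000).
    tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-1.5-7b-hf")
    prompt = ("<image>" * 576 +
              "\nUSER: What's the content of the image?\nASSISTANT:")
    input_ids = tokenizer(prompt).input_ids
    print(input_ids.count(32000))  # 576 slots reserved for image embeddings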
@@ -247,22 +247,25 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
 
-        The model takes two types of image inputs:
-        PIXEL_VALUES and IMAGE_FEATURES.
-        The following shows how each maps to huggingface implementation.
-        PIXEL_VALUES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
-        IMAGE_FEATURES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
-        before going through the multi modal projector.
+        This model has two modes of image inputs:
+        `PIXEL_VALUES` and `IMAGE_FEATURES`.
 
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
-            pixel_values: For PIXEL_VALUES, expects a batch with shape
-                [1, 3, 336, 336].
-            image_features: For IMAGE_FEATURES, expects a batch with shape
-                [1, 576, 1024].
+            pixel_values: The pixels in each input image.
+                Expects a batch with shape `[1, 3, 336, 336]`.
+                (Only applicable to `PIXEL_VALUES` mode)
+            image_features: The image features for each input image outputted by
+                the vision tower before passing to the multi-modal projector.
+                Expects a batch with shape `[1, 576, 1024]`.
+                (Only applicable to `IMAGE_FEATURES` mode)
+
+        See also:
+            Each input maps to huggingface implementation, as follows:
+
+            - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L360
+            - `image_features`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L437
         """
         image_input = self._parse_and_validate_image_input(**kwargs)
 
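The two shapes in the rewritten docstring are related through the vision tower: a `[1, 3, 336, 336]` PIXEL_VALUES batch becomes a `[1, 576, 1024]` IMAGE_FEATURES batch after the CLIP encoder (576 patch tokens, hidden size 1024), which is what the `image_features` path feeds straight into the multi-modal projector. A minimal sketch with the standalone HuggingFace vision model; the checkpoint name and the feature selection (penultimate layer, CLS token dropped) are stated here as assumptions about the reference LLaVA-1.5 setup:

    import torch
    from transformers import CLIPVisionModel

    # CLIP ViT-L/14 at 336px: (336 // 14) ** 2 = 576 patch tokens with
    # hidden size 1024 -- hence the [1, 576, 1024] IMAGE_FEATURES shape.
    vision_tower = CLIPVisionModel.from_pretrained(
        "openai/clip-vit-large-patch14-336")

    pixel_values = torch.zeros(1, 3, 336, 336)  # PIXEL_VALUES-mode input
    with torch.no_grad():
        out = vision_tower(pixel_values, output_hidden_states=True)

    # LLaVA-1.5 takes the penultimate hidden layer and drops the CLS token
    # before handing the features to the multi-modal projector.
    image_features = out.hidden_states[-2][:, 1:]  # IMAGE_FEATURES-mode input
    print(image_features.shape)  # torch.Size([1, 576, 1024])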
@@ -108,15 +108,6 @@ def _image_pixel_processor(
 @MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
 class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
-    """
-    Args to `forward()`:
-        input_ids: Flattened (concatenated) input_ids corresponding to a
-            batch.
-        pixel_values: For PIXEL_VALUES, expects a batch with shape
-            [1, num_patches, 3, 336, 336].
-        image_features: For IMAGE_FEATURES, expects a batch with shape
-            [1, num_patches, 1176, 1024].
-    """
 
     def __init__(self,
                  config: LlavaNextConfig,
@@ -355,7 +346,7 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
         attn_metadata: AttentionMetadata,
         **kwargs: object,
     ) -> SamplerOutput:
-        """Run forward pass for Llava 1.5.
+        """Run forward pass for LlaVA-NeXT.
 
         One key thing to understand is the `input_ids` already accounts for the
         positions of the to-be-inserted image embeddings.
@@ -375,22 +366,19 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
 
-        The model takes two types of image inputs:
-        PIXEL_VALUES and IMAGE_FEATURES.
-        The following shows how each maps to huggingface implementation.
-        PIXEL_VALUES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
-        IMAGE_FEATURES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
-        before going through the multi modal projector.
-
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
-            pixel_values: For PIXEL_VALUES, expects a batch with shape
-                [1, 3, 336, 336].
-            image_features: For IMAGE_FEATURES, expects a batch with shape
-                [1, 576, 1024].
+            pixel_values: The pixels in each grid patch for each input image.
+                Expects a batch with shape `[1, num_patches, 3, 336, 336]`.
+            image_sizes: The original `(width, height)` for each input image.
+                Expects a batch with shape `[1, 2]`.
+
+        See also:
+            Each input maps to huggingface implementation, as follows:
+
+            - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L690
+            - `image_sizes`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L691
         """
         image_input = self._parse_and_validate_image_input(**kwargs)
 
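For LLaVA-NeXT, the extra `num_patches` dimension in `pixel_values` and the per-image `image_sizes` entry documented above come straight out of the HuggingFace preprocessor, which tiles each image into 336x336 grid patches plus a resized overview image. A small sketch; the checkpoint name and sample resolution are illustrative, and the exact `num_patches` depends on the image's aspect ratio:

    from PIL import Image
    from transformers import LlavaNextImageProcessor

    # LLaVA-NeXT ("anyres") picks a grid of 336x336 tiles per image, so the
    # batch gains a num_patches dimension and keeps the original image size.
    processor = LlavaNextImageProcessor.from_pretrained(
        "llava-hf/llava-v1.6-vicuna-7b-hf")
    image = Image.new("RGB", (1280, 720))
    inputs = processor(image, return_tensors="pt")

    print(inputs["pixel_values"].shape)  # [1, num_patches, 3, 336, 336]
    print(inputs["image_sizes"].shape)   # [1, 2] -- one size pair per image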