From 2f652e6cdf09da407e78a60327832f913b32e26e Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Wed, 1 Oct 2025 02:58:29 +0800
Subject: [PATCH] [Doc] Improve MM Pooling model documentation (#25966)

Signed-off-by: DarkLight1337
---
 docs/features/multimodal_inputs.md            |   2 +-
 docs/models/supported_models.md               |  28 ++-
 docs/serving/openai_compatible_server.md      |  60 ++++--
 .../vision_language_pooling.py                |  85 +++++++-
 ...ai_chat_embedding_client_for_multimodal.py | 197 ++++++++++++------
 ...vec.jinja => template_vlm2vec_phi3v.jinja} |   0
 examples/template_vlm2vec_qwen2vl.jinja       |  15 ++
 .../pooling/openai/test_vision_embedding.py   |   2 +-
 tests/entrypoints/test_chat_utils.py          |   3 +-
 9 files changed, 292 insertions(+), 100 deletions(-)
 rename examples/{template_vlm2vec.jinja => template_vlm2vec_phi3v.jinja} (100%)
 create mode 100644 examples/template_vlm2vec_qwen2vl.jinja

diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index bcc48e7560462..b61a54d3ca7fd 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -428,7 +428,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
     If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
 
     For certain models, we provide alternative chat templates inside <gh-dir:examples>.
-    For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
+    For example, VLM2Vec uses <gh-file:examples/template_vlm2vec_phi3v.jinja> which is different from the default one for Phi-3-Vision.
 
 ### Image Inputs
 
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index d720fa2458e1d..18c8b7846cb2d 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -626,7 +626,29 @@ See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inp
 
 For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g., `--limit-mm-per-prompt '{"image":0}'`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache.
 
 !!! note
-    vLLM currently only supports adding LoRA to the language backbone of multimodal models.
+    vLLM currently only supports dynamic LoRA adapters on the language backbone of multimodal models.
+    If you wish to use a model with LoRA in the multi-modal encoder,
+    please merge the weights into the base model first before running it in vLLM like a regular model.
+
+    ```python
+    from peft import PeftConfig, PeftModel
+    from transformers import AutoModelForImageTextToText, AutoProcessor
+
+    def merge_and_save(model_id: str, output_dir: str):
+        base_model = AutoModelForImageTextToText.from_pretrained(model_id)
+        lora_model = PeftModel.from_pretrained(
+            base_model,
+            model_id,
+            config=PeftConfig.from_pretrained(model_id),
+        )
+        model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
+        model._hf_peft_config_loaded = False  # Needed to save the merged model
+
+        processor = AutoProcessor.from_pretrained(model_id)
+
+        model.save_pretrained(output_dir)
+        processor.save_pretrained(output_dir)
+    ```
 
 ### Generative Models
 
@@ -805,8 +827,8 @@ The following table lists those that are tested in vLLM.
 
 | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
 |--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | |
-| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | |
+| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | ✅︎ |
+| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | ✅︎ |
 | `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | \* |
 
 <sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
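As an aside (illustration only, not part of this patch): the note above tells readers to run the merged checkpoint "like a regular model" but stops short of showing that step. A minimal offline sketch, assuming a hypothetical output directory `./vlm2vec-qwen2vl-merged` written by `merge_and_save` and a vLLM version that exposes `LLM.embed` and the `runner` engine argument:

```python
# Illustrative sketch: embed a text query with the merged checkpoint.
# "./vlm2vec-qwen2vl-merged" is the hypothetical `output_dir` passed to merge_and_save().
from vllm import LLM

llm = LLM(
    model="./vlm2vec-qwen2vl-merged",
    runner="pooling",  # load the checkpoint as an embedding (pooling) model
    max_model_len=4096,
)

(output,) = llm.embed(
    "Find me an everyday image that matches the given caption: a cat playing piano"
)
print("Embedding dimension:", len(output.outputs.embedding))
```

The same directory can also be passed to `vllm serve` with `--runner pooling` to expose it over the OpenAI-compatible server covered in the next file.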
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 1ffe9c9ade208..fe0e1e3df378b 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -236,11 +236,33 @@ The following extra parameters are supported:
 
 Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
 
-If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
-which will be treated as a single prompt to the model.
-
 Code example:
 
+If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
+which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
+
+??? code
+
+    ```python
+    from typing import Literal, Union
+
+    from openai import OpenAI
+    from openai._types import NOT_GIVEN, NotGiven
+    from openai.types.chat import ChatCompletionMessageParam
+    from openai.types.create_embedding_response import CreateEmbeddingResponse
+
+    def create_chat_embeddings(
+        client: OpenAI,
+        *,
+        messages: list[ChatCompletionMessageParam],
+        model: str,
+        encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
+    ) -> CreateEmbeddingResponse:
+        return client.post(
+            "/embeddings",
+            cast_to=CreateEmbeddingResponse,
+            body={"messages": messages, "model": model, "encoding_format": encoding_format},
+        )
+    ```
+
 #### Multi-modal inputs
 
 You can pass multi-modal inputs to embedding models by defining a custom chat template for the server
@@ -254,7 +276,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
     vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
       --trust-remote-code \
       --max-model-len 4096 \
-      --chat-template examples/template_vlm2vec.jinja
+      --chat-template examples/template_vlm2vec_phi3v.jinja
     ```
 
 !!! important
@@ -262,34 +284,36 @@ and passing a list of `messages` in the request. Refer to the examples below for
         to run this model in embedding mode instead of text generation mode.
 
         The custom chat template is completely different from the original one for this model,
-        and can be found here: <gh-file:examples/template_vlm2vec.jinja>
+        and can be found here: <gh-file:examples/template_vlm2vec_phi3v.jinja>
 
-    Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
+    Since the request schema is not defined by the OpenAI client, we use the `create_chat_embeddings` convenience function defined above to post the request:
 
     ??? code
 
        ```python
-        import requests
-
+        from openai import OpenAI
+
+        client = OpenAI(
+            base_url="http://localhost:8000/v1",
+            api_key="EMPTY",
+        )
+
         image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
-        response = requests.post(
-            "http://localhost:8000/v1/embeddings",
-            json={
-                "model": "TIGER-Lab/VLM2Vec-Full",
-                "messages": [{
+        response = create_chat_embeddings(
+            client,
+            model="TIGER-Lab/VLM2Vec-Full",
+            messages=[
+                {
                     "role": "user",
                     "content": [
                         {"type": "image_url", "image_url": {"url": image_url}},
                         {"type": "text", "text": "Represent the given image."},
                     ],
-                }],
-                "encoding_format": "float",
-            },
+                }
+            ],
+            encoding_format="float",
         )
-        response.raise_for_status()
-        response_json = response.json()
 
-        print("Embedding output:", response_json["data"][0]["embedding"])
+        print("Image embedding output:", response.data[0].embedding)
        ```
 
 === "DSE-Qwen2-MRL"
diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/offline_inference/vision_language_pooling.py
index 0cc0c1e708b12..3d1daf4d19ff8 100644
--- a/examples/offline_inference/vision_language_pooling.py
+++ b/examples/offline_inference/vision_language_pooling.py
@@ -10,6 +10,7 @@ on HuggingFace model repository.
 
 from argparse import Namespace
 from dataclasses import asdict
+from pathlib import Path
 from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
 
 from PIL.Image import Image
@@ -19,6 +20,9 @@
 from vllm.entrypoints.score_utils import ScoreMultiModalParam
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser
 
+ROOT_DIR = Path(__file__).parent.parent.parent
+EXAMPLES_DIR = ROOT_DIR / "examples"
+
 
 class TextQuery(TypedDict):
     modality: Literal["text"]
@@ -82,23 +86,27 @@ def run_e5_v(query: Query) -> ModelRequestData:
     )
 
 
-def run_vlm2vec(query: Query) -> ModelRequestData:
+def _get_vlm2vec_prompt_image(query: Query, image_token: str):
     if query["modality"] == "text":
         text = query["text"]
         prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
         image = None
     elif query["modality"] == "image":
-        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
+        prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image."  # noqa: E501
         image = query["image"]
     elif query["modality"] == "text+image":
         text = query["text"]
-        prompt = (
-            f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
-        )
+        prompt = f"{image_token} Represent the given image with the following question: {text}"  # noqa: E501
         image = query["image"]
     else:
         modality = query["modality"]
-        raise ValueError(f"Unsupported query modality: '{modality}'")
+        raise ValueError(f"Unsupported query modality: {modality!r}")
+
+    return prompt, image
+
+
+def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
+    prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
 
     engine_args = EngineArgs(
         model="TIGER-Lab/VLM2Vec-Full",
@@ -116,6 +124,66 @@
     )
 
 
+def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
+    # vLLM does not support LoRA adapters on multi-modal encoder,
+    # so we merge the weights first
+    from huggingface_hub.constants import HF_HUB_CACHE
+    from peft import PeftConfig, PeftModel
+    from transformers import AutoModelForImageTextToText, AutoProcessor
+
+    from vllm.entrypoints.chat_utils import load_chat_template
+
+    model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
+
+    base_model = AutoModelForImageTextToText.from_pretrained(model_id)
+    lora_model = PeftModel.from_pretrained(
+        base_model,
+        model_id,
+        config=PeftConfig.from_pretrained(model_id),
+    )
+    model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
+    model._hf_peft_config_loaded = False  # Needed to save the merged model
+
+    processor = AutoProcessor.from_pretrained(
+        model_id,
+        # `min_pixels` and `max_pixels` are deprecated
+        size={"shortest_edge": 3136, "longest_edge": 12845056},
+    )
+    processor.chat_template = load_chat_template(
+        # The original chat template is not correct
+        EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
+    )
+
+    merged_path = str(
+        Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
+    )
+    print(f"Saving merged model to {merged_path}...")
+    print(
+        "NOTE: This directory is not tracked by `huggingface_hub` "
+        "so you have to delete this manually if you don't want it anymore."
+ ) + model.save_pretrained(merged_path) + processor.save_pretrained(merged_path) + print("Done!") + + prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>") + + engine_args = EngineArgs( + model=merged_path, + runner="pooling", + max_model_len=4096, + trust_remote_code=True, + mm_processor_kwargs={"num_crops": 4}, + limit_mm_per_prompt={"image": 1}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image=image, + ) + + def run_jinavl_reranker(query: Query) -> ModelRequestData: if query["modality"] != "text+images": raise ValueError(f"Unsupported query modality: '{query['modality']}'") @@ -232,7 +300,8 @@ def run_score(model: str, modality: QueryModality, seed: Optional[int]): model_example_map = { "e5_v": run_e5_v, - "vlm2vec": run_vlm2vec, + "vlm2vec_phi3v": run_vlm2vec_phi3v, + "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl, "jinavl_reranker": run_jinavl_reranker, } @@ -246,7 +315,7 @@ def parse_args(): "--model-name", "-m", type=str, - default="vlm2vec", + default="vlm2vec_phi3v", choices=model_example_map.keys(), help="The name of the embedding model.", ) diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py index 30cb3325b9b18..6e31c3836806f 100644 --- a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py @@ -4,69 +4,137 @@ """Example Python client for multimodal embedding API using vLLM API server NOTE: start a supported multimodal embeddings model server with `vllm serve`, e.g. - vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024 + vllm serve TIGER-Lab/VLM2Vec-Full \ + --runner pooling \ + --trust-remote-code \ + --max-model-len 4096 \ + --chat-template examples/template_vlm2vec_phi3v.jinja """ import argparse import base64 import io +from typing import Literal, Union -import requests +from openai import OpenAI +from openai._types import NOT_GIVEN, NotGiven +from openai.types.chat import ChatCompletionMessageParam +from openai.types.create_embedding_response import CreateEmbeddingResponse from PIL import Image +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" -def vlm2vec(): - response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": "TIGER-Lab/VLM2Vec-Full", - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "Represent the given image."}, - ], - } - ], - "encoding_format": "float", - }, +def create_chat_embeddings( + client: OpenAI, + *, + messages: list[ChatCompletionMessageParam], + model: str, + encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN, +) -> CreateEmbeddingResponse: + """ + Convenience function for accessing vLLM's Chat Embeddings API, + which is an extension of OpenAI's existing Embeddings API. 
+ """ + return client.post( + "/embeddings", + cast_to=CreateEmbeddingResponse, + body={"messages": messages, "model": model, "encoding_format": encoding_format}, ) - response.raise_for_status() - response_json = response.json() - - print("Embedding output:", response_json["data"][0]["embedding"]) -def dse_qwen2_vl(inp: dict): - # Embedding an Image - if inp["type"] == "image": - messages = [ +def run_vlm2vec(client: OpenAI, model: str): + response = create_chat_embeddings( + client, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + } + ], + model=model, + encoding_format="float", + ) + + print("Image embedding output:", response.data[0].embedding) + + response = create_chat_embeddings( + client, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "text", + "text": "Represent the given image with the following question: What is in the image.", + }, + ], + } + ], + model=model, + encoding_format="float", + ) + + print("Image+Text embedding output:", response.data[0].embedding) + + response = create_chat_embeddings( + client, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "A cat and a dog"}, + ], + } + ], + model=model, + encoding_format="float", + ) + + print("Text embedding output:", response.data[0].embedding) + + +def run_dse_qwen2_vl(client: OpenAI, model: str): + response = create_chat_embeddings( + client, + messages=[ { "role": "user", "content": [ { "type": "image_url", "image_url": { - "url": inp["image_url"], + "url": image_url, }, }, {"type": "text", "text": "What is shown in this image?"}, ], } - ] - # Embedding a Text Query - else: - # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image - # of the minimum input size - buffer = io.BytesIO() - image_placeholder = Image.new("RGB", (56, 56)) - image_placeholder.save(buffer, "png") - buffer.seek(0) - image_placeholder = base64.b64encode(buffer.read()).decode("utf-8") - messages = [ + ], + model=model, + encoding_format="float", + ) + + print("Image embedding output:", response.data[0].embedding) + + # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image + # of the minimum input size + buffer = io.BytesIO() + image_placeholder = Image.new("RGB", (56, 56)) + image_placeholder.save(buffer, "png") + buffer.seek(0) + image_placeholder = base64.b64encode(buffer.read()).decode("utf-8") + response = create_chat_embeddings( + client, + messages=[ { "role": "user", "content": [ @@ -76,23 +144,21 @@ def dse_qwen2_vl(inp: dict): "url": f"data:image/jpeg;base64,{image_placeholder}", }, }, - {"type": "text", "text": f"Query: {inp['content']}"}, + {"type": "text", "text": "Query: What is the weather like today?"}, ], } - ] - - response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": "MrLight/dse-qwen2-2b-mrl-v1", - "messages": messages, - "encoding_format": "float", - }, + ], + model=model, + encoding_format="float", ) - response.raise_for_status() - response_json = response.json() - print("Embedding output:", response_json["data"][0]["embedding"]) + print("Text embedding output:", response.data[0].embedding) + + +model_example_map = { + "vlm2vec": run_vlm2vec, + "dse_qwen2_vl": run_dse_qwen2_vl, +} def parse_args(): @@ -103,29 +169,24 @@ def parse_args(): parser.add_argument( "--model", type=str, - choices=["vlm2vec", "dse_qwen2_vl"], + choices=model_example_map.keys(), required=True, - 
help="Which model to call.", + help="The name of the embedding model.", ) return parser.parse_args() def main(args): - if args.model == "vlm2vec": - vlm2vec() - elif args.model == "dse_qwen2_vl": - dse_qwen2_vl( - { - "type": "image", - "image_url": image_url, - } - ) - dse_qwen2_vl( - { - "type": "text", - "content": "What is the weather like today?", - } - ) + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model_id = models.data[0].id + + model_example_map[args.model](client, model_id) if __name__ == "__main__": diff --git a/examples/template_vlm2vec.jinja b/examples/template_vlm2vec_phi3v.jinja similarity index 100% rename from examples/template_vlm2vec.jinja rename to examples/template_vlm2vec_phi3v.jinja diff --git a/examples/template_vlm2vec_qwen2vl.jinja b/examples/template_vlm2vec_qwen2vl.jinja new file mode 100644 index 0000000000000..3ab099d8f546d --- /dev/null +++ b/examples/template_vlm2vec_qwen2vl.jinja @@ -0,0 +1,15 @@ +{%- if messages | length > 1 -%} + {{ raise_exception('Embedding models should only embed one message at a time') }} +{%- endif -%} + +{% set vars = namespace(parts=[]) %} +{%- for message in messages -%} + {%- for content in message['content'] -%} + {%- if content['type'] == 'text' -%} + {%- set vars.parts = vars.parts + [content['text']] %} + {%- elif content['type'] == 'image' -%} + {%- set vars.parts = vars.parts + ['<|image_pad|>'] %} + {%- endif -%} + {%- endfor -%} +{%- endfor -%} +{{ vars.parts | join(' ') }} diff --git a/tests/entrypoints/pooling/openai/test_vision_embedding.py b/tests/entrypoints/pooling/openai/test_vision_embedding.py index 48434e36eb265..a30413bc32987 100644 --- a/tests/entrypoints/pooling/openai/test_vision_embedding.py +++ b/tests/entrypoints/pooling/openai/test_vision_embedding.py @@ -14,7 +14,7 @@ from vllm.multimodal.utils import encode_image_base64, fetch_image MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MAXIMUM_IMAGES = 2 -vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja" +vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja" assert vlm2vec_jinja_path.exists() # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index a268f573ef905..85b305c2fa023 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -2468,7 +2468,8 @@ def test_resolve_content_format_fallbacks(model, expected_format): ("template_falcon.jinja", "string"), ("template_inkbot.jinja", "string"), ("template_teleflm.jinja", "string"), - ("template_vlm2vec.jinja", "openai"), + ("template_vlm2vec_phi3v.jinja", "openai"), + ("template_vlm2vec_qwen2vl.jinja", "openai"), ("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"), ("tool_chat_template_internlm2_tool.jinja", "string"),