From 4bed167768bd228b9251d77f7b338fb997d3aefe Mon Sep 17 00:00:00 2001 From: shineran96 Date: Fri, 11 Jul 2025 01:43:43 +0800 Subject: [PATCH] [Model][VLM] Support JinaVL Reranker (#20260) Signed-off-by: shineran96 --- .buildkite/test-pipeline.yaml | 2 +- docs/models/supported_models.md | 8 + docs/serving/openai_compatible_server.md | 54 +++++- ...mbedding.py => vision_language_pooling.py} | 96 +++++++++- ...enai_cross_encoder_score_for_multimodal.py | 60 ++++++ .../pooling/test_jinavl_reranker.py | 160 ++++++++++++++++ tests/models/registry.py | 3 + vllm/entrypoints/llm.py | 181 ++++++++++++------ vllm/entrypoints/openai/protocol.py | 24 ++- vllm/entrypoints/openai/serving_score.py | 156 ++++++++++----- vllm/entrypoints/score_utils.py | 160 +++++++++++++++- vllm/model_executor/models/config.py | 10 + vllm/model_executor/models/interfaces.py | 57 ++++++ vllm/model_executor/models/jina_vl.py | 150 +++++++++++++++ vllm/model_executor/models/registry.py | 1 + 15 files changed, 991 insertions(+), 131 deletions(-) rename examples/offline_inference/{vision_language_embedding.py => vision_language_pooling.py} (66%) create mode 100644 examples/online_serving/openai_cross_encoder_score_for_multimodal.py create mode 100644 tests/models/multimodal/pooling/test_jinavl_reranker.py create mode 100644 vllm/model_executor/models/jina_vl.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 148cf8074232f..af0bf2ae364ff 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -282,7 +282,7 @@ steps: - python3 offline_inference/llm_engine_example.py - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_embedding.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model 
facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 52c7fa9c0e8fc..ddc920aeb2dda 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -712,6 +712,14 @@ The following table lists those that are tested in vLLM. --- +#### Scoring + +Specified using `--task score`. + +| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------| +| `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ | + ## Model Support Policy At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index cebef2b6a2d60..2cf45eeaab4d4 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -537,7 +537,7 @@ The following extra parameters are supported: ### Score API -Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair. +Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. 
When using an embedding model the score corresponds to the cosine similarity between each embedding pair. Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1. You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). @@ -676,6 +676,55 @@ The total number of pairs is `len(text_2)`. } ``` +#### Multi-modal inputs + +You can pass multi-modal inputs to scoring models by passing `content` including a list of multi-modal inputs (image, etc.) in the request. Refer to the examples below for illustration. + +=== "JinaVL-Reranker" + + To serve the model: + + ```bash + vllm serve jinaai/jina-reranker-m0 + ``` + + Since the request schema is not defined by the OpenAI client, we post a request to the server using the lower-level `requests` library: + + ??? Code + + ```python + import requests + + response = requests.post( + "http://localhost:8000/v1/score", + json={ + "model": "jinaai/jina-reranker-m0", + "text_1": "slm markdown", + "text_2": { + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" + }, + }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + }, + }, + ] + } + }, + ) + response.raise_for_status() + response_json = response.json() + print("Scoring output:", response_json["data"][0]["score"]) + print("Scoring output:", response_json["data"][1]["score"]) + ``` +Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_multimodal.py> + #### Extra parameters The following [pooling parameters][pooling-params] are supported. @@ -695,8 +744,7 @@ The following extra parameters are supported: ### Re-rank API Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and -each of a list of documents. 
Usually, the score for a sentence pair refers to the similarity between two sentences, on -a scale of 0 to 1. +each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences or multi-modal inputs (image, etc.), on a scale of 0 to 1. You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_pooling.py similarity index 66% rename from examples/offline_inference/vision_language_embedding.py rename to examples/offline_inference/vision_language_pooling.py index 9451825f0b734..57963ebd2b100 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_pooling.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with -the correct prompt format on vision language models for multimodal embedding. +the correct prompt format on vision language models for multimodal pooling. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
@@ -15,6 +15,7 @@ from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args from PIL.Image import Image from vllm import LLM, EngineArgs +from vllm.entrypoints.score_utils import ScoreMultiModalParam from vllm.multimodal.utils import fetch_image from vllm.utils import FlexibleArgumentParser @@ -35,14 +36,22 @@ class TextImageQuery(TypedDict): image: Image -QueryModality = Literal["text", "image", "text+image"] -Query = Union[TextQuery, ImageQuery, TextImageQuery] +class TextImagesQuery(TypedDict): + modality: Literal["text+images"] + text: str + image: ScoreMultiModalParam + + +QueryModality = Literal["text", "image", "text+image", "text+images"] +Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery] class ModelRequestData(NamedTuple): engine_args: EngineArgs - prompt: str - image: Optional[Image] + prompt: Optional[str] = None + image: Optional[Image] = None + query: Optional[str] = None + documents: Optional[ScoreMultiModalParam] = None def run_e5_v(query: Query) -> ModelRequestData: @@ -107,6 +116,29 @@ def run_vlm2vec(query: Query) -> ModelRequestData: ) +def run_jinavl_reranker(query: Query) -> ModelRequestData: + if query["modality"] != "text+images": + raise ValueError(f"Unsupported query modality: '{query['modality']}'") + + engine_args = EngineArgs( + model="jinaai/jina-reranker-m0", + task="score", + max_model_len=32768, + trust_remote_code=True, + mm_processor_kwargs={ + "min_pixels": 3136, + "max_pixels": 602112, + }, + limit_mm_per_prompt={"image": 1}, + ) + + return ModelRequestData( + engine_args=engine_args, + query=query["text"], + documents=query["image"], + ) + + def get_query(modality: QueryModality): if modality == "text": return TextQuery(modality="text", text="A dog sitting in the grass") @@ -128,6 +160,28 @@ def get_query(modality: QueryModality): ), ) + if modality == "text+images": + return TextImagesQuery( + modality="text+images", + text="slm markdown", + image={ + "content": [ + { + "type": 
"image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" + }, + }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + }, + }, + ] + }, + ) + msg = f"Modality {modality} is not supported." raise ValueError(msg) @@ -162,16 +216,31 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): print("-" * 50) +def run_score(model: str, modality: QueryModality, seed: Optional[int]): + query = get_query(modality) + req_data = model_example_map[model](query) + + engine_args = asdict(req_data.engine_args) | {"seed": seed} + llm = LLM(**engine_args) + + outputs = llm.score(req_data.query, req_data.documents) + + print("-" * 30) + print([output.outputs.score for output in outputs]) + print("-" * 30) + + model_example_map = { "e5_v": run_e5_v, "vlm2vec": run_vlm2vec, + "jinavl_reranker": run_jinavl_reranker, } def parse_args(): parser = FlexibleArgumentParser( description="Demo on using vLLM for offline inference with " - "vision language models for multimodal embedding" + "vision language models for multimodal pooling tasks." 
) parser.add_argument( "--model-name", @@ -181,6 +250,14 @@ def parse_args(): choices=model_example_map.keys(), help="The name of the embedding model.", ) + parser.add_argument( + "--task", + "-t", + type=str, + default="embedding", + choices=["embedding", "scoring"], + help="The task type.", + ) parser.add_argument( "--modality", type=str, @@ -198,7 +275,12 @@ def parse_args(): def main(args: Namespace): - run_encode(args.model_name, args.modality, args.seed) + if args.task == "embedding": + run_encode(args.model_name, args.modality, args.seed) + elif args.task == "scoring": + run_score(args.model_name, args.modality, args.seed) + else: + raise ValueError(f"Unsupported task: {args.task}") if __name__ == "__main__": diff --git a/examples/online_serving/openai_cross_encoder_score_for_multimodal.py b/examples/online_serving/openai_cross_encoder_score_for_multimodal.py new file mode 100644 index 0000000000000..e49905a864c1d --- /dev/null +++ b/examples/online_serving/openai_cross_encoder_score_for_multimodal.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Example online usage of Score API. + +Run `vllm serve --task score` to start up the server in vLLM. 
+""" + +import argparse +import pprint + +import requests + + +def post_http_request(prompt: dict, api_url: str) -> requests.Response: + headers = {"User-Agent": "Test Client"} + response = requests.post(api_url, headers=headers, json=prompt) + return response + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model", type=str, default="jinaai/jina-reranker-m0") + return parser.parse_args() + + +def main(args): + api_url = f"http://{args.host}:{args.port}/score" + model_name = args.model + + text_1 = "slm markdown" + text_2 = { + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" + }, + }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + }, + }, + ] + } + prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + score_response = post_http_request(prompt=prompt, api_url=api_url) + print("\nPrompt when text_1 is string and text_2 is a image list:") + pprint.pprint(prompt) + print("\nScore Response:") + pprint.pprint(score_response.json()) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/tests/models/multimodal/pooling/test_jinavl_reranker.py b/tests/models/multimodal/pooling/test_jinavl_reranker.py new file mode 100644 index 0000000000000..83d6ab8e40395 --- /dev/null +++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py @@ -0,0 +1,160 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoModel + +model_name = "jinaai/jina-reranker-m0" + +mm_processor_kwargs = { + "min_pixels": 3136, + "max_pixels": 602112, +} + +limit_mm_per_prompt = {"image": 2} + + +def 
vllm_reranker(model_name, + query, + documents, + query_type="text", + doc_type="text"): + from vllm import LLM + + model = LLM( + model=model_name, + task="score", + max_model_len=32768, + mm_processor_kwargs=mm_processor_kwargs, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + + def create_image_param(url: str): + return {"type": "image_url", "image_url": {"url": f"{url}"}} + + if query_type == "image": + query = {"content": [create_image_param(url) for url in query]} + + if doc_type == "image": + documents = {"content": [create_image_param(url) for url in documents]} + + outputs = model.score(query, documents) + + return [output.outputs.score for output in outputs] + + +def hf_reranker(model_name, + query, + documents, + query_type="text", + doc_type="text"): + + checkpoint_to_hf_mapper = { + "visual.": "model.visual.", + "model.": "model.language_model.", + } + + model = AutoModel.from_pretrained( + model_name, + torch_dtype="auto", + trust_remote_code=True, + key_mapping=checkpoint_to_hf_mapper).to("cuda").eval() + + data_pairs = [[query[0], d] for d in documents] + + scores = model.compute_score(data_pairs, + max_length=2048, + query_type=query_type, + doc_type=doc_type) + return scores + + +# Visual Documents Reranking +@pytest.mark.parametrize("model_name", [model_name]) +def test_model_text_image(model_name): + + query = ["slm markdown"] + documents = [ + "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png", + "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png", + ] + + hf_outputs = hf_reranker(model_name, query, documents, "text", "image") + vllm_outputs = vllm_reranker(model_name, query, documents, "text", "image") + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02) + + +# Textual Documents Reranking +@pytest.mark.parametrize("model_name", [model_name]) +def test_model_text_text(model_name): 
+ + query = ["slm markdown"] + documents = [ + """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient + web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML + into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding + large language models. The models effectiveness results from two key innovations: (1) a three-stage + data synthesis pipeline that generates high quality, diverse training data by iteratively drafting, + refining, and critiquing web content extraction; and (2) a unified training framework combining + continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that + ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated + benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly + lower computational requirements.""", # noqa: E501 + "数据提取么?为什么不用正则啊,你用正则不就全解决了么?", + ] + + hf_outputs = hf_reranker(model_name, query, documents, "text", "text") + vllm_outputs = vllm_reranker(model_name, query, documents, "text", "text") + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02) + + +# Image Querying for Textual Documents +@pytest.mark.parametrize("model_name", [model_name]) +def test_model_image_text(model_name): + + query = [ + "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + ] + documents = [ + """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient + web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML + into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding + large language models. 
The models effectiveness results from two key innovations: (1) a three-stage + data synthesis pipeline that generates high quality, diverse training data by iteratively drafting, + refining, and critiquing web content extraction; and (2) a unified training framework combining + continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that + ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated + benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly + lower computational requirements.""", # noqa: E501 + "数据提取么?为什么不用正则啊,你用正则不就全解决了么?", + ] + + hf_outputs = hf_reranker(model_name, query, documents, "image", "text") + vllm_outputs = vllm_reranker(model_name, query, documents, "image", "text") + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02) + + +# Image Querying for Image Documents +@pytest.mark.parametrize("model_name", [model_name]) +def test_model_image_image(model_name): + + query = [ + "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + ] + documents = [ + "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png", + "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png", + ] + + hf_outputs = hf_reranker(model_name, query, documents, "image", "image") + vllm_outputs = vllm_reranker(model_name, query, documents, "image", + "image") + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02) diff --git a/tests/models/registry.py b/tests/models/registry.py index 04fff03862fce..5eb92c4639026 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -432,6 +432,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), # noqa: E501 
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 + + # [Cross-encoder] + "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501 } _SPECULATIVE_DECODING_EXAMPLE_MODELS = { diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 16c051d61de32..d5ecd7a864d6f 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -28,8 +28,11 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_mistral_chat_template, parse_chat_messages, resolve_chat_template_content_format) -from vllm.entrypoints.score_utils import (_cosine_similarity, - _validate_score_input_lens) +from vllm.entrypoints.score_utils import (ScoreContentPartParam, + ScoreMultiModalParam, + _cosine_similarity, + _validate_score_input_lens, + get_score_prompt) from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt from vllm.inputs.parse import parse_and_batch_prompt @@ -1187,8 +1190,8 @@ class LLM: def _cross_encoding_score( self, tokenizer: AnyTokenizer, - text_1: list[str], - text_2: list[str], + data_1: Union[list[str], list[ScoreContentPartParam]], + data_2: Union[list[str], list[ScoreContentPartParam]], truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, @@ -1199,10 +1202,8 @@ class LLM: raise ValueError( "Score API is only enabled for `--task embed or score`") - if len(text_1) == 1: - text_1 = text_1 * len(text_2) - - input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] + if len(data_1) == 1: + data_1 = data_1 * len(data_2) pooling_params = PoolingParams(use_cross_encoder=True) tokenization_kwargs: dict[str, Any] = {} @@ -1211,19 +1212,41 @@ class LLM: parsed_prompts = [] - for q, t in 
input_pairs: - if self.llm_engine.model_config.use_pad_token: - # cross_encoder models defaults to using pad_token. - prompt_inputs = tokenizer(text=q, - text_pair=t, - **tokenization_kwargs) - else: - # `llm as reranker` models defaults to not using pad_token. - prompt_inputs = tokenizer(text=q + t, **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - parsed_prompts.append(engine_prompt) + input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] + + if self.llm_engine.model_config.is_multimodal_model: + + model_config = self.llm_engine.model_config + + for q, d in input_pairs: + _, engine_prompt = get_score_prompt( + model_config=model_config, + data_1=q, + data_2=d, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + ) + + parsed_prompts.append(engine_prompt) + + else: + + for q, t in input_pairs: + if self.llm_engine.model_config.use_pad_token: + # cross_encoder models defaults to using pad_token. + prompt_inputs = tokenizer( + text=q, # type: ignore[arg-type] + text_pair=t, # type: ignore[arg-type] + **tokenization_kwargs) + else: + # `llm as reranker` models defaults to not using pad_token. 
+ prompt_inputs = tokenizer( + text=q + t, # type: ignore[operator] + **tokenization_kwargs) + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["input_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + parsed_prompts.append(engine_prompt) self._validate_and_add_requests( prompts=parsed_prompts, @@ -1241,8 +1264,10 @@ class LLM: def score( self, - text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]], - text_2: Union[SingletonPrompt, Sequence[SingletonPrompt]], + data_1: Union[SingletonPrompt, Sequence[SingletonPrompt], + ScoreMultiModalParam], + data_2: Union[SingletonPrompt, Sequence[SingletonPrompt], + ScoreMultiModalParam], /, *, truncate_prompt_tokens: Optional[int] = None, @@ -1250,22 +1275,30 @@ class LLM: lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ScoringRequestOutput]: - """Generate similarity scores for all pairs ``. + """Generate similarity scores for all pairs `` or + ``. The inputs can be `1 -> 1`, `1 -> N` or `N -> N`. - In the `1 - N` case the `text_1` sentence will be replicated `N` - times to pair with the `text_2` sentences. + In the `1 - N` case the `data_1` input will be replicated `N` + times to pair with the `data_2` inputs. The input pairs are used to build a list of prompts for the cross encoder model. This class automatically batches the prompts, considering the memory constraint. For the best performance, put all - of your texts into a single list and pass it to this method. + of your inputs into a single list and pass it to this method. + + Supports both text and multi-modal data (images, etc.) when used with + appropriate multi-modal models. For multi-modal inputs, ensure the + prompt structure matches the model's expected input format. 
Args: - text_1: can be a single prompt or a list of prompts, in which - case it has to have the same length as the `text_2` list - text_2: The texts to pair with the query to form the input - to the LLM. See [PromptType][vllm.inputs.PromptType] for - more details about the format of each prompts. + data_1: Can be a single prompt, a list of prompts or + `ScoreMultiModalParam`, which can contain either text or + multi-modal data. When a list, it must have the same length as + the `data_2` list. + data_2: The data to pair with the query to form the input to + the LLM. Can be text or multi-modal data. See [PromptType] + [vllm.inputs.PromptType] for more details about the format of + each prompt. use_tqdm: If `True`, shows a tqdm progress bar. If a callable (e.g., `functools.partial(tqdm, leave=False)`), it is used to create the progress bar. @@ -1306,42 +1339,70 @@ class LLM: # lists of tokens to the `text` and `text_pair` kwargs tokenizer = self.get_tokenizer() - def ensure_str(prompt: SingletonPrompt): - if isinstance(prompt, dict): - if "multi_modal_data" in prompt: - raise ValueError("Multi-modal prompt is not " - "supported for scoring") - elif "prompt_token_ids" in prompt: - prompt = tokenizer.decode( - cast(TokensPrompt, prompt)["prompt_token_ids"]) - elif "prompt" in prompt: - prompt = cast(TextPrompt, prompt)["prompt"] - assert type(prompt) is str - return prompt + if not self.llm_engine.model_config.is_multimodal_model: - if isinstance(text_1, (str, dict)): - # Convert a single prompt to a list. - text_1 = [text_1] - input_text_1: list[str] = [ensure_str(t) for t in text_1] + def check_data_type(data: Union[SingletonPrompt, + Sequence[SingletonPrompt], + ScoreMultiModalParam]): + if isinstance(data, dict) and "content" in data: + raise ValueError( + f"ScoreMultiModalParam is not supported for {self.llm_engine.model_config.architecture}", # noqa: E501 + ) - if isinstance(text_2, (str, dict)): - # Convert a single prompt to a list. 
- text_2 = [text_2] - input_text_2: list[str] = [ensure_str(t) for t in text_2] + check_data_type(data_1) + check_data_type(data_2) - _validate_score_input_lens(input_text_1, input_text_2) + def ensure_str(prompt: SingletonPrompt): + if isinstance(prompt, dict): + if "multi_modal_data" in prompt: + raise ValueError("Multi-modal prompt is not " + "supported for scoring") + elif "prompt_token_ids" in prompt: + prompt = tokenizer.decode( + cast(TokensPrompt, prompt)["prompt_token_ids"]) + elif "prompt" in prompt: + prompt = cast(TextPrompt, prompt)["prompt"] + assert type(prompt) is str + return prompt + + if isinstance(data_1, (str, dict)): + # Convert a single prompt to a list. + data_1 = [data_1] # type: ignore[list-item] + + data_1 = [ensure_str(t) for t in data_1] + + if isinstance(data_2, (str, dict)): + # Convert a single prompt to a list. + data_2 = [data_2] # type: ignore[list-item] + + data_2 = [ensure_str(t) for t in data_2] + + if isinstance(data_1, dict) and "content" in data_1: + data_1 = data_1.get("content") # type: ignore[assignment] + elif isinstance(data_1, str): + data_1 = [data_1] + + if isinstance(data_2, dict) and "content" in data_2: + data_2 = data_2.get("content") # type: ignore[assignment] + elif isinstance(data_2, str): + data_2 = [data_2] + + _validate_score_input_lens(data_1, data_2) # type: ignore[arg-type] if self.llm_engine.model_config.is_cross_encoder: - return self._cross_encoding_score(tokenizer, input_text_1, - input_text_2, - truncate_prompt_tokens, use_tqdm, - lora_request, - prompt_adapter_request) + return self._cross_encoding_score( + tokenizer, + data_1, # type: ignore[arg-type] + data_2, # type: ignore[arg-type] + truncate_prompt_tokens, + use_tqdm, + lora_request, + prompt_adapter_request) else: return self._embedding_score( tokenizer, - input_text_1, # type: ignore[arg-type] - input_text_2, # type: ignore[arg-type] + data_1, # type: ignore[arg-type] + data_2, # type: ignore[arg-type] truncate_prompt_tokens, use_tqdm, 
lora_request, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b3395c598e747..bfebe0ec0dc9a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -24,6 +24,7 @@ from typing_extensions import TypeAlias from vllm import envs from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, random_tool_call_id) +from vllm.entrypoints.score_utils import ScoreMultiModalParam from vllm.logger import init_logger from vllm.pooling_params import PoolingParams from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, @@ -1282,8 +1283,8 @@ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest] class ScoreRequest(OpenAIBaseModel): model: Optional[str] = None - text_1: Union[list[str], str] - text_2: Union[list[str], str] + text_1: Union[list[str], str, ScoreMultiModalParam] + text_2: Union[list[str], str, ScoreMultiModalParam] truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None # --8<-- [start:score-pooling-params] @@ -1291,6 +1292,12 @@ class ScoreRequest(OpenAIBaseModel): # --8<-- [end:score-pooling-params] # --8<-- [start:score-extra-params] + + mm_processor_kwargs: Optional[dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + priority: int = Field( default=0, description=( @@ -1308,8 +1315,8 @@ class ScoreRequest(OpenAIBaseModel): class RerankRequest(OpenAIBaseModel): model: Optional[str] = None - query: str - documents: list[str] + query: Union[str, ScoreMultiModalParam] + documents: Union[list[str], ScoreMultiModalParam] top_n: int = Field(default_factory=lambda: 0) truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None @@ -1318,6 +1325,12 @@ class RerankRequest(OpenAIBaseModel): # --8<-- [end:rerank-pooling-params] # --8<-- [start:rerank-extra-params] + + mm_processor_kwargs: Optional[dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to 
pass to the HF processor."), + ) + priority: int = Field( default=0, description=( @@ -1334,7 +1347,8 @@ class RerankRequest(OpenAIBaseModel): class RerankDocument(BaseModel): - text: str + text: Optional[str] = None + multi_modal: Optional[ScoreMultiModalParam] = None class RerankResult(BaseModel): diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 8b2e3e507c4db..b4fdbfcc7f604 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -17,8 +17,11 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument, ScoreResponseData, UsageInfo) from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.score_utils import (_cosine_similarity, - _validate_score_input_lens) +from vllm.entrypoints.score_utils import (ScoreContentPartParam, + ScoreMultiModalParam, + _cosine_similarity, + _validate_score_input_lens, + get_score_prompt) from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger @@ -137,11 +140,34 @@ class ServingScores(OpenAIServing): return final_res_batch + def _preprocess_score( + self, + request: Union[RerankRequest, ScoreRequest], + tokenizer: AnyTokenizer, + tokenization_kwargs: dict[str, Any], + data_1: Union[str, ScoreContentPartParam], + data_2: Union[str, ScoreContentPartParam], + ) -> tuple[str, TokensPrompt]: + + model_config = self.model_config + + full_prompt, engine_prompt = get_score_prompt( + model_config=model_config, + data_1=data_1, + data_2=data_2, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + ) + if request.mm_processor_kwargs is not None: + engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs + + return full_prompt, engine_prompt + async def _cross_encoding_score( self, tokenizer: AnyTokenizer, - texts_1: 
list[str], - texts_2: list[str], + data_1: Union[list[str], list[ScoreContentPartParam]], + data_2: Union[list[str], list[ScoreContentPartParam]], request: Union[RerankRequest, ScoreRequest], request_id=str, tokenization_kwargs: Optional[dict[str, Any]] = None, @@ -154,46 +180,67 @@ class ServingScores(OpenAIServing): request_prompts: list[str] = [] engine_prompts: list[TokensPrompt] = [] - if len(texts_1) == 1: - texts_1 = texts_1 * len(texts_2) - - input_pairs = [(t1, t2) for t1, t2 in zip(texts_1, texts_2)] + if len(data_1) == 1: + data_1 = data_1 * len(data_2) if isinstance(tokenizer, MistralTokenizer): raise ValueError( "MistralTokenizer not supported for cross-encoding") - tokenize_async = make_async(tokenizer.__call__, - executor=self._tokenizer_executor) - tokenization_kwargs = tokenization_kwargs or {} - use_pad_token = self.model_config.use_pad_token - if use_pad_token: - # cross_encoder models defaults to using pad_token. - tokenized_prompts = await asyncio.gather( - *(tokenize_async(text=t1, text_pair=t2, **tokenization_kwargs) - for t1, t2 in input_pairs)) + input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] + + if self.model_config.is_multimodal_model: + + preprocess_async = make_async(self._preprocess_score, + executor=self._tokenizer_executor) + + preprocessed_prompts = await asyncio.gather( + *(preprocess_async(request=request, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + data_1=t1, + data_2=t2) for t1, t2 in input_pairs)) + + for full_prompt, engine_prompt in preprocessed_prompts: + request_prompts.append(full_prompt) + engine_prompts.append(engine_prompt) + else: - # `llm as reranker` models defaults to not using pad_token. 
- tokenized_prompts = await asyncio.gather( - *(tokenize_async(text=t1 + t2, **tokenization_kwargs) - for t1, t2 in input_pairs)) + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + use_pad_token = self.model_config.use_pad_token - for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs): - sep_token = tokenizer.sep_token if (tokenizer.sep_token - and use_pad_token) else '' - request_prompt = f"{t1}{sep_token}{t2}" + if use_pad_token: + # cross_encoder models defaults to using pad_token. + tokenized_prompts = await asyncio.gather(*( + tokenize_async( + text=t1, # type: ignore[arg-type] + text_pair=t2, # type: ignore[arg-type] + **tokenization_kwargs) for t1, t2 in input_pairs)) + else: + # `llm as reranker` models defaults to not using pad_token. + tokenized_prompts = await asyncio.gather(*( + tokenize_async( + text=t1 + # type: ignore[operator] + t2, + **tokenization_kwargs) for t1, t2 in input_pairs)) - input_ids = prompt_inputs["input_ids"] - text_token_prompt = \ - self._validate_input(request, input_ids, request_prompt) - engine_prompt = TokensPrompt( - prompt_token_ids=text_token_prompt["prompt_token_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) + for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs): + sep_token = tokenizer.sep_token if (tokenizer.sep_token + and use_pad_token) else '' + request_prompt = f"{t1}{sep_token}{t2}" - request_prompts.append(request_prompt) - engine_prompts.append(engine_prompt) + input_ids = prompt_inputs["input_ids"] + text_token_prompt = \ + self._validate_input(request, input_ids, request_prompt) + engine_prompt = TokensPrompt( + prompt_token_ids=text_token_prompt["prompt_token_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + + request_prompts.append(request_prompt) + engine_prompts.append(engine_prompt) # Schedule the request and get the result generator. 
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] @@ -233,8 +280,8 @@ class ServingScores(OpenAIServing): async def _run_scoring( self, - texts_1: Union[str, list[str]], - texts_2: Union[str, list[str]], + data_1: Union[list[str], str, ScoreMultiModalParam], + data_2: Union[list[str], str, ScoreMultiModalParam], request: Union[ScoreRequest, RerankRequest], request_id: str, raw_request: Optional[Request] = None, @@ -259,18 +306,29 @@ class ServingScores(OpenAIServing): trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) - if isinstance(texts_1, str): - texts_1 = [texts_1] - if isinstance(texts_2, str): - texts_2 = [texts_2] + if not self.model_config.is_multimodal_model and (isinstance( + data_1, dict) or isinstance(data_2, dict)): + raise ValueError( + f"MultiModalParam is not supported for {self.model_config.architecture}" # noqa: E501 + ) - _validate_score_input_lens(texts_1, texts_2) + if isinstance(data_1, str): + data_1 = [data_1] + elif isinstance(data_1, dict): + data_1 = data_1.get("content") # type: ignore[assignment] + + if isinstance(data_2, str): + data_2 = [data_2] + elif isinstance(data_2, dict): + data_2 = data_2.get("content") # type: ignore[assignment] + + _validate_score_input_lens(data_1, data_2) # type: ignore[arg-type] if self.model_config.is_cross_encoder: return await self._cross_encoding_score( tokenizer=tokenizer, - texts_1=texts_1, - texts_2=texts_2, + data_1=data_1, # type: ignore[arg-type] + data_2=data_2, # type: ignore[arg-type] request=request, request_id=request_id, tokenization_kwargs=tokenization_kwargs, @@ -281,8 +339,8 @@ class ServingScores(OpenAIServing): else: return await self._embedding_score( tokenizer=tokenizer, - texts_1=texts_1, - texts_2=texts_2, + texts_1=data_1, # type: ignore[arg-type] + texts_2=data_2, # type: ignore[arg-type] request=request, request_id=request_id, tokenization_kwargs=tokenization_kwargs, @@ -349,7 +407,9 @@ class 
ServingScores(OpenAIServing): request_id = f"rerank-{self._base_request_id(raw_request)}" documents = request.documents - top_n = request.top_n if request.top_n > 0 else len(documents) + top_n = request.top_n if request.top_n > 0 else ( + len(documents) + if isinstance(documents, list) else len(documents["content"])) try: final_res_batch = await self._run_scoring( @@ -410,7 +470,7 @@ class ServingScores(OpenAIServing): def request_output_to_rerank_response( self, final_res_batch: list[PoolingRequestOutput], request_id: str, - model_name: str, documents: list[str], + model_name: str, documents: Union[list[str], ScoreMultiModalParam], top_n: int) -> RerankResponse: """ Convert the output of do_rank to a RerankResponse @@ -422,7 +482,9 @@ class ServingScores(OpenAIServing): result = RerankResult( index=idx, - document=RerankDocument(text=documents[idx]), + document=RerankDocument(text=documents[idx]) if isinstance( + documents, list) else RerankDocument( + multi_modal=documents["content"][idx]), relevance_score=classify_res.outputs.score, ) results.append(result) diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index c4e044f3a28e9..3fc4ed606b8a9 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -1,13 +1,41 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Union +from typing import Any, Optional, Union, cast from torch.nn import CosineSimilarity +from typing_extensions import Required, TypeAlias, TypedDict +from vllm.config import ModelConfig +from vllm.entrypoints.chat_utils import ( + BaseMultiModalItemTracker, ChatCompletionContentPartImageEmbedsParam, + ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam, + MultiModalItemTracker, _ContentPart, _parse_chat_message_content_part) +from vllm.inputs import TokensPrompt +from vllm.model_executor.model_loader import get_model_cls +from 
vllm.model_executor.models.interfaces import supports_score_template +from vllm.multimodal.inputs import MultiModalDataDict from vllm.outputs import PoolingRequestOutput -from vllm.transformers_utils.tokenizer import (PreTrainedTokenizer, +from vllm.transformers_utils.tokenizer import (AnyTokenizer, + PreTrainedTokenizer, PreTrainedTokenizerFast) +ScoreContentPartParam: TypeAlias = Union[ + ChatCompletionContentPartImageParam, + ChatCompletionContentPartImageEmbedsParam] + + +class ScoreMultiModalParam(TypedDict, total=False): + """ + A specialized parameter type for scoring multimodal content + + The reasons why don't reuse `CustomChatCompletionMessageParam` directly: + 1. Score tasks don't need the 'role' field (user/assistant/system) that's required in chat completions + 2. Including chat-specific fields would confuse users about their purpose in scoring + 3. This is a more focused interface that only exposes what's needed for scoring + """ # noqa: E501 + content: Required[list[ScoreContentPartParam]] + """The multimodal contents""" + def _cosine_similarity( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], @@ -39,12 +67,128 @@ def _cosine_similarity( def _validate_score_input_lens( - texts_1: Union[list[str], list[dict]], - texts_2: Union[list[str], list[dict]], + data_1: Union[list[str], list[ScoreContentPartParam]], + data_2: Union[list[str], list[ScoreContentPartParam]], ): - if len(texts_1) > 1 and len(texts_1) != len(texts_2): + len_1 = len(data_1) + len_2 = len(data_2) + + if len_1 > 1 and len_1 != len_2: raise ValueError("Input lengths must be either 1:1, 1:N or N:N") - if len(texts_1) == 0: + if len_1 == 0: raise ValueError("At least one text element must be given") - if len(texts_2) == 0: - raise ValueError("At least one text_pair element must be given") \ No newline at end of file + if len_2 == 0: + raise ValueError("At least one text_pair element must be given") + + +def parse_score_data( + data_1: Union[str, ScoreContentPartParam], + 
data_2: Union[str, ScoreContentPartParam], + model_config: ModelConfig, + tokenizer: AnyTokenizer, +) -> tuple[str, str, Optional[MultiModalDataDict]]: + mm_tracker = MultiModalItemTracker(model_config, tokenizer) + + content_1 = _parse_score_content(data_1, mm_tracker) + + content_2 = _parse_score_content(data_2, mm_tracker) + + def ensure_str(content: Optional[_ContentPart]) -> str: + if content is not None and isinstance(content, str): + return cast(str, content) + else: + raise ValueError( + f"Only string content is supported, but got {content}.") + + prompt_1 = ensure_str(content_1) + prompt_2 = ensure_str(content_2) + + return prompt_1, prompt_2, mm_tracker.all_mm_data() + + +def _parse_score_content( + data: Union[str, ScoreContentPartParam], + mm_tracker: BaseMultiModalItemTracker, +) -> Optional[_ContentPart]: + + if isinstance(data, str): + data = ChatCompletionContentPartTextParam(type="text", text=data) + + mm_parser = mm_tracker.create_parser() + + parse_res = _parse_chat_message_content_part( + data, + mm_parser, + wrap_dicts=False, + interleave_strings=False, + ) + + if parse_res: + return parse_res + + mm_placeholder_storage = mm_parser.mm_placeholder_storage() + + if len(mm_placeholder_storage) != 1 or len( + next(iter(mm_placeholder_storage.values()))) != 1: + raise ValueError("Only one multi-modal item is supported") + + return next(iter(mm_placeholder_storage.values()))[0] + + +def apply_score_template( + model_config: ModelConfig, + prompt_1: str, + prompt_2: str, +) -> str: + + model = get_model_cls(model_config) + if supports_score_template(model): + full_prompt = model.get_score_template(prompt_1, prompt_2) + if full_prompt is None: + raise ValueError("Get empty score template from model") + return full_prompt + + raise ValueError( + f"Unsupported model architecture: {model_config.architecture}") + + +def post_process_tokens( + model_config: ModelConfig, + prompt: TokensPrompt, +) -> None: + """ + Perform architecture-specific manipulations 
on the input tokens. + + Note: + This is an in-place operation. + """ + model = get_model_cls(model_config) + if supports_score_template(model): + model.post_process_tokens(prompt) + + +def get_score_prompt( + model_config: ModelConfig, + tokenizer: AnyTokenizer, + tokenization_kwargs: dict[str, Any], + data_1: Union[str, ScoreContentPartParam], + data_2: Union[str, ScoreContentPartParam], +) -> tuple[str, TokensPrompt]: + prompt_1, prompt_2, mm_data = parse_score_data( + data_1, + data_2, + model_config, + tokenizer, + ) + + full_prompt = apply_score_template(model_config, prompt_1, prompt_2) + + prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) + + engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"]) + + post_process_tokens(model_config, engine_prompt) + + if mm_data is not None: + engine_prompt["multi_modal_data"] = mm_data + return full_prompt, engine_prompt diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 552c4b0742164..6d0ffad1a819f 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -170,6 +170,15 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig): vllm_config.model_config.hf_config.method = "from_2_way_softmax" +class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig): + + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + config = vllm_config.model_config.hf_config + + config.num_labels = 1 + + class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): @staticmethod @@ -197,4 +206,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "NomicBertModel": NomicBertModelConfig, "Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig, "XLMRobertaModel": JinaRobertaModelConfig, + "JinaVLForRanking": JinaVLForSequenceClassificationConfig, } diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 
3863d8454bffb..503147367106c 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -9,6 +9,7 @@ import torch from torch import Tensor from typing_extensions import Self, TypeIs +from vllm.inputs import TokensPrompt from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -141,6 +142,62 @@ def supports_multimodal( return isinstance(model, SupportsMultiModal) +@runtime_checkable +class SupportsScoreTemplate(Protocol): + """The interface required for all models that support score template.""" + + supports_score_template: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports score template. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. + """ + + @classmethod + def get_score_template(cls, query: str, document: str) -> Optional[str]: + """ + Generate a full prompt by populating the score template with query and document content. + """ # noqa: E501 + ... + + @classmethod + def post_process_tokens(cls, prompt: TokensPrompt) -> None: + """ + Perform architecture-specific manipulations on the input tokens. + """ + ... + + +# We can't use runtime_checkable with ClassVar for issubclass checks +# so we need to treat the class as an instance and use isinstance instead +@runtime_checkable +class _SupportsScoreTemplateType(Protocol): + supports_score_template: Literal[True] + + +@overload +def supports_score_template( + model: type[object]) -> TypeIs[type[SupportsScoreTemplate]]: + ... + + +@overload +def supports_score_template(model: object) -> TypeIs[SupportsScoreTemplate]: + ... 
+ + +def supports_score_template( + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsScoreTemplate]], TypeIs[SupportsScoreTemplate]]: + + if isinstance(model, type): + return isinstance(model, _SupportsScoreTemplateType) + + return isinstance(model, SupportsScoreTemplate) + + @runtime_checkable class SupportsLoRA(Protocol): """The interface required for all models that support LoRA.""" diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py new file mode 100644 index 0000000000000..78e58896e0d8e --- /dev/null +++ b/vllm/model_executor/models/jina_vl.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable, Mapping +from typing import Optional + +import torch +import torch.nn as nn +from transformers import BatchFeature, PretrainedConfig + +from vllm.config import VllmConfig +from vllm.inputs import TokensPrompt +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import IntermediateTensors, PoolerOutput + +from .interfaces import (SupportsCrossEncoding, SupportsMultiModal, + SupportsScoreTemplate) +from .qwen2_vl import (Qwen2VLDummyInputsBuilder, + Qwen2VLForConditionalGeneration, + Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo) +from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix + +logger = init_logger(__name__) + + +class JinaVLScorer(nn.Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.dense = ColumnParallelLinear(config.hidden_size, + config.hidden_size, + bias=True) + self.out_proj = RowParallelLinear(config.hidden_size, + config.num_labels, + bias=True) + + def 
forward(self, x, **kwargs): + x, _ = self.dense(x) + x = torch.relu(x) + x, _ = self.out_proj(x) + return x + + +class JinaVLMultiModalProcessor(Qwen2VLMultiModalProcessor): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + + # NOTE: We should reverse the order of the mm_data because the + # query prompt is placed after the document prompt in the score + # template for JinaVLForRanking model, but in mm_data they are + # stored in the opposite order (query first, then document). + for _, value in mm_data.items(): + value.reverse() + return super()._call_hf_processor(prompt, mm_data, mm_kwargs, + tok_kwargs) + + +@MULTIMODAL_REGISTRY.register_processor(JinaVLMultiModalProcessor, + info=Qwen2VLProcessingInfo, + dummy_inputs=Qwen2VLDummyInputsBuilder) +class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration, + SupportsCrossEncoding, + SupportsMultiModal, + SupportsScoreTemplate): + weight_mapper = WeightsMapper( + orig_to_new_prefix={ + "score.0.": "score.dense.", + "score.2.": "score.out_proj.", + # mapping for new names in checkpoint saved after transformers v4.52 + "model.language_model.": "language_model.model.", + "visual.": "visual.", + # mapping for original checkpoint + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "qwen2_vl")) + config = vllm_config.model_config.hf_config + pooler_config = vllm_config.model_config.pooler_config + + # logit bias for sigmoid normalization + self.LOGIT_BIAS = 2.65 + + self.score = JinaVLScorer(config) + + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=False, + softmax=True) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> 
Optional[str]: + if modality.startswith("image"): + return "<|vision_start|><|image_pad|><|vision_end|>" + + raise ValueError("Only image modality is supported") + + @classmethod + def get_score_template(cls, query: str, document: str) -> Optional[str]: + return f"**Document**:\n{document}\n**Query**:\n{query}" + + @classmethod + def post_process_tokens(cls, prompt: TokensPrompt) -> None: + + # add score target token at the end of prompt tokens + prompt['prompt_token_ids'].append(100) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> torch.Tensor: + hidden_states = super().forward( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs, + ) + + logits = self.score(hidden_states) - self.LOGIT_BIAS + return logits + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.weight_mapper) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 03e45bd26d7ac..04d8b2f557b70 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -181,6 +181,7 @@ _CROSS_ENCODER_MODELS = { "GemmaForSequenceClassification": ("gemma", "GemmaForSequenceClassification"), # noqa: E501 "Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForSequenceClassification"), # noqa: E501 "Qwen3ForSequenceClassification": ("qwen3", "Qwen3ForSequenceClassification"), # noqa: E501 + "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501 } _MULTIMODAL_MODELS = {