mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 11:47:09 +08:00
[Model][VLM] Support JinaVL Reranker (#20260)
Signed-off-by: shineran96 <shinewang96@gmail.com>
This commit is contained in:
parent
b140416abf
commit
4bed167768
@ -282,7 +282,7 @@ steps:
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder.py
|
||||
|
||||
@ -712,6 +712,14 @@ The following table lists those that are tested in vLLM.
|
||||
|
||||
---
|
||||
|
||||
#### Scoring
|
||||
|
||||
Specified using `--task score`.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|
||||
|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ |
|
||||
|
||||
## Model Support Policy
|
||||
|
||||
At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here's how we manage third-party model support:
|
||||
|
||||
@ -537,7 +537,7 @@ The following extra parameters are supported:
|
||||
|
||||
### Score API
|
||||
|
||||
Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair.
|
||||
Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair.
|
||||
Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1.
|
||||
|
||||
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||
@ -676,6 +676,55 @@ The total number of pairs is `len(text_2)`.
|
||||
}
|
||||
```
|
||||
|
||||
#### Multi-modal inputs
|
||||
|
||||
You can pass multi-modal inputs to scoring models by setting `text_1` or `text_2` to an object whose `content` field is a list of multi-modal inputs (images, etc.). Refer to the example below for illustration.
|
||||
|
||||
=== "JinaVL-Reranker"
|
||||
|
||||
To serve the model:
|
||||
|
||||
```bash
|
||||
vllm serve jinaai/jina-reranker-m0
|
||||
```
|
||||
|
||||
Since the request schema is not defined by the OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||
|
||||
??? Code
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/score",
|
||||
json={
|
||||
"model": "jinaai/jina-reranker-m0",
|
||||
"text_1": "slm markdown",
|
||||
"text_2": {
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||
},
|
||||
},
|
||||
]
|
||||
}
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
print("Scoring output:", response_json["data"][0]["score"])
|
||||
print("Scoring output:", response_json["data"][1]["score"])
|
||||
```
|
||||
Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_multimodal.py>
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][pooling-params] are supported.
|
||||
@ -695,8 +744,7 @@ The following extra parameters are supported:
|
||||
### Re-rank API
|
||||
|
||||
Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and
|
||||
each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences, on
|
||||
a scale of 0 to 1.
|
||||
each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences or multi-modal inputs (image, etc.), on a scale of 0 to 1.
|
||||
|
||||
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This example shows how to use vLLM for running offline inference with
|
||||
the correct prompt format on vision language models for multimodal embedding.
|
||||
the correct prompt format on vision language models for multimodal pooling.
|
||||
|
||||
For most models, the prompt format should follow corresponding examples
|
||||
on HuggingFace model repository.
|
||||
@ -15,6 +15,7 @@ from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.entrypoints.score_utils import ScoreMultiModalParam
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
@ -35,14 +36,22 @@ class TextImageQuery(TypedDict):
|
||||
image: Image
|
||||
|
||||
|
||||
QueryModality = Literal["text", "image", "text+image"]
|
||||
Query = Union[TextQuery, ImageQuery, TextImageQuery]
|
||||
class TextImagesQuery(TypedDict):
    """Query made of one text string plus a multi-modal image payload."""

    # Discriminator tag; always the literal "text+images".
    modality: Literal["text+images"]
    # Text half of the query.
    text: str
    # Image half of the query, carried as a `ScoreMultiModalParam`.
    image: ScoreMultiModalParam
|
||||
|
||||
|
||||
QueryModality = Literal["text", "image", "text+image", "text+images"]
|
||||
Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
prompt: str
|
||||
image: Optional[Image]
|
||||
prompt: Optional[str] = None
|
||||
image: Optional[Image] = None
|
||||
query: Optional[str] = None
|
||||
documents: Optional[ScoreMultiModalParam] = None
|
||||
|
||||
|
||||
def run_e5_v(query: Query) -> ModelRequestData:
|
||||
@ -107,6 +116,29 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_jinavl_reranker(query: Query) -> ModelRequestData:
    """Build the engine arguments and scoring inputs for the JinaVL reranker.

    Args:
        query: Query dict; only the "text+images" modality is accepted.

    Returns:
        A `ModelRequestData` whose `query` is the query text and whose
        `documents` is the query's image payload.

    Raises:
        ValueError: If the query modality is not "text+images".
    """
    modality = query["modality"]
    if modality != "text+images":
        raise ValueError(f"Unsupported query modality: '{modality}'")

    # Pixel bounds forwarded to the multi-modal processor.
    processor_kwargs = {
        "min_pixels": 3136,
        "max_pixels": 602112,
    }

    return ModelRequestData(
        engine_args=EngineArgs(
            model="jinaai/jina-reranker-m0",
            task="score",
            max_model_len=32768,
            trust_remote_code=True,
            mm_processor_kwargs=processor_kwargs,
            limit_mm_per_prompt={"image": 1},
        ),
        query=query["text"],
        documents=query["image"],
    )
|
||||
|
||||
|
||||
def get_query(modality: QueryModality):
|
||||
if modality == "text":
|
||||
return TextQuery(modality="text", text="A dog sitting in the grass")
|
||||
@ -128,6 +160,28 @@ def get_query(modality: QueryModality):
|
||||
),
|
||||
)
|
||||
|
||||
if modality == "text+images":
|
||||
return TextImagesQuery(
|
||||
modality="text+images",
|
||||
text="slm markdown",
|
||||
image={
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||
},
|
||||
},
|
||||
]
|
||||
},
|
||||
)
|
||||
|
||||
msg = f"Modality {modality} is not supported."
|
||||
raise ValueError(msg)
|
||||
|
||||
@ -162,16 +216,31 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
def run_score(model: str, modality: QueryModality, seed: Optional[int]):
    """Run the scoring example for `model` and print the resulting scores.

    Args:
        model: Key into `model_example_map` selecting the request builder.
        modality: Query modality passed to `get_query`.
        seed: Seed that overrides the one in the builder's engine arguments.
    """
    query = get_query(modality)
    req_data = model_example_map[model](query)

    # The caller-provided seed takes precedence over the builder's value.
    llm_kwargs = {**asdict(req_data.engine_args), "seed": seed}
    llm = LLM(**llm_kwargs)

    results = llm.score(req_data.query, req_data.documents)

    divider = "-" * 30
    print(divider)
    print([result.outputs.score for result in results])
    print(divider)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"e5_v": run_e5_v,
|
||||
"vlm2vec": run_vlm2vec,
|
||||
"jinavl_reranker": run_jinavl_reranker,
|
||||
}
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Demo on using vLLM for offline inference with "
|
||||
"vision language models for multimodal embedding"
|
||||
"vision language models for multimodal pooling tasks."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
@ -181,6 +250,14 @@ def parse_args():
|
||||
choices=model_example_map.keys(),
|
||||
help="The name of the embedding model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task",
|
||||
"-t",
|
||||
type=str,
|
||||
default="embedding",
|
||||
choices=["embedding", "scoring"],
|
||||
help="The task type.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--modality",
|
||||
type=str,
|
||||
@ -198,7 +275,12 @@ def parse_args():
|
||||
|
||||
|
||||
def main(args: Namespace):
|
||||
run_encode(args.model_name, args.modality, args.seed)
|
||||
if args.task == "embedding":
|
||||
run_encode(args.model_name, args.modality, args.seed)
|
||||
elif args.task == "scoring":
|
||||
run_score(args.model_name, args.modality, args.seed)
|
||||
else:
|
||||
raise ValueError(f"Unsupported task: {args.task}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -0,0 +1,60 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Example online usage of Score API.
|
||||
|
||||
Run `vllm serve <model> --task score` to start up the server in vLLM.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import pprint
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """POST `prompt` as a JSON body to `api_url` and return the raw response."""
    return requests.post(
        api_url,
        headers={"User-Agent": "Test Client"},
        json=prompt,
    )
|
||||
|
||||
|
||||
def parse_args(argv=None):
    """Parse the command-line options of the multimodal score client.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, exactly as before;
            passing a list allows the parser to be driven programmatically.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port`` and ``model``.
    """
    parser = argparse.ArgumentParser(
        description="Example client for the Score API with multi-modal inputs."
    )
    parser.add_argument(
        "--host",
        type=str,
        default="localhost",
        help="Host name of the server to send the request to.",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port of the server to send the request to.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="jinaai/jina-reranker-m0",
        help="Model name placed in the request body.",
    )
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(args):
    """Send one text-vs-images score request and pretty-print the exchange.

    Args:
        args: Namespace providing `host`, `port` and `model`.
    """
    image_urls = (
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
    )

    # text_1 is a plain string; text_2 wraps the images in a "content" list.
    prompt = {
        "model": args.model,
        "text_1": "slm markdown",
        "text_2": {
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}}
                for image_url in image_urls
            ]
        },
    }

    score_response = post_http_request(
        prompt=prompt,
        api_url=f"http://{args.host}:{args.port}/score",
    )

    print("\nPrompt when text_1 is string and text_2 is a image list:")
    pprint.pprint(prompt)
    print("\nScore Response:")
    pprint.pprint(score_response.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
160
tests/models/multimodal/pooling/test_jinavl_reranker.py
Normal file
160
tests/models/multimodal/pooling/test_jinavl_reranker.py
Normal file
@ -0,0 +1,160 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModel
|
||||
|
||||
model_name = "jinaai/jina-reranker-m0"
|
||||
|
||||
mm_processor_kwargs = {
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 602112,
|
||||
}
|
||||
|
||||
limit_mm_per_prompt = {"image": 2}
|
||||
|
||||
|
||||
def vllm_reranker(model_name,
                  query,
                  documents,
                  query_type="text",
                  doc_type="text"):
    """Score `query` against `documents` with vLLM and return the scores.

    When `query_type` / `doc_type` is "image", the matching argument is
    treated as an iterable of URLs and wrapped into a
    ``{"content": [...]}`` dict of ``image_url`` parts before scoring;
    otherwise it is passed to `LLM.score` unchanged.
    """
    from vllm import LLM

    llm = LLM(
        model=model_name,
        task="score",
        max_model_len=32768,
        mm_processor_kwargs=mm_processor_kwargs,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )

    def as_image_content(urls):
        # One image_url part per URL, in the given order.
        parts = [{
            "type": "image_url",
            "image_url": {
                "url": f"{url}"
            }
        } for url in urls]
        return {"content": parts}

    if query_type == "image":
        query = as_image_content(query)

    if doc_type == "image":
        documents = as_image_content(documents)

    return [result.outputs.score for result in llm.score(query, documents)]
|
||||
|
||||
|
||||
def hf_reranker(model_name,
                query,
                documents,
                query_type="text",
                doc_type="text"):
    """Score the first query against every document with the HF model.

    Only ``query[0]`` is used; it is paired with each entry of
    `documents`, and the pairs are handed to the model's `compute_score`.
    """
    # Prefix remapping passed as `key_mapping` when loading the checkpoint.
    # NOTE(review): presumably aligns checkpoint weight names with the
    # module layout of the installed transformers version — confirm.
    prefix_remap = {
        "visual.": "model.visual.",
        "model.": "model.language_model.",
    }

    hf_model = AutoModel.from_pretrained(
        model_name,
        torch_dtype="auto",
        trust_remote_code=True,
        key_mapping=prefix_remap,
    )
    hf_model = hf_model.to("cuda").eval()

    pairs = [[query[0], document] for document in documents]

    return hf_model.compute_score(
        pairs,
        max_length=2048,
        query_type=query_type,
        doc_type=doc_type,
    )
|
||||
|
||||
|
||||
# Visual Documents Reranking
|
||||
@pytest.mark.parametrize("model_name", [model_name])
def test_model_text_image(model_name):
    """A text query ranked against image documents: vLLM must match HF."""
    query = ["slm markdown"]
    documents = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
    ]

    hf_outputs = hf_reranker(model_name,
                             query,
                             documents,
                             query_type="text",
                             doc_type="image")
    vllm_outputs = vllm_reranker(model_name,
                                 query,
                                 documents,
                                 query_type="text",
                                 doc_type="image")

    # Compare the two document scores within a 2% relative tolerance.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
|
||||
|
||||
|
||||
# Textual Documents Reranking
|
||||
@pytest.mark.parametrize("model_name", [model_name])
|
||||
def test_model_text_text(model_name):
|
||||
|
||||
query = ["slm markdown"]
|
||||
documents = [
|
||||
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
|
||||
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
|
||||
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
|
||||
large language models. The models effectiveness results from two key innovations: (1) a three-stage
|
||||
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
|
||||
refining, and critiquing web content extraction; and (2) a unified training framework combining
|
||||
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
|
||||
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
|
||||
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
|
||||
lower computational requirements.""", # noqa: E501
|
||||
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
|
||||
]
|
||||
|
||||
hf_outputs = hf_reranker(model_name, query, documents, "text", "text")
|
||||
vllm_outputs = vllm_reranker(model_name, query, documents, "text", "text")
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
|
||||
|
||||
# Image Querying for Textual Documents
|
||||
@pytest.mark.parametrize("model_name", [model_name])
|
||||
def test_model_image_text(model_name):
|
||||
|
||||
query = [
|
||||
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||
]
|
||||
documents = [
|
||||
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
|
||||
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
|
||||
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
|
||||
large language models. The models effectiveness results from two key innovations: (1) a three-stage
|
||||
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
|
||||
refining, and critiquing web content extraction; and (2) a unified training framework combining
|
||||
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
|
||||
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
|
||||
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
|
||||
lower computational requirements.""", # noqa: E501
|
||||
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
|
||||
]
|
||||
|
||||
hf_outputs = hf_reranker(model_name, query, documents, "image", "text")
|
||||
vllm_outputs = vllm_reranker(model_name, query, documents, "image", "text")
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
|
||||
|
||||
# Image Querying for Image Documents
|
||||
@pytest.mark.parametrize("model_name", [model_name])
def test_model_image_image(model_name):
    """An image query ranked against image documents: vLLM must match HF."""
    query = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
    ]
    documents = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
    ]

    hf_outputs = hf_reranker(model_name,
                             query,
                             documents,
                             query_type="image",
                             doc_type="image")
    vllm_outputs = vllm_reranker(model_name,
                                 query,
                                 documents,
                                 query_type="image",
                                 doc_type="image")

    # Compare the two document scores within a 2% relative tolerance.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
|
||||
@ -432,6 +432,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True), # noqa: E501
|
||||
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
||||
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
|
||||
|
||||
# [Cross-encoder]
|
||||
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501
|
||||
}
|
||||
|
||||
_SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
|
||||
@ -28,8 +28,11 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
|
||||
apply_mistral_chat_template,
|
||||
parse_chat_messages,
|
||||
resolve_chat_template_content_format)
|
||||
from vllm.entrypoints.score_utils import (_cosine_similarity,
|
||||
_validate_score_input_lens)
|
||||
from vllm.entrypoints.score_utils import (ScoreContentPartParam,
|
||||
ScoreMultiModalParam,
|
||||
_cosine_similarity,
|
||||
_validate_score_input_lens,
|
||||
get_score_prompt)
|
||||
from vllm.entrypoints.utils import _validate_truncation_size
|
||||
from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt
|
||||
from vllm.inputs.parse import parse_and_batch_prompt
|
||||
@ -1187,8 +1190,8 @@ class LLM:
|
||||
def _cross_encoding_score(
|
||||
self,
|
||||
tokenizer: AnyTokenizer,
|
||||
text_1: list[str],
|
||||
text_2: list[str],
|
||||
data_1: Union[list[str], list[ScoreContentPartParam]],
|
||||
data_2: Union[list[str], list[ScoreContentPartParam]],
|
||||
truncate_prompt_tokens: Optional[int] = None,
|
||||
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
|
||||
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
|
||||
@ -1199,10 +1202,8 @@ class LLM:
|
||||
raise ValueError(
|
||||
"Score API is only enabled for `--task embed or score`")
|
||||
|
||||
if len(text_1) == 1:
|
||||
text_1 = text_1 * len(text_2)
|
||||
|
||||
input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
|
||||
if len(data_1) == 1:
|
||||
data_1 = data_1 * len(data_2)
|
||||
|
||||
pooling_params = PoolingParams(use_cross_encoder=True)
|
||||
tokenization_kwargs: dict[str, Any] = {}
|
||||
@ -1211,19 +1212,41 @@ class LLM:
|
||||
|
||||
parsed_prompts = []
|
||||
|
||||
for q, t in input_pairs:
|
||||
if self.llm_engine.model_config.use_pad_token:
|
||||
# cross_encoder models defaults to using pad_token.
|
||||
prompt_inputs = tokenizer(text=q,
|
||||
text_pair=t,
|
||||
**tokenization_kwargs)
|
||||
else:
|
||||
# `llm as reranker` models defaults to not using pad_token.
|
||||
prompt_inputs = tokenizer(text=q + t, **tokenization_kwargs)
|
||||
engine_prompt = TokensPrompt(
|
||||
prompt_token_ids=prompt_inputs["input_ids"],
|
||||
token_type_ids=prompt_inputs.get("token_type_ids"))
|
||||
parsed_prompts.append(engine_prompt)
|
||||
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
|
||||
|
||||
if self.llm_engine.model_config.is_multimodal_model:
|
||||
|
||||
model_config = self.llm_engine.model_config
|
||||
|
||||
for q, d in input_pairs:
|
||||
_, engine_prompt = get_score_prompt(
|
||||
model_config=model_config,
|
||||
data_1=q,
|
||||
data_2=d,
|
||||
tokenizer=tokenizer,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
parsed_prompts.append(engine_prompt)
|
||||
|
||||
else:
|
||||
|
||||
for q, t in input_pairs:
|
||||
if self.llm_engine.model_config.use_pad_token:
|
||||
# cross_encoder models defaults to using pad_token.
|
||||
prompt_inputs = tokenizer(
|
||||
text=q, # type: ignore[arg-type]
|
||||
text_pair=t, # type: ignore[arg-type]
|
||||
**tokenization_kwargs)
|
||||
else:
|
||||
# `llm as reranker` models defaults to not using pad_token.
|
||||
prompt_inputs = tokenizer(
|
||||
text=q + t, # type: ignore[operator]
|
||||
**tokenization_kwargs)
|
||||
engine_prompt = TokensPrompt(
|
||||
prompt_token_ids=prompt_inputs["input_ids"],
|
||||
token_type_ids=prompt_inputs.get("token_type_ids"))
|
||||
parsed_prompts.append(engine_prompt)
|
||||
|
||||
self._validate_and_add_requests(
|
||||
prompts=parsed_prompts,
|
||||
@ -1241,8 +1264,10 @@ class LLM:
|
||||
|
||||
def score(
|
||||
self,
|
||||
text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]],
|
||||
text_2: Union[SingletonPrompt, Sequence[SingletonPrompt]],
|
||||
data_1: Union[SingletonPrompt, Sequence[SingletonPrompt],
|
||||
ScoreMultiModalParam],
|
||||
data_2: Union[SingletonPrompt, Sequence[SingletonPrompt],
|
||||
ScoreMultiModalParam],
|
||||
/,
|
||||
*,
|
||||
truncate_prompt_tokens: Optional[int] = None,
|
||||
@ -1250,22 +1275,30 @@ class LLM:
|
||||
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
) -> list[ScoringRequestOutput]:
|
||||
"""Generate similarity scores for all pairs `<text,text_pair>`.
|
||||
"""Generate similarity scores for all pairs `<text,text_pair>` or
|
||||
`<multi-modal data, multi-modal data pair>`.
|
||||
|
||||
The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
|
||||
In the `1 - N` case the `text_1` sentence will be replicated `N`
|
||||
times to pair with the `text_2` sentences.
|
||||
In the `1 - N` case the `data_1` input will be replicated `N`
|
||||
times to pair with the `data_2` inputs.
|
||||
The input pairs are used to build a list of prompts for the
|
||||
cross encoder model. This class automatically batches the prompts,
|
||||
considering the memory constraint. For the best performance, put all
|
||||
of your texts into a single list and pass it to this method.
|
||||
of your inputs into a single list and pass it to this method.
|
||||
|
||||
Supports both text and multi-modal data (images, etc.) when used with
|
||||
appropriate multi-modal models. For multi-modal inputs, ensure the
|
||||
prompt structure matches the model's expected input format.
|
||||
|
||||
Args:
|
||||
text_1: can be a single prompt or a list of prompts, in which
|
||||
case it has to have the same length as the `text_2` list
|
||||
text_2: The texts to pair with the query to form the input
|
||||
to the LLM. See [PromptType][vllm.inputs.PromptType] for
|
||||
more details about the format of each prompts.
|
||||
data_1: Can be a single prompt, a list of prompts or
|
||||
`ScoreMultiModalParam`, which can contain either text or
|
||||
multi-modal data. When a list, it must have the same length as
|
||||
the `data_2` list.
|
||||
data_2: The data to pair with the query to form the input to
|
||||
the LLM. Can be text or multi-modal data. See [PromptType]
|
||||
[vllm.inputs.PromptType] for more details about the format of
|
||||
each prompt.
|
||||
use_tqdm: If `True`, shows a tqdm progress bar.
|
||||
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
|
||||
it is used to create the progress bar.
|
||||
@ -1306,42 +1339,70 @@ class LLM:
|
||||
# lists of tokens to the `text` and `text_pair` kwargs
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
def ensure_str(prompt: SingletonPrompt):
|
||||
if isinstance(prompt, dict):
|
||||
if "multi_modal_data" in prompt:
|
||||
raise ValueError("Multi-modal prompt is not "
|
||||
"supported for scoring")
|
||||
elif "prompt_token_ids" in prompt:
|
||||
prompt = tokenizer.decode(
|
||||
cast(TokensPrompt, prompt)["prompt_token_ids"])
|
||||
elif "prompt" in prompt:
|
||||
prompt = cast(TextPrompt, prompt)["prompt"]
|
||||
assert type(prompt) is str
|
||||
return prompt
|
||||
if not self.llm_engine.model_config.is_multimodal_model:
|
||||
|
||||
if isinstance(text_1, (str, dict)):
|
||||
# Convert a single prompt to a list.
|
||||
text_1 = [text_1]
|
||||
input_text_1: list[str] = [ensure_str(t) for t in text_1]
|
||||
def check_data_type(data: Union[SingletonPrompt,
|
||||
Sequence[SingletonPrompt],
|
||||
ScoreMultiModalParam]):
|
||||
if isinstance(data, dict) and "content" in data:
|
||||
raise ValueError(
|
||||
f"ScoreMultiModalParam is not supported for {self.llm_engine.model_config.architecture}", # noqa: E501
|
||||
)
|
||||
|
||||
if isinstance(text_2, (str, dict)):
|
||||
# Convert a single prompt to a list.
|
||||
text_2 = [text_2]
|
||||
input_text_2: list[str] = [ensure_str(t) for t in text_2]
|
||||
check_data_type(data_1)
|
||||
check_data_type(data_2)
|
||||
|
||||
_validate_score_input_lens(input_text_1, input_text_2)
|
||||
def ensure_str(prompt: SingletonPrompt):
|
||||
if isinstance(prompt, dict):
|
||||
if "multi_modal_data" in prompt:
|
||||
raise ValueError("Multi-modal prompt is not "
|
||||
"supported for scoring")
|
||||
elif "prompt_token_ids" in prompt:
|
||||
prompt = tokenizer.decode(
|
||||
cast(TokensPrompt, prompt)["prompt_token_ids"])
|
||||
elif "prompt" in prompt:
|
||||
prompt = cast(TextPrompt, prompt)["prompt"]
|
||||
assert type(prompt) is str
|
||||
return prompt
|
||||
|
||||
if isinstance(data_1, (str, dict)):
|
||||
# Convert a single prompt to a list.
|
||||
data_1 = [data_1] # type: ignore[list-item]
|
||||
|
||||
data_1 = [ensure_str(t) for t in data_1]
|
||||
|
||||
if isinstance(data_2, (str, dict)):
|
||||
# Convert a single prompt to a list.
|
||||
data_2 = [data_2] # type: ignore[list-item]
|
||||
|
||||
data_2 = [ensure_str(t) for t in data_2]
|
||||
|
||||
if isinstance(data_1, dict) and "content" in data_1:
|
||||
data_1 = data_1.get("content") # type: ignore[assignment]
|
||||
elif isinstance(data_1, str):
|
||||
data_1 = [data_1]
|
||||
|
||||
if isinstance(data_2, dict) and "content" in data_2:
|
||||
data_2 = data_2.get("content") # type: ignore[assignment]
|
||||
elif isinstance(data_2, str):
|
||||
data_2 = [data_2]
|
||||
|
||||
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
|
||||
|
||||
if self.llm_engine.model_config.is_cross_encoder:
|
||||
return self._cross_encoding_score(tokenizer, input_text_1,
|
||||
input_text_2,
|
||||
truncate_prompt_tokens, use_tqdm,
|
||||
lora_request,
|
||||
prompt_adapter_request)
|
||||
return self._cross_encoding_score(
|
||||
tokenizer,
|
||||
data_1, # type: ignore[arg-type]
|
||||
data_2, # type: ignore[arg-type]
|
||||
truncate_prompt_tokens,
|
||||
use_tqdm,
|
||||
lora_request,
|
||||
prompt_adapter_request)
|
||||
else:
|
||||
return self._embedding_score(
|
||||
tokenizer,
|
||||
input_text_1, # type: ignore[arg-type]
|
||||
input_text_2, # type: ignore[arg-type]
|
||||
data_1, # type: ignore[arg-type]
|
||||
data_2, # type: ignore[arg-type]
|
||||
truncate_prompt_tokens,
|
||||
use_tqdm,
|
||||
lora_request,
|
||||
|
||||
@ -24,6 +24,7 @@ from typing_extensions import TypeAlias
|
||||
from vllm import envs
|
||||
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
|
||||
random_tool_call_id)
|
||||
from vllm.entrypoints.score_utils import ScoreMultiModalParam
|
||||
from vllm.logger import init_logger
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
|
||||
@ -1282,8 +1283,8 @@ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
|
||||
|
||||
class ScoreRequest(OpenAIBaseModel):
|
||||
model: Optional[str] = None
|
||||
text_1: Union[list[str], str]
|
||||
text_2: Union[list[str], str]
|
||||
text_1: Union[list[str], str, ScoreMultiModalParam]
|
||||
text_2: Union[list[str], str, ScoreMultiModalParam]
|
||||
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
|
||||
|
||||
# --8<-- [start:score-pooling-params]
|
||||
@ -1291,6 +1292,12 @@ class ScoreRequest(OpenAIBaseModel):
|
||||
# --8<-- [end:score-pooling-params]
|
||||
|
||||
# --8<-- [start:score-extra-params]
|
||||
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = Field(
|
||||
default=None,
|
||||
description=("Additional kwargs to pass to the HF processor."),
|
||||
)
|
||||
|
||||
priority: int = Field(
|
||||
default=0,
|
||||
description=(
|
||||
@ -1308,8 +1315,8 @@ class ScoreRequest(OpenAIBaseModel):
|
||||
|
||||
class RerankRequest(OpenAIBaseModel):
|
||||
model: Optional[str] = None
|
||||
query: str
|
||||
documents: list[str]
|
||||
query: Union[str, ScoreMultiModalParam]
|
||||
documents: Union[list[str], ScoreMultiModalParam]
|
||||
top_n: int = Field(default_factory=lambda: 0)
|
||||
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
|
||||
|
||||
@ -1318,6 +1325,12 @@ class RerankRequest(OpenAIBaseModel):
|
||||
# --8<-- [end:rerank-pooling-params]
|
||||
|
||||
# --8<-- [start:rerank-extra-params]
|
||||
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = Field(
|
||||
default=None,
|
||||
description=("Additional kwargs to pass to the HF processor."),
|
||||
)
|
||||
|
||||
priority: int = Field(
|
||||
default=0,
|
||||
description=(
|
||||
@ -1334,7 +1347,8 @@ class RerankRequest(OpenAIBaseModel):
|
||||
|
||||
|
||||
class RerankDocument(BaseModel):
    """A single document echoed back inside a rerank result.

    Exactly one of the two fields is populated by the rerank response
    builder: ``text`` for plain-string documents, ``multi_modal`` for a
    document taken from a multimodal ``content`` list.
    """

    # Plain-text document; stays None for multimodal documents.
    text: Optional[str] = None
    # One multimodal content part. The response builder passes a single
    # element of the request's ``content`` list, so the field is typed as one
    # part rather than as the whole ``ScoreMultiModalParam`` wrapper (which
    # requires a ``content`` key that a single part does not have).
    multi_modal: Optional[ScoreContentPartParam] = None
|
||||
|
||||
|
||||
class RerankResult(BaseModel):
|
||||
|
||||
@ -17,8 +17,11 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument,
|
||||
ScoreResponseData, UsageInfo)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.score_utils import (_cosine_similarity,
|
||||
_validate_score_input_lens)
|
||||
from vllm.entrypoints.score_utils import (ScoreContentPartParam,
|
||||
ScoreMultiModalParam,
|
||||
_cosine_similarity,
|
||||
_validate_score_input_lens,
|
||||
get_score_prompt)
|
||||
from vllm.entrypoints.utils import _validate_truncation_size
|
||||
from vllm.inputs.data import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
@ -137,11 +140,34 @@ class ServingScores(OpenAIServing):
|
||||
|
||||
return final_res_batch
|
||||
|
||||
def _preprocess_score(
    self,
    request: Union[RerankRequest, ScoreRequest],
    tokenizer: AnyTokenizer,
    tokenization_kwargs: dict[str, Any],
    data_1: Union[str, ScoreContentPartParam],
    data_2: Union[str, ScoreContentPartParam],
) -> tuple[str, TokensPrompt]:
    """Build the rendered prompt and engine prompt for one scoring pair.

    Delegates prompt construction to ``get_score_prompt`` and then attaches
    the request's ``mm_processor_kwargs`` to the engine prompt when the
    request supplies them.

    Returns:
        ``(full_prompt, engine_prompt)`` for the pair ``(data_1, data_2)``.
    """
    rendered_prompt, tokens_prompt = get_score_prompt(
        model_config=self.model_config,
        tokenizer=tokenizer,
        tokenization_kwargs=tokenization_kwargs,
        data_1=data_1,
        data_2=data_2,
    )

    # Only forward processor overrides the caller actually provided.
    processor_overrides = request.mm_processor_kwargs
    if processor_overrides is not None:
        tokens_prompt["mm_processor_kwargs"] = processor_overrides

    return rendered_prompt, tokens_prompt
|
||||
|
||||
async def _cross_encoding_score(
|
||||
self,
|
||||
tokenizer: AnyTokenizer,
|
||||
texts_1: list[str],
|
||||
texts_2: list[str],
|
||||
data_1: Union[list[str], list[ScoreContentPartParam]],
|
||||
data_2: Union[list[str], list[ScoreContentPartParam]],
|
||||
request: Union[RerankRequest, ScoreRequest],
|
||||
request_id=str,
|
||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||
@ -154,46 +180,67 @@ class ServingScores(OpenAIServing):
|
||||
request_prompts: list[str] = []
|
||||
engine_prompts: list[TokensPrompt] = []
|
||||
|
||||
if len(texts_1) == 1:
|
||||
texts_1 = texts_1 * len(texts_2)
|
||||
|
||||
input_pairs = [(t1, t2) for t1, t2 in zip(texts_1, texts_2)]
|
||||
if len(data_1) == 1:
|
||||
data_1 = data_1 * len(data_2)
|
||||
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
raise ValueError(
|
||||
"MistralTokenizer not supported for cross-encoding")
|
||||
|
||||
tokenize_async = make_async(tokenizer.__call__,
|
||||
executor=self._tokenizer_executor)
|
||||
|
||||
tokenization_kwargs = tokenization_kwargs or {}
|
||||
use_pad_token = self.model_config.use_pad_token
|
||||
|
||||
if use_pad_token:
|
||||
# cross_encoder models defaults to using pad_token.
|
||||
tokenized_prompts = await asyncio.gather(
|
||||
*(tokenize_async(text=t1, text_pair=t2, **tokenization_kwargs)
|
||||
for t1, t2 in input_pairs))
|
||||
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
|
||||
|
||||
if self.model_config.is_multimodal_model:
|
||||
|
||||
preprocess_async = make_async(self._preprocess_score,
|
||||
executor=self._tokenizer_executor)
|
||||
|
||||
preprocessed_prompts = await asyncio.gather(
|
||||
*(preprocess_async(request=request,
|
||||
tokenizer=tokenizer,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
data_1=t1,
|
||||
data_2=t2) for t1, t2 in input_pairs))
|
||||
|
||||
for full_prompt, engine_prompt in preprocessed_prompts:
|
||||
request_prompts.append(full_prompt)
|
||||
engine_prompts.append(engine_prompt)
|
||||
|
||||
else:
|
||||
# `llm as reranker` models defaults to not using pad_token.
|
||||
tokenized_prompts = await asyncio.gather(
|
||||
*(tokenize_async(text=t1 + t2, **tokenization_kwargs)
|
||||
for t1, t2 in input_pairs))
|
||||
tokenize_async = make_async(tokenizer.__call__,
|
||||
executor=self._tokenizer_executor)
|
||||
use_pad_token = self.model_config.use_pad_token
|
||||
|
||||
for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs):
|
||||
sep_token = tokenizer.sep_token if (tokenizer.sep_token
|
||||
and use_pad_token) else ''
|
||||
request_prompt = f"{t1}{sep_token}{t2}"
|
||||
if use_pad_token:
|
||||
# cross_encoder models defaults to using pad_token.
|
||||
tokenized_prompts = await asyncio.gather(*(
|
||||
tokenize_async(
|
||||
text=t1, # type: ignore[arg-type]
|
||||
text_pair=t2, # type: ignore[arg-type]
|
||||
**tokenization_kwargs) for t1, t2 in input_pairs))
|
||||
else:
|
||||
# `llm as reranker` models defaults to not using pad_token.
|
||||
tokenized_prompts = await asyncio.gather(*(
|
||||
tokenize_async(
|
||||
text=t1 + # type: ignore[operator]
|
||||
t2,
|
||||
**tokenization_kwargs) for t1, t2 in input_pairs))
|
||||
|
||||
input_ids = prompt_inputs["input_ids"]
|
||||
text_token_prompt = \
|
||||
self._validate_input(request, input_ids, request_prompt)
|
||||
engine_prompt = TokensPrompt(
|
||||
prompt_token_ids=text_token_prompt["prompt_token_ids"],
|
||||
token_type_ids=prompt_inputs.get("token_type_ids"))
|
||||
for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs):
|
||||
sep_token = tokenizer.sep_token if (tokenizer.sep_token
|
||||
and use_pad_token) else ''
|
||||
request_prompt = f"{t1}{sep_token}{t2}"
|
||||
|
||||
request_prompts.append(request_prompt)
|
||||
engine_prompts.append(engine_prompt)
|
||||
input_ids = prompt_inputs["input_ids"]
|
||||
text_token_prompt = \
|
||||
self._validate_input(request, input_ids, request_prompt)
|
||||
engine_prompt = TokensPrompt(
|
||||
prompt_token_ids=text_token_prompt["prompt_token_ids"],
|
||||
token_type_ids=prompt_inputs.get("token_type_ids"))
|
||||
|
||||
request_prompts.append(request_prompt)
|
||||
engine_prompts.append(engine_prompt)
|
||||
|
||||
# Schedule the request and get the result generator.
|
||||
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
|
||||
@ -233,8 +280,8 @@ class ServingScores(OpenAIServing):
|
||||
|
||||
async def _run_scoring(
|
||||
self,
|
||||
texts_1: Union[str, list[str]],
|
||||
texts_2: Union[str, list[str]],
|
||||
data_1: Union[list[str], str, ScoreMultiModalParam],
|
||||
data_2: Union[list[str], str, ScoreMultiModalParam],
|
||||
request: Union[ScoreRequest, RerankRequest],
|
||||
request_id: str,
|
||||
raw_request: Optional[Request] = None,
|
||||
@ -259,18 +306,29 @@ class ServingScores(OpenAIServing):
|
||||
trace_headers = (None if raw_request is None else await
|
||||
self._get_trace_headers(raw_request.headers))
|
||||
|
||||
if isinstance(texts_1, str):
|
||||
texts_1 = [texts_1]
|
||||
if isinstance(texts_2, str):
|
||||
texts_2 = [texts_2]
|
||||
if not self.model_config.is_multimodal_model and (isinstance(
|
||||
data_1, dict) or isinstance(data_2, dict)):
|
||||
raise ValueError(
|
||||
f"MultiModalParam is not supported for {self.model_config.architecture}" # noqa: E501
|
||||
)
|
||||
|
||||
_validate_score_input_lens(texts_1, texts_2)
|
||||
if isinstance(data_1, str):
|
||||
data_1 = [data_1]
|
||||
elif isinstance(data_1, dict):
|
||||
data_1 = data_1.get("content") # type: ignore[assignment]
|
||||
|
||||
if isinstance(data_2, str):
|
||||
data_2 = [data_2]
|
||||
elif isinstance(data_2, dict):
|
||||
data_2 = data_2.get("content") # type: ignore[assignment]
|
||||
|
||||
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
|
||||
|
||||
if self.model_config.is_cross_encoder:
|
||||
return await self._cross_encoding_score(
|
||||
tokenizer=tokenizer,
|
||||
texts_1=texts_1,
|
||||
texts_2=texts_2,
|
||||
data_1=data_1, # type: ignore[arg-type]
|
||||
data_2=data_2, # type: ignore[arg-type]
|
||||
request=request,
|
||||
request_id=request_id,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
@ -281,8 +339,8 @@ class ServingScores(OpenAIServing):
|
||||
else:
|
||||
return await self._embedding_score(
|
||||
tokenizer=tokenizer,
|
||||
texts_1=texts_1,
|
||||
texts_2=texts_2,
|
||||
texts_1=data_1, # type: ignore[arg-type]
|
||||
texts_2=data_2, # type: ignore[arg-type]
|
||||
request=request,
|
||||
request_id=request_id,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
@ -349,7 +407,9 @@ class ServingScores(OpenAIServing):
|
||||
|
||||
request_id = f"rerank-{self._base_request_id(raw_request)}"
|
||||
documents = request.documents
|
||||
top_n = request.top_n if request.top_n > 0 else len(documents)
|
||||
top_n = request.top_n if request.top_n > 0 else (
|
||||
len(documents)
|
||||
if isinstance(documents, list) else len(documents["content"]))
|
||||
|
||||
try:
|
||||
final_res_batch = await self._run_scoring(
|
||||
@ -410,7 +470,7 @@ class ServingScores(OpenAIServing):
|
||||
|
||||
def request_output_to_rerank_response(
|
||||
self, final_res_batch: list[PoolingRequestOutput], request_id: str,
|
||||
model_name: str, documents: list[str],
|
||||
model_name: str, documents: Union[list[str], ScoreMultiModalParam],
|
||||
top_n: int) -> RerankResponse:
|
||||
"""
|
||||
Convert the output of do_rank to a RerankResponse
|
||||
@ -422,7 +482,9 @@ class ServingScores(OpenAIServing):
|
||||
|
||||
result = RerankResult(
|
||||
index=idx,
|
||||
document=RerankDocument(text=documents[idx]),
|
||||
document=RerankDocument(text=documents[idx]) if isinstance(
|
||||
documents, list) else RerankDocument(
|
||||
multi_modal=documents["content"][idx]),
|
||||
relevance_score=classify_res.outputs.score,
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
@ -1,13 +1,41 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Union
|
||||
from typing import Any, Optional, Union, cast
|
||||
|
||||
from torch.nn import CosineSimilarity
|
||||
from typing_extensions import Required, TypeAlias, TypedDict
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
BaseMultiModalItemTracker, ChatCompletionContentPartImageEmbedsParam,
|
||||
ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam,
|
||||
MultiModalItemTracker, _ContentPart, _parse_chat_message_content_part)
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.model_executor.model_loader import get_model_cls
|
||||
from vllm.model_executor.models.interfaces import supports_score_template
|
||||
from vllm.multimodal.inputs import MultiModalDataDict
|
||||
from vllm.outputs import PoolingRequestOutput
|
||||
from vllm.transformers_utils.tokenizer import (PreTrainedTokenizer,
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||
PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast)
|
||||
|
||||
# A single multimodal content part accepted by the scoring APIs: either an
# image part or an image-embeddings part.
ScoreContentPartParam: TypeAlias = Union[
    ChatCompletionContentPartImageParam,
    ChatCompletionContentPartImageEmbedsParam,
]


class ScoreMultiModalParam(TypedDict, total=False):
    """
    Multimodal input for scoring requests.

    `CustomChatCompletionMessageParam` is deliberately not reused here:
    1. Scoring has no notion of a 'role' (user/assistant/system), which chat completions require.
    2. Exposing chat-specific fields would leave users guessing what they mean for scoring.
    3. A dedicated type keeps the interface limited to what scoring actually needs.
    """ # noqa: E501
    content: Required[list[ScoreContentPartParam]]
    """The multimodal contents"""
|
||||
|
||||
|
||||
def _cosine_similarity(
|
||||
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
||||
@ -39,12 +67,128 @@ def _cosine_similarity(
|
||||
|
||||
|
||||
def _validate_score_input_lens(
    data_1: Union[list[str], list[ScoreContentPartParam]],
    data_2: Union[list[str], list[ScoreContentPartParam]],
):
    """Check that two score inputs can be paired as 1:1, 1:N or N:N.

    Args:
        data_1: First side of the pairs (texts or multimodal content parts).
        data_2: Second side of the pairs.

    Raises:
        ValueError: If ``data_1`` has more than one element and its length
            differs from ``data_2``, or if either input is empty.
    """
    len_1 = len(data_1)
    len_2 = len(data_2)

    # A single data_1 element is allowed against any number of data_2
    # elements; otherwise the two lengths must match.
    if len_1 > 1 and len_1 != len_2:
        raise ValueError("Input lengths must be either 1:1, 1:N or N:N")
    if len_1 == 0:
        raise ValueError("At least one text element must be given")
    if len_2 == 0:
        raise ValueError("At least one text_pair element must be given")
|
||||
|
||||
|
||||
def parse_score_data(
    data_1: Union[str, ScoreContentPartParam],
    data_2: Union[str, ScoreContentPartParam],
    model_config: ModelConfig,
    tokenizer: AnyTokenizer,
) -> tuple[str, str, Optional[MultiModalDataDict]]:
    """Turn one scoring pair into two prompt strings plus multimodal data.

    Both inputs are parsed against the same multimodal tracker, so any
    multimodal items they contain are collected together and returned as the
    third element of the result.

    Raises:
        ValueError: If either parsed input is not a string.
    """
    mm_tracker = MultiModalItemTracker(model_config, tokenizer)

    def _require_str(part: Optional[_ContentPart]) -> str:
        # isinstance() already rejects None, so no separate None check.
        if not isinstance(part, str):
            raise ValueError(
                f"Only string content is supported, but got {part}.")
        return part

    # Parse both sides before validating either, in input order.
    parsed_1 = _parse_score_content(data_1, mm_tracker)
    parsed_2 = _parse_score_content(data_2, mm_tracker)

    prompt_1 = _require_str(parsed_1)
    prompt_2 = _require_str(parsed_2)

    return prompt_1, prompt_2, mm_tracker.all_mm_data()
|
||||
|
||||
|
||||
def _parse_score_content(
    data: Union[str, ScoreContentPartParam],
    mm_tracker: BaseMultiModalItemTracker,
) -> Optional[_ContentPart]:
    """Parse one scoring input into prompt content.

    A plain string is wrapped as a text content part before parsing. When the
    chat content parser yields a result it is returned directly; otherwise
    the input is expected to have registered exactly one multimodal
    placeholder with the parser, and that placeholder is returned.

    Raises:
        ValueError: If the parser produced no result and did not register
            exactly one multimodal item.
    """
    if isinstance(data, str):
        data = ChatCompletionContentPartTextParam(type="text", text=data)

    mm_parser = mm_tracker.create_parser()

    parse_res = _parse_chat_message_content_part(
        data,
        mm_parser,
        wrap_dicts=False,
        interleave_strings=False,
    )

    # Compare against None rather than truthiness: an empty text input is a
    # valid (but falsy) result and must not fall through to the multimodal
    # branch, where it would be rejected with an unrelated error message.
    if parse_res is not None:
        return parse_res

    mm_placeholder_storage = mm_parser.mm_placeholder_storage()

    # Exactly one modality with exactly one placeholder is accepted.
    if len(mm_placeholder_storage) != 1 or len(
            next(iter(mm_placeholder_storage.values()))) != 1:
        raise ValueError("Only one multi-modal item is supported")

    return next(iter(mm_placeholder_storage.values()))[0]
|
||||
|
||||
|
||||
def apply_score_template(
    model_config: ModelConfig,
    prompt_1: str,
    prompt_2: str,
) -> str:
    """Render the model's score template for a pair of prompts.

    Raises:
        ValueError: If the model class does not support score templates, or
            if its template returns ``None``.
    """
    model_cls = get_model_cls(model_config)

    # Reject unsupported architectures up front.
    if not supports_score_template(model_cls):
        raise ValueError(
            f"Unsupported model architecture: {model_config.architecture}")

    rendered = model_cls.get_score_template(prompt_1, prompt_2)
    if rendered is None:
        raise ValueError("Get empty score template from model")
    return rendered
|
||||
|
||||
|
||||
def post_process_tokens(
    model_config: ModelConfig,
    prompt: TokensPrompt,
) -> None:
    """
    Let the model class apply its architecture-specific token adjustments.

    Nothing happens for model classes that do not support score templates.

    Note:
        ``prompt`` is modified in place; nothing is returned.
    """
    model_cls = get_model_cls(model_config)
    if not supports_score_template(model_cls):
        return
    model_cls.post_process_tokens(prompt)
|
||||
|
||||
|
||||
def get_score_prompt(
    model_config: ModelConfig,
    tokenizer: AnyTokenizer,
    tokenization_kwargs: dict[str, Any],
    data_1: Union[str, ScoreContentPartParam],
    data_2: Union[str, ScoreContentPartParam],
) -> tuple[str, TokensPrompt]:
    """Build the text prompt and the engine prompt for one scoring pair.

    Steps: parse both inputs, render the model's score template, tokenize the
    rendered prompt, apply the model's token post-processing, and finally
    attach any multimodal data collected while parsing.

    Returns:
        ``(full_prompt, engine_prompt)``.
    """
    text_1, text_2, mm_data = parse_score_data(
        data_1=data_1,
        data_2=data_2,
        model_config=model_config,
        tokenizer=tokenizer,
    )

    full_prompt = apply_score_template(model_config, text_1, text_2)

    encoded = tokenizer(full_prompt, **tokenization_kwargs)
    engine_prompt = TokensPrompt(prompt_token_ids=encoded["input_ids"])

    # Token post-processing runs before the multimodal data is attached.
    post_process_tokens(model_config, engine_prompt)

    if mm_data is not None:
        engine_prompt["multi_modal_data"] = mm_data

    return full_prompt, engine_prompt
|
||||
|
||||
@ -170,6 +170,15 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||
vllm_config.model_config.hf_config.method = "from_2_way_softmax"
|
||||
|
||||
|
||||
class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
    """Config hook that pins the JinaVL reranker to a single output label."""

    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        # The HF config is updated in place: the model emits one label.
        vllm_config.model_config.hf_config.num_labels = 1
|
||||
|
||||
|
||||
class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
|
||||
|
||||
@staticmethod
|
||||
@ -197,4 +206,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
|
||||
"NomicBertModel": NomicBertModelConfig,
|
||||
"Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
|
||||
"XLMRobertaModel": JinaRobertaModelConfig,
|
||||
"JinaVLForRanking": JinaVLForSequenceClassificationConfig,
|
||||
}
|
||||
|
||||
@ -9,6 +9,7 @@ import torch
|
||||
from torch import Tensor
|
||||
from typing_extensions import Self, TypeIs
|
||||
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
@ -141,6 +142,62 @@ def supports_multimodal(
|
||||
return isinstance(model, SupportsMultiModal)
|
||||
|
||||
|
||||
@runtime_checkable
class SupportsScoreTemplate(Protocol):
    """Interface implemented by models that provide a score template."""

    supports_score_template: ClassVar[Literal[True]] = True
    """
    Marker flag declaring score-template support.

    Note:
        Model classes that already have this class in their MRO inherit
        the flag and do not need to set it again.
    """

    @classmethod
    def get_score_template(cls, query: str, document: str) -> Optional[str]:
        """
        Fill the score template with the query and document content and
        return the resulting full prompt.
        """
        ...

    @classmethod
    def post_process_tokens(cls, prompt: TokensPrompt) -> None:
        """
        Apply architecture-specific adjustments to the input tokens.
        """
        ...
|
||||
|
||||
|
||||
# runtime_checkable protocols with ClassVar members cannot be used in
# issubclass() checks, so a class object is treated as an instance and
# tested with isinstance() against this attribute-only protocol instead.
@runtime_checkable
class _SupportsScoreTemplateType(Protocol):
    supports_score_template: Literal[True]


@overload
def supports_score_template(
        model: type[object]) -> TypeIs[type[SupportsScoreTemplate]]:
    ...


@overload
def supports_score_template(model: object) -> TypeIs[SupportsScoreTemplate]:
    ...


def supports_score_template(
    model: Union[type[object], object],
) -> Union[TypeIs[type[SupportsScoreTemplate]], TypeIs[SupportsScoreTemplate]]:
    """Return whether a model class or model instance supports score templates.

    Class objects are checked against the attribute-only helper protocol;
    instances are checked against ``SupportsScoreTemplate`` itself.
    """
    protocol = (_SupportsScoreTemplateType
                if isinstance(model, type) else SupportsScoreTemplate)
    return isinstance(model, protocol)
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class SupportsLoRA(Protocol):
|
||||
"""The interface required for all models that support LoRA."""
|
||||
|
||||
150
vllm/model_executor/models/jina_vl.py
Normal file
150
vllm/model_executor/models/jina_vl.py
Normal file
@ -0,0 +1,150 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Iterable, Mapping
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import BatchFeature, PretrainedConfig
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.pooler import Pooler, PoolingType
|
||||
from vllm.model_executor.pooling_metadata import PoolingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.sequence import IntermediateTensors, PoolerOutput
|
||||
|
||||
from .interfaces import (SupportsCrossEncoding, SupportsMultiModal,
|
||||
SupportsScoreTemplate)
|
||||
from .qwen2_vl import (Qwen2VLDummyInputsBuilder,
|
||||
Qwen2VLForConditionalGeneration,
|
||||
Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo)
|
||||
from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class JinaVLScorer(nn.Module):
    """Scoring head: linear layer, ReLU, then a projection to the labels."""

    def __init__(self, config: PretrainedConfig):
        super().__init__()
        hidden_size = config.hidden_size
        # hidden_size -> hidden_size
        self.dense = ColumnParallelLinear(hidden_size, hidden_size, bias=True)
        # hidden_size -> num_labels
        self.out_proj = RowParallelLinear(hidden_size,
                                          config.num_labels,
                                          bias=True)

    def forward(self, x, **kwargs):
        """Map ``x`` to logits; extra keyword arguments are ignored."""
        # Each parallel linear layer returns a pair; only the first element
        # is used.
        hidden, _ = self.dense(x)
        logits, _ = self.out_proj(torch.relu(hidden))
        return logits
|
||||
|
||||
|
||||
class JinaVLMultiModalProcessor(Qwen2VLMultiModalProcessor):
    """Qwen2-VL processor that feeds multimodal items in reversed order."""

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        # The JinaVLForRanking score template places the document before the
        # query, whereas mm_data stores the items query-first. Flip every
        # modality's item list so the items line up with the prompt.
        # NOTE(review): the lists are reversed in place, so the caller's
        # mm_data is mutated; this assumes each value is a list and that the
        # method runs once per mm_data object. Confirm against the caller.
        for items in mm_data.values():
            items.reverse()

        return super()._call_hf_processor(prompt, mm_data, mm_kwargs,
                                          tok_kwargs)
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_processor(JinaVLMultiModalProcessor,
                                        info=Qwen2VLProcessingInfo,
                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration,
                                      SupportsCrossEncoding,
                                      SupportsMultiModal,
                                      SupportsScoreTemplate):
    """Qwen2-VL backbone with a scoring head for query/document reranking."""

    # Checkpoint-prefix -> module-prefix rewrites applied while loading.
    # NOTE(review): the specific "model.language_model." entry is listed
    # before the generic "model." entry; presumably the order is significant.
    weight_mapper = WeightsMapper(
        orig_to_new_prefix={
            "score.0.": "score.dense.",
            "score.2.": "score.out_proj.",
            # mapping for new names in checkpoint saved after transformers v4.52
            "model.language_model.": "language_model.model.",
            "visual.": "visual.",
            # mapping for original checkpoint
            "lm_head.": "language_model.lm_head.",
            "model.": "language_model.model.",
        })

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config,
                         prefix=maybe_prefix(prefix, "qwen2_vl"))
        model_config = vllm_config.model_config

        # logit bias for sigmoid normalization
        self.LOGIT_BIAS = 2.65

        self.score = JinaVLScorer(model_config.hf_config)

        self._pooler = Pooler.from_config_with_defaults(
            model_config.pooler_config,
            pooling_type=PoolingType.LAST,
            normalize=False,
            softmax=True)

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        """Return the image placeholder; any other modality is rejected."""
        if not modality.startswith("image"):
            raise ValueError("Only image modality is supported")

        return "<|vision_start|><|image_pad|><|vision_end|>"

    @classmethod
    def get_score_template(cls, query: str, document: str) -> Optional[str]:
        """Render the scoring prompt, document first and query second."""
        return f"**Document**:\n{document}\n**Query**:\n{query}"

    @classmethod
    def post_process_tokens(cls, prompt: TokensPrompt) -> None:
        """Append the score target token to the prompt tokens, in place."""
        # add score target token at the end of prompt tokens
        token_ids = prompt["prompt_token_ids"]
        token_ids.append(100)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> torch.Tensor:
        """Run the backbone, then the scoring head, and subtract the bias.

        NOTE(review): assumes the parent forward returns a hidden-state
        tensor that the scoring head can consume directly.
        """
        backbone_output = super().forward(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

        return self.score(backbone_output) - self.LOGIT_BIAS

    def pooler(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> Optional[PoolerOutput]:
        """Delegate pooling to the pooler configured in ``__init__``."""
        return self._pooler(hidden_states, pooling_metadata)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        """Load checkpoint weights, renaming prefixes via ``weight_mapper``."""
        return AutoWeightsLoader(self).load_weights(weights,
                                                    mapper=self.weight_mapper)
|
||||
@ -181,6 +181,7 @@ _CROSS_ENCODER_MODELS = {
|
||||
"GemmaForSequenceClassification": ("gemma", "GemmaForSequenceClassification"), # noqa: E501
|
||||
"Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForSequenceClassification"), # noqa: E501
|
||||
"Qwen3ForSequenceClassification": ("qwen3", "Qwen3ForSequenceClassification"), # noqa: E501
|
||||
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501
|
||||
}
|
||||
|
||||
_MULTIMODAL_MODELS = {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user