mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-22 10:44:26 +08:00
[Model][VLM] Support JinaVL Reranker (#20260)
Signed-off-by: shineran96 <shinewang96@gmail.com>
This commit is contained in:
parent
b140416abf
commit
4bed167768
@ -282,7 +282,7 @@ steps:
|
|||||||
- python3 offline_inference/llm_engine_example.py
|
- python3 offline_inference/llm_engine_example.py
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/encoder_decoder.py
|
- python3 offline_inference/encoder_decoder.py
|
||||||
|
|||||||
@ -712,6 +712,14 @@ The following table lists those that are tested in vLLM.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
#### Scoring
|
||||||
|
|
||||||
|
Specified using `--task score`.
|
||||||
|
|
||||||
|
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|
||||||
|
|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------|
|
||||||
|
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ |
|
||||||
|
|
||||||
## Model Support Policy
|
## Model Support Policy
|
||||||
|
|
||||||
At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
|
At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
|
||||||
|
|||||||
@ -537,7 +537,7 @@ The following extra parameters are supported:
|
|||||||
|
|
||||||
### Score API
|
### Score API
|
||||||
|
|
||||||
Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair.
|
Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair.
|
||||||
Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1.
|
Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1.
|
||||||
|
|
||||||
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||||
@ -676,6 +676,55 @@ The total number of pairs is `len(text_2)`.
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Multi-modal inputs
|
||||||
|
|
||||||
|
You can pass multi-modal inputs to scoring models by providing `content` that contains a list of multi-modal inputs (images, etc.) in the request. Refer to the examples below for illustration.
|
||||||
|
|
||||||
|
=== "JinaVL-Reranker"
|
||||||
|
|
||||||
|
To serve the model:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve jinaai/jina-reranker-m0
|
||||||
|
```
|
||||||
|
|
||||||
|
Since the request schema is not defined by the OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||||
|
|
||||||
|
??? Code
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:8000/v1/score",
|
||||||
|
json={
|
||||||
|
"model": "jinaai/jina-reranker-m0",
|
||||||
|
"text_1": "slm markdown",
|
||||||
|
"text_2": {
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
response_json = response.json()
|
||||||
|
print("Scoring output:", response_json["data"][0]["score"])
|
||||||
|
print("Scoring output:", response_json["data"][1]["score"])
|
||||||
|
```
|
||||||
|
Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_multimodal.py>
|
||||||
|
|
||||||
#### Extra parameters
|
#### Extra parameters
|
||||||
|
|
||||||
The following [pooling parameters][pooling-params] are supported.
|
The following [pooling parameters][pooling-params] are supported.
|
||||||
@ -695,8 +744,7 @@ The following extra parameters are supported:
|
|||||||
### Re-rank API
|
### Re-rank API
|
||||||
|
|
||||||
Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and
|
Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and
|
||||||
each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences, on
|
each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences or multi-modal inputs (image, etc.), on a scale of 0 to 1.
|
||||||
a scale of 0 to 1.
|
|
||||||
|
|
||||||
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||||
|
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
This example shows how to use vLLM for running offline inference with
|
This example shows how to use vLLM for running offline inference with
|
||||||
the correct prompt format on vision language models for multimodal embedding.
|
the correct prompt format on vision language models for multimodal pooling.
|
||||||
|
|
||||||
For most models, the prompt format should follow corresponding examples
|
For most models, the prompt format should follow corresponding examples
|
||||||
on HuggingFace model repository.
|
on HuggingFace model repository.
|
||||||
@ -15,6 +15,7 @@ from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
|
|||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
|
|
||||||
from vllm import LLM, EngineArgs
|
from vllm import LLM, EngineArgs
|
||||||
|
from vllm.entrypoints.score_utils import ScoreMultiModalParam
|
||||||
from vllm.multimodal.utils import fetch_image
|
from vllm.multimodal.utils import fetch_image
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
@ -35,14 +36,22 @@ class TextImageQuery(TypedDict):
|
|||||||
image: Image
|
image: Image
|
||||||
|
|
||||||
|
|
||||||
QueryModality = Literal["text", "image", "text+image"]
|
class TextImagesQuery(TypedDict):
|
||||||
Query = Union[TextQuery, ImageQuery, TextImageQuery]
|
modality: Literal["text+images"]
|
||||||
|
text: str
|
||||||
|
image: ScoreMultiModalParam
|
||||||
|
|
||||||
|
|
||||||
|
QueryModality = Literal["text", "image", "text+image", "text+images"]
|
||||||
|
Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
|
||||||
|
|
||||||
|
|
||||||
class ModelRequestData(NamedTuple):
|
class ModelRequestData(NamedTuple):
|
||||||
engine_args: EngineArgs
|
engine_args: EngineArgs
|
||||||
prompt: str
|
prompt: Optional[str] = None
|
||||||
image: Optional[Image]
|
image: Optional[Image] = None
|
||||||
|
query: Optional[str] = None
|
||||||
|
documents: Optional[ScoreMultiModalParam] = None
|
||||||
|
|
||||||
|
|
||||||
def run_e5_v(query: Query) -> ModelRequestData:
|
def run_e5_v(query: Query) -> ModelRequestData:
|
||||||
@ -107,6 +116,29 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_jinavl_reranker(query: Query) -> ModelRequestData:
    """Prepare engine arguments and scoring inputs for jina-reranker-m0.

    Args:
        query: A query dict; only the ``"text+images"`` modality is accepted.

    Returns:
        A ``ModelRequestData`` whose ``query`` is the query text and whose
        ``documents`` is the query's ``image`` entry.

    Raises:
        ValueError: If the query modality is not ``"text+images"``.
    """
    modality = query["modality"]
    if modality != "text+images":
        raise ValueError(f"Unsupported query modality: '{modality}'")

    # Pixel bounds forwarded to the multi-modal processor.
    pixel_bounds = {
        "min_pixels": 3136,
        "max_pixels": 602112,
    }

    reranker_args = EngineArgs(
        model="jinaai/jina-reranker-m0",
        task="score",
        max_model_len=32768,
        trust_remote_code=True,
        mm_processor_kwargs=pixel_bounds,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=reranker_args,
        query=query["text"],
        documents=query["image"],
    )
|
||||||
|
|
||||||
|
|
||||||
def get_query(modality: QueryModality):
|
def get_query(modality: QueryModality):
|
||||||
if modality == "text":
|
if modality == "text":
|
||||||
return TextQuery(modality="text", text="A dog sitting in the grass")
|
return TextQuery(modality="text", text="A dog sitting in the grass")
|
||||||
@ -128,6 +160,28 @@ def get_query(modality: QueryModality):
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if modality == "text+images":
|
||||||
|
return TextImagesQuery(
|
||||||
|
modality="text+images",
|
||||||
|
text="slm markdown",
|
||||||
|
image={
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
msg = f"Modality {modality} is not supported."
|
msg = f"Modality {modality} is not supported."
|
||||||
raise ValueError(msg)
|
raise ValueError(msg)
|
||||||
|
|
||||||
@ -162,16 +216,31 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
|
|||||||
print("-" * 50)
|
print("-" * 50)
|
||||||
|
|
||||||
|
|
||||||
|
def run_score(model: str, modality: QueryModality, seed: Optional[int]):
    """Run the scoring demo for ``model`` and print the resulting scores.

    Args:
        model: Key into ``model_example_map`` selecting the request builder.
        modality: Query modality handed to ``get_query``.
        seed: Seed merged into the engine arguments.

    Raises:
        ValueError: If the selected builder does not supply both ``query``
            and ``documents`` (i.e. it only prepares embedding inputs).
    """
    query = get_query(modality)
    req_data = model_example_map[model](query)

    # `query` and `documents` default to None on ModelRequestData. Fail here,
    # before the engine is started, instead of handing None to `LLM.score`.
    if req_data.query is None or req_data.documents is None:
        raise ValueError(
            f"Model '{model}' does not provide scoring inputs; "
            "choose a model that supports the scoring task."
        )

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    outputs = llm.score(req_data.query, req_data.documents)

    print("-" * 30)
    print([output.outputs.score for output in outputs])
    print("-" * 30)
|
||||||
|
|
||||||
|
|
||||||
model_example_map = {
|
model_example_map = {
|
||||||
"e5_v": run_e5_v,
|
"e5_v": run_e5_v,
|
||||||
"vlm2vec": run_vlm2vec,
|
"vlm2vec": run_vlm2vec,
|
||||||
|
"jinavl_reranker": run_jinavl_reranker,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = FlexibleArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Demo on using vLLM for offline inference with "
|
description="Demo on using vLLM for offline inference with "
|
||||||
"vision language models for multimodal embedding"
|
"vision language models for multimodal pooling tasks."
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--model-name",
|
"--model-name",
|
||||||
@ -181,6 +250,14 @@ def parse_args():
|
|||||||
choices=model_example_map.keys(),
|
choices=model_example_map.keys(),
|
||||||
help="The name of the embedding model.",
|
help="The name of the embedding model.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--task",
|
||||||
|
"-t",
|
||||||
|
type=str,
|
||||||
|
default="embedding",
|
||||||
|
choices=["embedding", "scoring"],
|
||||||
|
help="The task type.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--modality",
|
"--modality",
|
||||||
type=str,
|
type=str,
|
||||||
@ -198,7 +275,12 @@ def parse_args():
|
|||||||
|
|
||||||
|
|
||||||
def main(args: Namespace):
|
def main(args: Namespace):
|
||||||
run_encode(args.model_name, args.modality, args.seed)
|
if args.task == "embedding":
|
||||||
|
run_encode(args.model_name, args.modality, args.seed)
|
||||||
|
elif args.task == "scoring":
|
||||||
|
run_score(args.model_name, args.modality, args.seed)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported task: {args.task}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -0,0 +1,60 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""
|
||||||
|
Example online usage of Score API.
|
||||||
|
|
||||||
|
Run `vllm serve <model> --task score` to start up the server in vLLM.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import pprint
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """POST ``prompt`` as a JSON body to ``api_url`` and return the response.

    The response is returned as-is; no status check is performed here.
    """
    return requests.post(
        api_url,
        headers={"User-Agent": "Test Client"},
        json=prompt,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command-line options of this example client.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port`` and ``model``.
    """
    parser = argparse.ArgumentParser(
        description="Send a multi-modal scoring request to a running "
        "vLLM server."
    )
    parser.add_argument(
        "--host",
        type=str,
        default="localhost",
        help="Host name of the vLLM server (default: %(default)s).",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port of the vLLM server (default: %(default)s).",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="jinaai/jina-reranker-m0",
        help="Model name sent in the request (default: %(default)s).",
    )
    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Score one text query against two image documents and print the result.

    Builds a request with ``text_1`` as a plain string and ``text_2`` as a
    ``content`` list of ``image_url`` parts, posts it to the ``/score``
    endpoint at ``args.host``:``args.port``, then pretty-prints both the
    request and the JSON response.
    """
    endpoint = f"http://{args.host}:{args.port}/score"

    image_urls = (
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",  # noqa: E501
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",  # noqa: E501
    )
    image_documents = {
        "content": [
            {"type": "image_url", "image_url": {"url": url}}
            for url in image_urls
        ]
    }

    prompt = {
        "model": args.model,
        "text_1": "slm markdown",
        "text_2": image_documents,
    }
    score_response = post_http_request(prompt=prompt, api_url=endpoint)

    print("\nPrompt when text_1 is string and text_2 is a image list:")
    pprint.pprint(prompt)
    print("\nScore Response:")
    pprint.pprint(score_response.json())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
args = parse_args()
|
||||||
|
main(args)
|
||||||
160
tests/models/multimodal/pooling/test_jinavl_reranker.py
Normal file
160
tests/models/multimodal/pooling/test_jinavl_reranker.py
Normal file
@ -0,0 +1,160 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from transformers import AutoModel
|
||||||
|
|
||||||
|
model_name = "jinaai/jina-reranker-m0"
|
||||||
|
|
||||||
|
mm_processor_kwargs = {
|
||||||
|
"min_pixels": 3136,
|
||||||
|
"max_pixels": 602112,
|
||||||
|
}
|
||||||
|
|
||||||
|
limit_mm_per_prompt = {"image": 2}
|
||||||
|
|
||||||
|
|
||||||
|
def vllm_reranker(model_name,
                  query,
                  documents,
                  query_type="text",
                  doc_type="text"):
    """Score ``query`` against ``documents`` with vLLM and return the scores.

    When ``query_type`` / ``doc_type`` is ``"image"``, the corresponding
    argument is treated as an iterable of URLs and wrapped into a
    ``{"content": [...]}`` dict of ``image_url`` parts before scoring.
    Otherwise it is passed to ``LLM.score`` unchanged.
    """
    from vllm import LLM

    engine = LLM(
        model=model_name,
        task="score",
        max_model_len=32768,
        mm_processor_kwargs=mm_processor_kwargs,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )

    def as_image_content(urls):
        # One `image_url` content part per URL.
        parts = []
        for url in urls:
            parts.append({"type": "image_url", "image_url": {"url": f"{url}"}})
        return {"content": parts}

    if query_type == "image":
        query = as_image_content(query)

    if doc_type == "image":
        documents = as_image_content(documents)

    results = engine.score(query, documents)

    return [result.outputs.score for result in results]
|
||||||
|
|
||||||
|
|
||||||
|
def hf_reranker(model_name,
                query,
                documents,
                query_type="text",
                doc_type="text"):
    """Score the first query entry against every document with the HF model.

    The model is loaded through ``AutoModel.from_pretrained`` with remote
    code enabled, moved to CUDA and put into eval mode. Only ``query[0]``
    is used; it is paired with each element of ``documents``.
    """
    # Passed as `key_mapping` to `from_pretrained`; presumably remaps
    # checkpoint weight prefixes to the HF module layout (TODO confirm).
    prefix_mapping = {
        "visual.": "model.visual.",
        "model.": "model.language_model.",
    }

    hf_model = AutoModel.from_pretrained(
        model_name,
        torch_dtype="auto",
        trust_remote_code=True,
        key_mapping=prefix_mapping,
    )
    hf_model = hf_model.to("cuda").eval()

    pairs = [[query[0], doc] for doc in documents]

    return hf_model.compute_score(
        pairs,
        max_length=2048,
        query_type=query_type,
        doc_type=doc_type,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Visual Documents Reranking
|
||||||
|
@pytest.mark.parametrize("model_name", [model_name])
def test_model_text_image(model_name):
    """Text query vs. image documents: HF and vLLM scores agree within 2%."""
    base_url = "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/"  # noqa: E501
    query = ["slm markdown"]
    documents = [
        base_url + "handelsblatt-preview.png",
        base_url + "paper-11.png",
    ]

    hf_outputs = hf_reranker(model_name,
                             query,
                             documents,
                             query_type="text",
                             doc_type="image")
    vllm_outputs = vllm_reranker(model_name,
                                 query,
                                 documents,
                                 query_type="text",
                                 doc_type="image")

    # Compare the scores of both documents, in order.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
|
||||||
|
|
||||||
|
|
||||||
|
# Textual Documents Reranking
|
||||||
|
@pytest.mark.parametrize("model_name", [model_name])
def test_model_text_text(model_name):
    """Text query vs. text documents: HF and vLLM scores agree within 2%."""
    abstract = """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements."""  # noqa: E501

    query = ["slm markdown"]
    documents = [
        abstract,
        "数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
    ]

    hf_outputs = hf_reranker(model_name,
                             query,
                             documents,
                             query_type="text",
                             doc_type="text")
    vllm_outputs = vllm_reranker(model_name,
                                 query,
                                 documents,
                                 query_type="text",
                                 doc_type="text")

    # Compare the scores of both documents, in order.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
|
||||||
|
|
||||||
|
|
||||||
|
# Image Querying for Textual Documents
|
||||||
|
@pytest.mark.parametrize("model_name", [model_name])
def test_model_image_text(model_name):
    """Image query vs. text documents: HF and vLLM scores agree within 2%."""
    abstract = """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements."""  # noqa: E501

    query = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"  # noqa: E501
    ]
    documents = [
        abstract,
        "数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
    ]

    hf_outputs = hf_reranker(model_name,
                             query,
                             documents,
                             query_type="image",
                             doc_type="text")
    vllm_outputs = vllm_reranker(model_name,
                                 query,
                                 documents,
                                 query_type="image",
                                 doc_type="text")

    # Compare the scores of both documents, in order.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
|
||||||
|
|
||||||
|
|
||||||
|
# Image Querying for Image Documents
|
||||||
|
@pytest.mark.parametrize("model_name", [model_name])
def test_model_image_image(model_name):
    """Image query vs. image documents: HF and vLLM scores agree within 2%."""
    base_url = "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/"  # noqa: E501
    query = [base_url + "paper-11.png"]
    documents = [
        base_url + "handelsblatt-preview.png",
        base_url + "paper-11.png",
    ]

    hf_outputs = hf_reranker(model_name,
                             query,
                             documents,
                             query_type="image",
                             doc_type="image")
    vllm_outputs = vllm_reranker(model_name,
                                 query,
                                 documents,
                                 query_type="image",
                                 doc_type="image")

    # Compare the scores of both documents, in order.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
|
||||||
@ -432,6 +432,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
|||||||
trust_remote_code=True), # noqa: E501
|
trust_remote_code=True), # noqa: E501
|
||||||
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
||||||
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
|
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
|
||||||
|
|
||||||
|
# [Cross-encoder]
|
||||||
|
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501
|
||||||
}
|
}
|
||||||
|
|
||||||
_SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
_SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||||
|
|||||||
@ -28,8 +28,11 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
|
|||||||
apply_mistral_chat_template,
|
apply_mistral_chat_template,
|
||||||
parse_chat_messages,
|
parse_chat_messages,
|
||||||
resolve_chat_template_content_format)
|
resolve_chat_template_content_format)
|
||||||
from vllm.entrypoints.score_utils import (_cosine_similarity,
|
from vllm.entrypoints.score_utils import (ScoreContentPartParam,
|
||||||
_validate_score_input_lens)
|
ScoreMultiModalParam,
|
||||||
|
_cosine_similarity,
|
||||||
|
_validate_score_input_lens,
|
||||||
|
get_score_prompt)
|
||||||
from vllm.entrypoints.utils import _validate_truncation_size
|
from vllm.entrypoints.utils import _validate_truncation_size
|
||||||
from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt
|
from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt
|
||||||
from vllm.inputs.parse import parse_and_batch_prompt
|
from vllm.inputs.parse import parse_and_batch_prompt
|
||||||
@ -1187,8 +1190,8 @@ class LLM:
|
|||||||
def _cross_encoding_score(
|
def _cross_encoding_score(
|
||||||
self,
|
self,
|
||||||
tokenizer: AnyTokenizer,
|
tokenizer: AnyTokenizer,
|
||||||
text_1: list[str],
|
data_1: Union[list[str], list[ScoreContentPartParam]],
|
||||||
text_2: list[str],
|
data_2: Union[list[str], list[ScoreContentPartParam]],
|
||||||
truncate_prompt_tokens: Optional[int] = None,
|
truncate_prompt_tokens: Optional[int] = None,
|
||||||
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
|
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
|
||||||
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
|
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
|
||||||
@ -1199,10 +1202,8 @@ class LLM:
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Score API is only enabled for `--task embed or score`")
|
"Score API is only enabled for `--task embed or score`")
|
||||||
|
|
||||||
if len(text_1) == 1:
|
if len(data_1) == 1:
|
||||||
text_1 = text_1 * len(text_2)
|
data_1 = data_1 * len(data_2)
|
||||||
|
|
||||||
input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
|
|
||||||
|
|
||||||
pooling_params = PoolingParams(use_cross_encoder=True)
|
pooling_params = PoolingParams(use_cross_encoder=True)
|
||||||
tokenization_kwargs: dict[str, Any] = {}
|
tokenization_kwargs: dict[str, Any] = {}
|
||||||
@ -1211,19 +1212,41 @@ class LLM:
|
|||||||
|
|
||||||
parsed_prompts = []
|
parsed_prompts = []
|
||||||
|
|
||||||
for q, t in input_pairs:
|
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
|
||||||
if self.llm_engine.model_config.use_pad_token:
|
|
||||||
# cross_encoder models defaults to using pad_token.
|
if self.llm_engine.model_config.is_multimodal_model:
|
||||||
prompt_inputs = tokenizer(text=q,
|
|
||||||
text_pair=t,
|
model_config = self.llm_engine.model_config
|
||||||
**tokenization_kwargs)
|
|
||||||
else:
|
for q, d in input_pairs:
|
||||||
# `llm as reranker` models defaults to not using pad_token.
|
_, engine_prompt = get_score_prompt(
|
||||||
prompt_inputs = tokenizer(text=q + t, **tokenization_kwargs)
|
model_config=model_config,
|
||||||
engine_prompt = TokensPrompt(
|
data_1=q,
|
||||||
prompt_token_ids=prompt_inputs["input_ids"],
|
data_2=d,
|
||||||
token_type_ids=prompt_inputs.get("token_type_ids"))
|
tokenizer=tokenizer,
|
||||||
parsed_prompts.append(engine_prompt)
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
parsed_prompts.append(engine_prompt)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
for q, t in input_pairs:
|
||||||
|
if self.llm_engine.model_config.use_pad_token:
|
||||||
|
# cross_encoder models defaults to using pad_token.
|
||||||
|
prompt_inputs = tokenizer(
|
||||||
|
text=q, # type: ignore[arg-type]
|
||||||
|
text_pair=t, # type: ignore[arg-type]
|
||||||
|
**tokenization_kwargs)
|
||||||
|
else:
|
||||||
|
# `llm as reranker` models defaults to not using pad_token.
|
||||||
|
prompt_inputs = tokenizer(
|
||||||
|
text=q + t, # type: ignore[operator]
|
||||||
|
**tokenization_kwargs)
|
||||||
|
engine_prompt = TokensPrompt(
|
||||||
|
prompt_token_ids=prompt_inputs["input_ids"],
|
||||||
|
token_type_ids=prompt_inputs.get("token_type_ids"))
|
||||||
|
parsed_prompts.append(engine_prompt)
|
||||||
|
|
||||||
self._validate_and_add_requests(
|
self._validate_and_add_requests(
|
||||||
prompts=parsed_prompts,
|
prompts=parsed_prompts,
|
||||||
@ -1241,8 +1264,10 @@ class LLM:
|
|||||||
|
|
||||||
def score(
|
def score(
|
||||||
self,
|
self,
|
||||||
text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]],
|
data_1: Union[SingletonPrompt, Sequence[SingletonPrompt],
|
||||||
text_2: Union[SingletonPrompt, Sequence[SingletonPrompt]],
|
ScoreMultiModalParam],
|
||||||
|
data_2: Union[SingletonPrompt, Sequence[SingletonPrompt],
|
||||||
|
ScoreMultiModalParam],
|
||||||
/,
|
/,
|
||||||
*,
|
*,
|
||||||
truncate_prompt_tokens: Optional[int] = None,
|
truncate_prompt_tokens: Optional[int] = None,
|
||||||
@ -1250,22 +1275,30 @@ class LLM:
|
|||||||
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
|
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
|
||||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||||
) -> list[ScoringRequestOutput]:
|
) -> list[ScoringRequestOutput]:
|
||||||
"""Generate similarity scores for all pairs `<text,text_pair>`.
|
"""Generate similarity scores for all pairs `<text,text_pair>` or
|
||||||
|
`<multi-modal data, multi-modal data pair>`.
|
||||||
|
|
||||||
The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
|
The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
|
||||||
In the `1 - N` case the `text_1` sentence will be replicated `N`
|
In the `1 - N` case the `data_1` input will be replicated `N`
|
||||||
times to pair with the `text_2` sentences.
|
times to pair with the `data_2` inputs.
|
||||||
The input pairs are used to build a list of prompts for the
|
The input pairs are used to build a list of prompts for the
|
||||||
cross encoder model. This class automatically batches the prompts,
|
cross encoder model. This class automatically batches the prompts,
|
||||||
considering the memory constraint. For the best performance, put all
|
considering the memory constraint. For the best performance, put all
|
||||||
of your texts into a single list and pass it to this method.
|
of your inputs into a single list and pass it to this method.
|
||||||
|
|
||||||
|
Supports both text and multi-modal data (images, etc.) when used with
|
||||||
|
appropriate multi-modal models. For multi-modal inputs, ensure the
|
||||||
|
prompt structure matches the model's expected input format.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text_1: can be a single prompt or a list of prompts, in which
|
data_1: Can be a single prompt, a list of prompts or
|
||||||
case it has to have the same length as the `text_2` list
|
`ScoreMultiModalParam`, which can contain either text or
|
||||||
text_2: The texts to pair with the query to form the input
|
multi-modal data. When a list, it must have the same length as
|
||||||
to the LLM. See [PromptType][vllm.inputs.PromptType] for
|
the `data_2` list.
|
||||||
more details about the format of each prompts.
|
data_2: The data to pair with the query to form the input to
|
||||||
|
the LLM. Can be text or multi-modal data. See [PromptType]
|
||||||
|
[vllm.inputs.PromptType] for more details about the format of
|
||||||
|
each prompt.
|
||||||
use_tqdm: If `True`, shows a tqdm progress bar.
|
use_tqdm: If `True`, shows a tqdm progress bar.
|
||||||
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
|
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
|
||||||
it is used to create the progress bar.
|
it is used to create the progress bar.
|
||||||
@ -1306,42 +1339,70 @@ class LLM:
|
|||||||
# lists of tokens to the `text` and `text_pair` kwargs
|
# lists of tokens to the `text` and `text_pair` kwargs
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
def ensure_str(prompt: SingletonPrompt):
|
if not self.llm_engine.model_config.is_multimodal_model:
|
||||||
if isinstance(prompt, dict):
|
|
||||||
if "multi_modal_data" in prompt:
|
|
||||||
raise ValueError("Multi-modal prompt is not "
|
|
||||||
"supported for scoring")
|
|
||||||
elif "prompt_token_ids" in prompt:
|
|
||||||
prompt = tokenizer.decode(
|
|
||||||
cast(TokensPrompt, prompt)["prompt_token_ids"])
|
|
||||||
elif "prompt" in prompt:
|
|
||||||
prompt = cast(TextPrompt, prompt)["prompt"]
|
|
||||||
assert type(prompt) is str
|
|
||||||
return prompt
|
|
||||||
|
|
||||||
if isinstance(text_1, (str, dict)):
|
def check_data_type(data: Union[SingletonPrompt,
|
||||||
# Convert a single prompt to a list.
|
Sequence[SingletonPrompt],
|
||||||
text_1 = [text_1]
|
ScoreMultiModalParam]):
|
||||||
input_text_1: list[str] = [ensure_str(t) for t in text_1]
|
if isinstance(data, dict) and "content" in data:
|
||||||
|
raise ValueError(
|
||||||
|
f"ScoreMultiModalParam is not supported for {self.llm_engine.model_config.architecture}", # noqa: E501
|
||||||
|
)
|
||||||
|
|
||||||
if isinstance(text_2, (str, dict)):
|
check_data_type(data_1)
|
||||||
# Convert a single prompt to a list.
|
check_data_type(data_2)
|
||||||
text_2 = [text_2]
|
|
||||||
input_text_2: list[str] = [ensure_str(t) for t in text_2]
|
|
||||||
|
|
||||||
_validate_score_input_lens(input_text_1, input_text_2)
|
def ensure_str(prompt: SingletonPrompt):
|
||||||
|
if isinstance(prompt, dict):
|
||||||
|
if "multi_modal_data" in prompt:
|
||||||
|
raise ValueError("Multi-modal prompt is not "
|
||||||
|
"supported for scoring")
|
||||||
|
elif "prompt_token_ids" in prompt:
|
||||||
|
prompt = tokenizer.decode(
|
||||||
|
cast(TokensPrompt, prompt)["prompt_token_ids"])
|
||||||
|
elif "prompt" in prompt:
|
||||||
|
prompt = cast(TextPrompt, prompt)["prompt"]
|
||||||
|
assert type(prompt) is str
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
if isinstance(data_1, (str, dict)):
|
||||||
|
# Convert a single prompt to a list.
|
||||||
|
data_1 = [data_1] # type: ignore[list-item]
|
||||||
|
|
||||||
|
data_1 = [ensure_str(t) for t in data_1]
|
||||||
|
|
||||||
|
if isinstance(data_2, (str, dict)):
|
||||||
|
# Convert a single prompt to a list.
|
||||||
|
data_2 = [data_2] # type: ignore[list-item]
|
||||||
|
|
||||||
|
data_2 = [ensure_str(t) for t in data_2]
|
||||||
|
|
||||||
|
if isinstance(data_1, dict) and "content" in data_1:
|
||||||
|
data_1 = data_1.get("content") # type: ignore[assignment]
|
||||||
|
elif isinstance(data_1, str):
|
||||||
|
data_1 = [data_1]
|
||||||
|
|
||||||
|
if isinstance(data_2, dict) and "content" in data_2:
|
||||||
|
data_2 = data_2.get("content") # type: ignore[assignment]
|
||||||
|
elif isinstance(data_2, str):
|
||||||
|
data_2 = [data_2]
|
||||||
|
|
||||||
|
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
|
||||||
|
|
||||||
if self.llm_engine.model_config.is_cross_encoder:
|
if self.llm_engine.model_config.is_cross_encoder:
|
||||||
return self._cross_encoding_score(tokenizer, input_text_1,
|
return self._cross_encoding_score(
|
||||||
input_text_2,
|
tokenizer,
|
||||||
truncate_prompt_tokens, use_tqdm,
|
data_1, # type: ignore[arg-type]
|
||||||
lora_request,
|
data_2, # type: ignore[arg-type]
|
||||||
prompt_adapter_request)
|
truncate_prompt_tokens,
|
||||||
|
use_tqdm,
|
||||||
|
lora_request,
|
||||||
|
prompt_adapter_request)
|
||||||
else:
|
else:
|
||||||
return self._embedding_score(
|
return self._embedding_score(
|
||||||
tokenizer,
|
tokenizer,
|
||||||
input_text_1, # type: ignore[arg-type]
|
data_1, # type: ignore[arg-type]
|
||||||
input_text_2, # type: ignore[arg-type]
|
data_2, # type: ignore[arg-type]
|
||||||
truncate_prompt_tokens,
|
truncate_prompt_tokens,
|
||||||
use_tqdm,
|
use_tqdm,
|
||||||
lora_request,
|
lora_request,
|
||||||
|
|||||||
@ -24,6 +24,7 @@ from typing_extensions import TypeAlias
|
|||||||
from vllm import envs
|
from vllm import envs
|
||||||
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
|
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
|
||||||
random_tool_call_id)
|
random_tool_call_id)
|
||||||
|
from vllm.entrypoints.score_utils import (ScoreContentPartParam,
                                          ScoreMultiModalParam)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
|
from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
|
||||||
@ -1282,8 +1283,8 @@ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
|
|||||||
|
|
||||||
class ScoreRequest(OpenAIBaseModel):
|
class ScoreRequest(OpenAIBaseModel):
|
||||||
model: Optional[str] = None
|
model: Optional[str] = None
|
||||||
text_1: Union[list[str], str]
|
text_1: Union[list[str], str, ScoreMultiModalParam]
|
||||||
text_2: Union[list[str], str]
|
text_2: Union[list[str], str, ScoreMultiModalParam]
|
||||||
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
|
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
|
||||||
|
|
||||||
# --8<-- [start:score-pooling-params]
|
# --8<-- [start:score-pooling-params]
|
||||||
@ -1291,6 +1292,12 @@ class ScoreRequest(OpenAIBaseModel):
|
|||||||
# --8<-- [end:score-pooling-params]
|
# --8<-- [end:score-pooling-params]
|
||||||
|
|
||||||
# --8<-- [start:score-extra-params]
|
# --8<-- [start:score-extra-params]
|
||||||
|
|
||||||
|
mm_processor_kwargs: Optional[dict[str, Any]] = Field(
|
||||||
|
default=None,
|
||||||
|
description=("Additional kwargs to pass to the HF processor."),
|
||||||
|
)
|
||||||
|
|
||||||
priority: int = Field(
|
priority: int = Field(
|
||||||
default=0,
|
default=0,
|
||||||
description=(
|
description=(
|
||||||
@ -1308,8 +1315,8 @@ class ScoreRequest(OpenAIBaseModel):
|
|||||||
|
|
||||||
class RerankRequest(OpenAIBaseModel):
|
class RerankRequest(OpenAIBaseModel):
|
||||||
model: Optional[str] = None
|
model: Optional[str] = None
|
||||||
query: str
|
query: Union[str, ScoreMultiModalParam]
|
||||||
documents: list[str]
|
documents: Union[list[str], ScoreMultiModalParam]
|
||||||
top_n: int = Field(default_factory=lambda: 0)
|
top_n: int = Field(default_factory=lambda: 0)
|
||||||
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
|
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
|
||||||
|
|
||||||
@ -1318,6 +1325,12 @@ class RerankRequest(OpenAIBaseModel):
|
|||||||
# --8<-- [end:rerank-pooling-params]
|
# --8<-- [end:rerank-pooling-params]
|
||||||
|
|
||||||
# --8<-- [start:rerank-extra-params]
|
# --8<-- [start:rerank-extra-params]
|
||||||
|
|
||||||
|
mm_processor_kwargs: Optional[dict[str, Any]] = Field(
|
||||||
|
default=None,
|
||||||
|
description=("Additional kwargs to pass to the HF processor."),
|
||||||
|
)
|
||||||
|
|
||||||
priority: int = Field(
|
priority: int = Field(
|
||||||
default=0,
|
default=0,
|
||||||
description=(
|
description=(
|
||||||
@ -1334,7 +1347,8 @@ class RerankRequest(OpenAIBaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class RerankDocument(BaseModel):
    """Document echoed back inside a single rerank result.

    Both fields are optional; the response builder fills `text` when the
    request documents were plain strings and `multi_modal` otherwise.
    """
    text: Optional[str] = None
    # The rerank response builder passes one element of the request's
    # `documents["content"]` list here, i.e. a single content part rather
    # than the `{"content": [...]}` wrapper, so the field has to accept
    # `ScoreContentPartParam`. `ScoreMultiModalParam` stays in the union so
    # existing callers that pass the wrapper keep validating.
    multi_modal: Optional[Union[ScoreContentPartParam,
                                ScoreMultiModalParam]] = None
|
||||||
|
|
||||||
|
|
||||||
class RerankResult(BaseModel):
|
class RerankResult(BaseModel):
|
||||||
|
|||||||
@ -17,8 +17,11 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument,
|
|||||||
ScoreResponseData, UsageInfo)
|
ScoreResponseData, UsageInfo)
|
||||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.entrypoints.score_utils import (_cosine_similarity,
|
from vllm.entrypoints.score_utils import (ScoreContentPartParam,
|
||||||
_validate_score_input_lens)
|
ScoreMultiModalParam,
|
||||||
|
_cosine_similarity,
|
||||||
|
_validate_score_input_lens,
|
||||||
|
get_score_prompt)
|
||||||
from vllm.entrypoints.utils import _validate_truncation_size
|
from vllm.entrypoints.utils import _validate_truncation_size
|
||||||
from vllm.inputs.data import TokensPrompt
|
from vllm.inputs.data import TokensPrompt
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@ -137,11 +140,34 @@ class ServingScores(OpenAIServing):
|
|||||||
|
|
||||||
return final_res_batch
|
return final_res_batch
|
||||||
|
|
||||||
|
def _preprocess_score(
    self,
    request: Union[RerankRequest, ScoreRequest],
    tokenizer: AnyTokenizer,
    tokenization_kwargs: dict[str, Any],
    data_1: Union[str, ScoreContentPartParam],
    data_2: Union[str, ScoreContentPartParam],
) -> tuple[str, TokensPrompt]:
    """Build the prompt text and engine prompt for one scoring pair.

    Delegates prompt construction to `get_score_prompt` using this
    server's model config, then copies the request's
    `mm_processor_kwargs` onto the engine prompt when they are set.

    Returns:
        The `(full_prompt, engine_prompt)` tuple from `get_score_prompt`.
    """
    prompt_text, tokens_prompt = get_score_prompt(
        model_config=self.model_config,
        data_1=data_1,
        data_2=data_2,
        tokenizer=tokenizer,
        tokenization_kwargs=tokenization_kwargs,
    )

    processor_kwargs = request.mm_processor_kwargs
    if processor_kwargs is not None:
        tokens_prompt["mm_processor_kwargs"] = processor_kwargs

    return prompt_text, tokens_prompt
|
||||||
|
|
||||||
async def _cross_encoding_score(
|
async def _cross_encoding_score(
|
||||||
self,
|
self,
|
||||||
tokenizer: AnyTokenizer,
|
tokenizer: AnyTokenizer,
|
||||||
texts_1: list[str],
|
data_1: Union[list[str], list[ScoreContentPartParam]],
|
||||||
texts_2: list[str],
|
data_2: Union[list[str], list[ScoreContentPartParam]],
|
||||||
request: Union[RerankRequest, ScoreRequest],
|
request: Union[RerankRequest, ScoreRequest],
|
||||||
request_id=str,
|
request_id=str,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
@ -154,46 +180,67 @@ class ServingScores(OpenAIServing):
|
|||||||
request_prompts: list[str] = []
|
request_prompts: list[str] = []
|
||||||
engine_prompts: list[TokensPrompt] = []
|
engine_prompts: list[TokensPrompt] = []
|
||||||
|
|
||||||
if len(texts_1) == 1:
|
if len(data_1) == 1:
|
||||||
texts_1 = texts_1 * len(texts_2)
|
data_1 = data_1 * len(data_2)
|
||||||
|
|
||||||
input_pairs = [(t1, t2) for t1, t2 in zip(texts_1, texts_2)]
|
|
||||||
|
|
||||||
if isinstance(tokenizer, MistralTokenizer):
|
if isinstance(tokenizer, MistralTokenizer):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"MistralTokenizer not supported for cross-encoding")
|
"MistralTokenizer not supported for cross-encoding")
|
||||||
|
|
||||||
tokenize_async = make_async(tokenizer.__call__,
|
|
||||||
executor=self._tokenizer_executor)
|
|
||||||
|
|
||||||
tokenization_kwargs = tokenization_kwargs or {}
|
tokenization_kwargs = tokenization_kwargs or {}
|
||||||
use_pad_token = self.model_config.use_pad_token
|
|
||||||
|
|
||||||
if use_pad_token:
|
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
|
||||||
# cross_encoder models defaults to using pad_token.
|
|
||||||
tokenized_prompts = await asyncio.gather(
|
if self.model_config.is_multimodal_model:
|
||||||
*(tokenize_async(text=t1, text_pair=t2, **tokenization_kwargs)
|
|
||||||
for t1, t2 in input_pairs))
|
preprocess_async = make_async(self._preprocess_score,
|
||||||
|
executor=self._tokenizer_executor)
|
||||||
|
|
||||||
|
preprocessed_prompts = await asyncio.gather(
|
||||||
|
*(preprocess_async(request=request,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
|
data_1=t1,
|
||||||
|
data_2=t2) for t1, t2 in input_pairs))
|
||||||
|
|
||||||
|
for full_prompt, engine_prompt in preprocessed_prompts:
|
||||||
|
request_prompts.append(full_prompt)
|
||||||
|
engine_prompts.append(engine_prompt)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# `llm as reranker` models defaults to not using pad_token.
|
tokenize_async = make_async(tokenizer.__call__,
|
||||||
tokenized_prompts = await asyncio.gather(
|
executor=self._tokenizer_executor)
|
||||||
*(tokenize_async(text=t1 + t2, **tokenization_kwargs)
|
use_pad_token = self.model_config.use_pad_token
|
||||||
for t1, t2 in input_pairs))
|
|
||||||
|
|
||||||
for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs):
|
if use_pad_token:
|
||||||
sep_token = tokenizer.sep_token if (tokenizer.sep_token
|
# cross_encoder models defaults to using pad_token.
|
||||||
and use_pad_token) else ''
|
tokenized_prompts = await asyncio.gather(*(
|
||||||
request_prompt = f"{t1}{sep_token}{t2}"
|
tokenize_async(
|
||||||
|
text=t1, # type: ignore[arg-type]
|
||||||
|
text_pair=t2, # type: ignore[arg-type]
|
||||||
|
**tokenization_kwargs) for t1, t2 in input_pairs))
|
||||||
|
else:
|
||||||
|
# `llm as reranker` models defaults to not using pad_token.
|
||||||
|
tokenized_prompts = await asyncio.gather(*(
|
||||||
|
tokenize_async(
|
||||||
|
text=t1 + # type: ignore[operator]
|
||||||
|
t2,
|
||||||
|
**tokenization_kwargs) for t1, t2 in input_pairs))
|
||||||
|
|
||||||
input_ids = prompt_inputs["input_ids"]
|
for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs):
|
||||||
text_token_prompt = \
|
sep_token = tokenizer.sep_token if (tokenizer.sep_token
|
||||||
self._validate_input(request, input_ids, request_prompt)
|
and use_pad_token) else ''
|
||||||
engine_prompt = TokensPrompt(
|
request_prompt = f"{t1}{sep_token}{t2}"
|
||||||
prompt_token_ids=text_token_prompt["prompt_token_ids"],
|
|
||||||
token_type_ids=prompt_inputs.get("token_type_ids"))
|
|
||||||
|
|
||||||
request_prompts.append(request_prompt)
|
input_ids = prompt_inputs["input_ids"]
|
||||||
engine_prompts.append(engine_prompt)
|
text_token_prompt = \
|
||||||
|
self._validate_input(request, input_ids, request_prompt)
|
||||||
|
engine_prompt = TokensPrompt(
|
||||||
|
prompt_token_ids=text_token_prompt["prompt_token_ids"],
|
||||||
|
token_type_ids=prompt_inputs.get("token_type_ids"))
|
||||||
|
|
||||||
|
request_prompts.append(request_prompt)
|
||||||
|
engine_prompts.append(engine_prompt)
|
||||||
|
|
||||||
# Schedule the request and get the result generator.
|
# Schedule the request and get the result generator.
|
||||||
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
|
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
|
||||||
@ -233,8 +280,8 @@ class ServingScores(OpenAIServing):
|
|||||||
|
|
||||||
async def _run_scoring(
|
async def _run_scoring(
|
||||||
self,
|
self,
|
||||||
texts_1: Union[str, list[str]],
|
data_1: Union[list[str], str, ScoreMultiModalParam],
|
||||||
texts_2: Union[str, list[str]],
|
data_2: Union[list[str], str, ScoreMultiModalParam],
|
||||||
request: Union[ScoreRequest, RerankRequest],
|
request: Union[ScoreRequest, RerankRequest],
|
||||||
request_id: str,
|
request_id: str,
|
||||||
raw_request: Optional[Request] = None,
|
raw_request: Optional[Request] = None,
|
||||||
@ -259,18 +306,29 @@ class ServingScores(OpenAIServing):
|
|||||||
trace_headers = (None if raw_request is None else await
|
trace_headers = (None if raw_request is None else await
|
||||||
self._get_trace_headers(raw_request.headers))
|
self._get_trace_headers(raw_request.headers))
|
||||||
|
|
||||||
if isinstance(texts_1, str):
|
if not self.model_config.is_multimodal_model and (isinstance(
|
||||||
texts_1 = [texts_1]
|
data_1, dict) or isinstance(data_2, dict)):
|
||||||
if isinstance(texts_2, str):
|
raise ValueError(
|
||||||
texts_2 = [texts_2]
|
f"MultiModalParam is not supported for {self.model_config.architecture}" # noqa: E501
|
||||||
|
)
|
||||||
|
|
||||||
_validate_score_input_lens(texts_1, texts_2)
|
if isinstance(data_1, str):
|
||||||
|
data_1 = [data_1]
|
||||||
|
elif isinstance(data_1, dict):
|
||||||
|
data_1 = data_1.get("content") # type: ignore[assignment]
|
||||||
|
|
||||||
|
if isinstance(data_2, str):
|
||||||
|
data_2 = [data_2]
|
||||||
|
elif isinstance(data_2, dict):
|
||||||
|
data_2 = data_2.get("content") # type: ignore[assignment]
|
||||||
|
|
||||||
|
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
|
||||||
|
|
||||||
if self.model_config.is_cross_encoder:
|
if self.model_config.is_cross_encoder:
|
||||||
return await self._cross_encoding_score(
|
return await self._cross_encoding_score(
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
texts_1=texts_1,
|
data_1=data_1, # type: ignore[arg-type]
|
||||||
texts_2=texts_2,
|
data_2=data_2, # type: ignore[arg-type]
|
||||||
request=request,
|
request=request,
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
@ -281,8 +339,8 @@ class ServingScores(OpenAIServing):
|
|||||||
else:
|
else:
|
||||||
return await self._embedding_score(
|
return await self._embedding_score(
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
texts_1=texts_1,
|
texts_1=data_1, # type: ignore[arg-type]
|
||||||
texts_2=texts_2,
|
texts_2=data_2, # type: ignore[arg-type]
|
||||||
request=request,
|
request=request,
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
@ -349,7 +407,9 @@ class ServingScores(OpenAIServing):
|
|||||||
|
|
||||||
request_id = f"rerank-{self._base_request_id(raw_request)}"
|
request_id = f"rerank-{self._base_request_id(raw_request)}"
|
||||||
documents = request.documents
|
documents = request.documents
|
||||||
top_n = request.top_n if request.top_n > 0 else len(documents)
|
top_n = request.top_n if request.top_n > 0 else (
|
||||||
|
len(documents)
|
||||||
|
if isinstance(documents, list) else len(documents["content"]))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
final_res_batch = await self._run_scoring(
|
final_res_batch = await self._run_scoring(
|
||||||
@ -410,7 +470,7 @@ class ServingScores(OpenAIServing):
|
|||||||
|
|
||||||
def request_output_to_rerank_response(
|
def request_output_to_rerank_response(
|
||||||
self, final_res_batch: list[PoolingRequestOutput], request_id: str,
|
self, final_res_batch: list[PoolingRequestOutput], request_id: str,
|
||||||
model_name: str, documents: list[str],
|
model_name: str, documents: Union[list[str], ScoreMultiModalParam],
|
||||||
top_n: int) -> RerankResponse:
|
top_n: int) -> RerankResponse:
|
||||||
"""
|
"""
|
||||||
Convert the output of do_rank to a RerankResponse
|
Convert the output of do_rank to a RerankResponse
|
||||||
@ -422,7 +482,9 @@ class ServingScores(OpenAIServing):
|
|||||||
|
|
||||||
result = RerankResult(
|
result = RerankResult(
|
||||||
index=idx,
|
index=idx,
|
||||||
document=RerankDocument(text=documents[idx]),
|
document=RerankDocument(text=documents[idx]) if isinstance(
|
||||||
|
documents, list) else RerankDocument(
|
||||||
|
multi_modal=documents["content"][idx]),
|
||||||
relevance_score=classify_res.outputs.score,
|
relevance_score=classify_res.outputs.score,
|
||||||
)
|
)
|
||||||
results.append(result)
|
results.append(result)
|
||||||
|
|||||||
@ -1,13 +1,41 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from typing import Union
|
from typing import Any, Optional, Union, cast
|
||||||
|
|
||||||
from torch.nn import CosineSimilarity
|
from torch.nn import CosineSimilarity
|
||||||
|
from typing_extensions import Required, TypeAlias, TypedDict
|
||||||
|
|
||||||
|
from vllm.config import ModelConfig
|
||||||
|
from vllm.entrypoints.chat_utils import (
|
||||||
|
BaseMultiModalItemTracker, ChatCompletionContentPartImageEmbedsParam,
|
||||||
|
ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam,
|
||||||
|
MultiModalItemTracker, _ContentPart, _parse_chat_message_content_part)
|
||||||
|
from vllm.inputs import TokensPrompt
|
||||||
|
from vllm.model_executor.model_loader import get_model_cls
|
||||||
|
from vllm.model_executor.models.interfaces import supports_score_template
|
||||||
|
from vllm.multimodal.inputs import MultiModalDataDict
|
||||||
from vllm.outputs import PoolingRequestOutput
|
from vllm.outputs import PoolingRequestOutput
|
||||||
from vllm.transformers_utils.tokenizer import (PreTrainedTokenizer,
|
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||||
|
PreTrainedTokenizer,
|
||||||
PreTrainedTokenizerFast)
|
PreTrainedTokenizerFast)
|
||||||
|
|
||||||
|
# Content part types accepted inside `ScoreMultiModalParam["content"]`.
ScoreContentPartParam: TypeAlias = Union[
    ChatCompletionContentPartImageParam,
    ChatCompletionContentPartImageEmbedsParam,
]


class ScoreMultiModalParam(TypedDict, total=False):
    """
    A specialized parameter type for scoring multimodal content.

    Why `CustomChatCompletionMessageParam` is not reused directly:
    1. Score tasks don't need the 'role' field (user/assistant/system)
       that is required in chat completions.
    2. Including chat-specific fields would confuse users about their
       purpose in scoring.
    3. This is a more focused interface that only exposes what is needed
       for scoring.
    """
    content: Required[list[ScoreContentPartParam]]
    """The multimodal contents"""
|
||||||
|
|
||||||
|
|
||||||
def _cosine_similarity(
|
def _cosine_similarity(
|
||||||
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
||||||
@ -39,12 +67,128 @@ def _cosine_similarity(
|
|||||||
|
|
||||||
|
|
||||||
def _validate_score_input_lens(
|
def _validate_score_input_lens(
|
||||||
texts_1: Union[list[str], list[dict]],
|
data_1: Union[list[str], list[ScoreContentPartParam]],
|
||||||
texts_2: Union[list[str], list[dict]],
|
data_2: Union[list[str], list[ScoreContentPartParam]],
|
||||||
):
|
):
|
||||||
if len(texts_1) > 1 and len(texts_1) != len(texts_2):
|
len_1 = len(data_1)
|
||||||
|
len_2 = len(data_2)
|
||||||
|
|
||||||
|
if len_1 > 1 and len_1 != len_2:
|
||||||
raise ValueError("Input lengths must be either 1:1, 1:N or N:N")
|
raise ValueError("Input lengths must be either 1:1, 1:N or N:N")
|
||||||
if len(texts_1) == 0:
|
if len_1 == 0:
|
||||||
raise ValueError("At least one text element must be given")
|
raise ValueError("At least one text element must be given")
|
||||||
if len(texts_2) == 0:
|
if len_2 == 0:
|
||||||
raise ValueError("At least one text_pair element must be given")
|
raise ValueError("At least one text_pair element must be given")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_score_data(
    data_1: Union[str, ScoreContentPartParam],
    data_2: Union[str, ScoreContentPartParam],
    model_config: ModelConfig,
    tokenizer: AnyTokenizer,
) -> tuple[str, str, Optional[MultiModalDataDict]]:
    """Parse one score pair into two prompt strings plus multi-modal data.

    Both inputs are parsed against the same `MultiModalItemTracker`, so any
    multi-modal items they contain are collected together.

    Returns:
        `(prompt_1, prompt_2, mm_data)`, where `mm_data` is whatever the
        tracker's `all_mm_data()` returns.

    Raises:
        ValueError: If either parsed content is not a string.
    """
    mm_tracker = MultiModalItemTracker(model_config, tokenizer)

    def ensure_str(content: Optional[_ContentPart]) -> str:
        # `None` and non-string parts are both rejected here.
        if not isinstance(content, str):
            raise ValueError(
                f"Only string content is supported, but got {content}.")
        return content

    # Parse both sides before validating either one.
    parsed_1 = _parse_score_content(data_1, mm_tracker)
    parsed_2 = _parse_score_content(data_2, mm_tracker)

    text_1 = ensure_str(parsed_1)
    text_2 = ensure_str(parsed_2)

    return text_1, text_2, mm_tracker.all_mm_data()
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_score_content(
    data: Union[str, ScoreContentPartParam],
    mm_tracker: BaseMultiModalItemTracker,
) -> Optional[_ContentPart]:
    """Parse a single score input into a content part.

    A plain string is first wrapped as a text content part. The part is
    then run through `_parse_chat_message_content_part`; when that yields
    a truthy result it is returned as-is, otherwise the single placeholder
    recorded by the parser is returned.

    Raises:
        ValueError: If the parser did not record exactly one placeholder
            under exactly one key.
    """
    part = (ChatCompletionContentPartTextParam(type="text", text=data)
            if isinstance(data, str) else data)

    mm_parser = mm_tracker.create_parser()
    parsed = _parse_chat_message_content_part(
        part,
        mm_parser,
        wrap_dicts=False,
        interleave_strings=False,
    )

    # NOTE(review): this is a truthiness test, so a falsy-but-present result
    # (presumably an empty text string) also falls through to the
    # placeholder lookup below -- confirm that is intended.
    if parsed:
        return parsed

    placeholders = mm_parser.mm_placeholder_storage()
    if len(placeholders) == 1:
        items = next(iter(placeholders.values()))
        if len(items) == 1:
            return items[0]

    raise ValueError("Only one multi-modal item is supported")
|
||||||
|
|
||||||
|
|
||||||
|
def apply_score_template(
    model_config: ModelConfig,
    prompt_1: str,
    prompt_2: str,
) -> str:
    """Render the model's score template for a pair of prompts.

    Raises:
        ValueError: If the model class does not support score templates,
            or if its `get_score_template` returns `None`.
    """
    model_cls = get_model_cls(model_config)
    if not supports_score_template(model_cls):
        raise ValueError(
            f"Unsupported model architecture: {model_config.architecture}")

    rendered = model_cls.get_score_template(prompt_1, prompt_2)
    if rendered is None:
        raise ValueError("Get empty score template from model")
    return rendered
|
||||||
|
|
||||||
|
|
||||||
|
def post_process_tokens(
    model_config: ModelConfig,
    prompt: TokensPrompt,
) -> None:
    """
    Perform architecture-specific manipulations on the input tokens.

    Does nothing when the model class does not support score templates.

    Note:
        This is an in-place operation.
    """
    model_cls = get_model_cls(model_config)
    if not supports_score_template(model_cls):
        return
    model_cls.post_process_tokens(prompt)
|
||||||
|
|
||||||
|
|
||||||
|
def get_score_prompt(
    model_config: ModelConfig,
    tokenizer: AnyTokenizer,
    tokenization_kwargs: dict[str, Any],
    data_1: Union[str, ScoreContentPartParam],
    data_2: Union[str, ScoreContentPartParam],
) -> tuple[str, TokensPrompt]:
    """Build the templated prompt and engine prompt for one score pair.

    Steps: parse both inputs, render the model's score template, tokenize
    the rendered prompt, apply the model's token post-processing, and
    attach multi-modal data when the parser collected any.

    Returns:
        `(full_prompt, engine_prompt)`: the rendered prompt string and the
        `TokensPrompt` built from its token ids.
    """
    text_1, text_2, mm_data = parse_score_data(
        data_1=data_1,
        data_2=data_2,
        model_config=model_config,
        tokenizer=tokenizer,
    )
    full_prompt = apply_score_template(model_config, text_1, text_2)

    encoded = tokenizer(full_prompt, **tokenization_kwargs)
    engine_prompt = TokensPrompt(prompt_token_ids=encoded["input_ids"])

    # Mutates `engine_prompt` in place.
    post_process_tokens(model_config, engine_prompt)

    if mm_data is not None:
        engine_prompt["multi_modal_data"] = mm_data

    return full_prompt, engine_prompt
|
||||||
|
|||||||
@ -170,6 +170,15 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
|||||||
vllm_config.model_config.hf_config.method = "from_2_way_softmax"
|
vllm_config.model_config.hf_config.method = "from_2_way_softmax"
|
||||||
|
|
||||||
|
|
||||||
|
class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
    """Config hook that forces the HF config's `num_labels` to 1."""

    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        """Set `num_labels = 1` on the model's HF config, in place."""
        hf_config = vllm_config.model_config.hf_config
        hf_config.num_labels = 1
|
||||||
|
|
||||||
|
|
||||||
class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
|
class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -197,4 +206,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
|
|||||||
"NomicBertModel": NomicBertModelConfig,
|
"NomicBertModel": NomicBertModelConfig,
|
||||||
"Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
|
"Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
|
||||||
"XLMRobertaModel": JinaRobertaModelConfig,
|
"XLMRobertaModel": JinaRobertaModelConfig,
|
||||||
|
"JinaVLForRanking": JinaVLForSequenceClassificationConfig,
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import torch
|
|||||||
from torch import Tensor
|
from torch import Tensor
|
||||||
from typing_extensions import Self, TypeIs
|
from typing_extensions import Self, TypeIs
|
||||||
|
|
||||||
|
from vllm.inputs import TokensPrompt
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.quantization.base_config import (
|
from vllm.model_executor.layers.quantization.base_config import (
|
||||||
QuantizationConfig)
|
QuantizationConfig)
|
||||||
@ -141,6 +142,62 @@ def supports_multimodal(
|
|||||||
return isinstance(model, SupportsMultiModal)
|
return isinstance(model, SupportsMultiModal)
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class SupportsScoreTemplate(Protocol):
    """The interface required for all models that support score template."""

    supports_score_template: ClassVar[Literal[True]] = True
    """
    A flag that indicates this model supports score template.

    Note:
        There is no need to redefine this flag if this class is in the
        MRO of your model class.
    """

    @classmethod
    def get_score_template(cls, query: str, document: str) -> Optional[str]:
        """
        Generate a full prompt by populating the score template with the
        query and document content.
        """
        ...

    @classmethod
    def post_process_tokens(cls, prompt: TokensPrompt) -> None:
        """
        Perform architecture-specific manipulations on the input tokens.
        """
        ...
|
||||||
|
|
||||||
|
|
||||||
|
# We can't use runtime_checkable with ClassVar for issubclass checks
# so we need to treat the class as an instance and use isinstance instead
@runtime_checkable
class _SupportsScoreTemplateType(Protocol):
    """Structural stand-in used to test a model *class* with `isinstance`."""

    supports_score_template: Literal[True]
|
||||||
|
|
||||||
|
|
||||||
|
@overload
def supports_score_template(
        model: type[object]) -> TypeIs[type[SupportsScoreTemplate]]:
    ...


@overload
def supports_score_template(model: object) -> TypeIs[SupportsScoreTemplate]:
    ...


def supports_score_template(
    model: Union[type[object], object],
) -> Union[TypeIs[type[SupportsScoreTemplate]], TypeIs[SupportsScoreTemplate]]:
    """Return whether a model class or instance supports score templates.

    A class is checked against `_SupportsScoreTemplateType`; anything else
    is checked against `SupportsScoreTemplate`.
    """
    protocol = (_SupportsScoreTemplateType
                if isinstance(model, type) else SupportsScoreTemplate)
    return isinstance(model, protocol)
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
class SupportsLoRA(Protocol):
|
class SupportsLoRA(Protocol):
|
||||||
"""The interface required for all models that support LoRA."""
|
"""The interface required for all models that support LoRA."""
|
||||||
|
|||||||
150
vllm/model_executor/models/jina_vl.py
Normal file
150
vllm/model_executor/models/jina_vl.py
Normal file
@ -0,0 +1,150 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from collections.abc import Iterable, Mapping
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from transformers import BatchFeature, PretrainedConfig
|
||||||
|
|
||||||
|
from vllm.config import VllmConfig
|
||||||
|
from vllm.inputs import TokensPrompt
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||||
|
RowParallelLinear)
|
||||||
|
from vllm.model_executor.layers.pooler import Pooler, PoolingType
|
||||||
|
from vllm.model_executor.pooling_metadata import PoolingMetadata
|
||||||
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
from vllm.sequence import IntermediateTensors, PoolerOutput
|
||||||
|
|
||||||
|
from .interfaces import (SupportsCrossEncoding, SupportsMultiModal,
|
||||||
|
SupportsScoreTemplate)
|
||||||
|
from .qwen2_vl import (Qwen2VLDummyInputsBuilder,
|
||||||
|
Qwen2VLForConditionalGeneration,
|
||||||
|
Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo)
|
||||||
|
from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class JinaVLScorer(nn.Module):
    """Scoring head: ``dense`` projection, ReLU, then ``out_proj``."""

    def __init__(self, config: PretrainedConfig):
        """Build the two linear layers from ``config``.

        Args:
            config: Supplies ``hidden_size`` (input/output width of
                ``dense`` and input width of ``out_proj``) and
                ``num_labels`` (output width of ``out_proj``).
        """
        super().__init__()
        hidden_size = config.hidden_size
        self.dense = ColumnParallelLinear(hidden_size, hidden_size, bias=True)
        self.out_proj = RowParallelLinear(hidden_size,
                                          config.num_labels,
                                          bias=True)

    def forward(self, x, **kwargs):
        """Apply ``dense`` -> ReLU -> ``out_proj`` to ``x``.

        Extra keyword arguments are accepted and ignored.
        """
        # Both layers return a tuple; only the first element is used.
        projected, _ = self.dense(x)
        activated = torch.relu(projected)
        scores, _ = self.out_proj(activated)
        return scores
|
||||||
|
|
||||||
|
|
||||||
|
class JinaVLMultiModalProcessor(Qwen2VLMultiModalProcessor):
    """Qwen2-VL multimodal processor that hands items to HF in reverse."""

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        """Call the parent HF processor with each modality's items reversed.

        Args:
            prompt: Prompt text forwarded unchanged to the parent.
            mm_data: Mapping from a modality key to its sequence of items.
                It is read only; neither the mapping nor its sequences
                are modified.
            mm_kwargs: Forwarded unchanged to the parent.
            tok_kwargs: Forwarded unchanged to the parent.

        Returns:
            Whatever the parent ``_call_hf_processor`` returns.

        Raises:
            TypeError: If a value in ``mm_data`` cannot be reversed.
        """
        # NOTE: We should reverse the order of the mm_data because the
        # query prompt is placed after the document prompt in the score
        # template for JinaVLForRanking model, but in mm_data they are
        # stored in the opposite order (query first, then document).
        #
        # Reversed *copies* are used instead of `list.reverse()` so the
        # caller's data is left untouched; mutating it in place would flip
        # the order back if the same data were processed a second time.
        reversed_mm_data = {
            modality: list(reversed(items))
            for modality, items in mm_data.items()
        }
        return super()._call_hf_processor(prompt, reversed_mm_data,
                                          mm_kwargs, tok_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@MULTIMODAL_REGISTRY.register_processor(JinaVLMultiModalProcessor,
                                        info=Qwen2VLProcessingInfo,
                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration,
                                      SupportsCrossEncoding,
                                      SupportsMultiModal,
                                      SupportsScoreTemplate):
    """Qwen2-VL based cross-encoder that scores (query, document) pairs.

    The Qwen2-VL forward pass is followed by a ``JinaVLScorer`` head, and
    the result is pooled with a last-token pooler.
    """

    # Checkpoint prefix -> module prefix used by this class.
    weight_mapper = WeightsMapper(
        orig_to_new_prefix={
            "score.0.": "score.dense.",
            "score.2.": "score.out_proj.",
            # mapping for new names in checkpoint saved after transformers v4.52
            "model.language_model.": "language_model.model.",
            "visual.": "visual.",
            # mapping for original checkpoint
            "lm_head.": "language_model.lm_head.",
            "model.": "language_model.model.",
        })

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the Qwen2-VL backbone, the scoring head and the pooler."""
        super().__init__(vllm_config=vllm_config,
                         prefix=maybe_prefix(prefix, "qwen2_vl"))
        model_config = vllm_config.model_config
        hf_config = model_config.hf_config
        pooler_cfg = model_config.pooler_config

        # logit bias for sigmoid normalization
        self.LOGIT_BIAS = 2.65

        self.score = JinaVLScorer(hf_config)

        self._pooler = Pooler.from_config_with_defaults(
            pooler_cfg,
            pooling_type=PoolingType.LAST,
            normalize=False,
            softmax=True)

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        """Return the prompt placeholder for an image item.

        Raises:
            ValueError: If ``modality`` does not start with ``"image"``.
        """
        if not modality.startswith("image"):
            raise ValueError("Only image modality is supported")

        return "<|vision_start|><|image_pad|><|vision_end|>"

    @classmethod
    def get_score_template(cls, query: str, document: str) -> Optional[str]:
        """Format a scoring prompt with the document first, then the query."""
        return f"**Document**:\n{document}\n**Query**:\n{query}"

    @classmethod
    def post_process_tokens(cls, prompt: TokensPrompt) -> None:
        """Append the score target token to ``prompt`` in place."""
        # add score target token at the end of prompt tokens
        score_target_token_id = 100
        prompt['prompt_token_ids'].append(score_target_token_id)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> torch.Tensor:
        """Run the backbone, apply the scoring head, subtract the bias.

        All arguments are forwarded unchanged to the parent ``forward``.
        """
        backbone_output = super().forward(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

        return self.score(backbone_output) - self.LOGIT_BIAS

    def pooler(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> Optional[PoolerOutput]:
        """Delegate pooling to the pooler built in ``__init__``."""
        return self._pooler(hidden_states, pooling_metadata)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        """Load ``weights``, renaming prefixes via ``weight_mapper``."""
        return AutoWeightsLoader(self).load_weights(
            weights, mapper=self.weight_mapper)
|
||||||
@ -181,6 +181,7 @@ _CROSS_ENCODER_MODELS = {
|
|||||||
"GemmaForSequenceClassification": ("gemma", "GemmaForSequenceClassification"), # noqa: E501
|
"GemmaForSequenceClassification": ("gemma", "GemmaForSequenceClassification"), # noqa: E501
|
||||||
"Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForSequenceClassification"), # noqa: E501
|
"Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForSequenceClassification"), # noqa: E501
|
||||||
"Qwen3ForSequenceClassification": ("qwen3", "Qwen3ForSequenceClassification"), # noqa: E501
|
"Qwen3ForSequenceClassification": ("qwen3", "Qwen3ForSequenceClassification"), # noqa: E501
|
||||||
|
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501
|
||||||
}
|
}
|
||||||
|
|
||||||
_MULTIMODAL_MODELS = {
|
_MULTIMODAL_MODELS = {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user