[FEATURE] Enables offline /score for embedding models (#12021)
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
parent 23a7cbc88b
commit 0f465ab533
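This change lets an offline `LLM` loaded with `--task embed` serve score requests by embedding both texts and taking their cosine similarity, instead of requiring a cross-encoder. A minimal sketch of the new usage, assuming a local copy of the embedding checkpoint exercised by the tests below; the exact output field access is an assumption about `ScoringRequestOutput`, not something this diff shows:

    # Offline scoring with an embedding model (sketch; model name from the tests).
    from vllm import LLM

    llm = LLM(model="sentence-transformers/all-MiniLM-L12-v2", task="embed")
    outputs = llm.score("What is the capital of France?",
                        ["The capital of France is Paris.",
                         "The capital of Germany is Berlin."])
    # Each item is a ScoringRequestOutput; .outputs.score is assumed here.
    print([out.outputs.score for out in outputs])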
tests/models/embedding/language/test_scoring.py

@@ -5,12 +5,18 @@ Run `pytest tests/models/embedding/language/test_scoring.py`.
 import math
 
 import pytest
+import torch
+import torch.nn.functional as F
 
 MODELS = [
     "cross-encoder/ms-marco-MiniLM-L-6-v2",  # Bert
     "BAAI/bge-reranker-v2-m3",  # Roberta
 ]
 
+EMBEDDING_MODELS = [
+    "sentence-transformers/all-MiniLM-L12-v2",
+]
+
 TEXTS_1 = [
     "What is the capital of France?",
     "What is the capital of Germany?",
@@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
 
     assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
     assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+
+
+@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
+def emb_model_name(request):
+    yield request.param
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pair = [TEXTS_1[0], TEXTS_2[0]]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = hf_model.encode(text_pair)
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
+
+    assert len(vllm_outputs) == 1
+    assert len(hf_outputs) == 1
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[0], TEXTS_2[1]],
+    ]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, pair), dim=0)
+            for pair in hf_embeddings
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
+
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[1], TEXTS_2[1]],
+    ]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, pair), dim=0)
+            for pair in hf_embeddings
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
+
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
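The embedding tests above validate vLLM's scores against a plain sentence-transformers baseline. A standalone sketch of that reference computation, assuming sentence-transformers is installed (model name and similarity call taken from the test file):

    import torch
    import torch.nn.functional as F
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
    emb = model.encode(["What is the capital of France?",
                        "The capital of France is Paris."])
    # The embeddings are 1-D, so cosine similarity over dim=0 yields a scalar.
    ref = F.cosine_similarity(torch.tensor(emb[0]), torch.tensor(emb[1]), dim=0)
    print(float(ref))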
vllm/entrypoints/llm.py

@@ -5,6 +5,7 @@ from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence,
                     Tuple, Type, Union, cast, overload)
 
 import cloudpickle
+import torch
 import torch.nn as nn
 from tqdm import tqdm
 from typing_extensions import TypeVar, deprecated
@@ -996,6 +997,107 @@ class LLM:
 
         return [ClassificationRequestOutput.from_base(item) for item in items]
 
+    def _embedding_score(
+        self,
+        tokenizer: AnyTokenizer,
+        text_1: List[Union[str, TextPrompt, TokensPrompt]],
+        text_2: List[Union[str, TextPrompt, TokensPrompt]],
+        truncate_prompt_tokens: Optional[int] = None,
+        use_tqdm: bool = True,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> List[ScoringRequestOutput]:
+
+        encoded_output = self.encode(
+            text_1 + text_2,
+            use_tqdm=use_tqdm,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request)
+        encoded_output_1 = encoded_output[0:len(text_1)]
+        encoded_output_2 = encoded_output[len(text_1):]
+
+        if len(encoded_output_1) == 1:
+            encoded_output_1 = encoded_output_1 * len(encoded_output_2)
+
+        output_pairs = [(t1, t2)
+                        for t1, t2 in zip(encoded_output_1, encoded_output_2)]
+
+        scores = []
+        scorer = torch.nn.CosineSimilarity(0)
+
+        for embed_1, embed_2 in output_pairs:
+            pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data)
+
+            if (pad_token_id := getattr(tokenizer, "pad_token_id",
+                                        None)) is not None:
+                tokens = embed_1.prompt_token_ids + [
+                    pad_token_id
+                ] + embed_2.prompt_token_ids
+            else:
+                tokens = embed_1.prompt_token_ids + embed_2.prompt_token_ids
+
+            scores.append(
+                PoolingRequestOutput(
+                    request_id=f"{embed_1.request_id}_{embed_2.request_id}",
+                    outputs=pair_score,
+                    prompt_token_ids=tokens,
+                    finished=True))
+
+        items = self.engine_class.validate_outputs(scores,
+                                                   PoolingRequestOutput)
+        return [ScoringRequestOutput.from_base(item) for item in items]
+
+    def _cross_encoding_score(
+        self,
+        tokenizer: AnyTokenizer,
+        text_1: List[Union[str, TextPrompt, TokensPrompt]],
+        text_2: List[Union[str, TextPrompt, TokensPrompt]],
+        truncate_prompt_tokens: Optional[int] = None,
+        use_tqdm: bool = True,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> List[ScoringRequestOutput]:
+
+        if isinstance(tokenizer, MistralTokenizer):
+            raise ValueError(
+                "Score API is only enabled for `--task embed or --task score`")
+
+        if len(text_1) == 1:
+            text_1 = text_1 * len(text_2)
+
+        input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
+
+        pooling_params = PoolingParams()
+
+        tokenization_kwargs: Dict[str, Any] = {}
+        if truncate_prompt_tokens is not None:
+            tokenization_kwargs["truncation"] = True
+            tokenization_kwargs["max_length"] = truncate_prompt_tokens
+
+        parsed_prompts = []
+
+        for q, t in input_pairs:
+            prompt_inputs = tokenizer(text=q,
+                                      text_pair=t,
+                                      **tokenization_kwargs)
+            engine_prompt = TokensPrompt(
+                prompt_token_ids=prompt_inputs["input_ids"],
+                token_type_ids=prompt_inputs.get("token_type_ids"))
+            parsed_prompts.append(engine_prompt)
+
+        self._validate_and_add_requests(
+            prompts=parsed_prompts,
+            params=pooling_params,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+        outputs = self._run_engine(use_tqdm=use_tqdm)
+        items = self.engine_class.validate_outputs(outputs,
+                                                   PoolingRequestOutput)
+
+        return [ScoringRequestOutput.from_base(item) for item in items]
+
     def score(
         self,
         text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]],
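`_embedding_score` above embeds `text_1 + text_2` in a single `encode` call, splits the results, and broadcasts a lone left-hand text across all right-hand texts for the 1-to-N case. A minimal standalone sketch of that pairing logic, with plain tensors standing in for the engine's pooled outputs (`pair_and_score` is a hypothetical helper, not part of the diff):

    import torch

    def pair_and_score(embeds_1, embeds_2):
        # Mirror the 1-to-N broadcast in _embedding_score: repeat the single
        # query embedding so it pairs with every document embedding.
        if len(embeds_1) == 1:
            embeds_1 = embeds_1 * len(embeds_2)
        scorer = torch.nn.CosineSimilarity(dim=0)
        return [scorer(e1, e2) for e1, e2 in zip(embeds_1, embeds_2)]

    q, d1, d2 = torch.randn(384), torch.randn(384), torch.randn(384)
    print(pair_and_score([q], [d1, d2]))  # one query scored against two docs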
@@ -1047,25 +1149,20 @@ class LLM:
 
             raise ValueError(" ".join(messages))
 
-        if not self.llm_engine.model_config.is_cross_encoder:
-            raise ValueError("Your model does not support cross encoding")
-        if self.llm_engine.model_config.task != "score":
-            raise ValueError("Score API is only enabled for `--task score`")
-
-        tokenizer = self.llm_engine.get_tokenizer()
-
-        if isinstance(tokenizer, MistralTokenizer):
+        if self.llm_engine.model_config.task not in ("embed", "score"):
             raise ValueError(
-                "MistralTokenizer not supported for cross-encoding")
+                "Score API is only enabled for `--task embed or --task score`")
 
         # the tokenizer for models such as
         # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
         # lists of tokens to the `text` and `text_pair` kwargs
+        tokenizer = self.llm_engine.get_tokenizer()
 
         def ensure_str(prompt: SingletonPrompt):
             if isinstance(prompt, dict):
                 if "multi_modal_data" in prompt:
                     raise ValueError("Multi-modal prompt is not "
-                                     "supported for cross encoding")
+                                     "supported for scoring")
                 elif "prompt_token_ids" in prompt:
                     prompt = tokenizer.decode(
                         cast(TokensPrompt, prompt)["prompt_token_ids"])
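The tightened check above replaces the old cross-encoder-only validation: any task other than `embed` or `score` is now rejected before tokenization. A hypothetical illustration (the generation model name is an assumption, not from this diff):

    from vllm import LLM

    gen_llm = LLM(model="facebook/opt-125m", task="generate")
    gen_llm.score("a query", "a document")
    # -> ValueError: Score API is only enabled for `--task embed or --task score`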
@@ -1091,40 +1188,15 @@ class LLM:
         if len(text_2) == 0:
             raise ValueError("At least one text_pair element must be given")
 
-        if len(text_1) == 1:
-            text_1 = text_1 * len(text_2)
-
-        input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
-        pooling_params = PoolingParams()
-
-        tokenization_kwargs: Dict[str, Any] = {}
-        if truncate_prompt_tokens is not None:
-            tokenization_kwargs["truncation"] = True
-            tokenization_kwargs["max_length"] = truncate_prompt_tokens
-
-        parsed_prompts = []
-
-        for q, t in input_pairs:
-            prompt_inputs = tokenizer(text=q,
-                                      text_pair=t,
-                                      **tokenization_kwargs)
-            engine_prompt = TokensPrompt(
-                prompt_token_ids=prompt_inputs["input_ids"],
-                token_type_ids=prompt_inputs.get("token_type_ids"))
-            parsed_prompts.append(engine_prompt)
-
-        self._validate_and_add_requests(
-            prompts=parsed_prompts,
-            params=pooling_params,
-            lora_request=lora_request,
-            prompt_adapter_request=prompt_adapter_request,
-        )
-
-        outputs = self._run_engine(use_tqdm=use_tqdm)
-        items = self.engine_class.validate_outputs(outputs,
-                                                   PoolingRequestOutput)
-
-        return [ScoringRequestOutput.from_base(item) for item in items]
+        if self.llm_engine.model_config.is_cross_encoder:
+            return self._cross_encoding_score(tokenizer, text_1, text_2,
+                                              truncate_prompt_tokens, use_tqdm,
+                                              lora_request,
+                                              prompt_adapter_request)
+        else:
+            return self._embedding_score(tokenizer, text_1, text_2,
+                                         truncate_prompt_tokens, use_tqdm,
+                                         lora_request, prompt_adapter_request)
 
     def start_profile(self) -> None:
         self.llm_engine.start_profile()
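With the dispatch above, one offline `score()` call covers both model families: cross-encoders score each (query, document) pair jointly, while embedding models fall back to `_embedding_score`'s cosine similarity. A sketch using the two checkpoints exercised by the tests (treat the exact constructor flags as assumptions):

    from vllm import LLM

    # Cross-encoder path: the pair is scored jointly by the model.
    reranker = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
    ce = reranker.score("What is the capital of France?",
                        ["The capital of France is Paris."])

    # Embedding path: texts are embedded separately, then cosine-compared.
    embedder = LLM(model="sentence-transformers/all-MiniLM-L12-v2",
                   task="embed")
    emb = embedder.score("What is the capital of France?",
                         ["The capital of France is Paris."])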