mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-28 11:17:05 +08:00
Merge branch 'main' into feat/vanisimov/kv_cache_groups_optimization
This commit is contained in:
commit
437ac4e047
@ -2,7 +2,7 @@
|
|||||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install lm-eval==0.4.9
|
# pip install "lm-eval[api]>=0.4.9.2"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
# We can use this script to compute baseline accuracy on GSM for transformers.
|
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
# pip install "lm-eval[api]>=0.4.9.2"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
# We use this for fp8, which HF does not support.
|
# We use this for fp8, which HF does not support.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
# pip install "lm-eval[api]>=0.4.9.2"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
# We use this for fp8, which HF does not support.
|
# We use this for fp8, which HF does not support.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
# pip install "lm-eval[api]>=0.4.9.2"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
|||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
|
||||||
|
|||||||
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
|||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
|
||||||
|
|||||||
@ -24,6 +24,8 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
#ifndef VLLM_NUMA_DISABLED
|
#ifndef VLLM_NUMA_DISABLED
|
||||||
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||||
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
|
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
|
||||||
|
TORCH_CHECK(omp_cpu_mask != nullptr,
|
||||||
|
"Failed to parse CPU string: " + cpu_ids);
|
||||||
TORCH_CHECK(omp_cpu_mask->size > 0);
|
TORCH_CHECK(omp_cpu_mask->size > 0);
|
||||||
std::vector<int> omp_cpu_ids;
|
std::vector<int> omp_cpu_ids;
|
||||||
omp_cpu_ids.reserve(omp_cpu_mask->size);
|
omp_cpu_ids.reserve(omp_cpu_mask->size);
|
||||||
@ -44,20 +46,12 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
|
|
||||||
// Memory node binding
|
// Memory node binding
|
||||||
if (numa_available() != -1) {
|
if (numa_available() != -1) {
|
||||||
int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
|
|
||||||
std::set<int> node_ids;
|
std::set<int> node_ids;
|
||||||
for (const auto& cpu_id : omp_cpu_ids) {
|
for (const auto& cpu_id : omp_cpu_ids) {
|
||||||
int node_id = numa_node_of_cpu(cpu_id);
|
int node_id = numa_node_of_cpu(cpu_id);
|
||||||
if (node_id != -1) {
|
if (node_id != -1) {
|
||||||
node_ids.insert(node_id);
|
node_ids.insert(node_id);
|
||||||
}
|
}
|
||||||
if (node_id != mem_node_id) {
|
|
||||||
TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
|
|
||||||
omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
|
|
||||||
". All CPUs should be on the same NUMA node for optimal "
|
|
||||||
"performance. Memory will be bound to NUMA node ",
|
|
||||||
mem_node_id, ".");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// Concatenate all node_ids into a single comma-separated string
|
// Concatenate all node_ids into a single comma-separated string
|
||||||
if (!node_ids.empty()) {
|
if (!node_ids.empty()) {
|
||||||
@ -70,7 +64,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
|
bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
|
||||||
bitmask* src_mask = numa_get_membind();
|
bitmask* src_mask = numa_get_mems_allowed();
|
||||||
|
|
||||||
int pid = getpid();
|
int pid = getpid();
|
||||||
|
|
||||||
@ -83,15 +77,46 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
std::to_string(errno));
|
std::to_string(errno));
|
||||||
}
|
}
|
||||||
|
|
||||||
// restrict memory allocation node.
|
// Restrict memory allocation to the selected NUMA node(s).
|
||||||
numa_set_membind(mask);
|
// Enhances memory locality for the threads bound to those NUMA CPUs.
|
||||||
|
if (node_ids.size() > 1) {
|
||||||
|
errno = 0;
|
||||||
|
numa_set_interleave_mask(mask);
|
||||||
|
if (errno != 0) {
|
||||||
|
TORCH_WARN("numa_set_interleave_mask failed. errno: " +
|
||||||
|
std::to_string(errno));
|
||||||
|
} else {
|
||||||
|
TORCH_WARN(
|
||||||
|
"NUMA binding: Using INTERLEAVE policy for memory "
|
||||||
|
"allocation across multiple NUMA nodes (nodes: " +
|
||||||
|
node_ids_str +
|
||||||
|
"). Memory allocations will be "
|
||||||
|
"interleaved across the specified NUMA nodes.");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
errno = 0;
|
||||||
|
numa_set_membind(mask);
|
||||||
|
if (errno != 0) {
|
||||||
|
TORCH_WARN("numa_set_membind failed. errno: " +
|
||||||
|
std::to_string(errno));
|
||||||
|
} else {
|
||||||
|
TORCH_WARN(
|
||||||
|
"NUMA binding: Using MEMBIND policy for memory "
|
||||||
|
"allocation on the NUMA nodes (" +
|
||||||
|
node_ids_str +
|
||||||
|
"). Memory allocations will be "
|
||||||
|
"strictly bound to these NUMA nodes.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
numa_set_strict(1);
|
numa_set_strict(1);
|
||||||
|
|
||||||
numa_free_nodemask(mask);
|
numa_free_nodemask(mask);
|
||||||
numa_free_nodemask(src_mask);
|
numa_free_nodemask(src_mask);
|
||||||
} else {
|
} else {
|
||||||
TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " +
|
TORCH_WARN(
|
||||||
std::to_string(errno));
|
"numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
|
||||||
|
std::to_string(errno));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio
|
|||||||
Install `vllm` and `lm-evaluation-harness` for evaluation:
|
Install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
pip install vllm "lm-eval[api]>=0.4.9.2"
|
||||||
```
|
```
|
||||||
|
|
||||||
Load and run the model in `vllm`:
|
Load and run the model in `vllm`:
|
||||||
|
|||||||
@ -18,7 +18,7 @@ pip install llmcompressor
|
|||||||
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
pip install vllm "lm-eval[api]>=0.4.9.2"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Quantization Process
|
## Quantization Process
|
||||||
|
|||||||
@ -23,7 +23,7 @@ pip install llmcompressor
|
|||||||
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
pip install vllm "lm-eval[api]>=0.4.9.2"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Quantization Process
|
## Quantization Process
|
||||||
|
|||||||
@ -20,7 +20,7 @@ for more installation details.
|
|||||||
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
pip install vllm "lm-eval[api]>=0.4.9.2"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Quantization Process
|
## Quantization Process
|
||||||
|
|||||||
@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.5 # required for voxtral test
|
|||||||
num2words # required for smolvlm test
|
num2words # required for smolvlm test
|
||||||
opencv-python-headless >= 4.11.0 # required for video test
|
opencv-python-headless >= 4.11.0 # required for video test
|
||||||
datamodel_code_generator # required for minicpm3 test
|
datamodel_code_generator # required for minicpm3 test
|
||||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
|
lm-eval[api]>=0.4.9.2 # required for model evaluation test
|
||||||
mteb>=1.38.11, <2 # required for mteb test
|
mteb>=1.38.11, <2 # required for mteb test
|
||||||
transformers==4.57.3
|
transformers==4.57.3
|
||||||
tokenizers==0.22.0
|
tokenizers==0.22.0
|
||||||
|
|||||||
@ -58,7 +58,7 @@ schemathesis==3.39.15
|
|||||||
# OpenAI schema test
|
# OpenAI schema test
|
||||||
|
|
||||||
# Evaluation and benchmarking
|
# Evaluation and benchmarking
|
||||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
|
lm-eval[api]>=0.4.9.2
|
||||||
jiwer==4.0.0
|
jiwer==4.0.0
|
||||||
|
|
||||||
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
|
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
|
||||||
|
|||||||
@ -34,8 +34,7 @@ num2words # required for smolvlm test
|
|||||||
open_clip_torch==2.32.0 # Required for nemotron_vl test
|
open_clip_torch==2.32.0 # Required for nemotron_vl test
|
||||||
opencv-python-headless >= 4.11.0 # required for video test
|
opencv-python-headless >= 4.11.0 # required for video test
|
||||||
datamodel_code_generator # required for minicpm3 test
|
datamodel_code_generator # required for minicpm3 test
|
||||||
# TODO: Use lm-eval[api]==0.4.10 once released
|
lm-eval[api]>=0.4.9.2 # required for model evaluation test
|
||||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
|
|
||||||
mteb[bm25s]>=2, <3 # required for mteb test
|
mteb[bm25s]>=2, <3 # required for mteb test
|
||||||
transformers==4.57.3
|
transformers==4.57.3
|
||||||
tokenizers==0.22.0
|
tokenizers==0.22.0
|
||||||
|
|||||||
@ -441,7 +441,7 @@ lightning-utilities==0.14.3
|
|||||||
# torchmetrics
|
# torchmetrics
|
||||||
llvmlite==0.44.0
|
llvmlite==0.44.0
|
||||||
# via numba
|
# via numba
|
||||||
lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
|
lm-eval==0.4.9.2
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
lxml==5.3.0
|
lxml==5.3.0
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -410,7 +410,7 @@ class HfRunner:
|
|||||||
|
|
||||||
# don't put this import at the top level
|
# don't put this import at the top level
|
||||||
# it will call torch.cuda.device_count()
|
# it will call torch.cuda.device_count()
|
||||||
from transformers import AutoProcessor # noqa: F401
|
from transformers import AutoProcessor
|
||||||
|
|
||||||
self.processor = AutoProcessor.from_pretrained(
|
self.processor = AutoProcessor.from_pretrained(
|
||||||
model_name,
|
model_name,
|
||||||
|
|||||||
@ -15,7 +15,7 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server(): # noqa: F811
|
def server():
|
||||||
args = [
|
args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
|
|||||||
@ -28,7 +28,7 @@ def zephyr_lora_files():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server(zephyr_lora_files): # noqa: F811
|
def server(zephyr_lora_files):
|
||||||
args = [
|
args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
|
|||||||
@ -12,7 +12,7 @@ MODEL_NAME = "Qwen/QwQ-32B"
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server(): # noqa: F811
|
def server():
|
||||||
args = [
|
args = [
|
||||||
"--max-model-len",
|
"--max-model-len",
|
||||||
"8192",
|
"8192",
|
||||||
|
|||||||
@ -125,7 +125,7 @@ messages = [
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server(): # noqa: F811
|
def server():
|
||||||
args = [
|
args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
@ -212,7 +212,7 @@ async def test_function_tool_use(
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def k2_server(): # noqa: F811
|
def k2_server():
|
||||||
args = [
|
args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
|
|||||||
@ -23,7 +23,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def multimodal_server(): # noqa: F811
|
def multimodal_server():
|
||||||
args = [
|
args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
|
|||||||
@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def chat_server_with_force_include_usage(request): # noqa: F811
|
def chat_server_with_force_include_usage(request):
|
||||||
args = [
|
args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
|
|||||||
@ -11,7 +11,7 @@ MODEL_NAME = "Qwen/Qwen3-0.6B"
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server(): # noqa: F811
|
def server():
|
||||||
args = [
|
args = [
|
||||||
"--max-model-len",
|
"--max-model-len",
|
||||||
"2048",
|
"2048",
|
||||||
|
|||||||
@ -37,7 +37,7 @@ def default_server_args(qwen3_lora_files):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server_fixture(request, default_server_args): # noqa: F811
|
def server_fixture(request, default_server_args):
|
||||||
use_server_flag = request.param
|
use_server_flag = request.param
|
||||||
if use_server_flag:
|
if use_server_flag:
|
||||||
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
|
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import os
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.models.language.pooling_mteb_test.mteb_utils import (
|
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
|
||||||
MTEB_EMBED_TASKS,
|
MTEB_EMBED_TASKS,
|
||||||
MTEB_EMBED_TOL,
|
MTEB_EMBED_TOL,
|
||||||
OpenAIClientMtebEncoder,
|
OpenAIClientMtebEncoder,
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import os
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.models.language.pooling_mteb_test.mteb_utils import (
|
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
|
||||||
MTEB_RERANK_LANGS,
|
MTEB_RERANK_LANGS,
|
||||||
MTEB_RERANK_TASKS,
|
MTEB_RERANK_TASKS,
|
||||||
MTEB_RERANK_TOL,
|
MTEB_RERANK_TOL,
|
||||||
|
|||||||
@ -202,11 +202,10 @@ class TestGetScorePrompt:
|
|||||||
tokenization_kwargs,
|
tokenization_kwargs,
|
||||||
mock_model_no_score_template,
|
mock_model_no_score_template,
|
||||||
):
|
):
|
||||||
# FIXME: Models implementing SupportsScoreTemplate must use their custom
|
# FIXME: For now, we only apply a template when one is explicitly provided.
|
||||||
# template implementation by default to preserve existing functionality.
|
# We cannot rely on the tokenizer's chat template because many models
|
||||||
# Attempting to use tokenizer_config.json templates would most likely break
|
# inherit junk templates from their base LLM, which breaks both the models
|
||||||
# these models, as often they just inherit the template from the original LLM.
|
# and the tests that use them.
|
||||||
# CLI --chat-template overrides are still supported.
|
|
||||||
with (
|
with (
|
||||||
patch(
|
patch(
|
||||||
"vllm.model_executor.model_loader.get_model_cls",
|
"vllm.model_executor.model_loader.get_model_cls",
|
||||||
|
|||||||
228
tests/models/language/pooling_mteb_test/mteb_embed_utils.py
Normal file
228
tests/models/language/pooling_mteb_test/mteb_embed_utils.py
Normal file
@ -0,0 +1,228 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import mteb
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from mteb.models import ModelMeta
|
||||||
|
from mteb.types import Array
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
|
import tests.ci_envs as ci_envs
|
||||||
|
from tests.models.utils import (
|
||||||
|
EmbedModelInfo,
|
||||||
|
check_embeddings_close,
|
||||||
|
get_vllm_extra_kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Most embedding models on the STS12 task (See #17175):
|
||||||
|
# - Model implementation and minor changes in tensor dtype
|
||||||
|
# results in differences less than 1e-4
|
||||||
|
# - Different model results in differences more than 1e-3
|
||||||
|
# 1e-4 is a good tolerance threshold
|
||||||
|
MTEB_EMBED_TASKS = ["STS12"]
|
||||||
|
MTEB_EMBED_TOL = 1e-4
|
||||||
|
|
||||||
|
|
||||||
|
_empty_model_meta = ModelMeta(
|
||||||
|
loader=None,
|
||||||
|
name="vllm/model",
|
||||||
|
revision="1",
|
||||||
|
release_date=None,
|
||||||
|
languages=None,
|
||||||
|
framework=[],
|
||||||
|
similarity_fn_name=None,
|
||||||
|
n_parameters=None,
|
||||||
|
memory_usage_mb=None,
|
||||||
|
max_tokens=None,
|
||||||
|
embed_dim=None,
|
||||||
|
license=None,
|
||||||
|
open_weights=None,
|
||||||
|
public_training_code=None,
|
||||||
|
public_training_data=None,
|
||||||
|
use_instructions=None,
|
||||||
|
training_datasets=None,
|
||||||
|
modalities=["text"], # 'image' can be added to evaluate multimodal models
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class MtebEmbedMixin(mteb.EncoderProtocol):
|
||||||
|
mteb_model_meta = _empty_model_meta
|
||||||
|
|
||||||
|
def similarity(
|
||||||
|
self,
|
||||||
|
embeddings1: np.ndarray,
|
||||||
|
embeddings2: np.ndarray,
|
||||||
|
) -> np.ndarray:
|
||||||
|
# Cosine similarity
|
||||||
|
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
|
||||||
|
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
|
||||||
|
sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
|
||||||
|
return sim
|
||||||
|
|
||||||
|
def similarity_pairwise(
|
||||||
|
self,
|
||||||
|
embeddings1: Array,
|
||||||
|
embeddings2: Array,
|
||||||
|
) -> Array:
|
||||||
|
# Cosine similarity
|
||||||
|
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
|
||||||
|
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
|
||||||
|
sim = np.sum(embeddings1 * embeddings2, axis=1) / (
|
||||||
|
norm1.flatten() * norm2.flatten()
|
||||||
|
)
|
||||||
|
return sim
|
||||||
|
|
||||||
|
|
||||||
|
class VllmMtebEncoder(MtebEmbedMixin):
|
||||||
|
def __init__(self, vllm_model):
|
||||||
|
self.llm = vllm_model
|
||||||
|
self.rng = np.random.default_rng(seed=42)
|
||||||
|
|
||||||
|
def encode(
|
||||||
|
self,
|
||||||
|
inputs: DataLoader[mteb.types.BatchedInput],
|
||||||
|
*args,
|
||||||
|
**kwargs,
|
||||||
|
) -> np.ndarray:
|
||||||
|
# Hoping to discover potential scheduling
|
||||||
|
# issues by randomizing the order.
|
||||||
|
sentences = [text for batch in inputs for text in batch["text"]]
|
||||||
|
r = self.rng.permutation(len(sentences))
|
||||||
|
sentences = [sentences[i] for i in r]
|
||||||
|
outputs = self.llm.embed(sentences, use_tqdm=False)
|
||||||
|
embeds = np.array(outputs)
|
||||||
|
embeds = embeds[np.argsort(r)]
|
||||||
|
return embeds
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAIClientMtebEncoder(MtebEmbedMixin):
|
||||||
|
def __init__(self, model_name: str, client):
|
||||||
|
self.model_name = model_name
|
||||||
|
self.client = client
|
||||||
|
self.rng = np.random.default_rng(seed=42)
|
||||||
|
|
||||||
|
def encode(
|
||||||
|
self,
|
||||||
|
inputs: DataLoader[mteb.types.BatchedInput],
|
||||||
|
*args,
|
||||||
|
**kwargs,
|
||||||
|
) -> np.ndarray:
|
||||||
|
# Hoping to discover potential scheduling
|
||||||
|
# issues by randomizing the order.
|
||||||
|
sentences = [text for batch in inputs for text in batch["text"]]
|
||||||
|
r = self.rng.permutation(len(sentences))
|
||||||
|
sentences = [sentences[i] for i in r]
|
||||||
|
|
||||||
|
embeddings = self.client.embeddings.create(
|
||||||
|
model=self.model_name, input=sentences
|
||||||
|
)
|
||||||
|
outputs = [d.embedding for d in embeddings.data]
|
||||||
|
embeds = np.array(outputs)
|
||||||
|
embeds = embeds[np.argsort(r)]
|
||||||
|
return embeds
|
||||||
|
|
||||||
|
|
||||||
|
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
|
||||||
|
tasks = mteb.get_tasks(tasks=tasks)
|
||||||
|
results = mteb.evaluate(
|
||||||
|
encoder,
|
||||||
|
tasks,
|
||||||
|
cache=None,
|
||||||
|
show_progress_bar=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
main_score = results[0].scores["test"][0]["main_score"]
|
||||||
|
return main_score
|
||||||
|
|
||||||
|
|
||||||
|
def mteb_test_embed_models(
|
||||||
|
hf_runner,
|
||||||
|
vllm_runner,
|
||||||
|
model_info: EmbedModelInfo,
|
||||||
|
vllm_extra_kwargs=None,
|
||||||
|
hf_model_callback=None,
|
||||||
|
atol=MTEB_EMBED_TOL,
|
||||||
|
):
|
||||||
|
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
|
||||||
|
|
||||||
|
# Test embed_dims, isnan and whether to use normalize
|
||||||
|
example_prompts = ["The chef prepared a delicious meal." * 1000]
|
||||||
|
|
||||||
|
with vllm_runner(
|
||||||
|
model_info.name,
|
||||||
|
runner="pooling",
|
||||||
|
max_model_len=model_info.max_model_len,
|
||||||
|
**vllm_extra_kwargs,
|
||||||
|
) as vllm_model:
|
||||||
|
model_config = vllm_model.llm.llm_engine.model_config
|
||||||
|
|
||||||
|
# Confirm whether vllm is using the correct architecture
|
||||||
|
if model_info.architecture:
|
||||||
|
assert model_info.architecture in model_config.architectures
|
||||||
|
|
||||||
|
# Confirm whether the important configs in model_config are correct.
|
||||||
|
if model_info.pooling_type is not None:
|
||||||
|
assert model_config.pooler_config.pooling_type == model_info.pooling_type
|
||||||
|
if model_info.attn_type is not None:
|
||||||
|
assert model_config.attn_type == model_info.attn_type
|
||||||
|
if model_info.is_prefix_caching_supported is not None:
|
||||||
|
assert (
|
||||||
|
model_config.is_prefix_caching_supported
|
||||||
|
== model_info.is_prefix_caching_supported
|
||||||
|
)
|
||||||
|
if model_info.is_chunked_prefill_supported is not None:
|
||||||
|
assert (
|
||||||
|
model_config.is_chunked_prefill_supported
|
||||||
|
== model_info.is_chunked_prefill_supported
|
||||||
|
)
|
||||||
|
|
||||||
|
vllm_main_score = run_mteb_embed_task(
|
||||||
|
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
|
||||||
|
)
|
||||||
|
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
|
||||||
|
head_dtype = model_config.head_dtype
|
||||||
|
|
||||||
|
# Test embedding_size, isnan and whether to use normalize
|
||||||
|
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
|
||||||
|
outputs_tensor = torch.tensor(vllm_outputs)
|
||||||
|
assert not torch.any(torch.isnan(outputs_tensor))
|
||||||
|
embedding_size = model_config.embedding_size
|
||||||
|
assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
|
||||||
|
|
||||||
|
# Accelerate mteb test by setting
|
||||||
|
# SentenceTransformers mteb score to a constant
|
||||||
|
if model_info.mteb_score is None:
|
||||||
|
with hf_runner(
|
||||||
|
model_info.name,
|
||||||
|
is_sentence_transformer=True,
|
||||||
|
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
|
||||||
|
) as hf_model:
|
||||||
|
# e.g. setting default parameters for the encode method of hf_runner
|
||||||
|
if hf_model_callback is not None:
|
||||||
|
hf_model_callback(hf_model)
|
||||||
|
|
||||||
|
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
|
||||||
|
st_dtype = next(hf_model.model.parameters()).dtype
|
||||||
|
|
||||||
|
# Check embeddings close to hf outputs
|
||||||
|
hf_outputs = hf_model.encode(example_prompts)
|
||||||
|
check_embeddings_close(
|
||||||
|
embeddings_0_lst=hf_outputs,
|
||||||
|
embeddings_1_lst=vllm_outputs,
|
||||||
|
name_0="hf",
|
||||||
|
name_1="vllm",
|
||||||
|
tol=1e-2,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
st_main_score = model_info.mteb_score
|
||||||
|
st_dtype = "Constant"
|
||||||
|
|
||||||
|
print("Model:", model_info.name)
|
||||||
|
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
|
||||||
|
print("SentenceTransformers:", st_dtype, st_main_score)
|
||||||
|
print("Difference:", st_main_score - vllm_main_score)
|
||||||
|
|
||||||
|
# We are not concerned that the vllm mteb results are better
|
||||||
|
# than SentenceTransformers, so we only perform one-sided testing.
|
||||||
|
assert st_main_score - vllm_main_score < atol
|
||||||
@ -7,37 +7,24 @@ from pathlib import Path
|
|||||||
import mteb
|
import mteb
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
import requests
|
||||||
import torch
|
|
||||||
from mteb.models import ModelMeta
|
from mteb.models import ModelMeta
|
||||||
from mteb.types import Array
|
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
import tests.ci_envs as ci_envs
|
|
||||||
from tests.models.utils import (
|
from tests.models.utils import (
|
||||||
EmbedModelInfo,
|
|
||||||
RerankModelInfo,
|
RerankModelInfo,
|
||||||
check_embeddings_close,
|
|
||||||
get_vllm_extra_kwargs,
|
get_vllm_extra_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
template_home = (
|
|
||||||
Path(__file__).parent.parent.parent.parent.parent
|
|
||||||
/ "examples/pooling/score/template"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Most embedding models on the STS12 task (See #17175):
|
|
||||||
# - Model implementation and minor changes in tensor dtype
|
|
||||||
# results in differences less than 1e-4
|
|
||||||
# - Different model results in differences more than 1e-3
|
|
||||||
# 1e-4 is a good tolerance threshold
|
|
||||||
MTEB_EMBED_TASKS = ["STS12"]
|
|
||||||
MTEB_EMBED_TOL = 1e-4
|
|
||||||
|
|
||||||
# See #19344
|
# See #19344
|
||||||
MTEB_RERANK_TASKS = ["NFCorpus"]
|
MTEB_RERANK_TASKS = ["NFCorpus"]
|
||||||
MTEB_RERANK_LANGS = ["eng"]
|
MTEB_RERANK_LANGS = ["eng"]
|
||||||
MTEB_RERANK_TOL = 2e-3
|
MTEB_RERANK_TOL = 2e-3
|
||||||
|
|
||||||
|
template_home = (
|
||||||
|
Path(__file__).parent.parent.parent.parent.parent
|
||||||
|
/ "examples/pooling/score/template"
|
||||||
|
)
|
||||||
|
|
||||||
_empty_model_meta = ModelMeta(
|
_empty_model_meta = ModelMeta(
|
||||||
loader=None,
|
loader=None,
|
||||||
name="vllm/model",
|
name="vllm/model",
|
||||||
@ -60,84 +47,11 @@ _empty_model_meta = ModelMeta(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class VllmMtebEncoder(mteb.EncoderProtocol):
|
class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
|
||||||
mteb_model_meta = _empty_model_meta
|
mteb_model_meta = _empty_model_meta
|
||||||
|
|
||||||
def __init__(self, vllm_model):
|
|
||||||
self.llm = vllm_model
|
|
||||||
self.rng = np.random.default_rng(seed=42)
|
|
||||||
|
|
||||||
def encode(
|
|
||||||
self,
|
|
||||||
inputs: DataLoader[mteb.types.BatchedInput],
|
|
||||||
*args,
|
|
||||||
**kwargs,
|
|
||||||
) -> np.ndarray:
|
|
||||||
# Hoping to discover potential scheduling
|
|
||||||
# issues by randomizing the order.
|
|
||||||
sentences = [text for batch in inputs for text in batch["text"]]
|
|
||||||
r = self.rng.permutation(len(sentences))
|
|
||||||
sentences = [sentences[i] for i in r]
|
|
||||||
outputs = self.llm.embed(sentences, use_tqdm=False)
|
|
||||||
embeds = np.array(outputs)
|
|
||||||
embeds = embeds[np.argsort(r)]
|
|
||||||
return embeds
|
|
||||||
|
|
||||||
def similarity(
|
|
||||||
self,
|
|
||||||
embeddings1: np.ndarray,
|
|
||||||
embeddings2: np.ndarray,
|
|
||||||
) -> np.ndarray:
|
|
||||||
# Cosine similarity
|
|
||||||
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
|
|
||||||
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
|
|
||||||
sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
|
|
||||||
return sim
|
|
||||||
|
|
||||||
def similarity_pairwise(
|
|
||||||
self,
|
|
||||||
embeddings1: Array,
|
|
||||||
embeddings2: Array,
|
|
||||||
) -> Array:
|
|
||||||
# Cosine similarity
|
|
||||||
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
|
|
||||||
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
|
|
||||||
sim = np.sum(embeddings1 * embeddings2, axis=1) / (
|
|
||||||
norm1.flatten() * norm2.flatten()
|
|
||||||
)
|
|
||||||
return sim
|
|
||||||
|
|
||||||
|
|
||||||
class OpenAIClientMtebEncoder(VllmMtebEncoder):
|
|
||||||
def __init__(self, model_name: str, client):
|
|
||||||
self.model_name = model_name
|
|
||||||
self.client = client
|
|
||||||
self.rng = np.random.default_rng(seed=42)
|
|
||||||
|
|
||||||
def encode(
|
|
||||||
self,
|
|
||||||
inputs: DataLoader[mteb.types.BatchedInput],
|
|
||||||
*args,
|
|
||||||
**kwargs,
|
|
||||||
) -> np.ndarray:
|
|
||||||
# Hoping to discover potential scheduling
|
|
||||||
# issues by randomizing the order.
|
|
||||||
sentences = [text for batch in inputs for text in batch["text"]]
|
|
||||||
r = self.rng.permutation(len(sentences))
|
|
||||||
sentences = [sentences[i] for i in r]
|
|
||||||
|
|
||||||
embeddings = self.client.embeddings.create(
|
|
||||||
model=self.model_name, input=sentences
|
|
||||||
)
|
|
||||||
outputs = [d.embedding for d in embeddings.data]
|
|
||||||
embeds = np.array(outputs)
|
|
||||||
embeds = embeds[np.argsort(r)]
|
|
||||||
return embeds
|
|
||||||
|
|
||||||
|
|
||||||
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
|
|
||||||
mteb_model_meta = _empty_model_meta
|
|
||||||
|
|
||||||
|
class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
|
||||||
def __init__(self, vllm_model):
|
def __init__(self, vllm_model):
|
||||||
self.llm = vllm_model
|
self.llm = vllm_model
|
||||||
self.rng = np.random.default_rng(seed=42)
|
self.rng = np.random.default_rng(seed=42)
|
||||||
@ -164,7 +78,7 @@ class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
|
|||||||
return scores
|
return scores
|
||||||
|
|
||||||
|
|
||||||
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
|
class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
|
||||||
mteb_model_meta = _empty_model_meta
|
mteb_model_meta = _empty_model_meta
|
||||||
|
|
||||||
def __init__(self, model_name: str, url):
|
def __init__(self, model_name: str, url):
|
||||||
@ -216,102 +130,6 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
|
|||||||
return response["results"][0]["relevance_score"]
|
return response["results"][0]["relevance_score"]
|
||||||
|
|
||||||
|
|
||||||
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
|
|
||||||
tasks = mteb.get_tasks(tasks=tasks)
|
|
||||||
results = mteb.evaluate(
|
|
||||||
encoder,
|
|
||||||
tasks,
|
|
||||||
cache=None,
|
|
||||||
show_progress_bar=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
main_score = results[0].scores["test"][0]["main_score"]
|
|
||||||
return main_score
|
|
||||||
|
|
||||||
|
|
||||||
def mteb_test_embed_models(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
model_info: EmbedModelInfo,
|
|
||||||
vllm_extra_kwargs=None,
|
|
||||||
hf_model_callback=None,
|
|
||||||
atol=MTEB_EMBED_TOL,
|
|
||||||
):
|
|
||||||
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
|
|
||||||
|
|
||||||
# Test embed_dims, isnan and whether to use normalize
|
|
||||||
example_prompts = ["The chef prepared a delicious meal." * 1000]
|
|
||||||
|
|
||||||
with vllm_runner(
|
|
||||||
model_info.name,
|
|
||||||
runner="pooling",
|
|
||||||
max_model_len=model_info.max_model_len,
|
|
||||||
**vllm_extra_kwargs,
|
|
||||||
) as vllm_model:
|
|
||||||
model_config = vllm_model.llm.llm_engine.model_config
|
|
||||||
|
|
||||||
# Confirm whether vllm is using the correct architecture
|
|
||||||
if model_info.architecture:
|
|
||||||
assert model_info.architecture in model_config.architectures
|
|
||||||
|
|
||||||
# Confirm whether vllm uses the correct default_pooling_type, which
|
|
||||||
# relates to whether chunked prefill and prefix caching are enabled
|
|
||||||
assert (
|
|
||||||
model_config._model_info.default_pooling_type
|
|
||||||
== model_info.default_pooling_type
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_main_score = run_mteb_embed_task(
|
|
||||||
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
|
|
||||||
)
|
|
||||||
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
|
|
||||||
head_dtype = model_config.head_dtype
|
|
||||||
|
|
||||||
# Test embedding_size, isnan and whether to use normalize
|
|
||||||
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
|
|
||||||
outputs_tensor = torch.tensor(vllm_outputs)
|
|
||||||
assert not torch.any(torch.isnan(outputs_tensor))
|
|
||||||
embedding_size = model_config.embedding_size
|
|
||||||
assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
|
|
||||||
|
|
||||||
# Accelerate mteb test by setting
|
|
||||||
# SentenceTransformers mteb score to a constant
|
|
||||||
if model_info.mteb_score is None:
|
|
||||||
with hf_runner(
|
|
||||||
model_info.name,
|
|
||||||
is_sentence_transformer=True,
|
|
||||||
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
|
|
||||||
) as hf_model:
|
|
||||||
# e.g. setting default parameters for the encode method of hf_runner
|
|
||||||
if hf_model_callback is not None:
|
|
||||||
hf_model_callback(hf_model)
|
|
||||||
|
|
||||||
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
|
|
||||||
st_dtype = next(hf_model.model.parameters()).dtype
|
|
||||||
|
|
||||||
# Check embeddings close to hf outputs
|
|
||||||
hf_outputs = hf_model.encode(example_prompts)
|
|
||||||
check_embeddings_close(
|
|
||||||
embeddings_0_lst=hf_outputs,
|
|
||||||
embeddings_1_lst=vllm_outputs,
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
tol=1e-2,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
st_main_score = model_info.mteb_score
|
|
||||||
st_dtype = "Constant"
|
|
||||||
|
|
||||||
print("Model:", model_info.name)
|
|
||||||
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
|
|
||||||
print("SentenceTransformers:", st_dtype, st_main_score)
|
|
||||||
print("Difference:", st_main_score - vllm_main_score)
|
|
||||||
|
|
||||||
# We are not concerned that the vllm mteb results are better
|
|
||||||
# than SentenceTransformers, so we only perform one-sided testing.
|
|
||||||
assert st_main_score - vllm_main_score < atol
|
|
||||||
|
|
||||||
|
|
||||||
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
|
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
|
||||||
with tempfile.TemporaryDirectory() as prediction_folder:
|
with tempfile.TemporaryDirectory() as prediction_folder:
|
||||||
bm25s = mteb.get_model("bm25s")
|
bm25s = mteb.get_model("bm25s")
|
||||||
@ -391,18 +209,28 @@ def mteb_test_rerank_models(
|
|||||||
# Score API is only enabled for num_labels == 1
|
# Score API is only enabled for num_labels == 1
|
||||||
assert model_config.hf_config.num_labels == 1
|
assert model_config.hf_config.num_labels == 1
|
||||||
|
|
||||||
# Confirm whether vllm uses the correct default_pooling_type, which
|
# Maybe load chat_template.
|
||||||
# relates to whether chunked prefill and prefix caching are enabled
|
|
||||||
assert (
|
|
||||||
model_config._model_info.default_pooling_type
|
|
||||||
== model_info.default_pooling_type
|
|
||||||
)
|
|
||||||
|
|
||||||
chat_template: str | None = None
|
chat_template: str | None = None
|
||||||
if model_info.chat_template_name is not None:
|
if model_info.chat_template_name is not None:
|
||||||
chat_template = (template_home / model_info.chat_template_name).read_text()
|
chat_template = (template_home / model_info.chat_template_name).read_text()
|
||||||
vllm_model.chat_template = chat_template
|
vllm_model.chat_template = chat_template
|
||||||
|
|
||||||
|
# Confirm whether the important configs in model_config are correct.
|
||||||
|
if model_info.pooling_type is not None:
|
||||||
|
assert model_config.pooler_config.pooling_type == model_info.pooling_type
|
||||||
|
if model_info.attn_type is not None:
|
||||||
|
assert model_config.attn_type == model_info.attn_type
|
||||||
|
if model_info.is_prefix_caching_supported is not None:
|
||||||
|
assert (
|
||||||
|
model_config.is_prefix_caching_supported
|
||||||
|
== model_info.is_prefix_caching_supported
|
||||||
|
)
|
||||||
|
if model_info.is_chunked_prefill_supported is not None:
|
||||||
|
assert (
|
||||||
|
model_config.is_chunked_prefill_supported
|
||||||
|
== model_info.is_chunked_prefill_supported
|
||||||
|
)
|
||||||
|
|
||||||
vllm_main_score = run_mteb_rerank(
|
vllm_main_score = run_mteb_rerank(
|
||||||
vllm_mteb_encoder(vllm_model),
|
vllm_mteb_encoder(vllm_model),
|
||||||
tasks=MTEB_RERANK_TASKS,
|
tasks=MTEB_RERANK_TASKS,
|
||||||
@ -4,90 +4,94 @@ import pytest
|
|||||||
|
|
||||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||||
from tests.models.utils import (
|
from tests.models.utils import (
|
||||||
CLSPoolingEmbedModelInfo,
|
|
||||||
CLSPoolingRerankModelInfo,
|
|
||||||
EmbedModelInfo,
|
EmbedModelInfo,
|
||||||
LASTPoolingEmbedModelInfo,
|
|
||||||
RerankModelInfo,
|
RerankModelInfo,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
from .mteb_embed_utils import mteb_test_embed_models
|
||||||
|
from .mteb_score_utils import mteb_test_rerank_models
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
########## BertModel
|
########## BertModel
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"BAAI/bge-base-en",
|
"BAAI/bge-base-en",
|
||||||
architecture="BertModel",
|
architecture="BertModel",
|
||||||
mteb_score=0.779336792,
|
mteb_score=0.779336792,
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
|
||||||
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False
|
EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
|
||||||
),
|
EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
|
||||||
"BAAI/bge-small-en", architecture="BertModel", enable_test=False
|
EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
|
||||||
),
|
EmbedModelInfo(
|
||||||
CLSPoolingEmbedModelInfo(
|
|
||||||
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
|
|
||||||
),
|
|
||||||
CLSPoolingEmbedModelInfo(
|
|
||||||
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
|
|
||||||
),
|
|
||||||
CLSPoolingEmbedModelInfo(
|
|
||||||
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
|
|
||||||
),
|
|
||||||
CLSPoolingEmbedModelInfo(
|
|
||||||
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
|
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
|
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
|
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
|
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
|
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
|
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
|
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
|
||||||
),
|
),
|
||||||
########## XLMRobertaModel
|
########## XLMRobertaModel
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"BAAI/bge-m3",
|
"BAAI/bge-m3",
|
||||||
architecture="XLMRobertaModel",
|
architecture="XLMRobertaModel",
|
||||||
mteb_score=0.787343078,
|
mteb_score=0.787343078,
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
########## Qwen2Model
|
########## Qwen2Model
|
||||||
LASTPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"BAAI/bge-code-v1",
|
"BAAI/bge-code-v1",
|
||||||
architecture="Qwen2Model",
|
architecture="Qwen2Model",
|
||||||
mteb_score=0.75724465,
|
mteb_score=0.75724465,
|
||||||
dtype="float32",
|
dtype="float32",
|
||||||
|
pooling_type="LAST",
|
||||||
|
attn_type="decoder",
|
||||||
|
is_prefix_caching_supported=True,
|
||||||
|
is_chunked_prefill_supported=True,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
RERANK_MODELS = [
|
RERANK_MODELS = [
|
||||||
########## XLMRobertaForSequenceClassification
|
########## XLMRobertaForSequenceClassification
|
||||||
CLSPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"BAAI/bge-reranker-base",
|
"BAAI/bge-reranker-base",
|
||||||
architecture="XLMRobertaForSequenceClassification",
|
architecture="XLMRobertaForSequenceClassification",
|
||||||
mteb_score=0.32398,
|
mteb_score=0.32398,
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"BAAI/bge-reranker-large",
|
"BAAI/bge-reranker-large",
|
||||||
architecture="XLMRobertaForSequenceClassification",
|
architecture="XLMRobertaForSequenceClassification",
|
||||||
enable_test=False,
|
enable_test=False,
|
||||||
),
|
),
|
||||||
CLSPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"BAAI/bge-reranker-v2-m3",
|
"BAAI/bge-reranker-v2-m3",
|
||||||
architecture="XLMRobertaForSequenceClassification",
|
architecture="XLMRobertaForSequenceClassification",
|
||||||
enable_test=False,
|
enable_test=False,
|
||||||
|
|||||||
@ -9,14 +9,12 @@ import torch
|
|||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
from tests.conftest import HfRunner
|
from tests.conftest import HfRunner
|
||||||
from tests.models.language.pooling_mteb_test.mteb_utils import (
|
from tests.models.utils import RerankModelInfo
|
||||||
VllmMtebCrossEncoder,
|
|
||||||
mteb_test_rerank_models,
|
from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models
|
||||||
)
|
|
||||||
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
|
||||||
|
|
||||||
RERANK_MODELS = [
|
RERANK_MODELS = [
|
||||||
LASTPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"BAAI/bge-reranker-v2-gemma",
|
"BAAI/bge-reranker-v2-gemma",
|
||||||
architecture="GemmaForSequenceClassification",
|
architecture="GemmaForSequenceClassification",
|
||||||
mteb_score=0.33757,
|
mteb_score=0.33757,
|
||||||
@ -25,6 +23,10 @@ RERANK_MODELS = [
|
|||||||
"classifier_from_token": ["Yes"],
|
"classifier_from_token": ["Yes"],
|
||||||
"method": "no_post_processing",
|
"method": "no_post_processing",
|
||||||
},
|
},
|
||||||
|
pooling_type="LAST",
|
||||||
|
attn_type="decoder",
|
||||||
|
is_prefix_caching_supported=True,
|
||||||
|
is_chunked_prefill_supported=True,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -3,23 +3,29 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.models.utils import (
|
from tests.models.utils import (
|
||||||
CLSPoolingRerankModelInfo,
|
|
||||||
LASTPoolingRerankModelInfo,
|
|
||||||
RerankModelInfo,
|
RerankModelInfo,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_rerank_models
|
from .mteb_score_utils import mteb_test_rerank_models
|
||||||
|
|
||||||
RERANK_MODELS = [
|
RERANK_MODELS = [
|
||||||
CLSPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
|
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
|
||||||
mteb_score=0.32898,
|
mteb_score=0.32898,
|
||||||
architecture="BertForSequenceClassification",
|
architecture="BertForSequenceClassification",
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
),
|
),
|
||||||
LASTPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
|
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
|
||||||
mteb_score=0.25736,
|
mteb_score=0.25736,
|
||||||
architecture="Qwen3ForSequenceClassification",
|
architecture="Qwen3ForSequenceClassification",
|
||||||
|
pooling_type="LAST",
|
||||||
|
attn_type="decoder",
|
||||||
|
is_prefix_caching_supported=True,
|
||||||
|
is_chunked_prefill_supported=True,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -5,36 +5,32 @@ import pytest
|
|||||||
|
|
||||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||||
from tests.models.utils import (
|
from tests.models.utils import (
|
||||||
CLSPoolingEmbedModelInfo,
|
|
||||||
CLSPoolingRerankModelInfo,
|
|
||||||
EmbedModelInfo,
|
EmbedModelInfo,
|
||||||
LASTPoolingEmbedModelInfo,
|
|
||||||
RerankModelInfo,
|
RerankModelInfo,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
from .mteb_embed_utils import mteb_test_embed_models
|
||||||
|
from .mteb_score_utils import mteb_test_rerank_models
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
########## BertModel
|
########## BertModel
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"thenlper/gte-large",
|
"thenlper/gte-large",
|
||||||
mteb_score=0.76807651,
|
mteb_score=0.76807651,
|
||||||
architecture="BertModel",
|
architecture="BertModel",
|
||||||
|
pooling_type="MEAN",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
|
||||||
"thenlper/gte-base", architecture="BertModel", enable_test=False
|
EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
|
||||||
),
|
EmbedModelInfo(
|
||||||
CLSPoolingEmbedModelInfo(
|
|
||||||
"thenlper/gte-small", architecture="BertModel", enable_test=False
|
|
||||||
),
|
|
||||||
CLSPoolingEmbedModelInfo(
|
|
||||||
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
|
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
|
||||||
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False
|
EmbedModelInfo(
|
||||||
),
|
|
||||||
CLSPoolingEmbedModelInfo(
|
|
||||||
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
|
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
|
||||||
),
|
),
|
||||||
########### NewModel
|
########### NewModel
|
||||||
@ -43,48 +39,64 @@ MODELS = [
|
|||||||
# - whether to use token_type_embeddings
|
# - whether to use token_type_embeddings
|
||||||
# - whether to use context expansion
|
# - whether to use context expansion
|
||||||
# So only test one (the most widely used) model
|
# So only test one (the most widely used) model
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Alibaba-NLP/gte-multilingual-base",
|
"Alibaba-NLP/gte-multilingual-base",
|
||||||
architecture="GteNewModel",
|
architecture="GteNewModel",
|
||||||
mteb_score=0.775074696,
|
mteb_score=0.775074696,
|
||||||
hf_overrides={"architectures": ["GteNewModel"]},
|
hf_overrides={"architectures": ["GteNewModel"]},
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Alibaba-NLP/gte-base-en-v1.5",
|
"Alibaba-NLP/gte-base-en-v1.5",
|
||||||
architecture="GteNewModel",
|
architecture="GteNewModel",
|
||||||
hf_overrides={"architectures": ["GteNewModel"]},
|
hf_overrides={"architectures": ["GteNewModel"]},
|
||||||
enable_test=False,
|
enable_test=False,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Alibaba-NLP/gte-large-en-v1.5",
|
"Alibaba-NLP/gte-large-en-v1.5",
|
||||||
architecture="GteNewModel",
|
architecture="GteNewModel",
|
||||||
hf_overrides={"architectures": ["GteNewModel"]},
|
hf_overrides={"architectures": ["GteNewModel"]},
|
||||||
enable_test=False,
|
enable_test=False,
|
||||||
),
|
),
|
||||||
########### Qwen2ForCausalLM
|
########### Qwen2ForCausalLM
|
||||||
LASTPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
||||||
mteb_score=0.758473459018872,
|
mteb_score=0.758473459018872,
|
||||||
architecture="Qwen2ForCausalLM",
|
architecture="Qwen2ForCausalLM",
|
||||||
|
pooling_type="LAST",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
########## ModernBertModel
|
########## ModernBertModel
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Alibaba-NLP/gte-modernbert-base",
|
"Alibaba-NLP/gte-modernbert-base",
|
||||||
mteb_score=0.748193353,
|
mteb_score=0.748193353,
|
||||||
architecture="ModernBertModel",
|
architecture="ModernBertModel",
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
########## Qwen3ForCausalLM
|
########## Qwen3ForCausalLM
|
||||||
LASTPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Qwen/Qwen3-Embedding-0.6B",
|
"Qwen/Qwen3-Embedding-0.6B",
|
||||||
mteb_score=0.771163695,
|
mteb_score=0.771163695,
|
||||||
architecture="Qwen3ForCausalLM",
|
architecture="Qwen3ForCausalLM",
|
||||||
dtype="float32",
|
dtype="float32",
|
||||||
|
pooling_type="LAST",
|
||||||
|
attn_type="decoder",
|
||||||
|
is_prefix_caching_supported=True,
|
||||||
|
is_chunked_prefill_supported=True,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
LASTPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Qwen/Qwen3-Embedding-4B",
|
"Qwen/Qwen3-Embedding-4B",
|
||||||
architecture="Qwen3ForCausalLM",
|
architecture="Qwen3ForCausalLM",
|
||||||
dtype="float32",
|
dtype="float32",
|
||||||
@ -93,18 +105,26 @@ MODELS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
RERANK_MODELS = [
|
RERANK_MODELS = [
|
||||||
CLSPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
# classifier_pooling: mean
|
# classifier_pooling: mean
|
||||||
"Alibaba-NLP/gte-reranker-modernbert-base",
|
"Alibaba-NLP/gte-reranker-modernbert-base",
|
||||||
mteb_score=0.33386,
|
mteb_score=0.33386,
|
||||||
architecture="ModernBertForSequenceClassification",
|
architecture="ModernBertForSequenceClassification",
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"Alibaba-NLP/gte-multilingual-reranker-base",
|
"Alibaba-NLP/gte-multilingual-reranker-base",
|
||||||
mteb_score=0.33062,
|
mteb_score=0.33062,
|
||||||
architecture="GteNewForSequenceClassification",
|
architecture="GteNewForSequenceClassification",
|
||||||
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
|
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|||||||
@ -3,40 +3,44 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
from tests.models.utils import EmbedModelInfo
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_embed_models
|
from .mteb_embed_utils import mteb_test_embed_models
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
########## BertModel
|
########## BertModel
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"intfloat/e5-small",
|
"intfloat/e5-small",
|
||||||
architecture="BertModel",
|
architecture="BertModel",
|
||||||
mteb_score=0.742285423,
|
mteb_score=0.742285423,
|
||||||
|
pooling_type="MEAN",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
|
||||||
"intfloat/e5-base", architecture="BertModel", enable_test=False
|
EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
|
||||||
),
|
EmbedModelInfo(
|
||||||
CLSPoolingEmbedModelInfo(
|
|
||||||
"intfloat/e5-large", architecture="BertModel", enable_test=False
|
|
||||||
),
|
|
||||||
CLSPoolingEmbedModelInfo(
|
|
||||||
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
|
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
|
||||||
),
|
),
|
||||||
########## XLMRobertaModel
|
########## XLMRobertaModel
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"intfloat/multilingual-e5-base",
|
"intfloat/multilingual-e5-base",
|
||||||
architecture="XLMRobertaModel",
|
architecture="XLMRobertaModel",
|
||||||
mteb_score=0.779325955,
|
mteb_score=0.779325955,
|
||||||
|
pooling_type="MEAN",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"intfloat/multilingual-e5-large",
|
"intfloat/multilingual-e5-large",
|
||||||
architecture="XLMRobertaModel",
|
architecture="XLMRobertaModel",
|
||||||
enable_test=False,
|
enable_test=False,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"intfloat/multilingual-e5-large-instruct",
|
"intfloat/multilingual-e5-large-instruct",
|
||||||
architecture="XLMRobertaModel",
|
architecture="XLMRobertaModel",
|
||||||
enable_test=False,
|
enable_test=False,
|
||||||
|
|||||||
@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import (
|
|||||||
matryoshka_fy,
|
matryoshka_fy,
|
||||||
)
|
)
|
||||||
from tests.models.utils import (
|
from tests.models.utils import (
|
||||||
CLSPoolingEmbedModelInfo,
|
|
||||||
CLSPoolingRerankModelInfo,
|
|
||||||
EmbedModelInfo,
|
EmbedModelInfo,
|
||||||
RerankModelInfo,
|
RerankModelInfo,
|
||||||
)
|
)
|
||||||
from vllm import PoolingParams
|
from vllm import PoolingParams
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
from .mteb_embed_utils import mteb_test_embed_models
|
||||||
|
from .mteb_score_utils import mteb_test_rerank_models
|
||||||
|
|
||||||
EMBEDDING_MODELS = [
|
EMBEDDING_MODELS = [
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"jinaai/jina-embeddings-v3",
|
"jinaai/jina-embeddings-v3",
|
||||||
mteb_score=0.824413164,
|
mteb_score=0.824413164,
|
||||||
architecture="XLMRobertaModel",
|
architecture="XLMRobertaModel",
|
||||||
is_matryoshka=True,
|
is_matryoshka=True,
|
||||||
|
pooling_type="MEAN",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
dtype="float32",
|
dtype="float32",
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
RERANK_MODELS = [
|
RERANK_MODELS = [
|
||||||
CLSPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"jinaai/jina-reranker-v2-base-multilingual",
|
"jinaai/jina-reranker-v2-base-multilingual",
|
||||||
mteb_score=0.33643,
|
mteb_score=0.33643,
|
||||||
architecture="XLMRobertaForSequenceClassification",
|
architecture="XLMRobertaForSequenceClassification",
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -6,9 +6,9 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.conftest import HfRunner
|
from tests.conftest import HfRunner
|
||||||
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
from tests.models.utils import RerankModelInfo
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_rerank_models
|
from .mteb_score_utils import mteb_test_rerank_models
|
||||||
|
|
||||||
mxbai_rerank_hf_overrides = {
|
mxbai_rerank_hf_overrides = {
|
||||||
"architectures": ["Qwen2ForSequenceClassification"],
|
"architectures": ["Qwen2ForSequenceClassification"],
|
||||||
@ -17,14 +17,18 @@ mxbai_rerank_hf_overrides = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
RERANK_MODELS = [
|
RERANK_MODELS = [
|
||||||
LASTPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"mixedbread-ai/mxbai-rerank-base-v2",
|
"mixedbread-ai/mxbai-rerank-base-v2",
|
||||||
architecture="Qwen2ForSequenceClassification",
|
architecture="Qwen2ForSequenceClassification",
|
||||||
hf_overrides=mxbai_rerank_hf_overrides,
|
hf_overrides=mxbai_rerank_hf_overrides,
|
||||||
mteb_score=0.273,
|
mteb_score=0.273,
|
||||||
|
pooling_type="LAST",
|
||||||
|
attn_type="decoder",
|
||||||
|
is_prefix_caching_supported=True,
|
||||||
|
is_chunked_prefill_supported=True,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
LASTPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"mixedbread-ai/mxbai-rerank-large-v2",
|
"mixedbread-ai/mxbai-rerank-large-v2",
|
||||||
architecture="Qwen2ForSequenceClassification",
|
architecture="Qwen2ForSequenceClassification",
|
||||||
hf_overrides=mxbai_rerank_hf_overrides,
|
hf_overrides=mxbai_rerank_hf_overrides,
|
||||||
|
|||||||
@ -3,29 +3,39 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
|
||||||
|
mteb_test_embed_models,
|
||||||
|
)
|
||||||
|
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
|
||||||
|
mteb_test_rerank_models,
|
||||||
|
)
|
||||||
from tests.models.utils import (
|
from tests.models.utils import (
|
||||||
EmbedModelInfo,
|
EmbedModelInfo,
|
||||||
LASTPoolingEmbedModelInfo,
|
|
||||||
LASTPoolingRerankModelInfo,
|
|
||||||
RerankModelInfo,
|
RerankModelInfo,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
|
||||||
|
|
||||||
EMBEDDING_MODELS = [
|
EMBEDDING_MODELS = [
|
||||||
LASTPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"nvidia/llama-nemotron-embed-1b-v2",
|
"nvidia/llama-nemotron-embed-1b-v2",
|
||||||
architecture="LlamaBidirectionalModel",
|
architecture="LlamaBidirectionalModel",
|
||||||
mteb_score=0.689164662128673,
|
mteb_score=0.689164662128673,
|
||||||
|
pooling_type="MEAN",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
RERANK_MODELS = [
|
RERANK_MODELS = [
|
||||||
LASTPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"nvidia/llama-nemotron-rerank-1b-v2",
|
"nvidia/llama-nemotron-rerank-1b-v2",
|
||||||
architecture="LlamaBidirectionalForSequenceClassification",
|
architecture="LlamaBidirectionalForSequenceClassification",
|
||||||
chat_template_name="nemotron-rerank.jinja",
|
chat_template_name="nemotron-rerank.jinja",
|
||||||
mteb_score=0.33994,
|
mteb_score=0.33994,
|
||||||
|
pooling_type="MEAN",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -4,30 +4,38 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
from tests.models.utils import EmbedModelInfo
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_embed_models
|
from .mteb_embed_utils import mteb_test_embed_models
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"nomic-ai/nomic-embed-text-v1",
|
"nomic-ai/nomic-embed-text-v1",
|
||||||
architecture="NomicBertModel",
|
architecture="NomicBertModel",
|
||||||
mteb_score=0.737568559,
|
mteb_score=0.737568559,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
|
pooling_type="MEAN",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"nomic-ai/nomic-embed-text-v1.5",
|
"nomic-ai/nomic-embed-text-v1.5",
|
||||||
architecture="NomicBertModel",
|
architecture="NomicBertModel",
|
||||||
enable_test=False,
|
enable_test=False,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
|
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"nomic-ai/nomic-embed-text-v2-moe",
|
"nomic-ai/nomic-embed-text-v2-moe",
|
||||||
architecture="NomicBertModel",
|
architecture="NomicBertModel",
|
||||||
mteb_score=0.715488912,
|
mteb_score=0.715488912,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
|
pooling_type="MEAN",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -6,10 +6,10 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.conftest import HfRunner
|
from tests.conftest import HfRunner
|
||||||
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
from tests.models.utils import RerankModelInfo
|
||||||
from tests.utils import multi_gpu_test
|
from tests.utils import multi_gpu_test
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_rerank_models
|
from .mteb_score_utils import mteb_test_rerank_models
|
||||||
|
|
||||||
qwen3_reranker_hf_overrides = {
|
qwen3_reranker_hf_overrides = {
|
||||||
"architectures": ["Qwen3ForSequenceClassification"],
|
"architectures": ["Qwen3ForSequenceClassification"],
|
||||||
@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
RERANK_MODELS = [
|
RERANK_MODELS = [
|
||||||
LASTPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"Qwen/Qwen3-Reranker-0.6B",
|
"Qwen/Qwen3-Reranker-0.6B",
|
||||||
architecture="Qwen3ForSequenceClassification",
|
architecture="Qwen3ForSequenceClassification",
|
||||||
mteb_score=0.25736,
|
mteb_score=0.25736,
|
||||||
hf_overrides=qwen3_reranker_hf_overrides,
|
hf_overrides=qwen3_reranker_hf_overrides,
|
||||||
|
pooling_type="LAST",
|
||||||
|
attn_type="decoder",
|
||||||
|
is_prefix_caching_supported=True,
|
||||||
|
is_chunked_prefill_supported=True,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
LASTPoolingRerankModelInfo(
|
RerankModelInfo(
|
||||||
"Qwen/Qwen3-Reranker-4B",
|
"Qwen/Qwen3-Reranker-4B",
|
||||||
architecture="Qwen3ForSequenceClassification",
|
architecture="Qwen3ForSequenceClassification",
|
||||||
hf_overrides=qwen3_reranker_hf_overrides,
|
hf_overrides=qwen3_reranker_hf_overrides,
|
||||||
|
|||||||
@ -4,62 +4,82 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
from tests.models.utils import EmbedModelInfo
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_embed_models
|
from .mteb_embed_utils import mteb_test_embed_models
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Snowflake/snowflake-arctic-embed-xs",
|
"Snowflake/snowflake-arctic-embed-xs",
|
||||||
is_matryoshka=False,
|
is_matryoshka=False,
|
||||||
architecture="BertModel",
|
architecture="BertModel",
|
||||||
mteb_score=0.714927797,
|
mteb_score=0.714927797,
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Snowflake/snowflake-arctic-embed-s",
|
"Snowflake/snowflake-arctic-embed-s",
|
||||||
is_matryoshka=False,
|
is_matryoshka=False,
|
||||||
architecture="BertModel",
|
architecture="BertModel",
|
||||||
enable_test=False,
|
enable_test=False,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Snowflake/snowflake-arctic-embed-m",
|
"Snowflake/snowflake-arctic-embed-m",
|
||||||
is_matryoshka=False,
|
is_matryoshka=False,
|
||||||
architecture="BertModel",
|
architecture="BertModel",
|
||||||
enable_test=False,
|
enable_test=False,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Snowflake/snowflake-arctic-embed-m-long",
|
"Snowflake/snowflake-arctic-embed-m-long",
|
||||||
is_matryoshka=False,
|
is_matryoshka=False,
|
||||||
architecture="NomicBertModel",
|
architecture="NomicBertModel",
|
||||||
mteb_score=0.681146831,
|
mteb_score=0.681146831,
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Snowflake/snowflake-arctic-embed-l",
|
"Snowflake/snowflake-arctic-embed-l",
|
||||||
is_matryoshka=False,
|
is_matryoshka=False,
|
||||||
architecture="BertModel",
|
architecture="BertModel",
|
||||||
enable_test=False,
|
enable_test=False,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Snowflake/snowflake-arctic-embed-m-v1.5",
|
"Snowflake/snowflake-arctic-embed-m-v1.5",
|
||||||
is_matryoshka=True,
|
is_matryoshka=True,
|
||||||
architecture="BertModel",
|
architecture="BertModel",
|
||||||
mteb_score=0.649088363,
|
mteb_score=0.649088363,
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Snowflake/snowflake-arctic-embed-l-v2.0",
|
"Snowflake/snowflake-arctic-embed-l-v2.0",
|
||||||
is_matryoshka=True,
|
is_matryoshka=True,
|
||||||
architecture="XLMRobertaModel",
|
architecture="XLMRobertaModel",
|
||||||
mteb_score=0.712258299,
|
mteb_score=0.712258299,
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"Snowflake/snowflake-arctic-embed-m-v2.0",
|
"Snowflake/snowflake-arctic-embed-m-v2.0",
|
||||||
is_matryoshka=True,
|
is_matryoshka=True,
|
||||||
architecture="GteModel",
|
architecture="GteModel",
|
||||||
mteb_score=0.706622444,
|
mteb_score=0.706622444,
|
||||||
|
pooling_type="CLS",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|||||||
@ -3,25 +3,31 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.models.utils import (
|
from tests.models.utils import (
|
||||||
CLSPoolingEmbedModelInfo,
|
|
||||||
EmbedModelInfo,
|
EmbedModelInfo,
|
||||||
LASTPoolingEmbedModelInfo,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
from .mteb_utils import mteb_test_embed_models
|
from .mteb_embed_utils import mteb_test_embed_models
|
||||||
|
|
||||||
# ST models with projector (Dense) layers
|
# ST models with projector (Dense) layers
|
||||||
ST_PROJECTOR_MODELS = [
|
ST_PROJECTOR_MODELS = [
|
||||||
CLSPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"TencentBAC/Conan-embedding-v1",
|
"TencentBAC/Conan-embedding-v1",
|
||||||
architecture="BertModel",
|
architecture="BertModel",
|
||||||
mteb_score=0.688611955,
|
mteb_score=0.688611955,
|
||||||
|
pooling_type="MEAN",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
),
|
),
|
||||||
LASTPoolingEmbedModelInfo(
|
EmbedModelInfo(
|
||||||
"google/embeddinggemma-300m",
|
"google/embeddinggemma-300m",
|
||||||
architecture="Gemma3TextModel",
|
architecture="Gemma3TextModel",
|
||||||
mteb_score=0.7473819294684156,
|
mteb_score=0.7473819294684156,
|
||||||
|
pooling_type="MEAN",
|
||||||
|
attn_type="encoder_only",
|
||||||
|
is_prefix_caching_supported=False,
|
||||||
|
is_chunked_prefill_supported=False,
|
||||||
enable_test=True,
|
enable_test=True,
|
||||||
dtype="float32",
|
dtype="float32",
|
||||||
),
|
),
|
||||||
|
|||||||
@ -267,7 +267,7 @@ def run_embedding_input_test(
|
|||||||
"""Inference result should be the same between
|
"""Inference result should be the same between
|
||||||
original image/video input and image/video embeddings input.
|
original image/video input and image/video embeddings input.
|
||||||
"""
|
"""
|
||||||
from transformers import AutoProcessor # noqa: F401
|
from transformers import AutoProcessor
|
||||||
|
|
||||||
processor = AutoProcessor.from_pretrained(model)
|
processor = AutoProcessor.from_pretrained(model)
|
||||||
|
|
||||||
|
|||||||
@ -10,7 +10,7 @@ import torch
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
|
from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
|
||||||
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
||||||
from vllm.multimodal.processing import InputProcessingContext
|
from vllm.multimodal.processing import InputProcessingContext
|
||||||
from vllm.tokenizers import cached_tokenizer_from_config
|
from vllm.tokenizers import cached_tokenizer_from_config
|
||||||
@ -375,7 +375,10 @@ class ModelInfo:
|
|||||||
max_model_len: int | None = None
|
max_model_len: int | None = None
|
||||||
hf_dtype: str = "float32"
|
hf_dtype: str = "float32"
|
||||||
hf_overrides: dict[str, Any] | None = None
|
hf_overrides: dict[str, Any] | None = None
|
||||||
default_pooling_type: str = ""
|
pooling_type: str | None = None
|
||||||
|
attn_type: AttnTypeStr | None = None
|
||||||
|
is_prefix_caching_supported: bool | None = None
|
||||||
|
is_chunked_prefill_supported: bool | None = None
|
||||||
enable_test: bool = True
|
enable_test: bool = True
|
||||||
|
|
||||||
|
|
||||||
@ -386,32 +389,12 @@ class EmbedModelInfo(ModelInfo):
|
|||||||
matryoshka_dimensions: list[int] | None = None
|
matryoshka_dimensions: list[int] | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class CLSPoolingEmbedModelInfo(EmbedModelInfo):
|
|
||||||
default_pooling_type: str = "CLS"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class LASTPoolingEmbedModelInfo(EmbedModelInfo):
|
|
||||||
default_pooling_type: str = "LAST"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RerankModelInfo(ModelInfo):
|
class RerankModelInfo(ModelInfo):
|
||||||
mteb_score: float | None = None
|
mteb_score: float | None = None
|
||||||
chat_template_name: str | None = None
|
chat_template_name: str | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class CLSPoolingRerankModelInfo(RerankModelInfo):
|
|
||||||
default_pooling_type: str = "CLS"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class LASTPoolingRerankModelInfo(RerankModelInfo):
|
|
||||||
default_pooling_type: str = "LAST"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class GenerateModelInfo(ModelInfo):
|
class GenerateModelInfo(ModelInfo):
|
||||||
hf_dtype: str = "auto"
|
hf_dtype: str = "auto"
|
||||||
|
|||||||
@ -145,7 +145,7 @@ def test_shared_storage_connector_hashes(tmp_path):
|
|||||||
|
|
||||||
# don't put this import at the top level
|
# don't put this import at the top level
|
||||||
# it will call torch.cuda.device_count()
|
# it will call torch.cuda.device_count()
|
||||||
from transformers import AutoProcessor # noqa: F401
|
from transformers import AutoProcessor
|
||||||
|
|
||||||
# Create processor to handle the chat prompt
|
# Create processor to handle the chat prompt
|
||||||
processor = AutoProcessor.from_pretrained(MODEL_NAME)
|
processor = AutoProcessor.from_pretrained(MODEL_NAME)
|
||||||
|
|||||||
@ -164,7 +164,7 @@ class ModelConfig:
|
|||||||
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
|
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
|
||||||
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
||||||
use the default version."""
|
use the default version."""
|
||||||
max_model_len: int = Field(default=None, gt=0)
|
max_model_len: int = Field(default=None, ge=-1)
|
||||||
"""Model context length (prompt and output). If unspecified, will be
|
"""Model context length (prompt and output). If unspecified, will be
|
||||||
automatically derived from the model config.
|
automatically derived from the model config.
|
||||||
|
|
||||||
@ -595,7 +595,7 @@ class ModelConfig:
|
|||||||
|
|
||||||
# Avoid running try_verify_and_update_config multiple times
|
# Avoid running try_verify_and_update_config multiple times
|
||||||
self.config_updated = False
|
self.config_updated = False
|
||||||
|
self._try_verify_and_update_model_config()
|
||||||
self._verify_quantization()
|
self._verify_quantization()
|
||||||
self._verify_cuda_graph()
|
self._verify_cuda_graph()
|
||||||
self._verify_bnb_config()
|
self._verify_bnb_config()
|
||||||
@ -1008,6 +1008,23 @@ class ModelConfig:
|
|||||||
"when expert parallelism is enabled."
|
"when expert parallelism is enabled."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _try_verify_and_update_model_config(self):
|
||||||
|
# Avoid running try_verify_and_update_config multiple times
|
||||||
|
if getattr(self, "config_updated", False):
|
||||||
|
return
|
||||||
|
|
||||||
|
architecture = self.architecture
|
||||||
|
if architecture is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
from vllm.model_executor.models.config import (
|
||||||
|
MODELS_CONFIG_MAP,
|
||||||
|
)
|
||||||
|
|
||||||
|
cls = MODELS_CONFIG_MAP.get(architecture, None)
|
||||||
|
if cls is not None:
|
||||||
|
cls.verify_and_update_model_config(self)
|
||||||
|
|
||||||
def verify_dual_chunk_attention_config(
|
def verify_dual_chunk_attention_config(
|
||||||
self,
|
self,
|
||||||
load_config: LoadConfig,
|
load_config: LoadConfig,
|
||||||
|
|||||||
@ -81,10 +81,7 @@ class ECExampleConnector(ECConnectorBase):
|
|||||||
assert encoder_cache is not None
|
assert encoder_cache is not None
|
||||||
if metadata is None:
|
if metadata is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
(
|
"In connector.start_load_caches, but the connector metadata is None"
|
||||||
"In connector.start_load_caches, ",
|
|
||||||
"but the connector metadata is None",
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
# Load the EC for each mm data
|
# Load the EC for each mm data
|
||||||
|
|||||||
@ -297,16 +297,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
|
|||||||
elif contains_type(type_hints, set):
|
elif contains_type(type_hints, set):
|
||||||
kwargs[name].update(collection_to_kwargs(type_hints, set))
|
kwargs[name].update(collection_to_kwargs(type_hints, set))
|
||||||
elif contains_type(type_hints, int):
|
elif contains_type(type_hints, int):
|
||||||
kwargs[name]["type"] = int
|
if name == "max_model_len":
|
||||||
# Special case for large integers
|
kwargs[name]["type"] = human_readable_int_or_auto
|
||||||
human_readable_ints = {
|
kwargs[name]["help"] += f"\n\n{human_readable_int_or_auto.__doc__}"
|
||||||
"max_model_len",
|
elif name in ("max_num_batched_tokens", "kv_cache_memory_bytes"):
|
||||||
"max_num_batched_tokens",
|
|
||||||
"kv_cache_memory_bytes",
|
|
||||||
}
|
|
||||||
if name in human_readable_ints:
|
|
||||||
kwargs[name]["type"] = human_readable_int
|
kwargs[name]["type"] = human_readable_int
|
||||||
kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
|
kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
|
||||||
|
else:
|
||||||
|
kwargs[name]["type"] = int
|
||||||
elif contains_type(type_hints, float):
|
elif contains_type(type_hints, float):
|
||||||
kwargs[name]["type"] = float
|
kwargs[name]["type"] = float
|
||||||
elif contains_type(type_hints, dict) and (
|
elif contains_type(type_hints, dict) and (
|
||||||
@ -2042,23 +2040,17 @@ def _raise_unsupported_error(feature_name: str):
|
|||||||
raise NotImplementedError(msg)
|
raise NotImplementedError(msg)
|
||||||
|
|
||||||
|
|
||||||
def human_readable_int(value):
|
def human_readable_int(value: str) -> int:
|
||||||
"""Parse human-readable integers like '1k', '2M', etc.
|
"""Parse human-readable integers like '1k', '2M', etc.
|
||||||
Including decimal values with decimal multipliers.
|
Including decimal values with decimal multipliers.
|
||||||
Also accepts -1 or 'auto' as a special value for auto-detection.
|
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
- '1k' -> 1,000
|
- '1k' -> 1,000
|
||||||
- '1K' -> 1,024
|
- '1K' -> 1,024
|
||||||
- '25.6k' -> 25,600
|
- '25.6k' -> 25,600
|
||||||
- '-1' or 'auto' -> -1 (special value for auto-detection)
|
|
||||||
"""
|
"""
|
||||||
value = value.strip()
|
value = value.strip()
|
||||||
|
|
||||||
# Handle -1 or 'auto' as a special value for auto-detection
|
|
||||||
if value == "-1" or value.lower() == "auto":
|
|
||||||
return -1
|
|
||||||
|
|
||||||
match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
|
match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
|
||||||
if match:
|
if match:
|
||||||
decimal_multiplier = {
|
decimal_multiplier = {
|
||||||
@ -2092,3 +2084,22 @@ def human_readable_int(value):
|
|||||||
|
|
||||||
# Regular plain number.
|
# Regular plain number.
|
||||||
return int(value)
|
return int(value)
|
||||||
|
|
||||||
|
|
||||||
|
def human_readable_int_or_auto(value: str) -> int:
|
||||||
|
"""Parse human-readable integers like '1k', '2M', etc.
|
||||||
|
Including decimal values with decimal multipliers.
|
||||||
|
Also accepts -1 or 'auto' as a special value for auto-detection.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
- '1k' -> 1,000
|
||||||
|
- '1K' -> 1,024
|
||||||
|
- '25.6k' -> 25,600
|
||||||
|
- '-1' or 'auto' -> -1 (special value for auto-detection)
|
||||||
|
"""
|
||||||
|
value = value.strip()
|
||||||
|
|
||||||
|
if value == "-1" or value.lower() == "auto":
|
||||||
|
return -1
|
||||||
|
|
||||||
|
return human_readable_int(value)
|
||||||
|
|||||||
@ -32,6 +32,7 @@ class BenchmarkSubcommand(CLISubcommand):
|
|||||||
) -> FlexibleArgumentParser:
|
) -> FlexibleArgumentParser:
|
||||||
bench_parser = subparsers.add_parser(
|
bench_parser = subparsers.add_parser(
|
||||||
self.name,
|
self.name,
|
||||||
|
help=self.help,
|
||||||
description=self.help,
|
description=self.help,
|
||||||
usage=f"vllm {self.name} <bench_type> [options]",
|
usage=f"vllm {self.name} <bench_type> [options]",
|
||||||
)
|
)
|
||||||
|
|||||||
@ -66,7 +66,11 @@ class ServeSubcommand(CLISubcommand):
|
|||||||
self, subparsers: argparse._SubParsersAction
|
self, subparsers: argparse._SubParsersAction
|
||||||
) -> FlexibleArgumentParser:
|
) -> FlexibleArgumentParser:
|
||||||
serve_parser = subparsers.add_parser(
|
serve_parser = subparsers.add_parser(
|
||||||
self.name, description=DESCRIPTION, usage="vllm serve [model_tag] [options]"
|
self.name,
|
||||||
|
help="Launch a local OpenAI-compatible API server to serve LLM "
|
||||||
|
"completions via HTTP.",
|
||||||
|
description=DESCRIPTION,
|
||||||
|
usage="vllm serve [model_tag] [options]",
|
||||||
)
|
)
|
||||||
|
|
||||||
serve_parser = make_arg_parser(serve_parser)
|
serve_parser = make_arg_parser(serve_parser)
|
||||||
|
|||||||
@ -43,7 +43,7 @@ async def scale_elastic_ep(raw_request: Request):
|
|||||||
try:
|
try:
|
||||||
body = await raw_request.json()
|
body = await raw_request.json()
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
raise HTTPException(status_code=400, detail="Invalid JSON format") from e # noqa: B904
|
raise HTTPException(status_code=400, detail="Invalid JSON format") from e
|
||||||
|
|
||||||
new_data_parallel_size = body.get("new_data_parallel_size")
|
new_data_parallel_size = body.get("new_data_parallel_size")
|
||||||
drain_timeout = body.get("drain_timeout", 120) # Default 2 minutes
|
drain_timeout = body.get("drain_timeout", 120) # Default 2 minutes
|
||||||
|
|||||||
@ -625,8 +625,9 @@ def silu_mul_per_token_group_quant_fp8_colmajor(
|
|||||||
M, N = input.size()
|
M, N = input.size()
|
||||||
N_2 = N // 2
|
N_2 = N // 2
|
||||||
|
|
||||||
|
fp8_dtype = current_platform.fp8_dtype()
|
||||||
if output is None:
|
if output is None:
|
||||||
output = torch.empty((M, N_2), dtype=torch.float8_e4m3fn, device=input.device)
|
output = torch.empty((M, N_2), dtype=fp8_dtype, device=input.device)
|
||||||
|
|
||||||
output_scales = torch.empty(
|
output_scales = torch.empty(
|
||||||
((N_2 // GROUP_SIZE), M), dtype=torch.float32, device=input.device
|
((N_2 // GROUP_SIZE), M), dtype=torch.float32, device=input.device
|
||||||
@ -637,9 +638,12 @@ def silu_mul_per_token_group_quant_fp8_colmajor(
|
|||||||
assert M % BLOCK_M == 0
|
assert M % BLOCK_M == 0
|
||||||
assert N_2 % BLOCK_N == 0
|
assert N_2 % BLOCK_N == 0
|
||||||
|
|
||||||
finfo = torch.finfo(torch.float8_e4m3fn)
|
# Using the default value (240.0) from pytorch will cause accuracy
|
||||||
fp8_min = finfo.min
|
# issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm
|
||||||
fp8_max = finfo.max
|
# platforms that use the torch.float8_e4m3fnuz dtype.
|
||||||
|
finfo = torch.finfo(fp8_dtype)
|
||||||
|
fp8_min = -224.0 if current_platform.is_fp8_fnuz() else finfo.min
|
||||||
|
fp8_max = 224.0 if current_platform.is_fp8_fnuz() else finfo.max
|
||||||
|
|
||||||
# Force even division so we can avoid edgecases within the kernel.
|
# Force even division so we can avoid edgecases within the kernel.
|
||||||
assert M % BLOCK_M == 0
|
assert M % BLOCK_M == 0
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# ruff: noqa: SIM117
|
|
||||||
import fnmatch
|
import fnmatch
|
||||||
import glob
|
import glob
|
||||||
import itertools
|
import itertools
|
||||||
@ -59,7 +58,7 @@ def is_moe_model(model: torch.nn.Module) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
class BitsAndBytesModelLoader(BaseModelLoader):
|
class BitsAndBytesModelLoader(BaseModelLoader):
|
||||||
"""Model loader to load model weights with BitAndBytes quantization."""
|
"""Model loader to load model weights with BitsAndBytes quantization."""
|
||||||
|
|
||||||
possible_config_file_names = ["adapter_config.json"]
|
possible_config_file_names = ["adapter_config.json"]
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# ruff: noqa: SIM117
|
|
||||||
import os
|
import os
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
|
|
||||||
|
|||||||
@ -13,7 +13,7 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
|||||||
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
|
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@ -21,20 +21,24 @@ logger = init_logger(__name__)
|
|||||||
class VerifyAndUpdateConfig:
|
class VerifyAndUpdateConfig:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
||||||
raise NotImplementedError
|
return
|
||||||
|
|
||||||
|
|
||||||
class Gemma3TextModelConfig:
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
hf_config = vllm_config.model_config.hf_config
|
return
|
||||||
|
|
||||||
|
|
||||||
|
class Gemma3TextModelConfig(VerifyAndUpdateConfig):
|
||||||
|
@staticmethod
|
||||||
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
|
hf_config = model_config.hf_config
|
||||||
hf_config.is_causal = not hf_config.use_bidirectional_attention
|
hf_config.is_causal = not hf_config.use_bidirectional_attention
|
||||||
|
|
||||||
|
|
||||||
class GteNewModelConfig(VerifyAndUpdateConfig):
|
class GteNewModelConfig(VerifyAndUpdateConfig):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
config = vllm_config.model_config.hf_config
|
config = model_config.hf_config
|
||||||
|
|
||||||
assert config.__class__.__name__ == "NewConfig"
|
assert config.__class__.__name__ == "NewConfig"
|
||||||
assert config.hidden_act == "gelu"
|
assert config.hidden_act == "gelu"
|
||||||
@ -53,16 +57,15 @@ class GteNewModelConfig(VerifyAndUpdateConfig):
|
|||||||
|
|
||||||
class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
pooler_config = vllm_config.model_config.pooler_config
|
pooler_config = model_config.pooler_config
|
||||||
if pooler_config.use_activation is None:
|
if pooler_config.use_activation is None:
|
||||||
pooler_config.use_activation = False
|
pooler_config.use_activation = False
|
||||||
|
|
||||||
|
|
||||||
class JinaRobertaModelConfig(VerifyAndUpdateConfig):
|
class JinaRobertaModelConfig(VerifyAndUpdateConfig):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
model_config = vllm_config.model_config
|
|
||||||
config = model_config.hf_config
|
config = model_config.hf_config
|
||||||
|
|
||||||
if config.position_embedding_type == "rotary":
|
if config.position_embedding_type == "rotary":
|
||||||
@ -90,10 +93,10 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
|
|||||||
|
|
||||||
class LlamaBidirectionalConfig(VerifyAndUpdateConfig):
|
class LlamaBidirectionalConfig(VerifyAndUpdateConfig):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
from vllm.config.pooler import PoolingTypeStr
|
from vllm.config.pooler import PoolingTypeStr
|
||||||
|
|
||||||
hf_config = vllm_config.model_config.hf_config
|
hf_config = model_config.hf_config
|
||||||
hf_config.is_causal = False
|
hf_config.is_causal = False
|
||||||
|
|
||||||
pooling_type_map: dict[str, PoolingTypeStr] = {
|
pooling_type_map: dict[str, PoolingTypeStr] = {
|
||||||
@ -105,7 +108,7 @@ class LlamaBidirectionalConfig(VerifyAndUpdateConfig):
|
|||||||
pooling_type = pooling_type_map.get(hf_config.pooling, None)
|
pooling_type = pooling_type_map.get(hf_config.pooling, None)
|
||||||
if pooling_type is None:
|
if pooling_type is None:
|
||||||
raise ValueError(f"pool_type {hf_config.pooling} not supported")
|
raise ValueError(f"pool_type {hf_config.pooling} not supported")
|
||||||
vllm_config.model_config.pooler_config.pooling_type = pooling_type
|
model_config.pooler_config.pooling_type = pooling_type
|
||||||
|
|
||||||
|
|
||||||
class NomicBertModelConfig(VerifyAndUpdateConfig):
|
class NomicBertModelConfig(VerifyAndUpdateConfig):
|
||||||
@ -204,8 +207,8 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
|
|||||||
|
|
||||||
class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
|
class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
pooler_config = vllm_config.model_config.pooler_config
|
pooler_config = model_config.pooler_config
|
||||||
|
|
||||||
if pooler_config.step_tag_id is None:
|
if pooler_config.step_tag_id is None:
|
||||||
pooler_config.step_tag_id = 151651
|
pooler_config.step_tag_id = 151651
|
||||||
@ -213,8 +216,8 @@ class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
|
|||||||
|
|
||||||
class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):
|
class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
pooler_config = vllm_config.model_config.pooler_config
|
pooler_config = model_config.pooler_config
|
||||||
|
|
||||||
if pooler_config.softmax is None:
|
if pooler_config.softmax is None:
|
||||||
pooler_config.softmax = False
|
pooler_config.softmax = False
|
||||||
@ -222,8 +225,8 @@ class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):
|
|||||||
|
|
||||||
class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
config = vllm_config.model_config.hf_config
|
config = model_config.hf_config
|
||||||
|
|
||||||
is_original_qwen3_reranker = getattr(
|
is_original_qwen3_reranker = getattr(
|
||||||
config, "is_original_qwen3_reranker", False
|
config, "is_original_qwen3_reranker", False
|
||||||
@ -237,23 +240,23 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
|||||||
"Try loading the original Qwen3 Reranker?, see: "
|
"Try loading the original Qwen3 Reranker?, see: "
|
||||||
"https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py"
|
"https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py"
|
||||||
)
|
)
|
||||||
vllm_config.model_config.hf_config.method = "from_2_way_softmax"
|
model_config.hf_config.method = "from_2_way_softmax"
|
||||||
|
|
||||||
|
|
||||||
class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
config = vllm_config.model_config.hf_config
|
config = model_config.hf_config
|
||||||
config.num_labels = 1
|
config.num_labels = 1
|
||||||
pooler_config = vllm_config.model_config.pooler_config
|
pooler_config = model_config.pooler_config
|
||||||
if pooler_config.logit_bias is None:
|
if pooler_config.logit_bias is None:
|
||||||
pooler_config.logit_bias = 2.65
|
pooler_config.logit_bias = 2.65
|
||||||
|
|
||||||
|
|
||||||
class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
|
class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||||
config = vllm_config.model_config.hf_config
|
config = model_config.hf_config
|
||||||
|
|
||||||
assert config.__class__.__name__ == "GteConfig"
|
assert config.__class__.__name__ == "GteConfig"
|
||||||
assert config.hidden_act == "gelu"
|
assert config.hidden_act == "gelu"
|
||||||
|
|||||||
@ -64,7 +64,6 @@ from .interfaces import (
|
|||||||
SupportsLoRA,
|
SupportsLoRA,
|
||||||
SupportsPP,
|
SupportsPP,
|
||||||
)
|
)
|
||||||
from .interfaces_base import attn_type
|
|
||||||
from .utils import (
|
from .utils import (
|
||||||
AutoWeightsLoader,
|
AutoWeightsLoader,
|
||||||
PPMissingLayer,
|
PPMissingLayer,
|
||||||
@ -707,14 +706,12 @@ class LlamaForCausalLM(
|
|||||||
return name, loaded_weight
|
return name, loaded_weight
|
||||||
|
|
||||||
|
|
||||||
@attn_type("encoder_only")
|
|
||||||
class LlamaBidirectionalForSequenceClassification(as_seq_cls_model(LlamaForCausalLM)):
|
class LlamaBidirectionalForSequenceClassification(as_seq_cls_model(LlamaForCausalLM)):
|
||||||
# This class sets the correct attention type and pooling type
|
# This class sets the correct attention type and pooling type
|
||||||
# through LlamaBidirectionalConfig.
|
# through LlamaBidirectionalConfig.
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@attn_type("encoder_only")
|
|
||||||
class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)):
|
class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)):
|
||||||
# This class sets the correct attention type and pooling type
|
# This class sets the correct attention type and pooling type
|
||||||
# through LlamaBidirectionalConfig.
|
# through LlamaBidirectionalConfig.
|
||||||
|
|||||||
@ -606,6 +606,43 @@ def get_request_block_hasher(
|
|||||||
return request_block_hasher
|
return request_block_hasher
|
||||||
|
|
||||||
|
|
||||||
|
def _check_enough_kv_cache_memory(
|
||||||
|
available_memory: int,
|
||||||
|
get_needed_memory: Callable[[], int],
|
||||||
|
max_model_len: int,
|
||||||
|
estimate_max_model_len: Callable[[int], int],
|
||||||
|
):
|
||||||
|
if available_memory <= 0:
|
||||||
|
raise ValueError(
|
||||||
|
"No available memory for the cache blocks. "
|
||||||
|
"Try increasing `gpu_memory_utilization` when initializing the engine. "
|
||||||
|
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
|
||||||
|
"for more details."
|
||||||
|
)
|
||||||
|
|
||||||
|
needed_memory = get_needed_memory()
|
||||||
|
|
||||||
|
if needed_memory > available_memory:
|
||||||
|
estimated_max_len = estimate_max_model_len(available_memory)
|
||||||
|
estimated_msg = ""
|
||||||
|
if estimated_max_len > 0:
|
||||||
|
estimated_msg = (
|
||||||
|
"Based on the available memory, "
|
||||||
|
f"the estimated maximum model length is {estimated_max_len}. "
|
||||||
|
)
|
||||||
|
|
||||||
|
raise ValueError(
|
||||||
|
f"To serve at least one request with the models's max seq len "
|
||||||
|
f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
|
||||||
|
f"cache is needed, which is larger than the available KV cache "
|
||||||
|
f"memory ({available_memory / GiB_bytes:.2f} GiB). {estimated_msg}"
|
||||||
|
f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
|
||||||
|
f"when initializing the engine. "
|
||||||
|
f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
|
||||||
|
f"for more details."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def max_memory_usage_bytes(
|
def max_memory_usage_bytes(
|
||||||
vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec]
|
vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec]
|
||||||
) -> int:
|
) -> int:
|
||||||
@ -688,43 +725,12 @@ def check_enough_kv_cache_memory(
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# No need to check for available memory if the kv_cache_spec is empty
|
# No need to check for available memory if the kv_cache_spec is empty
|
||||||
if not kv_cache_spec:
|
if kv_cache_spec:
|
||||||
return
|
_check_enough_kv_cache_memory(
|
||||||
|
available_memory,
|
||||||
if available_memory <= 0:
|
lambda: max_memory_usage_bytes(vllm_config, kv_cache_spec.values()),
|
||||||
raise ValueError(
|
vllm_config.model_config.max_model_len,
|
||||||
"No available memory for the cache blocks. "
|
lambda am: estimate_max_model_len(vllm_config, kv_cache_spec, am),
|
||||||
"Try increasing `gpu_memory_utilization` when "
|
|
||||||
"initializing the engine. "
|
|
||||||
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
|
|
||||||
"for more details."
|
|
||||||
)
|
|
||||||
|
|
||||||
max_model_len = vllm_config.model_config.max_model_len
|
|
||||||
needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
|
|
||||||
|
|
||||||
if needed_memory > available_memory:
|
|
||||||
# Estimate the maximum model length that can fit in the available memory
|
|
||||||
estimated_max_len = estimate_max_model_len(
|
|
||||||
vllm_config, kv_cache_spec, available_memory
|
|
||||||
)
|
|
||||||
estimated_msg = ""
|
|
||||||
if estimated_max_len > 0:
|
|
||||||
estimated_msg = (
|
|
||||||
"Based on the available memory, "
|
|
||||||
f"the estimated maximum model length is {estimated_max_len}."
|
|
||||||
)
|
|
||||||
|
|
||||||
raise ValueError(
|
|
||||||
f"To serve at least one request with the models's max seq len "
|
|
||||||
f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
|
|
||||||
f"cache is needed, which is larger than the available KV cache "
|
|
||||||
f"memory ({available_memory / GiB_bytes:.2f} GiB). "
|
|
||||||
f"{estimated_msg} "
|
|
||||||
f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
|
|
||||||
f"when initializing the engine. "
|
|
||||||
f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
|
|
||||||
f"for more details."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -1586,36 +1592,16 @@ def get_kv_cache_configs(
|
|||||||
# Check if the available memory is enough (using min across all workers).
|
# Check if the available memory is enough (using min across all workers).
|
||||||
# We use the global groups to correctly account for padding.
|
# We use the global groups to correctly account for padding.
|
||||||
if global_kv_cache_groups:
|
if global_kv_cache_groups:
|
||||||
min_available_memory = min(available_memory)
|
_check_enough_kv_cache_memory(
|
||||||
if min_available_memory <= 0:
|
min(available_memory),
|
||||||
raise ValueError(
|
lambda: _max_memory_usage_bytes_from_groups(
|
||||||
"No available memory for the cache blocks. "
|
vllm_config, global_kv_cache_groups
|
||||||
"Try increasing `gpu_memory_utilization` when "
|
),
|
||||||
"initializing the engine."
|
vllm_config.model_config.max_model_len,
|
||||||
)
|
lambda am: _estimate_max_model_len_from_groups(
|
||||||
max_model_len = vllm_config.model_config.max_model_len
|
vllm_config, global_kv_cache_groups, am
|
||||||
needed_memory = _max_memory_usage_bytes_from_groups(
|
),
|
||||||
vllm_config, global_kv_cache_groups
|
|
||||||
)
|
)
|
||||||
if needed_memory > min_available_memory:
|
|
||||||
estimated_max_len = _estimate_max_model_len_from_groups(
|
|
||||||
vllm_config, global_kv_cache_groups, min_available_memory
|
|
||||||
)
|
|
||||||
estimated_msg = ""
|
|
||||||
if estimated_max_len > 0:
|
|
||||||
estimated_msg = (
|
|
||||||
f"Based on the available memory, the estimated maximum "
|
|
||||||
f"model length is {estimated_max_len}. "
|
|
||||||
)
|
|
||||||
raise ValueError(
|
|
||||||
f"To serve at least one request with the models's max seq len "
|
|
||||||
f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
|
|
||||||
f"cache is needed, which is larger than the available KV cache "
|
|
||||||
f"memory ({min_available_memory / GiB_bytes:.2f} GiB). "
|
|
||||||
f"{estimated_msg}"
|
|
||||||
f"Try increasing `gpu_memory_utilization` or decreasing "
|
|
||||||
f"`max_model_len` when initializing the engine."
|
|
||||||
)
|
|
||||||
|
|
||||||
kv_cache_configs: list[KVCacheConfig] = []
|
kv_cache_configs: list[KVCacheConfig] = []
|
||||||
for kv_cache_spec_one_worker, available_memory_one_worker in zip(
|
for kv_cache_spec_one_worker, available_memory_one_worker in zip(
|
||||||
|
|||||||
@ -6,9 +6,7 @@ Define EC connector functionality mixin for model runners.
|
|||||||
|
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from contextlib import AbstractContextManager, contextmanager, nullcontext
|
from contextlib import AbstractContextManager, contextmanager, nullcontext
|
||||||
from typing import (
|
from typing import TYPE_CHECKING
|
||||||
TYPE_CHECKING, # noqa: UP035
|
|
||||||
)
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|||||||
@ -7,9 +7,7 @@ Define KV connector functionality mixin for model runners.
|
|||||||
import copy
|
import copy
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from contextlib import AbstractContextManager, contextmanager, nullcontext
|
from contextlib import AbstractContextManager, contextmanager, nullcontext
|
||||||
from typing import (
|
from typing import TYPE_CHECKING
|
||||||
TYPE_CHECKING, # noqa: UP035
|
|
||||||
)
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user