Merge branch 'main' into feat/vanisimov/kv_cache_groups_optimization

This commit is contained in:
vovani 2025-12-24 18:20:24 +01:00 committed by GitHub
commit 437ac4e047
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
58 changed files with 690 additions and 519 deletions

View File

@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm. # We can use this script to compute baseline accuracy on chartqa for vllm.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9 # pip install "lm-eval[api]>=0.4.9.2"
usage() { usage() {
echo`` echo``

View File

@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers. # We can use this script to compute baseline accuracy on GSM for transformers.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] # pip install "lm-eval[api]>=0.4.9.2"
usage() { usage() {
echo`` echo``

View File

@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] # pip install "lm-eval[api]>=0.4.9.2"
usage() { usage() {
echo`` echo``

View File

@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] # pip install "lm-eval[api]>=0.4.9.2"
usage() { usage() {
echo`` echo``

View File

@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@ -24,6 +24,8 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
#ifndef VLLM_NUMA_DISABLED #ifndef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) { std::string init_cpu_threads_env(const std::string& cpu_ids) {
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str()); bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
TORCH_CHECK(omp_cpu_mask != nullptr,
"Failed to parse CPU string: " + cpu_ids);
TORCH_CHECK(omp_cpu_mask->size > 0); TORCH_CHECK(omp_cpu_mask->size > 0);
std::vector<int> omp_cpu_ids; std::vector<int> omp_cpu_ids;
omp_cpu_ids.reserve(omp_cpu_mask->size); omp_cpu_ids.reserve(omp_cpu_mask->size);
@ -44,20 +46,12 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
// Memory node binding // Memory node binding
if (numa_available() != -1) { if (numa_available() != -1) {
int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
std::set<int> node_ids; std::set<int> node_ids;
for (const auto& cpu_id : omp_cpu_ids) { for (const auto& cpu_id : omp_cpu_ids) {
int node_id = numa_node_of_cpu(cpu_id); int node_id = numa_node_of_cpu(cpu_id);
if (node_id != -1) { if (node_id != -1) {
node_ids.insert(node_id); node_ids.insert(node_id);
} }
if (node_id != mem_node_id) {
TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
". All CPUs should be on the same NUMA node for optimal "
"performance. Memory will be bound to NUMA node ",
mem_node_id, ".");
}
} }
// Concatenate all node_ids into a single comma-separated string // Concatenate all node_ids into a single comma-separated string
if (!node_ids.empty()) { if (!node_ids.empty()) {
@ -70,7 +64,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
} }
bitmask* mask = numa_parse_nodestring(node_ids_str.c_str()); bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
bitmask* src_mask = numa_get_membind(); bitmask* src_mask = numa_get_mems_allowed();
int pid = getpid(); int pid = getpid();
@ -83,15 +77,46 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
std::to_string(errno)); std::to_string(errno));
} }
// restrict memory allocation node. // Restrict memory allocation to the selected NUMA node(s).
numa_set_membind(mask); // Enhances memory locality for the threads bound to those NUMA CPUs.
if (node_ids.size() > 1) {
errno = 0;
numa_set_interleave_mask(mask);
if (errno != 0) {
TORCH_WARN("numa_set_interleave_mask failed. errno: " +
std::to_string(errno));
} else {
TORCH_WARN(
"NUMA binding: Using INTERLEAVE policy for memory "
"allocation across multiple NUMA nodes (nodes: " +
node_ids_str +
"). Memory allocations will be "
"interleaved across the specified NUMA nodes.");
}
} else {
errno = 0;
numa_set_membind(mask);
if (errno != 0) {
TORCH_WARN("numa_set_membind failed. errno: " +
std::to_string(errno));
} else {
TORCH_WARN(
"NUMA binding: Using MEMBIND policy for memory "
"allocation on the NUMA nodes (" +
node_ids_str +
"). Memory allocations will be "
"strictly bound to these NUMA nodes.");
}
}
numa_set_strict(1); numa_set_strict(1);
numa_free_nodemask(mask); numa_free_nodemask(mask);
numa_free_nodemask(src_mask); numa_free_nodemask(src_mask);
} else { } else {
TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " + TORCH_WARN(
std::to_string(errno)); "numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
std::to_string(errno));
} }
} }
} }

View File

@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio
Install `vllm` and `lm-evaluation-harness` for evaluation: Install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] pip install vllm "lm-eval[api]>=0.4.9.2"
``` ```
Load and run the model in `vllm`: Load and run the model in `vllm`:

View File

@ -18,7 +18,7 @@ pip install llmcompressor
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] pip install vllm "lm-eval[api]>=0.4.9.2"
``` ```
## Quantization Process ## Quantization Process

View File

@ -23,7 +23,7 @@ pip install llmcompressor
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] pip install vllm "lm-eval[api]>=0.4.9.2"
``` ```
## Quantization Process ## Quantization Process

View File

@ -20,7 +20,7 @@ for more installation details.
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] pip install vllm "lm-eval[api]>=0.4.9.2"
``` ```
## Quantization Process ## Quantization Process

View File

@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.5 # required for voxtral test
num2words # required for smolvlm test num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test lm-eval[api]>=0.4.9.2 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test mteb>=1.38.11, <2 # required for mteb test
transformers==4.57.3 transformers==4.57.3
tokenizers==0.22.0 tokenizers==0.22.0

View File

@ -58,7 +58,7 @@ schemathesis==3.39.15
# OpenAI schema test # OpenAI schema test
# Evaluation and benchmarking # Evaluation and benchmarking
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d lm-eval[api]>=0.4.9.2
jiwer==4.0.0 jiwer==4.0.0
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test

View File

@ -34,8 +34,7 @@ num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test open_clip_torch==2.32.0 # Required for nemotron_vl test
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
# TODO: Use lm-eval[api]==0.4.10 once released lm-eval[api]>=0.4.9.2 # required for model evaluation test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.3 transformers==4.57.3
tokenizers==0.22.0 tokenizers==0.22.0

View File

@ -441,7 +441,7 @@ lightning-utilities==0.14.3
# torchmetrics # torchmetrics
llvmlite==0.44.0 llvmlite==0.44.0
# via numba # via numba
lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d lm-eval==0.4.9.2
# via -r requirements/test.in # via -r requirements/test.in
lxml==5.3.0 lxml==5.3.0
# via # via

View File

@ -410,7 +410,7 @@ class HfRunner:
# don't put this import at the top level # don't put this import at the top level
# it will call torch.cuda.device_count() # it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401 from transformers import AutoProcessor
self.processor = AutoProcessor.from_pretrained( self.processor = AutoProcessor.from_pretrained(
model_name, model_name,

View File

@ -15,7 +15,7 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): # noqa: F811 def server():
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",

View File

@ -28,7 +28,7 @@ def zephyr_lora_files():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(zephyr_lora_files): # noqa: F811 def server(zephyr_lora_files):
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",

View File

@ -12,7 +12,7 @@ MODEL_NAME = "Qwen/QwQ-32B"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): # noqa: F811 def server():
args = [ args = [
"--max-model-len", "--max-model-len",
"8192", "8192",

View File

@ -125,7 +125,7 @@ messages = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): # noqa: F811 def server():
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
@ -212,7 +212,7 @@ async def test_function_tool_use(
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def k2_server(): # noqa: F811 def k2_server():
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",

View File

@ -23,7 +23,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def multimodal_server(): # noqa: F811 def multimodal_server():
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",

View File

@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def chat_server_with_force_include_usage(request): # noqa: F811 def chat_server_with_force_include_usage(request):
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",

View File

@ -11,7 +11,7 @@ MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): # noqa: F811 def server():
args = [ args = [
"--max-model-len", "--max-model-len",
"2048", "2048",

View File

@ -37,7 +37,7 @@ def default_server_args(qwen3_lora_files):
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server_fixture(request, default_server_args): # noqa: F811 def server_fixture(request, default_server_args):
use_server_flag = request.param use_server_flag = request.param
if use_server_flag: if use_server_flag:
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"] args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]

View File

@ -4,7 +4,7 @@ import os
import pytest import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
MTEB_EMBED_TASKS, MTEB_EMBED_TASKS,
MTEB_EMBED_TOL, MTEB_EMBED_TOL,
OpenAIClientMtebEncoder, OpenAIClientMtebEncoder,

View File

@ -4,7 +4,7 @@ import os
import pytest import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.models.language.pooling_mteb_test.mteb_score_utils import (
MTEB_RERANK_LANGS, MTEB_RERANK_LANGS,
MTEB_RERANK_TASKS, MTEB_RERANK_TASKS,
MTEB_RERANK_TOL, MTEB_RERANK_TOL,

View File

@ -202,11 +202,10 @@ class TestGetScorePrompt:
tokenization_kwargs, tokenization_kwargs,
mock_model_no_score_template, mock_model_no_score_template,
): ):
# FIXME: Models implementing SupportsScoreTemplate must use their custom # FIXME: For now, we only apply a template when one is explicitly provided.
# template implementation by default to preserve existing functionality. # We cannot rely on the tokenizer's chat template because many models
# Attempting to use tokenizer_config.json templates would most likely break # inherit junk templates from their base LLM, which breaks both the models
# these models, as often they just inherit the template from the original LLM. # and the tests that use them.
# CLI --chat-template overrides are still supported.
with ( with (
patch( patch(
"vllm.model_executor.model_loader.get_model_cls", "vllm.model_executor.model_loader.get_model_cls",

View File

@ -0,0 +1,228 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import mteb
import numpy as np
import torch
from mteb.models import ModelMeta
from mteb.types import Array
from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs
from tests.models.utils import (
EmbedModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs,
)
# Tolerance rationale for embedding models on the STS12 task (see #17175):
# - Model implementation and minor changes in tensor dtype
#   result in differences less than 1e-4
# - A different model results in differences more than 1e-3
# 1e-4 is therefore a good tolerance threshold.
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4
# Placeholder ModelMeta used as `mteb_model_meta` by the encoder mixin in this
# module. Only name, revision, framework and modalities carry a value; every
# other field is explicitly None.
_empty_model_meta = ModelMeta(
loader=None,
name="vllm/model",
revision="1",
release_date=None,
languages=None,
framework=[],
similarity_fn_name=None,
n_parameters=None,
memory_usage_mb=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
public_training_data=None,
use_instructions=None,
training_datasets=None,
modalities=["text"], # 'image' can be added to evaluate multimodal models
)
class MtebEmbedMixin(mteb.EncoderProtocol):
    """Cosine-similarity helpers shared by the MTEB embedding encoders."""

    mteb_model_meta = _empty_model_meta

    def similarity(
        self,
        embeddings1: np.ndarray,
        embeddings2: np.ndarray,
    ) -> np.ndarray:
        """Return the cosine similarity of every row of ``embeddings1``
        against every row of ``embeddings2`` as a 2-D matrix."""
        left_norms = np.linalg.norm(embeddings1, axis=1, keepdims=True)
        right_norms = np.linalg.norm(embeddings2, axis=1, keepdims=True)
        dot_products = np.dot(embeddings1, embeddings2.T)
        return dot_products / (left_norms * right_norms.T)

    def similarity_pairwise(
        self,
        embeddings1: Array,
        embeddings2: Array,
    ) -> Array:
        """Return the cosine similarity of row ``i`` of ``embeddings1``
        with row ``i`` of ``embeddings2``, one value per row."""
        left_norms = np.linalg.norm(embeddings1, axis=1, keepdims=True)
        right_norms = np.linalg.norm(embeddings2, axis=1, keepdims=True)
        row_dots = np.sum(embeddings1 * embeddings2, axis=1)
        norm_products = left_norms.flatten() * right_norms.flatten()
        return row_dots / norm_products
class VllmMtebEncoder(MtebEmbedMixin):
    """MTEB encoder that embeds text through the wrapped vLLM model."""

    def __init__(self, vllm_model):
        self.llm = vllm_model
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        inputs: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed every text in ``inputs`` and return the vectors in input order."""
        texts = []
        for batch in inputs:
            texts.extend(batch["text"])

        # Submit the texts in a shuffled order, hoping to surface
        # potential scheduling issues.
        order = self.rng.permutation(len(texts))
        shuffled = [texts[idx] for idx in order]
        raw_outputs = self.llm.embed(shuffled, use_tqdm=False)

        # argsort of the permutation is its inverse: it puts the
        # embeddings back into the original input order.
        vectors = np.array(raw_outputs)
        return vectors[np.argsort(order)]
class OpenAIClientMtebEncoder(MtebEmbedMixin):
    """MTEB encoder that obtains embeddings via ``client.embeddings.create``."""

    def __init__(self, model_name: str, client):
        self.model_name = model_name
        self.client = client
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        inputs: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed every text in ``inputs`` and return the vectors in input order."""
        texts = []
        for batch in inputs:
            texts.extend(batch["text"])

        # Submit the texts in a shuffled order, hoping to surface
        # potential scheduling issues.
        order = self.rng.permutation(len(texts))
        shuffled = [texts[idx] for idx in order]
        response = self.client.embeddings.create(
            model=self.model_name, input=shuffled
        )

        # argsort of the permutation is its inverse: it puts the
        # embeddings back into the original input order.
        vectors = np.array([item.embedding for item in response.data])
        return vectors[np.argsort(order)]
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
    """Evaluate ``encoder`` on the named MTEB tasks and return the
    ``main_score`` of the first task's first "test" entry."""
    selected_tasks = mteb.get_tasks(tasks=tasks)
    evaluation = mteb.evaluate(
        encoder,
        selected_tasks,
        cache=None,
        show_progress_bar=False,
    )
    first_result = evaluation[0]
    return first_result.scores["test"][0]["main_score"]
def mteb_test_embed_models(
    hf_runner,
    vllm_runner,
    model_info: EmbedModelInfo,
    vllm_extra_kwargs=None,
    hf_model_callback=None,
    atol=MTEB_EMBED_TOL,
):
    """Compare the vLLM MTEB embedding score of a model against a reference.

    The reference score is ``model_info.mteb_score`` when that is set;
    otherwise it is computed by running the same MTEB task through
    ``hf_runner`` as a SentenceTransformers model. The check is one-sided:
    it fails only when vLLM scores lower than the reference by ``atol`` or
    more.
    """
    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

    # A single very long prompt, used to test embed_dims, isnan and
    # whether to use normalize.
    example_prompts = ["The chef prepared a delicious meal." * 1000]

    with vllm_runner(
        model_info.name,
        runner="pooling",
        max_model_len=model_info.max_model_len,
        **vllm_extra_kwargs,
    ) as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config

        # Confirm whether vllm is using the correct architecture.
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures

        # Confirm whether the important configs in model_config are correct.
        # Each expectation is only checked when model_info specifies it.
        if model_info.pooling_type is not None:
            assert model_config.pooler_config.pooling_type == model_info.pooling_type

        if model_info.attn_type is not None:
            assert model_config.attn_type == model_info.attn_type

        if model_info.is_prefix_caching_supported is not None:
            assert (
                model_config.is_prefix_caching_supported
                == model_info.is_prefix_caching_supported
            )

        if model_info.is_chunked_prefill_supported is not None:
            assert (
                model_config.is_chunked_prefill_supported
                == model_info.is_chunked_prefill_supported
            )

        vllm_main_score = run_mteb_embed_task(
            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
        )
        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
        head_dtype = model_config.head_dtype

        # Test embedding_size, isnan and whether to use normalize.
        vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
        outputs_tensor = torch.tensor(vllm_outputs)
        assert not torch.any(torch.isnan(outputs_tensor))
        embedding_size = model_config.embedding_size
        assert torch.tensor(vllm_outputs).shape[-1] == embedding_size

    # Accelerate the mteb test by setting the SentenceTransformers
    # mteb score to a constant when one is recorded on model_info.
    if model_info.mteb_score is not None:
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"
    else:
        with hf_runner(
            model_info.name,
            is_sentence_transformer=True,
            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
        ) as hf_model:
            # e.g. setting default parameters for the encode method of hf_runner
            if hf_model_callback is not None:
                hf_model_callback(hf_model)

            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
            st_dtype = next(hf_model.model.parameters()).dtype

            # Check embeddings close to hf outputs.
            hf_outputs = hf_model.encode(example_prompts)
            check_embeddings_close(
                embeddings_0_lst=hf_outputs,
                embeddings_1_lst=vllm_outputs,
                name_0="hf",
                name_1="vllm",
                tol=1e-2,
            )

    score_gap = st_main_score - vllm_main_score

    print("Model:", model_info.name)
    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", score_gap)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert score_gap < atol

View File

@ -7,37 +7,24 @@ from pathlib import Path
import mteb import mteb
import numpy as np import numpy as np
import requests import requests
import torch
from mteb.models import ModelMeta from mteb.models import ModelMeta
from mteb.types import Array
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs
from tests.models.utils import ( from tests.models.utils import (
EmbedModelInfo,
RerankModelInfo, RerankModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs, get_vllm_extra_kwargs,
) )
template_home = (
Path(__file__).parent.parent.parent.parent.parent
/ "examples/pooling/score/template"
)
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
# results in differences less than 1e-4
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4
# See #19344 # See #19344
MTEB_RERANK_TASKS = ["NFCorpus"] MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["eng"] MTEB_RERANK_LANGS = ["eng"]
MTEB_RERANK_TOL = 2e-3 MTEB_RERANK_TOL = 2e-3
template_home = (
Path(__file__).parent.parent.parent.parent.parent
/ "examples/pooling/score/template"
)
_empty_model_meta = ModelMeta( _empty_model_meta = ModelMeta(
loader=None, loader=None,
name="vllm/model", name="vllm/model",
@ -60,84 +47,11 @@ _empty_model_meta = ModelMeta(
) )
class VllmMtebEncoder(mteb.EncoderProtocol): class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
def similarity(
self,
embeddings1: np.ndarray,
embeddings2: np.ndarray,
) -> np.ndarray:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
return sim
def similarity_pairwise(
self,
embeddings1: Array,
embeddings2: Array,
) -> Array:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.sum(embeddings1 * embeddings2, axis=1) / (
norm1.flatten() * norm2.flatten()
)
return sim
class OpenAIClientMtebEncoder(VllmMtebEncoder):
def __init__(self, model_name: str, client):
self.model_name = model_name
self.client = client
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
embeddings = self.client.embeddings.create(
model=self.model_name, input=sentences
)
outputs = [d.embedding for d in embeddings.data]
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
def __init__(self, vllm_model): def __init__(self, vllm_model):
self.llm = vllm_model self.llm = vllm_model
self.rng = np.random.default_rng(seed=42) self.rng = np.random.default_rng(seed=42)
@ -164,7 +78,7 @@ class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
return scores return scores
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol): class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
mteb_model_meta = _empty_model_meta mteb_model_meta = _empty_model_meta
def __init__(self, model_name: str, url): def __init__(self, model_name: str, url):
@ -216,102 +130,6 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
return response["results"][0]["relevance_score"] return response["results"][0]["relevance_score"]
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
tasks = mteb.get_tasks(tasks=tasks)
results = mteb.evaluate(
encoder,
tasks,
cache=None,
show_progress_bar=False,
)
main_score = results[0].scores["test"][0]["main_score"]
return main_score
def mteb_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL,
):
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
# Test embed_dims, isnan and whether to use normalize
example_prompts = ["The chef prepared a delicious meal." * 1000]
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=model_info.max_model_len,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert model_info.architecture in model_config.architectures
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
vllm_main_score = run_mteb_embed_task(
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
head_dtype = model_config.head_dtype
# Test embedding_size, isnan and whether to use normalize
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
outputs_tensor = torch.tensor(vllm_outputs)
assert not torch.any(torch.isnan(outputs_tensor))
embedding_size = model_config.embedding_size
assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
with hf_runner(
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
# e.g. setting default parameters for the encode method of hf_runner
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype
# Check embeddings close to hf outputs
hf_outputs = hf_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages): def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
with tempfile.TemporaryDirectory() as prediction_folder: with tempfile.TemporaryDirectory() as prediction_folder:
bm25s = mteb.get_model("bm25s") bm25s = mteb.get_model("bm25s")
@ -391,18 +209,28 @@ def mteb_test_rerank_models(
# Score API is only enabled for num_labels == 1 # Score API is only enabled for num_labels == 1
assert model_config.hf_config.num_labels == 1 assert model_config.hf_config.num_labels == 1
# Confirm whether vllm uses the correct default_pooling_type, which # Maybe load chat_template.
# relates to whether chunked prefill and prefix caching are enabled
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
chat_template: str | None = None chat_template: str | None = None
if model_info.chat_template_name is not None: if model_info.chat_template_name is not None:
chat_template = (template_home / model_info.chat_template_name).read_text() chat_template = (template_home / model_info.chat_template_name).read_text()
vllm_model.chat_template = chat_template vllm_model.chat_template = chat_template
# Confirm whether the important configs in model_config are correct.
if model_info.pooling_type is not None:
assert model_config.pooler_config.pooling_type == model_info.pooling_type
if model_info.attn_type is not None:
assert model_config.attn_type == model_info.attn_type
if model_info.is_prefix_caching_supported is not None:
assert (
model_config.is_prefix_caching_supported
== model_info.is_prefix_caching_supported
)
if model_info.is_chunked_prefill_supported is not None:
assert (
model_config.is_chunked_prefill_supported
== model_info.is_chunked_prefill_supported
)
vllm_main_score = run_mteb_rerank( vllm_main_score = run_mteb_rerank(
vllm_mteb_encoder(vllm_model), vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS, tasks=MTEB_RERANK_TASKS,

View File

@ -4,90 +4,94 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [ MODELS = [
########## BertModel ########## BertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-base-en", "BAAI/bge-base-en",
architecture="BertModel", architecture="BertModel",
mteb_score=0.779336792, mteb_score=0.779336792,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
), EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
CLSPoolingEmbedModelInfo( EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
"BAAI/bge-small-en", architecture="BertModel", enable_test=False EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
), EmbedModelInfo(
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
), ),
########## XLMRobertaModel ########## XLMRobertaModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-m3", "BAAI/bge-m3",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
mteb_score=0.787343078, mteb_score=0.787343078,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
########## Qwen2Model ########## Qwen2Model
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-code-v1", "BAAI/bge-code-v1",
architecture="Qwen2Model", architecture="Qwen2Model",
mteb_score=0.75724465, mteb_score=0.75724465,
dtype="float32", dtype="float32",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True, enable_test=True,
), ),
] ]
RERANK_MODELS = [ RERANK_MODELS = [
########## XLMRobertaForSequenceClassification ########## XLMRobertaForSequenceClassification
CLSPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-base", "BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398, mteb_score=0.32398,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-large", "BAAI/bge-reranker-large",
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
enable_test=False, enable_test=False,
), ),
CLSPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-v2-m3", "BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
enable_test=False, enable_test=False,

View File

@ -9,14 +9,12 @@ import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.models.utils import RerankModelInfo
VllmMtebCrossEncoder,
mteb_test_rerank_models, from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models
)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-v2-gemma", "BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification", architecture="GemmaForSequenceClassification",
mteb_score=0.33757, mteb_score=0.33757,
@ -25,6 +23,10 @@ RERANK_MODELS = [
"classifier_from_token": ["Yes"], "classifier_from_token": ["Yes"],
"method": "no_post_processing", "method": "no_post_processing",
}, },
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
), ),
] ]

View File

@ -3,23 +3,29 @@
import pytest import pytest
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_rerank_models from .mteb_score_utils import mteb_test_rerank_models
RERANK_MODELS = [ RERANK_MODELS = [
CLSPoolingRerankModelInfo( RerankModelInfo(
"cross-encoder/ms-marco-TinyBERT-L-2-v2", "cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898, mteb_score=0.32898,
architecture="BertForSequenceClassification", architecture="BertForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
), ),
LASTPoolingRerankModelInfo( RerankModelInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736, mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
), ),
] ]

View File

@ -5,36 +5,32 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [ MODELS = [
########## BertModel ########## BertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"thenlper/gte-large", "thenlper/gte-large",
mteb_score=0.76807651, mteb_score=0.76807651,
architecture="BertModel", architecture="BertModel",
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
"thenlper/gte-base", architecture="BertModel", enable_test=False EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
), EmbedModelInfo(
CLSPoolingEmbedModelInfo(
"thenlper/gte-small", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False "thenlper/gte-large-zh", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False EmbedModelInfo(
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False "thenlper/gte-small-zh", architecture="BertModel", enable_test=False
), ),
########### NewModel ########### NewModel
@ -43,48 +39,64 @@ MODELS = [
# - whether to use token_type_embeddings # - whether to use token_type_embeddings
# - whether to use context expansion # - whether to use context expansion
# So only test one (the most widely used) model # So only test one (the most widely used) model
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-multilingual-base", "Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel", architecture="GteNewModel",
mteb_score=0.775074696, mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]}, hf_overrides={"architectures": ["GteNewModel"]},
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-base-en-v1.5", "Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel", architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]}, hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-large-en-v1.5", "Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel", architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]}, hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False, enable_test=False,
), ),
########### Qwen2ForCausalLM ########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872, mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM", architecture="Qwen2ForCausalLM",
pooling_type="LAST",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
########## ModernBertModel ########## ModernBertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-modernbert-base", "Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353, mteb_score=0.748193353,
architecture="ModernBertModel", architecture="ModernBertModel",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
########## Qwen3ForCausalLM ########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"Qwen/Qwen3-Embedding-0.6B", "Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695, mteb_score=0.771163695,
architecture="Qwen3ForCausalLM", architecture="Qwen3ForCausalLM",
dtype="float32", dtype="float32",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True, enable_test=True,
), ),
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"Qwen/Qwen3-Embedding-4B", "Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM", architecture="Qwen3ForCausalLM",
dtype="float32", dtype="float32",
@ -93,18 +105,26 @@ MODELS = [
] ]
RERANK_MODELS = [ RERANK_MODELS = [
CLSPoolingRerankModelInfo( RerankModelInfo(
# classifier_pooling: mean # classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base", "Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386, mteb_score=0.33386,
architecture="ModernBertForSequenceClassification", architecture="ModernBertForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingRerankModelInfo( RerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base", "Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062, mteb_score=0.33062,
architecture="GteNewForSequenceClassification", architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]}, hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
] ]

View File

@ -3,40 +3,44 @@
import pytest import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
MODELS = [ MODELS = [
########## BertModel ########## BertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/e5-small", "intfloat/e5-small",
architecture="BertModel", architecture="BertModel",
mteb_score=0.742285423, mteb_score=0.742285423,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
"intfloat/e5-base", architecture="BertModel", enable_test=False EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
), EmbedModelInfo(
CLSPoolingEmbedModelInfo(
"intfloat/e5-large", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
), ),
########## XLMRobertaModel ########## XLMRobertaModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/multilingual-e5-base", "intfloat/multilingual-e5-base",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
mteb_score=0.779325955, mteb_score=0.779325955,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/multilingual-e5-large", "intfloat/multilingual-e5-large",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/multilingual-e5-large-instruct", "intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
enable_test=False, enable_test=False,

View File

@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import (
matryoshka_fy, matryoshka_fy,
) )
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo, EmbedModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from vllm import PoolingParams from vllm import PoolingParams
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
EMBEDDING_MODELS = [ EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"jinaai/jina-embeddings-v3", "jinaai/jina-embeddings-v3",
mteb_score=0.824413164, mteb_score=0.824413164,
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
is_matryoshka=True, is_matryoshka=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
dtype="float32", dtype="float32",
) )
] ]
RERANK_MODELS = [ RERANK_MODELS = [
CLSPoolingRerankModelInfo( RerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual", "jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643, mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
) )
] ]

View File

@ -6,9 +6,9 @@ import pytest
import torch import torch
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo from tests.models.utils import RerankModelInfo
from .mteb_utils import mteb_test_rerank_models from .mteb_score_utils import mteb_test_rerank_models
mxbai_rerank_hf_overrides = { mxbai_rerank_hf_overrides = {
"architectures": ["Qwen2ForSequenceClassification"], "architectures": ["Qwen2ForSequenceClassification"],
@ -17,14 +17,18 @@ mxbai_rerank_hf_overrides = {
} }
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"mixedbread-ai/mxbai-rerank-base-v2", "mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification", architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides, hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273, mteb_score=0.273,
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True, enable_test=True,
), ),
LASTPoolingRerankModelInfo( RerankModelInfo(
"mixedbread-ai/mxbai-rerank-large-v2", "mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification", architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides, hf_overrides=mxbai_rerank_hf_overrides,

View File

@ -3,29 +3,39 @@
import pytest import pytest
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
mteb_test_embed_models,
)
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
mteb_test_rerank_models,
)
from tests.models.utils import ( from tests.models.utils import (
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [ EMBEDDING_MODELS = [
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"nvidia/llama-nemotron-embed-1b-v2", "nvidia/llama-nemotron-embed-1b-v2",
architecture="LlamaBidirectionalModel", architecture="LlamaBidirectionalModel",
mteb_score=0.689164662128673, mteb_score=0.689164662128673,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
) )
] ]
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"nvidia/llama-nemotron-rerank-1b-v2", "nvidia/llama-nemotron-rerank-1b-v2",
architecture="LlamaBidirectionalForSequenceClassification", architecture="LlamaBidirectionalForSequenceClassification",
chat_template_name="nemotron-rerank.jinja", chat_template_name="nemotron-rerank.jinja",
mteb_score=0.33994, mteb_score=0.33994,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
), ),
] ]

View File

@ -4,30 +4,38 @@
import pytest import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
MODELS = [ MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1", "nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel", architecture="NomicBertModel",
mteb_score=0.737568559, mteb_score=0.737568559,
enable_test=True, enable_test=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1.5", "nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel", architecture="NomicBertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/nomic-embed-text-v2-moe", "nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel", architecture="NomicBertModel",
mteb_score=0.715488912, mteb_score=0.715488912,
enable_test=True, enable_test=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
), ),
] ]

View File

@ -6,10 +6,10 @@ import pytest
import torch import torch
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo from tests.models.utils import RerankModelInfo
from tests.utils import multi_gpu_test from tests.utils import multi_gpu_test
from .mteb_utils import mteb_test_rerank_models from .mteb_score_utils import mteb_test_rerank_models
qwen3_reranker_hf_overrides = { qwen3_reranker_hf_overrides = {
"architectures": ["Qwen3ForSequenceClassification"], "architectures": ["Qwen3ForSequenceClassification"],
@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = {
} }
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"Qwen/Qwen3-Reranker-0.6B", "Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736, mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides, hf_overrides=qwen3_reranker_hf_overrides,
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True, enable_test=True,
), ),
LASTPoolingRerankModelInfo( RerankModelInfo(
"Qwen/Qwen3-Reranker-4B", "Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides, hf_overrides=qwen3_reranker_hf_overrides,

View File

@ -4,62 +4,82 @@
import pytest import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
MODELS = [ MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-xs", "Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
mteb_score=0.714927797, mteb_score=0.714927797,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-s", "Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m", "Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-long", "Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False, is_matryoshka=False,
architecture="NomicBertModel", architecture="NomicBertModel",
mteb_score=0.681146831, mteb_score=0.681146831,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l", "Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v1.5", "Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True, is_matryoshka=True,
architecture="BertModel", architecture="BertModel",
mteb_score=0.649088363, mteb_score=0.649088363,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l-v2.0", "Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True, is_matryoshka=True,
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
mteb_score=0.712258299, mteb_score=0.712258299,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v2.0", "Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True, is_matryoshka=True,
architecture="GteModel", architecture="GteModel",
mteb_score=0.706622444, mteb_score=0.706622444,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
] ]

View File

@ -3,25 +3,31 @@
import pytest import pytest
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
) )
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
# ST models with projector (Dense) layers # ST models with projector (Dense) layers
ST_PROJECTOR_MODELS = [ ST_PROJECTOR_MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"TencentBAC/Conan-embedding-v1", "TencentBAC/Conan-embedding-v1",
architecture="BertModel", architecture="BertModel",
mteb_score=0.688611955, mteb_score=0.688611955,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"google/embeddinggemma-300m", "google/embeddinggemma-300m",
architecture="Gemma3TextModel", architecture="Gemma3TextModel",
mteb_score=0.7473819294684156, mteb_score=0.7473819294684156,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
dtype="float32", dtype="float32",
), ),

View File

@ -267,7 +267,7 @@ def run_embedding_input_test(
"""Inference result should be the same between """Inference result should be the same between
original image/video input and image/video embeddings input. original image/video input and image/video embeddings input.
""" """
from transformers import AutoProcessor # noqa: F401 from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained(model) processor = AutoProcessor.from_pretrained(model)

View File

@ -10,7 +10,7 @@ import torch
import torch.nn.functional as F import torch.nn.functional as F
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config.model import ModelConfig, ModelDType, RunnerOption from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
from vllm.multimodal.processing import InputProcessingContext from vllm.multimodal.processing import InputProcessingContext
from vllm.tokenizers import cached_tokenizer_from_config from vllm.tokenizers import cached_tokenizer_from_config
@ -375,7 +375,10 @@ class ModelInfo:
max_model_len: int | None = None max_model_len: int | None = None
hf_dtype: str = "float32" hf_dtype: str = "float32"
hf_overrides: dict[str, Any] | None = None hf_overrides: dict[str, Any] | None = None
default_pooling_type: str = "" pooling_type: str | None = None
attn_type: AttnTypeStr | None = None
is_prefix_caching_supported: bool | None = None
is_chunked_prefill_supported: bool | None = None
enable_test: bool = True enable_test: bool = True
@ -386,32 +389,12 @@ class EmbedModelInfo(ModelInfo):
matryoshka_dimensions: list[int] | None = None matryoshka_dimensions: list[int] | None = None
@dataclass
class CLSPoolingEmbedModelInfo(EmbedModelInfo):
default_pooling_type: str = "CLS"
@dataclass
class LASTPoolingEmbedModelInfo(EmbedModelInfo):
default_pooling_type: str = "LAST"
@dataclass @dataclass
class RerankModelInfo(ModelInfo): class RerankModelInfo(ModelInfo):
mteb_score: float | None = None mteb_score: float | None = None
chat_template_name: str | None = None chat_template_name: str | None = None
@dataclass
class CLSPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "CLS"
@dataclass
class LASTPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "LAST"
@dataclass @dataclass
class GenerateModelInfo(ModelInfo): class GenerateModelInfo(ModelInfo):
hf_dtype: str = "auto" hf_dtype: str = "auto"

View File

@ -145,7 +145,7 @@ def test_shared_storage_connector_hashes(tmp_path):
# don't put this import at the top level # don't put this import at the top level
# it will call torch.cuda.device_count() # it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401 from transformers import AutoProcessor
# Create processor to handle the chat prompt # Create processor to handle the chat prompt
processor = AutoProcessor.from_pretrained(MODEL_NAME) processor = AutoProcessor.from_pretrained(MODEL_NAME)

View File

@ -164,7 +164,7 @@ class ModelConfig:
"""The specific revision to use for the tokenizer on the Hugging Face Hub. """The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version.""" use the default version."""
max_model_len: int = Field(default=None, gt=0) max_model_len: int = Field(default=None, ge=-1)
"""Model context length (prompt and output). If unspecified, will be """Model context length (prompt and output). If unspecified, will be
automatically derived from the model config. automatically derived from the model config.
@ -595,7 +595,7 @@ class ModelConfig:
# Avoid running try_verify_and_update_config multiple times # Avoid running try_verify_and_update_config multiple times
self.config_updated = False self.config_updated = False
self._try_verify_and_update_model_config()
self._verify_quantization() self._verify_quantization()
self._verify_cuda_graph() self._verify_cuda_graph()
self._verify_bnb_config() self._verify_bnb_config()
@ -1008,6 +1008,23 @@ class ModelConfig:
"when expert parallelism is enabled." "when expert parallelism is enabled."
) )
def _try_verify_and_update_model_config(self):
# Avoid running try_verify_and_update_config multiple times
if getattr(self, "config_updated", False):
return
architecture = self.architecture
if architecture is None:
return
from vllm.model_executor.models.config import (
MODELS_CONFIG_MAP,
)
cls = MODELS_CONFIG_MAP.get(architecture, None)
if cls is not None:
cls.verify_and_update_model_config(self)
def verify_dual_chunk_attention_config( def verify_dual_chunk_attention_config(
self, self,
load_config: LoadConfig, load_config: LoadConfig,

View File

@ -81,10 +81,7 @@ class ECExampleConnector(ECConnectorBase):
assert encoder_cache is not None assert encoder_cache is not None
if metadata is None: if metadata is None:
logger.warning( logger.warning(
( "In connector.start_load_caches, but the connector metadata is None"
"In connector.start_load_caches, ",
"but the connector metadata is None",
)
) )
return return
# Load the EC for each mm data # Load the EC for each mm data

View File

@ -297,16 +297,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
elif contains_type(type_hints, set): elif contains_type(type_hints, set):
kwargs[name].update(collection_to_kwargs(type_hints, set)) kwargs[name].update(collection_to_kwargs(type_hints, set))
elif contains_type(type_hints, int): elif contains_type(type_hints, int):
kwargs[name]["type"] = int if name == "max_model_len":
# Special case for large integers kwargs[name]["type"] = human_readable_int_or_auto
human_readable_ints = { kwargs[name]["help"] += f"\n\n{human_readable_int_or_auto.__doc__}"
"max_model_len", elif name in ("max_num_batched_tokens", "kv_cache_memory_bytes"):
"max_num_batched_tokens",
"kv_cache_memory_bytes",
}
if name in human_readable_ints:
kwargs[name]["type"] = human_readable_int kwargs[name]["type"] = human_readable_int
kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}" kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
else:
kwargs[name]["type"] = int
elif contains_type(type_hints, float): elif contains_type(type_hints, float):
kwargs[name]["type"] = float kwargs[name]["type"] = float
elif contains_type(type_hints, dict) and ( elif contains_type(type_hints, dict) and (
@ -2042,23 +2040,17 @@ def _raise_unsupported_error(feature_name: str):
raise NotImplementedError(msg) raise NotImplementedError(msg)
def human_readable_int(value): def human_readable_int(value: str) -> int:
"""Parse human-readable integers like '1k', '2M', etc. """Parse human-readable integers like '1k', '2M', etc.
Including decimal values with decimal multipliers. Including decimal values with decimal multipliers.
Also accepts -1 or 'auto' as a special value for auto-detection.
Examples: Examples:
- '1k' -> 1,000 - '1k' -> 1,000
- '1K' -> 1,024 - '1K' -> 1,024
- '25.6k' -> 25,600 - '25.6k' -> 25,600
- '-1' or 'auto' -> -1 (special value for auto-detection)
""" """
value = value.strip() value = value.strip()
# Handle -1 or 'auto' as a special value for auto-detection
if value == "-1" or value.lower() == "auto":
return -1
match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value) match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
if match: if match:
decimal_multiplier = { decimal_multiplier = {
@ -2092,3 +2084,22 @@ def human_readable_int(value):
# Regular plain number. # Regular plain number.
return int(value) return int(value)
def human_readable_int_or_auto(value: str) -> int:
    """Parse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.
    Also accepts -1 or 'auto' as a special value for auto-detection.
    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    - '-1' or 'auto' -> -1 (special value for auto-detection)
    """
    # The docstring above is appended to the CLI help text through
    # ``__doc__``, so its wording is user-facing.
    text = value.strip()
    # "auto" is matched case-insensitively; "-1" must match exactly.
    wants_auto = text.lower() == "auto" or text == "-1"
    # Everything else is delegated to the plain human-readable parser.
    return -1 if wants_auto else human_readable_int(text)

View File

@ -32,6 +32,7 @@ class BenchmarkSubcommand(CLISubcommand):
) -> FlexibleArgumentParser: ) -> FlexibleArgumentParser:
bench_parser = subparsers.add_parser( bench_parser = subparsers.add_parser(
self.name, self.name,
help=self.help,
description=self.help, description=self.help,
usage=f"vllm {self.name} <bench_type> [options]", usage=f"vllm {self.name} <bench_type> [options]",
) )

View File

@ -66,7 +66,11 @@ class ServeSubcommand(CLISubcommand):
self, subparsers: argparse._SubParsersAction self, subparsers: argparse._SubParsersAction
) -> FlexibleArgumentParser: ) -> FlexibleArgumentParser:
serve_parser = subparsers.add_parser( serve_parser = subparsers.add_parser(
self.name, description=DESCRIPTION, usage="vllm serve [model_tag] [options]" self.name,
help="Launch a local OpenAI-compatible API server to serve LLM "
"completions via HTTP.",
description=DESCRIPTION,
usage="vllm serve [model_tag] [options]",
) )
serve_parser = make_arg_parser(serve_parser) serve_parser = make_arg_parser(serve_parser)

View File

@ -43,7 +43,7 @@ async def scale_elastic_ep(raw_request: Request):
try: try:
body = await raw_request.json() body = await raw_request.json()
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
raise HTTPException(status_code=400, detail="Invalid JSON format") from e # noqa: B904 raise HTTPException(status_code=400, detail="Invalid JSON format") from e
new_data_parallel_size = body.get("new_data_parallel_size") new_data_parallel_size = body.get("new_data_parallel_size")
drain_timeout = body.get("drain_timeout", 120) # Default 2 minutes drain_timeout = body.get("drain_timeout", 120) # Default 2 minutes

View File

@ -625,8 +625,9 @@ def silu_mul_per_token_group_quant_fp8_colmajor(
M, N = input.size() M, N = input.size()
N_2 = N // 2 N_2 = N // 2
fp8_dtype = current_platform.fp8_dtype()
if output is None: if output is None:
output = torch.empty((M, N_2), dtype=torch.float8_e4m3fn, device=input.device) output = torch.empty((M, N_2), dtype=fp8_dtype, device=input.device)
output_scales = torch.empty( output_scales = torch.empty(
((N_2 // GROUP_SIZE), M), dtype=torch.float32, device=input.device ((N_2 // GROUP_SIZE), M), dtype=torch.float32, device=input.device
@ -637,9 +638,12 @@ def silu_mul_per_token_group_quant_fp8_colmajor(
assert M % BLOCK_M == 0 assert M % BLOCK_M == 0
assert N_2 % BLOCK_N == 0 assert N_2 % BLOCK_N == 0
finfo = torch.finfo(torch.float8_e4m3fn) # Using the default value (240.0) from pytorch will cause accuracy
fp8_min = finfo.min # issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm
fp8_max = finfo.max # platforms that use the torch.float8_e4m3fnuz dtype.
finfo = torch.finfo(fp8_dtype)
fp8_min = -224.0 if current_platform.is_fp8_fnuz() else finfo.min
fp8_max = 224.0 if current_platform.is_fp8_fnuz() else finfo.max
# Force even division so we can avoid edgecases within the kernel. # Force even division so we can avoid edgecases within the kernel.
assert M % BLOCK_M == 0 assert M % BLOCK_M == 0

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: SIM117
import fnmatch import fnmatch
import glob import glob
import itertools import itertools
@ -59,7 +58,7 @@ def is_moe_model(model: torch.nn.Module) -> bool:
class BitsAndBytesModelLoader(BaseModelLoader): class BitsAndBytesModelLoader(BaseModelLoader):
"""Model loader to load model weights with BitAndBytes quantization.""" """Model loader to load model weights with BitsAndBytes quantization."""
possible_config_file_names = ["adapter_config.json"] possible_config_file_names = ["adapter_config.json"]

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: SIM117
import os import os
from collections.abc import Generator from collections.abc import Generator

View File

@ -13,7 +13,7 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import VllmConfig from vllm.config import ModelConfig, VllmConfig
logger = init_logger(__name__) logger = init_logger(__name__)
@ -21,20 +21,24 @@ logger = init_logger(__name__)
class VerifyAndUpdateConfig: class VerifyAndUpdateConfig:
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_config(vllm_config: "VllmConfig") -> None:
raise NotImplementedError return
class Gemma3TextModelConfig:
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
hf_config = vllm_config.model_config.hf_config return
class Gemma3TextModelConfig(VerifyAndUpdateConfig):
@staticmethod
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
hf_config = model_config.hf_config
hf_config.is_causal = not hf_config.use_bidirectional_attention hf_config.is_causal = not hf_config.use_bidirectional_attention
class GteNewModelConfig(VerifyAndUpdateConfig): class GteNewModelConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
config = vllm_config.model_config.hf_config config = model_config.hf_config
assert config.__class__.__name__ == "NewConfig" assert config.__class__.__name__ == "NewConfig"
assert config.hidden_act == "gelu" assert config.hidden_act == "gelu"
@ -53,16 +57,15 @@ class GteNewModelConfig(VerifyAndUpdateConfig):
class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig): class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
pooler_config = vllm_config.model_config.pooler_config pooler_config = model_config.pooler_config
if pooler_config.use_activation is None: if pooler_config.use_activation is None:
pooler_config.use_activation = False pooler_config.use_activation = False
class JinaRobertaModelConfig(VerifyAndUpdateConfig): class JinaRobertaModelConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
model_config = vllm_config.model_config
config = model_config.hf_config config = model_config.hf_config
if config.position_embedding_type == "rotary": if config.position_embedding_type == "rotary":
@ -90,10 +93,10 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
class LlamaBidirectionalConfig(VerifyAndUpdateConfig): class LlamaBidirectionalConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
from vllm.config.pooler import PoolingTypeStr from vllm.config.pooler import PoolingTypeStr
hf_config = vllm_config.model_config.hf_config hf_config = model_config.hf_config
hf_config.is_causal = False hf_config.is_causal = False
pooling_type_map: dict[str, PoolingTypeStr] = { pooling_type_map: dict[str, PoolingTypeStr] = {
@ -105,7 +108,7 @@ class LlamaBidirectionalConfig(VerifyAndUpdateConfig):
pooling_type = pooling_type_map.get(hf_config.pooling, None) pooling_type = pooling_type_map.get(hf_config.pooling, None)
if pooling_type is None: if pooling_type is None:
raise ValueError(f"pool_type {hf_config.pooling} not supported") raise ValueError(f"pool_type {hf_config.pooling} not supported")
vllm_config.model_config.pooler_config.pooling_type = pooling_type model_config.pooler_config.pooling_type = pooling_type
class NomicBertModelConfig(VerifyAndUpdateConfig): class NomicBertModelConfig(VerifyAndUpdateConfig):
@ -204,8 +207,8 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig): class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
pooler_config = vllm_config.model_config.pooler_config pooler_config = model_config.pooler_config
if pooler_config.step_tag_id is None: if pooler_config.step_tag_id is None:
pooler_config.step_tag_id = 151651 pooler_config.step_tag_id = 151651
@ -213,8 +216,8 @@ class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig): class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
pooler_config = vllm_config.model_config.pooler_config pooler_config = model_config.pooler_config
if pooler_config.softmax is None: if pooler_config.softmax is None:
pooler_config.softmax = False pooler_config.softmax = False
@ -222,8 +225,8 @@ class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):
class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig): class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
config = vllm_config.model_config.hf_config config = model_config.hf_config
is_original_qwen3_reranker = getattr( is_original_qwen3_reranker = getattr(
config, "is_original_qwen3_reranker", False config, "is_original_qwen3_reranker", False
@ -237,23 +240,23 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
"Try loading the original Qwen3 Reranker?, see: " "Try loading the original Qwen3 Reranker?, see: "
"https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py" "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py"
) )
vllm_config.model_config.hf_config.method = "from_2_way_softmax" model_config.hf_config.method = "from_2_way_softmax"
class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig): class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
config = vllm_config.model_config.hf_config config = model_config.hf_config
config.num_labels = 1 config.num_labels = 1
pooler_config = vllm_config.model_config.pooler_config pooler_config = model_config.pooler_config
if pooler_config.logit_bias is None: if pooler_config.logit_bias is None:
pooler_config.logit_bias = 2.65 pooler_config.logit_bias = 2.65
class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
config = vllm_config.model_config.hf_config config = model_config.hf_config
assert config.__class__.__name__ == "GteConfig" assert config.__class__.__name__ == "GteConfig"
assert config.hidden_act == "gelu" assert config.hidden_act == "gelu"

View File

@ -64,7 +64,6 @@ from .interfaces import (
SupportsLoRA, SupportsLoRA,
SupportsPP, SupportsPP,
) )
from .interfaces_base import attn_type
from .utils import ( from .utils import (
AutoWeightsLoader, AutoWeightsLoader,
PPMissingLayer, PPMissingLayer,
@ -707,14 +706,12 @@ class LlamaForCausalLM(
return name, loaded_weight return name, loaded_weight
@attn_type("encoder_only")
class LlamaBidirectionalForSequenceClassification(as_seq_cls_model(LlamaForCausalLM)): class LlamaBidirectionalForSequenceClassification(as_seq_cls_model(LlamaForCausalLM)):
# This class sets the correct attention type and pooling type # This class sets the correct attention type and pooling type
# through LlamaBidirectionalConfig. # through LlamaBidirectionalConfig.
pass pass
@attn_type("encoder_only")
class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)): class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)):
# This class sets the correct attention type and pooling type # This class sets the correct attention type and pooling type
# through LlamaBidirectionalConfig. # through LlamaBidirectionalConfig.

View File

@ -606,6 +606,43 @@ def get_request_block_hasher(
return request_block_hasher return request_block_hasher
def _check_enough_kv_cache_memory(
available_memory: int,
get_needed_memory: Callable[[], int],
max_model_len: int,
estimate_max_model_len: Callable[[int], int],
):
if available_memory <= 0:
raise ValueError(
"No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when initializing the engine. "
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
"for more details."
)
needed_memory = get_needed_memory()
if needed_memory > available_memory:
estimated_max_len = estimate_max_model_len(available_memory)
estimated_msg = ""
if estimated_max_len > 0:
estimated_msg = (
"Based on the available memory, "
f"the estimated maximum model length is {estimated_max_len}. "
)
raise ValueError(
f"To serve at least one request with the models's max seq len "
f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
f"cache is needed, which is larger than the available KV cache "
f"memory ({available_memory / GiB_bytes:.2f} GiB). {estimated_msg}"
f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
f"when initializing the engine. "
f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
f"for more details."
)
def max_memory_usage_bytes( def max_memory_usage_bytes(
vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec] vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec]
) -> int: ) -> int:
@ -688,43 +725,12 @@ def check_enough_kv_cache_memory(
""" """
# No need to check for available memory if the kv_cache_spec is empty # No need to check for available memory if the kv_cache_spec is empty
if not kv_cache_spec: if kv_cache_spec:
return _check_enough_kv_cache_memory(
available_memory,
if available_memory <= 0: lambda: max_memory_usage_bytes(vllm_config, kv_cache_spec.values()),
raise ValueError( vllm_config.model_config.max_model_len,
"No available memory for the cache blocks. " lambda am: estimate_max_model_len(vllm_config, kv_cache_spec, am),
"Try increasing `gpu_memory_utilization` when "
"initializing the engine. "
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
"for more details."
)
max_model_len = vllm_config.model_config.max_model_len
needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
if needed_memory > available_memory:
# Estimate the maximum model length that can fit in the available memory
estimated_max_len = estimate_max_model_len(
vllm_config, kv_cache_spec, available_memory
)
estimated_msg = ""
if estimated_max_len > 0:
estimated_msg = (
"Based on the available memory, "
f"the estimated maximum model length is {estimated_max_len}."
)
raise ValueError(
f"To serve at least one request with the models's max seq len "
f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
f"cache is needed, which is larger than the available KV cache "
f"memory ({available_memory / GiB_bytes:.2f} GiB). "
f"{estimated_msg} "
f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
f"when initializing the engine. "
f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
f"for more details."
) )
@ -1586,36 +1592,16 @@ def get_kv_cache_configs(
# Check if the available memory is enough (using min across all workers). # Check if the available memory is enough (using min across all workers).
# We use the global groups to correctly account for padding. # We use the global groups to correctly account for padding.
if global_kv_cache_groups: if global_kv_cache_groups:
min_available_memory = min(available_memory) _check_enough_kv_cache_memory(
if min_available_memory <= 0: min(available_memory),
raise ValueError( lambda: _max_memory_usage_bytes_from_groups(
"No available memory for the cache blocks. " vllm_config, global_kv_cache_groups
"Try increasing `gpu_memory_utilization` when " ),
"initializing the engine." vllm_config.model_config.max_model_len,
) lambda am: _estimate_max_model_len_from_groups(
max_model_len = vllm_config.model_config.max_model_len vllm_config, global_kv_cache_groups, am
needed_memory = _max_memory_usage_bytes_from_groups( ),
vllm_config, global_kv_cache_groups
) )
if needed_memory > min_available_memory:
estimated_max_len = _estimate_max_model_len_from_groups(
vllm_config, global_kv_cache_groups, min_available_memory
)
estimated_msg = ""
if estimated_max_len > 0:
estimated_msg = (
f"Based on the available memory, the estimated maximum "
f"model length is {estimated_max_len}. "
)
raise ValueError(
f"To serve at least one request with the models's max seq len "
f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
f"cache is needed, which is larger than the available KV cache "
f"memory ({min_available_memory / GiB_bytes:.2f} GiB). "
f"{estimated_msg}"
f"Try increasing `gpu_memory_utilization` or decreasing "
f"`max_model_len` when initializing the engine."
)
kv_cache_configs: list[KVCacheConfig] = [] kv_cache_configs: list[KVCacheConfig] = []
for kv_cache_spec_one_worker, available_memory_one_worker in zip( for kv_cache_spec_one_worker, available_memory_one_worker in zip(

View File

@ -6,9 +6,7 @@ Define EC connector functionality mixin for model runners.
from collections.abc import Generator from collections.abc import Generator
from contextlib import AbstractContextManager, contextmanager, nullcontext from contextlib import AbstractContextManager, contextmanager, nullcontext
from typing import ( from typing import TYPE_CHECKING
TYPE_CHECKING, # noqa: UP035
)
import torch import torch

View File

@ -7,9 +7,7 @@ Define KV connector functionality mixin for model runners.
import copy import copy
from collections.abc import Generator from collections.abc import Generator
from contextlib import AbstractContextManager, contextmanager, nullcontext from contextlib import AbstractContextManager, contextmanager, nullcontext
from typing import ( from typing import TYPE_CHECKING
TYPE_CHECKING, # noqa: UP035
)
import torch import torch