Merge branch 'main' into feat/vanisimov/kv_cache_groups_optimization

2026-05-05 02:17:54 +08:00 · 2025-12-24 18:20:24 +01:00 · 2025-12-24 18:20:24 +01:00 · 437ac4e047
commit 437ac4e047
parent 2c92ed30cd 5d93089686
58 changed files with 690 additions and 519 deletions
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.9
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@ -24,6 +24,8 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
 #ifndef VLLM_NUMA_DISABLED
 std::string init_cpu_threads_env(const std::string& cpu_ids) {
  bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
+  TORCH_CHECK(omp_cpu_mask != nullptr,
+              "Failed to parse CPU string: " + cpu_ids);
  TORCH_CHECK(omp_cpu_mask->size > 0);
  std::vector<int> omp_cpu_ids;
  omp_cpu_ids.reserve(omp_cpu_mask->size);
@ -44,20 +46,12 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {

  // Memory node binding
  if (numa_available() != -1) {
-    int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
    std::set<int> node_ids;
    for (const auto& cpu_id : omp_cpu_ids) {
      int node_id = numa_node_of_cpu(cpu_id);
      if (node_id != -1) {
        node_ids.insert(node_id);
      }
-      if (node_id != mem_node_id) {
-        TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
-                   omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
-                   ". All CPUs should be on the same NUMA node for optimal "
-                   "performance. Memory will be bound to NUMA node ",
-                   mem_node_id, ".");
-      }
    }
    // Concatenate all node_ids into a single comma-separated string
    if (!node_ids.empty()) {
@ -70,7 +64,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
      }

      bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
-      bitmask* src_mask = numa_get_membind();
+      bitmask* src_mask = numa_get_mems_allowed();

      int pid = getpid();

@ -83,15 +77,46 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
                     std::to_string(errno));
        }

-        // restrict memory allocation node.
-        numa_set_membind(mask);
+        // Restrict memory allocation to the selected NUMA node(s).
+        // Enhances memory locality for the threads bound to those NUMA CPUs.
+        if (node_ids.size() > 1) {
+          errno = 0;
+          numa_set_interleave_mask(mask);
+          if (errno != 0) {
+            TORCH_WARN("numa_set_interleave_mask failed. errno: " +
+                       std::to_string(errno));
+          } else {
+            TORCH_WARN(
+                "NUMA binding: Using INTERLEAVE policy for memory "
+                "allocation across multiple NUMA nodes (nodes: " +
+                node_ids_str +
+                "). Memory allocations will be "
+                "interleaved across the specified NUMA nodes.");
+          }
+        } else {
+          errno = 0;
+          numa_set_membind(mask);
+          if (errno != 0) {
+            TORCH_WARN("numa_set_membind failed. errno: " +
+                       std::to_string(errno));
+          } else {
+            TORCH_WARN(
+                "NUMA binding: Using MEMBIND policy for memory "
+                "allocation on the NUMA nodes (" +
+                node_ids_str +
+                "). Memory allocations will be "
+                "strictly bound to these NUMA nodes.");
+          }
+        }
+
        numa_set_strict(1);

        numa_free_nodemask(mask);
        numa_free_nodemask(src_mask);
      } else {
-        TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " +
-                   std::to_string(errno));
+        TORCH_WARN(
+            "numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
+            std::to_string(errno));
      }
    }
  }
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio
 Install `vllm` and `lm-evaluation-harness` for evaluation:

 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```

 Load and run the model in `vllm`:
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@ -18,7 +18,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:

 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```

 ## Quantization Process
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@ -23,7 +23,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:

 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```

 ## Quantization Process
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@ -20,7 +20,7 @@ for more installation details.
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:

 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```

 ## Quantization Process
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.5 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
+lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
 transformers==4.57.3
 tokenizers==0.22.0
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@ -58,7 +58,7 @@ schemathesis==3.39.15
    # OpenAI schema test

 # Evaluation and benchmarking
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
+lm-eval[api]>=0.4.9.2
 jiwer==4.0.0

 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
--- a/requirements/test.in
+++ b/requirements/test.in
@ -34,8 +34,7 @@ num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-# TODO: Use lm-eval[api]==0.4.10 once released
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
+lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
 transformers==4.57.3
 tokenizers==0.22.0
--- a/requirements/test.txt
+++ b/requirements/test.txt
@ -441,7 +441,7 @@ lightning-utilities==0.14.3
    #   torchmetrics
 llvmlite==0.44.0
    # via numba
-lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
+lm-eval==0.4.9.2
    # via -r requirements/test.in
 lxml==5.3.0
    # via
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -410,7 +410,7 @@ class HfRunner:

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
-        from transformers import AutoProcessor  # noqa: F401
+        from transformers import AutoProcessor

        self.processor = AutoProcessor.from_pretrained(
            model_name,
--- a/tests/entrypoints/openai/test_async_tokenization.py
+++ b/tests/entrypoints/openai/test_async_tokenization.py
@ -15,7 +15,7 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"


@pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@ -28,7 +28,7 @@ def zephyr_lora_files():


@pytest.fixture(scope="module")
-def server(zephyr_lora_files):  # noqa: F811
+def server(zephyr_lora_files):
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@ -12,7 +12,7 @@ MODEL_NAME = "Qwen/QwQ-32B"


@pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
    args = [
        "--max-model-len",
        "8192",
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@ -125,7 +125,7 @@ messages = [


@pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
@ -212,7 +212,7 @@ async def test_function_tool_use(


@pytest.fixture(scope="module")
-def k2_server():  # noqa: F811
+def k2_server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
--- a/tests/entrypoints/openai/test_default_mm_loras.py
+++ b/tests/entrypoints/openai/test_default_mm_loras.py
@ -23,7 +23,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original


@pytest.fixture(scope="module")
-def multimodal_server():  # noqa: F811
+def multimodal_server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
--- a/tests/entrypoints/openai/test_enable_force_include_usage.py
+++ b/tests/entrypoints/openai/test_enable_force_include_usage.py
@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer


@pytest.fixture(scope="module")
-def chat_server_with_force_include_usage(request):  # noqa: F811
+def chat_server_with_force_include_usage(request):
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
--- a/tests/entrypoints/openai/test_messages.py
+++ b/tests/entrypoints/openai/test_messages.py
@ -11,7 +11,7 @@ MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
    args = [
        "--max-model-len",
        "2048",
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@ -37,7 +37,7 @@ def default_server_args(qwen3_lora_files):


@pytest.fixture(scope="module")
-def server_fixture(request, default_server_args):  # noqa: F811
+def server_fixture(request, default_server_args):
    use_server_flag = request.param
    if use_server_flag:
        args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
--- a/tests/entrypoints/pooling/embed/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/embed/test_correctness_mteb.py
@ -4,7 +4,7 @@ import os

 import pytest

-from tests.models.language.pooling_mteb_test.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
    MTEB_EMBED_TASKS,
    MTEB_EMBED_TOL,
    OpenAIClientMtebEncoder,
--- a/tests/entrypoints/pooling/score/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/score/test_correctness_mteb.py
@ -4,7 +4,7 @@ import os

 import pytest

-from tests.models.language.pooling_mteb_test.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_score_utils import (
    MTEB_RERANK_LANGS,
    MTEB_RERANK_TASKS,
    MTEB_RERANK_TOL,
--- a/tests/entrypoints/pooling/score/test_utils.py
+++ b/tests/entrypoints/pooling/score/test_utils.py
@ -202,11 +202,10 @@ class TestGetScorePrompt:
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
-        # FIXME: Models implementing SupportsScoreTemplate must use their custom
-        # template implementation by default to preserve existing functionality.
-        # Attempting to use tokenizer_config.json templates would most likely break
-        # these models, as often they just inherit the template from the original LLM.
-        # CLI --chat-template overrides are still supported.
+        # FIXME: For now, we only apply a template when one is explicitly provided.
+        # We cannot rely on the tokenizer's chat template because many models
+        # inherit junk templates from their base LLM, which breaks both the models
+        # and the tests that use them.
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
--- a/tests/models/language/pooling_mteb_test/mteb_embed_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py
@ -0,0 +1,228 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import mteb
+import numpy as np
+import torch
+from mteb.models import ModelMeta
+from mteb.types import Array
+from torch.utils.data import DataLoader
+
+import tests.ci_envs as ci_envs
+from tests.models.utils import (
+    EmbedModelInfo,
+    check_embeddings_close,
+    get_vllm_extra_kwargs,
+)
+
+# Most embedding models on the STS12 task (See #17175):
+# - Model implementation and minor changes in tensor dtype
+#   results in differences less than 1e-4
+# - Different model results in differences more than 1e-3
+# 1e-4 is a good tolerance threshold
+MTEB_EMBED_TASKS = ["STS12"]
+MTEB_EMBED_TOL = 1e-4
+
+
+_empty_model_meta = ModelMeta(
+    loader=None,
+    name="vllm/model",
+    revision="1",
+    release_date=None,
+    languages=None,
+    framework=[],
+    similarity_fn_name=None,
+    n_parameters=None,
+    memory_usage_mb=None,
+    max_tokens=None,
+    embed_dim=None,
+    license=None,
+    open_weights=None,
+    public_training_code=None,
+    public_training_data=None,
+    use_instructions=None,
+    training_datasets=None,
+    modalities=["text"],  # 'image' can be added to evaluate multimodal models
+)
+
+
+class MtebEmbedMixin(mteb.EncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
+    def similarity(
+        self,
+        embeddings1: np.ndarray,
+        embeddings2: np.ndarray,
+    ) -> np.ndarray:
+        # Cosine similarity
+        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+        sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
+        return sim
+
+    def similarity_pairwise(
+        self,
+        embeddings1: Array,
+        embeddings2: Array,
+    ) -> Array:
+        # Cosine similarity
+        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+        sim = np.sum(embeddings1 * embeddings2, axis=1) / (
+            norm1.flatten() * norm2.flatten()
+        )
+        return sim
+
+
+class VllmMtebEncoder(MtebEmbedMixin):
+    def __init__(self, vllm_model):
+        self.llm = vllm_model
+        self.rng = np.random.default_rng(seed=42)
+
+    def encode(
+        self,
+        inputs: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        # Hoping to discover potential scheduling
+        # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
+        r = self.rng.permutation(len(sentences))
+        sentences = [sentences[i] for i in r]
+        outputs = self.llm.embed(sentences, use_tqdm=False)
+        embeds = np.array(outputs)
+        embeds = embeds[np.argsort(r)]
+        return embeds
+
+
+class OpenAIClientMtebEncoder(MtebEmbedMixin):
+    def __init__(self, model_name: str, client):
+        self.model_name = model_name
+        self.client = client
+        self.rng = np.random.default_rng(seed=42)
+
+    def encode(
+        self,
+        inputs: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        # Hoping to discover potential scheduling
+        # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
+        r = self.rng.permutation(len(sentences))
+        sentences = [sentences[i] for i in r]
+
+        embeddings = self.client.embeddings.create(
+            model=self.model_name, input=sentences
+        )
+        outputs = [d.embedding for d in embeddings.data]
+        embeds = np.array(outputs)
+        embeds = embeds[np.argsort(r)]
+        return embeds
+
+
+def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
+    tasks = mteb.get_tasks(tasks=tasks)
+    results = mteb.evaluate(
+        encoder,
+        tasks,
+        cache=None,
+        show_progress_bar=False,
+    )
+
+    main_score = results[0].scores["test"][0]["main_score"]
+    return main_score
+
+
+def mteb_test_embed_models(
+    hf_runner,
+    vllm_runner,
+    model_info: EmbedModelInfo,
+    vllm_extra_kwargs=None,
+    hf_model_callback=None,
+    atol=MTEB_EMBED_TOL,
+):
+    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
+
+    # Test embed_dims, isnan and whether to use normalize
+    example_prompts = ["The chef prepared a delicious meal." * 1000]
+
+    with vllm_runner(
+        model_info.name,
+        runner="pooling",
+        max_model_len=model_info.max_model_len,
+        **vllm_extra_kwargs,
+    ) as vllm_model:
+        model_config = vllm_model.llm.llm_engine.model_config
+
+        # Confirm whether vllm is using the correct architecture
+        if model_info.architecture:
+            assert model_info.architecture in model_config.architectures
+
+        # Confirm whether the important configs in model_config are correct.
+        if model_info.pooling_type is not None:
+            assert model_config.pooler_config.pooling_type == model_info.pooling_type
+        if model_info.attn_type is not None:
+            assert model_config.attn_type == model_info.attn_type
+        if model_info.is_prefix_caching_supported is not None:
+            assert (
+                model_config.is_prefix_caching_supported
+                == model_info.is_prefix_caching_supported
+            )
+        if model_info.is_chunked_prefill_supported is not None:
+            assert (
+                model_config.is_chunked_prefill_supported
+                == model_info.is_chunked_prefill_supported
+            )
+
+        vllm_main_score = run_mteb_embed_task(
+            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
+        )
+        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
+        head_dtype = model_config.head_dtype
+
+        # Test embedding_size, isnan and whether to use normalize
+        vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
+        outputs_tensor = torch.tensor(vllm_outputs)
+        assert not torch.any(torch.isnan(outputs_tensor))
+        embedding_size = model_config.embedding_size
+        assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
+
+    # Accelerate mteb test by setting
+    # SentenceTransformers mteb score to a constant
+    if model_info.mteb_score is None:
+        with hf_runner(
+            model_info.name,
+            is_sentence_transformer=True,
+            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
+        ) as hf_model:
+            # e.g. setting default parameters for the encode method of hf_runner
+            if hf_model_callback is not None:
+                hf_model_callback(hf_model)
+
+            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
+            st_dtype = next(hf_model.model.parameters()).dtype
+
+            # Check embeddings close to hf outputs
+            hf_outputs = hf_model.encode(example_prompts)
+            check_embeddings_close(
+                embeddings_0_lst=hf_outputs,
+                embeddings_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+                tol=1e-2,
+            )
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
+
+    print("Model:", model_info.name)
+    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
+    print("SentenceTransformers:", st_dtype, st_main_score)
+    print("Difference:", st_main_score - vllm_main_score)
+
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < atol
--- a/tests/models/language/pooling_mteb_test/mteb_score_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py
@ -7,37 +7,24 @@ from pathlib import Path
 import mteb
 import numpy as np
 import requests
-import torch
 from mteb.models import ModelMeta
-from mteb.types import Array
 from torch.utils.data import DataLoader

-import tests.ci_envs as ci_envs
 from tests.models.utils import (
-    EmbedModelInfo,
    RerankModelInfo,
-    check_embeddings_close,
    get_vllm_extra_kwargs,
 )

-template_home = (
-    Path(__file__).parent.parent.parent.parent.parent
-    / "examples/pooling/score/template"
-)
-
-# Most embedding models on the STS12 task (See #17175):
-# - Model implementation and minor changes in tensor dtype
-#   results in differences less than 1e-4
-# - Different model results in differences more than 1e-3
-# 1e-4 is a good tolerance threshold
-MTEB_EMBED_TASKS = ["STS12"]
-MTEB_EMBED_TOL = 1e-4
-
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
 MTEB_RERANK_LANGS = ["eng"]
 MTEB_RERANK_TOL = 2e-3

+template_home = (
+    Path(__file__).parent.parent.parent.parent.parent
+    / "examples/pooling/score/template"
+)
+
 _empty_model_meta = ModelMeta(
    loader=None,
    name="vllm/model",
@ -60,84 +47,11 @@ _empty_model_meta = ModelMeta(
 )


-class VllmMtebEncoder(mteb.EncoderProtocol):
+class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
    mteb_model_meta = _empty_model_meta

-    def __init__(self, vllm_model):
-        self.llm = vllm_model
-        self.rng = np.random.default_rng(seed=42)
-
-    def encode(
-        self,
-        inputs: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        # Hoping to discover potential scheduling
-        # issues by randomizing the order.
-        sentences = [text for batch in inputs for text in batch["text"]]
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-        outputs = self.llm.embed(sentences, use_tqdm=False)
-        embeds = np.array(outputs)
-        embeds = embeds[np.argsort(r)]
-        return embeds
-
-    def similarity(
-        self,
-        embeddings1: np.ndarray,
-        embeddings2: np.ndarray,
-    ) -> np.ndarray:
-        # Cosine similarity
-        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
-        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
-        sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
-        return sim
-
-    def similarity_pairwise(
-        self,
-        embeddings1: Array,
-        embeddings2: Array,
-    ) -> Array:
-        # Cosine similarity
-        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
-        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
-        sim = np.sum(embeddings1 * embeddings2, axis=1) / (
-            norm1.flatten() * norm2.flatten()
-        )
-        return sim
-
-
-class OpenAIClientMtebEncoder(VllmMtebEncoder):
-    def __init__(self, model_name: str, client):
-        self.model_name = model_name
-        self.client = client
-        self.rng = np.random.default_rng(seed=42)
-
-    def encode(
-        self,
-        inputs: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        # Hoping to discover potential scheduling
-        # issues by randomizing the order.
-        sentences = [text for batch in inputs for text in batch["text"]]
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-
-        embeddings = self.client.embeddings.create(
-            model=self.model_name, input=sentences
-        )
-        outputs = [d.embedding for d in embeddings.data]
-        embeds = np.array(outputs)
-        embeds = embeds[np.argsort(r)]
-        return embeds
-
-
-class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
-    mteb_model_meta = _empty_model_meta

+class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
    def __init__(self, vllm_model):
        self.llm = vllm_model
        self.rng = np.random.default_rng(seed=42)
@ -164,7 +78,7 @@ class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
        return scores


-class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
+class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
    mteb_model_meta = _empty_model_meta

    def __init__(self, model_name: str, url):
@ -216,102 +130,6 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
        return response["results"][0]["relevance_score"]


-def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
-    tasks = mteb.get_tasks(tasks=tasks)
-    results = mteb.evaluate(
-        encoder,
-        tasks,
-        cache=None,
-        show_progress_bar=False,
-    )
-
-    main_score = results[0].scores["test"][0]["main_score"]
-    return main_score
-
-
-def mteb_test_embed_models(
-    hf_runner,
-    vllm_runner,
-    model_info: EmbedModelInfo,
-    vllm_extra_kwargs=None,
-    hf_model_callback=None,
-    atol=MTEB_EMBED_TOL,
-):
-    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
-
-    # Test embed_dims, isnan and whether to use normalize
-    example_prompts = ["The chef prepared a delicious meal." * 1000]
-
-    with vllm_runner(
-        model_info.name,
-        runner="pooling",
-        max_model_len=model_info.max_model_len,
-        **vllm_extra_kwargs,
-    ) as vllm_model:
-        model_config = vllm_model.llm.llm_engine.model_config
-
-        # Confirm whether vllm is using the correct architecture
-        if model_info.architecture:
-            assert model_info.architecture in model_config.architectures
-
-        # Confirm whether vllm uses the correct default_pooling_type, which
-        # relates to whether chunked prefill and prefix caching are enabled
-        assert (
-            model_config._model_info.default_pooling_type
-            == model_info.default_pooling_type
-        )
-
-        vllm_main_score = run_mteb_embed_task(
-            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
-        )
-        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
-        head_dtype = model_config.head_dtype
-
-        # Test embedding_size, isnan and whether to use normalize
-        vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
-        outputs_tensor = torch.tensor(vllm_outputs)
-        assert not torch.any(torch.isnan(outputs_tensor))
-        embedding_size = model_config.embedding_size
-        assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
-
-    # Accelerate mteb test by setting
-    # SentenceTransformers mteb score to a constant
-    if model_info.mteb_score is None:
-        with hf_runner(
-            model_info.name,
-            is_sentence_transformer=True,
-            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
-        ) as hf_model:
-            # e.g. setting default parameters for the encode method of hf_runner
-            if hf_model_callback is not None:
-                hf_model_callback(hf_model)
-
-            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
-            st_dtype = next(hf_model.model.parameters()).dtype
-
-            # Check embeddings close to hf outputs
-            hf_outputs = hf_model.encode(example_prompts)
-            check_embeddings_close(
-                embeddings_0_lst=hf_outputs,
-                embeddings_1_lst=vllm_outputs,
-                name_0="hf",
-                name_1="vllm",
-                tol=1e-2,
-            )
-    else:
-        st_main_score = model_info.mteb_score
-        st_dtype = "Constant"
-
-    print("Model:", model_info.name)
-    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
-    print("SentenceTransformers:", st_dtype, st_main_score)
-    print("Difference:", st_main_score - vllm_main_score)
-
-    # We are not concerned that the vllm mteb results are better
-    # than SentenceTransformers, so we only perform one-sided testing.
-    assert st_main_score - vllm_main_score < atol
-
-
 def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
    with tempfile.TemporaryDirectory() as prediction_folder:
        bm25s = mteb.get_model("bm25s")
@ -391,18 +209,28 @@ def mteb_test_rerank_models(
        # Score API is only enabled for num_labels == 1
        assert model_config.hf_config.num_labels == 1

-        # Confirm whether vllm uses the correct default_pooling_type, which
-        # relates to whether chunked prefill and prefix caching are enabled
-        assert (
-            model_config._model_info.default_pooling_type
-            == model_info.default_pooling_type
-        )
-
+        # Maybe load chat_template.
        chat_template: str | None = None
        if model_info.chat_template_name is not None:
            chat_template = (template_home / model_info.chat_template_name).read_text()
        vllm_model.chat_template = chat_template

+        # Confirm whether the important configs in model_config are correct.
+        if model_info.pooling_type is not None:
+            assert model_config.pooler_config.pooling_type == model_info.pooling_type
+        if model_info.attn_type is not None:
+            assert model_config.attn_type == model_info.attn_type
+        if model_info.is_prefix_caching_supported is not None:
+            assert (
+                model_config.is_prefix_caching_supported
+                == model_info.is_prefix_caching_supported
+            )
+        if model_info.is_chunked_prefill_supported is not None:
+            assert (
+                model_config.is_chunked_prefill_supported
+                == model_info.is_chunked_prefill_supported
+            )
+
        vllm_main_score = run_mteb_rerank(
            vllm_mteb_encoder(vllm_model),
            tasks=MTEB_RERANK_TASKS,
--- a/tests/models/language/pooling_mteb_test/test_baai.py
+++ b/tests/models/language/pooling_mteb_test/test_baai.py
@ -4,90 +4,94 @@ import pytest

 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
    EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
    RerankModelInfo,
 )

-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models

 MODELS = [
    ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-base-en",
        architecture="BertModel",
        mteb_score=0.779336792,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-base-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-small-en", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-small-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-large-en", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-large-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
        "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
    ),
    ########## XLMRobertaModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-m3",
        architecture="XLMRobertaModel",
        mteb_score=0.787343078,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
    ########## Qwen2Model
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-code-v1",
        architecture="Qwen2Model",
        mteb_score=0.75724465,
        dtype="float32",
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
        enable_test=True,
    ),
 ]

 RERANK_MODELS = [
    ########## XLMRobertaForSequenceClassification
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "BAAI/bge-reranker-base",
        architecture="XLMRobertaForSequenceClassification",
        mteb_score=0.32398,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "BAAI/bge-reranker-large",
        architecture="XLMRobertaForSequenceClassification",
        enable_test=False,
    ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "BAAI/bge-reranker-v2-m3",
        architecture="XLMRobertaForSequenceClassification",
        enable_test=False,
--- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
@ -9,14 +9,12 @@ import torch
 from torch.utils.data import DataLoader

 from tests.conftest import HfRunner
-from tests.models.language.pooling_mteb_test.mteb_utils import (
-    VllmMtebCrossEncoder,
-    mteb_test_rerank_models,
-)
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo
+
+from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models

 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "BAAI/bge-reranker-v2-gemma",
        architecture="GemmaForSequenceClassification",
        mteb_score=0.33757,
@ -25,6 +23,10 @@ RERANK_MODELS = [
            "classifier_from_token": ["Yes"],
            "method": "no_post_processing",
        },
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
    ),
 ]

--- a/tests/models/language/pooling_mteb_test/test_cross_encoder.py
+++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py
@ -3,23 +3,29 @@
 import pytest

 from tests.models.utils import (
-    CLSPoolingRerankModelInfo,
-    LASTPoolingRerankModelInfo,
    RerankModelInfo,
 )

-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import mteb_test_rerank_models

 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "cross-encoder/ms-marco-TinyBERT-L-2-v2",
        mteb_score=0.32898,
        architecture="BertForSequenceClassification",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
        mteb_score=0.25736,
        architecture="Qwen3ForSequenceClassification",
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
    ),
 ]

--- a/tests/models/language/pooling_mteb_test/test_gte.py
+++ b/tests/models/language/pooling_mteb_test/test_gte.py
@ -5,36 +5,32 @@ import pytest

 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
    EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
    RerankModelInfo,
 )

-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models

 MODELS = [
    ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "thenlper/gte-large",
        mteb_score=0.76807651,
        architecture="BertModel",
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-base", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-small", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
        "thenlper/gte-large-zh", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-base-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
        "thenlper/gte-small-zh", architecture="BertModel", enable_test=False
    ),
    ########### NewModel
@ -43,48 +39,64 @@ MODELS = [
    # - whether to use token_type_embeddings
    # - whether to use context expansion
    # So only test one (the most widely used) model
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Alibaba-NLP/gte-multilingual-base",
        architecture="GteNewModel",
        mteb_score=0.775074696,
        hf_overrides={"architectures": ["GteNewModel"]},
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Alibaba-NLP/gte-base-en-v1.5",
        architecture="GteNewModel",
        hf_overrides={"architectures": ["GteNewModel"]},
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Alibaba-NLP/gte-large-en-v1.5",
        architecture="GteNewModel",
        hf_overrides={"architectures": ["GteNewModel"]},
        enable_test=False,
    ),
    ########### Qwen2ForCausalLM
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
        mteb_score=0.758473459018872,
        architecture="Qwen2ForCausalLM",
+        pooling_type="LAST",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
    ########## ModernBertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Alibaba-NLP/gte-modernbert-base",
        mteb_score=0.748193353,
        architecture="ModernBertModel",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
    ########## Qwen3ForCausalLM
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Qwen/Qwen3-Embedding-0.6B",
        mteb_score=0.771163695,
        architecture="Qwen3ForCausalLM",
        dtype="float32",
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
        enable_test=True,
    ),
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Qwen/Qwen3-Embedding-4B",
        architecture="Qwen3ForCausalLM",
        dtype="float32",
@ -93,18 +105,26 @@ MODELS = [
 ]

 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        # classifier_pooling: mean
        "Alibaba-NLP/gte-reranker-modernbert-base",
        mteb_score=0.33386,
        architecture="ModernBertForSequenceClassification",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "Alibaba-NLP/gte-multilingual-reranker-base",
        mteb_score=0.33062,
        architecture="GteNewForSequenceClassification",
        hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
 ]
--- a/tests/models/language/pooling_mteb_test/test_intfloat.py
+++ b/tests/models/language/pooling_mteb_test/test_intfloat.py
@ -3,40 +3,44 @@
 import pytest

 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo

-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models

 MODELS = [
    ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "intfloat/e5-small",
        architecture="BertModel",
        mteb_score=0.742285423,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
-        "intfloat/e5-base", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "intfloat/e5-large", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
        "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
    ),
    ########## XLMRobertaModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "intfloat/multilingual-e5-base",
        architecture="XLMRobertaModel",
        mteb_score=0.779325955,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "intfloat/multilingual-e5-large",
        architecture="XLMRobertaModel",
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "intfloat/multilingual-e5-large-instruct",
        architecture="XLMRobertaModel",
        enable_test=False,
--- a/tests/models/language/pooling_mteb_test/test_jina.py
+++ b/tests/models/language/pooling_mteb_test/test_jina.py
@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import (
    matryoshka_fy,
 )
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
    EmbedModelInfo,
    RerankModelInfo,
 )
 from vllm import PoolingParams

-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models

 EMBEDDING_MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "jinaai/jina-embeddings-v3",
        mteb_score=0.824413164,
        architecture="XLMRobertaModel",
        is_matryoshka=True,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        dtype="float32",
    )
 ]

 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "jinaai/jina-reranker-v2-base-multilingual",
        mteb_score=0.33643,
        architecture="XLMRobertaForSequenceClassification",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    )
 ]

--- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
+++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
@ -6,9 +6,9 @@ import pytest
 import torch

 from tests.conftest import HfRunner
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo

-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import mteb_test_rerank_models

 mxbai_rerank_hf_overrides = {
    "architectures": ["Qwen2ForSequenceClassification"],
@ -17,14 +17,18 @@ mxbai_rerank_hf_overrides = {
 }

 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "mixedbread-ai/mxbai-rerank-base-v2",
        architecture="Qwen2ForSequenceClassification",
        hf_overrides=mxbai_rerank_hf_overrides,
        mteb_score=0.273,
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
        enable_test=True,
    ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "mixedbread-ai/mxbai-rerank-large-v2",
        architecture="Qwen2ForSequenceClassification",
        hf_overrides=mxbai_rerank_hf_overrides,
--- a/tests/models/language/pooling_mteb_test/test_nemotron.py
+++ b/tests/models/language/pooling_mteb_test/test_nemotron.py
@ -3,29 +3,39 @@

 import pytest

+from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
+    mteb_test_embed_models,
+)
+from tests.models.language.pooling_mteb_test.mteb_score_utils import (
+    mteb_test_rerank_models,
+)
 from tests.models.utils import (
    EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
-    LASTPoolingRerankModelInfo,
    RerankModelInfo,
 )

-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
-
 EMBEDDING_MODELS = [
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "nvidia/llama-nemotron-embed-1b-v2",
        architecture="LlamaBidirectionalModel",
        mteb_score=0.689164662128673,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    )
 ]

 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "nvidia/llama-nemotron-rerank-1b-v2",
        architecture="LlamaBidirectionalForSequenceClassification",
        chat_template_name="nemotron-rerank.jinja",
        mteb_score=0.33994,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    ),
 ]

--- a/tests/models/language/pooling_mteb_test/test_nomic.py
+++ b/tests/models/language/pooling_mteb_test/test_nomic.py
@ -4,30 +4,38 @@
 import pytest

 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo

-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models

 MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "nomic-ai/nomic-embed-text-v1",
        architecture="NomicBertModel",
        mteb_score=0.737568559,
        enable_test=True,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "nomic-ai/nomic-embed-text-v1.5",
        architecture="NomicBertModel",
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "nomic-ai/nomic-embed-text-v2-moe",
        architecture="NomicBertModel",
        mteb_score=0.715488912,
        enable_test=True,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    ),
 ]

--- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
+++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
@ -6,10 +6,10 @@ import pytest
 import torch

 from tests.conftest import HfRunner
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo
 from tests.utils import multi_gpu_test

-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import mteb_test_rerank_models

 qwen3_reranker_hf_overrides = {
    "architectures": ["Qwen3ForSequenceClassification"],
@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = {
 }

 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "Qwen/Qwen3-Reranker-0.6B",
        architecture="Qwen3ForSequenceClassification",
        mteb_score=0.25736,
        hf_overrides=qwen3_reranker_hf_overrides,
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
        enable_test=True,
    ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "Qwen/Qwen3-Reranker-4B",
        architecture="Qwen3ForSequenceClassification",
        hf_overrides=qwen3_reranker_hf_overrides,
--- a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
@ -4,62 +4,82 @@
 import pytest

 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo

-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models

 MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-xs",
        is_matryoshka=False,
        architecture="BertModel",
        mteb_score=0.714927797,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-s",
        is_matryoshka=False,
        architecture="BertModel",
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-m",
        is_matryoshka=False,
        architecture="BertModel",
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-m-long",
        is_matryoshka=False,
        architecture="NomicBertModel",
        mteb_score=0.681146831,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-l",
        is_matryoshka=False,
        architecture="BertModel",
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-m-v1.5",
        is_matryoshka=True,
        architecture="BertModel",
        mteb_score=0.649088363,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-l-v2.0",
        is_matryoshka=True,
        architecture="XLMRobertaModel",
        mteb_score=0.712258299,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-m-v2.0",
        is_matryoshka=True,
        architecture="GteModel",
        mteb_score=0.706622444,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
 ]
--- a/tests/models/language/pooling_mteb_test/test_st_projector.py
+++ b/tests/models/language/pooling_mteb_test/test_st_projector.py
@ -3,25 +3,31 @@
 import pytest

 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
    EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
 )

-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models

 # ST models with projector (Dense) layers
 ST_PROJECTOR_MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "TencentBAC/Conan-embedding-v1",
        architecture="BertModel",
        mteb_score=0.688611955,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "google/embeddinggemma-300m",
        architecture="Gemma3TextModel",
        mteb_score=0.7473819294684156,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
        dtype="float32",
    ),
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@ -267,7 +267,7 @@ def run_embedding_input_test(
    """Inference result should be the same between
    original image/video input and image/video embeddings input.
    """
-    from transformers import AutoProcessor  # noqa: F401
+    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained(model)

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@ -10,7 +10,7 @@ import torch
 import torch.nn.functional as F
 from transformers import PretrainedConfig

-from vllm.config.model import ModelConfig, ModelDType, RunnerOption
+from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal.processing import InputProcessingContext
 from vllm.tokenizers import cached_tokenizer_from_config
@ -375,7 +375,10 @@ class ModelInfo:
    max_model_len: int | None = None
    hf_dtype: str = "float32"
    hf_overrides: dict[str, Any] | None = None
-    default_pooling_type: str = ""
+    pooling_type: str | None = None
+    attn_type: AttnTypeStr | None = None
+    is_prefix_caching_supported: bool | None = None
+    is_chunked_prefill_supported: bool | None = None
    enable_test: bool = True


@ -386,32 +389,12 @@ class EmbedModelInfo(ModelInfo):
    matryoshka_dimensions: list[int] | None = None


-@dataclass
-class CLSPoolingEmbedModelInfo(EmbedModelInfo):
-    default_pooling_type: str = "CLS"
-
-
-@dataclass
-class LASTPoolingEmbedModelInfo(EmbedModelInfo):
-    default_pooling_type: str = "LAST"
-
-
@dataclass
 class RerankModelInfo(ModelInfo):
    mteb_score: float | None = None
    chat_template_name: str | None = None


-@dataclass
-class CLSPoolingRerankModelInfo(RerankModelInfo):
-    default_pooling_type: str = "CLS"
-
-
-@dataclass
-class LASTPoolingRerankModelInfo(RerankModelInfo):
-    default_pooling_type: str = "LAST"
-
-
@dataclass
 class GenerateModelInfo(ModelInfo):
    hf_dtype: str = "auto"
--- a/tests/v1/kv_connector/unit/test_example_connector.py
+++ b/tests/v1/kv_connector/unit/test_example_connector.py
@ -145,7 +145,7 @@ def test_shared_storage_connector_hashes(tmp_path):

    # don't put this import at the top level
    # it will call torch.cuda.device_count()
-    from transformers import AutoProcessor  # noqa: F401
+    from transformers import AutoProcessor

    # Create processor to handle the chat prompt
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@ -164,7 +164,7 @@ class ModelConfig:
    """The specific revision to use for the tokenizer on the Hugging Face Hub.
    It can be a branch name, a tag name, or a commit id. If unspecified, will
    use the default version."""
-    max_model_len: int = Field(default=None, gt=0)
+    max_model_len: int = Field(default=None, ge=-1)
    """Model context length (prompt and output). If unspecified, will be
    automatically derived from the model config.

@ -595,7 +595,7 @@ class ModelConfig:

        # Avoid running try_verify_and_update_config multiple times
        self.config_updated = False
-
+        self._try_verify_and_update_model_config()
        self._verify_quantization()
        self._verify_cuda_graph()
        self._verify_bnb_config()
@ -1008,6 +1008,23 @@ class ModelConfig:
                "when expert parallelism is enabled."
            )

+    def _try_verify_and_update_model_config(self):
+        # Avoid running try_verify_and_update_config multiple times
+        if getattr(self, "config_updated", False):
+            return
+
+        architecture = self.architecture
+        if architecture is None:
+            return
+
+        from vllm.model_executor.models.config import (
+            MODELS_CONFIG_MAP,
+        )
+
+        cls = MODELS_CONFIG_MAP.get(architecture, None)
+        if cls is not None:
+            cls.verify_and_update_model_config(self)
+
    def verify_dual_chunk_attention_config(
        self,
        load_config: LoadConfig,
--- a/vllm/distributed/ec_transfer/ec_connector/example_connector.py
+++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py
@ -81,10 +81,7 @@ class ECExampleConnector(ECConnectorBase):
        assert encoder_cache is not None
        if metadata is None:
            logger.warning(
-                (
-                    "In connector.start_load_caches, ",
-                    "but the connector metadata is None",
-                )
+                "In connector.start_load_caches, but the connector metadata is None"
            )
            return
        # Load the EC for each mm data
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -297,16 +297,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
        elif contains_type(type_hints, set):
            kwargs[name].update(collection_to_kwargs(type_hints, set))
        elif contains_type(type_hints, int):
-            kwargs[name]["type"] = int
-            # Special case for large integers
-            human_readable_ints = {
-                "max_model_len",
-                "max_num_batched_tokens",
-                "kv_cache_memory_bytes",
-            }
-            if name in human_readable_ints:
+            if name == "max_model_len":
+                kwargs[name]["type"] = human_readable_int_or_auto
+                kwargs[name]["help"] += f"\n\n{human_readable_int_or_auto.__doc__}"
+            elif name in ("max_num_batched_tokens", "kv_cache_memory_bytes"):
                kwargs[name]["type"] = human_readable_int
                kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
+            else:
+                kwargs[name]["type"] = int
        elif contains_type(type_hints, float):
            kwargs[name]["type"] = float
        elif contains_type(type_hints, dict) and (
@ -2042,23 +2040,17 @@ def _raise_unsupported_error(feature_name: str):
    raise NotImplementedError(msg)


-def human_readable_int(value):
+def human_readable_int(value: str) -> int:
    """Parse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.
-    Also accepts -1 or 'auto' as a special value for auto-detection.

    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
-    - '-1' or 'auto' -> -1 (special value for auto-detection)
    """
    value = value.strip()

-    # Handle -1 or 'auto' as a special value for auto-detection
-    if value == "-1" or value.lower() == "auto":
-        return -1
-
    match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
    if match:
        decimal_multiplier = {
@ -2092,3 +2084,22 @@ def human_readable_int(value):

    # Regular plain number.
    return int(value)
+
+
+def human_readable_int_or_auto(value: str) -> int:
+    """Parse human-readable integers like '1k', '2M', etc.
+    Including decimal values with decimal multipliers.
+    Also accepts -1 or 'auto' as a special value for auto-detection.
+
+    Examples:
+    - '1k' -> 1,000
+    - '1K' -> 1,024
+    - '25.6k' -> 25,600
+    - '-1' or 'auto' -> -1 (special value for auto-detection)
+    """
+    value = value.strip()
+
+    if value == "-1" or value.lower() == "auto":
+        return -1
+
+    return human_readable_int(value)
--- a/vllm/entrypoints/cli/benchmark/main.py
+++ b/vllm/entrypoints/cli/benchmark/main.py
@ -32,6 +32,7 @@ class BenchmarkSubcommand(CLISubcommand):
    ) -> FlexibleArgumentParser:
        bench_parser = subparsers.add_parser(
            self.name,
+            help=self.help,
            description=self.help,
            usage=f"vllm {self.name} <bench_type> [options]",
        )
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@ -66,7 +66,11 @@ class ServeSubcommand(CLISubcommand):
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        serve_parser = subparsers.add_parser(
-            self.name, description=DESCRIPTION, usage="vllm serve [model_tag] [options]"
+            self.name,
+            help="Launch a local OpenAI-compatible API server to serve LLM "
+            "completions via HTTP.",
+            description=DESCRIPTION,
+            usage="vllm serve [model_tag] [options]",
        )

        serve_parser = make_arg_parser(serve_parser)
--- a/vllm/entrypoints/serve/elastic_ep/api_router.py
+++ b/vllm/entrypoints/serve/elastic_ep/api_router.py
@ -43,7 +43,7 @@ async def scale_elastic_ep(raw_request: Request):
    try:
        body = await raw_request.json()
    except json.JSONDecodeError as e:
-        raise HTTPException(status_code=400, detail="Invalid JSON format") from e  # noqa: B904
+        raise HTTPException(status_code=400, detail="Invalid JSON format") from e

    new_data_parallel_size = body.get("new_data_parallel_size")
    drain_timeout = body.get("drain_timeout", 120)  # Default 2 minutes
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@ -625,8 +625,9 @@ def silu_mul_per_token_group_quant_fp8_colmajor(
    M, N = input.size()
    N_2 = N // 2

+    fp8_dtype = current_platform.fp8_dtype()
    if output is None:
-        output = torch.empty((M, N_2), dtype=torch.float8_e4m3fn, device=input.device)
+        output = torch.empty((M, N_2), dtype=fp8_dtype, device=input.device)

    output_scales = torch.empty(
        ((N_2 // GROUP_SIZE), M), dtype=torch.float32, device=input.device
@ -637,9 +638,12 @@ def silu_mul_per_token_group_quant_fp8_colmajor(
    assert M % BLOCK_M == 0
    assert N_2 % BLOCK_N == 0

-    finfo = torch.finfo(torch.float8_e4m3fn)
-    fp8_min = finfo.min
-    fp8_max = finfo.max
+    # Using the default value (240.0) from pytorch will cause accuracy
+    # issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm
+    # platforms that use the torch.float8_e4m3fnuz dtype.
+    finfo = torch.finfo(fp8_dtype)
+    fp8_min = -224.0 if current_platform.is_fp8_fnuz() else finfo.min
+    fp8_max = 224.0 if current_platform.is_fp8_fnuz() else finfo.max

    # Force even division so we can avoid edgecases within the kernel.
    assert M % BLOCK_M == 0
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa: SIM117
 import fnmatch
 import glob
 import itertools
@ -59,7 +58,7 @@ def is_moe_model(model: torch.nn.Module) -> bool:


 class BitsAndBytesModelLoader(BaseModelLoader):
-    """Model loader to load model weights with BitAndBytes quantization."""
+    """Model loader to load model weights with BitsAndBytes quantization."""

    possible_config_file_names = ["adapter_config.json"]

--- a/vllm/model_executor/model_loader/runai_streamer_loader.py
+++ b/vllm/model_executor/model_loader/runai_streamer_loader.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa: SIM117
 import os
 from collections.abc import Generator

--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@ -13,7 +13,7 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec

 if TYPE_CHECKING:
-    from vllm.config import VllmConfig
+    from vllm.config import ModelConfig, VllmConfig

 logger = init_logger(__name__)

@ -21,20 +21,24 @@ logger = init_logger(__name__)
 class VerifyAndUpdateConfig:
    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        raise NotImplementedError
+        return

-
-class Gemma3TextModelConfig:
    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        hf_config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        return
+
+
+class Gemma3TextModelConfig(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        hf_config = model_config.hf_config
        hf_config.is_causal = not hf_config.use_bidirectional_attention


 class GteNewModelConfig(VerifyAndUpdateConfig):
    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config

        assert config.__class__.__name__ == "NewConfig"
        assert config.hidden_act == "gelu"
@ -53,16 +57,15 @@ class GteNewModelConfig(VerifyAndUpdateConfig):

 class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig):
    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        pooler_config = vllm_config.model_config.pooler_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        pooler_config = model_config.pooler_config
        if pooler_config.use_activation is None:
            pooler_config.use_activation = False


 class JinaRobertaModelConfig(VerifyAndUpdateConfig):
    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        model_config = vllm_config.model_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
        config = model_config.hf_config

        if config.position_embedding_type == "rotary":
@ -90,10 +93,10 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):

 class LlamaBidirectionalConfig(VerifyAndUpdateConfig):
    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
        from vllm.config.pooler import PoolingTypeStr

-        hf_config = vllm_config.model_config.hf_config
+        hf_config = model_config.hf_config
        hf_config.is_causal = False

        pooling_type_map: dict[str, PoolingTypeStr] = {
@ -105,7 +108,7 @@ class LlamaBidirectionalConfig(VerifyAndUpdateConfig):
        pooling_type = pooling_type_map.get(hf_config.pooling, None)
        if pooling_type is None:
            raise ValueError(f"pool_type {hf_config.pooling} not supported")
-        vllm_config.model_config.pooler_config.pooling_type = pooling_type
+        model_config.pooler_config.pooling_type = pooling_type


 class NomicBertModelConfig(VerifyAndUpdateConfig):
@ -204,8 +207,8 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):

 class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        pooler_config = vllm_config.model_config.pooler_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        pooler_config = model_config.pooler_config

        if pooler_config.step_tag_id is None:
            pooler_config.step_tag_id = 151651
@ -213,8 +216,8 @@ class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):

 class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):
    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        pooler_config = vllm_config.model_config.pooler_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        pooler_config = model_config.pooler_config

        if pooler_config.softmax is None:
            pooler_config.softmax = False
@ -222,8 +225,8 @@ class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig):

 class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config

        is_original_qwen3_reranker = getattr(
            config, "is_original_qwen3_reranker", False
@ -237,23 +240,23 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
            "Try loading the original Qwen3 Reranker?, see: "
            "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py"
        )
-        vllm_config.model_config.hf_config.method = "from_2_way_softmax"
+        model_config.hf_config.method = "from_2_way_softmax"


 class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config
        config.num_labels = 1
-        pooler_config = vllm_config.model_config.pooler_config
+        pooler_config = model_config.pooler_config
        if pooler_config.logit_bias is None:
            pooler_config.logit_bias = 2.65


 class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        config = vllm_config.model_config.hf_config
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config

        assert config.__class__.__name__ == "GteConfig"
        assert config.hidden_act == "gelu"
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@ -64,7 +64,6 @@ from .interfaces import (
    SupportsLoRA,
    SupportsPP,
 )
-from .interfaces_base import attn_type
 from .utils import (
    AutoWeightsLoader,
    PPMissingLayer,
@ -707,14 +706,12 @@ class LlamaForCausalLM(
        return name, loaded_weight


-@attn_type("encoder_only")
 class LlamaBidirectionalForSequenceClassification(as_seq_cls_model(LlamaForCausalLM)):
    # This class sets the correct attention type and pooling type
    # through LlamaBidirectionalConfig.
    pass


-@attn_type("encoder_only")
 class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)):
    # This class sets the correct attention type and pooling type
    # through LlamaBidirectionalConfig.
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@ -606,6 +606,43 @@ def get_request_block_hasher(
    return request_block_hasher


+def _check_enough_kv_cache_memory(
+    available_memory: int,
+    get_needed_memory: Callable[[], int],
+    max_model_len: int,
+    estimate_max_model_len: Callable[[int], int],
+):
+    if available_memory <= 0:
+        raise ValueError(
+            "No available memory for the cache blocks. "
+            "Try increasing `gpu_memory_utilization` when initializing the engine. "
+            "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
+            "for more details."
+        )
+
+    needed_memory = get_needed_memory()
+
+    if needed_memory > available_memory:
+        estimated_max_len = estimate_max_model_len(available_memory)
+        estimated_msg = ""
+        if estimated_max_len > 0:
+            estimated_msg = (
+                "Based on the available memory, "
+                f"the estimated maximum model length is {estimated_max_len}. "
+            )
+
+        raise ValueError(
+            f"To serve at least one request with the models's max seq len "
+            f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
+            f"cache is needed, which is larger than the available KV cache "
+            f"memory ({available_memory / GiB_bytes:.2f} GiB). {estimated_msg}"
+            f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
+            f"when initializing the engine. "
+            f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
+            f"for more details."
+        )
+
+
 def max_memory_usage_bytes(
    vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec]
 ) -> int:
@ -688,43 +725,12 @@ def check_enough_kv_cache_memory(
    """

    # No need to check for available memory if the kv_cache_spec is empty
-    if not kv_cache_spec:
-        return
-
-    if available_memory <= 0:
-        raise ValueError(
-            "No available memory for the cache blocks. "
-            "Try increasing `gpu_memory_utilization` when "
-            "initializing the engine. "
-            "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
-            "for more details."
-        )
-
-    max_model_len = vllm_config.model_config.max_model_len
-    needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
-
-    if needed_memory > available_memory:
-        # Estimate the maximum model length that can fit in the available memory
-        estimated_max_len = estimate_max_model_len(
-            vllm_config, kv_cache_spec, available_memory
-        )
-        estimated_msg = ""
-        if estimated_max_len > 0:
-            estimated_msg = (
-                "Based on the available memory, "
-                f"the estimated maximum model length is {estimated_max_len}."
-            )
-
-        raise ValueError(
-            f"To serve at least one request with the models's max seq len "
-            f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
-            f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory / GiB_bytes:.2f} GiB). "
-            f"{estimated_msg} "
-            f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
-            f"when initializing the engine. "
-            f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
-            f"for more details."
+    if kv_cache_spec:
+        _check_enough_kv_cache_memory(
+            available_memory,
+            lambda: max_memory_usage_bytes(vllm_config, kv_cache_spec.values()),
+            vllm_config.model_config.max_model_len,
+            lambda am: estimate_max_model_len(vllm_config, kv_cache_spec, am),
        )


@ -1586,36 +1592,16 @@ def get_kv_cache_configs(
    # Check if the available memory is enough (using min across all workers).
    # We use the global groups to correctly account for padding.
    if global_kv_cache_groups:
-        min_available_memory = min(available_memory)
-        if min_available_memory <= 0:
-            raise ValueError(
-                "No available memory for the cache blocks. "
-                "Try increasing `gpu_memory_utilization` when "
-                "initializing the engine."
-            )
-        max_model_len = vllm_config.model_config.max_model_len
-        needed_memory = _max_memory_usage_bytes_from_groups(
-            vllm_config, global_kv_cache_groups
+        _check_enough_kv_cache_memory(
+            min(available_memory),
+            lambda: _max_memory_usage_bytes_from_groups(
+                vllm_config, global_kv_cache_groups
+            ),
+            vllm_config.model_config.max_model_len,
+            lambda am: _estimate_max_model_len_from_groups(
+                vllm_config, global_kv_cache_groups, am
+            ),
        )
-        if needed_memory > min_available_memory:
-            estimated_max_len = _estimate_max_model_len_from_groups(
-                vllm_config, global_kv_cache_groups, min_available_memory
-            )
-            estimated_msg = ""
-            if estimated_max_len > 0:
-                estimated_msg = (
-                    f"Based on the available memory, the estimated maximum "
-                    f"model length is {estimated_max_len}. "
-                )
-            raise ValueError(
-                f"To serve at least one request with the models's max seq len "
-                f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
-                f"cache is needed, which is larger than the available KV cache "
-                f"memory ({min_available_memory / GiB_bytes:.2f} GiB). "
-                f"{estimated_msg}"
-                f"Try increasing `gpu_memory_utilization` or decreasing "
-                f"`max_model_len` when initializing the engine."
-            )

    kv_cache_configs: list[KVCacheConfig] = []
    for kv_cache_spec_one_worker, available_memory_one_worker in zip(
--- a/vllm/v1/worker/ec_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/ec_connector_model_runner_mixin.py
@ -6,9 +6,7 @@ Define EC connector functionality mixin for model runners.

 from collections.abc import Generator
 from contextlib import AbstractContextManager, contextmanager, nullcontext
-from typing import (
-    TYPE_CHECKING,  # noqa: UP035
-)
+from typing import TYPE_CHECKING

 import torch

--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@ -7,9 +7,7 @@ Define KV connector functionality mixin for model runners.
 import copy
 from collections.abc import Generator
 from contextlib import AbstractContextManager, contextmanager, nullcontext
-from typing import (
-    TYPE_CHECKING,  # noqa: UP035
-)
+from typing import TYPE_CHECKING

 import torch