[Doc] Update docs to refer to pooling models (#11093)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2024-12-11 21:36:27 +08:00 committed by GitHub
parent 8f10d5e393
commit cad5c0a6ed
14 changed files with 26 additions and 21 deletions

View File

@@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
Q: Which model to use for offline inference embedding?
-A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
+A: You can try `e5-mistral-7b-instruct <https://huggingface.co/intfloat/e5-mistral-7b-instruct>`__ and `BAAI/bge-base-en-v1.5 <https://huggingface.co/BAAI/bge-base-en-v1.5>`__;
+more are listed :ref:`here <supported_models>`.
+By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__,
+`Mistral-7B-Instruct-v0.3 <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`__ into embedding models,
+but they are expected to be inferior to models that are specifically trained on embedding tasks.
----------------------------------------
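As a concrete companion to the updated answer above, here is a minimal offline embedding sketch. It assumes the `e5-mistral-7b-instruct` checkpoint mentioned in the FAQ and the `LLM.encode()` pooling entry point; the exact `task` value and output field names may differ between vLLM versions.

```python
from vllm import LLM

prompts = ["Hello, my name is", "The capital of France is"]

# Load a model trained for embedding tasks; `task="embedding"` selects the
# pooling runner (newer releases spell this `task="embed"`).
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embedding")

# `encode()` performs pooling instead of token generation.
outputs = llm.encode(prompts)
for output in outputs:
    # A list of floats per prompt; the attribute name may vary by version.
    print(len(output.outputs.embedding))
```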

View File

@@ -14,7 +14,7 @@ if TYPE_CHECKING:
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
ModelInputForGPUWithSamplingMetadata)
-# Placeholder attention backend for models like Mamba and embedding models that
+# Placeholder attention backend for models like Mamba and pooling models that
# lack attention.

View File

@@ -152,7 +152,7 @@ class ModelConfig:
this argument will be used to configure the neuron config that
can not be gathered from the vllm arguments.
override_pooler_config: Initialize non default pooling config or
-override default pooling config for the embedding model.
+override default pooling config for the pooling model.
"""
def __init__(
@@ -576,7 +576,7 @@ class ModelConfig:
self.use_async_output_proc = False
return
-# Async postprocessor is not necessary with embedding mode
+# Async postprocessor is not necessary for pooling models
# since there is no token generation
if self.runner_type == "pooling":
self.use_async_output_proc = False
@@ -1825,11 +1825,11 @@ class MultiModalConfig:
@dataclass
class PoolerConfig:
-"""Controls the behavior of output pooling in embedding models."""
+"""Controls the behavior of output pooling in pooling models."""
pooling_type: Optional[str] = None
"""
-The pooling method of the embedding model. This should be a key in
+The pooling method of the pooling model. This should be a key in
:class:`vllm.model_executor.layers.pooler.PoolingType`.
"""

View File

@@ -8,7 +8,7 @@ from vllm.utils import Device
class PlaceholderBlockSpaceManager(BlockSpaceManager):
"""A version of BlockSpaceManager for use in environments
where block management is not required.
-For example: embedding models or attention-free models like Mamba.
+For example: pooling models or attention-free models like Mamba.
This class provides the same interface as BlockSpaceManager, but its
methods perform no actions or return simple values like True in specific

View File

@@ -893,7 +893,7 @@ class EngineArgs:
'--override-pooler-config',
type=PoolerConfig.from_json,
default=None,
-help="Override or set the pooling method in the embedding model. "
+help="Override or set the pooling method for pooling models. "
"e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")
parser.add_argument('--compilation-config',
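For the CLI flag above, the help text's JSON example can also be built programmatically. This is only a sketch: the `vllm serve` invocation in the comment and the `EngineArgs` field name are assumptions based on the parser definition shown.

```python
from vllm.config import PoolerConfig
from vllm.engine.arg_utils import EngineArgs

# CLI equivalent (illustrative):
#   vllm serve BAAI/bge-base-en-v1.5 \
#       --override-pooler-config '{"pooling_type": "mean", "normalize": false}'
pooler_config = PoolerConfig.from_json(
    '{"pooling_type": "mean", "normalize": false}')

engine_args = EngineArgs(
    model="BAAI/bge-base-en-v1.5",
    override_pooler_config=pooler_config,
)
```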
@@ -1085,7 +1085,7 @@ class EngineArgs:
"setting --max-model-len to a smaller value.", max_model_len)
elif (self.enable_chunked_prefill
and model_config.runner_type == "pooling"):
-msg = "Chunked prefill is not supported for embedding models"
+msg = "Chunked prefill is not supported for pooling models"
raise ValueError(msg)
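To illustrate the validation above: combining chunked prefill with a pooling model is expected to fail when the engine config is built. The model name and `task` spelling are assumptions.

```python
from vllm import LLM

try:
    # Chunked prefill only applies to token generation, so engine-argument
    # validation is expected to reject it for a pooling model.
    LLM(
        model="BAAI/bge-base-en-v1.5",
        task="embedding",
        enable_chunked_prefill=True,
    )
except ValueError as err:
    print(err)  # e.g. "Chunked prefill is not supported for pooling models"
```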

View File

@@ -1085,7 +1085,7 @@ class AsyncLLMEngine(EngineClient):
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
-"""Generate outputs for a request from an embedding model.
+"""Generate outputs for a request from a pooling model.
Generate outputs for a request. This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs
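Since this docstring documents the async pooling path, a minimal consumer sketch follows. The `encode` method name, its positional arguments, and the example model are assumptions inferred from the signature shown and may differ across versions.

```python
import asyncio

from vllm import PoolingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def main() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="intfloat/e5-mistral-7b-instruct",
                        task="embedding"))

    # The pooling path streams PoolingRequestOutput objects; there is no
    # incremental generation, so keep the last yielded result.
    final_output = None
    async for output in engine.encode("What is the capital of France?",
                                      PoolingParams(),
                                      request_id="embed-0"):
        final_output = output
    print(final_output)


asyncio.run(main())
```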

View File

@@ -527,7 +527,7 @@ class MQLLMEngineClient(EngineClient):
*,
inputs: Optional[PromptType] = None # DEPRECATED
) -> AsyncGenerator[PoolingRequestOutput, None]:
-"""Generate outputs for a request from an embedding model.
+"""Generate outputs for a request from a pooling model.
Generate outputs for a request. This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs

View File

@@ -209,7 +209,7 @@ class EngineClient(ABC):
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
-"""Generate outputs for a request from an embedding model."""
+"""Generate outputs for a request from a pooling model."""
...
@abstractmethod

View File

@@ -119,7 +119,7 @@ class OpenAIServingScores(OpenAIServing):
if prompt_adapter_request is not None:
raise NotImplementedError("Prompt adapter is not supported "
-"for embedding models")
+"for scoring models")
if isinstance(tokenizer, MistralTokenizer):
raise ValueError(

View File

@@ -618,9 +618,9 @@ class SequenceGroup:
arrival_time: The arrival time of the request.
lora_request: LoRA request.
embeddings: The embeddings vectors of the prompt of the sequence group
-for an embedding model.
+for a pooling model.
pooling_params: The pooling parameters used to generate the pooling
-for an embedding model.
+for a pooling model.
encoder_seq: Optional, the single encoder sequence. Should be None
unless you are working with an encoder/decoder model.
trace_headers: OpenTelemetry trace headers.
@@ -1102,7 +1102,7 @@ class PoolerOutput(
msgspec.Struct,
omit_defaults=True, # type: ignore[call-arg]
array_like=True): # type: ignore[call-arg]
-"""The output from a pooling operation in the embedding model."""
+"""The output from a pooling operation in the pooling model."""
outputs: List[EmbeddingSequenceGroupOutput]
# lazy import to avoid circular import

View File

@@ -59,7 +59,7 @@ class Processor:
priority: int = 0,
) -> Tuple[DetokenizerRequest, EngineCoreRequest]:
-# TODO(woosuk): Support embedding mode.
+# TODO(woosuk): Support pooling models.
# TODO(woosuk): Check max_logprobs
# TODO(woosuk): Support encoder-decoder models.

View File

@@ -178,7 +178,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[CPUCacheEngine]
-# Initialize cpu_cache as embedding models don't initialize kv_caches
+# Initialize cpu_cache as pooling models don't initialize kv_caches
self.cpu_cache: Optional[List[List[torch.Tensor]]] = None
# Torch profiler. Enabled and configured through env vars:

View File

@@ -65,8 +65,8 @@ class HPUWorker(LocalOrDistributedWorkerBase):
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[HPUCacheEngine]
-# Initialize gpu_cache as embedding models don't initialize kv_caches
-self.hpu_cache: Optional[List[List[torch.tensor]]] = None
+# Initialize gpu_cache as pooling models don't initialize kv_caches
+self.hpu_cache: Optional[List[List[torch.Tensor]]] = None
# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR:

View File

@@ -91,7 +91,7 @@ class Worker(LocalOrDistributedWorkerBase):
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[CacheEngine]
-# Initialize gpu_cache as embedding models don't initialize kv_caches
+# Initialize gpu_cache as pooling models don't initialize kv_caches
self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}