[Doc] Update docs to refer to pooling models (#11093)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2024-12-11 21:36:27 +08:00 committed by GitHub
parent 8f10d5e393
commit cad5c0a6ed
14 changed files with 26 additions and 21 deletions

View File

@@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
Q: Which model to use for offline inference embedding?
-A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
+A: You can try `e5-mistral-7b-instruct <https://huggingface.co/intfloat/e5-mistral-7b-instruct>`__ and `BAAI/bge-base-en-v1.5 <https://huggingface.co/BAAI/bge-base-en-v1.5>`__;
+more are listed :ref:`here <supported_models>`.
+By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__,
+`Mistral-7B-Instruct-v0.3 <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`__ into embedding models,
+but they are expected to be inferior to models that are specifically trained on embedding tasks.
----------------------------------------
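As a concrete companion to the updated answer above, here is a minimal offline embedding sketch. It assumes the `e5-mistral-7b-instruct` checkpoint mentioned in the FAQ and the `LLM.encode()` pooling entry point; the exact `task` value and output field names may differ between vLLM versions.

```python
from vllm import LLM

prompts = ["Hello, my name is", "The capital of France is"]

# Load a model trained for embedding tasks; `task="embedding"` selects the
# pooling runner (newer releases spell this `task="embed"`).
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embedding")

# `encode()` performs pooling instead of token generation.
outputs = llm.encode(prompts)
for output in outputs:
    # A list of floats per prompt; the attribute name may vary by version.
    print(len(output.outputs.embedding))
```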

View File

@@ -14,7 +14,7 @@ if TYPE_CHECKING:
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
ModelInputForGPUWithSamplingMetadata)
-# Placeholder attention backend for models like Mamba and embedding models that
+# Placeholder attention backend for models like Mamba and pooling models that
# lack attention.

View File

@@ -152,7 +152,7 @@ class ModelConfig:
this argument will be used to configure the neuron config that
can not be gathered from the vllm arguments.
override_pooler_config: Initialize non default pooling config or
-override default pooling config for the embedding model.
+override default pooling config for the pooling model.
"""
def __init__(
@@ -576,7 +576,7 @@ class ModelConfig:
self.use_async_output_proc = False
return
-# Async postprocessor is not necessary with embedding mode
+# Async postprocessor is not necessary for pooling models
# since there is no token generation
if self.runner_type == "pooling":
self.use_async_output_proc = False
@@ -1825,11 +1825,11 @@ class MultiModalConfig:
@dataclass
class PoolerConfig:
-"""Controls the behavior of output pooling in embedding models."""
+"""Controls the behavior of output pooling in pooling models."""
pooling_type: Optional[str] = None
"""
-The pooling method of the embedding model. This should be a key in
+The pooling method of the pooling model. This should be a key in
:class:`vllm.model_executor.layers.pooler.PoolingType`.
"""

View File

@@ -8,7 +8,7 @@ from vllm.utils import Device
class PlaceholderBlockSpaceManager(BlockSpaceManager):
"""A version of BlockSpaceManager for use in environments
where block management is not required.
-For example: embedding models or attention-free models like Mamba.
+For example: pooling models or attention-free models like Mamba.
This class provides the same interface as BlockSpaceManager, but its
methods perform no actions or return simple values like True in specific

View File

@@ -893,7 +893,7 @@ class EngineArgs:
'--override-pooler-config',
type=PoolerConfig.from_json,
default=None,
-help="Override or set the pooling method in the embedding model. "
+help="Override or set the pooling method for pooling models. "
"e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")
parser.add_argument('--compilation-config',
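For the CLI flag above, the help text's JSON example can also be built programmatically. This is only a sketch: the `vllm serve` invocation in the comment and the `EngineArgs` field name are assumptions based on the parser definition shown.

```python
from vllm.config import PoolerConfig
from vllm.engine.arg_utils import EngineArgs

# CLI equivalent (illustrative):
#   vllm serve BAAI/bge-base-en-v1.5 \
#       --override-pooler-config '{"pooling_type": "mean", "normalize": false}'
pooler_config = PoolerConfig.from_json(
    '{"pooling_type": "mean", "normalize": false}')

engine_args = EngineArgs(
    model="BAAI/bge-base-en-v1.5",
    override_pooler_config=pooler_config,
)
```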
@@ -1085,7 +1085,7 @@ class EngineArgs:
"setting --max-model-len to a smaller value.", max_model_len)
elif (self.enable_chunked_prefill
and model_config.runner_type == "pooling"):
-msg = "Chunked prefill is not supported for embedding models"
+msg = "Chunked prefill is not supported for pooling models"
raise ValueError(msg)
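To illustrate the validation above: combining chunked prefill with a pooling model is expected to fail when the engine config is built. The model name and `task` spelling are assumptions.

```python
from vllm import LLM

try:
    # Chunked prefill only applies to token generation, so engine-argument
    # validation is expected to reject it for a pooling model.
    LLM(
        model="BAAI/bge-base-en-v1.5",
        task="embedding",
        enable_chunked_prefill=True,
    )
except ValueError as err:
    print(err)  # e.g. "Chunked prefill is not supported for pooling models"
```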

View File

@@ -1085,7 +1085,7 @@ class AsyncLLMEngine(EngineClient):
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
-"""Generate outputs for a request from an embedding model.
+"""Generate outputs for a request from a pooling model.
Generate outputs for a request. This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs
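Since this docstring documents the async pooling path, a minimal consumer sketch follows. The `encode` method name, its positional arguments, and the example model are assumptions inferred from the signature shown and may differ across versions.

```python
import asyncio

from vllm import PoolingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def main() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="intfloat/e5-mistral-7b-instruct",
                        task="embedding"))

    # The pooling path streams PoolingRequestOutput objects; there is no
    # incremental generation, so keep the last yielded result.
    final_output = None
    async for output in engine.encode("What is the capital of France?",
                                      PoolingParams(),
                                      request_id="embed-0"):
        final_output = output
    print(final_output)


asyncio.run(main())
```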

View File

@@ -527,7 +527,7 @@ class MQLLMEngineClient(EngineClient):
*,
inputs: Optional[PromptType] = None # DEPRECATED
) -> AsyncGenerator[PoolingRequestOutput, None]:
-"""Generate outputs for a request from an embedding model.
+"""Generate outputs for a request from a pooling model.
Generate outputs for a request. This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs

View File

@@ -209,7 +209,7 @@ class EngineClient(ABC):
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
-"""Generate outputs for a request from an embedding model."""
+"""Generate outputs for a request from a pooling model."""
...
@abstractmethod

View File

@@ -119,7 +119,7 @@ class OpenAIServingScores(OpenAIServing):
if prompt_adapter_request is not None:
raise NotImplementedError("Prompt adapter is not supported "
-"for embedding models")
+"for scoring models")
if isinstance(tokenizer, MistralTokenizer):
raise ValueError(

View File

@@ -618,9 +618,9 @@ class SequenceGroup:
arrival_time: The arrival time of the request.
lora_request: LoRA request.
embeddings: The embeddings vectors of the prompt of the sequence group
-for an embedding model.
+for a pooling model.
pooling_params: The pooling parameters used to generate the pooling
-for an embedding model.
+for a pooling model.
encoder_seq: Optional, the single encoder sequence. Should be None
unless you are working with an encoder/decoder model.
trace_headers: OpenTelemetry trace headers.
@@ -1102,7 +1102,7 @@ class PoolerOutput(
msgspec.Struct,
omit_defaults=True, # type: ignore[call-arg]
array_like=True): # type: ignore[call-arg]
-"""The output from a pooling operation in the embedding model."""
+"""The output from a pooling operation in the pooling model."""
outputs: List[EmbeddingSequenceGroupOutput]
# lazy import to avoid circular import

View File

@@ -59,7 +59,7 @@ class Processor:
priority: int = 0,
) -> Tuple[DetokenizerRequest, EngineCoreRequest]:
-# TODO(woosuk): Support embedding mode.
+# TODO(woosuk): Support pooling models.
# TODO(woosuk): Check max_logprobs
# TODO(woosuk): Support encoder-decoder models.

View File

@@ -178,7 +178,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[CPUCacheEngine]
-# Initialize cpu_cache as embedding models don't initialize kv_caches
+# Initialize cpu_cache as pooling models don't initialize kv_caches
self.cpu_cache: Optional[List[List[torch.Tensor]]] = None
# Torch profiler. Enabled and configured through env vars:

View File

@@ -65,8 +65,8 @@ class HPUWorker(LocalOrDistributedWorkerBase):
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[HPUCacheEngine]
-# Initialize gpu_cache as embedding models don't initialize kv_caches
-self.hpu_cache: Optional[List[List[torch.tensor]]] = None
+# Initialize gpu_cache as pooling models don't initialize kv_caches
+self.hpu_cache: Optional[List[List[torch.Tensor]]] = None
# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR:

View File

@@ -91,7 +91,7 @@ class Worker(LocalOrDistributedWorkerBase):
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[CacheEngine]
-# Initialize gpu_cache as embedding models don't initialize kv_caches
+# Initialize gpu_cache as pooling models don't initialize kv_caches
self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}