[Doc] Update docs to refer to pooling models (#11093)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2024-12-11 21:36:27 +08:00 committed by GitHub
parent 8f10d5e393
commit cad5c0a6ed
14 changed files with 26 additions and 21 deletions

@@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
Q: Which model to use for offline inference embedding?

-A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
+A: You can try `e5-mistral-7b-instruct <https://huggingface.co/intfloat/e5-mistral-7b-instruct>`__ and `BAAI/bge-base-en-v1.5 <https://huggingface.co/BAAI/bge-base-en-v1.5>`__;
+more are listed :ref:`here <supported_models>`.
+
+By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__ and
+`Mistral-7B-Instruct-v0.3 <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`__ into embedding models,
+but they are expected to be inferior to models that are specifically trained on embedding tasks.

----------------------------------------
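
For the question above, a minimal offline-embedding sketch in Python follows. The model name is taken from the answer; the task="embed" argument and the LLM.embed() call are assumptions about the installed vLLM version (older releases may use task="embedding" and LLM.encode()):

# Minimal sketch (assumed API): offline embedding inference with a pooling model.
from vllm import LLM

# task="embed" is an assumption; some releases spell it task="embedding".
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")

# LLM.embed() is assumed here; LLM.encode() returns the same pooled outputs
# on releases that predate the embed() helper.
outputs = llm.embed(["Hello, my name is", "The capital of France is"])
for output in outputs:
    embedding = output.outputs.embedding  # list of floats, one vector per prompt
    print(len(embedding))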

@@ -14,7 +14,7 @@ if TYPE_CHECKING:
    from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                          ModelInputForGPUWithSamplingMetadata)

-# Placeholder attention backend for models like Mamba and embedding models that
+# Placeholder attention backend for models like Mamba and pooling models that
# lack attention.

@@ -152,7 +152,7 @@ class ModelConfig:
this argument will be used to configure the neuron config that
can not be gathered from the vllm arguments.
override_pooler_config: Initialize non default pooling config or
-    override default pooling config for the embedding model.
+    override default pooling config for the pooling model.
"""

def __init__(
@@ -576,7 +576,7 @@ class ModelConfig:
self.use_async_output_proc = False
return

-# Async postprocessor is not necessary with embedding mode
+# Async postprocessor is not necessary for pooling models
# since there is no token generation
if self.runner_type == "pooling":
    self.use_async_output_proc = False
@@ -1825,11 +1825,11 @@ class MultiModalConfig:
@dataclass
class PoolerConfig:
-    """Controls the behavior of output pooling in embedding models."""
+    """Controls the behavior of output pooling in pooling models."""

    pooling_type: Optional[str] = None
    """
-    The pooling method of the embedding model. This should be a key in
+    The pooling method of the pooling model. This should be a key in
    :class:`vllm.model_executor.layers.pooler.PoolingType`.
    """

@@ -8,7 +8,7 @@ from vllm.utils import Device
class PlaceholderBlockSpaceManager(BlockSpaceManager):
    """A version of BlockSpaceManager for use in environments
    where block management is not required.
-    For example: embedding models or attention-free models like Mamba.
+    For example: pooling models or attention-free models like Mamba.

    This class provides the same interface as BlockSpaceManager, but its
    methods perform no actions or return simple values like True in specific

@@ -893,7 +893,7 @@ class EngineArgs:
    '--override-pooler-config',
    type=PoolerConfig.from_json,
    default=None,
-    help="Override or set the pooling method in the embedding model. "
+    help="Override or set the pooling method for pooling models. "
    "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")

parser.add_argument('--compilation-config',
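
The help text above implies a JSON value for the flag; a hedged command-line example follows (the model name and the --task flag are assumptions, not part of this diff):

vllm serve intfloat/e5-mistral-7b-instruct --task embed \
    --override-pooler-config '{"pooling_type": "mean", "normalize": false}'
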
@@ -1085,7 +1085,7 @@ class EngineArgs:
    "setting --max-model-len to a smaller value.", max_model_len)
elif (self.enable_chunked_prefill
      and model_config.runner_type == "pooling"):
-    msg = "Chunked prefill is not supported for embedding models"
+    msg = "Chunked prefill is not supported for pooling models"
    raise ValueError(msg)

@@ -1085,7 +1085,7 @@ class AsyncLLMEngine(EngineClient):
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
-    """Generate outputs for a request from an embedding model.
+    """Generate outputs for a request from a pooling model.

    Generate outputs for a request. This method is a coroutine. It adds the
    request into the waiting queue of the LLMEngine and streams the outputs
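
A hedged sketch of consuming this coroutine follows. The encode() parameters and the PoolingRequestOutput return type are taken from the hunk above; the AsyncEngineArgs construction and the request_id value are assumptions for illustration:

# Hedged sketch: streaming PoolingRequestOutput from AsyncLLMEngine.encode().
import asyncio

from vllm import PoolingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

async def main() -> None:
    # A task hint (e.g. task="embed") may be needed depending on the release.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="intfloat/e5-mistral-7b-instruct"))

    final_output = None
    # encode() is an async generator; the last yielded item holds the final pooled data.
    async for output in engine.encode("vLLM is a fast inference engine.",
                                      PoolingParams(),
                                      request_id="pool-0"):
        final_output = output
    print(final_output.outputs)

asyncio.run(main())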

@@ -527,7 +527,7 @@ class MQLLMEngineClient(EngineClient):
    *,
    inputs: Optional[PromptType] = None  # DEPRECATED
) -> AsyncGenerator[PoolingRequestOutput, None]:
-    """Generate outputs for a request from an embedding model.
+    """Generate outputs for a request from a pooling model.

    Generate outputs for a request. This method is a coroutine. It adds the
    request into the waiting queue of the LLMEngine and streams the outputs

@@ -209,7 +209,7 @@ class EngineClient(ABC):
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
-    """Generate outputs for a request from an embedding model."""
+    """Generate outputs for a request from a pooling model."""
    ...

@abstractmethod

@@ -119,7 +119,7 @@ class OpenAIServingScores(OpenAIServing):
if prompt_adapter_request is not None:
    raise NotImplementedError("Prompt adapter is not supported "
-                              "for embedding models")
+                              "for scoring models")

if isinstance(tokenizer, MistralTokenizer):
    raise ValueError(

@@ -618,9 +618,9 @@ class SequenceGroup:
arrival_time: The arrival time of the request.
lora_request: LoRA request.
embeddings: The embeddings vectors of the prompt of the sequence group
-    for an embedding model.
+    for a pooling model.
pooling_params: The pooling parameters used to generate the pooling
-    for an embedding model.
+    for a pooling model.
encoder_seq: Optional, the single encoder sequence. Should be None
    unless you are working with an encoder/decoder model.
trace_headers: OpenTelemetry trace headers.
@@ -1102,7 +1102,7 @@ class PoolerOutput(
    msgspec.Struct,
    omit_defaults=True,  # type: ignore[call-arg]
    array_like=True):  # type: ignore[call-arg]
-    """The output from a pooling operation in the embedding model."""
+    """The output from a pooling operation in the pooling model."""

    outputs: List[EmbeddingSequenceGroupOutput]

    # lazy import to avoid circular import

@@ -59,7 +59,7 @@ class Processor:
    priority: int = 0,
) -> Tuple[DetokenizerRequest, EngineCoreRequest]:

-    # TODO(woosuk): Support embedding mode.
+    # TODO(woosuk): Support pooling models.
    # TODO(woosuk): Check max_logprobs
    # TODO(woosuk): Support encoder-decoder models.

@@ -178,7 +178,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[CPUCacheEngine]
-# Initialize cpu_cache as embedding models don't initialize kv_caches
+# Initialize cpu_cache as pooling models don't initialize kv_caches
self.cpu_cache: Optional[List[List[torch.Tensor]]] = None

# Torch profiler. Enabled and configured through env vars:

@@ -65,8 +65,8 @@ class HPUWorker(LocalOrDistributedWorkerBase):
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[HPUCacheEngine]
-# Initialize gpu_cache as embedding models don't initialize kv_caches
-self.hpu_cache: Optional[List[List[torch.tensor]]] = None
+# Initialize gpu_cache as pooling models don't initialize kv_caches
+self.hpu_cache: Optional[List[List[torch.Tensor]]] = None

# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR:

@@ -91,7 +91,7 @@ class Worker(LocalOrDistributedWorkerBase):
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[CacheEngine]
-# Initialize gpu_cache as embedding models don't initialize kv_caches
+# Initialize gpu_cache as pooling models don't initialize kv_caches
self.gpu_cache: Optional[List[List[torch.Tensor]]] = None

self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}