[Doc] Update docs to refer to pooling models (#11093)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent: 8f10d5e393
commit: cad5c0a6ed
@@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
 Q: Which model to use for offline inference embedding?
 
-A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead, models such as Llama-3-8b and Mistral-7B-Instruct-v0.3 are generation models rather than embedding models.
+A: You can try `e5-mistral-7b-instruct <https://huggingface.co/intfloat/e5-mistral-7b-instruct>`__ and `BAAI/bge-base-en-v1.5 <https://huggingface.co/BAAI/bge-base-en-v1.5>`__;
+more are listed :ref:`here <supported_models>`.
+
+By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__ and
+`Mistral-7B-Instruct-v0.3 <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`__ into embedding models,
+but they are expected to be inferior to models that are specifically trained on embedding tasks.
 
 ----------------------------------------
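To make the FAQ answer above concrete, a minimal offline-embedding sketch against the Python LLM entry point might look as follows; the task name (``task="embed"`` vs the older ``"embedding"``) and the exact field that holds the pooled vector vary between vLLM versions, so treat both as assumptions rather than the documented API:

    from vllm import LLM

    # An embedding-trained checkpoint; converted generation models such as
    # Llama-3-8B also work via pooling but typically give weaker embeddings.
    llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")

    prompts = ["Hello, my name is", "The capital of France is"]

    # encode() runs a pooling forward pass instead of autoregressive decoding
    # and returns one request output per prompt.
    outputs = llm.encode(prompts)

    for prompt, output in zip(prompts, outputs):
        # output.outputs holds the pooled result (an embedding vector); the
        # exact attribute name differs slightly between vLLM versions.
        print(prompt, "->", output.outputs)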
@@ -14,7 +14,7 @@ if TYPE_CHECKING:
     from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                           ModelInputForGPUWithSamplingMetadata)
 
-# Placeholder attention backend for models like Mamba and embedding models that
+# Placeholder attention backend for models like Mamba and pooling models that
 # lack attention.
@@ -152,7 +152,7 @@ class ModelConfig:
             this argument will be used to configure the neuron config that
             can not be gathered from the vllm arguments.
         override_pooler_config: Initialize non default pooling config or
-            override default pooling config for the embedding model.
+            override default pooling config for the pooling model.
     """
 
     def __init__(
@@ -576,7 +576,7 @@ class ModelConfig:
             self.use_async_output_proc = False
             return
 
-        # Async postprocessor is not necessary with embedding mode
+        # Async postprocessor is not necessary for pooling models
         # since there is no token generation
         if self.runner_type == "pooling":
             self.use_async_output_proc = False
@@ -1825,11 +1825,11 @@ class MultiModalConfig:
 
 @dataclass
 class PoolerConfig:
-    """Controls the behavior of output pooling in embedding models."""
+    """Controls the behavior of output pooling in pooling models."""
 
     pooling_type: Optional[str] = None
     """
-    The pooling method of the embedding model. This should be a key in
+    The pooling method of the pooling model. This should be a key in
     :class:`vllm.model_executor.layers.pooler.PoolingType`.
     """
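As a rough illustration of how this dataclass is meant to be used, the sketch below constructs a PoolerConfig directly and hands it to the offline LLM entry point via ``override_pooler_config``; the keyword-argument path and the ``normalize`` field are assumptions taken from the CLI help text further down, not a documented contract:

    from vllm import LLM
    from vllm.config import PoolerConfig

    # pooling_type should be a key of
    # vllm.model_executor.layers.pooler.PoolingType, e.g. MEAN, CLS or LAST.
    pooler_config = PoolerConfig(pooling_type="MEAN", normalize=False)

    llm = LLM(
        model="BAAI/bge-base-en-v1.5",
        task="embed",
        override_pooler_config=pooler_config,
    )
    print(llm.encode(["some text"])[0].outputs)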
@@ -8,7 +8,7 @@ from vllm.utils import Device
 class PlaceholderBlockSpaceManager(BlockSpaceManager):
     """A version of BlockSpaceManager for use in environments
     where block management is not required.
-    For example: embedding models or attention-free models like Mamba.
+    For example: pooling models or attention-free models like Mamba.
 
     This class provides the same interface as BlockSpaceManager, but its
     methods perform no actions or return simple values like True in specific
@@ -893,7 +893,7 @@ class EngineArgs:
             '--override-pooler-config',
             type=PoolerConfig.from_json,
             default=None,
-            help="Override or set the pooling method in the embedding model. "
+            help="Override or set the pooling method for pooling models. "
             "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")
 
         parser.add_argument('--compilation-config',
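Since the flag's value is parsed with PoolerConfig.from_json, the JSON string from the help text maps to a config object roughly like this (a sketch; the field names are taken from the help text example above):

    from vllm.config import PoolerConfig

    # Equivalent of passing
    #   --override-pooler-config '{"pooling_type": "mean", "normalize": false}'
    # on the engine command line.
    config = PoolerConfig.from_json('{"pooling_type": "mean", "normalize": false}')
    print(config.pooling_type, config.normalize)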
@@ -1085,7 +1085,7 @@ class EngineArgs:
                 "setting --max-model-len to a smaller value.", max_model_len)
         elif (self.enable_chunked_prefill
               and model_config.runner_type == "pooling"):
-            msg = "Chunked prefill is not supported for embedding models"
+            msg = "Chunked prefill is not supported for pooling models"
             raise ValueError(msg)
@@ -1085,7 +1085,7 @@ class AsyncLLMEngine(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from an embedding model.
+        """Generate outputs for a request from a pooling model.
 
         Generate outputs for a request. This method is a coroutine. It adds the
         request into the waiting queue of the LLMEngine and streams the outputs
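For context on the coroutine described in this docstring, a hedged usage sketch of the pooling path on AsyncLLMEngine could look like the following; the engine-arg name ``task="embed"`` and the exact encode() signature are assumptions and may differ between versions:

    import asyncio

    from vllm import AsyncEngineArgs, AsyncLLMEngine, PoolingParams


    async def main() -> None:
        engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model="intfloat/e5-mistral-7b-instruct", task="embed"))

        # encode() is the pooling-model counterpart of generate(): it streams
        # PoolingRequestOutput objects instead of token completions.
        final_output = None
        async for output in engine.encode("Hello, world",
                                          PoolingParams(),
                                          request_id="pooling-req-0"):
            final_output = output

        print(final_output.outputs)


    asyncio.run(main())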
@@ -527,7 +527,7 @@ class MQLLMEngineClient(EngineClient):
         *,
         inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from an embedding model.
+        """Generate outputs for a request from a pooling model.
 
         Generate outputs for a request. This method is a coroutine. It adds the
         request into the waiting queue of the LLMEngine and streams the outputs
@@ -209,7 +209,7 @@ class EngineClient(ABC):
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from an embedding model."""
+        """Generate outputs for a request from a pooling model."""
         ...
 
     @abstractmethod
@@ -119,7 +119,7 @@ class OpenAIServingScores(OpenAIServing):
 
         if prompt_adapter_request is not None:
             raise NotImplementedError("Prompt adapter is not supported "
-                                      "for embedding models")
+                                      "for scoring models")
 
         if isinstance(tokenizer, MistralTokenizer):
             raise ValueError(
@@ -618,9 +618,9 @@ class SequenceGroup:
         arrival_time: The arrival time of the request.
         lora_request: LoRA request.
         embeddings: The embeddings vectors of the prompt of the sequence group
-            for an embedding model.
+            for a pooling model.
         pooling_params: The pooling parameters used to generate the pooling
-            for an embedding model.
+            for a pooling model.
         encoder_seq: Optional, the single encoder sequence. Should be None
             unless you are working with an encoder/decoder model.
         trace_headers: OpenTelemetry trace headers.
@@ -1102,7 +1102,7 @@ class PoolerOutput(
         msgspec.Struct,
         omit_defaults=True,  # type: ignore[call-arg]
         array_like=True):  # type: ignore[call-arg]
-    """The output from a pooling operation in the embedding model."""
+    """The output from a pooling operation in the pooling model."""
     outputs: List[EmbeddingSequenceGroupOutput]
 
     # lazy import to avoid circular import
@@ -59,7 +59,7 @@ class Processor:
         priority: int = 0,
     ) -> Tuple[DetokenizerRequest, EngineCoreRequest]:
 
-        # TODO(woosuk): Support embedding mode.
+        # TODO(woosuk): Support pooling models.
         # TODO(woosuk): Check max_logprobs
         # TODO(woosuk): Support encoder-decoder models.
@@ -178,7 +178,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
         self.cache_engine: List[CPUCacheEngine]
-        # Initialize cpu_cache as embedding models don't initialize kv_caches
+        # Initialize cpu_cache as pooling models don't initialize kv_caches
        self.cpu_cache: Optional[List[List[torch.Tensor]]] = None
 
         # Torch profiler. Enabled and configured through env vars:
@@ -65,8 +65,8 @@ class HPUWorker(LocalOrDistributedWorkerBase):
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
         self.cache_engine: List[HPUCacheEngine]
-        # Initialize gpu_cache as embedding models don't initialize kv_caches
-        self.hpu_cache: Optional[List[List[torch.tensor]]] = None
+        # Initialize gpu_cache as pooling models don't initialize kv_caches
+        self.hpu_cache: Optional[List[List[torch.Tensor]]] = None
         # Torch profiler. Enabled and configured through env vars:
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
         if envs.VLLM_TORCH_PROFILER_DIR:
@@ -91,7 +91,7 @@ class Worker(LocalOrDistributedWorkerBase):
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
         self.cache_engine: List[CacheEngine]
-        # Initialize gpu_cache as embedding models don't initialize kv_caches
+        # Initialize gpu_cache as pooling models don't initialize kv_caches
         self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
         self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}