mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-16 19:35:53 +08:00
[Doc] Update docs to refer to pooling models (#11093)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
8f10d5e393
commit
cad5c0a6ed
@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
|
|||||||
|
|
||||||
Q: Which model to use for offline inference embedding?
|
Q: Which model to use for offline inference embedding?
|
||||||
|
|
||||||
A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
|
A: You can try `e5-mistral-7b-instruct <https://huggingface.co/intfloat/e5-mistral-7b-instruct>`__ and `BAAI/bge-base-en-v1.5 <https://huggingface.co/BAAI/bge-base-en-v1.5>`__;
|
||||||
|
more are listed :ref:`here <supported_models>`.
|
||||||
|
|
||||||
|
By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__,
|
||||||
|
`Mistral-7B-Instruct-v0.3 <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`__ into embedding models,
|
||||||
|
but they are expected to be inferior to models that are specifically trained on embedding tasks.
|
||||||
|
|
||||||
----------------------------------------
|
----------------------------------------
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,7 @@ if TYPE_CHECKING:
|
|||||||
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
|
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
|
||||||
ModelInputForGPUWithSamplingMetadata)
|
ModelInputForGPUWithSamplingMetadata)
|
||||||
|
|
||||||
# Placeholder attention backend for models like Mamba and embedding models that
|
# Placeholder attention backend for models like Mamba and pooling models that
|
||||||
# lack attention.
|
# lack attention.
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -152,7 +152,7 @@ class ModelConfig:
|
|||||||
this argument will be used to configure the neuron config that
|
this argument will be used to configure the neuron config that
|
||||||
can not be gathered from the vllm arguments.
|
can not be gathered from the vllm arguments.
|
||||||
override_pooler_config: Initialize non default pooling config or
|
override_pooler_config: Initialize non default pooling config or
|
||||||
override default pooling config for the embedding model.
|
override default pooling config for the pooling model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -576,7 +576,7 @@ class ModelConfig:
|
|||||||
self.use_async_output_proc = False
|
self.use_async_output_proc = False
|
||||||
return
|
return
|
||||||
|
|
||||||
# Async postprocessor is not necessary with embedding mode
|
# Async postprocessor is not necessary for pooling models
|
||||||
# since there is no token generation
|
# since there is no token generation
|
||||||
if self.runner_type == "pooling":
|
if self.runner_type == "pooling":
|
||||||
self.use_async_output_proc = False
|
self.use_async_output_proc = False
|
||||||
@ -1825,11 +1825,11 @@ class MultiModalConfig:
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class PoolerConfig:
|
class PoolerConfig:
|
||||||
"""Controls the behavior of output pooling in embedding models."""
|
"""Controls the behavior of output pooling in pooling models."""
|
||||||
|
|
||||||
pooling_type: Optional[str] = None
|
pooling_type: Optional[str] = None
|
||||||
"""
|
"""
|
||||||
The pooling method of the embedding model. This should be a key in
|
The pooling method of the pooling model. This should be a key in
|
||||||
:class:`vllm.model_executor.layers.pooler.PoolingType`.
|
:class:`vllm.model_executor.layers.pooler.PoolingType`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@ -8,7 +8,7 @@ from vllm.utils import Device
|
|||||||
class PlaceholderBlockSpaceManager(BlockSpaceManager):
|
class PlaceholderBlockSpaceManager(BlockSpaceManager):
|
||||||
"""A version of BlockSpaceManager for use in environments
|
"""A version of BlockSpaceManager for use in environments
|
||||||
where block management is not required.
|
where block management is not required.
|
||||||
For example: embedding models or attention-free models like Mamba.
|
For example: pooling models or attention-free models like Mamba.
|
||||||
|
|
||||||
This class provides the same interface as BlockSpaceManager, but its
|
This class provides the same interface as BlockSpaceManager, but its
|
||||||
methods perform no actions or return simple values like True in specific
|
methods perform no actions or return simple values like True in specific
|
||||||
|
|||||||
@ -893,7 +893,7 @@ class EngineArgs:
|
|||||||
'--override-pooler-config',
|
'--override-pooler-config',
|
||||||
type=PoolerConfig.from_json,
|
type=PoolerConfig.from_json,
|
||||||
default=None,
|
default=None,
|
||||||
help="Override or set the pooling method in the embedding model. "
|
help="Override or set the pooling method for pooling models. "
|
||||||
"e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")
|
"e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")
|
||||||
|
|
||||||
parser.add_argument('--compilation-config',
|
parser.add_argument('--compilation-config',
|
||||||
@ -1085,7 +1085,7 @@ class EngineArgs:
|
|||||||
"setting --max-model-len to a smaller value.", max_model_len)
|
"setting --max-model-len to a smaller value.", max_model_len)
|
||||||
elif (self.enable_chunked_prefill
|
elif (self.enable_chunked_prefill
|
||||||
and model_config.runner_type == "pooling"):
|
and model_config.runner_type == "pooling"):
|
||||||
msg = "Chunked prefill is not supported for embedding models"
|
msg = "Chunked prefill is not supported for pooling models"
|
||||||
raise ValueError(msg)
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1085,7 +1085,7 @@ class AsyncLLMEngine(EngineClient):
|
|||||||
trace_headers: Optional[Mapping[str, str]] = None,
|
trace_headers: Optional[Mapping[str, str]] = None,
|
||||||
priority: int = 0,
|
priority: int = 0,
|
||||||
) -> AsyncGenerator[PoolingRequestOutput, None]:
|
) -> AsyncGenerator[PoolingRequestOutput, None]:
|
||||||
"""Generate outputs for a request from an embedding model.
|
"""Generate outputs for a request from a pooling model.
|
||||||
|
|
||||||
Generate outputs for a request. This method is a coroutine. It adds the
|
Generate outputs for a request. This method is a coroutine. It adds the
|
||||||
request into the waiting queue of the LLMEngine and streams the outputs
|
request into the waiting queue of the LLMEngine and streams the outputs
|
||||||
|
|||||||
@ -527,7 +527,7 @@ class MQLLMEngineClient(EngineClient):
|
|||||||
*,
|
*,
|
||||||
inputs: Optional[PromptType] = None # DEPRECATED
|
inputs: Optional[PromptType] = None # DEPRECATED
|
||||||
) -> AsyncGenerator[PoolingRequestOutput, None]:
|
) -> AsyncGenerator[PoolingRequestOutput, None]:
|
||||||
"""Generate outputs for a request from an embedding model.
|
"""Generate outputs for a request from a pooling model.
|
||||||
|
|
||||||
Generate outputs for a request. This method is a coroutine. It adds the
|
Generate outputs for a request. This method is a coroutine. It adds the
|
||||||
request into the waiting queue of the LLMEngine and streams the outputs
|
request into the waiting queue of the LLMEngine and streams the outputs
|
||||||
|
|||||||
@ -209,7 +209,7 @@ class EngineClient(ABC):
|
|||||||
trace_headers: Optional[Mapping[str, str]] = None,
|
trace_headers: Optional[Mapping[str, str]] = None,
|
||||||
priority: int = 0,
|
priority: int = 0,
|
||||||
) -> AsyncGenerator[PoolingRequestOutput, None]:
|
) -> AsyncGenerator[PoolingRequestOutput, None]:
|
||||||
"""Generate outputs for a request from an embedding model."""
|
"""Generate outputs for a request from a pooling model."""
|
||||||
...
|
...
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
|||||||
@ -119,7 +119,7 @@ class OpenAIServingScores(OpenAIServing):
|
|||||||
|
|
||||||
if prompt_adapter_request is not None:
|
if prompt_adapter_request is not None:
|
||||||
raise NotImplementedError("Prompt adapter is not supported "
|
raise NotImplementedError("Prompt adapter is not supported "
|
||||||
"for embedding models")
|
"for scoring models")
|
||||||
|
|
||||||
if isinstance(tokenizer, MistralTokenizer):
|
if isinstance(tokenizer, MistralTokenizer):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|||||||
@ -618,9 +618,9 @@ class SequenceGroup:
|
|||||||
arrival_time: The arrival time of the request.
|
arrival_time: The arrival time of the request.
|
||||||
lora_request: LoRA request.
|
lora_request: LoRA request.
|
||||||
embeddings: The embeddings vectors of the prompt of the sequence group
|
embeddings: The embeddings vectors of the prompt of the sequence group
|
||||||
for an embedding model.
|
for a pooling model.
|
||||||
pooling_params: The pooling parameters used to generate the pooling
|
pooling_params: The pooling parameters used to generate the pooling
|
||||||
for an embedding model.
|
for a pooling model.
|
||||||
encoder_seq: Optional, the single encoder sequence. Should be None
|
encoder_seq: Optional, the single encoder sequence. Should be None
|
||||||
unless you are working with an encoder/decoder model.
|
unless you are working with an encoder/decoder model.
|
||||||
trace_headers: OpenTelemetry trace headers.
|
trace_headers: OpenTelemetry trace headers.
|
||||||
@ -1102,7 +1102,7 @@ class PoolerOutput(
|
|||||||
msgspec.Struct,
|
msgspec.Struct,
|
||||||
omit_defaults=True, # type: ignore[call-arg]
|
omit_defaults=True, # type: ignore[call-arg]
|
||||||
array_like=True): # type: ignore[call-arg]
|
array_like=True): # type: ignore[call-arg]
|
||||||
"""The output from a pooling operation in the embedding model."""
|
"""The output from a pooling operation in the pooling model."""
|
||||||
outputs: List[EmbeddingSequenceGroupOutput]
|
outputs: List[EmbeddingSequenceGroupOutput]
|
||||||
|
|
||||||
# lazy import to avoid circular import
|
# lazy import to avoid circular import
|
||||||
|
|||||||
@ -59,7 +59,7 @@ class Processor:
|
|||||||
priority: int = 0,
|
priority: int = 0,
|
||||||
) -> Tuple[DetokenizerRequest, EngineCoreRequest]:
|
) -> Tuple[DetokenizerRequest, EngineCoreRequest]:
|
||||||
|
|
||||||
# TODO(woosuk): Support embedding mode.
|
# TODO(woosuk): Support pooling models.
|
||||||
# TODO(woosuk): Check max_logprobs
|
# TODO(woosuk): Check max_logprobs
|
||||||
# TODO(woosuk): Support encoder-decoder models.
|
# TODO(woosuk): Support encoder-decoder models.
|
||||||
|
|
||||||
|
|||||||
@ -178,7 +178,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
|
|||||||
# Uninitialized cache engine. Will be initialized by
|
# Uninitialized cache engine. Will be initialized by
|
||||||
# initialize_cache.
|
# initialize_cache.
|
||||||
self.cache_engine: List[CPUCacheEngine]
|
self.cache_engine: List[CPUCacheEngine]
|
||||||
# Initialize cpu_cache as embedding models don't initialize kv_caches
|
# Initialize cpu_cache as pooling models don't initialize kv_caches
|
||||||
self.cpu_cache: Optional[List[List[torch.Tensor]]] = None
|
self.cpu_cache: Optional[List[List[torch.Tensor]]] = None
|
||||||
|
|
||||||
# Torch profiler. Enabled and configured through env vars:
|
# Torch profiler. Enabled and configured through env vars:
|
||||||
|
|||||||
@ -65,8 +65,8 @@ class HPUWorker(LocalOrDistributedWorkerBase):
|
|||||||
# Uninitialized cache engine. Will be initialized by
|
# Uninitialized cache engine. Will be initialized by
|
||||||
# initialize_cache.
|
# initialize_cache.
|
||||||
self.cache_engine: List[HPUCacheEngine]
|
self.cache_engine: List[HPUCacheEngine]
|
||||||
# Initialize gpu_cache as embedding models don't initialize kv_caches
|
# Initialize hpu_cache as pooling models don't initialize kv_caches
|
||||||
self.hpu_cache: Optional[List[List[torch.tensor]]] = None
|
self.hpu_cache: Optional[List[List[torch.Tensor]]] = None
|
||||||
# Torch profiler. Enabled and configured through env vars:
|
# Torch profiler. Enabled and configured through env vars:
|
||||||
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
||||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
if envs.VLLM_TORCH_PROFILER_DIR:
|
||||||
|
|||||||
@ -91,7 +91,7 @@ class Worker(LocalOrDistributedWorkerBase):
|
|||||||
# Uninitialized cache engine. Will be initialized by
|
# Uninitialized cache engine. Will be initialized by
|
||||||
# initialize_cache.
|
# initialize_cache.
|
||||||
self.cache_engine: List[CacheEngine]
|
self.cache_engine: List[CacheEngine]
|
||||||
# Initialize gpu_cache as embedding models don't initialize kv_caches
|
# Initialize gpu_cache as pooling models don't initialize kv_caches
|
||||||
self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
|
self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
|
||||||
self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}
|
self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user