[Deprecation] Remove fallbacks for Embeddings API (#18795)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung 2025-05-28 15:09:04 +08:00 committed by GitHub
parent 0f0926b43f
commit 0c492b7824
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 11 additions and 59 deletions

View File

@@ -797,17 +797,12 @@ class ModelConfig:
else:
# Aliases
if task_option == "embedding":
preferred_task = self._get_preferred_task(
architectures, supported_tasks)
if preferred_task != "embed":
msg = ("The 'embedding' task will be restricted to "
"embedding models in a future release. Please "
"pass `--task classify`, `--task score`, or "
"`--task reward` explicitly for other pooling "
"models.")
warnings.warn(msg, DeprecationWarning, stacklevel=2)
msg = ("The 'embedding' task has been renamed to "
"'embed', please use the new name. The old name "
"will be removed in v1.0.")
warnings.warn(msg, DeprecationWarning, stacklevel=2)
task_option = preferred_task or "embed"
task_option = "embed"
if task_option not in supported_tasks:
msg = (

View File

@@ -17,7 +17,7 @@ from contextlib import asynccontextmanager
from functools import partial
from http import HTTPStatus
from json import JSONDecodeError
from typing import Annotated, Optional, Union
from typing import Annotated, Optional
import prometheus_client
import regex as re
@@ -59,9 +59,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
EmbeddingChatRequest,
EmbeddingCompletionRequest,
EmbeddingRequest,
EmbeddingResponse,
EmbeddingResponseData,
ErrorResponse,
EmbeddingResponse, ErrorResponse,
LoadLoRAAdapterRequest,
PoolingChatRequest,
PoolingCompletionRequest,
@@ -627,37 +625,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
handler = embedding(raw_request)
if handler is None:
fallback_handler = pooling(raw_request)
if fallback_handler is None:
return base(raw_request).create_error_response(
message="The model does not support Embeddings API")
return base(raw_request).create_error_response(
message="The model does not support Embeddings API")
logger.warning(
"Embeddings API will become exclusive to embedding models "
"in a future release. To return the hidden states directly, "
"use the Pooling API (`/pooling`) instead.")
res = await fallback_handler.create_pooling(request, raw_request)
generator: Union[ErrorResponse, EmbeddingResponse]
if isinstance(res, PoolingResponse):
generator = EmbeddingResponse(
id=res.id,
object=res.object,
created=res.created,
model=res.model,
data=[
EmbeddingResponseData(
index=d.index,
embedding=d.data, # type: ignore
) for d in res.data
],
usage=res.usage,
)
else:
generator = res
else:
generator = await handler.create_embedding(request, raw_request)
generator = await handler.create_embedding(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),

View File

@@ -7,7 +7,7 @@ from dataclasses import dataclass
from typing import Any, Generic, Optional, Union
import torch
from typing_extensions import TypeVar, deprecated
from typing_extensions import TypeVar
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
@@ -76,14 +76,6 @@ class PoolingOutput:
return (isinstance(other, self.__class__) and bool(
(self.data == other.data).all()))
@property
@deprecated("`LLM.encode()` now stores raw outputs in the `data` "
"attribute. To return embeddings, use `LLM.embed()`. "
"To return class probabilities, use `LLM.classify()` "
"and access the `probs` attribute. ")
def embedding(self) -> list[float]:
return self.data.tolist()
class RequestOutput:
"""The output data of a completion request to the LLM.
@@ -506,12 +498,6 @@ class ScoringOutput:
def __repr__(self) -> str:
return f"ScoringOutput(score={self.score})"
@property
@deprecated("`LLM.score()` now returns scalar scores. "
"Please access it via the `score` attribute. ")
def embedding(self) -> list[float]:
return [self.score]
class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]):