diff --git a/vllm/config.py b/vllm/config.py index 4afdda3cca641..738a9b3376c3b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -797,17 +797,12 @@ class ModelConfig: else: # Aliases if task_option == "embedding": - preferred_task = self._get_preferred_task( - architectures, supported_tasks) - if preferred_task != "embed": - msg = ("The 'embedding' task will be restricted to " - "embedding models in a future release. Please " - "pass `--task classify`, `--task score`, or " - "`--task reward` explicitly for other pooling " - "models.") - warnings.warn(msg, DeprecationWarning, stacklevel=2) + msg = ("The 'embedding' task has been renamed to " + "'embed', please use the new name. The old name " + "will be removed in v1.0.") + warnings.warn(msg, DeprecationWarning, stacklevel=2) - task_option = preferred_task or "embed" + task_option = "embed" if task_option not in supported_tasks: msg = ( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2da89b4f5944c..b991cb3a444bc 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,7 +17,7 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus from json import JSONDecodeError -from typing import Annotated, Optional, Union +from typing import Annotated, Optional import prometheus_client import regex as re @@ -59,9 +59,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, EmbeddingRequest, - EmbeddingResponse, - EmbeddingResponseData, - ErrorResponse, + EmbeddingResponse, ErrorResponse, LoadLoRAAdapterRequest, PoolingChatRequest, PoolingCompletionRequest, @@ -627,37 +625,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request): async def create_embedding(request: EmbeddingRequest, raw_request: Request): handler = embedding(raw_request) if handler is None: - fallback_handler = pooling(raw_request) - if fallback_handler is None: - return base(raw_request).create_error_response( - message="The model does not support Embeddings API") + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") - logger.warning( - "Embeddings API will become exclusive to embedding models " - "in a future release. To return the hidden states directly, " - "use the Pooling API (`/pooling`) instead.") - - res = await fallback_handler.create_pooling(request, raw_request) - - generator: Union[ErrorResponse, EmbeddingResponse] - if isinstance(res, PoolingResponse): - generator = EmbeddingResponse( - id=res.id, - object=res.object, - created=res.created, - model=res.model, - data=[ - EmbeddingResponseData( - index=d.index, - embedding=d.data, # type: ignore - ) for d in res.data - ], - usage=res.usage, - ) - else: - generator = res - else: - generator = await handler.create_embedding(request, raw_request) + generator = await handler.create_embedding(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), diff --git a/vllm/outputs.py b/vllm/outputs.py index 33cc50c872b67..3960388bf73c6 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from typing import Any, Generic, Optional, Union import torch -from typing_extensions import TypeVar, deprecated +from typing_extensions import TypeVar from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -76,14 +76,6 @@ class PoolingOutput: return (isinstance(other, self.__class__) and bool( (self.data == other.data).all())) - @property - @deprecated("`LLM.encode()` now stores raw outputs in the `data` " - "attribute. To return embeddings, use `LLM.embed()`. " - "To return class probabilities, use `LLM.classify()` " - "and access the `probs` attribute. ") - def embedding(self) -> list[float]: - return self.data.tolist() - class RequestOutput: """The output data of a completion request to the LLM. @@ -506,12 +498,6 @@ class ScoringOutput: def __repr__(self) -> str: return f"ScoringOutput(score={self.score})" - @property - @deprecated("`LLM.score()` now returns scalar scores. " - "Please access it via the `score` attribute. ") - def embedding(self) -> list[float]: - return [self.score] - class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]):