[Deprecation] Remove inputs arg fallback in Engine classes (#18799)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2025-06-09 10:19:56 +08:00 committed by GitHub
parent 2ffb9b6e07
commit e31ae3de36
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 21 additions and 268 deletions

View File

@@ -6,12 +6,10 @@ import copy
import time
import weakref
from functools import partial
from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable,
List, Mapping, Optional, Set, Tuple, Type, Union, overload)
from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
Mapping, Optional, Set, Tuple, Type, Union)
from weakref import ReferenceType
from typing_extensions import deprecated
import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VllmConfig)
@@ -36,7 +34,7 @@ from vllm.sampling_params import SamplingParams
from vllm.sequence import ExecuteModelRequest
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Device, deprecate_kwargs, weak_bind
from vllm.utils import Device, weak_bind
logger = init_logger(__name__)
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -429,24 +427,6 @@ class _AsyncLLMEngine(LLMEngine):
return await (
self.get_tokenizer_group().get_lora_tokenizer_async(lora_request))
@overload
@deprecated("'inputs' will be renamed to 'prompt")
async def add_request_async(
self,
request_id: str,
*,
inputs: PromptType,
params: Union[SamplingParams, PoolingParams],
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> None:
...
@overload
async def add_request_async(
self,
request_id: str,
@@ -459,32 +439,10 @@ class _AsyncLLMEngine(LLMEngine):
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> None:
...
@deprecate_kwargs(
"inputs",
additional_message="Please use the 'prompt' parameter instead.",
)
async def add_request_async(
self,
request_id: str,
prompt: Optional[PromptType] = None,
params: Optional[Union[SamplingParams, PoolingParams]] = None,
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
*,
inputs: Optional[PromptType] = None, # DEPRECATED
) -> None:
"""Async version of
[`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
if inputs is not None:
prompt = inputs
assert prompt is not None and params is not None
"""
Async version of
[`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].
"""
if lora_request is not None and not self.lora_config:
raise ValueError(f"Got lora_request {lora_request} but LoRA is "
"not enabled!")
@@ -521,8 +479,7 @@ class _AsyncLLMEngine(LLMEngine):
params = await build_guided_decoding_logits_processor_async(
sampling_params=params,
tokenizer=await self.get_tokenizer_async(lora_request),
default_guided_backend=self.decoding_config.
guided_decoding_backend,
default_guided_backend=self.decoding_config.backend,
reasoning_backend=self.decoding_config.reasoning_backend,
model_config=self.model_config)
@@ -894,28 +851,7 @@ class AsyncLLMEngine(EngineClient):
raise
await asyncio.sleep(0)
# This method does not need to be async, but kept that way
# for backwards compatibility.
@overload
@deprecated("'inputs' will be renamed to 'prompt")
def add_request(
self,
request_id: str,
*,
inputs: PromptType,
params: Union[SamplingParams, PoolingParams],
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> Coroutine[None, None, AsyncGenerator[Union[
RequestOutput, PoolingRequestOutput], None]]:
...
@overload
def add_request(
async def add_request(
self,
request_id: str,
prompt: PromptType,
@@ -926,32 +862,7 @@ class AsyncLLMEngine(EngineClient):
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> Coroutine[None, None, AsyncGenerator[Union[
RequestOutput, PoolingRequestOutput], None]]:
...
@deprecate_kwargs(
"inputs",
additional_message="Please use the 'prompt' parameter instead.",
)
async def add_request(
self,
request_id: str,
prompt: Optional[PromptType] = None,
params: Optional[Union[SamplingParams, PoolingParams]] = None,
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
*,
inputs: Optional[PromptType] = None, # DEPRECATED
) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
if inputs is not None:
prompt = inputs
assert prompt is not None and params is not None
if not self.is_running:
if self.start_engine_loop:
self.start_background_loop()

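After this change, `AsyncLLMEngine.add_request` only accepts the prompt through the `prompt` parameter (positionally or by keyword); passing `inputs=` now fails with a `TypeError` instead of emitting a deprecation warning. A minimal caller-side sketch, assuming the upstream `AsyncEngineArgs`/`from_engine_args` construction path and an illustrative model name, neither of which appears in this diff:

```python
import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def main() -> None:
    # Engine construction is illustrative; it is not part of this diff.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))

    # Previously, add_request(..., inputs=..., ...) still worked with a
    # deprecation warning. Now the prompt must be passed as 'prompt'.
    stream = await engine.add_request(
        request_id="req-0",
        prompt="The capital of France is",
        params=SamplingParams(max_tokens=8),
    )
    async for request_output in stream:
        if request_output.finished:
            print(request_output.outputs[0].text)


asyncio.run(main())
```
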
View File

@@ -11,10 +11,10 @@ from functools import partial
from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
Iterable, List, Literal, Mapping, NamedTuple, Optional)
from typing import Sequence as GenericSequence
from typing import Set, Type, Union, cast, overload
from typing import Set, Type, Union, cast
import torch
from typing_extensions import TypeVar, deprecated
from typing_extensions import TypeVar
import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
@@ -58,8 +58,7 @@ from vllm.transformers_utils.tokenizer_group import (
TokenizerGroup, init_tokenizer_from_configs)
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
usage_message)
from vllm.utils import (Counter, Device, deprecate_kwargs,
resolve_obj_by_qualname, weak_bind)
from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
from vllm.version import __version__ as VLLM_VERSION
from vllm.worker.model_runner_base import InputProcessingError
@@ -629,7 +628,6 @@ class LLMEngine:
def stop_remote_worker_execution_loop(self) -> None:
self.model_executor.stop_remote_worker_execution_loop()
@overload
def add_request(
self,
request_id: str,
@@ -641,42 +639,6 @@ class LLMEngine:
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> None:
...
@overload
@deprecated("'inputs' will be renamed to 'prompt")
def add_request(
self,
request_id: str,
*,
inputs: PromptType,
params: Union[SamplingParams, PoolingParams],
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> None:
...
@deprecate_kwargs(
"inputs",
additional_message="Please use the 'prompt' parameter instead.",
)
def add_request(
self,
request_id: str,
prompt: Optional[PromptType] = None,
params: Optional[Union[SamplingParams, PoolingParams]] = None,
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
*,
inputs: Optional[PromptType] = None, # DEPRECATED
) -> None:
"""Add a request to the engine's request pool.
@@ -725,10 +687,6 @@ class LLMEngine:
>>> # continue the request processing
>>> ...
"""
if inputs is not None:
prompt = inputs
assert prompt is not None and params is not None
if lora_request is not None and not self.lora_config:
raise ValueError(f"Got lora_request {lora_request} but LoRA is "
"not enabled!")

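The synchronous `LLMEngine.add_request` loses the same fallback; only the `prompt` parameter remains. A short sketch of adding a request and draining the engine with `step()`, again with an illustrative model name and construction path that are not taken from this diff:

```python
from vllm import EngineArgs, LLMEngine, SamplingParams

# Illustrative construction; any model works the same way for this API.
engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

# The removed 'inputs=' keyword would now raise TypeError here.
engine.add_request(
    request_id="req-1",
    prompt="The capital of France is",
    params=SamplingParams(max_tokens=8),
)

# Drive the engine until the request finishes and print its completion.
while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)
```
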
View File

@@ -4,9 +4,7 @@
import uuid
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Mapping, Optional, Union, overload
from typing_extensions import deprecated
from typing import List, Mapping, Optional, Union
from vllm import PoolingParams
from vllm.inputs import PromptType
@@ -14,7 +12,7 @@ from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.utils import Device, deprecate_kwargs
from vllm.utils import Device
VLLM_RPC_SUCCESS_STR = "SUCCESS"
@@ -38,7 +36,6 @@ class RPCProcessRequest:
prompt_adapter_request: Optional[PromptAdapterRequest] = None
priority: int = 0
@overload
def __init__(
self,
prompt: PromptType,
@@ -49,44 +46,6 @@ class RPCProcessRequest:
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> None:
...
@overload
@deprecated("'inputs' will be renamed to 'prompt")
def __init__(
self,
*,
inputs: PromptType,
params: Union[SamplingParams, PoolingParams],
request_id: str,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> None:
...
@deprecate_kwargs(
"inputs",
additional_message="Please use the 'prompt' parameter instead.",
)
def __init__(
self,
prompt: Optional[PromptType] = None,
params: Optional[Union[SamplingParams, PoolingParams]] = None,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
*,
inputs: Optional[PromptType] = None, # DEPRECATED
) -> None:
if inputs is not None:
prompt = inputs
assert (prompt is not None and params is not None
and request_id is not None)
super().__init__()
self.prompt = prompt

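`RPCProcessRequest.__init__` likewise keeps only the `prompt`-based form, so `prompt`, `params`, and `request_id` must always be supplied instead of being checked by the removed assert. A construction sketch, assuming the class is importable from `vllm.engine.multiprocessing` as in the upstream package layout (the file path is not shown in this diff):

```python
from vllm import SamplingParams
from vllm.engine.multiprocessing import RPCProcessRequest

# The removed overload accepted 'inputs=' as a keyword; only 'prompt' is left.
request = RPCProcessRequest(
    prompt="The capital of France is",
    params=SamplingParams(max_tokens=8),
    request_id="req-2",
)
print(request.prompt, request.request_id)
```
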
View File

@@ -6,13 +6,12 @@ import copy
import pickle
from contextlib import contextmanager, suppress
from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
Optional, Union, cast, overload)
Optional, Union, cast)
import cloudpickle
import psutil
import zmq
import zmq.asyncio
from typing_extensions import deprecated
from zmq import Frame # type: ignore[attr-defined]
from zmq.asyncio import Socket
@@ -49,7 +48,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.utils import Device, deprecate_kwargs
from vllm.utils import Device
logger = init_logger(__name__)
@@ -442,7 +441,6 @@ class MQLLMEngineClient(EngineClient):
def dead_error(self) -> BaseException:
return ENGINE_DEAD_ERROR(self._errored_with)
@overload
def generate(
self,
prompt: PromptType,
@@ -452,39 +450,6 @@ class MQLLMEngineClient(EngineClient):
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> AsyncGenerator[RequestOutput, None]:
...
@overload
@deprecated("'inputs' will be renamed to 'prompt")
def generate(
self,
*,
inputs: PromptType,
sampling_params: SamplingParams,
request_id: str,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> AsyncGenerator[RequestOutput, None]:
...
@deprecate_kwargs(
"inputs",
additional_message="Please use the 'prompt' parameter instead.",
)
def generate(
self,
prompt: Optional[PromptType] = None,
sampling_params: Optional[SamplingParams] = None,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
*,
inputs: Optional[PromptType] = None # DEPRECATED
) -> AsyncGenerator[RequestOutput, None]:
"""Generate outputs for a request.
@@ -506,16 +471,12 @@ class MQLLMEngineClient(EngineClient):
Any priority other than 0 will lead to an error if the
scheduling policy is not "priority".
"""
if inputs is not None:
prompt = inputs
assert (prompt is not None and sampling_params is not None
and request_id is not None)
return cast(
AsyncGenerator[RequestOutput, None],
self._process_request(prompt, sampling_params, request_id,
lora_request, trace_headers,
prompt_adapter_request, priority))
return self._process_request(prompt, sampling_params, request_id,
lora_request, trace_headers,
prompt_adapter_request, priority)
@overload
def encode(
self,
prompt: PromptType,
@@ -524,37 +485,6 @@ class MQLLMEngineClient(EngineClient):
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
...
@overload
@deprecated("'inputs' will be renamed to 'prompt")
def encode(
self,
*,
inputs: PromptType,
pooling_params: PoolingParams,
request_id: str,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
...
@deprecate_kwargs(
"inputs",
additional_message="Please use the 'prompt' parameter instead.",
)
def encode(
self,
prompt: Optional[PromptType] = None,
pooling_params: Optional[PoolingParams] = None,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
*,
inputs: Optional[PromptType] = None # DEPRECATED
) -> AsyncGenerator[PoolingRequestOutput, None]:
"""Generate outputs for a request from a pooling model.
@@ -575,11 +505,6 @@ class MQLLMEngineClient(EngineClient):
The output `PoolingRequestOutput` objects from the LLMEngine
for the request.
"""
if inputs is not None:
prompt = inputs
assert (prompt is not None and pooling_params is not None
and request_id is not None)
return cast(
AsyncGenerator[PoolingRequestOutput, None],
self._process_request(prompt,
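
For `MQLLMEngineClient`, both `generate` and `encode` keep only the `prompt`-based signatures. A usage sketch against an already-connected client (construction and connection are out of scope here), assuming the upstream module path `vllm.engine.multiprocessing.client`, which is not shown in this diff:

```python
from typing import Optional

from vllm import SamplingParams
from vllm.engine.multiprocessing.client import MQLLMEngineClient


async def complete(client: MQLLMEngineClient, request_id: str) -> Optional[str]:
    """Stream one completion; 'inputs=' is no longer accepted by generate()."""
    final_text: Optional[str] = None
    # The prompt now goes in 'prompt' (positionally or by keyword).
    async for output in client.generate(
            prompt="The capital of France is",
            sampling_params=SamplingParams(max_tokens=8),
            request_id=request_id):
        if output.finished:
            final_text = output.outputs[0].text
    return final_text
```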