[Deprecation] Remove inputs arg fallback in Engine classes (#18799)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Authored by Cyrus Leung on 2025-06-09 10:19:56 +08:00; committed by GitHub.
parent 2ffb9b6e07
commit e31ae3de36
4 changed files with 21 additions and 268 deletions
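
In short, every engine entry point that still accepted the deprecated `inputs=` keyword now only takes `prompt`. A minimal migration sketch against the synchronous `LLMEngine` (the model id, request id, and sampling values below are illustrative, not part of this commit):

from vllm import EngineArgs, LLMEngine, SamplingParams

# Any small model works here; the id is only an example.
engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
params = SamplingParams(max_tokens=16)

# Before this commit the old spelling still worked, with a deprecation warning:
#   engine.add_request(request_id="req-0", inputs="Hello, my name is", params=params)
# After this commit only 'prompt' is accepted (positionally or by keyword):
engine.add_request("req-0", "Hello, my name is", params)

while engine.has_unfinished_requests():
    for output in engine.step():
        if output.finished:
            print(output.outputs[0].text)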

Changed file 1 of 4 (classes _AsyncLLMEngine and AsyncLLMEngine):

@@ -6,12 +6,10 @@ import copy
 import time
 import weakref
 from functools import partial
-from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable,
-                    List, Mapping, Optional, Set, Tuple, Type, Union, overload)
+from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
+                    Mapping, Optional, Set, Tuple, Type, Union)
 from weakref import ReferenceType
 
-from typing_extensions import deprecated
-
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, VllmConfig)
@@ -36,7 +34,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, deprecate_kwargs, weak_bind
+from vllm.utils import Device, weak_bind
 
 logger = init_logger(__name__)
 
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -429,24 +427,6 @@ class _AsyncLLMEngine(LLMEngine):
         return await (
             self.get_tokenizer_group().get_lora_tokenizer_async(lora_request))
 
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    async def add_request_async(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> None:
-        ...
-
-    @overload
     async def add_request_async(
         self,
         request_id: str,
@@ -459,32 +439,10 @@ class _AsyncLLMEngine(LLMEngine):
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
     ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request_async(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        """Async version of
-        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
+        """
+        Async version of
+        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].
+        """
 
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
@@ -521,8 +479,7 @@ class _AsyncLLMEngine(LLMEngine):
             params = await build_guided_decoding_logits_processor_async(
                 sampling_params=params,
                 tokenizer=await self.get_tokenizer_async(lora_request),
-                default_guided_backend=self.decoding_config.
-                guided_decoding_backend,
+                default_guided_backend=self.decoding_config.backend,
                 reasoning_backend=self.decoding_config.reasoning_backend,
                 model_config=self.model_config)
@@ -894,28 +851,7 @@ class AsyncLLMEngine(EngineClient):
                     raise
                 await asyncio.sleep(0)
 
-    # This method does not need to be async, but kept that way
-    # for backwards compatibility.
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @overload
-    def add_request(
+    async def add_request(
         self,
         request_id: str,
         prompt: PromptType,
@@ -926,32 +862,7 @@ class AsyncLLMEngine(EngineClient):
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
         if not self.is_running:
             if self.start_engine_loop:
                 self.start_background_loop()

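With the overloads gone, `AsyncLLMEngine.add_request` above is a single `async def` whose first positional parameters are `request_id`, `prompt`, and `params`, and it returns the output stream directly. A hedged usage sketch against that signature (the model id and request id are again illustrative; `generate()` remains the more common entry point):

import asyncio

from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams


async def main() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))  # example model id
    params = SamplingParams(max_tokens=16)

    # 'inputs=' now raises TypeError; the prompt is passed positionally.
    stream = await engine.add_request("req-0", "The capital of France is", params)
    async for request_output in stream:
        if request_output.finished:
            print(request_output.outputs[0].text)


asyncio.run(main())
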
Changed file 2 of 4 (class LLMEngine):

@@ -11,10 +11,10 @@ from functools import partial
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
                     Iterable, List, Literal, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
-from typing import Set, Type, Union, cast, overload
+from typing import Set, Type, Union, cast
 
 import torch
-from typing_extensions import TypeVar, deprecated
+from typing_extensions import TypeVar
 
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
@@ -58,8 +58,7 @@ from vllm.transformers_utils.tokenizer_group import (
     TokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                   usage_message)
-from vllm.utils import (Counter, Device, deprecate_kwargs,
-                        resolve_obj_by_qualname, weak_bind)
+from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
 from vllm.version import __version__ as VLLM_VERSION
 from vllm.worker.model_runner_base import InputProcessingError
@@ -629,7 +628,6 @@ class LLMEngine:
     def stop_remote_worker_execution_loop(self) -> None:
         self.model_executor.stop_remote_worker_execution_loop()
 
-    @overload
     def add_request(
         self,
         request_id: str,
@@ -641,42 +639,6 @@ class LLMEngine:
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> None:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def add_request(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> None:
         """Add a request to the engine's request pool.
@@ -725,10 +687,6 @@ class LLMEngine:
             >>> # continue the request processing
            >>> ...
         """
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")

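For context on what was removed from `LLMEngine.add_request` and its async counterpart: the `@deprecate_kwargs` decorator was a keyword-alias shim that forwarded `inputs=` to `prompt=` with a warning. A simplified stand-in (this is not vLLM's actual implementation) behaves roughly like this:

import functools
import warnings
from typing import Any, Callable


def deprecate_kwarg(old: str, new: str) -> Callable[[Callable], Callable]:
    """Forward the deprecated keyword 'old' to 'new' with a warning."""

    def wrap(fn: Callable) -> Callable:

        @functools.wraps(fn)
        def inner(*args: Any, **kwargs: Any) -> Any:
            if old in kwargs:
                warnings.warn(
                    f"The '{old}' keyword is deprecated; use '{new}' instead.",
                    DeprecationWarning,
                    stacklevel=2)
                # Prefer an explicitly passed new-style keyword if both appear.
                kwargs.setdefault(new, kwargs.pop(old))
            return fn(*args, **kwargs)

        return inner

    return wrap


@deprecate_kwarg("inputs", "prompt")
def add_request(request_id: str, prompt: str) -> str:
    return f"{request_id}: {prompt}"


print(add_request("req-0", inputs="Hello"))  # warns, then forwards to 'prompt'

Removing the shim means the old spelling now fails immediately with a TypeError instead of warning first.
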
Changed file 3 of 4 (class RPCProcessRequest):

@@ -4,9 +4,7 @@
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import List, Mapping, Optional, Union, overload
-
-from typing_extensions import deprecated
+from typing import List, Mapping, Optional, Union
 
 from vllm import PoolingParams
 from vllm.inputs import PromptType
@@ -14,7 +12,7 @@ from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
-from vllm.utils import Device, deprecate_kwargs
+from vllm.utils import Device
 
 VLLM_RPC_SUCCESS_STR = "SUCCESS"
@@ -38,7 +36,6 @@ class RPCProcessRequest:
     prompt_adapter_request: Optional[PromptAdapterRequest] = None
     priority: int = 0
 
-    @overload
     def __init__(
         self,
         prompt: PromptType,
@@ -49,44 +46,6 @@ class RPCProcessRequest:
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
     ) -> None:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def __init__(
-        self,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def __init__(
-        self,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and params is not None
-                and request_id is not None)
-
         super().__init__()
 
         self.prompt = prompt

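`RPCProcessRequest` above is left with a single constructor in which `prompt`, `params`, and `request_id` are required, ordinary parameters. A small construction sketch (assuming the dataclass is importable from `vllm.engine.multiprocessing`, and with illustrative values):

from vllm import SamplingParams
from vllm.engine.multiprocessing import RPCProcessRequest

# The keyword-only 'inputs' spelling of the first argument no longer exists.
request = RPCProcessRequest("Hello, world", SamplingParams(max_tokens=8), "req-0")
print(request.request_id, request.prompt)
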
Changed file 4 of 4 (class MQLLMEngineClient):

@@ -6,13 +6,12 @@ import copy
 import pickle
 from contextlib import contextmanager, suppress
 from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
-                    Optional, Union, cast, overload)
+                    Optional, Union, cast)
 
 import cloudpickle
 import psutil
 import zmq
 import zmq.asyncio
-from typing_extensions import deprecated
 from zmq import Frame  # type: ignore[attr-defined]
 from zmq.asyncio import Socket
@@ -49,7 +48,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
-from vllm.utils import Device, deprecate_kwargs
+from vllm.utils import Device
 
 logger = init_logger(__name__)
@@ -442,7 +441,6 @@ class MQLLMEngineClient(EngineClient):
     def dead_error(self) -> BaseException:
         return ENGINE_DEAD_ERROR(self._errored_with)
 
-    @overload
     def generate(
         self,
         prompt: PromptType,
@@ -452,39 +450,6 @@ class MQLLMEngineClient(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def generate(
-        self,
-        *,
-        inputs: PromptType,
-        sampling_params: SamplingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def generate(
-        self,
-        prompt: Optional[PromptType] = None,
-        sampling_params: Optional[SamplingParams] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request.
@@ -506,16 +471,12 @@ class MQLLMEngineClient(EngineClient):
                 Any priority other than 0 will lead to an error if the
                 scheduling policy is not "priority".
         """
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and sampling_params is not None
-                and request_id is not None)
-
-        return self._process_request(prompt, sampling_params, request_id,
-                                     lora_request, trace_headers,
-                                     prompt_adapter_request, priority)
+        return cast(
+            AsyncGenerator[RequestOutput, None],
+            self._process_request(prompt, sampling_params, request_id,
+                                  lora_request, trace_headers,
+                                  prompt_adapter_request, priority))
 
-    @overload
     def encode(
         self,
         prompt: PromptType,
@@ -524,37 +485,6 @@ class MQLLMEngineClient(EngineClient):
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def encode(
-        self,
-        *,
-        inputs: PromptType,
-        pooling_params: PoolingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def encode(
-        self,
-        prompt: Optional[PromptType] = None,
-        pooling_params: Optional[PoolingParams] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
         """Generate outputs for a request from a pooling model.
@@ -575,11 +505,6 @@ class MQLLMEngineClient(EngineClient):
             The output `PoolingRequestOutput` objects from the LLMEngine
             for the request.
         """
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and pooling_params is not None
-                and request_id is not None)
-
         return cast(
             AsyncGenerator[PoolingRequestOutput, None],
             self._process_request(prompt,
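
With the overloads removed, the `generate()` and `encode()` bodies above simply wrap `_process_request` in `typing.cast`, presumably because the shared helper is annotated with the union of both output types. The pattern in isolation, with made-up types (a sketch, not vLLM code):

import asyncio
from typing import AsyncGenerator, Union, cast


async def _outputs() -> AsyncGenerator[Union[int, str], None]:
    # Stand-in for a helper whose static return type is a broad union.
    yield 1
    yield 2


def int_stream() -> AsyncGenerator[int, None]:
    # cast() only narrows the annotation for callers; it does nothing at runtime.
    return cast(AsyncGenerator[int, None], _outputs())


async def main() -> None:
    async for value in int_stream():
        print(value)


asyncio.run(main())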