[Deprecation] Remove inputs arg fallback in Engine classes (#18799)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
commit e31ae3de36 (parent 2ffb9b6e07)
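
The hunks below touch the async engine (_AsyncLLMEngine / AsyncLLMEngine), the synchronous LLMEngine, and the multiprocessing RPC layer (RPCProcessRequest, MQLLMEngineClient). For callers, the practical effect is that the long-deprecated inputs= keyword, together with the @overload stubs and @deprecate_kwargs shims that supported it, is gone: requests must be submitted through the prompt parameter. A minimal migration sketch against the synchronous engine (the model name and request id below are illustrative, not taken from this commit):

    from vllm import EngineArgs, LLMEngine, SamplingParams

    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
    params = SamplingParams(temperature=0.8, max_tokens=32)

    # Previously tolerated with a DeprecationWarning from deprecate_kwargs:
    #     engine.add_request("req-0", inputs="Hello, my name is", params=params)
    # After this change 'inputs' is simply an unexpected keyword argument;
    # pass the prompt via 'prompt' (or positionally) instead:
    engine.add_request("req-0", prompt="Hello, my name is", params=params)

    while engine.has_unfinished_requests():
        for output in engine.step():
            if output.finished:
                print(output.outputs[0].text)
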
@@ -6,12 +6,10 @@ import copy
 import time
 import weakref
 from functools import partial
-from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable,
-                    List, Mapping, Optional, Set, Tuple, Type, Union, overload)
+from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
+                    Mapping, Optional, Set, Tuple, Type, Union)
 from weakref import ReferenceType
 
-from typing_extensions import deprecated
-
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, VllmConfig)
@@ -36,7 +34,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, deprecate_kwargs, weak_bind
+from vllm.utils import Device, weak_bind
 
 logger = init_logger(__name__)
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -429,24 +427,6 @@ class _AsyncLLMEngine(LLMEngine):
         return await (
             self.get_tokenizer_group().get_lora_tokenizer_async(lora_request))
 
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    async def add_request_async(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> None:
-        ...
-
-    @overload
     async def add_request_async(
         self,
         request_id: str,
@@ -459,32 +439,10 @@ class _AsyncLLMEngine(LLMEngine):
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
     ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request_async(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        """Async version of
-        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
+        """
+        Async version of
+        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].
+        """
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
@@ -521,8 +479,7 @@ class _AsyncLLMEngine(LLMEngine):
             params = await build_guided_decoding_logits_processor_async(
                 sampling_params=params,
                 tokenizer=await self.get_tokenizer_async(lora_request),
-                default_guided_backend=self.decoding_config.
-                guided_decoding_backend,
+                default_guided_backend=self.decoding_config.backend,
                 reasoning_backend=self.decoding_config.reasoning_backend,
                 model_config=self.model_config)
 
@@ -894,28 +851,7 @@ class AsyncLLMEngine(EngineClient):
                     raise
             await asyncio.sleep(0)
 
-    # This method does not need to be async, but kept that way
-    # for backwards compatibility.
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @overload
-    def add_request(
+    async def add_request(
         self,
         request_id: str,
         prompt: PromptType,
@@ -926,32 +862,7 @@ class AsyncLLMEngine(EngineClient):
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
         if not self.is_running:
             if self.start_engine_loop:
                 self.start_background_loop()
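
In the AsyncLLMEngine hunks above, add_request also stops being a synchronous wrapper that was "kept that way for backwards compatibility" and becomes a plain async def returning the output stream. A usage sketch, assuming an already-constructed AsyncLLMEngine bound to the name engine (construction, e.g. via AsyncLLMEngine.from_engine_args, is omitted here):

    from vllm import SamplingParams

    async def collect(engine, request_id: str, text: str) -> str:
        # add_request is awaited directly now; it returns an async generator
        # of RequestOutput objects for this request id.
        stream = await engine.add_request(
            request_id,
            prompt=text,
            params=SamplingParams(max_tokens=16),
        )
        final = ""
        async for request_output in stream:
            final = request_output.outputs[0].text
        return final
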
@@ -11,10 +11,10 @@ from functools import partial
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
                     Iterable, List, Literal, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
-from typing import Set, Type, Union, cast, overload
+from typing import Set, Type, Union, cast
 
 import torch
-from typing_extensions import TypeVar, deprecated
+from typing_extensions import TypeVar
 
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
@@ -58,8 +58,7 @@ from vllm.transformers_utils.tokenizer_group import (
     TokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                   usage_message)
-from vllm.utils import (Counter, Device, deprecate_kwargs,
-                        resolve_obj_by_qualname, weak_bind)
+from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
 from vllm.version import __version__ as VLLM_VERSION
 from vllm.worker.model_runner_base import InputProcessingError
 
@@ -629,7 +628,6 @@ class LLMEngine:
     def stop_remote_worker_execution_loop(self) -> None:
         self.model_executor.stop_remote_worker_execution_loop()
 
-    @overload
     def add_request(
         self,
         request_id: str,
@@ -641,42 +639,6 @@ class LLMEngine:
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> None:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def add_request(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> None:
         """Add a request to the engine's request pool.
 
@@ -725,10 +687,6 @@ class LLMEngine:
             >>> # continue the request processing
             >>> ...
         """
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
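
The removed decorator stack (@overload signatures plus a @deprecate_kwargs wrapper) is the usual recipe for renaming a keyword argument during a deprecation window: the stubs advertise both spellings to type checkers while the wrapper maps the old name onto the new one and warns at runtime. A generic sketch of that shim, written here from scratch in the spirit of vllm.utils.deprecate_kwargs rather than copied from it:

    import warnings
    from functools import wraps
    from typing import Any, Callable, TypeVar

    F = TypeVar("F", bound=Callable[..., Any])

    def deprecate_kwarg(old: str, new: str, extra: str = "") -> Callable[[F], F]:
        """Warn when `old` is passed and forward its value to `new`."""

        def decorator(fn: F) -> F:

            @wraps(fn)
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                if old in kwargs:
                    warnings.warn(
                        f"The '{old}' argument is deprecated; use '{new}'. {extra}",
                        DeprecationWarning,
                        stacklevel=2,
                    )
                    # Keep an explicitly-passed new value; otherwise forward old.
                    kwargs.setdefault(new, kwargs.pop(old))
                    kwargs.pop(old, None)
                return fn(*args, **kwargs)

            return wrapper  # type: ignore[return-value]

        return decorator

Dropping such a shim is the whole point of this commit: once the deprecation window closes, the old keyword stops being silently rewritten and becomes a normal TypeError.
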
@@ -4,9 +4,7 @@
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import List, Mapping, Optional, Union, overload
+from typing import List, Mapping, Optional, Union
 
-from typing_extensions import deprecated
-
 from vllm import PoolingParams
 from vllm.inputs import PromptType
@@ -14,7 +12,7 @@ from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
-from vllm.utils import Device, deprecate_kwargs
+from vllm.utils import Device
 
 VLLM_RPC_SUCCESS_STR = "SUCCESS"
 
@@ -38,7 +36,6 @@ class RPCProcessRequest:
     prompt_adapter_request: Optional[PromptAdapterRequest] = None
     priority: int = 0
 
-    @overload
     def __init__(
         self,
         prompt: PromptType,
@@ -49,44 +46,6 @@ class RPCProcessRequest:
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
     ) -> None:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def __init__(
-        self,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def __init__(
-        self,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and params is not None
-                and request_id is not None)
-
         super().__init__()
 
         self.prompt = prompt
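
With its deprecated constructor overloads gone, RPCProcessRequest takes the prompt, the sampling/pooling parameters, and the request id as plain required arguments rather than Optional values backed by an inputs fallback and assert checks. It is normally built internally by MQLLMEngineClient before being pickled onto the ZeroMQ socket, but a hand-construction sketch (import path and values are illustrative) looks like:

    from vllm import SamplingParams
    from vllm.engine.multiprocessing import RPCProcessRequest

    request = RPCProcessRequest(
        prompt="Summarize the release notes.",
        params=SamplingParams(max_tokens=64),
        request_id="req-42",
    )
    # prompt, params and request_id can no longer be omitted and patched in
    # later via 'inputs'; the constructor shown above enforces them directly.
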
@@ -6,13 +6,12 @@ import copy
 import pickle
 from contextlib import contextmanager, suppress
 from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
-                    Optional, Union, cast, overload)
+                    Optional, Union, cast)
 
 import cloudpickle
 import psutil
 import zmq
 import zmq.asyncio
-from typing_extensions import deprecated
 from zmq import Frame  # type: ignore[attr-defined]
 from zmq.asyncio import Socket
 
@@ -49,7 +48,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
-from vllm.utils import Device, deprecate_kwargs
+from vllm.utils import Device
 
 logger = init_logger(__name__)
 
@@ -442,7 +441,6 @@ class MQLLMEngineClient(EngineClient):
     def dead_error(self) -> BaseException:
         return ENGINE_DEAD_ERROR(self._errored_with)
 
-    @overload
     def generate(
         self,
         prompt: PromptType,
@@ -452,39 +450,6 @@ class MQLLMEngineClient(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def generate(
-        self,
-        *,
-        inputs: PromptType,
-        sampling_params: SamplingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def generate(
-        self,
-        prompt: Optional[PromptType] = None,
-        sampling_params: Optional[SamplingParams] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request.
 
@@ -506,16 +471,12 @@ class MQLLMEngineClient(EngineClient):
                 Any priority other than 0 will lead to an error if the
                 scheduling policy is not "priority".
         """
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and sampling_params is not None
-                and request_id is not None)
-
-        return self._process_request(prompt, sampling_params, request_id,
-                                     lora_request, trace_headers,
-                                     prompt_adapter_request, priority)
+        return cast(
+            AsyncGenerator[RequestOutput, None],
+            self._process_request(prompt, sampling_params, request_id,
+                                  lora_request, trace_headers,
+                                  prompt_adapter_request, priority))
 
-    @overload
     def encode(
         self,
         prompt: PromptType,
@@ -524,37 +485,6 @@ class MQLLMEngineClient(EngineClient):
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def encode(
-        self,
-        *,
-        inputs: PromptType,
-        pooling_params: PoolingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def encode(
-        self,
-        prompt: Optional[PromptType] = None,
-        pooling_params: Optional[PoolingParams] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
         """Generate outputs for a request from a pooling model.
 
@@ -575,11 +505,6 @@ class MQLLMEngineClient(EngineClient):
             The output `PoolingRequestOutput` objects from the LLMEngine
            for the request.
         """
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and pooling_params is not None
-                and request_id is not None)
-
         return cast(
             AsyncGenerator[PoolingRequestOutput, None],
             self._process_request(prompt,
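
For well-formed calls the client behavior is unchanged: MQLLMEngineClient.generate and encode still hand back async generators; they just no longer accept inputs= or assert that the formerly-Optional arguments were provided, and generate now wraps its return value in the same cast(...) style that encode already used. A consumption sketch, assuming an already-connected client (startup of the engine process and ZeroMQ sockets is omitted):

    from vllm import SamplingParams

    async def stream_completion(client, text: str, request_id: str) -> str:
        final_text = ""
        # generate() returns AsyncGenerator[RequestOutput, None]; the prompt
        # is now passed via 'prompt', the only prompt-like parameter left.
        async for output in client.generate(
                prompt=text,
                sampling_params=SamplingParams(max_tokens=32),
                request_id=request_id,
        ):
            final_text = output.outputs[0].text
        return final_text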