[V0 Deprecation] Remove best_of (#29090)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 698024ecce
commit 56e96b37e4
@@ -169,8 +169,8 @@ As part of the major architectural rework in vLLM V1, several legacy features have been deprecated
-- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
 - **Per-Request Logits Processors**: In V0, users could pass custom
   processing functions to adjust logits on a per-request basis. In vLLM V1, this
-  feature has been deprecated. Instead, the design is moving toward supporting **global logits
-  processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](https://github.com/vllm-project/vllm/pull/13360).
+  feature has been deprecated. Instead, we now support **global logits processors**
+  which are set at startup time, see [RFC #17799](https://github.com/vllm-project/vllm/issues/17799).
 
 ##### KV Cache features
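For users migrating off best_of, a rough client-side equivalent is easy to write against the V1 offline API: over-sample with n and rank by cumulative log probability. A minimal sketch, assuming the vllm package's LLM/SamplingParams interface and that CompletionOutput.cumulative_logprob is populated when logprobs are requested; the model name is an arbitrary placeholder.

# Sketch: approximate V0's best_of on the client side by over-sampling.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model

def generate_top_n(prompt: str, n: int, best_of: int) -> list[str]:
    """Sample `best_of` candidates, return the `n` most likely ones."""
    # logprobs=0 so cumulative_logprob is populated (assumption).
    params = SamplingParams(n=best_of, temperature=0.8, logprobs=0)
    candidates = llm.generate([prompt], params)[0].outputs
    ranked = sorted(candidates, key=lambda c: c.cumulative_logprob, reverse=True)
    return [c.text for c in ranked[:n]]

print(generate_top_n("The capital of France is", n=2, best_of=4))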
@@ -22,14 +22,6 @@ def test_n_gt_1(llm):
     assert len(outputs[0].outputs) == 3
 
 
-def test_best_of(llm):
-    """Raise a ValueError since best_of is deprecated."""
-
-    params = SamplingParams(n=2, best_of=3)
-    with pytest.raises(ValueError):
-        _ = llm.generate(PROMPT, params)
-
-
 def test_penalties(llm):
     """Check that we do not get errors if applied."""
 
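The deleted test asserted a deprecation ValueError at generate time; with the field removed outright, even constructing the params should now fail earlier. A hedged sketch of a replacement check, assuming SamplingParams rejects unknown keyword arguments with a TypeError:

import pytest
from vllm import SamplingParams

def test_best_of_removed():
    # best_of is no longer a field at all, so construction itself should
    # fail (assumed: unknown kwargs raise TypeError, not the old ValueError).
    with pytest.raises(TypeError):
        SamplingParams(n=2, best_of=3)  # type: ignore[call-arg]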
@@ -565,7 +565,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
     user: str | None = None
 
     # --8<-- [start:chat-completion-sampling-params]
-    best_of: int | None = None
     use_beam_search: bool = False
     top_k: int | None = None
     min_p: float | None = None
@@ -889,7 +888,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
             extra_args["kv_transfer_params"] = self.kv_transfer_params
         return SamplingParams.from_optional(
             n=self.n,
-            best_of=self.best_of,
             presence_penalty=self.presence_penalty,
             frequency_penalty=self.frequency_penalty,
             repetition_penalty=repetition_penalty,
@@ -1088,7 +1086,6 @@ class CompletionRequest(OpenAIBaseModel):
     # https://platform.openai.com/docs/api-reference/completions/create
     model: str | None = None
     prompt: list[int] | list[list[int]] | str | list[str] | None = None
-    best_of: int | None = None
     echo: bool | None = False
     frequency_penalty: float | None = 0.0
     logit_bias: dict[str, float] | None = None
@@ -1375,7 +1372,6 @@ class CompletionRequest(OpenAIBaseModel):
             extra_args["kv_transfer_params"] = self.kv_transfer_params
         return SamplingParams.from_optional(
             n=self.n,
-            best_of=self.best_of,
             presence_penalty=self.presence_penalty,
             frequency_penalty=self.frequency_penalty,
             repetition_penalty=repetition_penalty,
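With the field dropped from both request models, a client that still sends best_of is sending an unrecognized key. A sketch of what that looks like on the wire, assuming a local vLLM server and that OpenAIBaseModel tolerates rather than rejects unknown fields; endpoint, port, and model name are placeholders:

import requests

# best_of is now just an unknown key in the body; it never reaches
# SamplingParams. (Assumption: extra fields are ignored, not rejected.)
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "facebook/opt-125m",  # placeholder
        "prompt": "Hello",
        "n": 2,
        "best_of": 4,  # no longer a recognized field after this commit
        "max_tokens": 16,
    },
)
print(resp.status_code)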
@@ -250,14 +250,8 @@ class OpenAIServingCompletion(OpenAIServing):
         model_name = self.models.model_name(lora_request)
         num_prompts = len(engine_prompts)
 
-        # Similar to the OpenAI API, when n != best_of, we do not stream the
-        # results. Noting that best_of is only supported in V0. In addition,
-        # we do not stream the results when use beam search.
-        stream = (
-            request.stream
-            and (request.best_of is None or request.n == request.best_of)
-            and not request.use_beam_search
-        )
+        # We do not stream the results when using beam search.
+        stream = request.stream and not request.use_beam_search
 
         # Streaming response
         if stream:
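The streaming gate reduces to a single condition: stream unless beam search is on. A usage sketch against the OpenAI-compatible endpoint, assuming a local server; base URL, API key, and model name are placeholders:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# n > 1 no longer interacts with a best_of check; this streams as long as
# beam search is not requested.
stream = client.completions.create(
    model="facebook/opt-125m",  # placeholder
    prompt="Once upon a time",
    n=2,
    max_tokens=16,
    stream=True,
)
for chunk in stream:
    for choice in chunk.choices:
        print(f"[choice {choice.index}] {choice.text}")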
@@ -144,12 +144,6 @@ class SamplingParams(
     are generated and streamed cumulatively per request. To see all `n`
     outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY`
     in `SamplingParams`."""
-    best_of: int | None = None
-    """Number of output sequences that are generated from the prompt. From
-    these `best_of` sequences, the top `n` sequences are returned. `best_of`
-    must be greater than or equal to `n`. By default, `best_of` is set to `n`.
-    Warning, this is only supported in V0."""
-    _real_n: int | None = None
     presence_penalty: float = 0.0
     """Penalizes new tokens based on whether they appear in the generated text
     so far. Values > 0 encourage the model to use new tokens, while values < 0
@@ -265,7 +259,6 @@ class SamplingParams(
     @staticmethod
     def from_optional(
         n: int | None = 1,
-        best_of: int | None = None,
         presence_penalty: float | None = 0.0,
         frequency_penalty: float | None = 0.0,
         repetition_penalty: float | None = 1.0,
@@ -315,7 +308,6 @@ class SamplingParams(
 
         return SamplingParams(
             n=1 if n is None else n,
-            best_of=best_of,
             presence_penalty=0.0 if presence_penalty is None else presence_penalty,
             frequency_penalty=0.0 if frequency_penalty is None else frequency_penalty,
             repetition_penalty=1.0
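As the context lines show, from_optional() still normalizes None to each knob's default; only the best_of passthrough disappears. A quick check of that behavior, grounded in the defaults visible above:

from vllm import SamplingParams

params = SamplingParams.from_optional(n=None, presence_penalty=None)
assert params.n == 1                  # None normalized to 1
assert params.presence_penalty == 0.0  # None normalized to 0.0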
@@ -348,22 +340,6 @@ class SamplingParams(
         )
 
     def __post_init__(self) -> None:
-        # how we deal with `best_of`:
-        # if `best_of` is not set, we default to `n`;
-        # if `best_of` is set, we set `n` to `best_of`,
-        # and set `_real_n` to the original `n`.
-        # when we return the result, we will check
-        # if we need to return `n` or `_real_n` results
-        if self.best_of:
-            if self.best_of < self.n:
-                raise ValueError(
-                    f"best_of must be greater than or equal to n, "
-                    f"got n={self.n} and best_of={self.best_of}."
-                )
-            if not self._real_n:
-                self._real_n = self.n
-                self.n = self.best_of
-
         if 0 < self.temperature < _MAX_TEMP:
             logger.warning(
                 "temperature %s is less than %s, which may cause numerical "
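For readers who never used V0: the deleted block was the whole best_of mechanism. It silently widened n to best_of for generation and stashed the caller's n in _real_n for trimming on output. A standalone sketch of those semantics (names are illustrative, not vLLM API):

def resolve_n(n: int, best_of: int | None) -> tuple[int, int]:
    """Old V0 rule: return (internal_n, returned_n)."""
    if best_of:
        if best_of < n:
            raise ValueError(
                f"best_of must be greater than or equal to n, "
                f"got n={n} and best_of={best_of}."
            )
        return best_of, n  # generate best_of sequences, return the top n
    return n, n            # best_of unset: defaults to n

assert resolve_n(2, 3) == (3, 2)
assert resolve_n(4, None) == (4, 4)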
@@ -433,18 +409,6 @@ class SamplingParams(
             raise ValueError(f"n must be an int, but is of type {type(self.n)}")
         if self.n < 1:
             raise ValueError(f"n must be at least 1, got {self.n}.")
-        if self.best_of is not None:
-            if not isinstance(self.best_of, int):
-                raise ValueError(
-                    f"best_of must be an integer, got {type(self.best_of)}"
-                )
-            if self.best_of < 1:
-                raise ValueError(f"best_of must be at least 1, got {self.best_of}")
-            if self.best_of < self.n:
-                raise ValueError(
-                    f"best_of must be greater than or equal to n, "
-                    f"got n={self.n} and best_of={self.best_of}."
-                )
         if not -2.0 <= self.presence_penalty <= 2.0:
             raise ValueError(
                 f"presence_penalty must be in [-2, 2], got {self.presence_penalty}."
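The surrounding checks survive unchanged, so invalid n and out-of-range penalties still raise, per the context lines above:

import pytest
from vllm import SamplingParams

with pytest.raises(ValueError):
    SamplingParams(n=0)                   # "n must be at least 1"
with pytest.raises(ValueError):
    SamplingParams(presence_penalty=3.0)  # outside [-2, 2]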
@@ -519,10 +483,6 @@ class SamplingParams(
                 "stop strings are only supported when detokenize is True. "
                 "Set detokenize=True to use stop."
             )
-        if self.best_of != self._real_n and self.output_kind == (
-            RequestOutputKind.DELTA
-        ):
-            raise ValueError("best_of must equal n to use output_kind=DELTA")
 
     def _verify_greedy_sampling(self) -> None:
         if self.n > 1:
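Dropping this guard means DELTA output no longer carries a best_of == n precondition. A minimal sketch, assuming RequestOutputKind remains importable from vllm.sampling_params:

from vllm.sampling_params import RequestOutputKind, SamplingParams

# Previously this could trip the best_of/DELTA guard; now it is plain n > 1.
params = SamplingParams(n=2, output_kind=RequestOutputKind.DELTA)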
@@ -142,9 +142,6 @@ class Processor:
         self,
         params: SamplingParams,
     ) -> None:
-        # Best of not yet supported.
-        if params.best_of is not None and params.best_of > 1:
-            raise ValueError("vLLM V1 does not yet support best_of.")
         # Logits processors not supported.
         if params.logits_processors:
             raise ValueError(