Change top_k to be disabled with 0 (still accept -1 for now) (#17773)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

parent 5b2dcbf0b8
commit c6798baa9c
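In short: top_k=0 is now the canonical way to disable top-k sampling (consider all tokens), while the old sentinel -1 is still quietly accepted for backwards compatibility. A minimal illustration of the new contract (standard vLLM import path; this snippet is not part of the diff):

from vllm import SamplingParams

SamplingParams(top_k=0)    # preferred: top-k filtering disabled
SamplingParams(top_k=-1)   # legacy sentinel, still accepted for now
SamplingParams(top_k=10)   # active: keep only the 10 most likely tokens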
@@ -478,7 +478,7 @@ def test_sampler_mixed(seed: int, device: str):
     sampling_params = SamplingParams(
         temperature=random.random() + 0.1,
         top_p=min(random.random() + 0.1, 1),
-        top_k=random.randint(0, 10) or -1,
+        top_k=random.randint(0, 10),
         n=n,
         presence_penalty=random.randint(0, 1),
     )
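Note on the test change: random.randint(0, 10) can draw 0, which the old validation rejected, so the test coerced it to -1 via Python's falsiness rules. Now that 0 is a legal "disabled" value, the coercion is unnecessary. A quick standalone check of that old trick:

assert (0 or -1) == -1   # 0 was silently remapped to the old sentinel
assert (7 or -1) == 7    # non-zero draws passed through unchanged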
@@ -409,7 +409,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
         "repetition_penalty": 1.0,
         "temperature": 1.0,
         "top_p": 1.0,
-        "top_k": -1,
+        "top_k": 0,
         "min_p": 0.0,
     }
 
@@ -853,7 +853,7 @@ class CompletionRequest(OpenAIBaseModel):
         "repetition_penalty": 1.0,
         "temperature": 1.0,
         "top_p": 1.0,
-        "top_k": -1,
+        "top_k": 0,
         "min_p": 0.0,
     }
 
@@ -1679,7 +1679,7 @@ class TranscriptionRequest(OpenAIBaseModel):
         "repetition_penalty": 1.0,
         "temperature": 1.0,
         "top_p": 1.0,
-        "top_k": -1,
+        "top_k": 0,
         "min_p": 0.0,
     }
 
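ChatCompletionRequest, CompletionRequest, and TranscriptionRequest each carry the same default sampling dict, so all three now default top_k to 0. A sketch of how a request that omits top_k resolves (the merge logic here is illustrative, not the server's actual code):

DEFAULT_SAMPLING_PARAMS = {
    "repetition_penalty": 1.0,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 0,     # was -1 before this commit
    "min_p": 0.0,
}

user_request = {"temperature": 0.2}               # top_k left unset
resolved = {**DEFAULT_SAMPLING_PARAMS, **user_request}
assert resolved["top_k"] == 0                     # top-k disabled by default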
@@ -416,7 +416,7 @@ class SamplingTensors:
 
         # k should not be greater than the vocab size.
         top_k = min(sampling_params.top_k, vocab_size)
-        top_k = vocab_size if top_k == -1 else top_k
+        top_k = vocab_size if top_k < 1 else top_k
         if temperature < _SAMPLING_EPS:
             # NOTE: Zero temperature means deterministic sampling
             # (i.e., greedy sampling or beam search).
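Comparing top_k < 1 instead of top_k == -1 is what lets both sentinels coexist: after the min() clamp a disabled top_k is either 0 (new) or -1 (legacy), and both are below 1. A standalone check of the normalization above (the vocab size is arbitrary):

vocab_size = 32000
for disabled in (0, -1):
    top_k = min(disabled, vocab_size)
    top_k = vocab_size if top_k < 1 else top_k
    assert top_k == vocab_size    # both sentinels widen to the full vocab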
@@ -149,7 +149,7 @@ class SamplingParams(
     top_p: Float that controls the cumulative probability of the top tokens
         to consider. Must be in (0, 1]. Set to 1 to consider all tokens.
     top_k: Integer that controls the number of top tokens to consider. Set
-        to -1 to consider all tokens.
+        to 0 (or -1) to consider all tokens.
     min_p: Float that represents the minimum probability for a token to be
         considered, relative to the probability of the most likely token.
         Must be in [0, 1]. Set to 0 to disable this.
@@ -209,7 +209,7 @@ class SamplingParams(
     repetition_penalty: float = 1.0
     temperature: float = 1.0
     top_p: float = 1.0
-    top_k: int = -1
+    top_k: int = 0
     min_p: float = 0.0
     seed: Optional[int] = None
     stop: Optional[Union[str, list[str]]] = None
@@ -256,7 +256,7 @@ class SamplingParams(
         repetition_penalty: Optional[float] = 1.0,
         temperature: Optional[float] = 1.0,
         top_p: Optional[float] = 1.0,
-        top_k: int = -1,
+        top_k: int = 0,
         min_p: float = 0.0,
         seed: Optional[int] = None,
         stop: Optional[Union[str, list[str]]] = None,
@@ -376,7 +376,7 @@ class SamplingParams(
         if self.temperature < _SAMPLING_EPS:
             # Zero temperature means greedy sampling.
             self.top_p = 1.0
-            self.top_k = -1
+            self.top_k = 0
             self.min_p = 0.0
             self._verify_greedy_sampling()
 
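Greedy normalization now resets top_k to the new disabled value 0 rather than -1. A self-contained sketch of the rule above (the epsilon value is assumed, not taken from the diff):

_SAMPLING_EPS = 1e-5   # assumed; the real constant lives elsewhere in vllm

def normalize_for_greedy(temperature, top_p, top_k, min_p):
    # Mirrors the hunk above: near-zero temperature disables all filtering.
    if temperature < _SAMPLING_EPS:
        return 1.0, 0, 0.0
    return top_p, top_k, min_p

assert normalize_for_greedy(0.0, 0.9, 40, 0.1) == (1.0, 0, 0.0)
assert normalize_for_greedy(0.7, 0.9, 40, 0.1) == (0.9, 40, 0.1)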
@@ -404,8 +404,9 @@ class SamplingParams(
                 f"temperature must be non-negative, got {self.temperature}.")
         if not 0.0 < self.top_p <= 1.0:
             raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
-        if self.top_k < -1 or self.top_k == 0:
-            raise ValueError(f"top_k must be -1 (disable), or at least 1, "
+        # quietly accept -1 as disabled, but prefer 0
+        if self.top_k < -1:
+            raise ValueError(f"top_k must be 0 (disable), or at least 1, "
                              f"got {self.top_k}.")
         if not isinstance(self.top_k, int):
             raise TypeError(
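The new validation rejects only values below -1, so 0 and -1 are both quietly accepted as "disabled" while the error message steers users toward 0. A standalone restatement of the check:

def verify_top_k(top_k: int) -> None:
    # Mirrors the validation above: -1 is tolerated, 0 is the documented way.
    if top_k < -1:
        raise ValueError(f"top_k must be 0 (disable), or at least 1, "
                         f"got {top_k}.")
    if not isinstance(top_k, int):
        raise TypeError(f"top_k must be an integer, got {type(top_k)}")

verify_top_k(0)      # ok: disabled (preferred)
verify_top_k(-1)     # ok: disabled (legacy)
verify_top_k(5)      # ok: active top-k
# verify_top_k(-2)   would raise ValueError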
@@ -348,7 +348,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         if temperature == 0.0:
             # Enable greedy sampling on zero temperature
             return (1, 1.0, 1.0)
-        if top_k < 0 or top_k > self._MAX_NEURON_SAMPLING_TOP_K:
+        if top_k < 1 or top_k > self._MAX_NEURON_SAMPLING_TOP_K:
             top_k = self._MAX_NEURON_SAMPLING_TOP_K
 
         return (top_k, top_p, temperature)
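On Neuron, a disabled top_k (now anything below 1, catching both 0 and -1) is clamped to the backend's maximum supported k rather than passed through. A sketch with an illustrative constant (the real value is defined on the class and not shown in this diff):

_MAX_NEURON_SAMPLING_TOP_K = 256   # illustrative value only

def clamp_top_k(top_k: int) -> int:
    # Mirrors the hunk above: disabled or oversized k falls back to the max.
    if top_k < 1 or top_k > _MAX_NEURON_SAMPLING_TOP_K:
        return _MAX_NEURON_SAMPLING_TOP_K
    return top_k

assert clamp_top_k(0) == 256    # new disabled value -> max k
assert clamp_top_k(-1) == 256   # legacy disabled value -> max k
assert clamp_top_k(50) == 50    # in-range k passes through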
@@ -525,7 +525,7 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
                     "Top-p sampling is currently disabled for the TPU backend "
                     "due to performance issues.")
             p.append(sampling_params.top_p)
-            if sampling_params.top_k != -1:
+            if sampling_params.top_k > 0:
                 raise NotImplementedError(
                     "Top-k sampling is currently disabled for the TPU backend "
                     "due to performance issues.")
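The TPU guard flips from != -1 to > 0: only an actively requested top_k (>= 1) should raise, and with 0 as the new default, the old != -1 check would have wrongly rejected every request that left top_k unset. A standalone restatement:

def check_tpu_top_k(top_k: int) -> None:
    # Mirrors the hunk above: 0 (new) and -1 (legacy) both mean disabled.
    if top_k > 0:
        raise NotImplementedError(
            "Top-k sampling is currently disabled for the TPU backend "
            "due to performance issues.")

check_tpu_top_k(0)      # ok: default, no top-k requested
check_tpu_top_k(-1)     # ok: legacy disabled sentinel
# check_tpu_top_k(10)   would raise NotImplementedError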