diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index 6924aba11576..90340f8cff03 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -478,7 +478,7 @@ def test_sampler_mixed(seed: int, device: str):
         sampling_params = SamplingParams(
             temperature=random.random() + 0.1,
             top_p=min(random.random() + 0.1, 1),
-            top_k=random.randint(0, 10) or -1,
+            top_k=random.randint(0, 10),
             n=n,
             presence_penalty=random.randint(0, 1),
         )
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 40e477f03194..1aa400741340 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -409,7 +409,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
         "repetition_penalty": 1.0,
         "temperature": 1.0,
         "top_p": 1.0,
-        "top_k": -1,
+        "top_k": 0,
         "min_p": 0.0,
     }
@@ -853,7 +853,7 @@ class CompletionRequest(OpenAIBaseModel):
         "repetition_penalty": 1.0,
         "temperature": 1.0,
         "top_p": 1.0,
-        "top_k": -1,
+        "top_k": 0,
         "min_p": 0.0,
     }
@@ -1679,7 +1679,7 @@ class TranscriptionRequest(OpenAIBaseModel):
         "repetition_penalty": 1.0,
         "temperature": 1.0,
         "top_p": 1.0,
-        "top_k": -1,
+        "top_k": 0,
         "min_p": 0.0,
     }
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index d76c75d9e6ce..888ca3e5009e 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -416,7 +416,7 @@ class SamplingTensors:
             # k should not be greater than the vocab size.
             top_k = min(sampling_params.top_k, vocab_size)
-            top_k = vocab_size if top_k == -1 else top_k
+            top_k = vocab_size if top_k < 1 else top_k
             if temperature < _SAMPLING_EPS:
                 # NOTE: Zero temperature means deterministic sampling
                 # (i.e., greedy sampling or beam search).
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index affc5c64b941..dc38daa388ce 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -149,7 +149,7 @@ class SamplingParams(
         top_p: Float that controls the cumulative probability of the top tokens
             to consider. Must be in (0, 1]. Set to 1 to consider all tokens.
         top_k: Integer that controls the number of top tokens to consider. Set
-            to -1 to consider all tokens.
+            to 0 (or -1) to consider all tokens.
         min_p: Float that represents the minimum probability for a token to be
             considered, relative to the probability of the most likely token.
             Must be in [0, 1]. Set to 0 to disable this.
@@ -209,7 +209,7 @@ class SamplingParams(
     repetition_penalty: float = 1.0
     temperature: float = 1.0
     top_p: float = 1.0
-    top_k: int = -1
+    top_k: int = 0
     min_p: float = 0.0
     seed: Optional[int] = None
     stop: Optional[Union[str, list[str]]] = None
@@ -256,7 +256,7 @@ class SamplingParams(
         repetition_penalty: Optional[float] = 1.0,
         temperature: Optional[float] = 1.0,
         top_p: Optional[float] = 1.0,
-        top_k: int = -1,
+        top_k: int = 0,
         min_p: float = 0.0,
         seed: Optional[int] = None,
         stop: Optional[Union[str, list[str]]] = None,
@@ -376,7 +376,7 @@ class SamplingParams(
         if self.temperature < _SAMPLING_EPS:
             # Zero temperature means greedy sampling.
             self.top_p = 1.0
-            self.top_k = -1
+            self.top_k = 0
             self.min_p = 0.0
             self._verify_greedy_sampling()
@@ -404,8 +404,9 @@ class SamplingParams(
                 f"temperature must be non-negative, got {self.temperature}.")
         if not 0.0 < self.top_p <= 1.0:
             raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
-        if self.top_k < -1 or self.top_k == 0:
-            raise ValueError(f"top_k must be -1 (disable), or at least 1, "
+        # quietly accept -1 as disabled, but prefer 0
+        if self.top_k < -1:
+            raise ValueError(f"top_k must be 0 (disable), or at least 1, "
                              f"got {self.top_k}.")
         if not isinstance(self.top_k, int):
             raise TypeError(
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index c80b69e78dc0..e97adf757cc1 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -348,7 +348,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         if temperature == 0.0:
             # Enable greedy sampling on zero temperature
             return (1, 1.0, 1.0)
-        if top_k < 0 or top_k > self._MAX_NEURON_SAMPLING_TOP_K:
+        if top_k < 1 or top_k > self._MAX_NEURON_SAMPLING_TOP_K:
             top_k = self._MAX_NEURON_SAMPLING_TOP_K
         return (top_k, top_p, temperature)
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 53541a2579ed..e0cca9072745 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -525,7 +525,7 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
                 "Top-p sampling is currently disabled for the TPU backend "
                 "due to performance issues.")
         p.append(sampling_params.top_p)
-        if sampling_params.top_k != -1:
+        if sampling_params.top_k > 0:
             raise NotImplementedError(
                 "Top-k sampling is currently disabled for the TPU backend "
                 "due to performance issues.")
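
Taken together, these hunks settle on one convention: any `top_k < 1` means "no top-k filtering", with `0` as the documented disable value and `-1` still quietly accepted for backward compatibility. Below is a minimal standalone sketch of that convention for reviewers; the helper name `normalize_top_k` is hypothetical (vLLM inlines this logic in `SamplingTensors` rather than exposing such a function), and the vocab size is an arbitrary example value.

```python
# Standalone sketch of the top_k convention after this change.
# `normalize_top_k` is a hypothetical name, not part of vLLM's API.
def normalize_top_k(top_k: int, vocab_size: int) -> int:
    """Map a user-facing top_k to an effective k in [1, vocab_size]."""
    if top_k < -1:
        # Mirrors SamplingParams._verify_args: -1 is still tolerated.
        raise ValueError(f"top_k must be 0 (disable), or at least 1, "
                         f"got {top_k}.")
    # k should not be greater than the vocab size
    # (mirrors the SamplingTensors hunk).
    top_k = min(top_k, vocab_size)
    # 0 and -1 both fall through to "consider all tokens".
    return vocab_size if top_k < 1 else top_k


assert normalize_top_k(0, 32_000) == 32_000      # new default: disabled
assert normalize_top_k(-1, 32_000) == 32_000     # legacy value, quietly accepted
assert normalize_top_k(10, 32_000) == 10         # ordinary top-k
assert normalize_top_k(50_000, 32_000) == 32_000  # clamped to vocab size
```

The backend hunks apply the same rule from both directions: the Neuron runner clamps whenever `top_k < 1` (or `top_k` exceeds its maximum), and the TPU runner now treats only `top_k > 0` as an actual, currently unsupported, request for top-k sampling.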