From e66d787bce22c56f995f4e2974e31ac020bc57ea Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Tue, 14 Oct 2025 22:35:18 -0400
Subject: [PATCH] Disable FlashInfer sampler by default (#26859)

Signed-off-by: mgoin
---
 vllm/v1/sample/ops/topk_topp_sampler.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index ed8bc55a3cf2f..43a40bce6847d 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -46,23 +46,15 @@ class TopKTopPSampler(nn.Module):
                         "Falling back to default sampling implementation."
                     )
                     self.forward = self.forward_native
-                elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
-                    # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
-                    # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
-                    # default it is unused). For backward compatibility, we set
-                    # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and
-                    # interpret it differently in V0 and V1 samplers: In V0,
-                    # None means False, while in V1, None means True. This is
-                    # why we use the condition
-                    # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
+                elif envs.VLLM_USE_FLASHINFER_SAMPLER:
+                    # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
                     logger.info_once("Using FlashInfer for top-p & top-k sampling.")
                     self.forward = self.forward_cuda
                 else:
-                    logger.warning_once(
-                        "FlashInfer is available, but it is not enabled. "
-                        "Falling back to the PyTorch-native implementation of "
-                        "top-p & top-k sampling. For the best performance, "
-                        "please set VLLM_USE_FLASHINFER_SAMPLER=1."
+                    logger.debug_once(
+                        "FlashInfer top-p/top-k sampling is available but disabled "
+                        "by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in "
+                        "after verifying accuracy for your workloads."
                     )
                     self.forward = self.forward_native
             else:
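
Note (illustrative, not part of the commit): a minimal sketch of opting back in
to the FlashInfer sampler after this change. The VLLM_USE_FLASHINFER_SAMPLER
environment variable comes from the patch itself; the model name and prompt
below are placeholders, not taken from the patch.

    import os

    # Opt in to the FlashInfer top-p/top-k sampler. After this patch the
    # PyTorch-native implementation is the default; this must be set to "1"
    # before the engine is created, since the sampler reads it at init time.
    os.environ["VLLM_USE_FLASHINFER_SAMPLER"] = "1"

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")  # placeholder model
    params = SamplingParams(temperature=0.8, top_p=0.9, top_k=50)
    outputs = llm.generate(["Hello, my name is"], params)
    print(outputs[0].outputs[0].text)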