From e66d787bce22c56f995f4e2974e31ac020bc57ea Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Tue, 14 Oct 2025 22:35:18 -0400
Subject: [PATCH] Disable FlashInfer sampler by default (#26859)

Signed-off-by: mgoin
---
 vllm/v1/sample/ops/topk_topp_sampler.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index ed8bc55a3cf2f..43a40bce6847d 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -46,23 +46,15 @@ class TopKTopPSampler(nn.Module):
                         "Falling back to default sampling implementation."
                     )
                     self.forward = self.forward_native
-                elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
-                    # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
-                    # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
-                    # default it is unused). For backward compatibility, we set
-                    # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and
-                    # interpret it differently in V0 and V1 samplers: In V0,
-                    # None means False, while in V1, None means True. This is
-                    # why we use the condition
-                    # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
+                elif envs.VLLM_USE_FLASHINFER_SAMPLER:
+                    # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
                     logger.info_once("Using FlashInfer for top-p & top-k sampling.")
                     self.forward = self.forward_cuda
                 else:
-                    logger.warning_once(
-                        "FlashInfer is available, but it is not enabled. "
-                        "Falling back to the PyTorch-native implementation of "
-                        "top-p & top-k sampling. For the best performance, "
-                        "please set VLLM_USE_FLASHINFER_SAMPLER=1."
+                    logger.debug_once(
+                        "FlashInfer top-p/top-k sampling is available but disabled "
+                        "by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in "
+                        "after verifying accuracy for your workloads."
                     )
                     self.forward = self.forward_native
             else:
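
Note (illustrative, not part of the commit): a minimal sketch of opting back in
to the FlashInfer sampler after this change. The VLLM_USE_FLASHINFER_SAMPLER
environment variable comes from the patch itself; the model name and prompt
below are placeholders, not taken from the patch.

    import os

    # Opt in to the FlashInfer top-p/top-k sampler. After this patch the
    # PyTorch-native implementation is the default; this must be set to "1"
    # before the engine is created, since the sampler reads it at init time.
    os.environ["VLLM_USE_FLASHINFER_SAMPLER"] = "1"

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")  # placeholder model
    params = SamplingParams(temperature=0.8, top_p=0.9, top_k=50)
    outputs = llm.generate(["Hello, my name is"], params)
    print(outputs[0].outputs[0].text)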