From acaea3bb07883c80b71643ebee1cd08d555797bc Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Sat, 15 Mar 2025 11:42:38 +0800
Subject: [PATCH] [Bugfix][V1] Fix flashinfer sampling (#14815)

---
 vllm/v1/sample/ops/topk_topp_sampler.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 7d70e839b6f4e..d461a80989337 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -24,7 +24,24 @@ class TopKTopPSampler(nn.Module):
         super().__init__()
         if current_platform.is_cuda():
             if is_flashinfer_available:
-                if envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
+                flashinfer_version = flashinfer.__version__
+                if flashinfer_version >= "0.2.3":
+                    # FIXME(DefTruth): Currently, we have errors when using
+                    # FlashInfer>=v0.2.3 for top-p & top-k sampling. As a
+                    # workaround, we disable FlashInfer for top-p & top-k
+                    # sampling by default while FlashInfer>=v0.2.3.
+                    # The sampling API removes the success return value
+                    # of all sampling API, which is not compatible with
+                    # earlier design.
+                    # https://github.com/flashinfer-ai/flashinfer/releases/
+                    # tag/v0.2.3
+                    logger.info(
+                        "Currently, FlashInfer top-p & top-k sampling sampler "
+                        "is disabled because FlashInfer>=v0.2.3 is not "
+                        "backward compatible. Falling back to the PyTorch-"
+                        "native implementation of top-p & top-k sampling.")
+                    self.forward = self.forward_native
+                elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
                     # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
                     # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
                     # default it is unused). For backward compatibility, we set
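
For reference, the change amounts to a version-gated fallback: when the installed FlashInfer is 0.2.3 or newer, the sampler rebinds its forward method to the PyTorch-native top-p/top-k path instead of the FlashInfer kernels, because FlashInfer 0.2.3 removed the success return value from its sampling APIs. The sketch below illustrates the same pattern in a self-contained form; the class name, the _forward_native/_forward_flashinfer helpers, and the use of packaging.version for the comparison are illustrative assumptions, not the actual vLLM code (the patch itself compares the version strings directly).

# Minimal sketch of the version-gated fallback pattern, using hypothetical
# names; only the control flow mirrors the diff above.
import logging

import torch
import torch.nn as nn
from packaging.version import Version

try:
    import flashinfer
    is_flashinfer_available = True
except ImportError:
    flashinfer = None
    is_flashinfer_available = False

logger = logging.getLogger(__name__)


class VersionGatedSampler(nn.Module):

    def __init__(self) -> None:
        super().__init__()
        # Default to the PyTorch-native path.
        self.forward = self._forward_native
        if is_flashinfer_available:
            if Version(flashinfer.__version__) >= Version("0.2.3"):
                # FlashInfer >= 0.2.3 dropped the success return value of its
                # sampling APIs, so stay on the native implementation.
                logger.info("FlashInfer >= 0.2.3 detected; falling back to "
                            "the PyTorch-native top-p/top-k sampler.")
            else:
                self.forward = self._forward_flashinfer

    def _forward_native(self, logits: torch.Tensor) -> torch.Tensor:
        # Plain PyTorch sampling from the softmax of the logits.
        probs = torch.softmax(logits, dim=-1)
        return torch.multinomial(probs, num_samples=1)

    def _forward_flashinfer(self, logits: torch.Tensor) -> torch.Tensor:
        # Placeholder for a FlashInfer-backed sampling path (not shown here).
        raise NotImplementedError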