From 9eec282cb5a6fe0b39449476dc4d14da0516984c Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 28 Nov 2025 16:34:48 +0000
Subject: [PATCH] Guard FlashInfer sampler using the same check as FlashInfer
 attention backend (#29415)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Cyrus Leung
---
 vllm/v1/sample/ops/topk_topp_sampler.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 5b2d130b0ea42..c9229e788b6bf 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -33,6 +33,16 @@ class TopKTopPSampler(nn.Module):
             and current_platform.is_cuda()
         ):
             if envs.VLLM_USE_FLASHINFER_SAMPLER:
+                from vllm.v1.attention.backends.flashinfer import FlashInferBackend
+
+                capability = current_platform.get_device_capability()
+                assert capability is not None
+                if not FlashInferBackend.supports_compute_capability(capability):
+                    capability_str = capability.as_version_str()
+                    raise RuntimeError(
+                        "FlashInfer does not support compute capability "
+                        f"{capability_str}, unset VLLM_USE_FLASHINFER_SAMPLER=1."
+                    )
                 # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
                 logger.info_once(
                     "Using FlashInfer for top-p & top-k sampling.",
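
For reference, the sketch below shows the added guard in isolation. It is not part of the patch: DeviceCapability here is a hypothetical stand-in for vllm's device-capability tuple, and the SM 8.0 minimum in the example is an assumed threshold for illustration only; in the patch the real check delegates to FlashInferBackend.supports_compute_capability.

    from dataclasses import dataclass
    from typing import Callable, Optional

    @dataclass(frozen=True)
    class DeviceCapability:
        # Hypothetical stand-in for vllm's device capability object.
        major: int
        minor: int

        def as_version_str(self) -> str:
            return f"{self.major}.{self.minor}"

    def guard_flashinfer_sampler(
        capability: Optional[DeviceCapability],
        supports_compute_capability: Callable[[DeviceCapability], bool],
    ) -> None:
        # Mirrors the added check: fail fast with an actionable message
        # instead of a later FlashInfer kernel failure on an unsupported GPU.
        assert capability is not None
        if not supports_compute_capability(capability):
            raise RuntimeError(
                "FlashInfer does not support compute capability "
                f"{capability.as_version_str()}, unset VLLM_USE_FLASHINFER_SAMPLER=1."
            )

    # Example: with an assumed minimum of SM 8.0, a Turing (7.5) GPU is
    # rejected here rather than failing later inside a sampling kernel.
    try:
        guard_flashinfer_sampler(
            DeviceCapability(7, 5),
            lambda cap: (cap.major, cap.minor) >= (8, 0),
        )
    except RuntimeError as exc:
        print(exc)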