diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 87a84e5bf4350..460e1c0b05bca 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -33,7 +33,7 @@ class TopKTopPSampler(nn.Module):
         if is_flashinfer_available:
             flashinfer_version = flashinfer.__version__
             if flashinfer_version < "0.2.3":
-                logger.warning(
+                logger.warning_once(
                     "FlashInfer version >= 0.2.3 required. "
                     "Falling back to default sampling implementation.")
                 self.forward = self.forward_native
@@ -46,17 +46,18 @@ class TopKTopPSampler(nn.Module):
                 # None means False, while in V1, None means True. This is
                 # why we use the condition
                 # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
-                logger.info("Using FlashInfer for top-p & top-k sampling.")
+                logger.info_once(
+                    "Using FlashInfer for top-p & top-k sampling.")
                 self.forward = self.forward_cuda
             else:
-                logger.warning(
+                logger.warning_once(
                     "FlashInfer is available, but it is not enabled. "
                     "Falling back to the PyTorch-native implementation of "
                     "top-p & top-k sampling. For the best performance, "
                     "please set VLLM_USE_FLASHINFER_SAMPLER=1.")
                 self.forward = self.forward_native
         else:
-            logger.warning(
+            logger.warning_once(
                 "FlashInfer is not available. Falling back to the PyTorch-"
                 "native implementation of top-p & top-k sampling. For the "
                 "best performance, please install FlashInfer.")
@@ -97,9 +98,9 @@ class TopKTopPSampler(nn.Module):
             probs = logits.softmax(dim=-1, dtype=torch.float32)
             return random_sample(probs, generators)
         if generators:
-            logger.warning("FlashInfer 0.2.3+ does not support "
-                           "per-request generators. Falling back to "
-                           "PyTorch-native implementation.")
+            logger.warning_once("FlashInfer 0.2.3+ does not support "
+                                "per-request generators. Falling back to "
+                                "PyTorch-native implementation.")
             return self.forward_native(logits, generators, k, p)
         # flashinfer sampling functions expect contiguous logits.
         # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous