mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-10 10:57:03 +08:00
[Logs] Change flashinfer sampler logs to once (#21759)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
parent
31084b3b1f
commit
34a20c49b3
@ -33,7 +33,7 @@ class TopKTopPSampler(nn.Module):
|
||||
if is_flashinfer_available:
|
||||
flashinfer_version = flashinfer.__version__
|
||||
if flashinfer_version < "0.2.3":
|
||||
logger.warning(
|
||||
logger.warning_once(
|
||||
"FlashInfer version >= 0.2.3 required. "
|
||||
"Falling back to default sampling implementation.")
|
||||
self.forward = self.forward_native
|
||||
@ -46,17 +46,18 @@ class TopKTopPSampler(nn.Module):
|
||||
# None means False, while in V1, None means True. This is
|
||||
# why we use the condition
|
||||
# `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
|
||||
logger.info("Using FlashInfer for top-p & top-k sampling.")
|
||||
logger.info_once(
|
||||
"Using FlashInfer for top-p & top-k sampling.")
|
||||
self.forward = self.forward_cuda
|
||||
else:
|
||||
logger.warning(
|
||||
logger.warning_once(
|
||||
"FlashInfer is available, but it is not enabled. "
|
||||
"Falling back to the PyTorch-native implementation of "
|
||||
"top-p & top-k sampling. For the best performance, "
|
||||
"please set VLLM_USE_FLASHINFER_SAMPLER=1.")
|
||||
self.forward = self.forward_native
|
||||
else:
|
||||
logger.warning(
|
||||
logger.warning_once(
|
||||
"FlashInfer is not available. Falling back to the PyTorch-"
|
||||
"native implementation of top-p & top-k sampling. For the "
|
||||
"best performance, please install FlashInfer.")
|
||||
@ -97,9 +98,9 @@ class TopKTopPSampler(nn.Module):
|
||||
probs = logits.softmax(dim=-1, dtype=torch.float32)
|
||||
return random_sample(probs, generators)
|
||||
if generators:
|
||||
logger.warning("FlashInfer 0.2.3+ does not support "
|
||||
"per-request generators. Falling back to "
|
||||
"PyTorch-native implementation.")
|
||||
logger.warning_once("FlashInfer 0.2.3+ does not support "
|
||||
"per-request generators. Falling back to "
|
||||
"PyTorch-native implementation.")
|
||||
return self.forward_native(logits, generators, k, p)
|
||||
# flashinfer sampling functions expect contiguous logits.
|
||||
# In flex_attn/triton_attn fp32 inference, logits can be non-contiguous
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user