mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-11 06:25:51 +08:00
Disable FlashInfer sampler by default (#26859)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
parent
bfad142e25
commit
e66d787bce
@@ -46,23 +46,15 @@ class TopKTopPSampler(nn.Module):
|
|||||||
"Falling back to default sampling implementation."
|
"Falling back to default sampling implementation."
|
||||||
)
|
)
|
||||||
self.forward = self.forward_native
|
self.forward = self.forward_native
|
||||||
elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
|
elif envs.VLLM_USE_FLASHINFER_SAMPLER:
|
||||||
# NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
|
# Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
|
||||||
# sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
|
|
||||||
# default it is unused). For backward compatibility, we set
|
|
||||||
# `VLLM_USE_FLASHINFER_SAMPLER` as None by default and
|
|
||||||
# interpret it differently in V0 and V1 samplers: In V0,
|
|
||||||
# None means False, while in V1, None means True. This is
|
|
||||||
# why we use the condition
|
|
||||||
# `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
|
|
||||||
logger.info_once("Using FlashInfer for top-p & top-k sampling.")
|
logger.info_once("Using FlashInfer for top-p & top-k sampling.")
|
||||||
self.forward = self.forward_cuda
|
self.forward = self.forward_cuda
|
||||||
else:
|
else:
|
||||||
logger.warning_once(
|
logger.debug_once(
|
||||||
"FlashInfer is available, but it is not enabled. "
|
"FlashInfer top-p/top-k sampling is available but disabled "
|
||||||
"Falling back to the PyTorch-native implementation of "
|
"by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in "
|
||||||
"top-p & top-k sampling. For the best performance, "
|
"after verifying accuracy for your workloads."
|
||||||
"please set VLLM_USE_FLASHINFER_SAMPLER=1."
|
|
||||||
)
|
)
|
||||||
self.forward = self.forward_native
|
self.forward = self.forward_native
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user