mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-09 16:47:08 +08:00
format
Signed-off-by: Reagan <reaganjlee@gmail.com>
This commit is contained in:
parent
169be31b01
commit
6275b3c1b9
62
vllm/envs.py
62
vllm/envs.py
@ -651,11 +651,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"NO_COLOR": lambda: os.getenv("NO_COLOR", "0") != "0",
|
||||
# If set, vllm will log stats at this interval in seconds
|
||||
# If not set, vllm will log stats every 10 seconds.
|
||||
"VLLM_LOG_STATS_INTERVAL": lambda: (
|
||||
val
|
||||
if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0
|
||||
else 10.0
|
||||
),
|
||||
"VLLM_LOG_STATS_INTERVAL": lambda: val
|
||||
if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0
|
||||
else 10.0,
|
||||
# Trace function calls
|
||||
# If set to 1, vllm will trace function calls
|
||||
# Useful for debugging
|
||||
@ -680,30 +678,28 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
),
|
||||
),
|
||||
# If set, vllm will use flashinfer sampler
|
||||
"VLLM_USE_FLASHINFER_SAMPLER": lambda: (
|
||||
bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]))
|
||||
if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
|
||||
else None
|
||||
),
|
||||
"VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
|
||||
int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])
|
||||
)
|
||||
if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
|
||||
else None,
|
||||
# Pipeline stage partition strategy
|
||||
"VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
|
||||
# (CPU backend only) CPU key-value cache space.
|
||||
# default is None and will be set as 4 GB
|
||||
"VLLM_CPU_KVCACHE_SPACE": lambda: (
|
||||
int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
|
||||
if "VLLM_CPU_KVCACHE_SPACE" in os.environ
|
||||
else None
|
||||
),
|
||||
"VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
|
||||
if "VLLM_CPU_KVCACHE_SPACE" in os.environ
|
||||
else None,
|
||||
# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
|
||||
# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
|
||||
"VLLM_CPU_OMP_THREADS_BIND": lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
|
||||
# (CPU backend only) CPU cores not used by OMP threads.
|
||||
# Those CPU cores will not be used by OMP threads of a rank.
|
||||
"VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: (
|
||||
int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0"))
|
||||
if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ
|
||||
else None
|
||||
),
|
||||
"VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
|
||||
os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
|
||||
)
|
||||
if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ
|
||||
else None,
|
||||
# (CPU backend only) whether to use SGL kernels, optimized for small batch.
|
||||
"VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),
|
||||
# If the env var is set, Ray Compiled Graph uses the specified
|
||||
@ -847,11 +843,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# a list of plugin names to load, separated by commas.
|
||||
# if this is not set, it means all plugins will be loaded
|
||||
# if this is set to an empty string, no plugins will be loaded
|
||||
"VLLM_PLUGINS": lambda: (
|
||||
None
|
||||
if "VLLM_PLUGINS" not in os.environ
|
||||
else os.environ["VLLM_PLUGINS"].split(",")
|
||||
),
|
||||
"VLLM_PLUGINS": lambda: None
|
||||
if "VLLM_PLUGINS" not in os.environ
|
||||
else os.environ["VLLM_PLUGINS"].split(","),
|
||||
# a local directory to look in for unrecognized LoRA adapters.
|
||||
# only works if plugins are enabled and
|
||||
# VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
|
||||
@ -923,11 +917,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# and performance comparisons. Currently only affects MPLinearKernel
|
||||
# selection
|
||||
# (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel)
|
||||
"VLLM_DISABLED_KERNELS": lambda: (
|
||||
[]
|
||||
if "VLLM_DISABLED_KERNELS" not in os.environ
|
||||
else os.environ["VLLM_DISABLED_KERNELS"].split(",")
|
||||
),
|
||||
"VLLM_DISABLED_KERNELS": lambda: []
|
||||
if "VLLM_DISABLED_KERNELS" not in os.environ
|
||||
else os.environ["VLLM_DISABLED_KERNELS"].split(","),
|
||||
# Disable pynccl (using torch.distributed instead)
|
||||
"VLLM_DISABLE_PYNCCL": lambda: (
|
||||
os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
|
||||
@ -1163,11 +1155,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
== "1",
|
||||
# Gap between padding buckets for the forward pass. E.g., with a gap of
|
||||
# 8, we will run forward pass with [16, 24, 32, ...].
|
||||
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: (
|
||||
int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"])
|
||||
if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ
|
||||
else 0
|
||||
),
|
||||
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
|
||||
os.environ["VLLM_TPU_BUCKET_PADDING_GAP"]
|
||||
)
|
||||
if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ
|
||||
else 0,
|
||||
"VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
|
||||
os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)
|
||||
),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user