Signed-off-by: Reagan <reaganjlee@gmail.com>
This commit is contained in:
Reagan 2025-12-16 17:07:02 -08:00
parent 169be31b01
commit 6275b3c1b9

View File

@ -651,11 +651,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
"NO_COLOR": lambda: os.getenv("NO_COLOR", "0") != "0",
# If set, vllm will log stats at this interval in seconds
# If not set, vllm will log stats every 10 seconds.
"VLLM_LOG_STATS_INTERVAL": lambda: (
val
if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0
else 10.0
),
"VLLM_LOG_STATS_INTERVAL": lambda: val
if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0
else 10.0,
# Trace function calls
# If set to 1, vllm will trace function calls
# Useful for debugging
@ -680,30 +678,28 @@ environment_variables: dict[str, Callable[[], Any]] = {
),
),
# If set, vllm will use flashinfer sampler
"VLLM_USE_FLASHINFER_SAMPLER": lambda: (
bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]))
if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
else None
),
"VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])
)
if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
else None,
# Pipeline stage partition strategy
"VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
# (CPU backend only) CPU key-value cache space.
# default is None and will be set as 4 GB
"VLLM_CPU_KVCACHE_SPACE": lambda: (
int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
if "VLLM_CPU_KVCACHE_SPACE" in os.environ
else None
),
"VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
if "VLLM_CPU_KVCACHE_SPACE" in os.environ
else None,
# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
"VLLM_CPU_OMP_THREADS_BIND": lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
# (CPU backend only) Number of CPU cores not used by OMP threads.
# Those CPU cores will not be used by OMP threads of a rank.
"VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: (
int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0"))
if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ
else None
),
"VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
)
if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ
else None,
# (CPU backend only) whether to use SGL kernels, optimized for small batch.
"VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),
# If the env var is set, Ray Compiled Graph uses the specified
@ -847,11 +843,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
# a list of plugin names to load, separated by commas.
# if this is not set, it means all plugins will be loaded
# if this is set to an empty string, no plugins will be loaded
"VLLM_PLUGINS": lambda: (
None
if "VLLM_PLUGINS" not in os.environ
else os.environ["VLLM_PLUGINS"].split(",")
),
"VLLM_PLUGINS": lambda: None
if "VLLM_PLUGINS" not in os.environ
else os.environ["VLLM_PLUGINS"].split(","),
# a local directory to look in for unrecognized LoRA adapters.
# only works if plugins are enabled and
# VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
@ -923,11 +917,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
# and performance comparisons. Currently only affects MPLinearKernel
# selection
# (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel)
"VLLM_DISABLED_KERNELS": lambda: (
[]
if "VLLM_DISABLED_KERNELS" not in os.environ
else os.environ["VLLM_DISABLED_KERNELS"].split(",")
),
"VLLM_DISABLED_KERNELS": lambda: []
if "VLLM_DISABLED_KERNELS" not in os.environ
else os.environ["VLLM_DISABLED_KERNELS"].split(","),
# Disable pynccl (using torch.distributed instead)
"VLLM_DISABLE_PYNCCL": lambda: (
os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
@ -1163,11 +1155,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
== "1",
# Gap between padding buckets for the forward pass. So if the gap is
# 8, we will run forward pass with [16, 24, 32, ...].
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: (
int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"])
if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ
else 0
),
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
os.environ["VLLM_TPU_BUCKET_PADDING_GAP"]
)
if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ
else 0,
"VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)
),