diff --git a/vllm/envs.py b/vllm/envs.py index e0d28a0369ad7..7e072a588591c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -651,11 +651,9 @@ environment_variables: dict[str, Callable[[], Any]] = { "NO_COLOR": lambda: os.getenv("NO_COLOR", "0") != "0", # If set, vllm will log stats at this interval in seconds # If not set, vllm will log stats every 10 seconds. - "VLLM_LOG_STATS_INTERVAL": lambda: ( - val - if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0 - else 10.0 - ), + "VLLM_LOG_STATS_INTERVAL": lambda: val + if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0 + else 10.0, # Trace function calls # If set to 1, vllm will trace function calls # Useful for debugging @@ -680,30 +678,28 @@ environment_variables: dict[str, Callable[[], Any]] = { ), ), # If set, vllm will use flashinfer sampler - "VLLM_USE_FLASHINFER_SAMPLER": lambda: ( - bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])) - if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ - else None - ), + "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool( + int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]) + ) + if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ + else None, # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), # (CPU backend only) CPU key-value cache space. # default is None and will be set as 4 GB - "VLLM_CPU_KVCACHE_SPACE": lambda: ( - int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")) - if "VLLM_CPU_KVCACHE_SPACE" in os.environ - else None - ), + "VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")) + if "VLLM_CPU_KVCACHE_SPACE" in os.environ + else None, # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. "VLLM_CPU_OMP_THREADS_BIND": lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"), # (CPU backend only) CPU cores not used by OMP threads . # Those CPU cores will not be used by OMP threads of a rank. - "VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: ( - int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")) - if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ - else None - ), + "VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int( + os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0") + ) + if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ + else None, # (CPU backend only) whether to use SGL kernels, optimized for small batch. "VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))), # If the env var is set, Ray Compiled Graph uses the specified @@ -847,11 +843,9 @@ environment_variables: dict[str, Callable[[], Any]] = { # a list of plugin names to load, separated by commas. # if this is not set, it means all plugins will be loaded # if this is set to an empty string, no plugins will be loaded - "VLLM_PLUGINS": lambda: ( - None - if "VLLM_PLUGINS" not in os.environ - else os.environ["VLLM_PLUGINS"].split(",") - ), + "VLLM_PLUGINS": lambda: None + if "VLLM_PLUGINS" not in os.environ + else os.environ["VLLM_PLUGINS"].split(","), # a local directory to look in for unrecognized LoRA adapters. # only works if plugins are enabled and # VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled. @@ -923,11 +917,9 @@ environment_variables: dict[str, Callable[[], Any]] = { # and performance comparisons. Currently only affects MPLinearKernel # selection # (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel) - "VLLM_DISABLED_KERNELS": lambda: ( - [] - if "VLLM_DISABLED_KERNELS" not in os.environ - else os.environ["VLLM_DISABLED_KERNELS"].split(",") - ), + "VLLM_DISABLED_KERNELS": lambda: [] + if "VLLM_DISABLED_KERNELS" not in os.environ + else os.environ["VLLM_DISABLED_KERNELS"].split(","), # Disable pynccl (using torch.distributed instead) "VLLM_DISABLE_PYNCCL": lambda: ( os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1") @@ -1163,11 +1155,11 @@ environment_variables: dict[str, Callable[[], Any]] = { == "1", # Gap between padding buckets for the forward pass. So we have # 8, we will run forward pass with [16, 24, 32, ...]. - "VLLM_TPU_BUCKET_PADDING_GAP": lambda: ( - int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"]) - if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ - else 0 - ), + "VLLM_TPU_BUCKET_PADDING_GAP": lambda: int( + os.environ["VLLM_TPU_BUCKET_PADDING_GAP"] + ) + if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ + else 0, "VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int( os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None) ),