From 1eec2bf88b073a74b9e974be14ae4273ef7e56a9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 26 Aug 2025 00:45:58 +0000 Subject: [PATCH] Complete refactoring - remove backup file and finalize envs module Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> --- vllm/envs.py.backup | 1272 ------------------------------------------- 1 file changed, 1272 deletions(-) delete mode 100755 vllm/envs.py.backup diff --git a/vllm/envs.py.backup b/vllm/envs.py.backup deleted file mode 100755 index 1c9c4cdde8001..0000000000000 --- a/vllm/envs.py.backup +++ /dev/null @@ -1,1272 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import hashlib -import os -import sys -import tempfile -from typing import TYPE_CHECKING, Any, Callable, Optional - -if TYPE_CHECKING: - VLLM_HOST_IP: str = "" - VLLM_PORT: Optional[int] = None - VLLM_RPC_BASE_PATH: str = tempfile.gettempdir() - VLLM_USE_MODELSCOPE: bool = False - VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 - VLLM_NCCL_SO_PATH: Optional[str] = None - LD_LIBRARY_PATH: Optional[str] = None - VLLM_USE_TRITON_FLASH_ATTN: bool = True - VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False - VLLM_USE_AITER_UNIFIED_ATTENTION: bool = False - VLLM_FLASH_ATTN_VERSION: Optional[int] = None - LOCAL_RANK: int = 0 - CUDA_VISIBLE_DEVICES: Optional[str] = None - VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60 - VLLM_API_KEY: Optional[str] = None - S3_ACCESS_KEY_ID: Optional[str] = None - S3_SECRET_ACCESS_KEY: Optional[str] = None - S3_ENDPOINT_URL: Optional[str] = None - VLLM_MODEL_REDIRECT_PATH: Optional[str] = None - VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm") - VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") - VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" - VLLM_NO_USAGE_STATS: bool = False - VLLM_DO_NOT_TRACK: bool = False - VLLM_USAGE_SOURCE: str = "" - VLLM_CONFIGURE_LOGGING: int = 1 - VLLM_LOGGING_LEVEL: str = "INFO" - VLLM_LOGGING_PREFIX: str = "" - VLLM_LOGGING_CONFIG_PATH: Optional[str] = None - VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None - VLLM_LOG_STATS_INTERVAL: float = 10. 
- VLLM_TRACE_FUNCTION: int = 0 - VLLM_ATTENTION_BACKEND: Optional[str] = None - VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None - VLLM_PP_LAYER_PARTITION: Optional[str] = None - VLLM_CPU_KVCACHE_SPACE: Optional[int] = 0 - VLLM_CPU_OMP_THREADS_BIND: str = "" - VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None - VLLM_CPU_MOE_PREPACK: bool = True - VLLM_CPU_SGL_KERNEL: bool = False - VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") - VLLM_XLA_CHECK_RECOMPILATION: bool = False - VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 - VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True - VLLM_USE_RAY_SPMD_WORKER: bool = False - VLLM_USE_RAY_COMPILED_DAG: bool = False - VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto" - VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False - VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True - VLLM_XLA_USE_SPMD: bool = False - VLLM_WORKER_MULTIPROC_METHOD: str = "fork" - VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") - VLLM_IMAGE_FETCH_TIMEOUT: int = 5 - VLLM_VIDEO_FETCH_TIMEOUT: int = 30 - VLLM_AUDIO_FETCH_TIMEOUT: int = 10 - VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8 - VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 - VLLM_VIDEO_LOADER_BACKEND: str = "opencv" - VLLM_MM_INPUT_CACHE_GIB: int = 4 - VLLM_TARGET_DEVICE: str = "cuda" - MAX_JOBS: Optional[str] = None - NVCC_THREADS: Optional[str] = None - VLLM_USE_PRECOMPILED: bool = False - VLLM_DOCKER_BUILD_CONTEXT: bool = False - VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False - VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False - CMAKE_BUILD_TYPE: Optional[str] = None - VERBOSE: bool = False - VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False - VLLM_RPC_TIMEOUT: int = 10000 # ms - VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds - VLLM_PLUGINS: Optional[list[str]] = None - VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None - VLLM_TORCH_PROFILER_DIR: Optional[str] = None - VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False - VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False - VLLM_TORCH_PROFILER_WITH_STACK: bool = True - VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False - VLLM_USE_TRITON_AWQ: bool = False - VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False - VLLM_SKIP_P2P_CHECK: bool = False - VLLM_DISABLED_KERNELS: list[str] = [] - VLLM_USE_V1: bool = True - VLLM_ROCM_USE_AITER: bool = False - VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False - VLLM_ROCM_USE_AITER_LINEAR: bool = True - VLLM_ROCM_USE_AITER_MOE: bool = True - VLLM_ROCM_USE_AITER_RMSNORM: bool = True - VLLM_ROCM_USE_AITER_MLA: bool = True - VLLM_ROCM_USE_AITER_MHA: bool = True - VLLM_ROCM_USE_SKINNY_GEMM: bool = True - VLLM_ROCM_FP8_PADDING: bool = True - VLLM_ROCM_MOE_PADDING: bool = True - VLLM_ROCM_CUSTOM_PAGED_ATTN: bool = True - VLLM_ENABLE_V1_MULTIPROCESSING: bool = True - VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 - VLLM_DISABLE_COMPILE_CACHE: bool = False - Q_SCALE_CONSTANT: int = 200 - K_SCALE_CONSTANT: int = 200 - V_SCALE_CONSTANT: int = 100 - VLLM_SERVER_DEV_MODE: bool = False - VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 - VLLM_MLA_DISABLE: bool = False - VLLM_RAY_PER_WORKER_GPUS: float = 1.0 - VLLM_RAY_BUNDLE_INDICES: str = "" - VLLM_CUDART_SO_PATH: Optional[str] = None - VLLM_DP_RANK: int = 0 - VLLM_DP_RANK_LOCAL: int = -1 - VLLM_DP_SIZE: int = 1 - VLLM_DP_MASTER_IP: str = "" - VLLM_DP_MASTER_PORT: int = 0 - VLLM_MOE_DP_CHUNK_SIZE: int = 256 - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False - VLLM_MARLIN_USE_ATOMIC_ADD: bool = False - VLLM_MXFP4_USE_MARLIN: Optional[bool] = None - VLLM_V0_USE_OUTLINES_CACHE: bool = False - 
VLLM_V1_USE_OUTLINES_CACHE: bool = False - VLLM_TPU_BUCKET_PADDING_GAP: int = 0 - VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None - VLLM_TPU_USING_PATHWAYS: bool = False - VLLM_USE_DEEP_GEMM: bool = False - VLLM_USE_DEEP_GEMM_E8M0: bool = True - VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False - VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True - VLLM_USE_FLASHINFER_MOE_FP8: bool = False - VLLM_USE_FLASHINFER_MOE_FP4: bool = False - VLLM_FLASHINFER_MOE_BACKEND: str = "throughput" - VLLM_XGRAMMAR_CACHE_MB: int = 0 - VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 - VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False - VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost" - VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 - VLLM_ALL2ALL_BACKEND: str = "naive" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 - VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1 - VLLM_SLEEP_WHEN_IDLE: bool = False - VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16 - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300 - VLLM_KV_CACHE_LAYOUT: Optional[str] = None - VLLM_COMPUTE_NANS_IN_LOGITS: bool = False - VLLM_USE_NVFP4_CT_EMULATIONS: bool = False - VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = "NONE" - VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True - VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None - VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120 - VLLM_USE_CUDNN_PREFILL: bool = False - VLLM_ENABLE_CUDAGRAPH_GC: bool = False - VLLM_LOOPBACK_IP: str = "" - VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False - VLLM_ENABLE_RESPONSES_API_STORE: bool = False - VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None - VLLM_HAS_FLASHINFER_CUBIN: bool = False - VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False - VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False - VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False - VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None - - -def get_default_cache_root(): - return os.getenv( - "XDG_CACHE_HOME", - os.path.join(os.path.expanduser("~"), ".cache"), - ) - - -def get_default_config_root(): - return os.getenv( - "XDG_CONFIG_HOME", - os.path.join(os.path.expanduser("~"), ".config"), - ) - - -def maybe_convert_int(value: Optional[str]) -> Optional[int]: - if value is None: - return None - return int(value) - - -def maybe_convert_bool(value: Optional[str]) -> Optional[bool]: - if value is None: - return None - return bool(int(value)) - - -def get_vllm_port() -> Optional[int]: - """Get the port from the VLLM_PORT environment variable. - - Returns: - The port number as an integer if VLLM_PORT is set, None otherwise. - - Raises: - ValueError: If VLLM_PORT is a URI, suggesting a k8s service discovery issue. - """ - if 'VLLM_PORT' not in os.environ: - return None - - port = os.getenv('VLLM_PORT', '0') - - try: - return int(port) - except ValueError as err: - from urllib.parse import urlparse - parsed = urlparse(port) - if parsed.scheme: - raise ValueError( - f"VLLM_PORT '{port}' appears to be a URI. " - "This may be caused by a Kubernetes service discovery issue; " - "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html" - ) from None - raise ValueError( - f"VLLM_PORT '{port}' must be a valid integer") from err - - -# The begin-* and end-* here are used by the documentation generator - # to extract the used env vars.
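The URI check in `get_vllm_port` above guards against a real Kubernetes footgun: a Service named `vllm` causes Kubernetes to inject `VLLM_PORT` as a URI such as `tcp://<cluster-ip>:<port>`, which `int()` cannot parse. Below is a minimal, self-contained sketch of that parsing behavior; `parse_port` and the demo values are illustrative, not part of the module.

```python
# Minimal sketch of the VLLM_PORT validation above. The demo values are
# hypothetical; only the variable name mirrors the real one.
import os
from urllib.parse import urlparse


def parse_port(raw: str) -> int:
    try:
        return int(raw)
    except ValueError as err:
        parsed = urlparse(raw)
        if parsed.scheme:  # e.g. "tcp://10.0.0.1:5557" injected by k8s
            raise ValueError(
                f"VLLM_PORT {raw!r} appears to be a URI; likely a Kubernetes "
                "service discovery collision") from None
        raise ValueError(f"VLLM_PORT {raw!r} must be a valid integer") from err


os.environ["VLLM_PORT"] = "8000"
print(parse_port(os.environ["VLLM_PORT"]))  # -> 8000

os.environ["VLLM_PORT"] = "tcp://10.0.0.1:5557"
try:
    parse_port(os.environ["VLLM_PORT"])
except ValueError as e:
    print(e)  # URI detected -> actionable error instead of a bare int() traceback
```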
- -# --8<-- [start:env-vars-definition] - -environment_variables: dict[str, Callable[[], Any]] = { - - # ================== Installation Time Env Vars ================== - - # Target device of vLLM, supporting [cuda (by default), - # rocm, neuron, cpu] - "VLLM_TARGET_DEVICE": - lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), - - # Maximum number of compilation jobs to run in parallel. - # By default this is the number of CPUs - "MAX_JOBS": - lambda: os.getenv("MAX_JOBS", None), - - # Number of threads to use for nvcc - # By default this is 1. - # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU. - "NVCC_THREADS": - lambda: os.getenv("NVCC_THREADS", None), - - # If set, vllm will use precompiled binaries (*.so) - "VLLM_USE_PRECOMPILED": - lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in - ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), - - # Used to mark that setup.py is running in a Docker build context, - # in order to force the use of precompiled binaries. - "VLLM_DOCKER_BUILD_CONTEXT": - lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in - ("1", "true"), - - # Whether to force using nightly wheel in python build. - # This is used for testing the nightly wheel in python build. - "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": - lambda: bool(int(os.getenv("VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL", "0")) - ), - - # CMake build type - # If not set, defaults to "Debug" or "RelWithDebInfo" - # Available options: "Debug", "Release", "RelWithDebInfo" - "CMAKE_BUILD_TYPE": - lambda: os.getenv("CMAKE_BUILD_TYPE"), - - # If set, vllm will print verbose logs during installation - "VERBOSE": - lambda: bool(int(os.getenv('VERBOSE', '0'))), - - # Root directory for vLLM configuration files - # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set - # Note that this not only affects how vllm finds its configuration files - # during runtime, but also affects how vllm installs its configuration - # files during **installation**. - "VLLM_CONFIG_ROOT": - lambda: os.path.expanduser( - os.getenv( - "VLLM_CONFIG_ROOT", - os.path.join(get_default_config_root(), "vllm"), - )), - - # ================== Runtime Env Vars ================== - - # Root directory for vLLM cache files - # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set - "VLLM_CACHE_ROOT": - lambda: os.path.expanduser( - os.getenv( - "VLLM_CACHE_ROOT", - os.path.join(get_default_cache_root(), "vllm"), - )), - - # used in distributed environment to determine the ip address - # of the current node, when the node has multiple network interfaces. - # If you are using multi-node inference, you should set this differently - # on each node. - 'VLLM_HOST_IP': - lambda: os.getenv('VLLM_HOST_IP', ""), - - # used in distributed environment to manually set the communication port - # Note: if VLLM_PORT is set, and some code asks for multiple ports, the - # VLLM_PORT will be used as the first port, and the rest will be generated - # by incrementing the VLLM_PORT value. - 'VLLM_PORT': - get_vllm_port, - - # path used for ipc when the frontend api server is running in - # multi-processing mode to communicate with the backend engine process. - 'VLLM_RPC_BASE_PATH': - lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()), - - # If true, will load models from ModelScope instead of Hugging Face Hub. 
- # Note that the value must be "true" or "false", not a number - "VLLM_USE_MODELSCOPE": - lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true", - - # Interval in seconds to log a warning message when the ring buffer is full - "VLLM_RINGBUFFER_WARNING_INTERVAL": - lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")), - - # path to cudatoolkit home directory, under which should be bin, include, - # and lib directories. - "CUDA_HOME": - lambda: os.environ.get("CUDA_HOME", None), - - # Path to the NCCL library file. It is needed because nccl>=2.19 brought - # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234 - "VLLM_NCCL_SO_PATH": - lambda: os.environ.get("VLLM_NCCL_SO_PATH", None), - - # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl - # library file in the locations specified by `LD_LIBRARY_PATH` - "LD_LIBRARY_PATH": - lambda: os.environ.get("LD_LIBRARY_PATH", None), - - # flag to control if vllm should use triton flash attention - "VLLM_USE_TRITON_FLASH_ATTN": - lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in - ("true", "1")), - - # Use separate prefill and decode kernels for V1 attention instead of - # the unified triton kernel. - "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": - lambda: - (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in - ("true", "1")), - - # Use AITER triton unified attention for V1 attention - "VLLM_USE_AITER_UNIFIED_ATTENTION": - lambda: - (os.getenv("VLLM_USE_AITER_UNIFIED_ATTENTION", "False").lower() in - ("true", "1")), - - # Force vllm to use a specific flash-attention version (2 or 3), only valid - # when using the flash-attention backend. - "VLLM_FLASH_ATTN_VERSION": - lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)), - - # Internal flag to enable Dynamo fullgraph capture - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": - lambda: bool( - os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), - - # Feature flag to enable/disable Inductor standalone compile. - # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is - # enabled by default.
- "VLLM_USE_STANDALONE_COMPILE": - lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1", - - # local rank of the process in the distributed setting, used to determine - # the GPU device id - "LOCAL_RANK": - lambda: int(os.environ.get("LOCAL_RANK", "0")), - - # used to control the visible devices in the distributed setting - "CUDA_VISIBLE_DEVICES": - lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None), - - # timeout for each iteration in the engine - "VLLM_ENGINE_ITERATION_TIMEOUT_S": - lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")), - - # API key for vLLM API server - "VLLM_API_KEY": - lambda: os.environ.get("VLLM_API_KEY", None), - - # Whether to log responses from API Server for debugging - "VLLM_DEBUG_LOG_API_SERVER_RESPONSE": - lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False" - ).lower() == "true", - - # S3 access information, used for tensorizer to load model from S3 - "S3_ACCESS_KEY_ID": - lambda: os.environ.get("S3_ACCESS_KEY_ID", None), - "S3_SECRET_ACCESS_KEY": - lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None), - "S3_ENDPOINT_URL": - lambda: os.environ.get("S3_ENDPOINT_URL", None), - - # Usage stats collection - "VLLM_USAGE_STATS_SERVER": - lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"), - "VLLM_NO_USAGE_STATS": - lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", - "VLLM_DO_NOT_TRACK": - lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get( - "DO_NOT_TRACK", None) or "0") == "1", - "VLLM_USAGE_SOURCE": - lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"), - - # Logging configuration - # If set to 0, vllm will not configure logging - # If set to 1, vllm will configure logging using the default configuration - # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH - "VLLM_CONFIGURE_LOGGING": - lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")), - "VLLM_LOGGING_CONFIG_PATH": - lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"), - - # this is used for configuring the default logging level - "VLLM_LOGGING_LEVEL": - lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO").upper(), - - # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages - "VLLM_LOGGING_PREFIX": - lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), - - # if set, vllm will call logits processors in a thread pool with this many - # threads. This is useful when using custom logits processors that either - # (a) launch additional CUDA kernels or (b) do significant CPU-bound work - # while not holding the python GIL, or both. - "VLLM_LOGITS_PROCESSOR_THREADS": - lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")) - if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None, - - # If set, vllm will log stats at this interval in seconds - # If not set, vllm will log stats every 10 seconds. - "VLLM_LOG_STATS_INTERVAL": - lambda: val if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) - > 0. 
else 10., - - # Trace function calls - # If set to 1, vllm will trace function calls - # Useful for debugging - "VLLM_TRACE_FUNCTION": - lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")), - - # Backend for attention computation - # Available options: - # - "TORCH_SDPA": use torch.nn.MultiheadAttention - # - "FLASH_ATTN": use FlashAttention - # - "XFORMERS": use XFormers - # - "ROCM_FLASH": use ROCmFlashAttention - # - "FLASHINFER": use flashinfer - # - "FLASHMLA": use FlashMLA - "VLLM_ATTENTION_BACKEND": - lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), - - # If set, vllm will use flashinfer sampler - "VLLM_USE_FLASHINFER_SAMPLER": - lambda: bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])) - if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ else None, - - # Pipeline stage partition strategy - "VLLM_PP_LAYER_PARTITION": - lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), - - # (CPU backend only) CPU key-value cache space. - # default is None and will be set as 4 GB - "VLLM_CPU_KVCACHE_SPACE": - lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")) - if "VLLM_CPU_KVCACHE_SPACE" in os.environ else None, - - # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", - # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. - "VLLM_CPU_OMP_THREADS_BIND": - lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"), - - # (CPU backend only) CPU cores not used by OMP threads. - # Those CPU cores will not be used by the OMP threads of a rank. - "VLLM_CPU_NUM_OF_RESERVED_CPU": - lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")) - if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ else None, - - # (CPU backend only) whether to use prepack for MoE layer. This will be - # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might - # need to set this to "0" (False). - "VLLM_CPU_MOE_PREPACK": - lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))), - - # (CPU backend only) whether to use SGL kernels, optimized for small batch. - "VLLM_CPU_SGL_KERNEL": - lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))), - - # If the env var is set, then all workers will execute as separate - # processes from the engine, and we use the same mechanism to trigger - # execution on all workers. - # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it. - "VLLM_USE_RAY_SPMD_WORKER": - lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))), - - # If the env var is set, it uses Ray's Compiled Graph - # (previously known as ADAG) API which optimizes the - # control plane overhead. - # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. - # Note that this variable is set to 1 in V1 by default - # when ray distributed executor is used. - "VLLM_USE_RAY_COMPILED_DAG": - lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))), - - # If the env var is set, Ray Compiled Graph uses the specified - # channel type to communicate between workers belonging to - # different pipeline-parallel stages. - # Available options: - # - "auto": use the default channel type - # - "nccl": use NCCL for communication - # - "shm": use shared memory and gRPC for communication - # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set. - "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": - lambda: os.getenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"), - - # If the env var is set, it enables GPU communication overlap - # (experimental feature) in Ray's Compiled Graph. This flag is ignored if - # VLLM_USE_RAY_COMPILED_DAG is not set.
- "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": - lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0")) - ), - - # If the env var is set, it uses a Ray Communicator wrapping - # vLLM's pipeline parallelism communicator to interact with Ray's - # Compiled Graph. Otherwise, it uses Ray's NCCL communicator. - # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set. - "VLLM_USE_RAY_WRAPPED_PP_COMM": - lambda: bool(int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))), - - # Use dedicated multiprocess context for workers. - # Both spawn and fork work - "VLLM_WORKER_MULTIPROC_METHOD": - lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"), - - # Path to the cache for storing downloaded assets - "VLLM_ASSETS_CACHE": - lambda: os.path.expanduser( - os.getenv( - "VLLM_ASSETS_CACHE", - os.path.join(get_default_cache_root(), "vllm", "assets"), - )), - - # Timeout for fetching images when serving multimodal models - # Default is 5 seconds - "VLLM_IMAGE_FETCH_TIMEOUT": - lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")), - - # Timeout for fetching videos when serving multimodal models - # Default is 30 seconds - "VLLM_VIDEO_FETCH_TIMEOUT": - lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")), - - # Timeout for fetching audio when serving multimodal models - # Default is 10 seconds - "VLLM_AUDIO_FETCH_TIMEOUT": - lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), - - # Max number of workers for the thread pool handling - # media bytes loading. Set to 1 to disable parallel processing. - # Default is 8 - "VLLM_MEDIA_LOADING_THREAD_COUNT": - lambda: int(os.getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")), - - # Maximum filesize in MB for a single audio file when processing - # speech-to-text requests. Files larger than this will be rejected. - # Default is 25 MB - "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": - lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")), - - # Backend for Video IO - # - "opencv": Default backend that uses OpenCV stream buffered backend. - # - # Custom backend implementations can be registered - # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and - # imported at runtime. - # If a non-existing backend is used, an AssertionError will be thrown. - "VLLM_VIDEO_LOADER_BACKEND": - lambda: os.getenv("VLLM_VIDEO_LOADER_BACKEND", "opencv"), - - # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache - # Default is 4 GiB per API process + 4 GiB per engine core process - "VLLM_MM_INPUT_CACHE_GIB": - lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), - - # Path to the XLA persistent cache directory. - # Only used for XLA devices such as TPUs. - "VLLM_XLA_CACHE_PATH": - lambda: os.path.expanduser( - os.getenv( - "VLLM_XLA_CACHE_PATH", - os.path.join(get_default_cache_root(), "vllm", "xla_cache"), - )), - - # If set, assert on XLA recompilation after each execution step. - "VLLM_XLA_CHECK_RECOMPILATION": - lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))), - - # Enable SPMD mode for TPU backend. - "VLLM_XLA_USE_SPMD": - lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))), - "VLLM_FUSED_MOE_CHUNK_SIZE": - lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")), - # Control whether to use fused MoE activation chunking. Current chunking - # logic is incompatible with torch.compile and causes IMA. See issue - # https://github.com/vllm-project/vllm/issues/19631. 
- "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING": - lambda: bool( - int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1"))), - - # If set, the OpenAI API server will stay alive even after the underlying - # AsyncLLMEngine errors and stops serving requests - "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": - lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)), - - # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows - # the user to specify a max sequence length greater than - # the max length derived from the model's config.json. - # To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": - lambda: - (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in - ("1", "true")), - - # If set, forces FP8 Marlin to be used for FP8 quantization regardless - # of the hardware support for FP8 compute. - "VLLM_TEST_FORCE_FP8_MARLIN": - lambda: - (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in - ("1", "true")), - "VLLM_TEST_FORCE_LOAD_FORMAT": - lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"), - - # Time in ms for the zmq client to wait for a response from the backend - # server for simple data operations - "VLLM_RPC_TIMEOUT": - lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")), - - # Timeout in seconds for keeping HTTP connections alive in API server - "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": - lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")), - - # a list of plugin names to load, separated by commas. - # if this is not set, it means all plugins will be loaded - # if this is set to an empty string, no plugins will be loaded - "VLLM_PLUGINS": - lambda: None if "VLLM_PLUGINS" not in os.environ else os.environ[ - "VLLM_PLUGINS"].split(","), - - # a local directory to look in for unrecognized LoRA adapters. - # only works if plugins are enabled and - # VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled. - "VLLM_LORA_RESOLVER_CACHE_DIR": - lambda: os.getenv("VLLM_LORA_RESOLVER_CACHE_DIR", None), - - # Enables torch profiler if set. - # Both AsyncLLM's CPU traces as well as workers' - # traces (CPU & GPU) will be saved under this directory. - # Note that it must be an absolute path. - "VLLM_TORCH_PROFILER_DIR": - lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os - .path.abspath(os.path.expanduser(os.getenv( - "VLLM_TORCH_PROFILER_DIR", ".")))), - - # Enable torch profiler to record shapes if set - # VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will - # not record shapes. - "VLLM_TORCH_PROFILER_RECORD_SHAPES": - lambda: bool(os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"), - - # Enable torch profiler to profile memory if set - # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler - # will not profile memory. - "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": - lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"), - - # Enable torch profiler to profile stack if set - # VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL - # profile stack by default. - "VLLM_TORCH_PROFILER_WITH_STACK": - lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"), - - # Enable torch profiler to profile flops if set - # VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will - # not profile flops. - "VLLM_TORCH_PROFILER_WITH_FLOPS": - lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"), - - # If set, vLLM will use Triton implementations of AWQ. 
- "VLLM_USE_TRITON_AWQ": - lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), - - # If set, allow loading or unloading lora adapters in runtime, - "VLLM_ALLOW_RUNTIME_LORA_UPDATING": - lambda: - (os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in - ("1", "true")), - - # We assume drivers can report p2p status correctly. - # If the program hangs when using custom allreduce, - # potantially caused by a bug in the driver (535 series), - # if might be helpful to set VLLM_SKIP_P2P_CHECK=0 - # so that vLLM can verify if p2p is actually working. - # See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa - "VLLM_SKIP_P2P_CHECK": - lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "1") == "1", - - # List of quantization kernels that should be disabled, used for testing - # and performance comparisons. Currently only affects MPLinearKernel - # selection - # (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel) - "VLLM_DISABLED_KERNELS": - lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[ - "VLLM_DISABLED_KERNELS"].split(","), - - # If set, use the V1 code path. - "VLLM_USE_V1": - lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))), - - # Disable aiter ops unless specifically enabled. - # Acts as a parent switch to enable the rest of the other operations. - "VLLM_ROCM_USE_AITER": - lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in - ("true", "1")), - - # Whether to use aiter paged attention. - # By default is disabled. - "VLLM_ROCM_USE_AITER_PAGED_ATTN": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in - ("true", "1")), - - # use aiter linear op if aiter ops are enabled - # The following list of related ops - # - scaled_mm (per-tensor / rowwise) - "VLLM_ROCM_USE_AITER_LINEAR": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in - ("true", "1")), - - # Whether to use aiter moe ops. - # By default is enabled. - "VLLM_ROCM_USE_AITER_MOE": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in - ("true", "1")), - - # use aiter rms norm op if aiter ops are enabled. - "VLLM_ROCM_USE_AITER_RMSNORM": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in - ("true", "1")), - - # Whether to use aiter mla ops. - # By default is enabled. - "VLLM_ROCM_USE_AITER_MLA": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_MLA", "True").lower() in - ("true", "1")), - - # Whether to use aiter mha ops. - # By default is enabled. 
- "VLLM_ROCM_USE_AITER_MHA": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in - ("true", "1")), - - # use rocm skinny gemms - "VLLM_ROCM_USE_SKINNY_GEMM": - lambda: (os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in - ("true", "1")), - - # Pad the fp8 weights to 256 bytes for ROCm - "VLLM_ROCM_FP8_PADDING": - lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))), - - # Pad the weights for the moe kernel - "VLLM_ROCM_MOE_PADDING": - lambda: bool(int(os.getenv("VLLM_ROCM_MOE_PADDING", "1"))), - - # custom paged attention kernel for MI3* cards - "VLLM_ROCM_CUSTOM_PAGED_ATTN": - lambda: (os.getenv("VLLM_ROCM_CUSTOM_PAGED_ATTN", "True").lower() in - ("true", "1")), - - # Custom quick allreduce kernel for MI3* cards - # Choice of quantization level: FP, INT8, INT6, INT4 or NONE - # Recommended for large models to get allreduce - "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": - lambda: os.getenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE").upper(), - - # Custom quick allreduce kernel for MI3* cards - # Due to the lack of the bfloat16 asm instruction, bfloat16 - # kernels are slower than fp16, - # If environment variable is set to 1, the input is converted to fp16 - "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16": - lambda: - (os.getenv("VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", "True").lower() in - ("true", "1")), - - # Custom quick allreduce kernel for MI3* cards. - # Controls the maximum allowed number of data bytes(MB) for custom quick - # allreduce communication. - # Default: 2048 MB. - # Data exceeding this size will use either custom allreduce or RCCL - # communication. - "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB": - lambda: maybe_convert_int( - os.environ.get("VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None)), - - # Divisor for dynamic query scale factor calculation for FP8 KV Cache - "Q_SCALE_CONSTANT": - lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")), - # Divisor for dynamic key scale factor calculation for FP8 KV Cache - "K_SCALE_CONSTANT": - lambda: int(os.getenv("K_SCALE_CONSTANT", "200")), - # Divisor for dynamic value scale factor calculation for FP8 KV Cache - "V_SCALE_CONSTANT": - lambda: int(os.getenv("V_SCALE_CONSTANT", "100")), - - # If set, enable multiprocessing in LLM for the V1 code path. - "VLLM_ENABLE_V1_MULTIPROCESSING": - lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))), - "VLLM_LOG_BATCHSIZE_INTERVAL": - lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), - "VLLM_DISABLE_COMPILE_CACHE": - lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))), - - # If set, vllm will run in development mode, which will enable - # some additional endpoints for developing and debugging, - # e.g. `/reset_prefix_cache` - "VLLM_SERVER_DEV_MODE": - lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))), - - # Controls the maximum number of requests to handle in a - # single asyncio task when processing per-token outputs in the - # V1 AsyncLLM interface. It is applicable when handling a high - # concurrency of streaming requests. - # Setting this too high can result in a higher variance of - # inter-message latencies. Setting it too low can negatively impact - # TTFT and overall throughput. - "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": - lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")), - - # If set, vLLM will disable the MLA attention optimizations. 
- "VLLM_MLA_DISABLE": - lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), - - # Number of GPUs per worker in Ray, if it is set to be a fraction, - # it allows ray to schedule multiple actors on a single GPU, - # so that users can colocate other actors on the same GPUs as vLLM. - "VLLM_RAY_PER_WORKER_GPUS": - lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")), - - # Bundle indices for Ray, if it is set, it can control precisely - # which indices are used for the Ray bundle, for every worker. - # Format: comma-separated list of integers, e.g. "0,1,2,3" - "VLLM_RAY_BUNDLE_INDICES": - lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""), - - # In some system, find_loaded_library() may not work. So we allow users to - # specify the path through environment variable VLLM_CUDART_SO_PATH. - "VLLM_CUDART_SO_PATH": - lambda: os.getenv("VLLM_CUDART_SO_PATH", None), - - # Rank of the process in the data parallel setting - "VLLM_DP_RANK": - lambda: int(os.getenv("VLLM_DP_RANK", "0")), - - # Rank of the process in the data parallel setting. - # Defaults to VLLM_DP_RANK when not set. - "VLLM_DP_RANK_LOCAL": - lambda: int( - os.getenv("VLLM_DP_RANK_LOCAL", sys.modules[__name__].VLLM_DP_RANK)), - - # World size of the data parallel setting - "VLLM_DP_SIZE": - lambda: int(os.getenv("VLLM_DP_SIZE", "1")), - - # IP address of the master node in the data parallel setting - "VLLM_DP_MASTER_IP": - lambda: os.getenv("VLLM_DP_MASTER_IP", "127.0.0.1"), - - # Port of the master node in the data parallel setting - "VLLM_DP_MASTER_PORT": - lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")), - - # In the context of executing MoE models with Data-Parallel, Expert-Parallel - # and Batched All-to-All dispatch/combine kernels, VLLM_MOE_DP_CHUNK_SIZE - # dictates the quantum of tokens that can be dispatched from a DP - # rank. All DP ranks process the activations in VLLM_MOE_DP_CHUNK_SIZE - # units. - "VLLM_MOE_DP_CHUNK_SIZE": - lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")), - - # Randomize inputs during dummy runs when using Data Parallel - "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": - lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1", - - # Whether to use S3 path for model loading in CI via RunAI Streamer - "VLLM_CI_USE_S3": - lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1", - - # Use model_redirect to redirect the model name to a local folder. - # `model_redirect` can be a json file mapping the model between - # repo_id and local folder: - # {"meta-llama/Llama-3.2-1B": "/tmp/Llama-3.2-1B"} - # or a space separated values table file: - # meta-llama/Llama-3.2-1B /tmp/Llama-3.2-1B - "VLLM_MODEL_REDIRECT_PATH": - lambda: os.environ.get("VLLM_MODEL_REDIRECT_PATH", None), - - # Whether to use atomicAdd reduce in gptq/awq marlin kernel. - "VLLM_MARLIN_USE_ATOMIC_ADD": - lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1", - - # Whether to use marlin kernel in mxfp4 quantization method - "VLLM_MXFP4_USE_MARLIN": - lambda: maybe_convert_bool(os.environ.get("VLLM_MXFP4_USE_MARLIN", None)), - - # Whether to turn on the outlines cache for V0 - # This cache is unbounded and on disk, so it's not safe to use in - # an environment with potentially malicious users. - "VLLM_V0_USE_OUTLINES_CACHE": - lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1", - - # Whether to turn on the outlines cache for V1 - # This cache is unbounded and on disk, so it's not safe to use in - # an environment with potentially malicious users. 
- "VLLM_V1_USE_OUTLINES_CACHE": - lambda: os.environ.get("VLLM_V1_USE_OUTLINES_CACHE", "0") == "1", - - # Gap between padding buckets for the forward pass. So we have - # 8, we will run forward pass with [16, 24, 32, ...]. - "VLLM_TPU_BUCKET_PADDING_GAP": - lambda: int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"]) - if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ else 0, - "VLLM_TPU_MOST_MODEL_LEN": - lambda: maybe_convert_int(os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)), - - # Whether using Pathways - "VLLM_TPU_USING_PATHWAYS": - lambda: bool("proxy" in os.getenv("JAX_PLATFORMS", "").lower()), - - # Allow use of DeepGemm kernels for fused moe ops. - "VLLM_USE_DEEP_GEMM": - lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), - - # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs. - # E8M0 is faster on B200 but may reduce accuracy. - "VLLM_USE_DEEP_GEMM_E8M0": - lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))), - # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm - # JIT all the required kernels before model execution so there is no - # JIT'ing in the hot-path. However, this warmup increases the engine - # startup time by a couple of minutes. - # Set `VLLM_SKIP_DEEP_GEMM_WARMUP` to disable the warmup. - "VLLM_SKIP_DEEP_GEMM_WARMUP": - lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))), - - # Whether to use fused grouped_topk used for MoE expert selection. - "VLLM_USE_FUSED_MOE_GROUPED_TOPK": - lambda: bool(int(os.getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))), - - # Allow use of FlashInfer MoE kernels for fused moe ops. - "VLLM_USE_FLASHINFER_MOE_FP8": - lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))), - - # Allow use of FlashInfer CUTLASS kernels for fused moe ops. - "VLLM_USE_FLASHINFER_MOE_FP4": - lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))), - - # If set to 1, use the FlashInfer - # MXFP8 (activation) x MXFP4 (weight) MoE backend. - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": - lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))), - - # If set to 1, use the FlashInfer - # BF16 (activation) x MXFP4 (weight) MoE backend. - "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": - lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))), - - # Control the cache sized used by the xgrammar compiler. The default - # of 512 MB should be enough for roughly 1000 JSON schemas. - # It can be changed with this variable if needed for some reason. - "VLLM_XGRAMMAR_CACHE_MB": - lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")), - - # Control the threshold for msgspec to use 'zero copy' for - # serialization/deserialization of tensors. Tensors below - # this limit will be encoded into the msgpack buffer, and - # tensors above will instead be sent via a separate message. - # While the sending side still actually copies the tensor - # in all cases, on the receiving side, tensors above this - # limit will actually be zero-copy decoded. - "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": - lambda: int(os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")), - - # If set, allow insecure serialization using pickle. - # This is useful for environments where it is deemed safe to use the - # insecure method and it is needed for some reason. - "VLLM_ALLOW_INSECURE_SERIALIZATION": - lambda: bool(int(os.getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0"))), - - # IP address used for NIXL handshake between remote agents. 
- "VLLM_NIXL_SIDE_CHANNEL_HOST": - lambda: os.getenv("VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"), - - # Port used for NIXL handshake between remote agents. - "VLLM_NIXL_SIDE_CHANNEL_PORT": - lambda: int(os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")), - - # all2all backend for vllm's expert parallel communication - # Available options: - # - "naive": naive all2all implementation using all-reduce - # - "pplx": use pplx kernels - # - "deepep_high_throughput", use deepep high-throughput kernels - # - "deepep_low_latency", use deepep low-latency kernels - "VLLM_ALL2ALL_BACKEND": - lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), - - # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. Both - # require compute capability 10.0 or above. - # Available options: - # - "throughput": [default] - # Uses CUTLASS kernels optimized for high-throughput batch inference. - # - "latency": - # Uses TensorRT-LLM kernels optimized for low-latency inference. - # To set this backend, define the environment variable: - # export VLLM_FLASHINFER_MOE_BACKEND=latency. - # If not set, defaults to "throughput". - "VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv( - "VLLM_FLASHINFER_MOE_BACKEND", "throughput" - ), - - # Control the maximum number of tokens per expert supported by the - # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for - # the blockscale tensor of activations NVFP4 Quantization. - # This is used to prevent the kernel from running out of memory. - "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": - lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")), - - # MoE routing strategy selector. - # See `RoutingSimulator.get_available_strategies()` # for available - # strategies. - # Cutstom routing strategies can be registered by - # RoutingSimulator.register_strategy() - # Note: custom strategies may not produce correct model outputs - "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": - lambda: os.environ.get("VLLM_MOE_ROUTING_SIMULATION_STRATEGY", "").lower(), - - # Regex timeout for use by the vLLM tool parsing plugins. - "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": - lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")), - - # Reduce CPU usage when vLLM is idle. Enabling this will incur small - # latency penalty when a request eventually comes. - "VLLM_SLEEP_WHEN_IDLE": - lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))), - - # Control the max chunk bytes (in MB) for the rpc message queue. - # Object larger than this threshold will be broadcast to worker - # processes via zmq. - "VLLM_MQ_MAX_CHUNK_BYTES_MB": - lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")), - - # Timeout in seconds for execute_model RPC calls in multiprocessing - # executor (only applies when TP > 1). - "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": - lambda: int(os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")), - - # KV Cache layout used throughout vllm. - # Some common values are: - # - NHD - # - HND - # Where N=num_blocks, H=num_heads and D=head_size. The default value will - # leave the layout choice to the backend. Mind that backends may only - # implement and support a subset of all possible layouts. - "VLLM_KV_CACHE_LAYOUT": - lambda: os.getenv("VLLM_KV_CACHE_LAYOUT", None), - - # Enable checking whether the generated logits contain NaNs, - # indicating corrupted output. Useful for debugging low level bugs - # or bad hardware but it may add compute overhead. 
- "VLLM_COMPUTE_NANS_IN_LOGITS": - lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))), - - # Controls whether or not emulations are used for NVFP4 - # generations on machines < 100 for compressed-tensors - # models - "VLLM_USE_NVFP4_CT_EMULATIONS": - lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))), - - # Time (in seconds) after which the KV cache on the producer side is - # automatically cleared if no READ notification is received from the - # consumer. This is only applicable when using NixlConnector in a - # disaggregated decode-prefill setup. - "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": - lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")), - - # Controls whether or not to use cudnn prefill - "VLLM_USE_CUDNN_PREFILL": - lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))), - - # If set to 1, use the TRTLLM attention backend in flashinfer. - "VLLM_USE_TRTLLM_ATTENTION": - lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), - - # If set, it means we pre-downloaded cubin files and flashinfer will - # read the cubin files directly. - "VLLM_HAS_FLASHINFER_CUBIN": - lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False), - - # If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer. - # Otherwise, uses the first available of: flashinfer cutlass GEMM, - # vllm cutlass GEMM, marlin GEMM. - "VLLM_USE_TRTLLM_FP4_GEMM": - lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_FP4_GEMM", "0"))), - - # Controls garbage collection during CUDA graph capture. - # If set to 0 (default), enables GC freezing to speed up capture time. - # If set to 1, allows GC to run during capture. - "VLLM_ENABLE_CUDAGRAPH_GC": - lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))), - - # Used to force set up loopback IP - "VLLM_LOOPBACK_IP": - lambda: os.getenv("VLLM_LOOPBACK_IP", ""), - - # Used to set the process name prefix for vLLM processes. - # This is useful for debugging and monitoring purposes. - # The default value is "VLLM". - "VLLM_PROCESS_NAME_PREFIX": - lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"), - - # Allow chunked local attention with hybrid kv cache manager. - # Currently using the Hybrid KV cache manager with chunked local attention - # in the Llama4 models (the only models currently using chunked local attn) - # causes a latency regression. For this reason, we disable it by default. - # This flag is used to allow users to enable it if they want to (to save on - # kv-cache memory usage and enable longer contexts) - # TODO(lucas): Remove this flag once latency regression is resolved. - "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": - lambda: bool(int(os.getenv(\ - "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))), - - # Enables support for the "store" option in the OpenAI Responses API. - # When set to 1, vLLM's OpenAI server will retain the input and output - # messages for those requests in memory. By default, this is disabled (0), - # and the "store" option is ignored. - # NOTE/WARNING: - # 1. Messages are kept in memory only (not persisted to disk) and will be - # lost when the vLLM server shuts down. - # 2. Enabling this option will cause a memory leak, as stored messages are - # never removed from memory until the server terminates. 
- "VLLM_ENABLE_RESPONSES_API_STORE": - lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))), - - # Whether to use pytorch symmetric memory for allreduce - "VLLM_ALLREDUCE_USE_SYMM_MEM": - lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))), - - # Allows vllm to find tuned config under customized folder - "VLLM_TUNED_CONFIG_FOLDER": - lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), - -} - -# --8<-- [end:env-vars-definition] - - -def __getattr__(name: str): - # lazy evaluation of environment variables - if name in environment_variables: - return environment_variables[name]() - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - - -def __dir__(): - return list(environment_variables.keys()) - - -def is_set(name: str): - """Check if an environment variable is explicitly set.""" - if name in environment_variables: - return name in os.environ - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - - -def set_vllm_use_v1(use_v1: bool): - if is_set("VLLM_USE_V1"): - raise ValueError( - "Should not call set_vllm_use_v1() if VLLM_USE_V1 is set " - "explicitly by the user. Please raise this as a Github " - "Issue and explicitly set VLLM_USE_V1=0 or 1.") - os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" - - -def compute_hash() -> str: - """ - WARNING: Whenever a new key is added to this environment - variables, ensure that it is included in the factors list if - it affects the computation graph. For example, different values - of VLLM_PP_LAYER_PARTITION will generate different computation - graphs, so it is included in the factors list. The env vars that - affect the choice of different kernels or attention backends should - also be included in the factors list. - """ - - # The values of envs may affects the computation graph. - # TODO(DefTruth): hash all environment variables? 
- # for key in environment_variables: - # factorize(key) - environment_variables_to_hash = [ - "VLLM_PP_LAYER_PARTITION", - "VLLM_MLA_DISABLE", - "VLLM_USE_TRITON_FLASH_ATTN", - "VLLM_USE_TRITON_AWQ", - "VLLM_DP_RANK", - "VLLM_DP_SIZE", - "VLLM_USE_STANDALONE_COMPILE", - "VLLM_FUSED_MOE_CHUNK_SIZE", - "VLLM_FLASHINFER_MOE_BACKEND", - "VLLM_V1_USE_PREFILL_DECODE_ATTENTION", - "VLLM_USE_AITER_UNIFIED_ATTENTION", - "VLLM_ATTENTION_BACKEND", - "VLLM_USE_FLASHINFER_SAMPLER", - "VLLM_DISABLED_KERNELS", - "VLLM_USE_DEEP_GEMM", - "VLLM_USE_TRTLLM_FP4_GEMM", - "VLLM_USE_FUSED_MOE_GROUPED_TOPK", - "VLLM_USE_FLASHINFER_MOE_FP8", - "VLLM_USE_FLASHINFER_MOE_FP4", - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", - "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", - "VLLM_USE_CUDNN_PREFILL", - "VLLM_USE_TRTLLM_ATTENTION", - "VLLM_ROCM_USE_AITER", - "VLLM_ROCM_USE_AITER_PAGED_ATTN", - "VLLM_ROCM_USE_AITER_LINEAR", - "VLLM_ROCM_USE_AITER_MOE", - "VLLM_ROCM_USE_AITER_RMSNORM", - "VLLM_ROCM_USE_AITER_MLA", - "VLLM_ROCM_USE_AITER_MHA", - "VLLM_ROCM_USE_SKINNY_GEMM", - "VLLM_ROCM_FP8_PADDING", - "VLLM_ROCM_MOE_PADDING", - "VLLM_ROCM_CUSTOM_PAGED_ATTN", - "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", - "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", - "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", - ] - for key in environment_variables_to_hash: - # if this goes out of sync with environment_variables, - # it's not a user error, it's a bug - assert key in environment_variables, \ - "Please update environment_variables_to_hash in envs.py" - - factors = [ - environment_variables[key]() for key in environment_variables_to_hash - ] - - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - - return hash_str
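The factor-hashing idea behind `compute_hash` is easiest to see in isolation: evaluate the env vars that can change the compiled graph and fold their current values into a digest, so compile caches are keyed on them. A minimal sketch, assuming a hypothetical two-entry factor list (the real list above is much longer):

```python
# Minimal sketch of the compute_hash() pattern: the factor list is
# illustrative, not the real vLLM registry.
import hashlib
import os

factors_to_hash = {
    "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND"),
    "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
}


def compute_hash() -> str:
    # Iterate in a fixed order so the digest is stable across runs.
    factors = [factors_to_hash[key]() for key in sorted(factors_to_hash)]
    return hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()


print(compute_hash())
os.environ["VLLM_USE_DEEP_GEMM"] = "1"
print(compute_hash())  # different digest -> the compile cache key changes
```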
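Worth noting for anyone reusing this pattern elsewhere: the module-level `__getattr__`/`__dir__` pair (PEP 562) is what makes every `envs.SOMETHING` access re-read `os.environ` instead of freezing values at import time, which is why tests can mutate the environment and immediately observe the new value. A condensed, runnable sketch with hypothetical variable names:

```python
# Condensed sketch of the lazy-evaluation pattern in envs.py. The DEMO_*
# names are illustrative, not real vLLM variables.
import os
import sys
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    "DEMO_LOG_LEVEL": lambda: os.getenv("DEMO_LOG_LEVEL", "INFO").upper(),
    "DEMO_USE_V1": lambda: bool(int(os.getenv("DEMO_USE_V1", "1"))),
}


def __getattr__(name: str):
    # Invoked on module attribute lookup; the lambda runs at access time.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


this = sys.modules[__name__]
print(this.DEMO_LOG_LEVEL)  # -> "INFO"
os.environ["DEMO_LOG_LEVEL"] = "debug"
print(this.DEMO_LOG_LEVEL)  # -> "DEBUG" (picked up lazily, no re-import)
```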
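One subtlety the file inherits: it mixes two boolean-parsing idioms, and they accept different inputs. The `.lower() in ("true", "1")` form tolerates `True`/`true`, while `bool(int(...))` raises on anything that is not an integer literal. A quick demonstration (the `FLAG` name is hypothetical):

```python
# The two boolean idioms used throughout the registry are not equivalent.
import os

os.environ["FLAG"] = "True"
print(os.environ["FLAG"].lower() in ("true", "1"))  # -> True
try:
    bool(int(os.environ["FLAG"]))
except ValueError as e:
    print("int-style parser rejects 'True':", e)
```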