From 1c3d99d6a351d3bb7f5865cce62f93a9a113a10d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 26 Aug 2025 01:22:13 +0000 Subject: [PATCH] Implement standardized environment variable parsing as requested Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> --- vllm/envs/__init__.py | 728 ++++++++-------------------------------- vllm/envs/_variables.py | 24 +- 2 files changed, 147 insertions(+), 605 deletions(-) diff --git a/vllm/envs/__init__.py b/vllm/envs/__init__.py index 4c96136b60ef4..da5fe322311bc 100644 --- a/vllm/envs/__init__.py +++ b/vllm/envs/__init__.py @@ -28,24 +28,17 @@ Usage: """ import hashlib -import importlib.util import os -import sys import tempfile -from typing import Any, Callable, Optional +from typing import TYPE_CHECKING, get_type_hints, Type, Union, get_origin, get_args, Optional from urllib.parse import urlparse -# Import variable definitions - handle both relative and absolute imports -try: - from . import _variables -except (ImportError, SystemError): - # Fall back to absolute import for testing or when imported directly - import importlib.util - import os - _variables_path = os.path.join(os.path.dirname(__file__), '_variables.py') - spec = importlib.util.spec_from_file_location('_variables', _variables_path) - _variables = importlib.util.module_from_spec(spec) - spec.loader.exec_module(_variables) +from . import _variables +from ._variables import __defaults as _env_defaults + +if TYPE_CHECKING: + # This way IDEs & type checkers get the declarations directly + from ._variables import * def get_default_cache_root() -> str: @@ -64,29 +57,21 @@ def get_default_config_root() -> str: ) -def maybe_convert_int(value: Optional[str]) -> Optional[int]: - """Convert string to int if not None.""" - if value is None: - return None - return int(value) +def _unwrap_optional(type_: Type) -> Type: + """Unwrap Optional[T] to get T.""" + origin = get_origin(type_) + if origin is not Union: + return type_ + + args = get_args(type_) + if len(args) != 2 or type(None) not in args: + raise ValueError("Unions not currently supported") + + return next(arg for arg in args if arg is not type(None)) -def maybe_convert_bool(value: Optional[str]) -> Optional[bool]: - """Convert string to bool if not None.""" - if value is None: - return None - return bool(int(value)) - - -def get_vllm_port() -> Optional[int]: - """Get the port from VLLM_PORT environment variable. - - Returns: - The port number as an integer if VLLM_PORT is set, None otherwise. - - Raises: - ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue. - """ +def _get_vllm_port() -> Optional[int]: + """Get the port from VLLM_PORT environment variable with special validation.""" if 'VLLM_PORT' not in os.environ: return None @@ -106,577 +91,127 @@ def get_vllm_port() -> Optional[int]: f"VLLM_PORT '{port}' must be a valid integer") from err -# Environment variable lookup functions -# These lambdas provide the actual environment variable lookup with type conversion -environment_variables: dict[str, Callable[[], Any]] = { - # Installation Time Env Vars - "VLLM_TARGET_DEVICE": - lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), +def _parse_list_value(value: str) -> list[str]: + """Parse comma-separated string into list.""" + if not value: + return [] + return [item.strip() for item in value.split(',') if item.strip()] - "MAX_JOBS": - lambda: os.getenv("MAX_JOBS", None), - "NVCC_THREADS": - lambda: os.getenv("NVCC_THREADS", None), - - "VLLM_USE_PRECOMPILED": - lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in - ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), - - "VLLM_DOCKER_BUILD_CONTEXT": - lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in - ("1", "true"), - - "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": - lambda: bool(int(os.getenv("VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL", "0")) - ), - - "CMAKE_BUILD_TYPE": - lambda: os.getenv("CMAKE_BUILD_TYPE"), - - "VERBOSE": - lambda: bool(int(os.getenv('VERBOSE', '0'))), - - "VLLM_CONFIG_ROOT": - lambda: os.path.expanduser( - os.getenv( - "VLLM_CONFIG_ROOT", - os.path.join(get_default_config_root(), "vllm"), - )), - - # Runtime Env Vars - "VLLM_CACHE_ROOT": - lambda: os.path.expanduser( - os.getenv( - "VLLM_CACHE_ROOT", - os.path.join(get_default_cache_root(), "vllm"), - )), - - 'VLLM_HOST_IP': - lambda: os.getenv('VLLM_HOST_IP', ""), - - 'VLLM_PORT': - get_vllm_port, - - 'VLLM_RPC_BASE_PATH': - lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()), - - "VLLM_USE_MODELSCOPE": - lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true", - - "VLLM_RINGBUFFER_WARNING_INTERVAL": - lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")), - - "CUDA_HOME": - lambda: os.environ.get("CUDA_HOME", None), - - "VLLM_NCCL_SO_PATH": - lambda: os.environ.get("VLLM_NCCL_SO_PATH", None), - - "LD_LIBRARY_PATH": - lambda: os.environ.get("LD_LIBRARY_PATH", None), - - "VLLM_USE_TRITON_FLASH_ATTN": - lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in - ("true", "1")), - - "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": - lambda: - (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in - ("true", "1")), - - "VLLM_USE_AITER_UNIFIED_ATTENTION": - lambda: - (os.getenv("VLLM_USE_AITER_UNIFIED_ATTENTION", "False").lower() in - ("true", "1")), - - "VLLM_FLASH_ATTN_VERSION": - lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)), - - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": - lambda: bool( - os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), - - "VLLM_USE_STANDALONE_COMPILE": - lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1", - - "LOCAL_RANK": - lambda: int(os.environ.get("LOCAL_RANK", "0")), - - "CUDA_VISIBLE_DEVICES": - lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None), - - "VLLM_ENGINE_ITERATION_TIMEOUT_S": - lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")), - - "VLLM_API_KEY": - lambda: os.environ.get("VLLM_API_KEY", None), - - "VLLM_DEBUG_LOG_API_SERVER_RESPONSE": - lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False" - ).lower() == "true", - - "S3_ACCESS_KEY_ID": - lambda: os.environ.get("S3_ACCESS_KEY_ID", None), - "S3_SECRET_ACCESS_KEY": - lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None), - "S3_ENDPOINT_URL": - lambda: os.environ.get("S3_ENDPOINT_URL", None), - - "VLLM_USAGE_STATS_SERVER": - lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"), - "VLLM_NO_USAGE_STATS": - lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", - "VLLM_DO_NOT_TRACK": - lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get( - "DO_NOT_TRACK", None) or "0") == "1", - "VLLM_USAGE_SOURCE": - lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"), - - "VLLM_CONFIGURE_LOGGING": - lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")), - "VLLM_LOGGING_CONFIG_PATH": - lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"), - - "VLLM_LOGGING_LEVEL": - lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO").upper(), - - "VLLM_LOGGING_PREFIX": - lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), - - "VLLM_LOGITS_PROCESSOR_THREADS": - lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")) - if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None, - - "VLLM_LOG_STATS_INTERVAL": - lambda: val if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) - > 0. else 10., - - "VLLM_TRACE_FUNCTION": - lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")), - - "VLLM_ATTENTION_BACKEND": - lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), - - "VLLM_USE_FLASHINFER_SAMPLER": - lambda: bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])) - if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ else None, - - "VLLM_PP_LAYER_PARTITION": - lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), - - "VLLM_CPU_KVCACHE_SPACE": - lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")) - if "VLLM_CPU_KVCACHE_SPACE" in os.environ else None, - - "VLLM_CPU_OMP_THREADS_BIND": - lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"), - - "VLLM_CPU_NUM_OF_RESERVED_CPU": - lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")) - if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ else None, - - "VLLM_CPU_MOE_PREPACK": - lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))), - - "VLLM_CPU_SGL_KERNEL": - lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))), - - "VLLM_USE_RAY_SPMD_WORKER": - lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))), - - "VLLM_USE_RAY_COMPILED_DAG": - lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))), - - "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": - lambda: os.getenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"), - - "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": - lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0")) - ), - - "VLLM_USE_RAY_WRAPPED_PP_COMM": - lambda: bool(int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))), - - "VLLM_WORKER_MULTIPROC_METHOD": - lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"), - - "VLLM_ASSETS_CACHE": - lambda: os.path.expanduser( - os.getenv( - "VLLM_ASSETS_CACHE", - os.path.join(get_default_cache_root(), "vllm", "assets"), - )), - - "VLLM_IMAGE_FETCH_TIMEOUT": - lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")), - - "VLLM_VIDEO_FETCH_TIMEOUT": - lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")), - - "VLLM_AUDIO_FETCH_TIMEOUT": - lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), - - "VLLM_MEDIA_LOADING_THREAD_COUNT": - lambda: int(os.getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")), - - "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": - lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")), - - "VLLM_VIDEO_LOADER_BACKEND": - lambda: os.getenv("VLLM_VIDEO_LOADER_BACKEND", "opencv"), - - "VLLM_MM_INPUT_CACHE_GIB": - lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), - - "VLLM_XLA_CACHE_PATH": - lambda: os.path.expanduser( - os.getenv( - "VLLM_XLA_CACHE_PATH", - os.path.join(get_default_cache_root(), "vllm", "xla_cache"), - )), - - "VLLM_XLA_CHECK_RECOMPILATION": - lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))), - - "VLLM_XLA_USE_SPMD": - lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))), - "VLLM_FUSED_MOE_CHUNK_SIZE": - lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")), - "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING": - lambda: bool( - int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1"))), - - "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": - lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)), - - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": - lambda: - (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in - ("1", "true")), - - "VLLM_TEST_FORCE_FP8_MARLIN": - lambda: - (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in - ("1", "true")), - "VLLM_TEST_FORCE_LOAD_FORMAT": - lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"), - - "VLLM_RPC_TIMEOUT": - lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")), - - "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": - lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")), - - "VLLM_PLUGINS": - lambda: None if "VLLM_PLUGINS" not in os.environ else os.environ[ - "VLLM_PLUGINS"].split(","), - - "VLLM_LORA_RESOLVER_CACHE_DIR": - lambda: os.getenv("VLLM_LORA_RESOLVER_CACHE_DIR", None), - - "VLLM_TORCH_PROFILER_DIR": - lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os - .path.abspath(os.path.expanduser(os.getenv( - "VLLM_TORCH_PROFILER_DIR", ".")))), - - "VLLM_TORCH_PROFILER_RECORD_SHAPES": - lambda: bool(os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"), - - "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": - lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"), - - "VLLM_TORCH_PROFILER_WITH_STACK": - lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"), - - "VLLM_TORCH_PROFILER_WITH_FLOPS": - lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"), - - "VLLM_USE_TRITON_AWQ": - lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), - - "VLLM_ALLOW_RUNTIME_LORA_UPDATING": - lambda: - (os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in - ("1", "true")), - - "VLLM_SKIP_P2P_CHECK": - lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "1") == "1", - - "VLLM_DISABLED_KERNELS": - lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[ - "VLLM_DISABLED_KERNELS"].split(","), - - "VLLM_USE_V1": - lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))), - - "VLLM_ROCM_USE_AITER": - lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in - ("true", "1")), - - "VLLM_ROCM_USE_AITER_PAGED_ATTN": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in - ("true", "1")), - - "VLLM_ROCM_USE_AITER_LINEAR": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in - ("true", "1")), - - "VLLM_ROCM_USE_AITER_MOE": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in - ("true", "1")), - - "VLLM_ROCM_USE_AITER_RMSNORM": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in - ("true", "1")), - - "VLLM_ROCM_USE_AITER_MLA": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_MLA", "True").lower() in - ("true", "1")), - - "VLLM_ROCM_USE_AITER_MHA": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in - ("true", "1")), - - "VLLM_ROCM_USE_SKINNY_GEMM": - lambda: (os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in - ("true", "1")), - - "VLLM_ROCM_FP8_PADDING": - lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))), - - "VLLM_ROCM_MOE_PADDING": - lambda: bool(int(os.getenv("VLLM_ROCM_MOE_PADDING", "1"))), - - "VLLM_ROCM_CUSTOM_PAGED_ATTN": - lambda: (os.getenv("VLLM_ROCM_CUSTOM_PAGED_ATTN", "True").lower() in - ("true", "1")), - - "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": - lambda: os.getenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE").upper(), - - "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16": - lambda: - (os.getenv("VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", "True").lower() in - ("true", "1")), - - "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB": - lambda: maybe_convert_int( - os.environ.get("VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None)), - - "Q_SCALE_CONSTANT": - lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")), - "K_SCALE_CONSTANT": - lambda: int(os.getenv("K_SCALE_CONSTANT", "200")), - "V_SCALE_CONSTANT": - lambda: int(os.getenv("V_SCALE_CONSTANT", "100")), - - "VLLM_ENABLE_V1_MULTIPROCESSING": - lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))), - "VLLM_LOG_BATCHSIZE_INTERVAL": - lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), - "VLLM_DISABLE_COMPILE_CACHE": - lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))), - - "VLLM_SERVER_DEV_MODE": - lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))), - - "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": - lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")), - - "VLLM_MLA_DISABLE": - lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), - - "VLLM_RAY_PER_WORKER_GPUS": - lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")), - - "VLLM_RAY_BUNDLE_INDICES": - lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""), - - "VLLM_CUDART_SO_PATH": - lambda: os.getenv("VLLM_CUDART_SO_PATH", None), - - "VLLM_DP_RANK": - lambda: int(os.getenv("VLLM_DP_RANK", "0")), - - "VLLM_DP_RANK_LOCAL": - lambda: int( - os.getenv("VLLM_DP_RANK_LOCAL", os.getenv("VLLM_DP_RANK", "0"))), - - "VLLM_DP_SIZE": - lambda: int(os.getenv("VLLM_DP_SIZE", "1")), - - "VLLM_DP_MASTER_IP": - lambda: os.getenv("VLLM_DP_MASTER_IP", "127.0.0.1"), - - "VLLM_DP_MASTER_PORT": - lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")), - - "VLLM_MOE_DP_CHUNK_SIZE": - lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")), - - "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": - lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1", - - "VLLM_CI_USE_S3": - lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1", - - "VLLM_MODEL_REDIRECT_PATH": - lambda: os.environ.get("VLLM_MODEL_REDIRECT_PATH", None), - - "VLLM_MARLIN_USE_ATOMIC_ADD": - lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1", - - "VLLM_MXFP4_USE_MARLIN": - lambda: maybe_convert_bool(os.environ.get("VLLM_MXFP4_USE_MARLIN", None)), - - "VLLM_V0_USE_OUTLINES_CACHE": - lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1", - - "VLLM_V1_USE_OUTLINES_CACHE": - lambda: os.environ.get("VLLM_V1_USE_OUTLINES_CACHE", "0") == "1", - - "VLLM_TPU_BUCKET_PADDING_GAP": - lambda: int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"]) - if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ else 0, - "VLLM_TPU_MOST_MODEL_LEN": - lambda: maybe_convert_int(os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)), - - "VLLM_TPU_USING_PATHWAYS": - lambda: bool("proxy" in os.getenv("JAX_PLATFORMS", "").lower()), - - "VLLM_USE_DEEP_GEMM": - lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), - - "VLLM_USE_DEEP_GEMM_E8M0": - lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))), - "VLLM_SKIP_DEEP_GEMM_WARMUP": - lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))), - - "VLLM_USE_FUSED_MOE_GROUPED_TOPK": - lambda: bool(int(os.getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))), - - "VLLM_USE_FLASHINFER_MOE_FP8": - lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))), - - "VLLM_USE_FLASHINFER_MOE_FP4": - lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))), - - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": - lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))), - - "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": - lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))), - - "VLLM_XGRAMMAR_CACHE_MB": - lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")), - - "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": - lambda: int(os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")), - - "VLLM_ALLOW_INSECURE_SERIALIZATION": - lambda: bool(int(os.getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0"))), - - "VLLM_NIXL_SIDE_CHANNEL_HOST": - lambda: os.getenv("VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"), - - "VLLM_NIXL_SIDE_CHANNEL_PORT": - lambda: int(os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")), - - "VLLM_ALL2ALL_BACKEND": - lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), - - "VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv( - "VLLM_FLASHINFER_MOE_BACKEND", "throughput" - ), - - "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": - lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")), - - "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": - lambda: os.environ.get("VLLM_MOE_ROUTING_SIMULATION_STRATEGY", "").lower(), - - "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": - lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")), - - "VLLM_SLEEP_WHEN_IDLE": - lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))), - - "VLLM_MQ_MAX_CHUNK_BYTES_MB": - lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")), - - "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": - lambda: int(os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")), - - "VLLM_KV_CACHE_LAYOUT": - lambda: os.getenv("VLLM_KV_CACHE_LAYOUT", None), - - "VLLM_COMPUTE_NANS_IN_LOGITS": - lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))), - - "VLLM_USE_NVFP4_CT_EMULATIONS": - lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))), - - "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": - lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")), - - "VLLM_USE_CUDNN_PREFILL": - lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))), - - "VLLM_USE_TRTLLM_ATTENTION": - lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), - - "VLLM_HAS_FLASHINFER_CUBIN": - lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False), - - "VLLM_USE_TRTLLM_FP4_GEMM": - lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_FP4_GEMM", "0"))), - - "VLLM_ENABLE_CUDAGRAPH_GC": - lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))), - - "VLLM_LOOPBACK_IP": - lambda: os.getenv("VLLM_LOOPBACK_IP", ""), - - "VLLM_PROCESS_NAME_PREFIX": - lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"), - - "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": - lambda: bool(int(os.getenv(\ - "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))), - - "VLLM_ENABLE_RESPONSES_API_STORE": - lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))), - - "VLLM_ALLREDUCE_USE_SYMM_MEM": - lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))), - - "VLLM_TUNED_CONFIG_FOLDER": - lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), -} +_type_hints = get_type_hints(_variables) def __getattr__(name: str): - """Lazy evaluation of environment variables.""" - if name in environment_variables: - return environment_variables[name]() - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + """Lazy evaluation of environment variables with standardized parsing.""" + if name not in _env_defaults: + raise AttributeError(f"module {__name__} has no attribute {name}") + + # Special handling for complex variables + if name == "VLLM_PORT": + return _get_vllm_port() + + # Handle variables that need path expansion + if name == "VLLM_CONFIG_ROOT": + return os.path.expanduser( + os.getenv( + "VLLM_CONFIG_ROOT", + os.path.join(get_default_config_root(), "vllm"), + ) + ) + + if name == "VLLM_CACHE_ROOT": + return os.path.expanduser( + os.getenv( + "VLLM_CACHE_ROOT", + os.path.join(get_default_cache_root(), "vllm"), + ) + ) + + if name == "VLLM_ASSETS_CACHE": + return os.path.expanduser( + os.getenv( + "VLLM_ASSETS_CACHE", + os.path.join(get_default_cache_root(), "vllm", "assets"), + ) + ) + + if name == "VLLM_XLA_CACHE_PATH": + return os.path.expanduser( + os.getenv( + "VLLM_XLA_CACHE_PATH", + os.path.join(get_default_cache_root(), "vllm", "xla_cache"), + ) + ) + + if name == "VLLM_RPC_BASE_PATH": + return os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()) + + # Handle special cases for compound logic + if name == "VLLM_USE_PRECOMPILED": + return (os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in + ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION"))) + + if name == "VLLM_DO_NOT_TRACK": + return (os.environ.get("VLLM_DO_NOT_TRACK", None) or + os.environ.get("DO_NOT_TRACK", None) or "0") == "1" + + if name == "VLLM_DP_RANK_LOCAL": + return int(os.getenv("VLLM_DP_RANK_LOCAL", os.getenv("VLLM_DP_RANK", "0"))) + + if name == "VLLM_TPU_USING_PATHWAYS": + return bool("proxy" in os.getenv("JAX_PLATFORMS", "").lower()) + + if name == "VLLM_TORCH_PROFILER_DIR": + value = os.getenv("VLLM_TORCH_PROFILER_DIR", None) + return None if value is None else os.path.abspath(os.path.expanduser(value)) + + # Get environment value + env_value = os.getenv(name) + if env_value is None: + return _env_defaults[name] + + # Get type for this variable + var_type = _type_hints[name] + var_type = _unwrap_optional(var_type) + + # Parse based on type + if var_type is str: + # Handle special string parsing + if name == "VLLM_TARGET_DEVICE": + return env_value.lower() + if name == "VLLM_LOGGING_LEVEL": + return env_value.upper() + if name == "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": + return env_value.upper() + if name == "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": + return env_value.lower() + return env_value + + if var_type is bool: + return env_value.lower() in ("1", "true") + + if var_type in (int, float): + return var_type(env_value) + + if var_type == list[str] or (hasattr(var_type, '__origin__') and var_type.__origin__ is list): + return _parse_list_value(env_value) + + raise ValueError(f"Unsupported type {var_type} for environment variable {name}") def __dir__(): """Return list of available environment variables.""" - return list(environment_variables.keys()) + return list(_env_defaults.keys()) -def is_set(name: str): +def is_set(name: str) -> bool: """Check if an environment variable is explicitly set.""" - if name in environment_variables: - return name in os.environ - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + if name not in _env_defaults: + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + return name in os.environ -def set_vllm_use_v1(use_v1: bool): +def set_vllm_use_v1(use_v1: bool) -> None: """Set VLLM_USE_V1 environment variable.""" if is_set("VLLM_USE_V1"): raise ValueError( @@ -738,14 +273,15 @@ def compute_hash() -> str: "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", ] + for key in environment_variables_to_hash: - # if this goes out of sync with environment_variables, + # if this goes out of sync with _env_defaults, # it's not a user error, it's a bug - assert key in environment_variables, \ - "Please update environment_variables_to_hash in envs/__init__.py" + assert key in _env_defaults, \ + f"Please update environment_variables_to_hash in envs/__init__.py. Missing: {key}" factors = [ - environment_variables[key]() for key in environment_variables_to_hash + getattr(__import__(__name__), key) for key in environment_variables_to_hash ] hash_str = hashlib.md5(str(factors).encode(), diff --git a/vllm/envs/_variables.py b/vllm/envs/_variables.py index a7783adc9ea3c..6a8f7ff88d8d7 100644 --- a/vllm/envs/_variables.py +++ b/vllm/envs/_variables.py @@ -94,9 +94,9 @@ VLLM_TRACE_FUNCTION: int = 0 # Pipeline and partitioning VLLM_PP_LAYER_PARTITION: Optional[str] = None -# CPU backend settings -VLLM_CPU_KVCACHE_SPACE: Optional[int] = 0 -VLLM_CPU_OMP_THREADS_BIND: str = "" +# CPU backend settings +VLLM_CPU_KVCACHE_SPACE: Optional[int] = None +VLLM_CPU_OMP_THREADS_BIND: str = "auto" VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None VLLM_CPU_MOE_PREPACK: bool = True VLLM_CPU_SGL_KERNEL: bool = False @@ -107,7 +107,7 @@ VLLM_XLA_CHECK_RECOMPILATION: bool = False VLLM_XLA_USE_SPMD: bool = False # MoE (Mixture of Experts) settings -VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 +VLLM_FUSED_MOE_CHUNK_SIZE: int = 32768 VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True # Ray distributed computing @@ -152,7 +152,7 @@ VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False # Quantization and kernels VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False -VLLM_SKIP_P2P_CHECK: bool = False +VLLM_SKIP_P2P_CHECK: bool = True VLLM_DISABLED_KERNELS: list[str] = [] # Version control @@ -198,9 +198,9 @@ VLLM_CUDART_SO_PATH: Optional[str] = None # Data parallel settings VLLM_DP_RANK: int = 0 -VLLM_DP_RANK_LOCAL: int = -1 +VLLM_DP_RANK_LOCAL: int = 0 # Will be computed dynamically VLLM_DP_SIZE: int = 1 -VLLM_DP_MASTER_IP: str = "" +VLLM_DP_MASTER_IP: str = "127.0.0.1" VLLM_DP_MASTER_PORT: int = 0 VLLM_MOE_DP_CHUNK_SIZE: int = 256 VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False @@ -236,7 +236,7 @@ VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False # Additional settings -VLLM_XGRAMMAR_CACHE_MB: int = 0 +VLLM_XGRAMMAR_CACHE_MB: int = 512 VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False @@ -280,4 +280,10 @@ VLLM_ENABLE_RESPONSES_API_STORE: bool = False VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False # Configuration folder -VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None \ No newline at end of file +VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None + + +# Create a dictionary of all defaults for easy access +# This will be used by __init__.py for standardized parsing +__defaults = {name: value for name, value in globals().items() + if not name.startswith('_') and not callable(value) and name.isupper()} \ No newline at end of file