diff --git a/vllm/envs.py b/vllm/envs.py
index dc919e7741969..99f2ad2bc3d00 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -776,7 +776,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If set, the OpenAI API server will stay alive even after the underlying
     # AsyncLLMEngine errors and stops serving requests
     "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
-        os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)
+        int(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", "0"))
     ),
     # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
     # the user to specify a max sequence length greater than
@@ -1313,7 +1313,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
     ),
     # If set, it means we pre-downloaded cubin files and flashinfer will
     # read the cubin files directly.
-    "VLLM_HAS_FLASHINFER_CUBIN": lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False),
+    "VLLM_HAS_FLASHINFER_CUBIN": lambda: bool(
+        int(os.getenv("VLLM_HAS_FLASHINFER_CUBIN", "0"))
+    ),
     # Supported options:
     # - "flashinfer-cudnn": use flashinfer cudnn GEMM backend
     # - "flashinfer-trtllm": use flashinfer trtllm GEMM backend
@@ -1449,8 +1451,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # top 5 collected objects
     "VLLM_GC_DEBUG": lambda: os.getenv("VLLM_GC_DEBUG", ""),
     # Disables parallel execution of shared_experts via separate cuda stream
-    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: os.getenv(
-        "VLLM_DISABLE_SHARED_EXPERTS_STREAM", False
+    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: bool(
+        int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "0"))
     ),
     # Format for saving torch.compile cache artifacts
     # - "binary": saves as binary file
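For context, a minimal standalone sketch (not part of the patch) of the truthiness pitfall these hunks fix: os.getenv always returns a string when the variable is set, and any non-empty string is truthy, so the old bool(os.getenv(...)) pattern treats an explicit "0" as enabled. Converting through int() first gives the intended on/off behavior.

import os

# Simulate a user explicitly disabling the flag.
os.environ["VLLM_HAS_FLASHINFER_CUBIN"] = "0"

# Old pattern: os.getenv returns the string "0"; bool("0") is True,
# so the flag is incorrectly treated as enabled.
old = bool(os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False))    # True

# Patched pattern: parse to int first, then take the truth value,
# so "0" disables and "1" enables the flag as expected.
new = bool(int(os.getenv("VLLM_HAS_FLASHINFER_CUBIN", "0")))  # False

print(old, new)  # True False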