[UX] Enforce valid choices for envs like VLLM_ATTENTION_BACKEND, etc (#24761)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
parent 5a411ef6c4
commit dd83a157f1

vllm/envs.py: 101 changed lines

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -6,7 +6,7 @@ import json
 import os
 import sys
 import tempfile
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 if TYPE_CHECKING:
     VLLM_HOST_IP: str = ""
@@ -56,11 +56,12 @@ if TYPE_CHECKING:
     VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
     VLLM_USE_RAY_SPMD_WORKER: bool = False
     VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
+    VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl",
+                                                    "shm"] = "auto"
     VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
     VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
     VLLM_XLA_USE_SPMD: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
+    VLLM_WORKER_MULTIPROC_METHOD: Literal["fork", "spawn"] = "fork"
     VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_VIDEO_FETCH_TIMEOUT: int = 30
@@ -77,7 +78,8 @@ if TYPE_CHECKING:
     VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
-    CMAKE_BUILD_TYPE: Optional[str] = None
+    CMAKE_BUILD_TYPE: Optional[Literal["Debug", "Release",
+                                       "RelWithDebInfo"]] = None
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
@@ -140,22 +142,25 @@ if TYPE_CHECKING:
     VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
-    VLLM_FLASHINFER_MOE_BACKEND: str = "throughput"
+    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput",
+                                         "latency"] = "throughput"
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
     VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
     VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
-    VLLM_ALL2ALL_BACKEND: str = "naive"
+    VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx", "deepep_high_throughput",
+                                  "deepep_low_latency"] = "naive"
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
     VLLM_SLEEP_WHEN_IDLE: bool = False
     VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
     VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
-    VLLM_KV_CACHE_LAYOUT: Optional[str] = None
+    VLLM_KV_CACHE_LAYOUT: Optional[Literal["NHD", "HND"]] = None
     VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
     VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
-    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = "NONE"
+    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal["FP", "INT8", "INT6", "INT4",
+                                                 "NONE"] = "NONE"
     VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
     VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
     VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
@@ -207,6 +212,48 @@ def maybe_convert_bool(value: Optional[str]) -> Optional[bool]:
     return bool(int(value))
 
 
+def env_with_choices(
+        env_name: str,
+        default: Optional[str],
+        choices: Union[list[str], Callable[[], list[str]]],
+        case_sensitive: bool = True) -> Callable[[], Optional[str]]:
+    """
+    Create a lambda that validates environment variable against allowed choices
+
+    Args:
+        env_name: Name of the environment variable
+        default: Default value if not set (can be None)
+        choices: List of valid string options or callable that returns list
+        case_sensitive: Whether validation should be case sensitive
+
+    Returns:
+        Lambda function for environment_variables dict
+    """
+
+    def _get_validated_env() -> Optional[str]:
+        value = os.getenv(env_name)
+        if value is None:
+            return default
+
+        # Resolve choices if it's a callable (for lazy loading)
+        actual_choices = choices() if callable(choices) else choices
+
+        if not case_sensitive:
+            check_value = value.lower()
+            check_choices = [choice.lower() for choice in actual_choices]
+        else:
+            check_value = value
+            check_choices = actual_choices
+
+        if check_value not in check_choices:
+            raise ValueError(f"Invalid value '{value}' for {env_name}. "
+                             f"Valid options: {actual_choices}.")
+
+        return value
+
+    return _get_validated_env
+
+
 def get_vllm_port() -> Optional[int]:
     """Get the port from VLLM_PORT environment variable.
 
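For context, a minimal usage sketch of the new helper (not part of the commit): it assumes env_with_choices is importable from vllm.envs once this change lands, and reuses the VLLM_KV_CACHE_LAYOUT choices that appear later in this diff.

# Minimal sketch, assuming env_with_choices is importable from vllm.envs
# after this change.
import os

from vllm.envs import env_with_choices

# Same shape as the entries added to environment_variables further below.
get_kv_layout = env_with_choices("VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"])

os.environ["VLLM_KV_CACHE_LAYOUT"] = "NHD"
print(get_kv_layout())  # -> "NHD"

os.environ["VLLM_KV_CACHE_LAYOUT"] = "bogus"
get_kv_layout()  # raises ValueError: Invalid value 'bogus' for VLLM_KV_CACHE_LAYOUT. ...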
@@ -287,7 +334,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If not set, defaults to "Debug" or "RelWithDebInfo"
     # Available options: "Debug", "Release", "RelWithDebInfo"
     "CMAKE_BUILD_TYPE":
-    lambda: os.getenv("CMAKE_BUILD_TYPE"),
+    env_with_choices("CMAKE_BUILD_TYPE", None,
+                     ["Debug", "Release", "RelWithDebInfo"]),
 
     # If set, vllm will print verbose logs during installation
     "VERBOSE":
@@ -476,7 +524,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
 
     # Backend for attention computation
-    # Available options:
+    # Example options:
     # - "TORCH_SDPA": use torch.nn.MultiheadAttention
     # - "FLASH_ATTN": use FlashAttention
     # - "XFORMERS": use XFormers
@@ -486,8 +534,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # - "FLASH_ATTN_MLA": use FlashAttention for MLA
     # - "FLASHINFER_MLA": use FlashInfer for MLA
     # - "CUTLASS_MLA": use CUTLASS for MLA
+    # All possible options loaded dynamically from _Backend enum
     "VLLM_ATTENTION_BACKEND":
-    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
+    env_with_choices("VLLM_ATTENTION_BACKEND", None,
+                     lambda: list(__import__('vllm.platforms.interface', \
+                        fromlist=['_Backend'])._Backend.__members__.keys())),
 
     # If set, vllm will use flashinfer sampler
     "VLLM_USE_FLASHINFER_SAMPLER":
@@ -550,7 +601,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # - "shm": use shared memory and gRPC for communication
     # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
     "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE":
-    lambda: os.getenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"),
+    env_with_choices("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto",
+                     ["auto", "nccl", "shm"]),
 
     # If the env var is set, it enables GPU communication overlap
     # (experimental feature) in Ray's Compiled Graph. This flag is ignored if
@@ -569,7 +621,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Use dedicated multiprocess context for workers.
     # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
+    env_with_choices("VLLM_WORKER_MULTIPROC_METHOD", "fork",
+                     ["spawn", "fork"]),
 
     # Path to the cache for storing downloaded assets
     "VLLM_ASSETS_CACHE":
@@ -833,7 +886,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Choice of quantization level: FP, INT8, INT6, INT4 or NONE
     # Recommended for large models to get allreduce
     "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION":
-    lambda: os.getenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE").upper(),
+    env_with_choices("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE",
+                     ["FP", "INT8", "INT6", "INT4", "NONE"]),
 
     # Custom quick allreduce kernel for MI3* cards
     # Due to the lack of the bfloat16 asm instruction, bfloat16
@@ -1075,21 +1129,20 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # - "deepep_high_throughput", use deepep high-throughput kernels
     # - "deepep_low_latency", use deepep low-latency kernels
     "VLLM_ALL2ALL_BACKEND":
-    lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
+    env_with_choices("VLLM_ALL2ALL_BACKEND", "naive",
+                     ["naive", "pplx",
+                      "deepep_high_throughput", "deepep_low_latency"]),
 
-    # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. Both
-    # require compute capability 10.0 or above.
+    # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
+    # Both require compute capability 10.0 or above.
     # Available options:
     # - "throughput": [default]
     #   Uses CUTLASS kernels optimized for high-throughput batch inference.
     # - "latency":
     #   Uses TensorRT-LLM kernels optimized for low-latency inference.
-    # To set this backend, define the environment variable:
-    #     export VLLM_FLASHINFER_MOE_BACKEND=latency.
-    # If not set, defaults to "throughput".
-    "VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv(
-        "VLLM_FLASHINFER_MOE_BACKEND", "throughput"
-    ),
+    "VLLM_FLASHINFER_MOE_BACKEND":
+    env_with_choices("VLLM_FLASHINFER_MOE_BACKEND", "throughput",
+                     ["throughput", "latency"]),
 
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
@@ -1145,7 +1198,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # leave the layout choice to the backend. Mind that backends may only
     # implement and support a subset of all possible layouts.
     "VLLM_KV_CACHE_LAYOUT":
-    lambda: os.getenv("VLLM_KV_CACHE_LAYOUT", None),
+    env_with_choices("VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"]),
 
     # Enable checking whether the generated logits contain NaNs,
     # indicating corrupted output. Useful for debugging low level bugs
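The VLLM_ATTENTION_BACKEND entry above passes a callable so the valid choices are resolved lazily from the _Backend enum only when the variable is read. A rough sketch of that path, using a hypothetical stand-in list instead of the real enum import:

# Sketch of the lazy-choices path; _backend_names is a hypothetical stand-in
# for list(_Backend.__members__.keys()) from vllm.platforms.interface.
import os

from vllm.envs import env_with_choices

def _backend_names() -> list[str]:
    # Only evaluated when the environment variable is actually read,
    # which is what the lambda in the diff achieves.
    return ["TORCH_SDPA", "FLASH_ATTN", "XFORMERS", "FLASHINFER"]

get_backend = env_with_choices("VLLM_ATTENTION_BACKEND", None, _backend_names)

os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"
print(get_backend())  # -> "FLASH_ATTN"

os.environ["VLLM_ATTENTION_BACKEND"] = "NOT_A_BACKEND"
get_backend()  # raises ValueError listing the valid backend names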