diff --git a/vllm/config.py b/vllm/config.py index 7cdc7b1adf9ab..ebe46c4bec5b1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -748,8 +748,6 @@ class ModelConfig: def get_head_size(self) -> int: # TODO remove hard code if self.is_deepseek_mla: - # FlashAttention supports only head_size 32, 64, 128, 256, - # we need to pad head_size 192 to 256 if self.should_use_mla: return self.hf_text_config.kv_lora_rank else: @@ -974,7 +972,7 @@ class ModelConfig: @property def should_use_mla(self) -> bool: use_mla = (self.is_deepseek_mla and not self.disable_mla - and not envs.VLLM_DISABLE_MLA) + and not envs.VLLM_MLA_DISABLE) return use_mla def supported_runner_types(self) -> Set[RunnerType]: diff --git a/vllm/envs.py b/vllm/envs.py index c8b7340c0d251..2a18e3b9bc51d 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -77,6 +77,7 @@ if TYPE_CHECKING: V_SCALE_CONSTANT: int = 100 VLLM_SERVER_DEV_MODE: bool = False VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 + VLLM_MLA_DISABLE: bool = False VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True @@ -302,10 +303,6 @@ environment_variables: Dict[str, Callable[[], Any]] = { "VLLM_FLASHINFER_FORCE_TENSOR_CORES": lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))), - # If set, vLLM will disable the MLA attention optimizations. - "VLLM_DISABLE_MLA": - lambda: bool(int(os.getenv("VLLM_DISABLE_MLA", "0"))), - # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), @@ -512,6 +509,10 @@ environment_variables: Dict[str, Callable[[], Any]] = { "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")), + # If set, vLLM will disable the MLA attention optimizations. + "VLLM_MLA_DISABLE": + lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), + # Flag that can control whether or not we perform matrix-absorption for MLA # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the # matrices reduces the runtime FLOPs needed to compute MLA but requires