From 6dd5e52823cc0ca8ddc9c4377d29ead37cc09a95 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Mon, 3 Feb 2025 16:29:56 -0500
Subject: [PATCH] Squelch MLA warning for Compressed-Tensors Models (#12704)

Signed-off-by: Kyle Sayers
---
 vllm/config.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index d70a637956edf..2f4a7ad769d98 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -986,6 +986,9 @@ class ModelConfig:
 
     @property
     def use_mla(self) -> bool:
+        if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE:
+            return False
+
         if self.quantization is not None and self.quantization not in [\
             "fp8", "compressed-tensors"]:
             logger.warning(
@@ -1012,8 +1015,7 @@ class ModelConfig:
                         quant_config)
                     return False
 
-        use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
-        return use_mla
+        return True
 
     @property
     def supported_runner_types(self) -> Set[RunnerType]: