Squelch MLA warning for Compressed-Tensors Models (#12704)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Author: Kyle Sayers, 2025-02-03 16:29:56 -05:00 (committed by GitHub)
parent c11de33dad
commit 6dd5e52823


@@ -986,6 +986,9 @@ class ModelConfig:
 
     @property
     def use_mla(self) -> bool:
+        if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE:
+            return False
+
         if self.quantization is not None and self.quantization not in [\
                 "fp8", "compressed-tensors"]:
             logger.warning(
@@ -1012,8 +1015,7 @@
                         quant_config)
                     return False
-        use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
-        return use_mla
+        return True
 
     @property
     def supported_runner_types(self) -> Set[RunnerType]:
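For readers skimming the hunks: the change hoists the DeepSeek-MLA / VLLM_MLA_DISABLE check to the top of the property, so the quantization warning can only fire for models that would actually use MLA, and the old tail computation of use_mla collapses to a plain return True. The standalone sketch below is only an approximation of that control flow: the real code is a property on ModelConfig that reads envs.VLLM_MLA_DISABLE and also validates compressed-tensors config groups (elided here), and the warning text is illustrative.

import logging
import os

logger = logging.getLogger(__name__)

def use_mla(is_deepseek_mla: bool, quantization: str | None) -> bool:
    # Early exit added by this commit: non-DeepSeek-MLA models (or runs with
    # VLLM_MLA_DISABLE set) return False before any warning is emitted.
    if not is_deepseek_mla or os.environ.get("VLLM_MLA_DISABLE") == "1":
        return False
    # Only reached for models that would actually use MLA, so quantized
    # non-MLA models no longer trigger the warning.
    if quantization is not None and quantization not in ("fp8",
                                                         "compressed-tensors"):
        logger.warning("MLA is not supported with %s quantization. "
                       "Disabling MLA.", quantization)
        return False
    # ... per-group compressed-tensors fp8 check elided ...
    return True

# e.g. a compressed-tensors model that is not DeepSeek-MLA: returns False
# immediately and logs nothing, which is the "squelched" warning.
print(use_mla(is_deepseek_mla=False, quantization="compressed-tensors"))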