From 67532a1a6855e8262b3e1c9512c85e2fc934b3c0 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 16 Sep 2025 23:57:51 -0400 Subject: [PATCH] [UX] Remove "quantization is not fully optimized yet" log (#25012) Signed-off-by: mgoin --- vllm/config/__init__.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 535802585d18b..5f30576099714 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1086,22 +1086,6 @@ class ModelConfig: def _verify_quantization(self) -> None: supported_quantization = me_quant.QUANTIZATION_METHODS - optimized_quantization_methods = [ - "fp8", - "modelopt", - "gptq_marlin_24", - "gptq_marlin", - "awq_marlin", - "fbgemm_fp8", - "compressed-tensors", - "experts_int8", - "quark", - "modelopt_fp4", - "bitblas", - "gptq_bitblas", - "inc", - "petit_nvfp4", - ] if self.quantization is not None: self.quantization = cast(me_quant.QuantizationMethods, self.quantization) @@ -1183,11 +1167,6 @@ class ModelConfig: f"be one of {supported_quantization}.") from vllm.platforms import current_platform current_platform.verify_quantization(self.quantization) - if self.quantization not in optimized_quantization_methods: - logger.warning( - "%s quantization is not fully " - "optimized yet. The speed can be slower than " - "non-quantized models.", self.quantization) def _verify_cuda_graph(self) -> None: # The `max_seq_len_to_capture` was incorrectly