From 9187de9fac94996120ae29afc43d5a34aa86cf18 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin
Date: Fri, 19 Dec 2025 16:56:35 +0800
Subject: [PATCH] [Quantization] enable compressed-tensors marlin support for turing (2) (#31008)

Signed-off-by: Jinzhen Lin
---
 vllm/model_executor/layers/quantization/utils/marlin_utils.py | 2 +-
 .../layers/quantization/utils/marlin_utils_fp4.py             | 2 +-
 .../layers/quantization/utils/marlin_utils_fp8.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 072b46f055210..66e979b505f0d 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -48,7 +48,7 @@ def query_marlin_supported_quant_types(
         -1 if capability_tuple is None else capability_tuple.to_int()
     )
 
-    if device_capability < 80:
+    if device_capability < 75:
         return []
 
     # - has_zp is True: return quant_types that has zero points
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
index b94d5bbf36540..876c724bf972d 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -23,7 +23,7 @@ logger = init_logger(__name__)
 
 
 def is_fp4_marlin_supported():
-    return current_platform.has_device_capability(80)
+    return current_platform.has_device_capability(75)
 
 
 def nvfp4_marlin_process_scales(marlin_scales):
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index c67e4f437cf0c..3d084516bf9a2 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -22,7 +22,7 @@ logger = init_logger(__name__)
 
 
 def is_fp8_marlin_supported():
-    return current_platform.has_device_capability(80)
+    return current_platform.has_device_capability(75)
 
 
 def fp8_fused_exponent_bias_into_scales(scales):
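
Note for context: all three checks compare against the device's CUDA compute capability folded into a single integer (major * 10 + minor), so 80 corresponds to Ampere (8.0, e.g. A100) and 75 to Turing (7.5, e.g. T4 and the RTX 20-series). A minimal standalone sketch of the gate this patch relaxes, using plain PyTorch instead of vLLM's current_platform helper; the two function names below are illustrative, not part of vLLM:

import torch

def device_capability_int(device_id: int = 0) -> int:
    # torch reports a (major, minor) tuple, e.g. (7, 5) on Turing;
    # fold it into the single-int form used by the checks in this patch.
    major, minor = torch.cuda.get_device_capability(device_id)
    return major * 10 + minor

def is_marlin_supported() -> bool:
    # Threshold before this patch: 80 (Ampere). After: 75 (Turing).
    return torch.cuda.is_available() and device_capability_int() >= 75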