diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md
index 984e6626e241..08893f0e9595 100644
--- a/docs/source/features/quantization/supported_hardware.md
+++ b/docs/source/features/quantization/supported_hardware.md
@@ -80,7 +80,7 @@ The table below shows the compatibility of various quantization implementations
   * ✅︎
   * ✅︎
   * ✅︎
-  * ✅︎
+  * ❌
   * ❌
   * ❌
   * ❌
diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py
index 891d8cdf36af..6ee3a2f1bbbb 100644
--- a/vllm/model_executor/layers/quantization/gptq_bitblas.py
+++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py
@@ -25,6 +25,7 @@ from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            PackedColumnParameter,
                                            PackedvLLMParameter,
                                            RowvLLMParameter)
+from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
 logger = init_logger(__name__)
@@ -191,6 +192,10 @@ class GPTQBitBLASConfig(QuantizationConfig):
         sym = quant_config.get("sym")
         desc_act = quant_config.get("desc_act")
 
+        # Temporarily disable BitBLAS on non-CUDA platforms (e.g. ROCm).
+        if not current_platform.is_cuda():
+            return False
+
         # If we cannot find the info needed in the config, cannot convert.
         if (num_bits is None or group_size is None or sym is None
                 or desc_act is None):
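
For reference, a minimal sketch of the platform gate this patch introduces, shown outside the diff context. The helper name `is_bitblas_supported` is hypothetical; in vLLM the check runs inside `GPTQBitBLASConfig` when deciding whether a GPTQ checkpoint can be served with BitBLAS kernels, and it also validates the checkpoint's `bits`/`group_size`/`sym`/`desc_act` fields:

```python
# Sketch only: the helper name is hypothetical, but current_platform.is_cuda()
# is vLLM's real platform-detection API (vllm.platforms).
from vllm.platforms import current_platform


def is_bitblas_supported(quant_config: dict) -> bool:
    # BitBLAS kernels are temporarily CUDA-only, so reject ROCm (and any
    # other non-CUDA platform) before inspecting the checkpoint config.
    if not current_platform.is_cuda():
        return False

    # The remaining GPTQ-compatibility checks mirror the patched method:
    # every required field must be present in the quantization config.
    required = ("bits", "group_size", "sym", "desc_act")
    return all(quant_config.get(key) is not None for key in required)
```

Gating on `is_cuda()` rather than `is_rocm()` is a deliberately broad guard: any future non-CUDA backend is also excluded until BitBLAS support is verified there, which matches the ❌ flipped in the hardware table above.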