Enable bitsandbytes quantization on AMD GPUs that use warp size 32 (#27307)
Signed-off-by: sstamenk <strahinja.stamenkovic@amd.com>
commit 814843e021 (parent 20852c8f4c)
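Background for the change: gfx9-family parts (e.g. gfx90a, gfx942) execute in wave64 mode, while gfx10/gfx11 GPUs use wave32, which is what the bitsandbytes kernels assume. A minimal, illustrative sketch (not part of the commit) of how the relevant device properties can be inspected from PyTorch on ROCm; attribute availability depends on the PyTorch build, hence the getattr fallbacks:

import torch

# Illustrative only: report the ROCm device architecture and warp size.
# `gcnArchName` is exposed by ROCm builds of PyTorch; `warp_size` may be
# missing on older builds.
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    arch = getattr(props, "gcnArchName", "unknown")  # e.g. "gfx942" or "gfx1100"
    warp = getattr(props, "warp_size", None)         # 64 on gfx9, 32 on gfx10/11
    print(f"arch={arch}, warp_size={warp}")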
@@ -14,10 +14,13 @@ from vllm.platforms import current_platform
 from ...utils import compare_two_settings, multi_gpu_test
 from ..utils import check_embeddings_close, check_logprobs_close
 
-pytestmark = pytest.mark.skipif(
-    current_platform.is_rocm(),
-    reason="bitsandbytes quantization not supported on ROCm (CUDA-only kernels)",
-)
+if current_platform.is_rocm():
+    from vllm.platforms.rocm import on_gfx9
+
+    pytestmark = pytest.mark.skipif(
+        on_gfx9(),
+        reason="bitsandbytes not supported on gfx9 (warp size 64 limitation)",
+    )
 
 models_4bit_to_test = [
     ("facebook/opt-125m", "quantize opt model inflight"),
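The test no longer blanket-skips on ROCm; it imports on_gfx9 from vllm.platforms.rocm and skips only on wave64 hardware. The helper's name comes from the diff above, but its body is not shown here, so the following is only an assumed sketch of what such a check could look like, not the actual vllm.platforms.rocm implementation:

import torch

def on_gfx9() -> bool:
    # Assumed sketch: treat any gfx9-family arch string (wave64 parts such
    # as gfx90a/gfx942) as gfx9. gcnArchName may carry feature suffixes
    # like "gfx90a:sramecc+:xnack-", so only the prefix is compared.
    if not torch.cuda.is_available():
        return False
    arch = getattr(torch.cuda.get_device_properties(0), "gcnArchName", "")
    return arch.split(":")[0].startswith("gfx9")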
@@ -185,6 +185,9 @@ class RocmPlatform(Platform):
         "petit_nvfp4",
         "torchao",
     ]
+    # bitsandbytes not supported on gfx9 (warp size 64 limitation)
+    if not on_gfx9():
+        supported_quantization += ["bitsandbytes"]
 
     @classmethod
     def get_vit_attn_backend(
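With bitsandbytes added to supported_quantization on non-gfx9 ROCm devices, in-flight 4-bit quantization should be selectable on wave32 AMD GPUs the same way it is on CUDA. A hedged usage sketch, with the model name taken from the test table above (depending on the vLLM version, load_format="bitsandbytes" may also need to be set):

from vllm import LLM, SamplingParams

# Quantize the checkpoint in flight with bitsandbytes while loading.
llm = LLM(model="facebook/opt-125m", quantization="bitsandbytes")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)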