Enable bitsandbytes quantization on AMD GPUs that use warp size 32 (#27307)
Signed-off-by: sstamenk <strahinja.stamenkovic@amd.com>
commit 814843e021 (parent 20852c8f4c)
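Background for the change: gfx9-family parts (e.g. gfx90a, gfx942) execute in wave64 mode, while gfx10/gfx11 GPUs use wave32, which is what the bitsandbytes kernels assume. A minimal, illustrative sketch (not part of the commit) of how the relevant device properties can be inspected from PyTorch on ROCm; attribute availability depends on the PyTorch build, hence the getattr fallbacks:

import torch

# Illustrative only: report the ROCm device architecture and warp size.
# `gcnArchName` is exposed by ROCm builds of PyTorch; `warp_size` may be
# missing on older builds.
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    arch = getattr(props, "gcnArchName", "unknown")  # e.g. "gfx942" or "gfx1100"
    warp = getattr(props, "warp_size", None)         # 64 on gfx9, 32 on gfx10/11
    print(f"arch={arch}, warp_size={warp}")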
@@ -14,10 +14,13 @@ from vllm.platforms import current_platform
 from ...utils import compare_two_settings, multi_gpu_test
 from ..utils import check_embeddings_close, check_logprobs_close
 
-pytestmark = pytest.mark.skipif(
-    current_platform.is_rocm(),
-    reason="bitsandbytes quantization not supported on ROCm (CUDA-only kernels)",
-)
+if current_platform.is_rocm():
+    from vllm.platforms.rocm import on_gfx9
+
+    pytestmark = pytest.mark.skipif(
+        on_gfx9(),
+        reason="bitsandbytes not supported on gfx9 (warp size 64 limitation)",
+    )
 
 models_4bit_to_test = [
     ("facebook/opt-125m", "quantize opt model inflight"),
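The test no longer blanket-skips on ROCm; it imports on_gfx9 from vllm.platforms.rocm and skips only on wave64 hardware. The helper's name comes from the diff above, but its body is not shown here, so the following is only an assumed sketch of what such a check could look like, not the actual vllm.platforms.rocm implementation:

import torch

def on_gfx9() -> bool:
    # Assumed sketch: treat any gfx9-family arch string (wave64 parts such
    # as gfx90a/gfx942) as gfx9. gcnArchName may carry feature suffixes
    # like "gfx90a:sramecc+:xnack-", so only the prefix is compared.
    if not torch.cuda.is_available():
        return False
    arch = getattr(torch.cuda.get_device_properties(0), "gcnArchName", "")
    return arch.split(":")[0].startswith("gfx9")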
@@ -185,6 +185,9 @@ class RocmPlatform(Platform):
         "petit_nvfp4",
         "torchao",
     ]
+    # bitsandbytes not supported on gfx9 (warp size 64 limitation)
+    if not on_gfx9():
+        supported_quantization += ["bitsandbytes"]
 
     @classmethod
     def get_vit_attn_backend(
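With bitsandbytes added to supported_quantization on non-gfx9 ROCm devices, in-flight 4-bit quantization should be selectable on wave32 AMD GPUs the same way it is on CUDA. A hedged usage sketch, with the model name taken from the test table above (depending on the vLLM version, load_format="bitsandbytes" may also need to be set):

from vllm import LLM, SamplingParams

# Quantize the checkpoint in flight with bitsandbytes while loading.
llm = LLM(model="facebook/opt-125m", quantization="bitsandbytes")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)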