From 1819fbda638d4e10512570ebad0e6b16661238d8 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 3 Jul 2025 21:58:46 +0800 Subject: [PATCH] [Quantization] Bump to use latest bitsandbytes (#20424) Signed-off-by: Jee Jee Li --- docker/Dockerfile | 2 +- docs/features/quantization/bnb.md | 2 +- requirements/nightly_torch_test.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 2 +- vllm/config.py | 2 +- vllm/model_executor/layers/quantization/bitsandbytes.py | 8 ++++---- vllm/model_executor/model_loader/bitsandbytes_loader.py | 8 ++++---- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index d1009fb4fb18..ec18c45a096a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -498,7 +498,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ else \ - uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ + uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.46.1' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ fi ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md index 5756fdb28837..ca13ee107ef4 100644 --- a/docs/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -10,7 +10,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal Below are the steps to utilize BitsAndBytes with vLLM. ```bash -pip install bitsandbytes>=0.45.3 +pip install bitsandbytes>=0.46.1 ``` vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. 
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index fd0b0fac12a9..0bade084fdf6 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -34,7 +34,7 @@ tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. # quantization -bitsandbytes>=0.45.3 +bitsandbytes>=0.46.1 buildkite-test-collector==0.1.9 diff --git a/requirements/test.in b/requirements/test.in index 85c96df8e8f4..5f8b97a0e341 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -39,7 +39,7 @@ tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. # quantization -bitsandbytes>=0.45.3 +bitsandbytes>=0.46.1 buildkite-test-collector==0.1.9 diff --git a/requirements/test.txt b/requirements/test.txt index 16d8ee54adcf..f6f599df758f 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -45,7 +45,7 @@ backoff==2.2.1 # via # -r requirements/test.in # schemathesis -bitsandbytes==0.45.3 +bitsandbytes==0.46.1 # via -r requirements/test.in black==24.10.0 # via datamodel-code-generator diff --git a/vllm/config.py b/vllm/config.py index 5c19061a0d51..8a0080f79702 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -969,7 +969,7 @@ class ModelConfig: def _verify_bnb_config(self) -> None: """ - The current version of bitsandbytes (0.45.3) with 8-bit models does not + The current version of bitsandbytes (0.46.1) with 8-bit models does not yet support CUDA graph. # TODO Remove this when bitsandbytes supports. 
""" diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 53ba84ea8e75..1ed3ef8d2173 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -156,12 +156,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase): def __init__(self, quant_config: BitsAndBytesConfig): try: import bitsandbytes - if bitsandbytes.__version__ < "0.45.3": + if bitsandbytes.__version__ < "0.46.1": raise ImportError("bitsandbytes version is wrong. Please " - "install bitsandbytes>=0.45.3.") + "install bitsandbytes>=0.46.1.") except ImportError as err: - raise ImportError("Please install bitsandbytes>=0.45.3 via " - "`pip install bitsandbytes>=0.45.3` to use " + raise ImportError("Please install bitsandbytes>=0.46.1 via " + "`pip install bitsandbytes>=0.46.1` to use " "bitsandbytes quantizer.") from err self.quant_config = quant_config diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 0c46d170e88d..8e330f7eeaf4 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -183,12 +183,12 @@ class BitsAndBytesModelLoader(BaseModelLoader): try: import bitsandbytes - if bitsandbytes.__version__ < "0.45.3": + if bitsandbytes.__version__ < "0.46.1": raise ImportError("bitsandbytes version is wrong. Please " - "install bitsandbytes>=0.45.3.") + "install bitsandbytes>=0.46.1.") except ImportError as err: - raise ImportError("Please install bitsandbytes>=0.45.3 via " - "`pip install bitsandbytes>=0.45.3` to use " + raise ImportError("Please install bitsandbytes>=0.46.1 via " + "`pip install bitsandbytes>=0.46.1` to use " "bitsandbytes quantizer.") from err hf_weights_files, use_safetensors = self._prepare_weights(