[Misc] Upgrade bitsandbytes to the latest version 0.44.0 (#8768)
This commit is contained in:
parent 1e7d5c01f5
commit 13f9f7a3d0

@@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.
 
 .. code-block:: console
 
-   $ pip install bitsandbytes>=0.42.0
+   $ pip install bitsandbytes>=0.44.0
 
 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
 
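As a point of reference, a minimal sketch of in-flight quantization through the Python API with the bumped dependency; the model id and prompt are illustrative, not taken from this commit:

    from vllm import LLM

    # In-flight quantization: vLLM quantizes the full-precision weights while
    # loading them, via bitsandbytes (>=0.44.0 after this change). The model
    # id below is a placeholder for any supported checkpoint.
    llm = LLM(model="huggyllama/llama-7b",
              quantization="bitsandbytes",
              load_format="bitsandbytes")
    print(llm.generate("Hello, my name is")[0].outputs[0].text)
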
@@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
         # It quantizes the model when loading, with some config info from the
         # LoRA adapter repo. So need to set the parameter of load_format and
         # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            qlora_adapter_name_or_path=lora_repo,
-            load_format="bitsandbytes",
-            enable_lora=True,
-            max_lora_rank=64,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 qlora_adapter_name_or_path=lora_repo,
+                                 load_format="bitsandbytes",
+                                 enable_lora=True,
+                                 max_lora_rank=64)
     else:
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            enable_lora=True,
-            max_loras=4,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 enable_lora=True,
+                                 max_loras=4)
     return LLMEngine.from_engine_args(engine_args)
 
 
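For context, a hedged sketch of driving an LLMEngine built from EngineArgs like those above; the model id, prompt, and sampling values are illustrative rather than taken from this commit:

    from vllm import EngineArgs, LLMEngine, SamplingParams

    # Illustrative engine setup; the model id is a placeholder.
    engine = LLMEngine.from_engine_args(
        EngineArgs(model="huggyllama/llama-7b",
                   quantization="bitsandbytes",
                   load_format="bitsandbytes"))

    # Submit one greedy request and run the engine loop to completion.
    engine.add_request("request-0", "Hello, my name is",
                       SamplingParams(temperature=0.0, max_tokens=8))
    while engine.has_unfinished_requests():
        for request_output in engine.step():
            if request_output.finished:
                print(request_output.outputs[0].text)
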
@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
 aiohttp
 
 # quantization
-bitsandbytes==0.42.0
+bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.8

@@ -107,7 +107,7 @@ def validate_generated_texts(hf_runner,
                      quantization='bitsandbytes',
                      load_format='bitsandbytes',
                      tensor_parallel_size=vllm_tp_size,
-                     enforce_eager=True,
+                     enforce_eager=False,
                      gpu_memory_utilization=0.8) as llm:
         vllm_outputs = llm.generate_greedy(prompts, 8)
         vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

@@ -222,6 +222,7 @@ class ModelConfig:
         self._verify_embedding_mode()
         self._verify_quantization()
         self._verify_cuda_graph()
+        self._verify_bnb_config()
 
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]

@@ -337,6 +338,28 @@ class ModelConfig:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
+    def _verify_bnb_config(self) -> None:
+        """
+        The current version of bitsandbytes (0.44.0) with 8-bit models does not
+        yet support CUDA graph.
+        """
+        is_bitsandbytes = self.quantization == "bitsandbytes"
+        has_quantization_config = (getattr(self.hf_config,
+                                           "quantization_config", None)
+                                   is not None)
+        is_8bit = (self.hf_config.quantization_config.get(
+            "load_in_8bit", False) if has_quantization_config else False)
+        if all([
+                is_bitsandbytes,
+                has_quantization_config,
+                is_8bit,
+                not self.enforce_eager,
+        ]):
+            logger.warning(
+                "CUDA graph is not supported on BitAndBytes 8bit yet, "
+                "fallback to the eager mode.")
+            self.enforce_eager = True
+
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:

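To illustrate what the new check inspects, here is a representative quantization_config as a pre-quantized 8-bit bitsandbytes checkpoint would expose it through hf_config; the field values are assumed for illustration and not taken from this commit:

    # Representative contents of hf_config.quantization_config for a
    # pre-quantized 8-bit bitsandbytes checkpoint (values assumed for
    # illustration, not taken from this commit).
    quantization_config = {
        "quant_method": "bitsandbytes",
        "load_in_8bit": True,
        "load_in_4bit": False,
    }

    # With quantization="bitsandbytes" and enforce_eager=False, the added
    # _verify_bnb_config() finds load_in_8bit=True and falls back to eager
    # mode; 4-bit checkpoints (load_in_8bit False) keep CUDA graphs enabled.
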
@@ -401,13 +424,6 @@ class ModelConfig:
                     "Pipeline parallelism is only supported for the following "
                     f" architectures: {_PP_SUPPORTED_MODELS}.")
 
-        # Remove the constraint after the bitsandbytes issue is fixed:
-        # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
-        if self.quantization == "bitsandbytes" and self.enforce_eager is False:
-            logger.warning("CUDA graph is not supported on BitAndBytes yet, "
-                           "fallback to the eager mode.")
-            self.enforce_eager = True
-
         if pipeline_parallel_size > 1 and self.use_async_output_proc:
             logger.warning("Async output processor is not supported with "
                            "pipeline parallelism currently. Disabling it.")

@@ -121,12 +121,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: BitsAndBytesConfig):
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                               "bitsandbytes quantizer.") from err
 
         self.quant_config = quant_config

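A side note on the guard itself: __version__ < "0.44.0" compares strings lexically, which would misorder a hypothetical "0.100.0" below "0.44.0". A hedged sketch of the same check using the packaging library, which orders versions numerically:

    from packaging.version import Version

    import bitsandbytes

    # Numeric version comparison instead of a lexical string compare.
    if Version(bitsandbytes.__version__) < Version("0.44.0"):
        raise ImportError("Please install bitsandbytes>=0.44.0.")
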
@@ -851,12 +851,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         # only load the bitsandbytes module when needed
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                               "bitsandbytes quantizer.") from err
 
         hf_weights_files, use_safetensors = self._prepare_weights(