[[Misc]Upgrade bitsandbytes to the latest version 0.44.0 (#8768)

This commit is contained in:
Jee Jee Li 2024-09-25 08:08:55 +08:00 committed by GitHub
parent 1e7d5c01f5
commit 13f9f7a3d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 44 additions and 34 deletions

View File

@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.
.. code-block:: console
$ pip install bitsandbytes>=0.42.0
$ pip install bitsandbytes>=0.44.0
vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.

View File

@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
# It quantizes the model when loading, with some config info from the
# LoRA adapter repo. So need to set the parameter of load_format and
# qlora_adapter_name_or_path as below.
engine_args = EngineArgs(
model=model,
quantization=quantization,
qlora_adapter_name_or_path=lora_repo,
load_format="bitsandbytes",
enable_lora=True,
max_lora_rank=64,
# set it only in GPUs of limited memory
enforce_eager=True)
engine_args = EngineArgs(model=model,
quantization=quantization,
qlora_adapter_name_or_path=lora_repo,
load_format="bitsandbytes",
enable_lora=True,
max_lora_rank=64)
else:
engine_args = EngineArgs(
model=model,
quantization=quantization,
enable_lora=True,
max_loras=4,
# set it only in GPUs of limited memory
enforce_eager=True)
engine_args = EngineArgs(model=model,
quantization=quantization,
enable_lora=True,
max_loras=4)
return LLMEngine.from_engine_args(engine_args)

View File

@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
aiohttp
# quantization
bitsandbytes==0.42.0
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.8

View File

@ -107,7 +107,7 @@ def validate_generated_texts(hf_runner,
quantization='bitsandbytes',
load_format='bitsandbytes',
tensor_parallel_size=vllm_tp_size,
enforce_eager=True,
enforce_eager=False,
gpu_memory_utilization=0.8) as llm:
vllm_outputs = llm.generate_greedy(prompts, 8)
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

View File

@ -222,6 +222,7 @@ class ModelConfig:
self._verify_embedding_mode()
self._verify_quantization()
self._verify_cuda_graph()
self._verify_bnb_config()
def _init_multimodal_config(
self, limit_mm_per_prompt: Optional[Mapping[str, int]]
@ -337,6 +338,28 @@ class ModelConfig:
self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
self.max_model_len)
def _verify_bnb_config(self) -> None:
"""
The current version of bitsandbytes (0.44.0) with 8-bit models does not
yet support CUDA graph.
"""
is_bitsandbytes = self.quantization == "bitsandbytes"
has_quantization_config = (getattr(self.hf_config,
"quantization_config", None)
is not None)
is_8bit = (self.hf_config.quantization_config.get(
"load_in_8bit", False) if has_quantization_config else False)
if all([
is_bitsandbytes,
has_quantization_config,
is_8bit,
not self.enforce_eager,
]):
logger.warning(
"CUDA graph is not supported on BitAndBytes 8bit yet, "
"fallback to the eager mode.")
self.enforce_eager = True
def verify_async_output_proc(self, parallel_config, speculative_config,
device_config) -> None:
if not self.use_async_output_proc:
@ -401,13 +424,6 @@ class ModelConfig:
"Pipeline parallelism is only supported for the following "
f" architectures: {_PP_SUPPORTED_MODELS}.")
# Remove the constraint after the bitsandbytes issue is fixed:
# https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
if self.quantization == "bitsandbytes" and self.enforce_eager is False:
logger.warning("CUDA graph is not supported on BitAndBytes yet, "
"fallback to the eager mode.")
self.enforce_eager = True
if pipeline_parallel_size > 1 and self.use_async_output_proc:
logger.warning("Async output processor is not supported with "
"pipeline parallelism currently. Disabling it.")

View File

@ -121,12 +121,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
def __init__(self, quant_config: BitsAndBytesConfig):
try:
import bitsandbytes
if bitsandbytes.__version__ < "0.42.0":
if bitsandbytes.__version__ < "0.44.0":
raise ImportError("bitsandbytes version is wrong. Please "
"install bitsandbytes>=0.42.0.")
"install bitsandbytes>=0.44.0.")
except ImportError as err:
raise ImportError("Please install bitsandbytes>=0.42.0 via "
"`pip install bitsandbytes>=0.42.0` to use "
raise ImportError("Please install bitsandbytes>=0.44.0 via "
"`pip install bitsandbytes>=0.44.0` to use "
"bitsandbytes quantizer.") from err
self.quant_config = quant_config

View File

@ -851,12 +851,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
# only load the bitsandbytes module when needed
try:
import bitsandbytes
if bitsandbytes.__version__ < "0.42.0":
if bitsandbytes.__version__ < "0.44.0":
raise ImportError("bitsandbytes version is wrong. Please "
"install bitsandbytes>=0.42.0.")
"install bitsandbytes>=0.44.0.")
except ImportError as err:
raise ImportError("Please install bitsandbytes>=0.42.0 via "
"`pip install bitsandbytes>=0.42.0` to use "
raise ImportError("Please install bitsandbytes>=0.44.0 via "
"`pip install bitsandbytes>=0.44.0` to use "
"bitsandbytes quantizer.") from err
hf_weights_files, use_safetensors = self._prepare_weights(