mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 04:54:56 +08:00
[Misc] Upgrade bitsandbytes to the latest version 0.44.0 (#8768)
This commit is contained in:
parent
1e7d5c01f5
commit
13f9f7a3d0
@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ pip install bitsandbytes>=0.42.0
|
||||
$ pip install "bitsandbytes>=0.44.0"
|
||||
|
||||
vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoints.
|
||||
|
||||
|
||||
@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
|
||||
# It quantizes the model when loading, with some config info from the
|
||||
# LoRA adapter repo. So need to set the parameter of load_format and
|
||||
# qlora_adapter_name_or_path as below.
|
||||
engine_args = EngineArgs(
|
||||
model=model,
|
||||
quantization=quantization,
|
||||
qlora_adapter_name_or_path=lora_repo,
|
||||
load_format="bitsandbytes",
|
||||
enable_lora=True,
|
||||
max_lora_rank=64,
|
||||
# set it only in GPUs of limited memory
|
||||
enforce_eager=True)
|
||||
engine_args = EngineArgs(model=model,
|
||||
quantization=quantization,
|
||||
qlora_adapter_name_or_path=lora_repo,
|
||||
load_format="bitsandbytes",
|
||||
enable_lora=True,
|
||||
max_lora_rank=64)
|
||||
else:
|
||||
engine_args = EngineArgs(
|
||||
model=model,
|
||||
quantization=quantization,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
# set it only in GPUs of limited memory
|
||||
enforce_eager=True)
|
||||
engine_args = EngineArgs(model=model,
|
||||
quantization=quantization,
|
||||
enable_lora=True,
|
||||
max_loras=4)
|
||||
return LLMEngine.from_engine_args(engine_args)
|
||||
|
||||
|
||||
|
||||
@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
|
||||
aiohttp
|
||||
|
||||
# quantization
|
||||
bitsandbytes==0.42.0
|
||||
bitsandbytes>=0.44.0
|
||||
buildkite-test-collector==0.1.8
|
||||
|
||||
@ -107,7 +107,7 @@ def validate_generated_texts(hf_runner,
|
||||
quantization='bitsandbytes',
|
||||
load_format='bitsandbytes',
|
||||
tensor_parallel_size=vllm_tp_size,
|
||||
enforce_eager=True,
|
||||
enforce_eager=False,
|
||||
gpu_memory_utilization=0.8) as llm:
|
||||
vllm_outputs = llm.generate_greedy(prompts, 8)
|
||||
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
|
||||
|
||||
@ -222,6 +222,7 @@ class ModelConfig:
|
||||
self._verify_embedding_mode()
|
||||
self._verify_quantization()
|
||||
self._verify_cuda_graph()
|
||||
self._verify_bnb_config()
|
||||
|
||||
def _init_multimodal_config(
|
||||
self, limit_mm_per_prompt: Optional[Mapping[str, int]]
|
||||
@ -337,6 +338,28 @@ class ModelConfig:
|
||||
self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
|
||||
self.max_model_len)
|
||||
|
||||
def _verify_bnb_config(self) -> None:
    """
    The current version of bitsandbytes (0.44.0) with 8-bit models does not
    yet support CUDA graph.

    When ``self.quantization`` is ``"bitsandbytes"``, the HF config carries a
    ``quantization_config`` whose ``load_in_8bit`` entry is truthy, and
    ``self.enforce_eager`` is falsy, a warning is logged and
    ``self.enforce_eager`` is set to ``True``. In every other case the
    method leaves the instance untouched.
    """
    is_bitsandbytes = self.quantization == "bitsandbytes"
    quant_config = getattr(self.hf_config, "quantization_config", None)

    if quant_config is None:
        is_8bit = False
    else:
        # `quantization_config` is normally a plain dict, but it may also be
        # a config object exposing `load_in_8bit` as an attribute. Prefer
        # the mapping-style lookup when available so dict-like inputs behave
        # exactly as before, and fall back to attribute access otherwise
        # instead of raising AttributeError on `.get`.
        lookup = getattr(quant_config, "get", None)
        if callable(lookup):
            is_8bit = lookup("load_in_8bit", False)
        else:
            is_8bit = getattr(quant_config, "load_in_8bit", False)

    # A truthy `is_8bit` already implies that a quantization config exists.
    if is_bitsandbytes and is_8bit and not self.enforce_eager:
        logger.warning(
            "CUDA graph is not supported on BitAndBytes 8bit yet, "
            "fallback to the eager mode.")
        self.enforce_eager = True
|
||||
|
||||
def verify_async_output_proc(self, parallel_config, speculative_config,
|
||||
device_config) -> None:
|
||||
if not self.use_async_output_proc:
|
||||
@ -401,13 +424,6 @@ class ModelConfig:
|
||||
"Pipeline parallelism is only supported for the following "
|
||||
f" architectures: {_PP_SUPPORTED_MODELS}.")
|
||||
|
||||
# Remove the constraint after the bitsandbytes issue is fixed:
|
||||
# https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
|
||||
if self.quantization == "bitsandbytes" and self.enforce_eager is False:
|
||||
logger.warning("CUDA graph is not supported on BitAndBytes yet, "
|
||||
"fallback to the eager mode.")
|
||||
self.enforce_eager = True
|
||||
|
||||
if pipeline_parallel_size > 1 and self.use_async_output_proc:
|
||||
logger.warning("Async output processor is not supported with "
|
||||
"pipeline parallelism currently. Disabling it.")
|
||||
|
||||
@ -121,12 +121,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
|
||||
def __init__(self, quant_config: BitsAndBytesConfig):
|
||||
try:
|
||||
import bitsandbytes
|
||||
if bitsandbytes.__version__ < "0.42.0":
|
||||
if bitsandbytes.__version__ < "0.44.0":
|
||||
raise ImportError("bitsandbytes version is wrong. Please "
|
||||
"install bitsandbytes>=0.42.0.")
|
||||
"install bitsandbytes>=0.44.0.")
|
||||
except ImportError as err:
|
||||
raise ImportError("Please install bitsandbytes>=0.42.0 via "
|
||||
"`pip install bitsandbytes>=0.42.0` to use "
|
||||
raise ImportError("Please install bitsandbytes>=0.44.0 via "
|
||||
"`pip install bitsandbytes>=0.44.0` to use "
|
||||
"bitsandbytes quantizer.") from err
|
||||
|
||||
self.quant_config = quant_config
|
||||
|
||||
@ -851,12 +851,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
|
||||
# only load the bitsandbytes module when needed
|
||||
try:
|
||||
import bitsandbytes
|
||||
if bitsandbytes.__version__ < "0.42.0":
|
||||
if bitsandbytes.__version__ < "0.44.0":
|
||||
raise ImportError("bitsandbytes version is wrong. Please "
|
||||
"install bitsandbytes>=0.42.0.")
|
||||
"install bitsandbytes>=0.44.0.")
|
||||
except ImportError as err:
|
||||
raise ImportError("Please install bitsandbytes>=0.42.0 via "
|
||||
"`pip install bitsandbytes>=0.42.0` to use "
|
||||
raise ImportError("Please install bitsandbytes>=0.44.0 via "
|
||||
"`pip install bitsandbytes>=0.44.0` to use "
|
||||
"bitsandbytes quantizer.") from err
|
||||
|
||||
hf_weights_files, use_safetensors = self._prepare_weights(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user