[Misc] Upgrade bitsandbytes to the latest version 0.44.0 (#8768)
This commit is contained in:
parent 1e7d5c01f5
commit 13f9f7a3d0

@@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.
 
 .. code-block:: console
 
-   $ pip install bitsandbytes>=0.42.0
+   $ pip install bitsandbytes>=0.44.0
 
 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
 
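As a point of reference, a minimal sketch of in-flight quantization through the Python API with the bumped dependency; the model id and prompt are illustrative, not taken from this commit:

    from vllm import LLM

    # In-flight quantization: vLLM quantizes the full-precision weights while
    # loading them, via bitsandbytes (>=0.44.0 after this change). The model
    # id below is a placeholder for any supported checkpoint.
    llm = LLM(model="huggyllama/llama-7b",
              quantization="bitsandbytes",
              load_format="bitsandbytes")
    print(llm.generate("Hello, my name is")[0].outputs[0].text)
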
@@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
         # It quantizes the model when loading, with some config info from the
         # LoRA adapter repo. So need to set the parameter of load_format and
         # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            qlora_adapter_name_or_path=lora_repo,
-            load_format="bitsandbytes",
-            enable_lora=True,
-            max_lora_rank=64,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 qlora_adapter_name_or_path=lora_repo,
+                                 load_format="bitsandbytes",
+                                 enable_lora=True,
+                                 max_lora_rank=64)
     else:
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            enable_lora=True,
-            max_loras=4,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 enable_lora=True,
+                                 max_loras=4)
     return LLMEngine.from_engine_args(engine_args)
 
 
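For context, a hedged sketch of driving an LLMEngine built from EngineArgs like those above; the model id, prompt, and sampling values are illustrative rather than taken from this commit:

    from vllm import EngineArgs, LLMEngine, SamplingParams

    # Illustrative engine setup; the model id is a placeholder.
    engine = LLMEngine.from_engine_args(
        EngineArgs(model="huggyllama/llama-7b",
                   quantization="bitsandbytes",
                   load_format="bitsandbytes"))

    # Submit one greedy request and run the engine loop to completion.
    engine.add_request("request-0", "Hello, my name is",
                       SamplingParams(temperature=0.0, max_tokens=8))
    while engine.has_unfinished_requests():
        for request_output in engine.step():
            if request_output.finished:
                print(request_output.outputs[0].text)
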
@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
 aiohttp
 
 # quantization
-bitsandbytes==0.42.0
+bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.8

@@ -107,7 +107,7 @@ def validate_generated_texts(hf_runner,
                      quantization='bitsandbytes',
                      load_format='bitsandbytes',
                      tensor_parallel_size=vllm_tp_size,
-                     enforce_eager=True,
+                     enforce_eager=False,
                      gpu_memory_utilization=0.8) as llm:
         vllm_outputs = llm.generate_greedy(prompts, 8)
         vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

@@ -222,6 +222,7 @@ class ModelConfig:
         self._verify_embedding_mode()
         self._verify_quantization()
         self._verify_cuda_graph()
+        self._verify_bnb_config()
 
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]

@@ -337,6 +338,28 @@ class ModelConfig:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
+    def _verify_bnb_config(self) -> None:
+        """
+        The current version of bitsandbytes (0.44.0) with 8-bit models does not
+        yet support CUDA graph.
+        """
+        is_bitsandbytes = self.quantization == "bitsandbytes"
+        has_quantization_config = (getattr(self.hf_config,
+                                           "quantization_config", None)
+                                   is not None)
+        is_8bit = (self.hf_config.quantization_config.get(
+            "load_in_8bit", False) if has_quantization_config else False)
+        if all([
+                is_bitsandbytes,
+                has_quantization_config,
+                is_8bit,
+                not self.enforce_eager,
+        ]):
+            logger.warning(
+                "CUDA graph is not supported on BitAndBytes 8bit yet, "
+                "fallback to the eager mode.")
+            self.enforce_eager = True
+
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:

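To illustrate what the new check inspects, here is a representative quantization_config as a pre-quantized 8-bit bitsandbytes checkpoint would expose it through hf_config; the field values are assumed for illustration and not taken from this commit:

    # Representative contents of hf_config.quantization_config for a
    # pre-quantized 8-bit bitsandbytes checkpoint (values assumed for
    # illustration, not taken from this commit).
    quantization_config = {
        "quant_method": "bitsandbytes",
        "load_in_8bit": True,
        "load_in_4bit": False,
    }

    # With quantization="bitsandbytes" and enforce_eager=False, the added
    # _verify_bnb_config() finds load_in_8bit=True and falls back to eager
    # mode; 4-bit checkpoints (load_in_8bit False) keep CUDA graphs enabled.
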
@@ -401,13 +424,6 @@ class ModelConfig:
                     "Pipeline parallelism is only supported for the following "
                     f" architectures: {_PP_SUPPORTED_MODELS}.")
 
-        # Remove the constraint after the bitsandbytes issue is fixed:
-        # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
-        if self.quantization == "bitsandbytes" and self.enforce_eager is False:
-            logger.warning("CUDA graph is not supported on BitAndBytes yet, "
-                           "fallback to the eager mode.")
-            self.enforce_eager = True
-
         if pipeline_parallel_size > 1 and self.use_async_output_proc:
             logger.warning("Async output processor is not supported with "
                            "pipeline parallelism currently. Disabling it.")

@@ -121,12 +121,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: BitsAndBytesConfig):
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                               "bitsandbytes quantizer.") from err
 
         self.quant_config = quant_config

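A side note on the guard itself: __version__ < "0.44.0" compares strings lexically, which would misorder a hypothetical "0.100.0" below "0.44.0". A hedged sketch of the same check using the packaging library, which orders versions numerically:

    from packaging.version import Version

    import bitsandbytes

    # Numeric version comparison instead of a lexical string compare.
    if Version(bitsandbytes.__version__) < Version("0.44.0"):
        raise ImportError("Please install bitsandbytes>=0.44.0.")
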
@@ -851,12 +851,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         # only load the bitsandbytes module when needed
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                               "bitsandbytes quantizer.") from err
 
         hf_weights_files, use_safetensors = self._prepare_weights(