[Misc] Upgrade bitsandbytes to the latest version 0.44.0 (vllm-projec…
jeejeelee authored and liuyanyi committed Oct 6, 2024
1 parent c3f2a10 commit 3bb7b0b
Showing 7 changed files with 44 additions and 34 deletions.
2 changes: 1 addition & 1 deletion docs/source/quantization/bnb.rst
@@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.

.. code-block:: console
-   $ pip install bitsandbytes>=0.42.0
+   $ pip install bitsandbytes>=0.44.0
vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.

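As a hedged illustration of the two paths the doc mentions (in-flight quantization and pre-quantized checkpoints), a minimal vLLM snippet along these lines should work once bitsandbytes>=0.44.0 is installed; the model id and prompt below are placeholders, not part of this commit:

from vllm import LLM, SamplingParams

# In-flight quantization: start from an unquantized checkpoint and let vLLM
# quantize the weights while loading (placeholder model id).
llm = LLM(model="huggyllama/llama-7b",
          quantization="bitsandbytes",
          load_format="bitsandbytes")

# For a pre-quantized bitsandbytes checkpoint the same two arguments apply;
# the quantization details are read from the model's config file.
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)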
26 changes: 10 additions & 16 deletions examples/lora_with_quantization_inference.py
@@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
        # It quantizes the model when loading, with some config info from the
        # LoRA adapter repo. So need to set the parameter of load_format and
        # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            qlora_adapter_name_or_path=lora_repo,
-            load_format="bitsandbytes",
-            enable_lora=True,
-            max_lora_rank=64,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 qlora_adapter_name_or_path=lora_repo,
+                                 load_format="bitsandbytes",
+                                 enable_lora=True,
+                                 max_lora_rank=64)
    else:
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            enable_lora=True,
-            max_loras=4,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 enable_lora=True,
+                                 max_loras=4)
    return LLMEngine.from_engine_args(engine_args)


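For context, the engine returned by initialize_engine is typically driven with a LoRARequest; a rough sketch under that assumption (the request id, prompt handling, and helper name are illustrative, not from this diff) might look like:

from vllm import SamplingParams
from vllm.lora.request import LoRARequest

def generate_with_adapter(engine, prompt: str, lora_repo: str) -> str:
    # Attach the QLoRA adapter to this request; the integer id only needs to
    # be unique per adapter within the engine.
    engine.add_request("req-0",
                       prompt,
                       SamplingParams(temperature=0.0, max_tokens=64),
                       lora_request=LoRARequest("qlora-adapter", 1, lora_repo))
    # Step the engine until the request finishes, then return its text.
    while engine.has_unfinished_requests():
        for request_output in engine.step():
            if request_output.finished:
                return request_output.outputs[0].text
    return ""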
2 changes: 1 addition & 1 deletion requirements-test.txt
@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
aiohttp

# quantization
-bitsandbytes==0.42.0
+bitsandbytes>=0.44.0
buildkite-test-collector==0.1.8
2 changes: 1 addition & 1 deletion tests/quantization/test_bitsandbytes.py
@@ -107,7 +107,7 @@ def validate_generated_texts(hf_runner,
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
-                     enforce_eager=True,
+                     enforce_eager=False,
                     gpu_memory_utilization=0.8) as llm:
        vllm_outputs = llm.generate_greedy(prompts, 8)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
30 changes: 23 additions & 7 deletions vllm/config.py
@@ -222,6 +222,7 @@ def __init__(self,
        self._verify_embedding_mode()
        self._verify_quantization()
        self._verify_cuda_graph()
+        self._verify_bnb_config()

def _init_multimodal_config(
self, limit_mm_per_prompt: Optional[Mapping[str, int]]
@@ -337,6 +338,28 @@ def _verify_cuda_graph(self) -> None:
        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                          self.max_model_len)

+    def _verify_bnb_config(self) -> None:
+        """
+        The current version of bitsandbytes (0.44.0) with 8-bit models does not
+        yet support CUDA graph.
+        """
+        is_bitsandbytes = self.quantization == "bitsandbytes"
+        has_quantization_config = (getattr(self.hf_config,
+                                           "quantization_config", None)
+                                   is not None)
+        is_8bit = (self.hf_config.quantization_config.get(
+            "load_in_8bit", False) if has_quantization_config else False)
+        if all([
+                is_bitsandbytes,
+                has_quantization_config,
+                is_8bit,
+                not self.enforce_eager,
+        ]):
+            logger.warning(
+                "CUDA graph is not supported on BitAndBytes 8bit yet, "
+                "fallback to the eager mode.")
+            self.enforce_eager = True

def verify_async_output_proc(self, parallel_config, speculative_config,
device_config) -> None:
if not self.use_async_output_proc:
@@ -401,13 +424,6 @@ def verify_with_parallel_config(
"Pipeline parallelism is only supported for the following "
f" architectures: {_PP_SUPPORTED_MODELS}.")

# Remove the constraint after the bitsandbytes issue is fixed:
# https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
if self.quantization == "bitsandbytes" and self.enforce_eager is False:
logger.warning("CUDA graph is not supported on BitAndBytes yet, "
"fallback to the eager mode.")
self.enforce_eager = True

if pipeline_parallel_size > 1 and self.use_async_output_proc:
logger.warning("Async output processor is not supported with "
"pipeline parallelism currently. Disabling it.")
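The practical effect of moving the check into _verify_bnb_config is that only 8-bit bitsandbytes models are forced into eager mode, while 4-bit models can now use CUDA graphs. A hedged sketch of how that looks from the user side (both model ids are placeholders, not part of this commit):

from vllm import LLM

# 4-bit bitsandbytes checkpoint: enforce_eager is no longer overridden, so
# CUDA graphs can be captured as usual (placeholder model id).
llm_4bit = LLM(model="some-org/model-bnb-4bit",
               quantization="bitsandbytes",
               load_format="bitsandbytes")

# 8-bit bitsandbytes checkpoint (load_in_8bit=True in its quantization_config):
# the new check logs the warning from _verify_bnb_config and falls back to
# eager mode automatically.
llm_8bit = LLM(model="some-org/model-bnb-8bit",
               quantization="bitsandbytes",
               load_format="bitsandbytes")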
8 changes: 4 additions & 4 deletions vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -121,12 +121,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
    def __init__(self, quant_config: BitsAndBytesConfig):
        try:
            import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
        except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                              "bitsandbytes quantizer.") from err

        self.quant_config = quant_config
8 changes: 4 additions & 4 deletions vllm/model_executor/model_loader/loader.py
@@ -851,12 +851,12 @@ def _get_quantized_weights_iterator(
        # only load the bitsandbytes module when needed
        try:
            import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
        except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                              "bitsandbytes quantizer.") from err

        hf_weights_files, use_safetensors = self._prepare_weights(
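One caveat with the guard shown in both files: bitsandbytes.__version__ < "0.44.0" compares strings lexicographically, so a hypothetical "0.100.0" would be treated as too old. If that ever matters, a sketch using packaging's version parser (an extra dependency, not part of this commit) could replace the comparison:

from packaging.version import Version

import bitsandbytes

# Compare release numbers numerically instead of lexicographically.
if Version(bitsandbytes.__version__) < Version("0.44.0"):
    raise ImportError("Please install bitsandbytes>=0.44.0 via "
                      "`pip install bitsandbytes>=0.44.0`.")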
