Enable BNB multi-backend support #31098

Merged
66 commits merged on Sep 24, 2024

Changes from 31 commits

Commits (66)
846f853
enable cpu bnb path
jiqing-feng May 29, 2024
6c56703
fix style
jiqing-feng May 29, 2024
3f02c9b
fix code style
jiqing-feng May 29, 2024
9ccbf10
fix 4 bit path
jiqing-feng May 29, 2024
89fa5ef
Update src/transformers/utils/import_utils.py
jiqing-feng Jul 17, 2024
a52d7af
add multi backend refactor tests
jiqing-feng Jul 17, 2024
6f67862
fix style
jiqing-feng Jul 17, 2024
ee23eb0
tweak 4bit quantizer + fix corresponding tests
Titus-von-Koeller Jul 30, 2024
678e673
tweak 8bit quantizer + *try* fixing corresponding tests
Titus-von-Koeller Jul 30, 2024
0858b3e
fix dequant bnb 8bit
jiqing-feng Aug 1, 2024
c76d243
account for Intel CPU in variability of expected outputs
Titus-von-Koeller Aug 1, 2024
5843f28
enable cpu and xpu device map
jiqing-feng Aug 7, 2024
1a864a8
further tweaks to account for Intel CPU
Titus-von-Koeller Aug 2, 2024
f3753fc
fix autocast to work with both cpu + cuda
Titus-von-Koeller Aug 13, 2024
0cc1b7e
fix comments
Titus-von-Koeller Aug 14, 2024
b611812
fix comments
Titus-von-Koeller Aug 14, 2024
ab4836e
switch to testing_utils.torch_device
Titus-von-Koeller Aug 14, 2024
7399500
allow for xpu in multi-gpu tests
Titus-von-Koeller Aug 18, 2024
b41059c
fix tests 4bit for CPU NF4
jiqing-feng Aug 20, 2024
1a7a6fe
fix bug with is_torch_xpu_available needing to be called as func
Titus-von-Koeller Aug 20, 2024
87983df
avoid issue where test reports attr err due to other failure
Titus-von-Koeller Aug 20, 2024
7f17188
fix formatting
Titus-von-Koeller Aug 21, 2024
bb3ba4a
fix typo from resolving of merge conflict
Titus-von-Koeller Aug 21, 2024
463c211
polish based on last PR review
Titus-von-Koeller Aug 22, 2024
6d89ee4
fix CI
jiqing-feng Aug 28, 2024
7e01cfb
Update src/transformers/integrations/integration_utils.py
jiqing-feng Aug 29, 2024
9bffc93
Update src/transformers/integrations/integration_utils.py
jiqing-feng Aug 29, 2024
01b7587
fix error log
jiqing-feng Aug 29, 2024
171b130
fix error msg
jiqing-feng Aug 29, 2024
5e9bf9a
add \n in error log
jiqing-feng Aug 29, 2024
496c046
make quality
jiqing-feng Aug 29, 2024
86d0016
rm bnb cuda restriction in doc
jiqing-feng Aug 30, 2024
1c96ae9
cpu model don't need dispatch
jiqing-feng Sep 3, 2024
495354e
Merge branch 'main' into bnb_cpu
jiqing-feng Sep 3, 2024
3aec626
fix doc
jiqing-feng Sep 3, 2024
daa1e27
fix style
jiqing-feng Sep 3, 2024
d55db0e
check cuda available in testing
jiqing-feng Sep 5, 2024
a21a916
fix tests
jiqing-feng Sep 5, 2024
8ad17e8
Update docs/source/en/model_doc/chameleon.md
jiqing-feng Sep 11, 2024
107e02b
Update docs/source/en/model_doc/llava_next.md
jiqing-feng Sep 11, 2024
20f6b5e
Update tests/quantization/bnb/test_4bit.py
jiqing-feng Sep 11, 2024
9ac038e
Update tests/quantization/bnb/test_4bit.py
jiqing-feng Sep 11, 2024
3bab7d7
fix doc
jiqing-feng Sep 11, 2024
968d9c5
Merge branch 'huggingface:main' into bnb_cpu
jiqing-feng Sep 11, 2024
08f31f8
fix check multibackends
jiqing-feng Sep 11, 2024
9eb0970
fix import sort
jiqing-feng Sep 11, 2024
b506b98
remove check torch in bnb
jiqing-feng Sep 11, 2024
2be4169
docs: update bitsandbytes references with multi-backend info
Titus-von-Koeller Sep 11, 2024
e607b7c
docs: fix small mistakes in bnb paragraph
Titus-von-Koeller Sep 11, 2024
ac108c6
run formatting
Titus-von-Koeller Sep 11, 2024
82dcb0d
Merge remote-tracking branch 'origin/main' into bnb_cpu
Titus-von-Koeller Sep 11, 2024
c66e7e7
revert bnb check
jiqing-feng Sep 12, 2024
8f25ee2
move bnb multi-backend check to import_utils
jiqing-feng Sep 13, 2024
a4333cb
Update src/transformers/utils/import_utils.py
jiqing-feng Sep 14, 2024
32cbb8d
fix bnb check
jiqing-feng Sep 14, 2024
4ce4b55
minor fix for bnb
jiqing-feng Sep 14, 2024
937ed3b
check lib first
jiqing-feng Sep 14, 2024
e40f284
fix code style
jiqing-feng Sep 14, 2024
03dd03b
Merge branch 'huggingface:main' into bnb_cpu
jiqing-feng Sep 14, 2024
b8093ce
Revert "run formatting"
jiqing-feng Sep 14, 2024
0551d23
fix format
jiqing-feng Sep 14, 2024
e33e43b
give warning when bnb version is low and no cuda found
jiqing-feng Sep 18, 2024
ced3c28
Merge branch 'huggingface:main' into bnb_cpu
jiqing-feng Sep 18, 2024
170dd58
fix device assignment check to be multi-device capable
Titus-von-Koeller Sep 22, 2024
9ba4a5e
address akx feedback on get_avlbl_dev fn
Titus-von-Koeller Sep 23, 2024
594f6f8
we don't want the function to be public, as docs would be too much
Titus-von-Koeller Sep 24, 2024
2 changes: 2 additions & 0 deletions src/transformers/integrations/__init__.py
@@ -93,6 +93,7 @@
"run_hp_search_ray",
"run_hp_search_sigopt",
"run_hp_search_wandb",
"validate_bnb_backend_availability",
],
"peft": ["PeftAdapterMixin"],
"quanto": ["replace_with_quanto_layers"],
@@ -175,6 +176,7 @@
run_hp_search_ray,
run_hp_search_sigopt,
run_hp_search_wandb,
validate_bnb_backend_availability,
)
from .peft import PeftAdapterMixin
from .quanto import replace_with_quanto_layers
17 changes: 12 additions & 5 deletions src/transformers/integrations/bitsandbytes.py
@@ -6,7 +6,11 @@

from packaging import version

from ..utils import is_accelerate_available, is_bitsandbytes_available, logging
from ..utils import (
is_accelerate_available,
is_bitsandbytes_available,
logging,
)


if is_bitsandbytes_available():
@@ -332,7 +336,7 @@ def get_keys_to_not_convert(model):


# Copied from PEFT: https://github.com/huggingface/peft/blob/47b3712898539569c02ec5b3ed4a6c36811331a1/src/peft/utils/integrations.py#L41
def dequantize_bnb_weight(weight: "torch.nn.Parameter", state=None):
def dequantize_bnb_weight(weight: "torch.nn.Parameter", dtype: "torch.dtype", state=None):
"""
Helper function to dequantize 4bit or 8bit bnb weights.

@@ -350,7 +354,7 @@ def dequantize_bnb_weight(weight: "torch.nn.Parameter", state=None):
logger.warning_once(
f"The model is going to be dequantized in {output_tensor.dtype} - if you want to upcast it to another dtype, make sure to pass the desired dtype when quantizing the model through `bnb_4bit_quant_type` argument of `BitsAndBytesConfig`"
)
return output_tensor
return output_tensor.to(dtype)

if state.SCB is None:
state.SCB = weight.SCB
@@ -361,7 +365,7 @@ def dequantize_bnb_weight(weight: "torch.nn.Parameter", state=None):
if state.CxB is None:
state.CxB, state.SB = bnb.functional.transform(weight.data, to_order=state.formatB)
out32, Sout32 = bnb.functional.igemmlt(im, state.CxB, Sim, state.SB)
return bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t()
return bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t().to(dtype)


def _create_accelerate_new_hook(old_hook):
@@ -383,6 +387,7 @@ def _create_accelerate_new_hook(old_hook):

def _dequantize_and_replace(
model,
dtype,
modules_to_not_convert=None,
current_key_name=None,
quantization_config=None,
@@ -422,7 +427,7 @@ def _dequantize_and_replace(
else:
state = None

new_module.weight = torch.nn.Parameter(dequantize_bnb_weight(module.weight, state))
new_module.weight = torch.nn.Parameter(dequantize_bnb_weight(module.weight, dtype, state))

if bias is not None:
new_module.bias = bias
@@ -440,6 +445,7 @@ def _dequantize_and_replace(
if len(list(module.children())) > 0:
_, has_been_replaced = _dequantize_and_replace(
module,
dtype,
modules_to_not_convert,
current_key_name,
quantization_config,
@@ -457,6 +463,7 @@ def dequantize_and_replace(
):
model, has_been_replaced = _dequantize_and_replace(
model,
model.dtype,
modules_to_not_convert=modules_to_not_convert,
quantization_config=quantization_config,
)
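As an illustration (not part of the diff), a minimal sketch of how the new dtype threading is exercised: `dequantize_and_replace` reads `model.dtype` once and passes it down, so the dequantized weights come back in the model's original dtype instead of defaulting to float32. It assumes a working bitsandbytes install (CUDA or the multi-backend build); the checkpoint name is a small placeholder.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from transformers.integrations.bitsandbytes import dequantize_and_replace

model_id = "facebook/opt-125m"  # placeholder checkpoint, chosen only because it is small
quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quant_config)

# Replace the bnb 4-bit/8-bit linear modules with plain nn.Linear; the weights are
# cast back to model.dtype inside dequantize_bnb_weight (see the hunks above).
model = dequantize_and_replace(model, quantization_config=quant_config)
print(next(model.parameters()).dtype)
```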
69 changes: 69 additions & 0 deletions src/transformers/integrations/integration_utils.py
@@ -38,7 +38,9 @@
from ..utils import (
PushToHubMixin,
flatten_dict,
get_available_devices,
is_datasets_available,
is_ipex_available,
is_pandas_available,
is_tf_available,
is_torch_available,
@@ -204,6 +206,73 @@ def is_dvclive_available():
return importlib.util.find_spec("dvclive") is not None


def _validate_bnb_multi_backend_availability(raise_exception):
import bitsandbytes as bnb

bnb_supported_devices = getattr(bnb, "supported_torch_devices", set())
available_devices = get_available_devices()

if available_devices == {"cpu"} and not is_ipex_available():
from importlib.util import find_spec

if find_spec("intel_extension_for_pytorch"):
logger.warning(
"You have Intel IPEX installed but if you're intending to use it for CPU, it might not have the right version. Be sure to double check that your PyTorch and IPEX installs are compatible."
)

available_devices.discard("cpu") # Only Intel CPU is supported by BNB at the moment

if not available_devices.intersection(bnb_supported_devices):
if raise_exception:
bnb_supported_devices_with_info = set( # noqa: C401
'"cpu" (needs an Intel CPU and intel_extension_for_pytorch installed and compatible with the PyTorch version)'
if device == "cpu"
else device
for device in bnb_supported_devices
)
err_msg = (
f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`. "
"Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend"
)

logger.error(err_msg)
raise RuntimeError(err_msg)

logger.warning("No supported devices found for bitsandbytes multi-backend.")
return False

logger.debug("Multi-backend validation successful.")
return True


def _validate_bnb_cuda_backend_availability(raise_exception):
if not torch.cuda.is_available():
log_msg = (
"CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. "
"Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend"
)
if raise_exception:
logger.error(log_msg)
raise RuntimeError(log_msg)

logger.warning(log_msg)
return False

logger.debug("CUDA backend validation successful.")
return True


def validate_bnb_backend_availability(raise_exception=False):
"""
Validates if the available devices are supported by bitsandbytes, optionally raising an exception if not.
"""
import bitsandbytes as bnb

if "multi_backend" in getattr(bnb, "features", set()):
return _validate_bnb_multi_backend_availability(raise_exception)
return _validate_bnb_cuda_backend_availability(raise_exception)


def hp_params(trial):
if is_optuna_available():
import optuna
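A short usage sketch (not in the diff) of the new helper, mirroring how the quantizers below call it; it requires bitsandbytes to be installed, since the function imports it. With `raise_exception=False` it logs and returns a boolean, with `raise_exception=True` an unsupported setup becomes a `RuntimeError`.

```python
from transformers.integrations import validate_bnb_backend_availability

# Soft check: returns False and logs a warning if no supported device is found.
if not validate_bnb_backend_availability(raise_exception=False):
    print("bitsandbytes has no usable backend on this machine")

# Hard check, as used in validate_environment of the 4-bit/8-bit quantizers below.
validate_bnb_backend_availability(raise_exception=True)
```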
23 changes: 18 additions & 5 deletions src/transformers/quantizers/quantizer_bnb_4bit.py
@@ -28,6 +28,7 @@
is_accelerate_available,
is_bitsandbytes_available,
is_torch_available,
is_torch_xpu_available,
logging,
)

@@ -64,8 +65,6 @@ def __init__(self, quantization_config, **kwargs):
self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules

def validate_environment(self, *args, **kwargs):
if not torch.cuda.is_available():
raise RuntimeError("No GPU found. A GPU is needed for quantization.")
if not is_accelerate_available():
raise ImportError(
f"Using `bitsandbytes` 4-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`"
@@ -74,6 +73,13 @@ def validate_environment(self, *args, **kwargs):
raise ImportError(
"Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`"
)
import bitsandbytes as bnb

bnb_multibackend_is_enabled = "multi_backend" in getattr(bnb, "features", set())

from ..integrations.integration_utils import validate_bnb_backend_availability

validate_bnb_backend_availability(raise_exception=True)

if kwargs.get("from_tf", False) or kwargs.get("from_flax", False):
raise ValueError(
@@ -90,7 +96,9 @@ def validate_environment(self, *args, **kwargs):
device_map_without_lm_head = {
key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert
}
if "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values():
if set(device_map.values()) == {"cpu"} and bnb_multibackend_is_enabled:
pass
elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values():
raise ValueError(
"Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the "
"quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules "
@@ -249,10 +257,15 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
# Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer.update_device_map
def update_device_map(self, device_map):
if device_map is None:
device_map = {"": torch.cuda.current_device()}
if torch.cuda.is_available():
device_map = {"": torch.cuda.current_device()}
elif is_torch_xpu_available():
device_map = {"": f"xpu:{torch.xpu.current_device()}"}
else:
device_map = {"": "cpu"}
logger.info(
"The device_map was not initialized. "
"Setting device_map to {'':torch.cuda.current_device()}. "
f"Setting device_map to {device_map}. "
"If you want to use the model for inference, please set device_map ='auto' "
)
return device_map
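Taken together, these changes let a CPU-only (or XPU) setup pass `validate_environment` when bitsandbytes reports the `multi_backend` feature. A hedged end-to-end sketch; the checkpoint is a placeholder, and the call only succeeds with a multi-backend bitsandbytes build installed.

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")

# With multi-backend bnb, an all-CPU device_map is now accepted instead of raising
# the former "No GPU found" / "Some modules are dispatched on the CPU" errors.
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",            # placeholder checkpoint
    quantization_config=quant_config,
    device_map={"": "cpu"},
)
```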
24 changes: 18 additions & 6 deletions src/transformers/quantizers/quantizer_bnb_8bit.py
@@ -27,6 +27,7 @@
is_accelerate_available,
is_bitsandbytes_available,
is_torch_available,
is_torch_xpu_available,
logging,
)
from .quantizers_utils import get_module_from_name
@@ -64,9 +65,6 @@ def __init__(self, quantization_config, **kwargs):
self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules

def validate_environment(self, *args, **kwargs):
if not torch.cuda.is_available():
raise RuntimeError("No GPU found. A GPU is needed for quantization.")

if not is_accelerate_available():
raise ImportError(
f"Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`"
@@ -75,6 +73,13 @@ def validate_environment(self, *args, **kwargs):
raise ImportError(
"Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`"
)
import bitsandbytes as bnb

bnb_multibackend_is_enabled = "multi_backend" in getattr(bnb, "features", set())

from ..integrations.integration_utils import validate_bnb_backend_availability

validate_bnb_backend_availability(raise_exception=True)

if kwargs.get("from_tf", False) or kwargs.get("from_flax", False):
raise ValueError(
@@ -91,7 +96,9 @@ def validate_environment(self, *args, **kwargs):
device_map_without_lm_head = {
key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert
}
if "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values():
if set(device_map.values()) == {"cpu"} and bnb_multibackend_is_enabled:
pass
elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values():
raise ValueError(
"Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the "
"quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules "
@@ -127,10 +134,15 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":

def update_device_map(self, device_map):
if device_map is None:
device_map = {"": torch.cuda.current_device()}
if torch.cuda.is_available():
device_map = {"": torch.cuda.current_device()}
elif is_torch_xpu_available():
device_map = {"": f"xpu:{torch.xpu.current_device()}"}
else:
device_map = {"": "cpu"}
logger.info(
"The device_map was not initialized. "
"Setting device_map to {'':torch.cuda.current_device()}. "
f"Setting device_map to {device_map}. "
"If you want to use the model for inference, please set device_map ='auto' "
)
return device_map
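The `update_device_map` fallback is identical in both quantizers. A standalone sketch of the selection order (CUDA, then XPU, then CPU), assuming only that `is_torch_xpu_available` behaves as in `transformers.utils`:

```python
import torch
from transformers.utils import is_torch_xpu_available

# Mirror of the fallback added above: prefer CUDA, then XPU, then plain CPU.
if torch.cuda.is_available():
    device_map = {"": torch.cuda.current_device()}
elif is_torch_xpu_available():
    device_map = {"": f"xpu:{torch.xpu.current_device()}"}
else:
    device_map = {"": "cpu"}

print(device_map)
```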
50 changes: 48 additions & 2 deletions src/transformers/testing_utils.py
@@ -221,6 +221,17 @@ def parse_int_from_env(key, default=None):
_run_third_party_device_tests = parse_flag_from_env("RUN_THIRD_PARTY_DEVICE_TESTS", default=False)


def get_device_count():
import torch

if is_torch_xpu_available():
num_devices = torch.xpu.device_count()
else:
num_devices = torch.cuda.device_count()

return num_devices


def is_pt_tf_cross_test(test_case):
"""
Decorator marking a test as a test that control interactions between PyTorch and TensorFlow.
@@ -328,6 +339,29 @@ def tooslow(test_case):
return unittest.skip(reason="test is too slow")(test_case)


def skip_if_not_implemented(test_func):
@functools.wraps(test_func)
def wrapper(*args, **kwargs):
try:
return test_func(*args, **kwargs)
except NotImplementedError as e:
raise unittest.SkipTest(f"Test skipped due to NotImplementedError: {e}")

return wrapper


def apply_skip_if_not_implemented(cls):
"""
Class decorator to apply @skip_if_not_implemented to all test methods.
"""
for attr_name in dir(cls):
if attr_name.startswith("test_"):
attr = getattr(cls, attr_name)
if callable(attr):
setattr(cls, attr_name, skip_if_not_implemented(attr))
return cls


def custom_tokenizers(test_case):
"""
Decorator marking a test for a custom tokenizer.
@@ -725,9 +759,9 @@ def require_torch_multi_gpu(test_case):
if not is_torch_available():
return unittest.skip(reason="test requires PyTorch")(test_case)

import torch
device_count = get_device_count()

return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case)
return unittest.skipUnless(device_count > 1, "test requires multiple GPUs")(test_case)


def require_torch_multi_accelerator(test_case):
@@ -927,6 +961,18 @@ def require_torch_gpu(test_case):
return unittest.skipUnless(torch_device == "cuda", "test requires CUDA")(test_case)


def require_torch_gpu_if_bnb_not_multi_backend_enabled(test_case):
"""
Decorator marking a test that requires a GPU if bitsandbytes multi-backend feature is not enabled.
"""
if is_bitsandbytes_available():
import bitsandbytes as bnb

if hasattr(bnb, "features") and "multi_backend" in bnb.features:
return test_case
return require_torch_gpu(test_case)


def require_torch_accelerator(test_case):
"""Decorator marking a test that requires an accessible accelerator and PyTorch."""
return unittest.skipUnless(torch_device is not None and torch_device != "cpu", "test requires accelerator")(
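A hypothetical sketch of how the new testing helpers compose; only the decorators come from the diff above, the test class and method are made up for illustration.

```python
import unittest

from transformers.testing_utils import (
    apply_skip_if_not_implemented,
    require_torch_gpu_if_bnb_not_multi_backend_enabled,
)


@apply_skip_if_not_implemented                        # NotImplementedError -> skipped test
@require_torch_gpu_if_bnb_not_multi_backend_enabled   # only require CUDA when bnb lacks multi_backend
class Bnb4BitSmokeTest(unittest.TestCase):
    def test_placeholder(self):
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()
```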