diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 9d6dc9d82b33b7..adb2b4919d7791 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -2786,10 +2786,10 @@ def from_pretrained(
             model = replace_with_bnb_linear(
                 model, modules_to_not_convert=modules_to_not_convert, quantization_config=quantization_config
             )
-            # training in 8-bit is only available in 0.37.0+
+            # training in 8-bit is only available in 0.37.0+ but a major bug in 8-bit optimizers was fixed in 0.41.1
             model._is_quantized_training_enabled = version.parse(
                 importlib.metadata.version("bitsandbytes")
-            ) >= version.parse("0.37.0")
+            ) >= version.parse("0.41.1")

             model.config.quantization_config = quantization_config
             model.is_8bit_serializable = is_8bit_serializable
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index cb4c0432102733..ec74c45fa304dd 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -404,7 +404,7 @@ def __init__(
            else:
                raise ValueError(
                    "The model you want to train is loaded in 8-bit precision. if you want to fine-tune an 8-bit"
-                    " model, please make sure that you have installed `bitsandbytes>=0.37.0`. "
+                    " model, please make sure that you have installed `bitsandbytes>=0.41.1`. "
                )

        # Setup Sharded DDP training
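For reference, a minimal standalone sketch of the version-gating pattern the patch relies on: check the installed bitsandbytes version with `importlib.metadata` and compare it against the 0.41.1 threshold using `packaging.version`. The helper name `is_8bit_training_supported` is hypothetical and only for illustration; it is not part of the transformers API.

```python
import importlib.metadata

from packaging import version

# Threshold from the patch above: 0.41.1 fixed a major bug in the 8-bit optimizers,
# so 8-bit (quantized) training is only enabled from that release onwards.
MIN_BNB_VERSION = version.parse("0.41.1")


def is_8bit_training_supported() -> bool:
    """Return True if the installed bitsandbytes is recent enough for 8-bit training.

    Hypothetical helper mirroring the check done in `from_pretrained`.
    """
    try:
        installed = version.parse(importlib.metadata.version("bitsandbytes"))
    except importlib.metadata.PackageNotFoundError:
        # bitsandbytes is not installed at all
        return False
    return installed >= MIN_BNB_VERSION


if __name__ == "__main__":
    print("8-bit training supported:", is_8bit_training_supported())
```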