Enable BNB multi-backend support #31098

Merged: 66 commits, merged Sep 24, 2024 (changes shown from 53 of 66 commits).

Commits
846f853
enable cpu bnb path
jiqing-feng May 29, 2024
6c56703
fix style
jiqing-feng May 29, 2024
3f02c9b
fix code style
jiqing-feng May 29, 2024
9ccbf10
fix 4 bit path
jiqing-feng May 29, 2024
89fa5ef
Update src/transformers/utils/import_utils.py
jiqing-feng Jul 17, 2024
a52d7af
add multi backend refactor tests
jiqing-feng Jul 17, 2024
6f67862
fix style
jiqing-feng Jul 17, 2024
ee23eb0
tweak 4bit quantizer + fix corresponding tests
Titus-von-Koeller Jul 30, 2024
678e673
tweak 8bit quantizer + *try* fixing corresponding tests
Titus-von-Koeller Jul 30, 2024
0858b3e
fix dequant bnb 8bit
jiqing-feng Aug 1, 2024
c76d243
account for Intel CPU in variability of expected outputs
Titus-von-Koeller Aug 1, 2024
5843f28
enable cpu and xpu device map
jiqing-feng Aug 7, 2024
1a864a8
further tweaks to account for Intel CPU
Titus-von-Koeller Aug 2, 2024
f3753fc
fix autocast to work with both cpu + cuda
Titus-von-Koeller Aug 13, 2024
0cc1b7e
fix comments
Titus-von-Koeller Aug 14, 2024
b611812
fix comments
Titus-von-Koeller Aug 14, 2024
ab4836e
switch to testing_utils.torch_device
Titus-von-Koeller Aug 14, 2024
7399500
allow for xpu in multi-gpu tests
Titus-von-Koeller Aug 18, 2024
b41059c
fix tests 4bit for CPU NF4
jiqing-feng Aug 20, 2024
1a7a6fe
fix bug with is_torch_xpu_available needing to be called as func
Titus-von-Koeller Aug 20, 2024
87983df
avoid issue where test reports attr err due to other failure
Titus-von-Koeller Aug 20, 2024
7f17188
fix formatting
Titus-von-Koeller Aug 21, 2024
bb3ba4a
fix typo from resolving of merge conflict
Titus-von-Koeller Aug 21, 2024
463c211
polish based on last PR review
Titus-von-Koeller Aug 22, 2024
6d89ee4
fix CI
jiqing-feng Aug 28, 2024
7e01cfb
Update src/transformers/integrations/integration_utils.py
jiqing-feng Aug 29, 2024
9bffc93
Update src/transformers/integrations/integration_utils.py
jiqing-feng Aug 29, 2024
01b7587
fix error log
jiqing-feng Aug 29, 2024
171b130
fix error msg
jiqing-feng Aug 29, 2024
5e9bf9a
add \n in error log
jiqing-feng Aug 29, 2024
496c046
make quality
jiqing-feng Aug 29, 2024
86d0016
rm bnb cuda restriction in doc
jiqing-feng Aug 30, 2024
1c96ae9
cpu model don't need dispatch
jiqing-feng Sep 3, 2024
495354e
Merge branch 'main' into bnb_cpu
jiqing-feng Sep 3, 2024
3aec626
fix doc
jiqing-feng Sep 3, 2024
daa1e27
fix style
jiqing-feng Sep 3, 2024
d55db0e
check cuda avaliable in testing
jiqing-feng Sep 5, 2024
a21a916
fix tests
jiqing-feng Sep 5, 2024
8ad17e8
Update docs/source/en/model_doc/chameleon.md
jiqing-feng Sep 11, 2024
107e02b
Update docs/source/en/model_doc/llava_next.md
jiqing-feng Sep 11, 2024
20f6b5e
Update tests/quantization/bnb/test_4bit.py
jiqing-feng Sep 11, 2024
9ac038e
Update tests/quantization/bnb/test_4bit.py
jiqing-feng Sep 11, 2024
3bab7d7
fix doc
jiqing-feng Sep 11, 2024
968d9c5
Merge branch 'huggingface:main' into bnb_cpu
jiqing-feng Sep 11, 2024
08f31f8
fix check multibackends
jiqing-feng Sep 11, 2024
9eb0970
fix import sort
jiqing-feng Sep 11, 2024
b506b98
remove check torch in bnb
jiqing-feng Sep 11, 2024
2be4169
docs: update bitsandbytes references with multi-backend info
Titus-von-Koeller Sep 11, 2024
e607b7c
docs: fix small mistakes in bnb paragraph
Titus-von-Koeller Sep 11, 2024
ac108c6
run formatting
Titus-von-Koeller Sep 11, 2024
82dcb0d
Merge remote-tracking branch 'origin/main' into bnb_cpu
Titus-von-Koeller Sep 11, 2024
c66e7e7
reveret bnb check
jiqing-feng Sep 12, 2024
8f25ee2
move bnb multi-backend check to import_utils
jiqing-feng Sep 13, 2024
a4333cb
Update src/transformers/utils/import_utils.py
jiqing-feng Sep 14, 2024
32cbb8d
fix bnb check
jiqing-feng Sep 14, 2024
4ce4b55
minor fix for bnb
jiqing-feng Sep 14, 2024
937ed3b
check lib first
jiqing-feng Sep 14, 2024
e40f284
fix code style
jiqing-feng Sep 14, 2024
03dd03b
Merge branch 'huggingface:main' into bnb_cpu
jiqing-feng Sep 14, 2024
b8093ce
Revert "run formatting"
jiqing-feng Sep 14, 2024
0551d23
fix format
jiqing-feng Sep 14, 2024
e33e43b
give warning when bnb version is low and no cuda found]
jiqing-feng Sep 18, 2024
ced3c28
Merge branch 'huggingface:main' into bnb_cpu
jiqing-feng Sep 18, 2024
170dd58
fix device assignment check to be multi-device capable
Titus-von-Koeller Sep 22, 2024
9ba4a5e
address akx feedback on get_avlbl_dev fn
Titus-von-Koeller Sep 23, 2024
594f6f8
we don't want the function tat publicc, as docs would be too much
Titus-von-Koeller Sep 24, 2024
184 changes: 118 additions & 66 deletions .circleci/create_circleci_config.py

Large diffs are not rendered by default.

28 changes: 16 additions & 12 deletions .circleci/parse_test_outputs.py
@@ -1,53 +1,57 @@
import re
import argparse
import re


def parse_pytest_output(file_path):
skipped_tests = {}
skipped_count = 0
with open(file_path, 'r') as file:
with open(file_path, "r") as file:
for line in file:
match = re.match(r'^SKIPPED \[(\d+)\] (tests/.*): (.*)$', line)
match = re.match(r"^SKIPPED \[(\d+)\] (tests/.*): (.*)$", line)
if match:
skipped_count += 1
test_file, test_line, reason = match.groups()
skipped_tests[reason] = skipped_tests.get(reason, []) + [(test_file, test_line)]
for k,v in sorted(skipped_tests.items(), key=lambda x:len(x[1])):
for k, v in sorted(skipped_tests.items(), key=lambda x: len(x[1])):
print(f"{len(v):4} skipped because: {k}")
print("Number of skipped tests:", skipped_count)


def parse_pytest_failure_output(file_path):
failed_tests = {}
failed_count = 0
with open(file_path, 'r') as file:
with open(file_path, "r") as file:
for line in file:
match = re.match(r'^FAILED (tests/.*) - (.*): (.*)$', line)
match = re.match(r"^FAILED (tests/.*) - (.*): (.*)$", line)
if match:
failed_count += 1
_, error, reason = match.groups()
failed_tests[reason] = failed_tests.get(reason, []) + [error]
for k,v in sorted(failed_tests.items(), key=lambda x:len(x[1])):
for k, v in sorted(failed_tests.items(), key=lambda x: len(x[1])):
print(f"{len(v):4} failed because `{v[0]}` -> {k}")
print("Number of failed tests:", failed_count)
if failed_count>0:
if failed_count > 0:
exit(1)


def parse_pytest_errors_output(file_path):
print(file_path)
error_tests = {}
error_count = 0
with open(file_path, 'r') as file:
with open(file_path, "r") as file:
for line in file:
match = re.match(r'^ERROR (tests/.*) - (.*): (.*)$', line)
match = re.match(r"^ERROR (tests/.*) - (.*): (.*)$", line)
if match:
error_count += 1
_, test_error, reason = match.groups()
error_tests[reason] = error_tests.get(reason, []) + [test_error]
for k,v in sorted(error_tests.items(), key=lambda x:len(x[1])):
for k, v in sorted(error_tests.items(), key=lambda x: len(x[1])):
print(f"{len(v):4} errored out because of `{v[0]}` -> {k}")
print("Number of errors:", error_count)
if error_count>0:
if error_count > 0:
exit(1)


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--file", help="file to parse")
2 changes: 0 additions & 2 deletions benchmark/benchmark.py
@@ -31,9 +31,7 @@
from pathlib import Path

from git import Repo

from huggingface_hub import HfApi

from optimum_benchmark import Benchmark
from optimum_benchmark_wrapper import main

2 changes: 1 addition & 1 deletion docs/source/en/llm_tutorial_optimization.md
@@ -181,7 +181,7 @@ for every matrix multiplication. Dequantization and re-quantization is performed

Therefore, inference time is often **not** reduced when using quantized weights, but rather increases.
Enough theory, let's give it a try! To quantize the weights with Transformers, you need to make sure that
the [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library is installed.
the [`bitsandbytes`](https://github.com/bitsandbytes-foundation/bitsandbytes) library is installed.

```bash
!pip install bitsandbytes
# ... (snippet truncated in the diff view)
```
12 changes: 11 additions & 1 deletion docs/source/en/model_doc/chameleon.md
@@ -128,7 +128,17 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza

### Quantization using Bitsandbytes

The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. Simply change the snippet above with:
The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and to have access to a GPU/accelerator that is supported by the library.

<Tip>

bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).

We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.

</Tip>

Simply change the snippet above with:

```python
from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig
# ... (snippet truncated in the diff view)
```
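The diff view truncates this snippet at the import. Purely as orientation, a hedged sketch of the kind of 4-bit load the updated text describes; the checkpoint id and dtype below are assumptions, not taken from this PR:

```python
import torch
from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig

# Illustrative 4-bit NF4 load; the checkpoint id is an assumption.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = ChameleonForConditionalGeneration.from_pretrained(
    "facebook/chameleon-7b",
    quantization_config=quantization_config,
    device_map="auto",
)
```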
12 changes: 11 additions & 1 deletion docs/source/en/model_doc/llava_next.md
@@ -233,7 +233,17 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza

### Quantization using Bitsandbytes

The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. Simply change the snippet above with:
The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`, and to have access to a GPU/accelerator that is supported by the library.

<Tip>

bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).

We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.

</Tip>

Simply change the snippet above with:

```python
from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig
# ... (snippet truncated in the diff view)
```
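The snippet is truncated here as well. A hedged sketch of an 8-bit variant consistent with the updated wording; the checkpoint id is an assumption:

```python
from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig

# Illustrative 8-bit load; the checkpoint id is an assumption.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    quantization_config=quantization_config,
    device_map="auto",
)
```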
12 changes: 11 additions & 1 deletion docs/source/en/model_doc/llava_next_video.md
@@ -205,7 +205,17 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza

The model can be loaded in lower bits, significantly reducing memory burden while maintaining the performance of the original model. This allows for efficient deployment on resource-constrained cases.

First make sure to install bitsandbytes by running `pip install bitsandbytes` and to have access to a CUDA compatible GPU device. Load the quantized model by simply adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below:
First, make sure to install bitsandbytes by running `pip install bitsandbytes` and to have access to a GPU/accelerator that is supported by the library.

<Tip>

bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).

We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.

</Tip>

Then simply load the quantized model by adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below:


```python
# ... (snippet truncated in the diff view)
```
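The code block is empty in the truncated diff. A hedged sketch of the `BitsAndBytesConfig` usage the text refers to; the class and checkpoint names are assumptions:

```python
import torch
from transformers import LlavaNextVideoForConditionalGeneration, BitsAndBytesConfig

# Illustrative 4-bit load with nested (double) quantization; names are assumptions.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    quantization_config=quantization_config,
    device_map="auto",
)
```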
14 changes: 12 additions & 2 deletions docs/source/en/model_doc/llava_onevision.md
@@ -264,9 +264,19 @@ processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spac

## Model optimization

### Quantization using Bitsandbytes
### Quantization using bitsandbytes

The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. Simply change the snippet above with:
The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a GPU/accelerator that is supported by the library.

<Tip>

bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).

We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.

</Tip>

Simply change the snippet above with:

```python
from transformers import LlavaOnevisionForConditionalGeneration, BitsAndBytesConfig
# ... (snippet truncated in the diff view)
```
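Again the snippet is cut off. A hedged sketch, with a note on what the multi-backend change means for device placement; the checkpoint id is an assumption:

```python
import torch
from transformers import LlavaOnevisionForConditionalGeneration, BitsAndBytesConfig

# Illustrative 4-bit load; the checkpoint id is an assumption.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    "llava-hf/llava-onevision-qwen2-7b-ov-hf",
    quantization_config=quantization_config,
    # With a multi-backend bitsandbytes build, device_map="auto" may resolve to a
    # non-CUDA device (e.g. ROCm or CPU) instead of requiring a CUDA GPU.
    device_map="auto",
)
```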
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/mixtral.md
@@ -141,7 +141,7 @@ The Flash Attention-2 model uses also a more memory efficient cache slicing mech

As the Mixtral model has 45 billion parameters, that would require about 90GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), a single A100 with 40GB of RAM is enough to fit the entire model, as in that case only about 27 GB of RAM is required.

Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the BitsAndyBytes quantization (but refer to [this page](../quantization.md) for other quantization methods):
Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization.md) for alternative quantization methods):

```python
>>> import torch
# ... (snippet truncated in the diff view)
```
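The doctest-style snippet is truncated after the import. As a hedged sketch of the 4-bit load the paragraph describes (generation code omitted, not code from this PR):

```python
>>> import torch
>>> from transformers import AutoModelForCausalLM, BitsAndBytesConfig

>>> # Illustrative 4-bit load of Mixtral.
>>> quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
>>> model = AutoModelForCausalLM.from_pretrained(
...     "mistralai/Mixtral-8x7B-Instruct-v0.1",
...     quantization_config=quantization_config,
...     device_map="auto",
... )
```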
12 changes: 11 additions & 1 deletion docs/source/en/model_doc/video_llava.md
@@ -139,7 +139,17 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza

The model can be loaded in lower bits, significantly reducing memory burden while maintaining the performance of the original model. his allows for efficient deployment on resource-constrained cases.

First make sure to install bitsandbytes by running `pip install bitsandbytes` and to have access to a CUDA compatible GPU device. Load the quantized model by simply adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below:
First make sure to install bitsandbytes by running `pip install bitsandbytes` and to have access to a GPU/accelerator that is supported by the library.

<Tip>

bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).

We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.

</Tip>

Load the quantized model by simply adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below:


```python
# ... (snippet truncated in the diff view)
```
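The snippet is truncated here too. A hedged 8-bit sketch; the class and checkpoint names are assumptions:

```python
from transformers import VideoLlavaForConditionalGeneration, BitsAndBytesConfig

# Illustrative 8-bit load; class and checkpoint names are assumptions.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = VideoLlavaForConditionalGeneration.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf",
    quantization_config=quantization_config,
    device_map="auto",
)
```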
2 changes: 1 addition & 1 deletion docs/source/en/model_memory_anatomy.md
@@ -233,7 +233,7 @@ Let's look at the details.
**Optimizer States:**

- 8 bytes * number of parameters for normal AdamW (maintains 2 states)
- 2 bytes * number of parameters for 8-bit AdamW optimizers like [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
- 2 bytes * number of parameters for 8-bit AdamW optimizers like [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes)
- 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state)
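To make the byte counts above concrete, a quick back-of-the-envelope calculation for a hypothetical 7B-parameter model:

```python
# Rough optimizer-state memory estimate for a hypothetical 7B-parameter model.
num_params = 7_000_000_000

bytes_per_param = {
    "AdamW (two fp32 states)": 8,
    "8-bit AdamW, e.g. bitsandbytes (two int8 states)": 2,
    "SGD with momentum (one fp32 state)": 4,
}

for name, nbytes in bytes_per_param.items():
    print(f"{name}: ~{num_params * nbytes / 1e9:.0f} GB")
```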

**Gradients**
2 changes: 1 addition & 1 deletion docs/source/en/perf_train_gpu_one.md
@@ -284,7 +284,7 @@ training_args = TrainingArguments(per_device_train_batch_size=4, optim="adamw_bn

However, we can also use a third-party implementation of the 8-bit optimizer for demonstration purposes to see how that can be integrated.

First, follow the installation guide in the GitHub [repo](https://github.com/TimDettmers/bitsandbytes) to install the `bitsandbytes` library
First, follow the installation guide in the GitHub [repo](https://github.com/bitsandbytes-foundation/bitsandbytes) to install the `bitsandbytes` library
that implements the 8-bit Adam optimizer.

Next you need to initialize the optimizer. This involves two steps:
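The diff stops before the two steps are listed. As a rough sketch of the usual pattern, 8-bit Adam from bitsandbytes handed to `Trainer`, with `model` and `train_dataset` assumed to be defined elsewhere and all values illustrative:

```python
import bitsandbytes as bnb
from transformers import Trainer, TrainingArguments

# Assumes `model` and `train_dataset` are defined elsewhere; values are illustrative.
training_args = TrainingArguments(per_device_train_batch_size=4, output_dir="outputs")

# Build the 8-bit Adam optimizer over the model parameters.
adam_bnb_optim = bnb.optim.Adam8bit(
    model.parameters(),
    lr=training_args.learning_rate,
    betas=(training_args.adam_beta1, training_args.adam_beta2),
    eps=training_args.adam_epsilon,
)

# Hand the optimizer (and no custom scheduler) to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=(adam_bnb_optim, None),
)
```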
8 changes: 8 additions & 0 deletions docs/source/en/quantization/bitsandbytes.md
@@ -38,6 +38,14 @@ pip install --upgrade accelerate transformers
</hfoption>
</hfoptions>

<Tip>

bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).

We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.

</Tip>

Now you can quantize a model by passing a `BitsAndBytesConfig` to [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it supports loading with Accelerate and contains `torch.nn.Linear` layers.
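For reference, a minimal sketch of that call pattern; the checkpoint id is an assumption, and any Accelerate-loadable model with `torch.nn.Linear` layers works the same way:

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Minimal 8-bit quantized load; the checkpoint id is illustrative.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    quantization_config=quantization_config,
    device_map="auto",
)
```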

<hfoptions id="bnb">
16 changes: 15 additions & 1 deletion docs/source/en/quantization/overview.md
@@ -49,11 +49,25 @@ Use the table below to help you decide which quantization method to use.
|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------|
| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
| [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
| [bitsandbytes](./bitsandbytes) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/TimDettmers/bitsandbytes |
| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto |
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |

<Tip>

\* bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).

We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.

</Tip>

<Tip>

\** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships.

</Tip>
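Since the table now distinguishes CUDA from other backends, here is a small editor's sketch of how one might guess which bitsandbytes backend an environment would use; this is hypothetical and is not the check this PR adds to `import_utils`:

```python
import importlib.util

import torch


def bnb_backend_hint() -> str:
    """Hypothetical backend hint; not the helper added by this PR."""
    if importlib.util.find_spec("bitsandbytes") is None:
        return "bitsandbytes is not installed"
    if torch.cuda.is_available():
        return "cuda (or rocm via the HIP build of torch)"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu (needs a multi-backend bitsandbytes build)"
    return "cpu (needs a multi-backend bitsandbytes build)"


print(bnb_backend_hint())
```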
7 changes: 2 additions & 5 deletions scripts/benchmark/trainer-benchmark.py
@@ -181,23 +181,21 @@ def get_original_command(max_width=80, full_python_path=False):


def get_base_command(args, output_dir):

# unwrap multi-line input
args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd)

# remove --output_dir if any and set our own
args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd)
args.base_cmd = re.sub(r"--output_dir\s+[^\s]+", "", args.base_cmd)
args.base_cmd += f" --output_dir {output_dir}"

# ensure we have --overwrite_output_dir
args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd)
args.base_cmd = re.sub(r"--overwrite_output_dir\s+", "", args.base_cmd)
args.base_cmd += " --overwrite_output_dir"

return [sys.executable] + shlex.split(args.base_cmd)


def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose):

# Enable to debug everything but the run itself, to do it fast and see the progress.
# This is useful for debugging the output formatting quickly - we can remove it later once
# everybody is happy with the output
@@ -296,7 +294,6 @@ def get_versions():


def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir):

df = pd.DataFrame(results)
variation_key = "variation"
diff_key = "diff_%"