diff --git a/.circleci/config.yml b/.circleci/config.yml index ca2afc67c10e3e..9c414901c4f5ac 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -186,7 +186,19 @@ workflows: version: 2 setup_and_quality: when: - not: <> + and: + - equal: [<>, https://github.com/huggingface/transformers] + - not: <> + jobs: + - check_circleci_user + - check_code_quality + - check_repository_consistency + - fetch_tests + + setup_and_quality_2: + when: + not: + equal: [<>, https://github.com/huggingface/transformers] jobs: - check_circleci_user - check_code_quality diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index cb9a3d7b7974aa..75a837d693e7c6 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -13,7 +13,8 @@ env: jobs: benchmark: name: Benchmark - runs-on: [single-gpu, nvidia-gpu, a10, ci] + runs-on: + group: aws-g5-4xlarge-cache container: image: huggingface/transformers-all-latest-gpu options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/.github/workflows/doctest_job.yml b/.github/workflows/doctest_job.yml index 98be985292e3e0..eb62b797b8eb55 100644 --- a/.github/workflows/doctest_job.yml +++ b/.github/workflows/doctest_job.yml @@ -27,7 +27,8 @@ jobs: fail-fast: false matrix: split_keys: ${{ fromJson(inputs.split_keys) }} - runs-on: [single-gpu, nvidia-gpu, t4, ci] + runs-on: + group: aws-g4dn-2xlarge-cache container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index 4b515c741a3a72..472b07684ed12a 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -14,7 +14,8 @@ env: jobs: setup: name: Setup - runs-on: [single-gpu, nvidia-gpu, t4, ci] + runs-on: + group: aws-g4dn-2xlarge-cache container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -85,4 +86,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: doc_test_results - path: doc_test_results \ No newline at end of file + path: doc_test_results diff --git a/.github/workflows/push-important-models.yml b/.github/workflows/push-important-models.yml index 41bcd43fcc6fc2..1887af0f4c5bac 100644 --- a/.github/workflows/push-important-models.yml +++ b/.github/workflows/push-important-models.yml @@ -52,7 +52,8 @@ jobs: test_modified_files: needs: get_modified_models name: Slow & FA2 tests - runs-on: [single-gpu, nvidia-gpu, a10, ci] + runs-on: + group: aws-g5-4xlarge-cache container: image: huggingface/transformers-all-latest-gpu options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/.github/workflows/self-pr-slow-ci.yml b/.github/workflows/self-pr-slow-ci.yml index 2287b5e3f31587..43fcecd8def21e 100644 --- a/.github/workflows/self-pr-slow-ci.yml +++ b/.github/workflows/self-pr-slow-ci.yml @@ -65,8 +65,9 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.find_models_to_run.outputs.models) }} - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, ci] + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-all-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -93,12 +94,27 @@ jobs: - name: 
Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . && python3 -m pip install --upgrade torch torchaudio torchvision - name: NVIDIA-SMI run: | nvidia-smi + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Environment working-directory: /transformers run: | @@ -113,23 +129,23 @@ jobs: run: | export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})" echo $CUDA_VISIBLE_DEVICES - python3 -m pytest -v -rsfE --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt - name: Make sure report directory exists shell: bash run: | - mkdir -p /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports - echo "hello" > /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt - echo "${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" + mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt + echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index b328f65d34a5fe..940495c2875327 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -32,8 +32,9 @@ jobs: name: Setup strategy: matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ 
matrix.machine_type }}' container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -131,8 +132,9 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] + machine_type: [aws-g4dn-2xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -162,6 +164,23 @@ jobs: echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" echo "env.CI_SHA = ${{ env.CI_SHA }}" + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone using environment variables working-directory: /transformers run: | @@ -203,19 +222,19 @@ jobs: - name: Run all non-slow selected tests on GPU working-directory: /transformers run: | - python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: name: Model tests @@ -226,8 +245,9 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] + machine_type: [aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -257,6 +277,23 @@ jobs: echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" echo "env.CI_SHA = ${{ env.CI_SHA }}" + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = 
"aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone using environment variables working-directory: /transformers run: | @@ -300,19 +337,19 @@ jobs: MKL_SERVICE_FORCE_INTEL: 1 working-directory: /transformers run: | - python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_torch_cuda_extensions_single_gpu: name: Torch CUDA extension tests @@ -321,8 +358,9 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] + machine_type: [aws-g4dn-2xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -352,6 +390,23 @@ jobs: echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" echo "env.CI_SHA = ${{ env.CI_SHA }}" + - name: Set `machine_type` for report and artifact names + working-directory: /workspace/transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone using environment variables working-directory: /workspace/transformers run: | @@ -392,19 +447,19 @@ jobs: working-directory: /workspace/transformers # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. 
run: | - python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended + python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt + run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports - path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports run_tests_torch_cuda_extensions_multi_gpu: name: Torch CUDA extension tests @@ -413,8 +468,9 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] + machine_type: [aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -444,6 +500,23 @@ jobs: echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" echo "env.CI_SHA = ${{ env.CI_SHA }}" + - name: Set `machine_type` for report and artifact names + working-directory: /workspace/transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone using environment variables working-directory: /workspace/transformers run: | @@ -484,19 +557,19 @@ jobs: working-directory: /workspace/transformers # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. 
run: | - python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended + python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt + run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports - path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports send_results: name: Send results to webhook diff --git a/.github/workflows/slow_ci_remainder.yml b/.github/workflows/slow_ci_remainder.yml deleted file mode 100644 index b479f152da283e..00000000000000 --- a/.github/workflows/slow_ci_remainder.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Slow CI Reminder - -# For security reason, don't use `pull_request` but `pull_request_target`! -on: - pull_request_target: - paths: - - "src/transformers/models/*/*.py" - - "src/transformers/generation/*.py" - - "src/transformers/pipelines/*.py" - - "tests/models/*/*.py" - - "tests/generation/*/*.py" - - "tests/pipelines/*/*.py" - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - remind: - name: remind - runs-on: ubuntu-22.04 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - steps: - - name: Find comment - uses: peter-evans/find-comment@v2 - id: find_comment - with: - issue-number: ${{ github.event.number }} - body-includes: Before merging this pull request, slow tests CI should be triggered. To enable this - - - run: | - echo ${{ steps.find_comment.outputs.comment-id }} - - - name: Add comment if not present - if: steps.find_comment.outputs.comment-id == '' - uses: thollander/actions-comment-pull-request@v2 - with: - message: | - Hey! 🤗 Thanks for your contribution to the `transformers` library! - - Before merging this pull request, slow tests CI should be triggered. To enable this: - - Add the `run-slow` label to the PR - - When your PR is ready for merge and all reviewers' comments have been addressed, push an empty commit with the command `[run-slow]` followed by a comma separated list of all the models to be tested, i.e. `[run_slow] model_to_test_1, model_to_test_2` - - If the pull request affects a lot of models, put at most 10 models in the commit message - - A `transformers` maintainer will then approve the workflow to start the tests - - (For maintainers) The documentation for slow tests CI on PRs is [here](https://www.notion.so/huggingface2/CI-for-pull-requests-8335bd9217d24d1e8ed1d3a016f39804). 
- - pr_number: ${{ github.event.number }} - GITHUB_TOKEN: ${{ secrets.comment_bot_token }} diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml index db649876f60492..02b022698b0c5e 100644 --- a/.github/workflows/ssh-runner.yml +++ b/.github/workflows/ssh-runner.yml @@ -26,9 +26,38 @@ env: RUN_PT_TF_CROSS_TESTS: 1 jobs: + get_runner: + name: "Get runner to use" + runs-on: ubuntu-22.04 + outputs: + RUNNER: ${{ steps.set_runner.outputs.RUNNER }} + steps: + - name: Get runner to use + shell: bash + run: | + if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then + echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV + elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then + echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV + elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then + echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV + elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then + echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV + else + echo "RUNNER=" >> $GITHUB_ENV + fi + + - name: Set runner to use + id: set_runner + run: | + echo ${{ env.RUNNER }} + echo "RUNNER=${{ env.RUNNER }}" >> $GITHUB_OUTPUT + ssh_runner: name: "SSH" - runs-on: ["${{ github.event.inputs.num_gpus }}-gpu", nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci] + needs: get_runner + runs-on: + group: ${{ needs.get_runner.outputs.RUNNER }} container: image: ${{ github.event.inputs.docker_image }} options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 6d94dbee5aa0e9..0617ac8cdd779c 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -56,7 +56,7 @@ RUN python3 -m pip install --no-cache-dir gguf RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl # Add quanto for quantization testing -RUN python3 -m pip install --no-cache-dir quanto +RUN python3 -m pip install --no-cache-dir optimum-quanto # Add eetq for quantization testing RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index 39e0ae14e19c29..6f7899b53b854e 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -217,32 +217,32 @@ # title: التحقق من طلب السحب # title: المساهمة - sections: - # - local: philosophy - # title: الفلسفة + - local: philosophy + title: الفلسفة - local: glossary title: (قاموس المصطلحات (قائمة الكلمات - # - local: task_summary - # title: ما الذي يمكن أن تفعله 🤗 المحولات - # - local: tasks_explained - # title: كيف تحل المحولات المهام - # - local: model_summary - # title: عائلة نماذج المحول - # - local: tokenizer_summary - # title: ملخص برنامج مقسم النصوص (tokenizers) - # - local: attention - # title: الانتباه Attention - # - local: pad_truncation - # title: الحشو والتقليم - # - local: bertology - # title: BERTology - # - local: perplexity - # title: حيرة النماذج ذات الطول الثابت - # - local: pipeline_webserver - # title: خطوط الأنابيب للاستدلال على خادم الويب - # - local: model_memory_anatomy - # title: 
تشريح تدريب النموذج - # - local: llm_tutorial_optimization - # title: الاستفادة القصوى من LLMs + - local: task_summary + title: ما الذي يمكن أن تفعله 🤗 المحولات + - local: tasks_explained + title: كيف تحل المحولات المهام + - local: model_summary + title: عائلة نماذج المحول + - local: tokenizer_summary + title: ملخص برنامج مقسم النصوص (tokenizers) + - local: attention + title: الانتباه Attention + - local: pad_truncation + title: الحشو والتقليم + - local: bertology + title: BERTology + - local: perplexity + title: حيرة النماذج ذات الطول الثابت + - local: pipeline_webserver + title: خطوط الأنابيب للاستدلال على خادم الويب + - local: model_memory_anatomy + title: تشريح تدريب النموذج + - local: llm_tutorial_optimization + title: الاستفادة القصوى من LLMs title: أطر مفاهيمية # - sections: # - sections: diff --git a/docs/source/ar/attention.md b/docs/source/ar/attention.md new file mode 100644 index 00000000000000..e1f55ed96c2dd4 --- /dev/null +++ b/docs/source/ar/attention.md @@ -0,0 +1,25 @@ +# آليات الانتباه + +تستخدم معظم نماذج المحول (Transformer) الانتباه الكامل بحيث تكون مصفوفة الانتباه ذات الأبعاد المتساوية. ويمكن أن يمثل ذلك عقبة حسابية كبيرة عندما تكون لديك نصوص طويلة. ويعد Longformer وReformer من النماذج التي تحاول أن تكون أكثر كفاءة وتستخدم نسخة مخففة من مصفوفة الانتباه لتسريع التدريب. + +## انتباه LSH + +يستخدم [Reformer](model_doc/reformer) انتباه LSH. في الدالة softmax(QK^t)، فإن أكبر العناصر فقط (في بعد softmax) من المصفوفة QK^t هي التي ستعطي مساهمات مفيدة. لذلك، بالنسبة لكل استعلام q في Q، يمكننا أن نأخذ في الاعتبار فقط المفاتيح k في K المشابهة لـ q فقط. وتُستخدم دالة هاش لتحديد ما إذا كان q وk متشابهين. ويتم تعديل قناع الانتباه لتجاهل الرمز الحالي (باستثناء الموضع الأول)، لأنه سيعطي استعلامًا ومفتاحًا متساويين (لذلك متشابهين للغاية). نظرًا لطبيعة دالة الهاش العشوائية نوعًا ما، يتم في الممارسة العملية استخدام عدة دوال هاش (يحددها معامل n_rounds) ثم يتم حساب المتوسط معًا. + +## الانتباه المحلي + +يستخدم [Longformer](model_doc/longformer) الانتباه المحلي: غالبًا ما يكون السياق المحلي (على سبيل المثال، ما هما الرمزان إلى اليسار واليمين؟) كافيًا لاتخاذ إجراء بالنسبة للرمز المعطى. أيضًا، عن طريق تكديس طبقات الانتباه التي لها نافذة صغيرة، سيكون للطبقة الأخيرة مجال استقبال أكبر من مجرد الرموز في النافذة، مما يسمح لها ببناء تمثيل للجملة بأكملها. + +كما يتم منح بعض رموز الإدخال المختارة مسبقًا انتباهًا عالميًا: بالنسبة لهذه الرموز القليلة، يمكن لمصفوفة الانتباه الوصول إلى جميع الرموز وتكون هذه العملية متماثلة: فلجميع الرموز الأخرى إمكانية الوصول إلى تلك الرموز المحددة (بالإضافة إلى تلك الموجودة في نافذتهم المحلية). وهذا موضح في الشكل 2d من الورقة، انظر أدناه لمثال على قناع الانتباه: + +
+<!-- صورة: مثال على قناع الانتباه المحلي مع الانتباه العالمي (الشكل 2d من ورقة Longformer) -->
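وللتوضيح فقط، فيما يلي مخطط مبسّط يبيّن كيف يمكن بناء قناع انتباه يجمع بين نافذة محلية وانتباه عالمي لبعض الرموز المختارة مسبقًا. الدالة `local_global_attention_mask` وحجم النافذة ومواضع الرموز العالمية أدناه اختيارات افتراضية لغرض الشرح فقط، وليست التنفيذ الفعلي المستخدم في Longformer:

```python
import torch

def local_global_attention_mask(seq_len, window, global_positions):
    # True at (i, j) means "token i may attend to token j"
    mask = torch.zeros(seq_len, seq_len, dtype=torch.bool)

    # local attention: each token attends to a small window around itself
    for i in range(seq_len):
        start, end = max(0, i - window), min(seq_len, i + window + 1)
        mask[i, start:end] = True

    # global attention: the selected tokens attend to everything,
    # and every other token attends to them (the symmetric part)
    for g in global_positions:
        mask[g, :] = True
        mask[:, g] = True
    return mask

# example: 16 tokens, a local window of 2 tokens on each side,
# and global attention for tokens 0 and 7
print(local_global_attention_mask(16, window=2, global_positions=[0, 7]).int())
```

لاحظ أن هذا المخطط يبني المصفوفة الكاملة بحجم \\( N \times N \\) لغرض العرض فقط؛ أما التنفيذ الفعّال فيتجنّب بناءها أصلًا، وهذا تحديدًا مصدر توفير الذاكرة.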
+ +وباستخدام مصفوفات الانتباه هذه التي تحتوي على عدد أقل من المعلمات، يسمح النموذج بمدخالات ذات طول تسلسل أكبر. + +## حيل أخرى + +### الترميزات الموضعية المحورية + +يستخدم [Reformer](model_doc/reformer) ترميزات موضعية محورية: في نماذج المحول التقليدية، يكون الترميز الموضعي E مصفوفة بحجم \\(l\\) في \\(d\\)، حيث \\(l\\) هو طول التسلسل و\\(d\\) هو بعد الحالة المخفية. إذا كان لديك نصوص طويلة جدًا، فقد تكون هذه المصفوفة ضخمة وتستهلك مساحة كبيرة جدًا على وحدة معالجة الرسوميات (GPU). وللتخفيف من ذلك، تتكون الترميزات الموضعية المحورية من تحليل تلك المصفوفة الكبيرة E إلى مصفوفتين أصغر E1 وE2، بأبعاد \\(l_{1} \times d_{1}\\) و \\(l_{2} \times d_{2}\\)، بحيث \\(l_{1} \times l_{2} = l\\) و\\(d_{1} + d_{2} = d\\) (مع حاصل ضرب الأطوال، ينتهي الأمر بكونه أصغر بكثير). ويتم الحصول على الترميز للخطوة الزمنية \\(j\\) في E عن طريق ربط الترميزات للخطوة الزمنية \\(j \% l1\\) في E1 و \\(j // l1\\) في E2. \ No newline at end of file diff --git a/docs/source/ar/bertology.md b/docs/source/ar/bertology.md new file mode 100644 index 00000000000000..d3f95e20d7df1a --- /dev/null +++ b/docs/source/ar/bertology.md @@ -0,0 +1,18 @@ +# BERTology + +يُشهد في الآونة الأخيرة نمو مجال دراسي يُعنى باستكشاف آلية عمل نماذج المحولات الضخمة مثل BERT (والذي يُطلق عليها البعض اسم "BERTology"). ومن الأمثلة البارزة على هذا المجال ما يلي: + +- BERT Rediscovers the Classical NLP Pipeline بواسطة Ian Tenney و Dipanjan Das و Ellie Pavlick: + https://arxiv.org/abs/1905.05950 +- Are Sixteen Heads Really Better than One? بواسطة Paul Michel و Omer Levy و Graham Neubig: https://arxiv.org/abs/1905.10650 +- What Does BERT Look At? An Analysis of BERT's Attention بواسطة Kevin Clark و Urvashi Khandelwal و Omer Levy و Christopher D. + Manning: https://arxiv.org/abs/1906.04341 +- CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633 + +لإثراء هذا المجال الناشئ، قمنا بتضمين بعض الميزات الإضافية في نماذج BERT/GPT/GPT-2 للسماح للناس بالوصول إلى التمثيلات الداخلية، والتي تم تكييفها بشكل أساسي من العمل الرائد لـ Paul Michel (https://arxiv.org/abs/1905.10650): + +- الوصول إلى جميع الحالات المخفية في BERT/GPT/GPT-2، +- الوصول إلى جميع أوزان الانتباه لكل رأس في BERT/GPT/GPT-2، +- استرجاع قيم ومشتقات مخرجات الرأس لحساب درجة أهمية الرأس وحذفه كما هو موضح في https://arxiv.org/abs/1905.10650. + +ولمساعدتك على فهم واستخدام هذه الميزات بسهولة، أضفنا مثالًا برمجيًا محددًا: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) أثناء استخراج المعلومات وتقليص من نموذج تم تدريبه مسبقًا على GLUE. \ No newline at end of file diff --git a/docs/source/ar/llm_tutorial_optimization.md b/docs/source/ar/llm_tutorial_optimization.md new file mode 100644 index 00000000000000..59f17f7f8a92c2 --- /dev/null +++ b/docs/source/ar/llm_tutorial_optimization.md @@ -0,0 +1,795 @@ +# تحسين نماذج اللغة الكبيرة من حيث السرعة والذاكرة + + +[[open-in-colab]] + +تحقق نماذج اللغة الكبيرة (LLMs) مثل GPT3/4، [Falcon](https://huggingface.co/tiiuae/falcon-40b)، و [Llama](https://huggingface.co/meta-llama/Llama-2-70b-hf) تقدمًا سريعًا في قدرتها على معالجة المهام التي تركز على الإنسان، مما يجعلها أدوات أساسية في الصناعات القائمة على المعرفة الحديثة. 
+لا يزال نشر هذه النماذج في المهام الواقعية يمثل تحديًا، ومع ذلك: + +- لكي تظهر نماذج اللغة الكبيرة قدرات فهم وتوليد النصوص قريبة من قدرات الإنسان، فإنها تتطلب حاليًا إلى تكوينها من مليارات المعلمات (انظر [كابلان وآخرون](https://arxiv.org/abs/2001.08361)، [وي وآخرون](https://arxiv.org/abs/2206.07682)). وهذا بدوره يزيد من متطلبات الذاكرة للاستدلال. +- في العديد من المهام الواقعية، تحتاج نماذج اللغة الكبيرة إلى معلومات سياقية شاملة. يتطلب ذلك قدرة النموذج على إدارة تسلسلات إدخال طويلة للغاية أثناء الاستدلال. + +يكمن جوهر صعوبة هذه التحديات في تعزيز القدرات الحسابية والذاكرة لنماذج اللغة الكبيرة، خاصة عند التعامل مع تسلسلات الإدخال الضخمة. + +في هذا الدليل، سنستعرض التقنيات الفعالة لتُحسِّن من كفاءة نشر نماذج اللغة الكبيرة: + +1. سنتناول تقنية "دقة أقل" التي أثبتت الأبحاث فعاليتها في تحقيق مزايا حسابية دون التأثير بشكل ملحوظ على أداء النموذج عن طريق العمل بدقة رقمية أقل [8 بت و4 بت](/main_classes/quantization.md). + +2. **اFlash Attention:** إن Flash Attention وهي نسخة مُعدَّلة من خوارزمية الانتباه التي لا توفر فقط نهجًا أكثر كفاءة في استخدام الذاكرة، ولكنها تحقق أيضًا كفاءة متزايدة بسبب الاستخدام الأمثل لذاكرة GPU. + +3. **الابتكارات المعمارية:** حيث تم اقتراح هياكل متخصصة تسمح باستدلال أكثر فعالية نظرًا لأن نماذج اللغة الكبيرة يتم نشرها دائمًا بنفس الطريقة أثناء عملية الاستدلال، أي توليد النص التنبؤي التلقائي مع سياق الإدخال الطويل، فقد تم اقتراح بنيات نموذج متخصصة تسمح بالاستدلال الأكثر كفاءة. أهم تقدم في بنيات النماذج هنا هو [عذر](https://arxiv.org/abs/2108.12409)، [الترميز الدوار](https://arxiv.org/abs/2104.09864)، [الاهتمام متعدد الاستعلامات (MQA)](https://arxiv.org/abs/1911.02150) و [مجموعة الانتباه بالاستعلام (GQA)]((https://arxiv.org/abs/2305.13245)). + +على مدار هذا الدليل، سنقدم تحليلًا للتوليد التنبؤي التلقائي من منظور المُوتِّرات. نتعمق في مزايا وعيوب استخدام دقة أقل، ونقدم استكشافًا شاملاً لخوارزميات الانتباه الأحدث، ونناقش بنيات نماذج نماذج اللغة الكبيرة المحسنة. سندعم الشرح بأمثلة عملية تُبرِز كل تحسين على حدة. + +## 1. دقة أقل + +يمكن فهم متطلبات ذاكرة نماذج اللغة الكبيرة بشكل أفضل من خلال النظر إلى نموذج اللغة الكبيرة على أنها مجموعة من المصفوفات والمتجهات الوزنية، ومدخلات النص على أنها تسلسل من المتجهات. فيما يلي، سيتم استخدام تعريف "الأوزان" للإشارة إلى جميع مصفوفات الأوزان والمتجهات في النموذج. +في وقت كتابة هذا الدليل، تتكون نماذج اللغة الكبيرة من مليارات المعلمات على الأقل.كل معلمة يتم تمثيلها برقم عشري مثل 4.5689 `` والذي يتم تخزينه عادةً بتنسيق [float32](https://en.wikipedia.org/wiki/Single-precision_floating-point_format)، [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format)، أو [float16](https://en.wikipedia.org/wiki/Half-precision_floating-point_format) . يسمح لنا هذا بحساب متطلبات الذاكرة لتحميل نموذج اللغة الكبيرة في الذاكرة بسهولة: + +> *يتطلب تحميل أوزان نموذج به X مليار معلمة حوالي 4 * X جيجابايت من ذاكرة الفيديو العشوائية (VRAM) بدقة float32* + +ومع ذلك، نادرًا ما يتم تدريب النماذج في الوقت الحالي بدقة float32 الكاملة، ولكن عادةً ما تكون بدقة bfloat16 أو بشكل أقل في تنسيق float16. لذلك، تصبح القاعدة الإرشادية كما يلي: + +> *يتطلب تحميل أوزان نموذج به X مليار معلمة حوالي 2 * X جيجابايت من ذاكرة الفيديو العشوائية (VRAM) بدقة bfloat16/float16* + +بالنسبة لمدخلات النصوص القصيرة (أقل من 1024 رمزًا)، فإن متطلبات الذاكرة للاستدلال تهيمن عليها إلى حد كبير متطلبات الذاكرة لتحميل الأوزان. لذلك، دعنا نفترض، في الوقت الحالي، أن متطلبات الذاكرة للاستدلال تساوي متطلبات الذاكرة لتحميل النموذج في ذاكرة VRAM لوحدة معالجة الرسومات GPU.. 
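وكرسم توضيحي بسيط لهذه القاعدة الإرشادية، يمكن التعبير عنها بدالة صغيرة كما يلي (الدالة `approx_vram_gb` ومعاملاها افتراضيان لغرض الشرح فقط وليسا جزءًا من مكتبة Transformers، وهي تقدّر ذاكرة الأوزان فقط دون التنشيطات أو ذاكرة التخزين المؤقت للمفتاح والقيمة):

```python
def approx_vram_gb(n_params_billion, bytes_per_param=2):
    # bytes per parameter: 4 for float32, 2 for bfloat16/float16,
    # 1 for 8-bit and 0.5 for 4-bit quantized weights
    return n_params_billion * bytes_per_param


# e.g. a 40B model in bfloat16 -> ~80 GB, a 175B model in bfloat16 -> ~350 GB
print(approx_vram_gb(40), approx_vram_gb(175))
```

وتطابق هذه التقديرات التقريبية الأمثلة المذكورة أدناه.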
+ +ولإعطاء بعض الأمثلة على مقدار ذاكرة الفيديو العشوائية (VRAM) التي يتطلبها تحميل نموذج بتنسيق bfloat16 تقريبًا: + +- **GPT3** يتطلب 2 \* 175 جيجا بايت = **350 جيجا بايت** VRAM +- [**بلوم**](https://huggingface.co/bigscience/bloom) يتطلب 2 \* 176 جيجا بايت = **352 جيجا بايت** VRAM +- [**Llama-2-70b**](https://huggingface.co/meta-llama/Llama-2-70b-hf) يتطلب 2 \* 70 جيجا بايت = **140 جيجا بايت** VRAM +- [**Falcon-40b**](https://huggingface.co/tiiuae/falcon-40b) يتطلب 2 \* 40 جيجا بايت = **80 جيجا بايت** VRAM +- [**MPT-30b**](https://huggingface.co/mosaicml/mpt-30b) يتطلب 2 \* 30 جيجا بايت = **60 جيجا بايت** VRAM +- [**bigcode/starcoder**](https://huggingface.co/bigcode/starcoder) يتطلب 2 \* 15.5 = **31 جيجا بايت** VRAM + +عند كتابة هذا الدليل، أكبر شريحة لوحدة معالجة الرسومات المتوفّرة هي A100 و H100 التي توفر 80 جيجابايت من ذاكرة الفيديو العشوائية (VRAM). تتطلب معظم النماذج المدرجة أعلاه أكثر من 80 جيجابايت فقط لتحميلها، وبالتالي فهي تتطلب بالضرورة [التوازي للموتّرات](https://huggingface.co/docs/transformers/perf_train_gpu_many#tensor-parallelism) و/أو [لتوازي الخطي](https://huggingface.co/docs/transformers/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism). + +🤗 لا يدعم Transformers موازاة التنسور خارج الصندوق لأنه يتطلب كتابة هيكلة النموذج بطريقة محددة. إذا كنت مهتمًا بكتابة نماذج بطريقة صديقة لموازاة التنسور، فلا تتردد في إلقاء نظرة على [مكتبة الاستدلال بتوليد النص](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models/custom_modeling). + +بدعم موازاة قنوات المعالجة البسيطة خارج الصندوق. للقيام بذلك، قم بتحميل النموذج باستخدام `device="auto"` والذي سيقوم تلقائيًا بوضع الطبقات المختلفة على وحدات معالجة الرسومات (GPU) المتاحة كما هو موضح [هنا](https://huggingface.co/docs/accelerate/v0.22.0/en/concept_guides/big_model_inference). +لاحظ، مع ذلك، أنه في حين أن موازاة قنوات المعالجة البسيطة فعالة للغاية، إلا أنها لا تعالج مشكلات عدم نشاط وحدة معالجة الرسومات (GPU). لهذا، تكون موازاة قنوات المعالجة المتقدمة مطلوبة كما هو موضح [هنا](https://huggingface.co/docs/transformers/en/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism). + +إذا كان لديك حق الوصول إلى عقدة 8 x 80 جيجابايت A100، فيمكنك تحميل BLOOM كما يلي + +```bash +!pip install transformers accelerate bitsandbytes optimum +``` +```python +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", device_map="auto", pad_token_id=0) +``` + +من خلال استخدام `device_map="auto"` سيتم توزيع طبقات الاهتمام بالتساوي عبر جميع وحدات معالجة الرسومات (GPU) المتاحة. + +في هذا الدليل، سنستخدم [bigcode/octocoder](https://huggingface.co/bigcode/octocoder) لأنه يمكن تشغيله على شريحة جهاز GPU A100 ذات 40 جيجا بايت. لاحظ أن جميع تحسينات الذاكرة والسرعة التي سنطبقها من الآن فصاعدًا تنطبق بالتساوي على النماذج التي تتطلب موازاة النماذج أو المصفوفات. + +نظرًا لأن النموذج مُحمَّل بدقة bfloat16، فباستخدام قاعدتنا الإرشادية أعلاه، نتوقع أن تكون متطلبات الذاكرة لتشغيل الاستدلال باستخدام `bigcode/octocoder` حوالي 31 جيجا بايت من ذاكرة الفيديو العشوائية (VRAM). دعنا نجرب. + +نقوم أولاً بتحميل النموذج والمجزىء اللغوي ثم نقوم بتمرير كلاهما إلى كائن [قنوات المعالجة](https://huggingface.co/docs/transformers/main_classes/pipelines) في Transformers. 
+ +```python +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline +import torch + +model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto", pad_token_id=0) +tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder") + +pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) +``` + +```python +prompt = "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer:" + +result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):] +result +``` + +**الإخراج**: +``` +Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single +``` + +رائع، يمكننا الآن استخدام النتيجة مباشرة لتحويل البايت إلى جيجا بايت. + +```python +def bytes_to_giga_bytes(bytes): + return bytes / 1024 / 1024 / 1024 +``` + +دعونا نستدعي [`torch.cuda.max_memory_allocated`](https://pytorch.org/docs/stable/generated/torch.cuda.max_memory_allocated.html) لقياس ذروة تخصيص ذاكرة وحدة معالجة الرسومات (GPU). + +```python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +``` + +**الإخراج**: +```bash +29.0260648727417 +``` + +قريب بما يكفي من حسابنا التقريبي! يمكننا أن نرى أن الرقم غير صحيح تمامًا لأن الانتقال من البايت إلى الكيلوبايت يتطلب الضرب في 1024 بدلاً من 1000. لذلك يمكن أيضًا فهم صيغة التقريب على أنها حساب "بحد أقصى X جيجا بايت". +لاحظ أنه إذا حاولنا تشغيل النموذج بدقة float32 الكاملة، فستكون هناك حاجة إلى 64 جيجا بايت من ذاكرة الفيديو العشوائية (VRAM). + +> يتم تدريب جميع النماذج تقريبًا بتنسيق bfloat16 في الوقت الحالي، ولا يوجد سبب لتشغيل النموذج بدقة float32 الكاملة إذا [كانت وحدة معالجة الرسومات (GPU) الخاصة بك تدعم bfloat16](https://discuss.pytorch.org/t/bfloat16-native-support/117155/5). لن توفر دقة float32 نتائج استدلال أفضل من الدقة التي تم استخدامها لتدريب النموذج. + +إذا لم تكن متأكدًا من تنسيق تخزين أوزان النموذج على Hub، فيمكنك دائمًا الاطلاع على تهيئة نقطة التفتيش في `"torch_dtype"`، على سبيل المثال [هنا](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). يوصى بتعيين النموذج إلى نفس نوع الدقة كما هو مكتوب في التهيئة عند التحميل باستخدام `from_pretrained(..., torch_dtype=...)` إلا إذا كان النوع الأصلي هو float32، وفي هذه الحالة يمكن استخدام `float16` أو `bfloat16` للاستدلال. + + +دعونا نحدد وظيفة `flush(...)` لتحرير جميع الذاكرة المخصصة بحيث يمكننا قياس ذروة ذاكرة وحدة معالجة الرسومات (GPU) المخصصة بدقة. + +```python +del pipe +del model + +import gc +import torch + +def flush(): + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() +``` + +دعونا نستدعيه الآن للتجربة التالية. + +```python +flush() +``` +في الإصدار الأخير من مكتبة Accelerate، يمكنك أيضًا استخدام طريقة مساعدة تسمى `release_memory()` + +```python +from accelerate.utils import release_memory +# ... + +release_memory(model) +``` +```python +from accelerate.utils import release_memory +# ... + +release_memory(model) +``` + +والآن ماذا لو لم يكن لدى وحدة معالجة الرسومات (GPU) لديك 32 جيجا بايت من ذاكرة الفيديو العشوائية (VRAM)؟ لقد وجد أن أوزان النماذج يمكن تحويلها إلى 8 بتات أو 4 بتات دون خسارة كبيرة في الأداء (انظر [Dettmers et al.](https://arxiv.org/abs/2208.07339)). +يمكن تحويل النموذج إلى 3 بتات أو 2 بتات مع فقدان مقبول في الأداء كما هو موضح في ورقة [GPTQ](https://arxiv.org/abs/2210.17323) 🤯. 
+ +دون الدخول في الكثير من التفاصيل، تهدف مخططات التكميم إلى تخفيض دقة الأوزان مع محاولة الحفاظ على دقة نتائج النموذج كما هي (*أي* أقرب ما يمكن إلى bfloat16). +لاحظ أن التكميم يعمل بشكل خاص جيدًا لتوليد النص حيث كل ما نهتم به هو اختيار *مجموعة الرموز الأكثر احتمالًا التالية* ولا نهتم حقًا بالقيم الدقيقة لتوزيع الرمز التالي *logit*. +كل ما يهم هو أن توزيع الرمز التالي *logit* يظل كما هو تقريبًا بحيث تعطي عملية `argmax` أو `topk` نفس النتائج. + +هناك عدة تقنيات للتكميم، والتي لن نناقشها بالتفصيل هنا، ولكن بشكل عام، تعمل جميع تقنيات التكميم كما يلي: + +- 1. تكميم جميع الأوزان إلى الدقة المستهدفة +- 2. تحميل الأوزان المحولة، ومرر تسلسل المدخلات من المتجهات بتنسيق bfloat16 +- 3. قم بتحويل الأوزان ديناميكيًا إلى bfloat1 لإجراء الحسابات مع متجهات المدخلات بدقة `bfloat16` + +باختصار، هذا يعني أن مضروبات *مصفوفة المدخلات-الوزن*، حيث \\( X \\) هي المدخلات، \\( W \\) هي مصفوفة وزن و \\( Y \\) هي الناتج: + +$$ Y = X * W $$ + +تتغير إلى + +$$ Y = X * \text{dequantize}(W) $$ + +لكل عملية ضرب المصفوفات. يتم تنفيذ إلغاء التكميم وإعادة التكميم بشكل متسلسل لجميع مصفوفات الأوزان أثناء مرور المدخلات عبر رسم الشبكة. + +لذلك، غالبًا ما لا يتم تقليل وقت الاستدلال عند استخدام الأوزان المكممة، ولكن بدلاً من ذلك يزيد. + +كفى نظرية، دعنا نجرب! لتكميم الأوزان باستخدام المحولات، تحتاج إلى التأكد من تثبيت مكتبة [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes). + +```bash +!pip install bitsandbytes +``` + +يمكننا بعد ذلك تحميل النماذج في تكميم 8 بت ببساطة عن طريق إضافة علامة `load_in_8bit=True` إلى `from_pretrained`. + +```python +model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_8bit=True, pad_token_id=0) +``` + +الآن، دعنا نعيد تشغيل مثالنا ونقيس استخدام الذاكرة. + +```python +pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + +result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):] +result +``` + +**الإخراج**: +``` +Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single +``` + +جميل، نحصل على نفس النتيجة كما في السابق، لذلك لا يوجد فقدان في الدقة! دعنا نلقي نظرة على مقدار الذاكرة المستخدمة هذه المرة. + +```python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +``` + +**الإخراج**: +``` +15.219234466552734 +``` + +أقل بكثير! لقد انخفضنا إلى ما يزيد قليلاً عن 15 جيجابايت، وبالتالي يمكننا تشغيل هذا النموذج على وحدات معالجة الرسومات للمستهلك مثل 4090. + +نرى مكسبًا لطيفًا جدًا في كفاءة الذاكرة ولا يوجد تقريبًا أي تدهور في ناتج النموذج. ومع ذلك، يمكننا أيضًا ملاحظة تباطؤ طفيف أثناء الاستدلال. + +نحذف النماذج ونفرغ الذاكرة مرة أخرى. +```python +del model +del pipe +``` + +```python +flush() +``` + +دعنا نرى ما هو استهلاك ذاكرة GPU الذروة الذي يوفره تكميم 4 بت. يمكن تكميم النموذج إلى 4 بت باستخدام نفس واجهة برمجة التطبيقات كما في السابق - هذه المرة عن طريق تمرير `load_in_4bit=True` بدلاً من `load_in_8bit=True`. + +```python +model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0) + +pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + +result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):] +result +``` + +**الإخراج**: +``` +Here is a Python function that transforms bytes to Giga bytes:\n\n```\ndef bytes_to_gigabytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single argument +``` + +نحن نرى تقريبًا نفس نص الإخراج كما في السابق - فقط `python` مفقود قبل مقطع الكود. 
دعنا نرى مقدار الذاكرة المطلوبة. + +```python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +``` + +**الإخراج**: +``` +9.543574333190918 +``` + +فقط 9.5 جيجابايت! هذا ليس كثيرًا بالفعل لنموذج يزيد عدد معاملاته عن 15 مليار. + +على الرغم من أننا نرى تدهورًا بسيطًا جدًا في الدقة لنموذجنا هنا، إلا أن تكميم 4 بت يمكن أن يؤدي في الممارسة العملية غالبًا إلى نتائج مختلفة مقارنة بتكميم 8 بت أو الاستدلال الكامل `bfloat16`. الأمر متروك للمستخدم لتجربته. + +لاحظ أيضًا أن الاستدلال هنا كان أبطأ قليلاً مقارنة بتكميم 8 بت والذي يرجع إلى طريقة التكميم الأكثر عدوانية المستخدمة لتكميم 4 بت مما يؤدي إلى \\( \text{quantize} \\) و \\( \text{dequantize} \\) يستغرق وقتًا أطول أثناء الاستدلال. + +```python +del model +del pipe +``` +```python +flush() +``` + +بشكل عام، رأينا أن تشغيل OctoCoder بدقة 8 بت قلل من ذاكرة GPU VRAM المطلوبة من 32G GPU VRAM إلى 15 جيجابايت فقط، وتشغيل النموذج بدقة 4 بت يقلل من ذاكرة GPU VRAM المطلوبة إلى ما يزيد قليلاً عن 9 جيجابايت. + +يسمح تكميم 4 بت بتشغيل النموذج على وحدات معالجة الرسومات مثل RTX3090 و V100 و T4 والتي يمكن الوصول إليها بسهولة لمعظم الأشخاص. + +لمزيد من المعلومات حول التكميم ولمعرفة كيف يمكن تكميم النماذج لطلب ذاكرة GPU VRAM أقل حتى من 4 بت، نوصي بالاطلاع على تنفيذ [`AutoGPTQ`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#autogptq-integration%60). + +> كاستنتاج، من المهم تذكر أن تكميم النموذج يتداول كفاءة الذاكرة المحسنة مقابل الدقة وفي بعض الحالات وقت الاستدلال. + +إذا لم تكن ذاكرة GPU قيدًا لحالتك الاستخدام، فغالبًا لا توجد حاجة للنظر في التكميم. ومع ذلك، لا يمكن للعديد من وحدات معالجة الرسومات ببساطة تشغيل نماذج اللغة الكبيرة بدون طرق التكميم وفي هذه الحالة، تعد مخططات التكميم 4 بت و 8 بت أدوات مفيدة للغاية. + +لمزيد من المعلومات حول الاستخدام التفصيلي، نوصي بشدة بإلقاء نظرة على [وثائق تكميم المحولات](https://huggingface.co/docs/transformers/main_classes/quantization#general-usage). + +بعد ذلك، دعنا نلقي نظرة على كيفية تحسين الكفاءة الحسابية وكفاءة الذاكرة باستخدام خوارزميات أفضل وبنية نموذج محسنة. + +## 2. الانتباه السريع + +تتشارك نماذج اللغة الكبيرة (LLMs) الأعلى أداءً اليوم تقريبًا نفس البنية الأساسية التي تتكون من طبقات التغذية الأمامية، وطبقات التنشيط، وطبقات التطبيع الطبقي، والأهم من ذلك، طبقات الانتباه الذاتي. + +تعد طبقات الانتباه الذاتي مركزية لنماذج اللغة الكبيرة (LLMs) حيث تمكن النموذج من فهم العلاقات السياقية بين رموز المدخلات. +ومع ذلك، فإن استهلاك ذاكرة GPU الذروة لطبقات الانتباه الذاتي ينمو بشكل *رباعي* في كل من التعقيد الحسابي وتعقيد الذاكرة مع عدد رموز المدخلات (والذي يُطلق عليه أيضًا *طول التسلسل*) الذي نسميه في ما يلي بـ \\( N \\) . +على الرغم من أن هذا غير ملحوظ حقًا للتسلسلات الأقصر (حتى 1000 رمز إدخال)، إلا أنه يصبح مشكلة خطيرة للتسلسلات الأطول (حوالي 16000 رمز إدخال). + +دعنا نلقي نظرة أقرب. الصيغة لحساب الناتج \\( \mathbf{O} \\) لطبقة الانتباه الذاتي لإدخال \\( \mathbf{X} \\) بطول \\( N \\) هي: + +$$ \textbf{O} = \text{Attn}(\mathbf{X}) = \mathbf{V} \times \text{Softmax}(\mathbf{QK}^T) \text{ with } \mathbf{Q} = \mathbf{W}_q \mathbf{X}, \mathbf{V} = \mathbf{W}_v \mathbf{X}, \mathbf{K} = \mathbf{W}_k \mathbf{X} $$ + +يعد \\( \mathbf{X} = (\mathbf{x}_1, ... \mathbf{x}_{N}) \\) بالتالي تسلسل الإدخال إلى طبقة الاهتمام. وستتكون كل من الإسقاطات \\( \mathbf{Q} \\) و \\( \mathbf{K} \\) من \\( N \\) من المتجهات مما يؤدي إلى أن يكون حجم \\( \mathbf{QK}^T \\) هو \\( N^2 \\). + +عادة ما يكون لدى LLMs العديد من رؤوس الاهتمام، وبالتالي يتم إجراء العديد من حسابات الاهتمام الذاتي بالتوازي. 
+وبافتراض أن LLM لديها 40 رأس اهتمام وتعمل بدقة bfloat16، يمكننا حساب متطلبات الذاكرة لتخزين مصفوفات \\( \mathbf{QK^T} \\) لتكون \\( 40 * 2 * N^2 \\) بايت. بالنسبة لـ \\( N=1000 \\)، لا يلزم سوى حوالي 50 ميجابايت من VRAM، ولكن بالنسبة لـ \\( N=16000 \\) سنحتاج إلى 19 جيجابايت من VRAM، وبالنسبة لـ \\( N=100,000 \\) سنحتاج إلى ما يقرب من 1 تيرابايت فقط لتخزين مصفوفات \\( \mathbf{QK}^T \\). + +باختصار، سرعان ما يصبح خوارزمية الانتباه الذاتي الافتراضية مكلفة للغاية من حيث الذاكرة بالنسبة لسياقات الإدخال الكبيرة. + +مع تحسن LLMs في فهم النص وتوليد النص، يتم تطبيقها على مهام متزايدة التعقيد. في حين أن النماذج كانت تتعامل سابقًا مع ترجمة أو تلخيص بضع جمل، فإنها الآن تدير صفحات كاملة، مما يتطلب القدرة على معالجة أطوال إدخال واسعة. + +كيف يمكننا التخلص من متطلبات الذاكرة الباهظة للتطويلات المدخلة الكبيرة؟ نحن بحاجة إلى طريقة جديدة لحساب آلية الاهتمام الذاتي التي تتخلص من مصفوفة \\( QK^T \\). [طريقه داو وآخرون.](Https://arxiv.org/abs/2205.14135) طوروا بالضبط مثل هذا الخوارزمية الجديدة وأطلقوا عليها اسم **Flash Attention**. + +باختصار، يكسر الاهتمام الفلاشي حساب \\( \mathbf{V} \times \operatorname{Softmax}(\mathbf{QK}^T\\)) ويحسب بدلاً من ذلك قطعًا أصغر من الإخراج عن طريق التكرار عبر العديد من خطوات حساب Softmax: + +$$ \textbf{O}_i \leftarrow s^a_{ij} * \textbf{O}_i + s^b_{ij} * \mathbf{V}_{j} \times \operatorname{Softmax}(\mathbf{QK}^T_{i,j}) \text{ for multiple } i, j \text{ iterations } $$ + +مع \\( s^a_{ij} \\) و \\( s^b_{ij} \\) كونها بعض إحصائيات التطبيع softmax التي يجب إعادة حسابها لكل \\( i \\) و \\( j \\). + +يرجى ملاحظة أن Flash Attention بالكامل أكثر تعقيدًا إلى حد ما ويتم تبسيطه بشكل كبير هنا حيث أن التعمق كثيرًا يخرج عن نطاق هذا الدليل. القارئ مدعو لإلقاء نظرة على ورقة Flash Attention المكتوبة جيدًا [1] لمزيد من التفاصيل. + +الفكرة الرئيسية هنا هي: + +> من خلال تتبع إحصائيات التطبيع softmax واستخدام بعض الرياضيات الذكية، يعطي Flash Attention **مخرجات متطابقة رقميًا** مقارنة بطبقة الاهتمام الذاتي الافتراضية بتكلفة ذاكرة لا تزيد خطيًا مع \\( N \\). + +عند النظر إلى الصيغة، قد يقول المرء بديهيًا أن الاهتمام الفلاشي يجب أن يكون أبطأ بكثير مقارنة بصيغة الاهتمام الافتراضية حيث يلزم إجراء المزيد من الحسابات. في الواقع، يتطلب Flash Attention المزيد من عمليات الفاصلة العائمة مقارنة بالاهتمام العادي حيث يجب إعادة حساب إحصائيات التطبيع softmax باستمرار (راجع [الورقة](https://arxiv.org/abs/2205.14135) لمزيد من التفاصيل إذا كنت مهتمًا) + +> ومع ذلك، فإن الاهتمام الفلاشي أسرع بكثير في الاستدلال مقارنة بالاهتمام الافتراضي الذي يأتي من قدرته على تقليل الطلبات على ذاكرة GPU الأبطأ ذات النطاق الترددي العالي (VRAM)، والتركيز بدلاً من ذلك على ذاكرة SRAM الأسرع الموجودة على الشريحة. + +من الناحية الأساسية، يتأكد Flash Attention من إمكانية إجراء جميع عمليات الكتابة والقراءة الوسيطة باستخدام ذاكرة SRAM السريعة الموجودة على الشريحة بدلاً من الاضطرار إلى الوصول إلى ذاكرة VRAM الأبطأ لحساب متجه الإخراج \\( \mathbf{O} \\). + +من الناحية العملية، لا يوجد حاليًا أي سبب **عدم** استخدام الاهتمام الفلاشي إذا كان متاحًا. الخوارزمية تعطي نفس المخرجات رياضيا، وأسرع وأكثر كفاءة في استخدام الذاكرة. + +لنلقِ نظرة على مثال عملي. + + +يحصل نموذج OctoCoder الخاص بنا الآن على موجه إدخال أطول بشكل كبير يتضمن ما يسمى *موجه النظام*. تُستخدم موجهات النظام لتوجيه LLM إلى مساعد أفضل مصمم لمهام المستخدمين. +فيما يلي، نستخدم موجه النظام الذي سيجعل OctoCoder مساعد ترميز أفضل. + +```python +system_prompt = """Below are a series of dialogues between various people and an AI technical assistant. +The assistant tries to be helpful, polite, honest, sophisticated, emotionally aware, and humble but knowledgeable. 
+The assistant is happy to help with code questions and will do their best to understand exactly what is needed. +It also tries to avoid giving false or misleading information, and it caveats when it isn't entirely sure about the right answer. +That said, the assistant is practical really does its best, and doesn't let caution get too much in the way of being useful. + +The Starcoder models are a series of 15.5B parameter models trained on 80+ programming languages from The Stack (v1.2) (excluding opt-out requests). +The model uses Multi Query Attention, was trained using the Fill-in-the-Middle objective, and with 8,192 tokens context window for a trillion tokens of heavily deduplicated data. +----- + +Question: Write a function that takes two lists and returns a list that has alternating elements from each input list. + +Answer: Sure. Here is a function that does that. + +def alternating(list1, list2): + results = [] + for i in range(len(list1)): + results.append(list1[i]) + results.append(list2[i]) + return results + +Question: Can you write some test cases for this function? + +Answer: Sure, here are some tests. + +assert alternating([10, 20, 30], [1, 2, 3]) == [10, 1, 20, 2, 30, 3] +assert alternating([True, False], [4, 5]) == [True, 4, False, 5] +assert alternating([], []) == [] + +Question: Modify the function so that it returns all input elements when the lists have uneven length. The elements from the longer list should be at the end. + +Answer: Here is the modified function. + +def alternating(list1, list2): + results = [] + for i in range(min(len(list1), len(list2))): + results.append(list1[i]) + results.append(list2[i]) + if len(list1) > len(list2): + results.extend(list1[i+1:]) + else: + results.extend(list2[i+1:]) + return results +----- +""" +``` +لأغراض التوضيح، سنكرر موجه النظام عشر مرات بحيث يكون طول الإدخال طويلاً بما يكفي لملاحظة وفورات ذاكرة Flash Attention. +نضيف موجه النص الأصلي "سؤال: يرجى كتابة وظيفة في Python تقوم بتحويل البايتات إلى جيجا بايت. + +```python +long_prompt = 10 * system_prompt + prompt +``` + +نقوم بتنفيذ نموذجنا مرة أخرى بدقة bfloat16. + +```python +model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder") + +pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) +``` + +دعنا الآن نقوم بتشغيل النموذج تمامًا مثلما كان من قبل *بدون اهتمام فلاشي* وقياس متطلبات ذاكرة GPU وقت الذروة ووقت الاستدلال. + +```python +import time + +start_time = time.time() +result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):] + +print(f"Generated in {time.time() - start_time} seconds.") +result +``` + +**الإخراج**: +``` +تم التوليد في 10.96854019165039 ثانية. +بالتأكيد. إليك وظيفة للقيام بذلك. + +def bytes_to_giga(bytes): +return bytes / 1024 / 1024 / 1024 + +الإجابة: بالتأكيد. إليك وظيفة للقيام بذلك. + +ديف +``` + +نحصل على نفس الإخراج كما كان من قبل، ولكن هذه المرة، يقوم النموذج بتكرار الإجابة عدة مرات حتى يتم قطعها عند 60 رمزًا. ليس من المستغرب أننا كررنا موجه النظام عشر مرات لأغراض التوضيح وبالتالي قمنا بتشغيل النموذج لتكرار نفسه. + +**ملاحظة** لا ينبغي تكرار موجه النظام عشر مرات في التطبيقات الواقعية - مرة واحدة كافية! + +دعنا نقيس متطلبات ذاكرة GPU وقت الذروة. + +```python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +``` + +**الإخراج**: +``` +37.668193340301514 +``` + +كما نرى، فإن متطلبات ذاكرة GPU وقت الذروة أعلى بكثير مما كانت عليه في البداية، وهو ما يرجع إلى حد كبير إلى تسلسل الإدخال الأطول. 
أيضًا، يستغرق التوليد أكثر من دقيقة بقليل الآن. + +نستدعي `flush()` لتحرير ذاكرة GPU لتجربتنا التالية. + +```python +flush() +``` + +لمقارنة، دعونا نقوم بتشغيل نفس الدالة، ولكن تمكين الاهتمام فلاش بدلا من ذلك. +للقيام بذلك، نقوم بتحويل النموذج إلى [BetterTransformer](Https://huggingface.co/docs/optimum/bettertransformer/overview) ومن خلال القيام بذلك تمكين PyTorch's [SDPA self-attention](Https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) والتي بدورها قادرة على استخدام الاهتمام فلاش. + +```python +model.to_bettertransformer() +``` + +الآن نقوم بتشغيل نفس مقتطف التعليمات البرمجية بالضبط كما كان من قبل وتحت الغطاء سوف تستخدم المحولات الاهتمام فلاش. + +```py +start_time = time.time() +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):] + +print(f"Generated in {time.time() - start_time} seconds.") +result +``` + +**الإخراج**: +``` +تم التوليد في 3.0211617946624756 ثانية. +بالتأكيد. إليك وظيفة للقيام بذلك. + +def bytes_to_giga(bytes): +return bytes / 1024 / 1024 / 1024 + +الإجابة: بالتأكيد. إليك وظيفة للقيام بذلك. + +ديف +``` + +نحصل على نفس النتيجة بالضبط كما كان من قبل، ولكن يمكننا ملاحظة تسريع كبير بفضل الاهتمام فلاش. + +دعنا نقيس استهلاك الذاكرة لآخر مرة. + +```python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +``` + +**الإخراج**: +``` +32.617331981658936 +``` + +ونحن تقريبا مرة أخرى إلى ذاكرة GPU الذروة الأصلية لدينا 29GB. + +يمكننا أن نلاحظ أننا نستخدم فقط حوالي 100 ميجابايت إضافية من ذاكرة GPU عند تمرير تسلسل إدخال طويل جدًا مع الاهتمام فلاش مقارنة بتمرير تسلسل إدخال قصير كما فعلنا في البداية. + +```py +flush() +``` + +لمزيد من المعلومات حول كيفية استخدام Flash Attention، يرجى الاطلاع على [صفحة doc هذه](Https://huggingface.co/docs/transformers/en/perf_infer_gpu_one#flashattention-2). + +## 3. الابتكارات المعمارية + +حتى الآن، نظرنا في تحسين الكفاءة الحسابية والذاكرة من خلال: + +- صب الأوزان في تنسيق دقة أقل +- استبدال خوارزمية الاهتمام الذاتي بإصدار أكثر كفاءة من حيث الذاكرة والحساب + +دعونا الآن نلقي نظرة على كيفية تغيير بنية LLM بحيث تكون أكثر فعالية وكفاءة للمهام التي تتطلب مدخلات نصية طويلة، على سبيل المثال: +- استرجاع الأسئلة المعززة، +- تلخيص، +- الدردشة + +لاحظ أن "الدردشة" لا تتطلب من LLM التعامل مع مدخلات نصية طويلة فحسب، بل تتطلب أيضًا أن يكون LLM قادرًا على التعامل بكفاءة مع الحوار ذهابًا وإيابًا بين المستخدم والمساعد (مثل ChatGPT). + +بمجرد تدريبها، يصبح من الصعب تغيير بنية LLM الأساسية، لذلك من المهم مراعاة مهام LLM مسبقًا وتحسين بنية النموذج وفقًا لذلك. +هناك مكونان مهمان لبنية النموذج يصبحان بسرعة عنق زجاجة للذاكرة و/أو الأداء لتسلسلات الإدخال الكبيرة. + +- الترميزات الموضعية +- ذاكرة التخزين المؤقت للقيمة الرئيسية + +دعنا نلقي نظرة على كل مكون بمزيد من التفاصيل + +### 3.1 تحسين الترميزات الموضعية لـ LLMs + +يضع الاهتمام الذاتي كل رمز في علاقة مع رموز أخرى. +كمثال، يمكن أن تبدو مصفوفة \\( \operatorname{Softmax}(\mathbf{QK}^T) \\) لتسلسل الإدخال النصي *"مرحبًا"، "أنا"، "أحب"، "أنت"* كما يلي: + +![](/blog/assets/163_optimize_llm/self_attn_tokens.png) + +يتم منح كل رمز كلمة كتلة احتمال يتم من خلالها الاهتمام بجميع رموز الكلمات الأخرى، وبالتالي يتم وضعها في علاقة مع جميع رموز الكلمات الأخرى. على سبيل المثال، تحضر كلمة *"الحب"* كلمة *"مرحبًا"* بنسبة 5%، و *"أنا"* بنسبة 30%، ونفسها بنسبة 65%. + +سيواجه LLM القائم على الاهتمام الذاتي، ولكن بدون الترميزات الموضعية، صعوبات كبيرة في فهم مواضع نصوص الإدخال بالنسبة لبعضها البعض. 
+ويرجع ذلك إلى أن درجة الاحتمال التي يحسبها \\( \mathbf{QK}^T \\) تربط كل رمز كلمة بكل رمز كلمة أخرى في حسابات \\( O (1) \\) بغض النظر عن مسافة الموضع النسبي بينهما. +لذلك، بالنسبة إلى LLM بدون ترميزات موضعية، يبدو أن كل رمز له نفس المسافة إلى جميع الرموز الأخرى، على سبيل المثال، سيكون من الصعب التمييز بين *"مرحبًا أنا أحبك"* و *"أنت تحبني مرحبًا"*. + +لكي يفهم LLM ترتيب الجملة، يلزم وجود *إشارة* إضافية ويتم تطبيقها عادةً في شكل *الترميزات الموضعية* (أو ما يُطلق عليه أيضًا *الترميزات الموضعية*). +لم يتم ترجمة النص الخاص والروابط وأكواد HTML وCSS بناءً على طلبك. + +قدم مؤلفو الورقة البحثية [*Attention Is All You Need*](https://arxiv.org/abs/1706.03762) تضمينات موضعية جيبية مثلثية \\( \mathbf{P} = \mathbf{p}_1, \ldots, \mathbf{p}_N \\) حيث يتم حساب كل متجه \\( \mathbf{p}_i \\) كدالة جيبية لموضعه \\( i \\) . +بعد ذلك يتم ببساطة إضافة التضمينات الموضعية إلى متجهات تسلسل الإدخال \\( \mathbf{\hat{X}} = \mathbf{\hat{x}}_1, \ldots, \mathbf{\hat{x}}_N \\) = \\( \mathbf{x}_1 + \mathbf{p}_1, \ldots, \mathbf{x}_N + \mathbf{p}_N \\) وبالتالي توجيه النموذج لتعلم ترتيب الجملة بشكل أفضل. + +بدلاً من استخدام التضمينات الموضعية الثابتة، استخدم آخرون (مثل [Devlin et al.](https://arxiv.org/abs/1810.04805)) تضمينات موضعية مكتسبة يتم من خلالها تعلم التضمينات الموضعية \\( \mathbf{P} \\) أثناء التدريب. + +كانت التضمينات الموضعية الجيبية والمكتسبة هي الطرق السائدة لترميز ترتيب الجملة في نماذج اللغة الكبيرة، ولكن تم العثور على بعض المشكلات المتعلقة بهذه التضمينات الموضعية: + +1. التضمينات الموضعية الجيبية والمكتسبة هي تضمينات موضعية مطلقة، أي ترميز تضمين فريد لكل معرف موضعي: \\( 0, \ldots, N \\) . كما أظهر [Huang et al.](https://arxiv.org/abs/2009.13658) و [Su et al.](https://arxiv.org/abs/2104.09864)، تؤدي التضمينات الموضعية المطلقة إلى أداء ضعيف لنماذج اللغة الكبيرة للمدخلات النصية الطويلة. بالنسبة للمدخلات النصية الطويلة، يكون من المفيد إذا تعلم النموذج المسافة الموضعية النسبية التي تمتلكها رموز المدخلات إلى بعضها البعض بدلاً من موضعها المطلق. +2. عند استخدام التضمينات الموضعية المكتسبة، يجب تدريب نموذج اللغة الكبيرة على طول إدخال ثابت \\( N \\)، مما يجعل من الصعب الاستقراء إلى طول إدخال أطول مما تم تدريبه عليه. + +في الآونة الأخيرة، أصبحت التضمينات الموضعية النسبية التي يمكنها معالجة المشكلات المذكورة أعلاه أكثر شعبية، وأبرزها: + +- [تضمين الموضع الدوراني (RoPE)](https://arxiv.org/abs/2104.09864) +- [ALiBi](https://arxiv.org/abs/2108.12409) + +يؤكد كل من *RoPE* و *ALiBi* أنه من الأفضل توجيه نموذج اللغة الكبيرة حول ترتيب الجملة مباشرة في خوارزمية الانتباه الذاتي حيث يتم وضع رموز الكلمات في علاقة مع بعضها البعض. على وجه التحديد، يجب توجيه ترتيب الجملة عن طريق تعديل عملية \\( \mathbf{QK}^T \\) . + +دون الدخول في الكثير من التفاصيل، يشير *RoPE* إلى أنه يمكن ترميز المعلومات الموضعية في أزواج الاستعلام-المفتاح، على سبيل المثال \\( \mathbf{q}_i \\) و \\( \mathbf{x}_j \\) عن طريق تدوير كل متجه بزاوية \\( \theta * i \\) و \\( \theta * j \\) على التوالي مع \\( i, j \\) تصف موضع الجملة لكل متجه: + +$$ \mathbf{\hat{q}}_i^T \mathbf{\hat{x}}_j = \mathbf{{q}}_i^T \mathbf{R}_{\theta, i -j} \mathbf{{x}}_j. $$ + +يمثل \\( \mathbf{R}_{\theta, i - j} \\) مصفوفة دورانية. \\( \theta \\) *لا* يتم تعلمه أثناء التدريب، ولكن بدلاً من ذلك يتم تعيينه إلى قيمة محددة مسبقًا تعتمد على طول تسلسل الإدخال الأقصى أثناء التدريب. + +> من خلال القيام بذلك، يتم التأثير على درجة الاحتمال بين \\( \mathbf{q}_i \\) و \\( \mathbf{q}_j \\) فقط إذا \\( i \ne j \\) ويعتمد فقط على المسافة النسبية \\( i - j \\) بغض النظر عن المواضع المحددة لكل متجه \\( i \\) و \\( j \\) . 
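+
+لتوضيح هذه الفكرة عدديًا، إليك مقتطف مبسط للغاية (وليس تنفيذ RoPE الفعلي في أي مكتبة) يفترض متجهات استعلام ومفتاح ثنائية الأبعاد وزاوية أساس `theta` اختيارية، ويُظهر أن درجة الضرب الداخلي بعد التدوير تعتمد على المسافة النسبية \\( i - j \\) فقط وليس على الموضعين المطلقين:
+
+```python
+import numpy as np
+
+def rotate(v, angle):
+    # تدوير متجه ثنائي الأبعاد بزاوية معينة (تبسيط لفكرة RoPE)
+    c, s = np.cos(angle), np.sin(angle)
+    return np.array([[c, -s], [s, c]]) @ v
+
+theta = 0.1               # قيمة افتراضية تُحدد مسبقًا ولا تُتعلم
+q = np.array([1.0, 0.5])  # متجه استعلام توضيحي
+k = np.array([0.3, 0.8])  # متجه مفتاح توضيحي
+
+# نفس المسافة النسبية (i - j = 2) عند مواضع مطلقة مختلفة تمامًا
+score_near = rotate(q, theta * 5) @ rotate(k, theta * 3)
+score_far = rotate(q, theta * 105) @ rotate(k, theta * 103)
+
+print(np.isclose(score_near, score_far))  # True: الدرجة تعتمد على i - j فقط
+```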
+ +يستخدم *RoPE* في العديد من نماذج اللغة الكبيرة الأكثر أهمية اليوم، مثل: + +- [**Falcon**](https://huggingface.co/tiiuae/falcon-40b) +- [**Llama**](https://arxiv.org/abs/2302.13971) +- [**PaLM**](https://arxiv.org/abs/2204.02311) + +كبديل، يقترح *ALiBi* مخطط ترميز موضعي نسبي أبسط بكثير. يتم إضافة المسافة النسبية التي تمتلكها رموز المدخلات إلى بعضها البعض كعدد صحيح سلبي مقياس بقيمة محددة مسبقًا `m` إلى كل إدخال استعلام-مفتاح لمصفوفة \\( \mathbf{QK}^T \\) مباشرة قبل حساب softmax. + +![](/blog/assets/163_optimize_llm/alibi.png) + +كما هو موضح في ورقة [ALiBi](https://arxiv.org/abs/2108.12409)، يسمح هذا الترميز الموضعي النسبي البسيط للنموذج بالحفاظ على أداء عالٍ حتى في تسلسلات المدخلات النصية الطويلة جدًا. + +يُستخدم *ALiBi* في العديد من أهم نماذج اللغة الكبيرة المستخدمة اليوم، مثل: + +- [**MPT**](https://huggingface.co/mosaicml/mpt-30b) +- [**BLOOM**](https://huggingface.co/bigscience/bloom) + +يمكن لكل من ترميزات الموضع *RoPE* و *ALiBi* الاستقراء إلى أطوال إدخال لم يتم ملاحظتها أثناء التدريب، في حين ثبت أن الاستقراء يعمل بشكل أفضل بكثير خارج الصندوق لـ *ALiBi* مقارنة بـ *RoPE*. +بالنسبة لـ ALiBi، ما عليك سوى زيادة قيم مصفوفة الموضع المثلث السفلي لمطابقة طول تسلسل الإدخال. +بالنسبة لـ *RoPE*، يؤدي الحفاظ على نفس \\( \theta \\) الذي تم استخدامه أثناء التدريب إلى نتائج سيئة عند تمرير إدخالات نصية أطول بكثير من تلك التي شوهدت أثناء التدريب، راجع [Press et al.](https://arxiv.org/abs/2108.12409). ومع ذلك، وجد المجتمع بعض الحيل الفعالة التي تقوم بتعديل \\( \theta \\)، مما يسمح لترميزات الموضع *RoPE* بالعمل بشكل جيد لتسلسلات إدخال النص المستقرئة (راجع [هنا](https://github.com/huggingface/transformers/pull/24653)). + +> كل من RoPE و ALiBi عبارة عن ترميزات موضع نسبي *لا* يتم تعلمها أثناء التدريب، ولكن بدلاً من ذلك تستند إلى الحدس التالي: + - يجب إعطاء الإشارات الموضعية حول إدخالات النص مباشرة إلى مصفوفة \\( QK^T \\) لطبقة الاهتمام الذاتي + - يجب تحفيز LLM لتعلم ترميزات موضعية ثابتة *نسبية* المسافة لبعضها البعض + - كلما ابتعدت رموز إدخال النص عن بعضها البعض، انخفض احتمال الاستعلام والقيمة. كل من RoPE و ALiBi يقللان من احتمال الاستعلام والمفتاح للرموز البعيدة عن بعضها البعض. يقوم RoPE بذلك عن طريق تقليل منتج المتجه من خلال زيادة الزاوية بين متجهات الاستعلام والمفتاح. تضيف ALiBi أرقامًا كبيرة سالبة إلى المنتج الاتجاهي + +في الختام، من الأفضل تدريب نماذج اللغة الكبيرة المراد نشرها في مهام تتطلب التعامل مع إدخالات نصية كبيرة باستخدام ترميزات موضعية نسبية، مثل RoPE و ALiBi. لاحظ أيضًا أنه حتى إذا تم تدريب نموذج لغة كبيرة باستخدام RoPE و ALiBi على طول ثابت يبلغ، على سبيل المثال، \\( N_1 = 2048 \\)، فيمكن استخدامه عمليًا بإدخالات نصية أكبر بكثير من \\( N_1 \\)، مثل \\( N_2 = 8192> N_1 \\) عن طريق استقراء الترميزات الموضعية. + +### 3.2 ذاكرة التخزين المؤقت للمفتاح والقيمة + +تعمل عملية توليد النص ذاتي التراجع باستخدام نماذج اللغة الكبيرة عن طريق إدخال تسلسل إدخال بشكل تكراري، وأخذ عينات من الرمز التالي، وإلحاق الرمز التالي بتسلسل الإدخال، والاستمرار في ذلك حتى ينتج نموذج اللغة الكبيرة رمزًا يشير إلى انتهاء التوليد. + +يرجى الاطلاع على [دليل إنشاء النص الخاص بـ Transformer](https://huggingface.co/docs/transformers/llm_tutorial#generate-text) للحصول على شرح مرئي أفضل لكيفية عمل التوليد ذاتي التراجع. + +دعنا ننفذ مقتطفًا قصيرًا من التعليمات البرمجية لإظهار كيفية عمل التوليد ذاتي التراجع في الممارسة. ببساطة، سنأخذ الرمز الأكثر احتمالًا عبر `torch.argmax`. 
+ +```python +input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") + +for _ in range(5): + next_logits = model(input_ids)["logits"][:, -1:] + next_token_id = torch.argmax(next_logits,dim=-1) + + input_ids = torch.cat([input_ids, next_token_id], dim=-1) + print("shape of input_ids", input_ids.shape) + +generated_text = tokenizer.batch_decode(input_ids[:, -5:]) +generated_text +``` + +**الإخراج**: +``` +shape of input_ids torch.Size([1, 21]) +shape of input_ids torch.Size([1, 22]) +shape of input_ids torch.Size([1, 23]) +shape of input_ids torch.Size([1, 24]) +shape of input_ids torch.Size([1, 25]) +[' Here is a Python function'] +``` + +كما نرى، في كل مرة نزيد من رموز إدخال النص بالرمز الذي تم أخذ عينات منه للتو. + +باستثناءات قليلة جدًا، يتم تدريب نماذج اللغة الكبيرة باستخدام [هدف نمذجة اللغة السببية](https://huggingface.co/docs/transformers/tasks/language_modeling#causal-language-modeling) وبالتالي يتم قناع المثلث العلوي لمصفوفة نتيجة الاهتمام - وهذا هو السبب في ترك نتائج الاهتمام فارغة (*أي لها احتمال 0*) في المخططين أعلاه. للحصول على ملخص سريع حول نمذجة اللغة السببية، يمكنك الرجوع إلى مدونة [*Illustrated Self Attention*](https://jalammar.github.io/illustrated-gpt2/#part-2-illustrated-self-attention). + +ونتيجة لذلك، *لا* تعتمد الرموز *أبدًا* على الرموز السابقة، وبشكل أكثر تحديدًا، لا يتم أبدًا وضع المتجه \\( \mathbf{q}_i \\) في علاقة مع أي متجهات المفاتيح والقيم \\( \mathbf{k}_j، \mathbf{v}_j \\) إذا \\( j> i \\). بدلاً من ذلك، يحضر \\( \mathbf{q}_i \\) فقط إلى متجهات المفاتيح والقيم السابقة \\( \mathbf{k}_{m < i}، \mathbf{v}_{m < i} \text{ , for } m \in \{0، \ ldots i - 1\} \\). لتقليل الحسابات غير الضرورية، يمكن تخزين ذاكرة التخزين المؤقت لكل طبقة للمفاتيح ومتجهات القيم لجميع الخطوات الزمنية السابقة. + +فيما يلي، سنطلب من نموذج اللغة الكبيرة استخدام ذاكرة التخزين المؤقت للمفاتيح والقيم عن طريق استردادها وإرسالها لكل عملية توجيه. +في Transformers، يمكننا استرداد ذاكرة التخزين المؤقت للمفاتيح والقيم عن طريق تمرير علم `use_cache` إلى مكالمة `forward` ويمكننا بعد ذلك تمريره مع الرمز الحالي. + +```python +past_key_values = None # past_key_values is the key-value cache +generated_tokens = [] +next_token_id = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") + +for _ in range(5): + next_logits, past_key_values = model(next_token_id, past_key_values=past_key_values, use_cache=True).to_tuple() + next_logits = next_logits[:, -1:] + next_token_id = torch.argmax(next_logits, dim=-1) + + print("shape of input_ids", next_token_id.shape) + print("length of key-value cache", len(past_key_values[0][0])) # past_key_values are of shape [num_layers, 0 for k, 1 for v, batch_size, length, hidden_dim] + generated_tokens.append(next_token_id.item()) + +generated_text = tokenizer.batch_decode(generated_tokens) +generated_text +``` + +**الإخراج**: +``` +shape of input_ids torch.Size([1, 1]) +length of key-value cache 20 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 21 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 22 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 23 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 24 +[' Here', ' is', ' a', ' Python', ' function'] +``` + +كما هو موضح، عند استخدام ذاكرة التخزين المؤقت للمفاتيح والقيم، لا يتم زيادة رموز إدخال النص في الطول، ولكنها تظل متجه إدخال واحدًا. من ناحية أخرى، يتم زيادة طول ذاكرة التخزين المؤقت للمفاتيح والقيم بواحد في كل خطوة فك التشفير. 
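+
+ولتقدير حجم التوفير الحسابي تقريبيًا، إليك حساب توضيحي بسيط بأعداد افتراضية (موجه من 20 رمزًا وتوليد 100 رمز، مع إهمال عدد الرؤوس والطبقات لأنها عامل مشترك في الحالتين) يقارن عدد درجات الاستعلام-مفتاح المحسوبة مع ذاكرة التخزين المؤقت وبدونها:
+
+```python
+prompt_len, new_tokens = 20, 100  # أطوال افتراضية للتوضيح فقط
+
+# بدون ذاكرة تخزين مؤقت: تُعاد معالجة التسلسل كاملًا وتُحسب مصفوفة QK^T كلها في كل خطوة
+no_cache = sum((prompt_len + t) ** 2 for t in range(new_tokens))
+
+# مع ذاكرة التخزين المؤقت: يُحسب صف واحد فقط q_c K^T للرمز الجديد في كل خطوة
+with_cache = sum(prompt_len + t for t in range(new_tokens))
+
+print(f"{no_cache:,} مقابل {with_cache:,}")  # نمو تربيعي مقابل نمو خطي تقريبًا
+```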
+ +> يعني استخدام ذاكرة التخزين المؤقت للمفاتيح والقيم أن \\( \mathbf{QK}^T \\) يتم تقليله بشكل أساسي إلى \\( \mathbf{q}_c\mathbf{K}^T \\) مع \\( \mathbf{q}_c \\) كونها إسقاط الاستعلام للرمز المدخل الحالي الذي يكون *دائمًا* مجرد متجه واحد. + +لاستخدام ذاكرة التخزين المؤقت للمفاتيح والقيم ميزتان: +- زيادة كبيرة في الكفاءة الحسابية حيث يتم إجراء حسابات أقل مقارنة بحساب مصفوفة \\( \mathbf{QK}^T \\) الكاملة. يؤدي ذلك إلى زيادة سرعة الاستدلال +- لا تزداد الذاكرة القصوى المطلوبة بشكل تربيعي مع عدد الرموز المولدة، ولكنها تزداد بشكل خطي فقط. + +> يجب *دائمًا* استخدام ذاكرة التخزين المؤقت للمفاتيح والقيم حيث يؤدي ذلك إلى نتائج متطابقة وزيادة كبيرة في السرعة لتسلسلات الإدخال الأطول. ذاكرة التخزين المؤقت للمفاتيح والقيم ممكّنة بشكل افتراضي في Transformers عند استخدام خط أنابيب النص أو طريقة [`generate`](https://huggingface.co/docs/transformers/main_classes/text_generation). + + + + +لاحظ أنه على الرغم من نصيحتنا باستخدام ذاكرة التخزين المؤقت للمفاتيح والقيم، فقد يكون إخراج نموذج اللغة الكبيرة مختلفًا قليلاً عند استخدامها. هذه خاصية نوى ضرب المصفوفة نفسها - يمكنك قراءة المزيد عنها [هنا](https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535). + + + +#### 3.2.1 محادثة متعددة الجولات + +ذاكرة التخزين المؤقت للمفاتيح والقيم مفيدة بشكل خاص للتطبيقات مثل الدردشة حيث تكون هناك حاجة إلى عدة تمريرات من فك التشفير ذاتي التراجع. دعنا نلقي نظرة على مثال. + +``` +المستخدم: كم عدد الأشخاص الذين يعيشون في فرنسا؟ +المساعد: يعيش حوالي 75 مليون شخص في فرنسا +المستخدم: وكم عدد الأشخاص في ألمانيا؟ +المساعد: يوجد في ألمانيا حوالي 81 مليون نسمة + +User: How many people live in France? +Assistant: Roughly 75 million people live in France +User: And how many are in Germany? +Assistant: Germany has ca. 81 million inhabitants +``` + +In this chat، يقوم LLM بتشغيل فك التشفير التلقائي مرتين: + 1. المرة الأولى، تكون ذاكرة التخزين المؤقت key-value فارغة، ويكون موجه الإدخال هو "User: How many people live in France؟" ويقوم النموذج بإنشاء النص "Roughly 75 million people live in France" بشكل تلقائي أثناء زيادة ذاكرة التخزين المؤقت key-value في كل خطوة فك تشفير. + 2. في المرة الثانية، يكون موجه الإدخال هو "User: How many people live in France؟ \n Assistant: Roughly 75 million people live in France \n User: And how many in Germany؟". بفضل ذاكرة التخزين المؤقت، يتم بالفعل حساب جميع متجهات القيمة الرئيسية لجاريتين الأولى. لذلك يتكون موجه الإدخال فقط من "User: And how many in Germany؟". أثناء معالجة موجه الإدخال المختصر، يتم ربط متجهات القيمة المحسوبة بذاكرة التخزين المؤقت key-value الخاصة بفك التشفير الأول. يتم بعد ذلك إنشاء إجابة المساعد الثانية "Germany has ca. 81 million inhabitants" بشكل تلقائي باستخدام ذاكرة التخزين المؤقت key-value المكونة من متجهات القيمة المشفرة لـ "User: How many people live in France؟ \n Assistant: Roughly 75 million people live in France \n User: And how many are in Germany؟". + +يجب ملاحظة أمرين هنا: + 1. الحفاظ على كل السياق أمر بالغ الأهمية للنماذج اللغوية الكبيرة (LLMs) التي يتم نشرها في الدردشة بحيث يفهم LLM كل سياق المحادثة السابق. على سبيل المثال، بالنسبة للمثال أعلاه، يحتاج LLM إلى فهم أن المستخدم يشير إلى السكان عند السؤال "And how many are in Germany؟". + 2. ذاكرة التخزين المؤقت key-value مفيدة للغاية للدردشة حيث تتيح لنا النمو المستمر لتاريخ الدردشة المشفرة بدلاً من الاضطرار إلى إعادة تشفير تاريخ الدردشة من البداية (كما هو الحال، على سبيل المثال، عند استخدام بنية ترميز فك التشفير). + +في `transformers`، ستعيد مكالمة `generate` `past_key_values` عندما يتم تمرير `return_dict_in_generate=True`، بالإضافة إلى `use_cache=True` الافتراضي. 
لاحظ أنه غير متوفر بعد من خلال واجهة `pipeline`. + +```python +# Generation as usual +prompt = system_prompt + "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here" +model_inputs = tokenizer(prompt، return_tensors='pt') +generation_output = model.generate(**model_inputs، max_new_tokens=60، return_dict_in_generate=True) +decoded_output = tokenizer.batch_decode(generation_output.sequences)[0] + +# Piping the returned `past_key_values` to speed up the next conversation round +prompt = decoded_output + "\nQuestion: How can I modify the function above to return Mega bytes instead?\n\nAnswer: Here" +model_inputs = tokenizer(prompt، return_tensors='pt') +generation_output = model.generate( + **model_inputs، + past_key_values=generation_output.past_key_values، + max_new_tokens=60، + return_dict_in_generate=True +) +tokenizer.batch_decode(generation_output.sequences)[0][len(prompt):] +``` + +**الإخراج**: +``` + هي نسخة معدلة من الدالة التي تعيد ميجا بايت بدلاً من ذلك. + +def bytes_to_megabytes(bytes): + return bytes / 1024 / 1024 + +Answer: The function takes a number of bytes as input and returns the number of +``` + +رائع، لا يتم إنفاق وقت إضافي على إعادة حساب نفس المفتاح والقيم لطبقة الاهتمام! ومع ذلك، هناك شيء واحد يجب ملاحظته. في حين أن ذروة الذاكرة المطلوبة لمصفوفة \\( \mathbf{QK}^T \\) يتم تقليلها بشكل كبير، فإن الاحتفاظ بذاكرة التخزين المؤقت key-value في الذاكرة يمكن أن يصبح مكلفًا جدًا من حيث الذاكرة لسلاسل الإدخال الطويلة أو الدردشة متعددة الجولات. تذكر أن ذاكرة التخزين المؤقت key-value بحاجة إلى تخزين متجهات القيمة الرئيسية لجميع متجهات الإدخال السابقة \\( \mathbf{x}_i \text{، لـ } i \in \{1، \ ldots، c - 1\} \\) لجميع طبقات الاهتمام الذاتي وكل رؤوس الاهتمام. + +دعنا نحسب عدد القيم العائمة التي يجب تخزينها في ذاكرة التخزين المؤقت key-value لنموذج LLM `bigcode/octocoder` الذي استخدمناه من قبل. +يبلغ عدد القيم العائمة ضعف طول التسلسل مضروبًا في عدد رؤوس الاهتمام مضروبًا في بعد رأس الاهتمام ومضروبًا في عدد الطبقات. +حساب هذا لنموذج LLM لدينا عند طول تسلسل افتراضي يبلغ 16000 يعطي: + +```python +config = model.config +2 * 16_000 * config.n_layer * config.n_head * config.n_embd // config.n_head +``` + +**الإخراج**: +``` +7864320000 +``` + +Roughly 8 مليار قيمة عائمة! يتطلب تخزين 8 مليارات قيمة عائمة في دقة `float16` حوالي 15 جيجابايت من ذاكرة الوصول العشوائي (RAM) وهو ما يقرب من نصف حجم أوزان النموذج نفسها! +اقترح الباحثون طريقتين تسمحان بتقليل تكلفة الذاكرة لتخزين ذاكرة التخزين المؤقت key-value بشكل كبير، والتي يتم استكشافها في الأقسام الفرعية التالية. + +#### 3.2.2 Multi-Query-Attention (MQA) + +[Multi-Query-Attention](https://arxiv.org/abs/1911.02150) اقترحها Noam Shazeer في ورقته *Fast Transformer Decoding: One Write-Head is All You Need*. كما يقول العنوان، اكتشف Noam أنه بدلاً من استخدام `n_head` من أوزان إسقاط القيمة الرئيسية، يمكن استخدام زوج واحد من أوزان إسقاط رأس القيمة التي يتم مشاركتها عبر جميع رؤوس الاهتمام دون أن يتدهور أداء النموذج بشكل كبير. + +> باستخدام زوج واحد من أوزان إسقاط رأس القيمة، يجب أن تكون متجهات القيمة الرئيسية \\( \mathbf{k}_i، \mathbf{v}_i \\) متطابقة عبر جميع رؤوس الاهتمام والتي بدورها تعني أننا بحاجة فقط إلى تخزين زوج إسقاط قيمة رئيسي واحد في ذاكرة التخزين المؤقت بدلاً من `n_head` منها. + +نظرًا لأن معظم LLMs تستخدم ما بين 20 و100 رأس اهتمام، فإن MQA يقلل بشكل كبير من استهلاك الذاكرة لذاكرة التخزين المؤقت key-value. بالنسبة إلى LLM المستخدم في هذا الدفتر، يمكننا تقليل استهلاك الذاكرة المطلوبة من 15 جيجابايت إلى أقل من 400 ميجابايت عند طول تسلسل الإدخال 16000. 
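+
+ولتقريب هذا الرقم، إليك تقدير مبسط يقارن حجم ذاكرة التخزين المؤقت key-value بين الاهتمام الكامل متعدد الرؤوس وMQA برأس واحد مشترك، بافتراض القيم التقريبية نفسها الضمنية في الحساب أعلاه (40 طبقة و48 رأس اهتمام وبُعد رأس يبلغ 128) وبدقة float16؛ وهي قيم توضيحية وليست قراءة مباشرة من إعدادات النموذج:
+
+```python
+seq_len, n_layer, n_head, head_dim = 16_000, 40, 48, 128
+bytes_per_float = 2  # float16
+
+def kv_cache_gb(num_kv_heads):
+    # العامل 2 لأننا نخزن المفاتيح والقيم معًا
+    num_floats = 2 * seq_len * n_layer * num_kv_heads * head_dim
+    return num_floats * bytes_per_float / 1024**3
+
+print(f"{kv_cache_gb(n_head):.1f} GB مع {n_head} رأسًا")    # حوالي 15 جيجابايت
+print(f"{kv_cache_gb(1) * 1024:.0f} MB مع رأس واحد (MQA)")  # أقل من 400 ميجابايت
+```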
+ +بالإضافة إلى توفير الذاكرة، يؤدي MQA أيضًا إلى تحسين الكفاءة الحسابية كما هو موضح في ما يلي. +في فك التشفير التلقائي، يجب إعادة تحميل متجهات القيمة الرئيسية الكبيرة، ودمجها مع زوج متجه القيمة الحالي، ثم إدخالها في \\( \mathbf{q}_c\mathbf{K}^T \\) الحساب في كل خطوة. بالنسبة لفك التشفير التلقائي، يمكن أن تصبح عرض النطاق الترددي للذاكرة المطلوبة لإعادة التحميل المستمر عنق زجاجة زمنيًا خطيرًا. من خلال تقليل حجم متجهات القيمة الرئيسية، يجب الوصول إلى ذاكرة أقل، وبالتالي تقليل عنق الزجاجة في عرض النطاق الترددي للذاكرة. لمزيد من التفاصيل، يرجى إلقاء نظرة على [ورقة Noam](https://arxiv.org/abs/1911.02150). + +الجزء المهم الذي يجب فهمه هنا هو أن تقليل عدد رؤوس الاهتمام بالقيمة الرئيسية إلى 1 لا معنى له إلا إذا تم استخدام ذاكرة التخزين المؤقت للقيمة الرئيسية. يظل الاستهلاك الذروي لذاكرة النموذج لمرور واحد للأمام بدون ذاكرة التخزين المؤقت للقيمة الرئيسية دون تغيير لأن كل رأس اهتمام لا يزال لديه متجه استعلام فريد بحيث يكون لكل رأس اهتمام مصفوفة \\( \mathbf{QK}^T \\) مختلفة. + +شهدت MQA اعتمادًا واسع النطاق من قبل المجتمع ويتم استخدامها الآن بواسطة العديد من LLMs الأكثر شهرة: + +- [**Falcon**](https://huggingface.co/tiiuae/falcon-40b) +- [**PaLM**](https://arxiv.org/abs/2204.02311) +- [**MPT**](https://huggingface.co/mosaicml/mpt-30b) +- [**BLOOM**](https://huggingface.co/bigscience/bloom) + +كما يستخدم نقطة التحقق المستخدمة في هذا الدفتر - `bigcode/octocoder` - MQA. + +#### 3.2.3 مجموعة الاستعلام الاهتمام (GQA) + +[مجموعة الاستعلام الاهتمام](https://arxiv.org/abs/2305.13245)، كما اقترح Ainslie et al. من Google، وجد أن استخدام MQA يمكن أن يؤدي غالبًا إلى تدهور الجودة مقارنة باستخدام إسقاطات رأس القيمة الرئيسية المتعددة. تجادل الورقة بأنه يمكن الحفاظ على أداء النموذج بشكل أكبر عن طريق تقليل عدد أوزان إسقاط رأس الاستعلام بشكل أقل حدة. بدلاً من استخدام وزن إسقاط قيمة رئيسية واحدة فقط، يجب استخدام `n كخاتمة، من المستحسن بشدة استخدام GQA أو MQA إذا تم نشر LLM باستخدام فك التشفير التلقائي ويتطلب التعامل مع تسلسلات الإدخال الكبيرة كما هو الحال على سبيل المثال للدردشة. + + +## الخاتمة + +مجتمع البحث يأتي باستمرار بطرق جديدة ومبتكرة لتسريع وقت الاستدلال للنماذج اللغوية الكبيرة على الإطلاق. كمثال، أحد اتجاهات البحث الواعدة هو [فك التشفير التخميني](https://arxiv.org/abs/2211.17192) حيث تقوم "الرموز السهلة" بإنشائها نماذج اللغة الأصغر والأسرع ويتم إنشاء "الرموز الصعبة" فقط بواسطة LLM نفسه. إن التعمق في التفاصيل يتجاوز نطاق هذا الدفتر، ولكن يمكن قراءته في هذه [تدوينة المدونة اللطيفة](https://huggingface.co/blog/assisted-generation). + +السبب في أن LLMs الضخمة مثل GPT3/4، وLlama-2-70b، وClaude، وPaLM يمكن أن تعمل بسرعة كبيرة في واجهات الدردشة مثل [Hugging Face Chat](https://huggingface.co/chat/) أو ChatGPT يرجع إلى حد كبير إلى التحسينات المذكورة أعلاه في الدقة والخوارزميات والهندسة المعمارية. +في المستقبل، ستكون أجهزة التسريع مثل وحدات معالجة الرسومات (GPUs) ووحدات معالجة الرسومات (TPUs)، وما إلى ذلك... ستكون أسرع فقط وستسمح بمزيد من الذاكرة، ولكن يجب دائمًا التأكد من استخدام أفضل الخوارزميات والهندسة المعمارية المتاحة للحصول على أكبر قدر من المال diff --git a/docs/source/ar/model_memory_anatomy.md b/docs/source/ar/model_memory_anatomy.md new file mode 100644 index 00000000000000..6fc2552321b799 --- /dev/null +++ b/docs/source/ar/model_memory_anatomy.md @@ -0,0 +1,226 @@ +# تشريح عملية تدريب النموذج + +لفهم تقنيات تحسين الأداء التي يمكن تطبيقها لتحسين كفاءة استخدام الذاكرة وسرعة تدريب النموذج، من المفيد التعرف على كيفية استخدام وحدة معالجة الرسوميات (GPU) أثناء التدريب، وكيف تختلف كثافة العمليات الحسابية باختلاف العملية التي يتم تنفيذها. + +لنبدأ باستكشاف مثال توضيحي على استخدام وحدة GPU وتشغيل تدريب نموذج. 
وللتوضيح، سنحتاج إلى تثبيت بعض المكتبات: + +```bash +pip install transformers datasets accelerate nvidia-ml-py3 +``` + +تتيح مكتبة `nvidia-ml-py3` إمكانية مراقبة استخدام الذاكرة في النماذج من داخل بايثون. قد تكون على دراية بأمر `nvidia-smi` في الجهاز - تسمح هذه المكتبة بالوصول إلى نفس المعلومات مباشرة في بايثون. + +ثم، نقوم بإنشاء بعض البيانات الوهمية:معرّفات رموز عشوائية بين 100 و30000 وتصنيفات ثنائية للمصنف. + +في المجموع، نحصل على 512 تسلسلًا، لكل منها طول 512، ونخزنها في [`~datasets.Dataset`] بتنسيق PyTorch. + +```py +>>> import numpy as np +>>> from datasets import Dataset + +>>> seq_len, dataset_size = 512, 512 +>>> dummy_data = { +... "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)), +... "labels": np.random.randint(0, 1, (dataset_size)), +... } +>>> ds = Dataset.from_dict(dummy_data) +>>> ds.set_format("pt") +``` + +لطباعة إحصائيات موجزة لاستخدام وحدة GPU وتشغيل التدريب مع [`Trainer`]، نقوم بتعريف دالتين مساعدتين: + +```py +>>> from pynvml import * + +>>> def print_gpu_utilization(): +... nvmlInit() +... handle = nvmlDeviceGetHandleByIndex(0) +... info = nvmlDeviceGetMemoryInfo(handle) +... print(f"GPU memory occupied: {info.used//1024**2} MB.") + +>>> def print_summary(result): +... print(f"Time: {result.metrics['train_runtime']:.2f}") +... print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") +... print_gpu_utilization() +``` + +دعنا نتأكد من أننا نبدأ بذاكرة وحدة GPU خالية: + +```py +>>> print_gpu_utilization() +GPU memory occupied: 0 MB. +``` + +يبدو ذلك جيدًا: لم يتم شغل ذاكرة وحدة معالجة الرسومات كما نتوقع قبل تحميل أي نماذج. إذا لم يكن الأمر كذلك على جهازك، فتأكد من إيقاف جميع العمليات التي تستخدم ذاكرة وحدة GPU. ومع ذلك، لا يمكن للمستخدم استخدام كل ذاكرة وحدة GPU الفارغة. عندما يتم تحميل نموذج إلى وحدة GPU، يتم أيضًا تحميل النواة، والتي يمكن أن تستهلك 1-2 جيجابايت من الذاكرة. ولرؤية مقدار ذلك، نقوم بتحميل مصفوفة صغيرة إلى وحدة GPU والتي تؤدي إلى تحميل النواة أيضًا. + +```py +>>> import torch + +>>> torch.ones((1, 1)).to("cuda") +>>> print_gpu_utilization() +GPU memory occupied: 1343 MB. +``` + +نلاحظ أن النواة وحدها تستهلك 1.3 جيجابايت من ذاكرة وحدة GPU. الآن دعنا نرى مقدار المساحة التي يستخدمها النموذج. + +## تحميل النموذج + +أولاً، نقوم بتحميل نموذج `google-bert/bert-large-uncased`. نقوم بتحميل أوزان النموذج مباشرة إلى وحدة GPU حتى نتمكن من التحقق من مقدار المساحة التي تستخدمها الأوزان فقط. + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-large-uncased").to("cuda") +>>> print_gpu_utilization() +GPU memory occupied: 2631 MB. +``` + +يمكننا أن نرى أن أوزان النموذج وحدها تستهلك 1.3 جيجابايت من ذاكرة وحدة GPU. يعتمد الرقم الدقيق على وحدة GPU المحددة التي تستخدمها. لاحظ أنه في وحدات GPU الأحدث، قد يستغرق النموذج في بعض الأحيان مساحة أكبر نظرًا لأن الأوزان يتم تحميلها بطريقة مُحسّنة تُسرّع من استخدام النموذج. الآن يمكننا أيضًا التحقق بسرعة مما إذا كنا نحصل على نفس النتيجة كما هو الحال مع `nvidia-smi` CLI: + +```bash +nvidia-smi +``` + +```bash +Tue Jan 11 08:58:05 2022 ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. 
| +|===============================+======================+======================| +| 0 Tesla V100-SXM2... On | 00000000:00:04.0 Off | 0 | +| N/A 37C P0 39W / 300W | 2631MiB / 16160MiB | 0% Default | +| | | N/A | ++-------------------------------+----------------------+----------------------+ + ++-----------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=============================================================================| +| 0 N/A N/A 3721 C ...nvs/codeparrot/bin/python 2629MiB | ++-----------------------------------------------------------------------------+ +``` + +نحصل على نفس الرقم كما كان من قبل، ويمكنك أيضًا أن ترى أننا نستخدم GPU من طراز V100 مع 16 جيجابايت من الذاكرة. لذا الآن يمكننا بدء تدريب النموذج ورؤية كيف يتغير استخدام ذاكرة GPU. أولاً، نقوم بإعداد بعض معاملات التدريب القياسية: + +```py +default_args = { + "output_dir": "tmp"، + "eval_strategy": "steps"، + "num_train_epochs": 1، + "log_level": "error"، + "report_to": "none"، +} +``` + + + + إذا كنت تخطط لتشغيل عدة تجارب، من أجل مسح الذاكرة بشكل صحيح بين التجارب، قم بإعادة تشغيل نواة Python بين التجارب. + + + +## استخدام الذاكرة في التدريب الأساسي + +دعونا نستخدم [`Trainer`] وقم بتدريب النموذج دون استخدام أي تقنيات تحسين أداء GPU وحجم دفعة يبلغ 4: + +```py +>>> from transformers import TrainingArguments، Trainer، logging + +>>> logging.set_verbosity_error() + + +>>> training_args = TrainingArguments(per_device_train_batch_size=4، **default_args) +>>> trainer = Trainer(model=model، args=training_args، train_dataset=ds) +>>> result = trainer.train() +>>> print_summary(result) +``` + +``` +الوقت: 57.82 +العينات / الثانية: 8.86 +ذاكرة GPU المشغولة: 14949 ميجابايت. +``` + +يمكننا أن نرى أن حجم دفعة صغير نسبيًا يملأ تقريبًا ذاكرة GPU بالكامل. ومع ذلك، غالبًا ما يؤدي حجم دفعة أكبر في تقارب نموذج أسرع أو أداء أفضل في النهاية. لذلك نريد أن نضبط حجم الدفعة وفقًا لاحتياجات النموذج لدينا وليس مع قيود وحدة GPU. ما يثير الاهتمام هو أننا نستخدم ذاكرة أكثر بكثير من حجم النموذج. +لفهم سبب ذلك بشكل أفضل، دعنا نلقي نظرة على عمليات النموذج واحتياجاته من الذاكرة. + +## تشريح عمليات النموذج + +تتضمن بنية المحولات 3 مجموعات رئيسية من العمليات مُجمعة أدناه حسب كثافة العمليات الحسابية. + +1. **عمليات ضرب المصفوفات** + + تقوم الطبقات الخطية ومكونات الانتباه متعدد الرؤوس جميعها بعمليات ضرب ** المصفوفة بالمصفوفة** على دفعات. هذه العمليات هي أكثر أجزاء تدريب المحولات كثافة من الناحية الحسابية. + +2. **عمليات التسوية الإحصائية** + +تُعد عمليات Softmax والتسوية الطبقية أقل كثافة من ناحية الحسابية من عمليات ضرب المصفوفات، وتنطوي على عملية أو أكثر من عمليات **الاختزال**، والتي يتم تطبيق نتيجتها بعد ذلك عبر خريطة. + +3. **العمليات على مستوى العناصر** + + هذه هي العمليات المتبقية: **الانحيازات، والتسرب، ووظائف التنشيط، والوصلات المتبقية**. هذه هي عمليات أقل كثافة من الناحية الحسابية. + +يمكن أن تكون هذه المعرفة مفيدة لمعرفة عند تحليل اختناقات الأداء. + +هذا الملخص مُشتق من [نقل البيانات هو كل ما تحتاجه: دراسة حالة حول تحسين المحولات 2020](https://arxiv.org/abs/2007.00072) + + +## تشريح ذاكرة النموذج + +لقد رأينا أن تدريب النموذج يستخدم ذاكرة أكثر بكثير من مجرد وضع النموذج على GPU. ويرجع ذلك إلى +هناك العديد من المكونات أثناء التدريب التي تستخدم ذاكرة GPU. المكونات الموجودة في ذاكرة GPU هي التالية: + +1. أوزان النموذج +2. الدول المُحسّن +3. المُتدرجات +4. تنشيطات المسار الأمامي المحفوظة لحساب المُتدرجات +5. المخازن المؤقتة +6. ذاكرة محددة الوظائف + +يتطلب نموذج نموذجي مدرب بدقة مختلطة 18 بايت للمُحسّن AdamW كل معلمة نموذج بالإضافة إلى ذاكرة التنشيط. 
للاستدلال لا توجد حالات مُحسّن و مُتدرجات، لذلك يمكننا طرح تلك. وهكذا ننتهي مع 6 بايت لكل +معلمة نموذج للدقة المختلطة الاستدلال، بالإضافة إلى ذاكرة التنشيط. + +دعنا نلقي نظرة على التفاصيل. + +**أوزان النموذج:** + +- 4 بايت * عدد المعلمات للتدريب على دقة fp32 +- 6 بايت * عدد المعلمات لتدريب الدقة المختلطة (يحافظ على نموذج في fp32 وآخر بدقة fp16 في الذاكرة) + +**حالات المُحسّن:** + +- 8 بايت * عدد المعلمات للمُحسّن AdamW العادي (يحافظ على حالتين) +- 2 بايت * عدد المعلمات لمُحسّنات 8 بت AdamW مثل [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) +- 4 بايت * عدد المعلمات لمُحسّنات مثل SGD مع الزخم momentum (يحافظ على حالة واحدة فقط) + +**المُتدرجات** + +- 4 بايت * عدد المعلمات للتدريب بدقة fp32 أو بدقة مختلطة (المُتدرجات تكون دائمًا بدقة fp32) + +**تنشيطات المسار الأمامي** + +- يعتمد الحجم على العديد من العوامل، وأهمها طول التسلسل وحجم المخفية وحجم الدُفعة. + +هناك المدخلات والمخرجات لذي يتم تمريرها وإرجاعها بواسطة وظائف المسار الأمامي والمسار الخلفي وتنشيطات المسار الأمامي المحفوظة لحساب المُتدرجات. + +**الذاكرة المؤقتة** + +بالإضافة إلى ذلك، هناك جميع أنواع المتغيرات المؤقتة التي يتم تحريرها بمجرد الانتهاء من الحساب، ولكن في +لحظة يمكن أن تتطلب هذه المتغيرات المؤقتة ذاكرة إضافية ويقد تؤدي إلى نفاد الذاكرة المُخصصة (OOM). لذلك، عند البرمجة، من المهم التفكير بشكل استراتيجي حول هذه المتغيرات المؤقتة وأحيانًا تحريرها بشكل صريح بمجرد عدم الحاجة إليها. + +**ذاكرة محددة الوظائف** + +ثم، قد يكون لبرنامجك احتياجات خاصة بالذاكرة. على سبيل المثال، عند إنشاء نص باستخدام البحث الشعاعي، يحتاج البرنامج +إلى الاحتفاظ بنسخ متعددة من المدخلات والمخرجات. + +**سرعة تنفيذ `forward` مقابل `backward`** + +بالنسبة للالتفافات والطبقات الخطية، هناك ضِعف عدد العمليات 2x flops في المسار الخلفى مقارنة بالمسار الأمامي، والتي يُترجم عمومًا إلى ~2x أبطأ (أحيانًا أكثر، لأن الأحجام في المسار الخلفى تميل إلى أن تكون أكثر صعوبة). عادةً ما تكون عمليات التنشيط محدودة بعرض النطاق الترددي، ومن المعتاد أن يتعين على التنشيط قراءة المزيد من البيانات في المسار الخلفى أكثر من المسار الأمامى. +(على سبيل المثال، قراءة التنشيط المسار الأمامى مرة واحدة، وتكتب مرة واحدة، وبينما تقرأ عملية التنشيط الخلفي مرتين، gradOutput وإخراج الأمام، وتكتب مرة واحدة، gradInput). + +كما ترى، هناك بضعة أماكن يمكننا فيها توفير ذاكرة GPU أو تسريع العمليات. +الآن بعد أن فهمت ما يؤثر على استخدام GPU وسرعة الحساب، راجع +صفحة وثائق [أساليب وأدوات التدريب الفعال على GPU واحد](perf_train_gpu_one) لمعرفة المزيد حول تقنيات تحسين الأداء. \ No newline at end of file diff --git a/docs/source/ar/model_summary.md b/docs/source/ar/model_summary.md new file mode 100644 index 00000000000000..57de935b70f459 --- /dev/null +++ b/docs/source/ar/model_summary.md @@ -0,0 +1,89 @@ +# عائلة نماذج المحول + +منذ إطلاقه في عام 2017، ألهم نموذج [المحول الأصلي](https://arxiv.org/abs/1706.03762) (راجع مدونة [المحول المشروح](http://nlp.seas.harvard.edu/2018/04/03/attention.html) لمقدمة تقنية مبسطة)، ألهم العديد من النماذج الجديدة والمبتكرة التي تتجاوز مهام معالجة اللغات الطبيعية (NLP). هناك نماذج للتنبؤ [بالبنية البروتينات المطوية](https://huggingface.co/blog/deep-learning-with-proteins)، و[تدريب على اتخاذ القرار](https://huggingface.co/blog/train-decision-transformers)، و[التنبؤ بالسلاسل الزمنية](https://huggingface.co/blog/time-series-transformers). مع وجود العديد من متغيرات المحول المتاحة، قد يكون من السهل أن تفوتك الصورة الأكبر. ما تشترك فيه جميع هذه النماذج هو أنها تستند إلى بنية المحول الأصلية. تستخدم بعض النماذج فقط الترميز أو فك الترميز، بينما تستخدم نماذج أخرى كليهما. 
يوفر هذا تصنيفًا مفيدًا لتصنيف واستعراض الفروقات الرئيسية بين نماذج عائلة المحولات، وسيساعدك على فهم النماذج التي لم تصادفها من قبل. + +إذا لم تكن على دراية بنموذج المحول الأصلي أو تحتاج إلى تذكير، فراجع الفصل الخاص بـ [كيف تعمل المحولات](https://huggingface.co/course/chapter1/4?fw=pt) من دورة Hugging Face. + 
+ +
+ +## رؤية الحاسب (Computer vision) + + + +### الشبكة التلافيفية (Convolutional network) + +لطالما كانت الشبكات التلافيفية (CNNs) الطريقة السائدة لمهام رؤية الحاسب حتى برز [محول الرؤية](https://arxiv.org/abs/2010.11929) قابليته للتطوير وكفاءته العالية. وحتى بعد ذلك، لا تزال بعض أفضل صفات CNN، مثل ثبات الإزاحة، قوية جدًا (خاصة بالنسبة لمهام معينة) لدرجة أن بعض المحولات تدمج التلافيف في بنيتها. قلب [ConvNeXt](model_doc/convnext) هذا التبادل رأسًا على عقب وأدرج خيارات التصميم من المحولات لتحديث CNN. على سبيل المثال، يستخدم ConvNeXt نوافذ منزلقة غير متداخلة لتقسيم الصورة إلى رقع وزيادة حقل مجال العام الخاص بها. كما يقوم ConvNeXt بعدة خيارات مثل تصميم الطبقة لتكون أكثر كفاءة في الذاكرة وتحسين الأداء، مما يجعله منافسًا قويًا للمحولات! + +### الترميز[[cv-encoder]] (Encoder) + +فتح [محول الرؤية (ViT)](model_doc/vit) الباب أمام مهام رؤية الحاسب دون الاعتماد على التلافيف. يستخدم ViT ترميز محول قياسي، لكن إنجازه الرئيسي كان طريقة معالجته للصورة. فهو تقسّم الصورة إلى رقّعات ذات حجم ثابت ويستخدمها لإنشاء تضمين، تمامًا مثل تقسيم الجملة إلى رموز. استفاد ViT من بنية المُحوِّلات الفعالة لإظهار نتائج تنافسية مع CNNs في ذلك الوقت مع الحاجة إلى موارد أقل للتدريب. وسرعان ما تبع ViT نماذج رؤية أخرى يمكنها أيضًا التعامل مع مهام الرؤية الكثيفة مثل التجزئة والتعرف. + +من بين هذه النماذج [Swin](model_doc/swin) Transformer. فهو يبني خرائط سمات هرمية (مثل CNN 👀 على عكس ViT) من رقّعات أصغر حجمًا ودمجها مع الرقع المجاورة في طبقات أعمق. يتم حساب الانتباه فقط ضمن نافذة محلية، ويتم تحويل النافذة بين طبقات الانتباه لإنشاء اتصالات تساعد النموذج على التعلم بشكل أفضل. نظرًا لأن محول Swin يمكنه إنتاج خرائط خصائص هرمية، فهو مرشح جيد لمهام التنبؤ الكثيفة مثل التجزئة والتعرف. كما يستخدم [SegFormer](model_doc/segformer) ترميز محول لبناء خرائط خصائص هرمية، ولكنه يضيف فك تشفير بسيط متعدد الطبقات (MLP) في الأعلى لدمج جميع خرائط الخصائص وإجراء تنبؤ. + +استلهمت نماذج الرؤية الأخرى، مثل BeIT وViTMAE، الإلهام من هدف التدريب المسبق لـ BERT. يتم تدريب [BeIT](model_doc/beit) مسبقًا من خلال *نمذجة الصور المقنعة (MIM)*؛ يتم إخفاء رقّعات الصور بشكل عشوائي، كما يتم تحويل الصورة إلى رموز بصرية. يتم تدريب BeIT للتنبؤ بالرموز البصرية المُناظرة للرقع المخفية. لدى [ViTMAE](model_doc/vitmae) هدف تدريب مسبق مُماثل، باستثناء أنه يجب عليه التنبؤ بالبكسلات بدلاً من الرموز البصرية. ما هو غير عادي هو أن إخفاء 75% من رقع الصور! يقوم فك التشفير بإعادة بناء البكسلات من الرموز المخفية والرقّعات المشفرة. بعد التدريب المسبق، يتم التخلص من فك التشفير، ويصبح الترميز جاهزًا للاستخدام في مهام التالية. + +### فك التشفير[[cv-decoder]] (Decoder) + +نادرًا ما تستخدم نماذج الرؤية التي تعتمد على فك التشفير فقط لأن معظم نماذج الرؤية تعتمد على الترميز لتعلم تمثيل الصورة. ولكن بالنسبة للاستخدامات مثل توليد الصور، يعد فك التشفير مناسبًا بشكل طبيعي، كما رأينا من نماذج توليد النصوص مثل GPT-2. يستخدم نموذج [ImageGPT](model_doc/imagegpt) نفس بنية GPT-2، ولكنه بدلاً من التنبؤ بالرمز التالي في تسلسل، فإنه يتنبأ بالبكسل التالي في صورة. بالإضافة إلى توليد الصور، يمكن أيضًا ضبط ImageGPT بدقة لتصنيف الصور. + +### الترميز وفك التشفير[[cv-encoder-decoder]] (Encoder-decoder) + +تستخدم نماذج الرؤية بشكل شائع ترميزًا (يُعرف أيضًا باسم العمود الفقري) لاستخراج ميزات الصورة المهمة قبل تمريرها إلى فك التشفير لنموذج المُحوّل. يستخدم [DETR](model_doc/detr) عمودًا فقريًا مُدربًا مسبقًا، ولكنه يستخدم أيضًا الببنية الكاملة للترميز وفك تشفير لنموذج المحول للكشف عن الأشياء. يتعلم الترميز تمثيلات الصور ويجمعها مع استعلامات الكائنات (كل استعلام كائن هو تضمين مُتعلم يركز على منطقة أو كائن في صورة) في فك التشفير. يتنبأ DETR بإحداثيات مربع الحدود وتسمية الفئة لكل استعلام كائن. 
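+
+وللتجربة العملية، يمكن استدعاء نموذج كشف كائنات مثل DETR عبر واجهة `pipeline` كما في المثال المبسط التالي (نقطة التحقق ورابط الصورة هنا مجرد افتراضات شائعة للتوضيح):
+
+```python
+from transformers import pipeline
+
+detector = pipeline("object-detection", model="facebook/detr-resnet-50")
+results = detector("http://images.cocodataset.org/val2017/000000039769.jpg")
+print(results[0])  # أول كائن مكتشف: قاموس يحتوي على score وlabel وbox
+```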
+ +## معالجة اللغات الطبيعية (Natural language processing - NLP) + + + +### الترميز اللغوي[[nlp-encoder]] + +نموذج [BERT](model_doc/bert) هو محوّل (Transformer) يعتمد على الترميز فقط يقوم بشكل عشوائي بإخفاء رموز معينة في المدخلات لتجنب رؤية باقى الرموز الأخرى، مما يسمح له "بالغش". يتمثل هدف التدريب المسبق في التنبؤ بالرمز المخفي بناءً على السياق. يسمح هذا لـ BERT باستخدام السياقات اليمنى واليسرى بالكامل لمساعدته في تعلم تمثيل أعمق وأغنى للبيانات المدخلة. ومع ذلك، كان هناك مجال للتحسين في استراتيجية التدريب المسبق لـ BERT. نموذج [RoBERTa](model_doc/roberta) اضاف تحسين من خلال تقديم وصفة تدريب مسبق جديدة تشمل التدريب لفترة أطول وعلى دفعات أكبر، وإخفاء الرموز عشوائيًا في كل حقبة بدلاً من مرة واحدة فقط أثناء المعالجة المسبقة، وإزالة هدف التنبؤ بالجملة التالية. + +تتمثل الاستراتيجية السائدة لتحسين الأداء في زيادة حجم النموذج. ولكن تدريب النماذج الكبيرة مكلف من الناحية الحسابية. إحدى طرق تقليل التكاليف الحسابية هي استخدام نموذج أصغر مثل [DistilBERT](model_doc/distilbert). يستخدم DistilBERT [ تقنية تقطير المعرفة](https://arxiv.org/abs/1503.02531) - وهي تقنية ضغط - لإنشاء نموذج أصغر من BERT مع الحفاظ على معظم قدراته على فهم اللغةا. + +مرت معظم نماذج المحول في الاتجاه نحو المزيد من المعلمات، مما أدى إلى ظهور نماذج جديدة تركز على تحسين كفاءة التدريب. يقلّل [ALBERT](model_doc/albert) من استهلاك الذاكرة عن طريق تقليل عدد المعلمات بطريقتين: فصل تضمين المفردات الأكبر إلى مصفوفتين أصغر والسماح للمستويات بمشاركة المعلمات. أضاف [DeBERTa](model_doc/deberta) آلية انتباه منفصلة حيث يتم ترميز الكلمة وموضعها بشكل منفصل في متجهين. يتم حساب الانتباه من هذه المتجهات المنفصلة بدلاً من متجه واحد يحتوي على تضمين الكلمة والموقع. ركز [Longformer](model_doc/longformer) أيضًا على جعل الانتباه أكثر كفاءة، خاصة لمعالجة المستندات ذات تسلسلات أطولل. فهو يستخدم مزيجًا من انتباه النوافذ المحلية (يتم حساب الانتباه فقط ن نافذة ذات حجم ثابت حول كل رمز) والانتباه العام (فقط لرموز مهمة محددة مثل `[CLS]` للتصنيف) لإنشاء مصفوفة انتباه متفرقة بدلاً من مصفوفة انتباه كاملة. + +### فك التشفير[[nlp-decoder]] + + نموذج [GPT-2](model_doc/gpt2) هو محول فك تشفير فقط يتنبأ بالكلمة التالية في التسلسل. إنه يخفي الرموز التالية الموجودة على اليمين حتى لا يتمكن النموذج من "الغش" بالنظر إليها. من خلال التدريب المسبق على كميات هائلة من النصوص، أصبح [GPT-2](model_doc/gpt2) بارعًا في توليد النصوص، حتى لو لم تكن النص دقيقًا أو صحيحًا في بعض الأحيان فقط. ولكن كان يفتقر إلى سياق لترابط المتبادل (bidirectional context) الموجود من التدريب المسبق لـ [BERT](model_doc/bert) ، مما جعله غير مناسب لمهام معينة. يجمع [XLNET](model_doc/xlnet) بين أفضل ما في أهداف التدريب المسبق لـ [BERT](model_doc/bert) و [GPT-2](model_doc/gpt2) من خلال اعتماد نهج النمذجة اللغوية باستخدام التباديل (Permutation Language Modeling - PLM) الذي يسمح له بتعلم الترابط ثنائي الاتجاه. + +بعد ظهور [GPT-2](model_doc/gpt2)، تطورت النماذج اللغوية بشكل أكبر حجمًا وأكثر تعقيدًا وأصبحت تُعرف الآن باسم *نماذج اللغة الكبيرة (LLMs)*. توضح LLMs مهارات تعلم قليلة الكمية أو حتى معدومة إذا تم تدريبها على مجموعة بيانات كبيرة بما يكفي. [GPT-J](model_doc/gptj) هو LLM به 6 مليارات معلمة مدربة على 400 مليار رمز. تبعه نموذج [OPT](model_doc/opt)، وهي عائلة من نماذج فك التشفير فقط، أكبرها 175 مليار معلمة ودُرب على 180 مليار رمز. تم إصدار [BLOOM](model_doc/bloom) في نفس الوقت تقريبًا، ويحتوي أكبر نموذج في العائلة على 176 مليار معلمة ودُرب على 366 مليار رمز في 46 لغة و13 لغة برمجة. + +### الترميز وفك التشفير[[nlp-encoder-decoder]] + +يحتفظ [BART](model_doc/bart) ببنية المحول الأصلية، ولكنه يعدّل هدف التدريب المسبق باستخدام إفساد *إدخال النصوص*، حيث يتم استبدال بعض نطاقات النص برمز `mask` واحد. 
يتنبأ فك التشفير بالرموز غير الفاسدة (يتم إخفاء الرموز المستقبلية) ويستخدم حالات الترميز المخفية للمساعدة. [Pegasus](model_doc/pegasus) مشابه لـ BART، ولكن Pegasus يقوم بإخفاء جمل كاملة بدلاً من مقاطع النص. بالإضافة إلى نمذجة اللغة المقنعة، يتم تدريب Pegasus مسبقًا بواسطة توليد الجمل الفارغة (GSG). يقوم هدف GSG بإخفاء الجمل الكاملة المهمة للمستند، واستبدالها برمز `mask`. يجب على فك التشفير توليد المخرجات من الجمل المتبقية. [T5](model_doc/t5) هو نموذج فريد من نوعه يحوّل جميع مهام معالجة اللغة الطبيعية إلى مشكلة نص إلى نص باستخدام بادئات محددة. على سبيل المثال، يشير البادئة `Summarize:` إلى مهمة تلخيص. يتم تدريب T5 مسبقًا بواسطة التدريب الخاضع للإشراف (GLUE وSuperGLUE) والتدريب ذاتي الإشراف (اختيار عينة عشوائية وحذف 15% من الرموز). + +## الصوت (Audio) + + + +### الترميز[[audio-encoder]] + +يستخدم [Wav2Vec2](model_doc/wav2vec2) ترميز من نوع المحوّل لتعلم تمثيلات الكلام بشكلٍ مباشر من موجات الصوت الخام. يتم تدريبه مسبقًا باستخدام مهمة تباينية لتحديد تمثيل الكلام الصحيح من مجموعة من التمثيلات الخاطئة. [HuBERT](model_doc/hubert) مشابه لـ Wav2Vec2 ولكنه له عملية تدريب مختلفة. يتم إنشاء تسميات الهدف عن طريق خطوة تجميع يتم فيها ت تخصيص مقاطع الصوت المتشابهة إلى مجموعات، تُصبح كل واحدة منها وحدةً خفية. ويتم تعيين الوحدة الخفية إلى تمثيل لإجراء تنبؤ. + +### الترميز وفك التشفير[[audio-encoder-decoder]] + +[Speech2Text](model_doc/speech_to_text) هو نموذج كلام مصمم للتعرف التلقائي على الكلام (ASR) وترجمة الكلام. يقبل النموذج ميزات بنك المرشح اللغوي التي تم استخراجها من شكل موجة الصوت وتم تدريبه مسبقًا بطريقة ذاتية التعلم لتوليد نسخة أو ترجمة. [Whisper](model_doc/whisper) هو أيضًا نموذج ASR، ولكنه على عكس العديد من نماذج الكلام الأخرى، يتم تدريبه مسبقًا على كمية كبيرة من بيانات نسخ النص الصوتي ✨ المسماة ✨ لتحقيق الأداء الصفري. يحتوي جزء كبير من مجموعة البيانات أيضًا على لغات غير اللغة الإنجليزية، مما يعني أنه يمكن استخدام Whisper أيضًا للغات منخفضة الموارد. من الناحية الهيكلية، يشبه Whisper نموذج Speech2Text. يتم تحويل إشارة الصوت إلى طيف لوجاريتم مل-ميل يتم تشفيره بواسطة الترميز. يقوم فك التشفير بتوليد النسخة بطريقة ذاتية التعلم من حالات الترميز المخفية والرموز السابقة. + +## متعدد الوسائط (Multimodal) + + + +### Encoder[[mm-encoder]] + +نموذج [VisualBERT](model_doc/visual_bert) هو نموذج متعدد الوسائط لمهام الرؤية اللغوية تم إصداره بعد فترة وجيزة من BERT. فهو يجمع بين BERT ونظام اكتشاف كائن مسبق التدريب لاستخراج ميزات الصورة في تضمينات بصرية، يتم تمريرها جنبًا إلى جنب مع التضمينات النصية إلى BERT. يتنبأ VisualBERT بالنص المقنع بناءً على النص غير المقنع والتضمينات المرئية، ويجب عليه أيضًا التنبؤ بما إذا كان النص متوافقًا مع الصورة. عندما تم إصدار ViT، اعتمد [ViLT](model_doc/vilt) ViT في بنيتها لأنه كان من الأسهل الحصول على تضمينات الصورة بهذه الطريقة. يتم معالجة تضمينات الصورة بشكل مشترك مع التضمينات النصية. ومن هناك، يتم التدريب المسبق لـ ViLT بواسطة مطابقة الصورة النصية، ونمذجة اللغة المقنعة، وإخفاء كلمة كاملة. + +يتّبع [CLIP](model_doc/clip) نهجًا مختلفًا ويقوم بتنبؤ ثنائي من ("الصورة"، "النص"). يتم تدريب مشفر صورة (ViT) ومشفر نص (Transformer) بشكل مشترك على مجموعة بيانات مكونة من 400 مليون ثنائي من ("صورة"، "نص") لتعظيم التشابه بين متجهات ترميز الصورة ومتجهات النص ثنائي ("الصورة"، "النص"). بعد التدريب المسبق، يمكنك استخدام اللغة الطبيعية لتوجيه CLIP للتنبؤ بالنص المُعطى بناءً على صورة أو العكس بالعكس. [OWL-ViT](model_doc/owlvit) يبني على CLIP باستخدامه كعمود فقري للكشف عن الكائنات بدون إشراف. بعد التدريب المسبق، يتم إضافة رأس كشف الأجسام لإجراء تنبؤ بمجموعة مُحدّد عبر ثنائيات ("class"، "bounding box"). 
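+
+وكمثال توضيحي موجز على الاستخدام، يمكن تجربة التصنيف الصفري للصور بنموذج CLIP عبر `pipeline` (نقطة التحقق والتسميات المرشحة أدناه افتراضية للتوضيح فقط):
+
+```python
+from transformers import pipeline
+
+classifier = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")
+preds = classifier(
+    "http://images.cocodataset.org/val2017/000000039769.jpg",
+    candidate_labels=["a photo of a cat", "a photo of a dog", "a photo of a car"],
+)
+print(preds[0])  # عادةً التسمية ذات أعلى درجة تشابه مع الصورة
+```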
+ +### Encoder-decoder[[mm-encoder-decoder]] + +التعرّف البصري على الحروف (OCR) مهمة قديمة لتعرّف النصوص، التي تنطوي عادةً على عدة مكونات لفهم الصورة وتوليد النص. [TrOCR](model_doc/trocr) بتبسيط العملية باستخدام محول متكامل من النهاية إلى النهاية. المشفر هو نموذج على غرار ViT لفهم الصورة ويعالج الصورة كقطع ثابتة الحجم. يقبل فك التشفير حالات الإخفاء للمشفر وينشئ النص بشكل تلقائي. [Donut](model_doc/donut) هو نموذج أكثر عمومية لفهم المستندات المرئية لا يعتمد على نهج OCR. يستخدم محول Swin كمشفر وBART متعدد اللغات كمُفكّك تشفير. يتم تدريب Donut على قراءة النص عن طريق التنبؤ بالكلمة التالية بناءً على ملاحظات الصورة والنص. يقوم فك التشفير بتوليد تتسلسلًا رمزيًا بناءً على موجه (Prompt). يتم تمثيل الموجه بواسطة رمز خاص لكل مهمة. على سبيل المثال، يحتوي تحليل المستند على رمز خاص "parsing" يتم دمجه مع حالات الإخفاء للـمُشفّر لتحليل المستند بتنسيق إخراج منظم (JSON). + +## التعلم التعزيزي (Reinforcement learning - RL) + + + +### فك التشفير[[rl-decoder]] + +يقوم نموذج "محوّل القرارات والمسارات" (Decision and Trajectory Transformer) بتحويل الحالة (State) والإجراء (Action) والمكافأة (Reward) كمشكلة نمذجة تسلسلية. [محوّل القرارات](model_doc/decision_transformer) يقوم بتوليد سلسلة من الإجراءات التي تؤدي إلى عائد مرغوب في المستقبل بناءً على العوائد المتوقعة، والحالات والإجراءات السابقة. في الخطوات الزمنية *K* الأخيرة، يتم تحويل كل وسائط البيانات الثلاث vإلى متجهات تضمين رمزيّة ومعالجتها بواسطة نموذج مشابه لـ GPT للتنبؤ برمز الإجراء المستقبلي.يقوم [محول المسار](model_doc/trajectory_transformer) أيضًا بتحويل الحالات والإجراءات والمكافآت إلى رموز ومعالجتها باستخدام هيكلية GPT. على عكس "محوّل القرارات"، الذي يركز على تكييف المكافأة، يقوم "محوّل المسارات" بتوليد إجراءات مستقبلية باستخدام البحث الشعاعي (Beam Search). diff --git a/docs/source/ar/pad_truncation.md b/docs/source/ar/pad_truncation.md new file mode 100644 index 00000000000000..d41d4f9c6b453b --- /dev/null +++ b/docs/source/ar/pad_truncation.md @@ -0,0 +1,52 @@ +# الحشو والتقليم + +غالبًا ما تختلف مدخلات الدُفعات في الطول، لذا لا يمكن تحويلها إلى مصفوفات ذات حجم ثابت .يُعدّ الحشو والتقليم هما استراتيجيتان للتعامل مع هذه المشكلة، لإنشاء مصفوفات مستطيلة من مجموعات ذات أطوال مختلفة. ويضيف الحشو رمز **حشو** خاص لضمان أن يكون للتسلسلات الأقصر نفس طول أطول تسلسل في الدفعة أو الطول الأقصى الذي يقبله النموذج. ويعمل التقليم عكس ذلك بتقليم التسلسلات الطويلة. + +في معظم الحالات، ييُعدّ حشو دُفعتك إلى طول أطول تسلسل فيها وتقليمها إلى الطول الأقصى المقبول من النموذج حلًا فعالًا. ومع ذلك، تدعم واجهة برمجة التطبيقات المزيد من الاستراتيجيات إذا كنت بحاجة إليها. هناك ثلاثة معامﻻت تحتاجها لفهم آلية العمل: `padding`، و`truncation`، و`max_length`. + +يحكم معامل `padding` عملية الحشو. يمكن أن يكون قيمة منطقية أو نصية: + + - `True` أو `'longest'`: الحشو إلى أطول تسلسل في الدفعة (لا يتم تطبيق الحشو عند تقديم تسلسل واحد فقط). + - `'max_length'`: الحشو إلى طول محدد بواسطة معامل `max_length` أو الطول الأقصى الذي يقبله + النموذج إذا لم يتم توفير `max_length` (`max_length=None`). سيظل الحشو مطبقًا إذا قدمت تسلسلًا واحدًا فقط. + - `False` أو `'do_not_pad'`: لا يتم تطبيق أي حشو. هذا هو السلوك الافتراضي. + +تحكم معامل `truncation` عملية التقليم. يمكن أن يكون قيمة منطقية أو نصية: + + -قيمة `True` أو `'longest_first'` : تقليم التسلسلات إلى طول أقصى مُحدد بواسطة معامل `max_length`، أو أقصى طول يقبله النموذج في حال عدم تحديد طول مُحدد من قبل المستخدم (`max_length=None`). ستتم عملية التقليم إزالة رمز تلو الآخر، بدءًا من أطول تسلسل في الزوج، إلى أن يصل الطول إلى القيمة المُحددة. 
+ -قيمة `'only_second'`: اقطع إلى طول أقصى محدد بواسطة معامل `max_length` أو أقصى طول يقبله النموذج إذا لم يتم توفير `max_length` (`max_length=None`). هذا سيقلم فقط الجملة الثانية من الزوج إذا تم توفير زوج من التسلسلات (أو دُفعة من أزواج التسلسلات). + -قيمة `'only_first'`: تقليم الجملة الأولى فقط من الزوج عند تقديم زوج من التسلسلات (أو دُفعة من أزواج التسلسلات) إلى طول أقصى مُحدد بواسطة حجة `max_length`، أو أقصى طول يقبله النموذج في حال عدم تحديد طول مُحدد من قبل المستخدم (`max_length=None`). + -قيمة `False` أو `'do_not_truncate'`: لا يتم تطبيق أي تقليم. هذا هو السلوك الافتراضي. +`` + +يحكم معامل `max_length` طول الحشو والتقليم. يمكن أن يكون عدد صحيح أو `None`، وعندها يُحدد افتراضيًا إلى الطول الأقصى الذي يمكن أن يقبله النموذج. إذا لم يكن للنموذج طول إدخال أقصى محدد، يتم إلغاء تنشيط التقليم أو الحشو إلى `max_length`. + +يلخّص الجدول التالي الطريقة المُوصى بها لإعداد الحشو والتقليم. إذا كنت تستخدم أزواج تسلسلات الإدخال في أي من الأمثلة التالية، فيمكنك استبدال `truncation=True` بـ `STRATEGY` المحدد في `['only_first'، 'only_second'، 'longest_first']`، أي `truncation='only_second'` أو `truncation='longest_first'` للتحكم في كيفية تقليم كلا التسلسلين في الزوج كما هو موضّح سابقًا. + + +# حيل الترميز + +هناك العديد من الاستراتيجيات لترميز دفعات الجمل. فيما يلي بعض الأمثلة على ذلك. + +| الترميز | الحشو | التعليمات | +|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| +| لا ترميز | لا حشو | `tokenizer(batch_sentences)` | +| | الحشو إلى الحد الأقصى للتسلسل في الدفعة | `tokenizer(batch_sentences, padding=True)` أو | +| | | `tokenizer(batch_sentences, padding='longest')` | +| | الحشو إلى الحد الأقصى لطول إدخال النموذج | `tokenizer(batch_sentences, padding='max_length')` | +| | الحشو إلى طول محدد | `tokenizer(batch_sentences, padding='max_length', max_length=42)` | +| | الحشو إلى مضاعف لقيمة معينة | `tokenizer(batch_sentences, padding=True, pad_to_multiple_of=8)` | +| الترميز إلى الحد الأقصى لطول إدخال النموذج | لا حشو | `tokenizer(batch_sentences, truncation=True)` أو | +| | | `tokenizer(batch_sentences, truncation=STRATEGY)` | +| | الحشو إلى الحد الأقصى للتسلسل في الدفعة | `tokenizer(batch_sentences, padding=True, truncation=True)` أو | +| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY)` | +| | الحشو إلى الحد الأقصى لطول إدخال النموذج | `tokenizer(batch_sentences, padding='max_length', truncation=True)` أو | +| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)` | +| | الحشو إلى طول محدد | غير ممكن | +| الترميز إلى طول محدد | لا حشو | `tokenizer(batch_sentences, truncation=True, max_length=42)` أو | +| | | `tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)` | +| | الحشو إلى الحد الأقصى للتسلسل في الدفعة | `tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` أو | +| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)` | +| | الحشو إلى الحد الأقصى لطول إدخال النموذج | غير ممكن | +| | الحشو إلى طول محدد | `tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` أو | +| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` | \ No newline at end of file diff --git a/docs/source/ar/perplexity.md b/docs/source/ar/perplexity.md new file mode 100644 index 00000000000000..e7d2a19d2c837f --- /dev/null +++ b/docs/source/ar/perplexity.md @@ -0,0 +1,94 @@ +# التعقيد اللغوي للنماذج ذات الطول الثابت + +[[open-in-colab]] + + التعقيد 
اللغوي (PPL) هي واحدة من أكثر المقاييس شيوعًا لتقييم نماذج اللغة. قبل الخوض في التفاصيل، يجب أن نلاحظ أن المقياس ينطبق تحديدًا على نماذج اللغة الكلاسيكية (يُطلق عليها أحيانًا نماذج اللغة التلقائية المرجعية أو السببية) وهي غير محددة جيدًا لنماذج اللغة المقنعة مثل BERT (راجع [ملخص النماذج](model_summary)). + +تُعرَّف التعقيد اللغوي على أنها الأس المُرفوع لقيمة متوسط اللوغاريتم الاحتمالي لمتتالية. إذا كان لدينا تسلسل رمزي \\(X = (x_0, x_1, \dots, x_t)\\)، فإن حيرة \\(X\\) هي، + +$$\text{PPL}(X) = \exp \left\{ {-\frac{1}{t}\sum_i^t \log p_\theta (x_i|x_{ + +لكن عند التعامل مع النماذج التقريبية، نواجه عادةً قيدًا على عدد الرموز التي يمكن للنموذج معالجتها. على سبيل المثال، تحتوي أكبر نسخة من [GPT-2](model_doc/gpt2) على طول ثابت يبلغ 1024 رمزًا، لذا لا يمكننا حساب \\(p_\theta(x_t|x_{ + +تتميز هذه الطريقة بسرعة حسابها نظرًا لإمكانية حساب درجة التعقيد اللغوي لكل جزء بمسح واحد للأمام، إلا أنها تُعدّ تقريبًا ضعيفًا لدرجة التعقيد اللغوي المُحلّلة بشكل كامل، وعادةً ما تؤدي إلى درجة تعقيد لغوي أعلى (أسوأ) لأن النموذج سيكون لديه سياق أقل في معظم خطوات التنبؤ. + +بدلاً من ذلك، يجب تقييم درجة التعقيد اللغوي للنماذج ذات الطول الثابت باستخدام إستراتيجية النافذة المنزلقة. وينطوي هذا على تحريك نافذة السياق بشكل متكرر بحيث يكون للنموذج سياق أكبر عند إجراء كل تنبؤ. + +Sliding window PPL taking advantage of all available context + +هذا تقريب أقرب للتفكيك الحقيقي لاحتمالية التسلسل وسيؤدي عادةً إلى نتيجة أفضل.لكن الجانب السلبي هو أنه يتطلب تمريرًا للأمام لكل رمز في مجموعة البيانات. حل وسط عملي مناسب هو استخدام نافذة منزلقة بخطوة، بحيث يتم تحريك السياق بخطوات أكبر بدلاً من الانزلاق بمقدار 1 رمز في كل مرة. مما يسمح بإجراء الحساب بشكل أسرع مع إعطاء النموذج سياقًا كبيرًا للتنبؤات في كل خطوة. + +## مثال: حساب التعقيد اللغوي مع GPT-2 في 🤗 Transformers + +دعونا نوضح هذه العملية مع GPT-2. + +```python +from transformers import GPT2LMHeadModel, GPT2TokenizerFast + +device = "cuda" +model_id = "openai-community/gpt2-large" +model = GPT2LMHeadModel.from_pretrained(model_id).to(device) +tokenizer = GPT2TokenizerFast.from_pretrained(model_id) +``` + +سنقوم بتحميل مجموعة بيانات WikiText-2 وتقييم التعقيد اللغوي باستخدام بعض إستراتيجيات مختلفة النافذة المنزلقة. نظرًا لأن هذه المجموعة البيانات الصغيرة ونقوم فقط بمسح واحد فقط للمجموعة، فيمكننا ببساطة تحميل مجموعة البيانات وترميزها بالكامل في الذاكرة. + +```python +from datasets import load_dataset + +test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") +encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt") +``` + +مع 🤗 Transformers، يمكننا ببساطة تمرير `input_ids` كـ `labels` إلى نموذجنا، وسيتم إرجاع متوسط احتمالية السجل السالب لكل رمز كخسارة. ومع ذلك، مع نهج النافذة المنزلقة، هناك تداخل في الرموز التي نمررها إلى النموذج في كل تكرار. لا نريد تضمين احتمالية السجل للرموز التي نتعامل معها كسياق فقط في خسارتنا، لذا يمكننا تعيين هذه الأهداف إلى `-100` بحيث يتم تجاهلها. فيما يلي هو مثال على كيفية القيام بذلك بخطوة تبلغ `512`. وهذا يعني أن النموذج سيكون لديه 512 رمزًا على الأقل للسياق عند حساب الاحتمالية الشرطية لأي رمز واحد (بشرط توفر 512 رمزًا سابقًا متاحًا للاشتقاق). 
+ +```python +import torch +from tqdm import tqdm + +max_length = model.config.n_positions +stride = 512 +seq_len = encodings.input_ids.size(1) + +nlls = [] +prev_end_loc = 0 +for begin_loc in tqdm(range(0, seq_len, stride)): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # قد تكون مختلفة عن الخطوة في الحلقة الأخيرة + input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device) + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids, labels=target_ids) + + # يتم حساب الخسارة باستخدام CrossEntropyLoss الذي يقوم بالمتوسط على التصنيفات الصحيحة + # لاحظ أن النموذج يحسب الخسارة على trg_len - 1 من التصنيفات فقط، لأنه يتحول داخليًا إلى اليسار بواسطة 1. + neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + +ppl = torch.exp(torch.stack(nlls).mean()) +``` + +يعد تشغيل هذا مع طول الخطوة مساويًا لطول الإدخال الأقصى يعادل لاستراتيجية النافذة غير المنزلقة وغير المثلى التي ناقشناها أعلاه. وكلما صغرت الخطوة، زاد السياق الذي سيحصل عليه النموذج في عمل كل تنبؤ، وكلما كانت التعقيد اللغوي المُبلغ عنها أفضل عادةً. + +عندما نقوم بتشغيل ما سبق باستخدام `stride = 1024`، أي بدون تداخل، تكون درجة التعقيد اللغوي الناتجة هي `19.44`، وهو ما يماثل `19.93` المبلغ عنها في ورقة GPT-2. من خلال استخدام `stride = 512` وبالتالي استخدام إستراتيجية النافذة المنزلقة، ينخفض هذا إلى `16.45`. هذه النتيجة ليست فقط أفضل، ولكنها محسوبة بطريقة أقرب إلى التحليل التلقائي الحقيقي لاحتمالية التسلسل. \ No newline at end of file diff --git a/docs/source/ar/philosophy.md b/docs/source/ar/philosophy.md new file mode 100644 index 00000000000000..09de2991fba28d --- /dev/null +++ b/docs/source/ar/philosophy.md @@ -0,0 +1,49 @@ +# الفلسفة + +تُعد 🤗 Transformers مكتبة برمجية ذات رؤية واضحة صُممت من أجل: + +- الباحثون والمُتعلّمون في مجال التعلم الآلي ممن يسعون لاستخدام أو دراسة أو تطوير نماذج Transformers واسعة النطاق. +- مُطبّقي تعلم الآلة الذين يرغبون في ضبط تلك النماذج أو تشغيلها في بيئة إنتاجية، أو كليهما. +- المهندسون الذين يريدون فقط تنزيل نموذج مُدرب مسبقًا واستخدامه لحل مهمة تعلم آلي معينة. + +تم تصميم المكتبة مع الأخذ في الاعتبار هدفين رئيسيين: + +1. سهولة وسرعة الاستخدام: + + - تمّ تقليل عدد المفاهيم المُجردة التي يتعامل معها المستخدم إلى أدنى حد والتي يجب تعلمها، وفي الواقع، لا توجد مفاهيم مُجردة تقريبًا، فقط ثلاث فئات أساسية مطلوبة لاستخدام كل نموذج: [الإعدادات](main_classes/configuration)، [نماذج](main_classes/model)، وفئة ما قبل المعالجة ([مُجزّئ لغوي](main_classes/tokenizer) لـ NLP، [معالج الصور](main_classes/image_processor) للرؤية، [مستخرج الميزات](main_classes/feature_extractor) للصوت، و [معالج](main_classes/processors) للمدخﻻت متعددة الوسائط). + - يمكن تهيئة جميع هذه الفئات بطريقة بسيطة وموحدة من خلال نماذج مُدربة مسبقًا باستخدام الدالة الموحدة `from_pretrained()` والتي تقوم بتنزيل (إذا لزم الأمر)، وتخزين وتحميل كل من: فئة النموذج المُراد استخدامه والبيانات المرتبطة ( مُعاملات الإعدادات، ومعجم للمُجزّئ اللغوي،وأوزان النماذج) من نقطة تدقيق مُحددة مُخزّنة على [Hugging Face Hub](https://huggingface.co/models) أو ن من نقطة تخزين خاصة بالمستخدم. + - بالإضافة إلى هذه الفئات الأساسية الثلاث، توفر المكتبة واجهتي برمجة تطبيقات: [`pipeline`] للاستخدام السريع لأحد النماذج لأداء استنتاجات على مهمة مُحددة، و [`Trainer`] للتدريب السريع أو الضبط الدقيق لنماذج PyTorch (جميع نماذج TensorFlow متوافقة مع `Keras.fit`). + - نتيجة لذلك، هذه المكتبة ليست صندوق أدوات متعدد الاستخدامات من الكتل الإنشائية للشبكات العصبية. 
إذا كنت تريد توسيع أو البناء على المكتبة، فما عليك سوى استخدام Python و PyTorch و TensorFlow و Keras العادية والوراثة من الفئات الأساسية للمكتبة لإعادة استخدام الوظائف مثل تحميل النموذج وحفظه. إذا كنت ترغب في معرفة المزيد عن فلسفة الترميز لدينا للنماذج، فراجع منشور المدونة الخاص بنا [Repeat Yourself](https://huggingface.co/blog/transformers-design-philosophy). + +2. تقديم نماذج رائدة في مجالها مع أداء قريب قدر الإمكان من النماذج الأصلية: + + - نقدم مثالًا واحدًا على الأقل لكل بنية تقوم بإعادة إنتاج نتيجة مقدمة من المؤلفين الرسميين لتلك البنية. + - عادةً ما تكون الشفرة قريبة قدر الإمكان من قاعدة الشفرة الأصلية، مما يعني أن بعض شفرة PyTorch قد لا تكون "بأسلوب PyTorch" كما يمكن أن تكون نتيجة لكونها شفرة TensorFlow محولة والعكس صحيح. + +بعض الأهداف الأخرى: + +- كشف تفاصيل النماذج الداخلية بشكل متسق قدر الإمكان: + + -نتيح الوصول، باستخدام واجهة برمجة واحدة، إلى جميع الحالات المخفية (Hidden-States) وأوزان الانتباه (Attention Weights). + - تم توحيد واجهات برمجة التطبيقات الخاصة بفئات المعالجة المسبقة والنماذج الأساسية لتسهيل التبديل بين النماذج. + +- دمج مجموعة مختارة من الأدوات الواعدة لضبط النماذج بدقة (Fine-tuning) ودراستها: + + - طريقة بسيطة ومتسقة لإضافة رموز جديدة إلى مفردات التضمينات (Embeddings) لضبط النماذج بدقة. + - طرق سهلة لإخفاء (Masking) وتقليم (Pruning) رؤوس المحولات (Transformer Heads). + +- التبديل بسهولة بين PyTorch و TensorFlow 2.0 و Flax، مما يسمح بالتدريب باستخدام إطار واحد والاستدلال باستخدام إطار آخر. + +## المفاهيم الرئيسية + +تعتمد المكتبة على ثلاثة أنواع من الفئات لكل نموذج: + +- **فئات النماذج** يمكن أن تكون نماذج PyTorch ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module))، أو نماذج Keras ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model))، أو نماذج JAX/Flax ([flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html)) التي تعمل مع الأوزان المُدربة مسبقًا المقدمة في المكتبة. +- **فئات الإعداد** تخزن معلمات التهيئة المطلوبة لبناء نموذج (مثل عدد الطبقات وحجم الطبقة المخفية). أنت لست مضطرًا دائمًا إلى إنشاء مثيل لهذه الفئات بنفسك. على وجه الخصوص، إذا كنت تستخدم نموذجًا مُدربًا مسبقًا دون أي تعديل، فإن إنشاء النموذج سيهتم تلقائيًا تهيئة الإعدادات (والذي يعد جزءًا من النموذج). +- **فئات ما قبل المعالجة** تحويل البيانات الخام إلى تنسيق مقبول من قبل النموذج. يقوم [المعالج](main_classes/tokenizer) بتخزين المعجم لكل نموذج ويقدم طرقًا لتشفير وفك تشفير السلاسل في قائمة من مؤشرات تضمين الرموز ليتم إطعامها للنموذج. تقوم [معالجات الصور](main_classes/image_processor) بمعالجة إدخالات الرؤية، وتقوم [مستخلصات الميزات](main_classes/feature_extractor) بمعالجة إدخالات الصوت، ويقوم [المعالج](main_classes/processors) بمعالجة الإدخالات متعددة الوسائط. + +يمكن تهيئة جميع هذه الفئات من نسخ مُدربة مسبقًا، وحفظها محليًا، ومشاركتها على منصة Hub عبر ثلاث طرق: + +- تسمح لك الدالة `from_pretrained()` بتهيئة النموذج وتكويناته وفئة المعالجة المسبقة من إصدار مُدرب مسبقًا إما يتم توفيره بواسطة المكتبة نفسها (يمكن العثور على النماذج المدعومة على [Model Hub](https://huggingface.co/models)) أو مخزنة محليًا (أو على خادم) بواسطة المستخدم. +- تسمح لك الدالة `save_pretrained()` بحفظ النموذج، وتكويناته وفئة المعالجة المسبقة محليًا، بحيث يمكن إعادة تحميله باستخدام الدالة `from_pretrained()`. +- تسمح لك `push_to_hub()` بمشاركة نموذج وتكويناتهوفئة المعالجة المسبقة على Hub، بحيث يمكن الوصول إليها بسهولة من قبل الجميع. 
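+
+وكمثال توضيحي موجز على هذه الطرق الثلاث (أسماء نقطة التحقق والمسارات والمعرّفات أدناه افتراضية للتوضيح فقط):
+
+```python
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+# from_pretrained(): تحميل نموذج ومُجزّئ لغوي من Hub
+model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
+tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+
+# save_pretrained(): الحفظ محليًا لإعادة التحميل لاحقًا بالطريقة نفسها
+model.save_pretrained("./my-model")
+tokenizer.save_pretrained("./my-model")
+
+# push_to_hub(): المشاركة على Hub (يتطلب تسجيل الدخول بحساب Hugging Face)
+# model.push_to_hub("username/my-model")
+```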
diff --git a/docs/source/ar/pipeline_webserver.md b/docs/source/ar/pipeline_webserver.md new file mode 100644 index 00000000000000..2a19e84d163241 --- /dev/null +++ b/docs/source/ar/pipeline_webserver.md @@ -0,0 +1,126 @@ +# استخدام قنوات المعالجة لخادم ويب + + + +يُعدّ إنشاء محرك استدلال أمرًا معقدًا، ويعتمد الحل "الأفضل" على مساحة مشكلتك. هل تستخدم وحدة المعالجة المركزية أم وحدة معالجة الرسومات؟ هل تريد أقل زمن وصول، أم أعلى معدل نقل، أم دعمًا للعديد من النماذج، أم مجرد تحقيق أقصى تحسين نموذج محدد؟ +توجد طرق عديدة لمعالجة هذا الموضوع، لذلك ما سنقدمه هو إعداد افتراضي جيد للبدء به قد لا يكون بالضرورة هو الحل الأمثل لك.``` + + + +الشيء الرئيسي الذي يجب فهمه هو أننا يمكن أن نستخدم مؤشرًا، تمامًا كما تفعل [على مجموعة بيانات](pipeline_tutorial#using-pipelines-on-a-dataset)، نظرًا لأن خادم الويب هو أساسًا نظام ينتظر الطلبات ويعالجها عند استلامها. + +عادةً ما تكون خوادم الويب متعددة الإرسال (متعددة مؤشرات الترابط، وغير متزامنة، إلخ) للتعامل مع الطلبات المختلفة بشكل متزامن. من ناحية أخرى، فإن قنوات المعالجة (وبشكل رئيسي النماذج الأساسية) ليست رائعة للتوازي؛ حيث تستهلك الكثير من ذاكرة الوصول العشوائي، لذا من الأفضل منحها جميع الموارد المتاحة عند تشغيلها أو إذا كانت مهمة تطلب حسابات مكثفة. + +سنحل ذلك من خلال جعل خادم الويب يتعامل مع الحمل الخفيف لاستقبال الطلبات وإرسالها،وجعل مؤشر ترابط واحد يتعامل مع العمل الفعلي. سيستخدم هذا المثال `starlette`. ولكن قد تضطر إلى ضبط الكود أو تغييره إذا كنت تستخدم كودًا آخر لتحقيق التأثير نفسه. + +أنشئ `server.py`: + +```py +from starlette.applications import Starlette +from starlette.responses import JSONResponse +from starlette.routing import Route +from transformers import pipeline +import asyncio + + +async def homepage(request): + payload = await request.body() + string = payload.decode("utf-8") + response_q = asyncio.Queue() + await request.app.model_queue.put((string, response_q)) + output = await response_q.get() + return JSONResponse(output) + + +async def server_loop(q): + pipe = pipeline(model="google-bert/bert-base-uncased") + while True: + (string, response_q) = await q.get() + out = pipe(string) + await response_q.put(out) + + +app = Starlette( + routes=[ + Route("/", homepage, methods=["POST"]), + ], +) + + +@app.on_event("startup") +async def startup_event(): + q = asyncio.Queue() + app.model_queue = q + asyncio.create_task(server_loop(q)) +``` + +الآن يمكنك تشغيله باستخدام: + +```bash +uvicorn server:app +``` + +ويمكنك الاستعلام عنه: + +```bash +curl -X POST -d "test [MASK]" http://localhost:8000/ +#[{"score":0.7742936015129089,"token":1012,"token_str":".","sequence":"test."},...] +``` + +وهكذا، لديك الآن فكرة جيدة عن كيفية إنشاء خادم ويب! + +المهم حقًا هو أننا نقوم بتحميل النموذج **مرة واحدة** فقط، لذلك لا توجد نسخ من النموذج على خادم الويب. بهذه الطريقة، لا يتم استخدام ذاكرة الوصول العشوائي غير الضرورية. تسمح آلية وضع قائمة الانتظار بالقيام بأشياء متقدمة مثل تجميع بعض العناصر قبل الاستدلال لاستخدام معالجة الدفعات الديناميكية: + + + +تم كتابة نموذج الكود البرمجى أدناه بشكل مقصود مثل كود وهمي للقراءة. لا تقم بتشغيله دون التحقق مما إذا كان منطقيًا لموارد النظام الخاص بك! + + + +```py +(string, rq) = await q.get() +strings = [] +queues = [] +while True: + try: + (string, rq) = await asyncio.wait_for(q.get(), timeout=0.001) # 1ms + except asyncio.exceptions.TimeoutError: + break + strings.append(string) + queues.append(rq) +strings +outs = pipe(strings, batch_size=len(strings)) +for rq, out in zip(queues, outs): + await rq.put(out) +``` + +مرة أخرى، تم تحسين الرمز المقترح لسهولة القراءة، وليس ليكون أفضل كود. 
بادئ ذي بدء، لا يوجد حد لحجم الدفعة، والذي عادةً ما لا يكون فكرة عظيمة. بعد ذلك، يتم إعادة ضبط الفترة في كل عملية جلب لقائمة الانتظار، مما يعني أنه قد يتعين عليك الانتظار لفترة أطول بكثير من 1 مللي ثانية قبل تشغيل الاستدلال (تأخير الطلب الأول بهذا القدر). + +سيكون من الأفضل تحديد مهلة واحدة مدتها 1 مللي ثانية. + +سيظل هذا ينتظر دائمًا لمدة 1 مللي ثانية حتى إذا كانت قائمة الانتظار فارغًا، والذي قد لا يكون الأفضل نظرًا لأنك تريد على الأرجح البدء في إجراء الاستدلال إذا لم يكن هناك شيء في قائمة الانتظا. ولكن ربما يكون منطقيًا إذا كانت المعالجة الديناميكية للدفعات مهمة حقًا لحالة الاستخدام لديك. مرة أخرى، لا يوجد حل واحد هو الأفضل. + +## بعض الأشياء التي قد ترغب في مراعاتها + +### التحقق من الأخطاء + +هناك الكثير مما قد يحدث بشكل خاطئ في عند اتاحة النموذج للجمهور: نفاد الذاكرة، أو نفاد المساحة، أو فشل تحميل النموذج، أو قد يكون الاستعلام خاطئًا، أو قد يكون الاستعلام صحيحًا ولكن لا يزال يفشل في التشغيل بسبب خطأ في إعداد النموذج، وما إلى ذلك. + +بشكل عام، من الجيد أن يُخرِج الخادم الأخطاء للمستخدم، لذلك يُعدّ إضافة الكثير من عبارات `try..except` لعرض هذه الأخطاء فكرة +جيدة. لكن ضع في اعتبارك أنه قد يمثل أيضًا مخاطرة أمنية الكشف عن جميع تلك الأخطاء اعتمادًا على سياق الأمان لديك. + +### قطع الدائرة (Circuit breaking) + +عادةً ما تبدو خوادم الويب أفضل عندما تقوم بقطع الدائرة. وهذا يعني أنها ترجع أخطاء صحيحة عندما تكون مثقلة بشكل زائد بدلاً من الانتظار إلى أجل غير مسمى. قم بإرجاع خطأ 503 بدلاً من الانتظار لفترة طويلة جدًا أو 504 بعد فترة طويلة. + +من السهل نسبيًا تنفيذ ذلك في الكود المقترح نظرًا لوجود قائمة انتظار واحد. إن النظر في حجم قائمة الانتظار هو طريقة أساسية لبدء إرجاع الأخطاء قبل فشل خادم الويب بسبب الحمل الزائد. + +### حجب عمل خيط التنفيذ الرئيسي (Main thread) + +حاليًا، لا تدعم PyTorch العمليات غير المتزامنة، وسيؤدي الحساب إلى حجب عمل الخيط الرئيسي أثناء تشغيله. وهذا يعني أنه سيكون من الأفضل إذا تم إجبار PyTorch على أن تعمل على الخيط/العملية الخاصة به. لم يتم ذلك هنا لأن الكود أكثر تعقيدًا (في الغالب لأن خيوط التنفيذ والعمليات غير المتزامنة وقوائم الانتظار لا تتوافق معًا). ولكن في النهاية، فإنه سيؤدي نفس الوظيفة. + +سيكون هذا مهمًا إذا كان الاستدلال للعناصر الفردية طويلاً (> 1 ثانية) لأنه في هذه الحالة، فهذا يعني أنه سيتعين أثناء الاستدلال على كل استعلام الانتظار لمدة ثانية واحدة قبل حتى يلقي خطأ. + +### المعالجة الديناميكية + +بشكل عام، لا تُعدّ المعالجة بالضرورة تحسينًا مقارنةً بتمرير عنصر واحد في كل مرة (راجع [تفاصيل المعالجة بالدفعات](./main_classes/pipelines#pipeline-batching) لمزيد من المعلومات). ولكن يمكن أن تكون فعالة للغاية عند استخدامها بالإعداد الصحيح. في واجهة برمجة التطبيقات، لا توجد معالجة ديناميكية بشكل افتراضي (فرصة كبيرة جدًا للتباطؤ). ولكن بالنسبة لاستدلال BLOOM - وهو نموذج كبير جدًا - تُعدّ المعالجة الديناميكية **ضرورية** لتوفير تجربة جيدة للجميع. \ No newline at end of file diff --git a/docs/source/ar/task_summary.md b/docs/source/ar/task_summary.md new file mode 100644 index 00000000000000..d6a9d1477e10fc --- /dev/null +++ b/docs/source/ar/task_summary.md @@ -0,0 +1,323 @@ +# ما الذي تستطيع مكتبة 🤗 Transformers القيام به؟ + +مكتبة 🤗 Transformers هي مجموعة من النماذج المُدرّبة مسبقًا الأفضل في فئتها لمهام معالجة اللغة الطبيعية (NLP)، ورؤية الحاسوب، ومعالجة الصوت والكلام. لا تحتوي المكتبة فقط على نماذج المحولات (Transformer) فحسب، بل تشمل أيضًا نماذج أخرى لا تعتمد على المحولات مثل الشبكات العصبية التلافيفية الحديثة لمهام رؤية الحاسوب. إذا نظرت إلى بعض المنتجات الاستهلاكية الأكثر شيوعًا اليوم، مثل الهواتف الذكية والتطبيقات وأجهزة التلفاز، فمن المحتمل أن تقف وراءها تقنية ما من تقنيات التعلم العميق. 
هل تريد إزالة جسم من خلفية صورة التقطتها بهاتفك الذكي؟ هذا مثال على مهمة التجزئة البانورامية (Panoptic Segmentation) ( لا تقلق إذا لم تفهم معناها بعد، فسوف نشرحها في الأقسام التالية!). + +توفر هذه الصفحة نظرة عامة على مختلف مهام الكلام والصوت ورؤية الحاسوب ومعالجة اللغات الطبيعية المختلفة التي يمكن حلها باستخدام مكتبة 🤗 Transformers في ثلاثة أسطر فقط من التعليمات البرمجية! + +## الصوت + +تختلف مهام معالجة الصوت والكلام قليلاً عن باقي الوسائط، ويرجع ذلك ببشكل أساسي لأن الصوت كمدخل هو إشارة متصلة. على عكس النص، لا يمكن تقسيم الموجة الصوتية الخام بشكل مرتب في أجزاء منفصلة بالطريقة التي يمكن بها تقسيم الجملة إلى كلمات. وللتغلب على هذا، يتم عادةً أخذ عينات من الإشارة الصوتية الخام على فترات زمنية منتظمة. كلما زاد عدد العينات التي تؤخذ في فترة زمنية معينة، ارتفع معدل أخذ العينات (معدل التردد)، وصار الصوت أقرب إلى مصدر الصوت الأصلي. + +قامت الطرق السابقة بمعالجة الصوت لاستخراج الميزات المفيدة منه. أصبح من الشائع الآن البدء بمهام معالجة الصوت والكلام عن طريق تغذية شكل الموجة الصوتية الخام مباشرة في مشفر الميزات (Feature Encoder) لاستخراج تمثيل صوتي له. وهذا يبسط خطوة المعالجة المسبقة ويسمح للنموذج بتعلم أهم الميزات. + +### تصنيف الصوت + +تصنيف الصوت (Audio Classification) هو مهمة يتم فيها تصنيف بيانات الصوت الصوت من مجموعة محددة مسبقًا من الفئات. إنه فئة واسعة تضم العديد من التطبيقات المحددة، والتي تشمل: + +* تصنيف المشهد الصوتي: وضع علامة على الصوت باستخدام تسمية المشهد ("المكتب"، "الشاطئ"، "الملعب") +* اكتشاف الأحداث الصوتية: وضع علامة على الصوت باستخدام تسمية حدث صوتي ("بوق السيارة"، "صوت الحوت"، "كسر زجاج") +* الوسم: وصنيف صوت يحتوي على أصوات متعددة (أصوات الطيور، وتحديد هوية المتحدث في اجتماع) +* تصنيف الموسيقى: وضع علامة على الموسيقى بتسمية النوع ("ميتال"، "هيب هوب"، "كانتري") + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="audio-classification", model="superb/hubert-base-superb-er") +>>> preds = classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.4532, 'label': 'hap'}, + {'score': 0.3622, 'label': 'sad'}, + {'score': 0.0943, 'label': 'neu'}, + {'score': 0.0903, 'label': 'ang'}] +``` + +### التعرف التلقائي على الكلام + +يقوم التعرف التلقائي على الكلام (ASR) هو عملية تحويل الكلام إلى نص. إنه أحد أكثر المهام الصوتية شيوعًا ويرجع ذلك جزئيًا إلى أن الكلام وسيلة طبيعية للتواصل البشري. واليوم، يتم تضمين أنظمة ASR في منتجات التقنية "الذكية" مثل مكبرات الصوت والهواتف والسيارات. يمكننا أن نطلب من مساعدينا الافتراضيين تشغيل الموسيقى، وضبط التذكيرات، وإخبارنا بأحوال الطقس. +ولكن أحد التحديات الرئيسية التي ساعدت نماذج المحولات (Transformer) في التغلب عليها هو التعامل مع اللغات منخفضة الموارد. فمن خلال التدريب المسبق على كميات كبيرة من بيانات الصوتية، يُمكن ضبط النموذج بدقة (Fine-tuning) باستخدام ساعة واحدة فقط من بيانات الكلام المُوسم في لغة منخفضة الموارد إلى نتائج عالية الجودة مقارنة بأنظمة ASR السابقة التي تم تدريبها على بيانات موسومة أكثر بـ 100 مرة. + +```py +>>> from transformers import pipeline + +>>> transcriber = pipeline(task="automatic-speech-recognition", model="openai/whisper-small") +>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} +``` + +## رؤية الحاسب + +كانت إحدى أوائل مهام رؤية الحاسب وأنجحها هى التعرف على صور أرقام الرموز البريدية باستخدام [شبكة عصبية تلافيفية (CNN)](glossary#convolution). تتكون الصورة من وحدات بيكسل، ولكل بكسل قيمة رقمية. 
وهذا يجعل من السهل تمثيل صورة كمصفوفة من قيم البكسل. يصف كل مزيج معين من قيم البكسل ألوان الصورة. + +هناك طريقتان عامتان يمكن من خلالهما حل مهام رؤية الحاسب: + +1. استخدام الالتفافات (Convolutions) لتعلم الميزات الهرمية للصورة بدءًا من الميزات منخفضة المستوى وصولًا إلى الأشياء المجردة عالية المستوى. +2. تقسيم الصورة إلى أجزاء واستخدام نموذج المحولات (Transformer) ليتعلم تدريجياً كيف ترتبط كل جزء صورة ببعضها البعض لتشكيل صورة. على عكس النهج ا التصاعدي (Bottom-Up) الذي تفضله الشبكات العصبية التلافيفية CNN، هذا يشبه إلى حد ما البدء بصورة ضبابية ثم جعلها أوضح تدريجيًا. + +### تصنيف الصور + +يقوم تصنيف الصور (Image Classification) بوضع علامة على صورة كاملة من مجموعة محددة مسبقًا من الفئات. مثل معظم مهام التصنيف، هناك العديد من التطبيقات العملية لتصنيف الصور، والتي تشمل: + +* الرعاية الصحية: تصنيف الصور الطبية للكشف عن الأمراض أو مراقبة صحة المريض +* البيئة: تصنيف صور الأقمار الصناعية لرصد إزالة الغابات، أو إبلاغ إدارة الأراضي البرية أو اكتشاف حرائق الغابات +* الزراعة: تصنيفر المحاصيل لمراقبة صحة النبات أو صور الأقمار الصناعية لمراقبة استخدام الأراضي +* علم البيئة: تصنيف صور الأنواع الحيوانية أو النباتية لرصد أعداد الكائنات الحية أو تتبع الأنواع المهددة بالانقراض + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="image-classification") +>>> preds = classifier( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> print(*preds, sep="\n") +{'score': 0.4335, 'label': 'lynx, catamount'} +{'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'} +{'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'} +{'score': 0.0239, 'label': 'Egyptian cat'} +{'score': 0.0229, 'label': 'tiger cat'} +``` + +### كشف الأجسام + +على عكس تصنيف الصور، يقوم كشف الأجسام (Object Detection) بتحديد عدة أجسام داخل صورة ومواضع هذه الأجسام في صورة (يحددها مربع الإحاطة). بعض تطبيقات كشف الأجسام تشمل: + +* المركبات ذاتية القيادة: اكتشاف أجسام المرورية اليومية مثل المركبات الأخرى والمشاة وإشارات المرور +* الاستشعار عن بُعد: مراقبة الكوارث، والتخطيط الحضري، والتنبؤ بالطقس +* اكتشاف العيوب: اكتشاف الشقوق أو الأضرار الهيكلية في المباني، وعيوب التصنيع + +```py +>>> from transformers import pipeline + +>>> detector = pipeline(task="object-detection") +>>> preds = detector( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"], "box": pred["box"]} for pred in preds] +>>> preds +[{'score': 0.9865, + 'label': 'cat', + 'box': {'xmin': 178, 'ymin': 154, 'xmax': 882, 'ymax': 598}}] +``` + +### تجزئة الصور + +تجزئة الصورة (Image Segmentation) هي مهمة على مستوى البكسل تقوم بتخصيص كل بكسل في صورة لفئة معينة. إنه يختلف عن كشف الأجسام، والذي يستخدم مربعات الإحاطة (Bounding Boxes) لتصنيف والتنبؤ بالأجسام في الصورة لأن التجزئة أكثر دقة. يمكن لتجزئة الصور اكتشاف الأجسام على مستوى البكسل. 
هناك عدة أنواع من تجزئة الصور: + +* تجزئة مثيلات (Instance Segmentation): بالإضافة إلى تصنيف فئة كائن، فإنها تُصنّف أيضًا كل مثيل (Instance) مميز لكائن ("الكلب-1"، "الكلب-2") +* التجزئة البانورامية (Panoptic Segmentation): مزيج من التجزئة الدلالية (Semantic Segmentation) وتجزئة المثيلات؛ فهو تُصنّف كل بكسل مع فئة دلالية **و** كل مثيل مميز لكائن + +تُعد مهام تجزئة الصور مفيدة في المركبات ذاتية القيادة على إنشاء خريطة على مستوى البكسل للعالم من حولها حتى تتمكن من التنقل بأمان حول المشاة والمركبات الأخرى. كما أنها مفيدة للتصوير الطبي، حيث يمكن للدقة العالية لهذ المهمة أن تساعد في تحديد الخلايا غير الطبيعية أو خصائص الأعضاء. يمكن أيضًا استخدام تجزئة الصور في التجارة الإلكترونية لتجربة الملابس افتراضيًا أو إنشاء تجارب الواقع المُعزز من خلال تراكب الأجسام في العالم الحقيقي من خلال الكاميرا الهاتف الخاصة بك. + +```py +>>> from transformers import pipeline + +>>> segmenter = pipeline(task="image-segmentation") +>>> preds = segmenter( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> print(*preds, sep="\n") +{'score': 0.9879, 'label': 'LABEL_184'} +{'score': 0.9973, 'label': 'snow'} +{'score': 0.9972, 'label': 'cat'} +``` + +### تقدير العمق + +يقوم تقدير العمق (Depth Estimation) بالتنبؤ بمسافة كل بكسل في صورة من الكاميرا. تُعد هذه المهمة لرؤية الحاسب هذه مهمة بشكل خاص لفهم وإعادة بناء المشهد. فعلى سبيل المثال، في السيارات ذاتية القيادة، تحتاج المركبات إلى فهم مدى بُعد الأجسام مثل المشاة ولافتات المرور والمركبات الأخرى لتجنب العقبات والاصطدامات. تساعد معلومات العمق أيضًا في بناء التمثيلات ثلاثية الأبعاد من الصور ثنائية الأبعاد ويمكن استخدامها لإنشاء تمثيلات ثلاثية الأبعاد عالية الجودة للهياكل البيولوجية أو المباني. + +هناك نهجان لتقدير العمق: + +* التصوير المجسم (Stereo): يتم تقدير العمق عن طريق مقارنة صورتين لنفس الصورة من زوايا مختلفة قليلاً. +* التصوير الأحادي (Monocular): يتم تقدير العمق من صورة واحدة. + +```py +>>> from transformers import pipeline + +>>> depth_estimator = pipeline(task="depth-estimation") +>>> preds = depth_estimator( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +``` + +## معالجة اللغات الطبيعية + +تُعد مهام معالجة اللغة الطبيعية (NLP) من بين أكثر أنواع المهام شيوعًا نظرًا لأن النص هو وسيلة طبيعية لنا للتواصل. ولكي يتمكن النموذج من فهم النص، يجب أولًا تحويله إلى صيغة رقمية. وهذا يعني تقسيم سلسلة النص إلى كلمات أو مقاطع كلمات منفصلة (رموز - Tokens)، ثم تحويل هذه الرموز إلى أرقام. ونتيجة لذلك، يمكنك تمثيل سلسلة من النص كتسلسل من الأرقام، وبمجرد حصولك على تسلسل من الأرقام، يمكن إدخاله إلى نموذج لحل جميع أنواع مهام معالجة اللغة الطبيعية! + +### تصنيف النصوص + +تمامًا مثل مهام التصنيف في أي مجال آخر، يقوم تصنيف النصوص (Text Classification) بتصنيف سلسلة نصية يمكن أن تكون جملة أو فقرة أو مستند) إلى فئة محددة مسبقًا. هناك العديد من التطبيقات العملية لتصنيف النصوص، والتي تشمل: + +* تحليل المشاعر (Sentiment Analysis): تصنيف النص وفقًا لمعيار معين مثل `الإيجابية` أو `السلبية` والتي يمكن أن تُعلم وتدعم عملية صنع القرار في مجالات مثل السياسة والتمويل والتسويق +* تصنيف المحتوى (Content Classification): تصنيف النص وفقًا لبعض الموضوعات للمساعدة في تنظيم وتصفية المعلومات في الأخبار وموجزات الوسائط الاجتماعية (`الطقس`، `الرياضة`، `التمويل`، إلخ). 
+ +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="sentiment-analysis") +>>> preds = classifier("Hugging Face is the best thing since sliced bread!") +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.9991, 'label': 'POSITIVE'}] +``` + +### تصنيف الرموز + +في أي مهمة من مهام معالجة اللغة الطبيعية NLP، تتم معالجة النص مسبقًا عن طريق تقسيمه إلى كلمات أو مقاطع كلمات فردية تُعرف باسم [الرموز](glossary#token). يقوم تصنيف الرموز (Token Classification) بتخصيص تصنيف لكل رمز من مجموعة محددة مسبقًا من التصنيفات. + +هناك نوعان شائعان من تصنيف الرموز: + +* التعرف على الكيانات المسماة (NER): تصنيف الرموز وفقًا لفئة الكيان مثل المنظمة أو الشخص أو الموقع أو التاريخ. يعد NER شائعًا بشكل خاص في الإعدادات الطبية الحيوية، حيث يُمكنه تصنيف الجينات والبروتينات وأسماء الأدوية. +* ترميز الأجزاء اللغوية (POS): تصنيف الرموز وفقًا للدورها النحوي مثل الاسم أو الفعل أو الصفة. POS مفيد لمساعدة أنظمة الترجمة على فهم كيفية اختلاف كلمتين متطابقتين نحويًا (مثل كلمة "عَلَمَ" كاسم و "عَلِمَ" كفعل). + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="ner") +>>> preds = classifier("Hugging Face is a French company based in New York City.") +>>> preds = [ +... { +... "entity": pred["entity"], +... "score": round(pred["score"], 4), +... "index": pred["index"], +... "word": pred["word"], +... "start": pred["start"], +... "end": pred["end"], +... } +... for pred in preds +... ] +>>> print(*preds, sep="\n") +{'entity': 'I-ORG', 'score': 0.9968, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2} +{'entity': 'I-ORG', 'score': 0.9293, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7} +{'entity': 'I-ORG', 'score': 0.9763, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12} +{'entity': 'I-MISC', 'score': 0.9983, 'index': 6, 'word': 'French', 'start': 18, 'end': 24} +{'entity': 'I-LOC', 'score': 0.999, 'index': 10, 'word': 'New', 'start': 42, 'end': 45} +{'entity': 'I-LOC', 'score': 0.9987, 'index': 11, 'word': 'York', 'start': 46, 'end': 50} +{'entity': 'I-LOC', 'score': 0.9992, 'index': 12, 'word': 'City', 'start': 51, 'end': 55} +``` +### الإجابة على الأسئلة + +تُعدّ مهمة الإجابة عن الأسئلة (Question Answering) مهمة أخرى على مستوى الرموز (Token-Level) تُرجع إجابة لسؤال ما، وقد تعتمد هذه الإجابة على سياق (في النطاق المفتوح - Open-Domain) أو لا تعتمد على سياق (في النطاق المغلق - Closed-Domain). تحدث هذه المهمة عندما نسأل مساعدًا افتراضيًا عن شيء ما، مثل معرفة ما إذا كان مطعمٌ ما مفتوحًا. يمكن أن تُقدّم هذه المهمة أيضًا دعمًا للعملاء أو دعمًا تقنيًا، كما تُساعد محركات البحث في استرجاع المعلومات ذات الصلة التي نبحث عنها. + +هناك نوعان شائعان من الإجابة على الأسئلة: + +* الاستخراجية (Extractive): بالنظر إلى سؤال وسياق مُعيّن، فإن الإجابة هي مقطع نصيّ مُستخرج من السياق الذي يُحلّله النموذج. +* التجريدية (Abstractive): بالنظر إلى سؤال وسياق مُعيّن، يتم إنشاء الإجابة من السياق؛ يتعامل نهج [`Text2TextGenerationPipeline`] مع هذا النهج بدلاً من [`QuestionAnsweringPipeline`] الموضح أدناه + + +```py +>>> from transformers import pipeline + +>>> question_answerer = pipeline(task="question-answering") +>>> preds = question_answerer( +... question="What is the name of the repository?", +... context="The name of the repository is huggingface/transformers", +... ) +>>> print( +... f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}" +... 
) +score: 0.9327, start: 30, end: 54, answer: huggingface/transformers +``` + +### التلخيص + +ينشئ التلخيص (Summarization) نسخة مختصرة من نص طويل مع محاولة الحفاظ على معظم معنى النص الأصلي. التلخيص هو مهمة تسلسل إلى تسلسل(Sequence-to-Sequence)؛؛ فهو تُنتج تسلسلًا نصيًا أقصر من النص المُدخل. هناك الكثير من المستندات الطويلة التي يمكن تلخيصها لمساعدة القراء على فهم النقاط الرئيسية بسرعة. مشاريع القوانين والوثائق القانونية والمالية وبراءات الاختراع والأوراق العلمية هي مجرد أمثلة قليلة للوثائق التي يمكن تلخيصها لتوفير وقت القراء وخدمة كمساعد للقراءة. + +مثل الإجابة على الأسئلة، هناك نوعان من التلخيص: + +* الاستخراجية (Extractive): تحديد واستخراج أهم الجمل من النص الأصلي +* التجريدي (Abstractive): إنشاء ملخص مستهدف (الذي قد يتضمن كلمات جديدة غير موجودة في النص الأصلي) انطلاقًا من النص الأصلي؛ يستخدم نهج التلخيص التجريدي [`SummarizationPipeline`] + +```py +>>> from transformers import pipeline + +>>> summarizer = pipeline(task="summarization") +>>> summarizer( +... "In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles." +... ) +[{'summary_text': ' The Transformer is the first sequence transduction model based entirely on attention . It replaces the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention . For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers .'}] +``` + +### الترجمة + +تحوّل الترجمة تسلسل نص بلغة إلى لغة أخرى. من المهم مساعدة الأشخاص من خلفيات مختلفة على التواصل مع بعضهم البعض، ومساعدة المحتوى على الوصول إلى جمهور أوسع، وحتى أن يكون أداة تعليمية لمساعدة الأشخاص على تعلم لغة جديدة. إلى جانب التلخيص، تعد الترجمة مهمة من نوع تسلسل إلى تسلسل، حيث يتلقى النموذج تسلسلًا مُدخلًا ويُعيد تسلسلًا مُخرَجًا مُستهدفًا. + +في الأيام الأولى، كانت نماذج الترجمة في الغالب أحادية اللغة، ولكن مؤخرًا، كان هناك اهتمام متزايد بالنماذج متعددة اللغات التي يمكنها الترجمة بين العديد من أزواج اللغات. + +```py +>>> from transformers import pipeline + +>>> text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning." +>>> translator = pipeline(task="translation", model="google-t5/t5-small") +>>> translator(text) +[{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}] +``` + +### نمذجة اللغة + +نمذجة اللغة (Language Modeling) هي مهمة التنبؤ بالكلمة التالية في تسلسل نصي. لقد أصبح مهمة NLP شائعة للغاية لأن النموذج اللغوي المسبق التدريب يمكن أن يتم ضبطه بشكل دقيق للعديد من مهام الأخرى. في الآونة الأخيرة، كان هناك الكثير من الاهتمام بنماذج اللغة الكبيرة (LLMs) التي توضح التعلم من الصفر أو من عدد قليل من الأمثلة (Zero-shot or Few-shot Learning). وهذا يعني أن النموذج يمكنه حل المهام التي لم يتم تدريبه عليها بشكل صريح! يمكن استخدام نماذج اللغة لإنشاء نص سلس ومقنع، على الرغم من أنه يجب أن تكون حذرًا لأن النص قد لا يكون دائمًا دقيقًا. + +هناك نوعان من نمذجة اللغة: + +* السببية(Causal): هدف النموذج هو التنبؤ بالرمز (Token) التالي في التسلسل، ويتم إخفاء الرموز المستقبلية (Masking). 
+ +```py +>>> from transformers import pipeline + +>>> prompt = "Hugging Face is a community-based open-source platform for machine learning." +>>> generator = pipeline(task="text-generation") +>>> generator(prompt) # doctest: +SKIP +``` + +* المقنّع (Masked): هدف النموذج هو التنبؤ برمز مُخفيّ ضمن التسلسل مع الوصول الكامل إلى الرموز الأخرى في التسلسل + +```py +>>> text = "Hugging Face is a community-based open-source for machine learning." +>>> fill_mask = pipeline(task="fill-mask") +>>> preds = fill_mask(text, top_k=1) +>>> preds = [ +... { +... "score": round(pred["score"], 4), +... "token": pred["token"], +... "token_str": pred["token_str"], +... "sequence": pred["sequence"], +... } +... for pred in preds +... ] +>>> preds +[{'score': 0.2236, + 'token': 1761, + 'token_str': ' platform', + 'sequence': 'Hugging Face is a community-based open-source platform for machine learning.'}] +``` + +## متعدد الوسائط: + +تتطلب المهام متعددة الوسائط (Multimodal) من النموذج معالجة وسائط بيانات متعددة (نص أو صورة أو صوت أو فيديو) لحل مشكلة معينة. يعد وصف الصورة (Image Captioning) مثالاً على مهمة متعددة الوسائط حيث يأخذ النموذج صورة كمدخل وينتج تسلسل نصيًا يصف الصورة أو بعض خصائصها. + +على الرغم من أن النماذج متعددة الوسائط تعمل مع أنواع أو وسائط بيانات مختلفة، إلا أن خطوات المعالجة المسبقة تساعد النموذج داخليًا على تحويل جميع أنواع البيانات إلى متجهات تضمين (Embeddings) (متجهات أو قوائم من الأرقام التي تحتوي على معلومات ذات معنى حول البيانات). بالنسبة لمهمة مثل وصف الصورة، يتعلم النموذج العلاقات بين متجهات تضمين الصور ومتجهات تضمين النص. + +### الإجابة على أسئلة المستندات: + +الإجابة على أسئلة المستندات (Document Question Answering) هي مهمة تقوم بالإجابة على أسئلة اللغة الطبيعية من مستند مُعطى. على عكس مهمة الإجابة على الأسئلة على مستوى الرموز (Token-Level) التي تأخذ نصًا كمدخل، فإن الإجابة على أسئلة المستندات تأخذ صورة لمستند كمدخل بالإضافة إلى سؤال هذا حول المستند وتعيد الإجابة. يمكن استخدام الإجابة على أسئلة المستندات لتفسير المستندات المُنسّقة واستخراج المعلومات الرئيسية منها. في المثال أدناه، يمكن استخراج المبلغ الإجمالي والمبلغ المُسترد من إيصال الدفع.. + +```py +>>> from transformers import pipeline +>>> from PIL import Image +>>> import requests + +>>> url = "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/2.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> doc_question_answerer = pipeline("document-question-answering", model="magorshunov/layoutlm-invoices") +>>> preds = doc_question_answerer( +... question="ما هو المبلغ الإجمالي؟", +... image=image, +... ) +>>> preds +[{'score': 0.8531, 'answer': '17,000', 'start': 4, 'end': 4}] +``` + +نأمل أن تكون هذه الصفحة قد زودتك ببعض المعلومات الأساسية حول جميع أنواع المهام في كل طريقة وأهمية كل منها العملية. في القسم التالي، ستتعلم كيف تعمل مكتبة 🤗 Transformers لحل هذه المهام. \ No newline at end of file diff --git a/docs/source/ar/tasks_explained.md b/docs/source/ar/tasks_explained.md new file mode 100644 index 00000000000000..b91297be7d2795 --- /dev/null +++ b/docs/source/ar/tasks_explained.md @@ -0,0 +1,279 @@ +# كيف تُنجز نماذج 🤗 Transformers المهام؟ + +في [ما الذي يمكن أن تفعله نماذج 🤗 Transformers](task_summary)، تعلمت عن معالجة اللغات الطبيعية (NLP)، والخطاب والصوت، ورؤية الحاسب، وبعض تطبيقاتها المهمة. ستلقي هذه الصفحة نظرة فاحصة على كيفية حل النماذج لهذه المهام وتوضيح ما يحدث ما يحدث وراء الكواليس. هناك العديد من الطرق لحل مهمة معينة، وقد تنفذ بعض النماذج تقنيات معينة أو حتى تتناول المهمة من زاوية جديدة، ولكن بالنسبة لنماذج Transformer، فإن الفكرة العامة هي نفسها. 
فنظرًا لبنيتها المرنة، تُعدّ معظم النماذج عبارة عن متغير من بنية المُشفّر (Encoder) أو المُفكّك (Decoder) أو المُشفّر - المُفكّك (Encoder-Decoder). بالإضافة إلى نماذج Transformer، تحتوي مكتبتنا أيضًا على العديد من الشبكات العصبية التلافيفية (CNNs)، والتي لا تزال تُستخدم حتى اليوم لمهام رؤية الحاسب. سنشرح أيضًا كيف تعمل شبكة عصبية تلافيفية (CNN) حديثة.

لشرح كيفية حل المهام، سنوضح ما يحدث داخل النموذج لإخراج تنبؤات مفيدة:

- [Wav2Vec2](model_doc/wav2vec2) لتصنيف الصوت والتعرف التلقائي على الكلام (ASR)
- [Vision Transformer (ViT)](model_doc/vit) و [ConvNeXT](model_doc/convnext) لتصنيف الصور
- [DETR](model_doc/detr) للكشف عن الأجسام
- [Mask2Former](model_doc/mask2former) لتجزئة الصورة
- [GLPN](model_doc/glpn) لتقدير العمق
- [BERT](model_doc/bert) لمهام NLP مثل تصنيف النصوص، وتصنيف الرموز، والإجابة على الأسئلة التي تستخدم المُشفّر
- [GPT2](model_doc/gpt2) لمهام NLP مثل توليد النصوص التي تستخدم المُفكّك
- [BART](model_doc/bart) لمهام NLP مثل التلخيص والترجمة التي تستخدم بنية المُشفّر - المُفكّك

قبل المتابعة، من الجيد أن يكون لديك بعض المعرفة الأساسية ببنية المحولات (Transformer Architecture) الأصلية. إن معرفة كيفية عمل المُشفّرات (Encoders) والمُفكّكات (Decoders) وآلية الانتباه (Attention Mechanism) سوف تساعدك في فهم كيفية عمل نماذج Transformer المختلفة. إذا كنت مبتدئًا أو بحاجة إلى مراجعة، فراجع [دورتنا](https://huggingface.co/course/chapter1/4?fw=pt) لمزيد من المعلومات!

## الكلام والصوت (Speech and audio)

يُعدّ [Wav2Vec2](model_doc/wav2vec2) نموذجًا مُدرَّبًا ذاتيًا (Self-Supervised) على بيانات الكلام غير المُصنّفة، ويُمكن ضبطه بدقة (Fine-tuning) على بيانات موسومة لأداء مهام تصنيف الصوت والتعرف التلقائي على الكلام.
+ +يتكون هذا النموذج على أربعة مكونات رئيسية: + +1. *مشفّر الميزات (Feature Encoder)* يأخذ الموجة الصوتية الخام، ويقوم بتطبيعها (Normalization) إلى متوسط صفري وانحراف معياري وحدوي، وتحويلها إلى تسلسل من متجهات الميزات التي يبلغ طول كل منها 20 مللي ثانية. + +2. *وحدة التكميم (Quantization Module):** تتميز أشكال الموجات الصوتية بطبيعتها المُستمرة،، لذلك لا يمكن تقسيمها إلى وحدات منفصلة كما يمكن تقسيم التسلسل النصّي إلى كلمات ولهذا السبب يتم تمرير متجهات الميزات إلى *وحدة التكميم*، والتي تهدف إلى تعلم وحدات الكلام المنفصلة. يتم اختيار وحدة الكلام من مجموعة من الرموز، والمعروفة باسم *كتاب الرموز* (يمكنك اعتبار هذا بمثابة المفردات). ومن كتاب الرموز،يتم اختيار المتجه أو وحدة الكلام التي تُمثّل مدخل الصوت المُستمر على أفضل وجه، ويتم تمريرها عبر النموذج. +3. **شبكة السياق (Context Network):** يتم إخفاء حوالي نصف متجهات الميزات بشكل عشوائي، ويتم تغذية متجه الميزة المُقنّع إلى *شبكة السياق*، والتي تعد مُشفّر محوّلات (Transformer Encoder) الذي يضيف أيضًا تضمينات موضعية نسبية (Relative Positional Embeddings).. + +4. **مهمة التناقضية:** يتمثل الهدف من التدريب المسبق لشبكة السياق هو *مهمة تناقضية*. يجب على النموذج التنبؤ بالتمثيل الصحيح للكلام المُكمّم للتنبؤ المقنع من مجموعة من التمثيلات الخاطئة، مما يشجع النموذج على ا إيجاد متجه السياق ووحدة الكلام المُكمّمة الأكثر تشابهًا (التصنيف المستهدف). + +بمجرد تدريب Wav2Vec2 مسبقًا، يمكنك ضبط دقته على بياناتك لتصنيف الصوت أو التعرف التلقائي على الكلام! + +### تصنيف الصوت (Audio classification) + +لاستخدام النموذج الذي تم تدريبه مسبقًا لتصنيف الصوت، أضف رأس تصنيف تسلسلي أعلى نموذج Wav2Vec2 الأساسي. رأس التصنيف هو طبقة خطية تستقبل الحالات المخفية للمشفر. تمثل الحالات المخفية الميزات التي تم تعلمها من كل إطار صوتي والذي يمكن أن يكون له أطوال مختلفة. لتحويلها إلى متجه واحد ثابت الطول، يتم تجميع الحالات المخفية أولاً ثم تحويلها إلى احتمالات عبر تصنيفات الفئات. يتم حساب التكلفة (الخسارة المتقاطعة) بين الاحتمالات والتصنيف المستهدف للعثور على الفئة الأكثر احتمالًا. + +هل أنت مستعد لتجربة تصنيف الصوت؟ تحقق من دليلنا الشامل [تصنيف الصوت](tasks/audio_classification) لمعرفة كيفية ضبط دقة نموذج Wav2Vec2 واستخدامه للاستدلال! + +### التعرف التلقائي على الكلام (Automatic speech recognition - ASR) + +لاستخدام النموذج الذي تم تدريبه مسبقًا للتعرف التلقائي على الكلام، أضف رأس نمذجة لغوية أعلى نموذج Wav2Vec2 الأساسي لـ [[التصنيف الزمني الترابطي (CTC)](glossary#connectionist-temporal-classification-ctc). رأس النمذجة اللغوية عبارة عن طبقة خطية تقبل الحالات المخفية للمُشفّر وتحويلها إلى احتمالات. يمثل كل احتمال فئة رمزية (يأتي عدد الرموز من مفردات المهمة). يتم حساب تكلفة CTC بين الاحتمالات والأهداف للعثور على تسلسل الرموز الأكثر احتمالًا، والتي يتم فك تشفيرها بعد ذلك إلى نص مكتوب. + +هل أنت مستعد لتجربة التعرف التلقائي على الكلام؟ تحقق من دليلنا الشامل [التعرف التلقائي على الكلام](tasks/asr) لمعرفة كيفية ضبط دقة نموذج Wav2Vec2 واستخدامه للاستدلال! + +## رؤية الحاسب (Computer vision) + +هناك طريقتان لتناول مهام رؤية الحاسب: + +1. قم بتقسيم الصورة إلى تسلسل من الرقع ومعالجتها بالتوازي باستخدام مُحوّل Transformer. +2. استخدم شبكة عصبية تلافيفية CNN) حديثة، مثل [ConvNeXT](model_doc/convnext)، والتي تعتمد على الطبقات التلافيفية ولكنها تعتمد تصميمات حديثة للشبكات. + + + +يقوم النهج الثالث بمزج المحولات مع التلافيف (على سبيل المثال، [Convolutional Vision Transformer](model_doc/cvt) أو [LeViT](model_doc/levit)). لن نناقشها لأنها تجمع ببساطة بين النهجين اللذين نستعرضهما هنا. 
يتم استخدام ViT و ConvNeXT بشكل شائع لتصنيف الصور، ولكن بالنسبة لمهام الرؤية الأخرى مثل اكتشاف الكائنات والتجزئة وتقدير العمق، سنلقي نظرة على DETR و Mask2Former و GLPN على التوالي؛ فهذه النماذج هي الأنسب لتلك المهام.

### تصنيف الصور (Image classification)

يمكن استخدام كل من ViT و ConvNeXT لتصنيف الصور؛ الاختلاف الرئيسي هو أن ViT يستخدم آلية انتباه بينما يستخدم ConvNeXT الالتفافات.

#### المحول Transformer

يستبدل [ViT](model_doc/vit) الالتفافات تمامًا ببنية Transformer نقية. إذا كنت على دراية بـ Transformer الأصلي، فأنت بالفعل في طريقك إلى فهم ViT.
+ +كان التغيير الرئيسي الذي قدمه ViT هو كيفية تغذية الصور إلى Transformer: + +1. يتم تقسيم الصورة إلى رقع مربعة غير متداخلة، يتم تحويل كل منها إلى متجه أو يُسمى *تمثيل الرقعة*. يتم إنشاء تضمينات الرقع من طبقة تلافيفية ثنائية الأبعاد 2D والتي تقوم بإنشاء أبعاد الإدخال الصحيحة (والتي بالنسبة إلى Transformer الأساسي هي 768 قيمة لكل تضمين رقعة). إذا كان لديك صورة 224x224 بكسل، فيمكنك تقسيمها إلى 196 رقعة صورة 16x16. تمامًا مثل كيفية تجزئة النص إلى كلمات، يتم "تجزئة" الصورة إلى سلسلة من الرقع. + +2. يتم إضافة *رمز قابل للتعلم* - تتم إضافة رمز خاص `[CLS]` - إلى بداية تمثيلات الرقع تمامًا مثل BERT. يتم استخدام الحالة المخفية النهائية للرمز `[CLS]` كمدخل لرأس التصنيف المُرفق؛ يتم تجاهل المخرجات الأخرى. تساعد هذه الرموز النموذج على تعلم كيفية ترميز تمثيل الصورة. + +3. الشيء الأخير تتم إضافة "تمثيلات تموضع" إلى تمثيلات الرقع والرمز القابل للتعلم لأن النموذج لا يعرف كيفية ترتيب رقع الصورة. تكون تمثيلات التموضع قابلة للتعلم أيضًا ولها نفس حجم تمثيلات الرقع. وأخيرًا، يتم تمرير جميع التمثيلات إلى مُشفّر Transformer. + +4. يتم تمرير الإخراج، وتحديدًا مخرج الرمز `[CLS]`، إلى رأس الإدراك المتعدد الطبقات (MLP). الهدف من التدريب المسبق لـ ViT هو التصنيف فقط. يقوم رأس MLP، مثل رؤوس التصنيف الأخرى، يحول رأس MLP المخرجات إلى احتمالات عبر تصنيفات الفئات ويحسب دالة التكلفة (الخسارة المتقاطعة) للعثور على الفئة الأكثر احتمالًا. + +هل أنت مستعد لتجربة تصنيف الصور؟ تحقق من دليلنا الشامل [تصنيف الصور](tasks/image_classification) لمعرفة كيفية ضبط دقة نموذج ViT واستخدامه للاستدلال! + +#### الشبكات العصبية التلافيفية (CNN) + + + +يشرح هذا القسم بإيجاز الالتفافات، ولكن سيكون من المفيد أن يكون لديك فهم مسبق لكيفية تغيير شكل الصورة وحجمها. إذا كنت غير معتاد على الالتفافات، تحقق من [فصل الشبكات العصبية التلافيفية](https://github.com/fastai/fastbook/blob/master/13_convolutions.ipynb) من كتاب fastai! + + + +[ConvNeXT](model_doc/convnext) هو بنية CNN تعتمد تصاميم الشبكات الجديدة والحديثة لتحسين الأداء. ومع ذلك، لا تزال الالتفافات هي جوهر النموذج. من منظور عام، [الالتفاف](glossary#convolution) هو عملية حيث يتم ضرب مصفوفة أصغر (*نواة*) بمقطع صغير من وحدات بكسل الصورة. يحسب بعض الميزات منه، مثل نسيج معين أو انحناء خط. ثم ينزلق إلى النافذة التالية من البكسلات؛ المسافة التي تقطعها الالتفاف تسمى *الخطوة*. + +
عملية التفاف أساسية بدون حشو (padding) أو خطوة (stride) واسعة، مأخوذة من دليل حساب الالتفاف للتعلم العميق.

يمكنك تغذية هذا الناتج إلى طبقة التفاف أخرى، ومع كل طبقة متتالية، تتعلم الشبكة أشياء أكثر تعقيدًا وتجريدًا مثل النقانق أو الصواريخ. بين طبقات الالتفاف، من الشائع إضافة طبقة تجميع (Pooling) لتقليل الأبعاد وجعل النموذج أكثر متانة تجاه التغيرات في موضع الميزة.
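ولتوضيح الفكرة عمليًا، هذا مقتطف مبسّط باستخدام PyTorch (أبعاد الصورة وعدد القنوات هنا افتراضية للتوضيح فقط) يُظهر طبقة التفاف بنواة وخطوة محددتين متبوعة بطبقة تجميع:

```py
import torch
import torch.nn as nn

# دفعة من صورة RGB واحدة بحجم 224x224 (قيم عشوائية للتوضيح)
image = torch.randn(1, 3, 224, 224)

# نواة 3x3 تنزلق على الصورة بخطوة (stride) مقدارها 1 مع حشو (padding) يحافظ على الأبعاد
conv = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
features = conv(image)  # الشكل الناتج: (1, 16, 224, 224)

# طبقة تجميع تقلل الأبعاد وتجعل التمثيل أكثر متانة تجاه إزاحة الميزات
pool = nn.MaxPool2d(kernel_size=2, stride=2)
pooled = pool(features)  # الشكل الناتج: (1, 16, 112, 112)

print(features.shape, pooled.shape)
```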
+ +يقوم ConvNeXT بتحديث شبكة CNN بطرق خمس: + +1. تغيير عدد الكتل في كل مرحلة و"ترقيع" الصورة باستخدام خطوة أكبر وحجم نواة المقابل. تجعل استراتيجية التجزئة غير المتداخلة استراتيجية الترقيع مشابهة للطريقة التي يقسم بها ViT للصورة إلى رقع. + +2. تقلص طبقة *العنق الزجاجي* عدد القنوات ثم تعيدها لأنها أسرع في إجراء التفاف 1x1، ويمكنك زيادة العمق. يقوم عنق الزجاجة المقلوب بالعكس عن طريق توسيع عدد القنوات وتقلصها، وهو أكثر كفاءة من حيث الذاكرة. + +3. استبدل طبقة الالتفاف النموذجية 3x3 في طبقة عنق الزجاجة بـ *الالتفاف بالعمق*، والذي يطبق الالتفاف على كل قناة إدخال بشكل منفصل ثم يقوم بتكديسها معًا مرة أخرى في النهاية. هذا يوسع عرض الشبكة لتحسين الأداء. + +4. لدى ViT مجال استقبال عالمي مما يعني أنه يمكنه رؤية المزيد من الصورة في وقت واحد بفضل آلية الانتباه الخاصة به. تحاول ConvNeXT محاكاة هذا التأثير عن طريق زيادة حجم النواة إلى 7x7. + +5. يقوم ConvNeXT أيضًا بإجراء العديد من تغييرات تصميم الطبقة التي تُحاكي نماذج المحولات. هناك عدد أقل من طبقات التنشيط والطبقات التطبيع، يتم تبديل دالة التنشيط إلى GELU بدلاً من ReLU، ويستخدم LayerNorm بدلاً من BatchNorm. + +يتم تمرير الإخراج من كتل الالتفاف إلى رأس تصنيف يحول المخرجات إلى احتمالات ويحسب دالة التكلفة (الخسارة المتقاطعة) للعثور على التصنيف الأكثر احتمالاً. + +### اكتشاف الكائنات (Object detection) + +[DETR](model_doc/detr)، *DEtection TRansformer*، هو نموذج اكتشاف كائنات من البداية إلى النهاية يجمع بين CNN مع محول المشفر-فك التشفير. + +
+ +1. يأخذ العمود الفقري CNN *المدرب مسبقًا* صورة، ممثلة بقيم بكسلاتها، وينشئ خريطة ميزات منخفضة الدقة لها. يتم تطبيق التفاف 1x1 على خريطة الميزات لتقليل الأبعاد، و إنشاء خريطة ميزات جديدة بتمثيل صورة عالي المستوى. نظرًا لأن المحول (Transformer) هو نموذج تسلسلي، يتم تسوية خريطة الميزات إلى تسلسل من متجهات الميزات التي يتم دمجها مع تمثيلات التموضع. + +2. يتم تمرير متجهات الميزات إلى المشفر، والذي يتعلم تمثيلات الصورة باستخدام طبقات الانتباه الخاصة به. بعد ذلك، يتم دمج الحالات المخفية للمُشفّر مع *استعلامات الكائنات* في فك التشفير. استعلامات الكائنات هي تمثيلات مكتسبة تركز على مناطق مختلفة من الصورة، ويتم تحديثها أثناء مرورها عبر كل طبقة انتباه. يتم تمرير الحالات المخفية لفك التشفير إلى شبكة تغذية أمامية التي تتنبأ بإحداثيات مربعات الإحاطة وتصنيف العلامة لكل استعلام كائن، أو `بدون كائن` إذا لم يكن هناك أي كائن. + + يقوم DETR بفك تشفير كل استعلام كائن بالتوازي لإخراج *N* من التنبؤات النهائية، حيث *N* هو عدد الاستعلامات. على عكس النموذج التلقائي الذي يتنبأ بعنصر واحد في كل مرة، فإن "اكتشاف الكائنات" هو مهمة تنبؤ بمجموعة من التنبؤات (مثل `مربع إحاطة`، `تصنيف`) تقوم بإجراء *N* من التنبؤات في مرور واحدة. + +3. يستخدم DETR دالة *خسارة المطابقة ثنائية الفئات* أثناء التدريب لمقارنة عدد ثابت من التنبؤات بمجموعة ثابتة من تصنيفات البيانات الحقيقية. إذا كان هناك عدد أقل من تصنيفات البيانات الحقيقية في مجموعة *N* من التصنيفات، فيتم حشوها بفئة "بدون كائن". تشجع دالة الخسارة هذه DETR على العثور على تعيين واحد لواحد بين التنبؤات وتصنيفات البيانات الحقيقية. إذا لم تكن مربعات الإحاطة أو تصنيفات الفئات صحيحة، يتم تكبد خسارة. وبالمثل، إذا تنبأ DETR بكائن غير موجود، فإنه يتم معاقبته. وهذا يشجع DETR على العثور على كائنات أخرى في الصورة بدلاً من التركيز على كائن بارز حقًا. + +يتم إضافة رأس اكتشاف كائن أعلى DETR للعثور على تصنيف الكائن وإحداثيات مربع الإحاطة. هناك مكونان لرأس اكتشاف الكائنات: طبقة خطية لتحويل حالات فك التشفير المخفية إلى احتمالات عبر تصنيفات الفئات، وشبكةMLP للتنبؤ بمربع الإحاطة. + +هل أنت مستعد لتجربة اكتشاف الكائنات؟ تحقق من دليلنا الشامل [دليل اكتشاف الكائنات](tasks/object_detection) لمعرفة كيفية ضبط نموذج DETR واستخدامه للاستدلال! + +### تجزئة الصورة (Image segmentation) + +يُعد [Mask2Former](model_doc/mask2former) بنيةً شاملةً لحل جميع أنواع مهام تجزئة الصور. عادةً ما تُصمم نماذج التجزئة التقليدية لمهمة فرعية محددة من مهام تجزئة الصور، مثل تجزئة المثيل أو التجزئة الدلالية أو التجزئة الشاملة. يصوغ Mask2Former كل مهمة من تلك المهام على أنها مشكلة *تصنيف الأقنعة*. يقوم تصنيف القناع بتجميع وحدات البكسل في *N* قطعة، ويتنبأ بـ *N* أقنعة وتصنيف الفئة المقابل لها لصورة معينة. سنشرح في هذا القسم كيفية عمل Mask2Former، ويمكنك بعد ذلك تجربة ضبط SegFormer في النهاية. + +
+ +هناك ثلاثة مكونات رئيسية لـ Mask2Former: + +1. العمود الفقري [Swin](model_doc/swin) يقبل صورة وينشئ خريطة ميزات منخفضة الدقة من 3 عمليات التفافات متتالية 3x3. + +2. يتم تمرير خريطة الميزات إلى *فك تشفير البكسل* الذي يقوم تدريجياً بزيادة الميزات منخفضة الدقة إلى تمثيلات عالية الدقة لكل بكسل. في الواقع، يقوم فك تشفير البكسل بإنشاء ميزات متعددة المقاييس (تحتوي على كل من الميزات منخفضة وعالية الدقة) بدقة 1/32 و1/16 و1/8 من الصورة الأصلية. + +3. يتم تغذية كل من خرائط الميزات ذات المقاييس المختلفة على التوالي إلى طبقة واحدة من طبقات فك التشفير في كل مرة لالتقاط الأجسام الصغيرة من ميزات الدقة العالية. يتمثل مفتاح Mask2Former آلية *الاهتمام المقنع* في فك التشفير. على عكس الانتباه المتقاطع الذي يمكن أن يركز على الصورة بأكملها، يركز الانتباه المقنع فقط على منطقة معينة من الصورة. هذا أسرع ويؤدي إلى أداء أفضل لأن الميزات المحلية لصورة كافية للنموذج للتعلم منها. + +4. مثل [DETR](tasks_explained#object-detection)، يستخدم Mask2Former أيضًا استعلامات كائن مكتسبة ويجمعها مع ميزات الصورة من فك تشفير البكسل لإجراء تنبؤ مجموعة (`تصنيف الفئة`، `التنبؤ بالقناع`). يتم تمرير حالات فك التشفير المخفية إلى طبقة خطية وتحويلها إلى احتمالات عبر علامات التصنيف. يتم حساب دالة التكلفة (الخسارة المتقاطعة) بين الاحتمالات وتصنيف الفئة لتحديد الأكثر احتمالاً. + + يتم إنشاء تنبؤات الأقنعة عن طريق الجمع بين تمثيلات البكسل وحالات فك التشفير المخفية النهائية. يتم حساب دالة الخسارة المتقاطعة سيجمويد وخسارة النرد بين الاحتمالات والقناع البيانات الحقيقية للعثور على القناع الأكثر احتمالاً. + +هل أنت مستعد لتجربة يدك في اكتشاف الكائنات؟ تحقق من دليلنا الشامل [دليل تجزئة الصورة](tasks/semantic_segmentation) لمعرفة كيفية ضبط SegFormer واستخدامه للاستدلال! + +### تقدير العمق (Depth estimation) + +[GLPN](model_doc/glpn)، شبكة المسار العالمية المحلية، هي محول ل تقدير العمق الذي يجمع بين مشفر [SegFormer](model_doc/segformer) مع فك تشفير خفيف الوزن. + +
+ +1. مثل ViT، يتم تقسيم الصورة إلى تسلسل من الرقع، باستثناء أن هذه رقع الصورة أصغر. هذا أفضل لمهام التنبؤ الكثيفة مثل التجزئة أو تقدير العمق. يتم تحويل رقع الصورة إلى تمثيلات للرقع (راجع قسم [تصنيف الصور](#image-classification) لمزيد من التفاصيل حول كيفية إنشاء تمثيلات الرقع)، والتي يتم تغذيتها إلى المشفر. + +2. يقبل المشفر تمثيلات الرقع، ويمررها عبر عدة كتل مشفرة. يتكون كل كتلة من طبقات انتباه وMix-FFN. الغرض من هذا الأخير هو توفير معلومات موضعية. في نهاية كل كتلة مشفرة توجد طبقة *دمج الرقع* لإنشاء تمثيلات هرمية. يتم دمج ميزات كل مجموعة من الرقع المجاورة، ويتم تطبيق طبقة خطية على الميزات المدمجة لتقليل عدد الرقع إلى دقة 1/4. يصبح هذا المُدخل للكتلة المشفرة التالية، حيث تتكرر هذه العملية بأكملها حتى تحصل على ميزات الصورة بدقة 1/8 و1/16 و1/32. + +3. يقوم فك تشفير خفيف الوزن بأخذ خريطة الميزات الأخيرة (مقياس 1/32) من المشفر وزيادة حجمها إلى مقياس 1/16. من هنا، يتم تمرير الميزة إلى وحدة *دمج الميزات الانتقائية (SFF)*، والتي تقوم باختيار ودمج الميزات المحلية والعالمية من خريطة انتباه لكل ميزة ثم زيادة حجمها إلى 1/8. تتم إعادة هذه العملية حتى تصبح الميزات فك التشفير بنفس حجم الصورة الأصلية. يتم تمرير الإخراج عبر طبقتين تلافيفتين ثم يتم تطبيق تنشيط سيجمويد للتنبؤ بعمق كل بكسل. + +## معالجة اللغات الطبيعية (Natural language processing -NLP) + +تم تصميم نموذج المحول Transformer في الأصل للترجمة الآلية، ومنذ ذلك الحين أصبح في الواقع البنية الافتراضية لحل جميع مهام NLP. تناسب بعض المهام بنية المشفر في نموذج المحول، في حين أن البعض الآخر أكثر ملاءمة لفك التشفير. لا تزال مهام أخرى تستخدم بنية المشفر-فك التشفير في نموذج المحول. + +### تصنيف النصوص (Text classification) + +يعد [BERT](model_doc/bert) نموذج يعتمد على المُشفّر فقط، وهو أول نموذج يُطبق بشكل فعال ثنائية الاتجاه العميقة لتعلم تمثيلات أكثر ثراءً للنص من خلال الانتباه إلى الكلمات على كلا الجانبين. + +1. يستخدم BERT تجزئة [WordPiece](tokenizer_summary#wordpiece) لإنشاء تمثيل رمزي للنص. للتمييز بين جملة واحدة وزوج من الجمل، تتم إضافة رمز خاص `[SEP]` للتفريق بينهما. تتم إضافة رمز خاص `[CLS]` إلى بداية كل تسلسل نصي. ويتم استخدام الإخراج النهائي مع الرمز `[CLS]` كمدخل لرأس التصنيف لمهام التصنيف. كما يضيف BERT تضمينًا للمقطع للإشارة إلى ما إذا كان الرمز ينتمي إلى الجملة الأولى أو الثانية في زوج من الجمل. + +2. يتم تدريب BERT المسبق باستخدام هدفين: نمذجة اللغة المقنعة وتنبؤ الجملة التالية. في نمذجة اللغة المقنعة، يتم إخفاء نسبة مئوية مُعيّنة من رموز الإدخال بشكل عشوائي، ويجب على النموذج التنبؤ بها. يحل هذا مشكلة ثنائية الاتجاه، حيث يمكن للنموذج أن يغش ويرى جميع الكلمات و"يتنبأ" بالكلمة التالية. تتم تمرير الحالات المخفية النهائية للرموز المقنعة المتوقعة إلى شبكة تغذية أمامية مع دالة Softmax عبر مفردات اللغة للتنبؤ بالكلمة المقنعة. + + الهدف الثاني من التدريب المسبق هو توقع الجملة التالية. يجب على النموذج التنبؤ بما إذا كانت الجملة "ب" تتبع الجملة"أ". نصف الوقت تكون الجملة "ب" هي الجملة التالية، والنصف الآخر من الوقت، تكون الجملة "ب" عبارة عشوائية. يتم تمرير التنبؤ، سواء كانت الجملة التالية أم لا، إلى شبكة تغذية أمامية مع دالة Softmax عبر الفئتين (`IsNext` و`NotNext`). + +3. يتم تمرير تمثيلات الإدخال عبر عدة طبقات مشفرة لإخراج بعض الحالات المخفية النهائية. + +لاستخدام النموذج المسبق التدريب لتصنيف النصوص، أضف رأس تصنيف تسلسلي أعلى نموذج BERT الأساسي. رأس تصنيف التسلسلي هو طبقة خطية تقبل الحالات المخفية النهائية وتجري تحويلًا خطيًا لتحويلها إلى احتمالات logits. يتم حساب دالة التكلفة (الخسارة المتقاطعة) بين logits والهدف للعثور على التصنيف الأكثر احتمالًا. + +هل أنت مستعد لتجربة تصنيف النصوص؟ تحقق من [دليل تصنيف النصوص](tasks/sequence_classification) الشامل الخاص بنا لمعرفة كيفية ضبط نموذج DistilBERT واستخدامه للاستنتاج! 
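وكمثال توضيحي مبسّط على ما سبق (نقطة التخزين وعدد الفئات هنا افتراضيان للتوضيح، ويُهيّأ رأس التصنيف بأوزان عشوائية إلى أن يتم ضبطه بدقة):

```py
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "google-bert/bert-base-uncased"  # نقطة تخزين افتراضية للتوضيح
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

inputs = tokenizer("Hugging Face is the best thing since sliced bread!", return_tensors="pt")
labels = torch.tensor([1])  # تصنيف مستهدف افتراضي

# عند تمرير labels، يحسب النموذج الخسارة المتقاطعة بين logits والتصنيف المستهدف
outputs = model(**inputs, labels=labels)
print(outputs.logits, outputs.loss)
```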
+ +### تصنيف الرموز (Token classification) + +لاستخدام BERT لمهام تصنيف الرموز مثل التعرف على الكيانات المسماة (NER)، أضف رأس تصنيف الرموز أعلى نموذج BERT الأساسي. رأس تصنيف الرموز هو طبقة خطية تقبل الحالات المخفية النهائية وتجري تحويلًا خطيًا لتحويلها إلى logits. يتم حساب دالة التكلفة (الخسارة المتقاطعة) بين logits وكل رمز للعثور على التصنيف الأكثر احتمالًا. + +هل أنت مستعد لتجربة تصنيف الرموز؟ تحقق من [دليل تصنيف الرموز](tasks/token_classification) الشامل الخاص بنا لمعرفة كيفية ضبط نموذج DistilBERT واستخدامه للاستنتاج! + +### الإجابة على الأسئلة (Question answering) + +لاستخدام BERT للإجابة على الأسئلة، أضف رأس تصنيف المدى أعلى نموذج BERT الأساسي. تقبل هذه الطبقة الخطية الحالات المخفية النهائية وتُجري تحويلًا خطيًا لحساب داية ونهاية `الامتداد` logits `span` البداية والنهاية المقابلة للإجابة. يتم حسابدالة التكلفة (الخسارة المتقاطعة) بين logits وموقع التصنيف للعثور على الامتداد الأكثر احتمالًا من النص المقابل للإجابة. + +هل أنت مستعد لتجربة الإجابة على الأسئلة؟ راجع [دليل الإجابة على الأسئلة](tasks/question_answering) الشامل الخاص بنا لمعرفة كيفية ضبط نموذج DistilBERT واستخدامه في الاستدلال! + + + + +💡 لاحظ مدى سهولة استخدام BERT لمهام مختلفة بمجرد تدريبه مسبقًا. كل ما تحتاج إليه هو إضافة رأس محدد إلى النموذج المسبق التدريب للتلاعب بالحالات المخفية إلى الإخراج المطلوب! + + + +### توليد النصوص (Text generation) + +يُعد [GPT-2](model_doc/gpt2) نموذجًا قائم على فك التشفير فقط تم تدريبه المسبق على كمية كبيرة من النصوص. يمكنه توليد نص مقنع (على الرغم من أنه ليس دائمًا صحيحًا!) بناءً على مُحفّز معين واستكمال مهام NLP الأخرى مثل الإجابة على الأسئلة على الرغم من أنه لم يتم تدريبه بشكل صريح على ذلك. + +
+ +1. يستخدم GPT-2 [ترميز الأزواج البايتية (BPE)](tokenizer_summary#byte-pair-encoding-bpe) لتجزئة الكلمات وإنشاء تمثيل رمزى. يتم إضافة تمثيلات موضعية إلى تمثيلات الرموز للإشارة إلى موضع كل رمز في التسلسل. يتم تمرير تمثيلات الإدخال عبر عدة كتل فك تشفير لإخراج بعض الحالات المخفية النهائية. داخل كل كتلة فك تشفير، يستخدم GPT-2 طبقة *انتباه ذاتي مقنع* مما يعني أن GPT-2 لا يمكنه الانتباه بالرموز المستقبلية. يُسمح له فقط بالاهتمام بالرموز الموجودة على اليسار. يختلف هذا عن رمز [`mask`] الخاص بـ BERT لأنه، في الانتباه الذاتي المقنع، يتم استخدام قناع انتباه لتعيين النتيجة إلى `0` للرموز المستقبلية. + +2. يتم تمرير الإخراج من فك التشفير إلى رأس نمذجة اللغة، والتي تُجري تحويلًا خطيًا لتحويل الحالات المخفية إلى احتمالات logits. التصنيف هو الرمز التالي في التسلسل، والذي يتم إنشاؤه عن طريق تغيير موضع logits إلى اليمين بمقدار واحد. يتم حساب دالة التكلفة (الخسارة المتقاطعة) بين logits التي تم تغيير موضعها والتصنيفات لإخراج الرمز التالي الأكثر احتمالًا. + +يستند هدف التدريب المسبق لـ GPT-2 بالكامل إلى [نمذجة اللغة السببية](glossary#causal-language-modeling)، والتنبؤ بالكلمة التالية في تسلسل. يجعل هذا GPT-2 جيدًا بشكل خاص في المهام التي تتضمن توليد النص. + +هل أنت مستعد لتجربة توليد النصوص؟ تحقق من دليل [دليل نمذجة اللغة السببية](tasks/language_modeling#causal- الشامل الخاص بنا لمعرفة كيفية ضبط نموذج DistilGPT-2 واستخدامه للاستنتاج! + + + +للحصول على مزيد من المعلومات حول توليد النص، راجع دليل [استراتيجيات توليد النصوص](generation_strategies)! + + + +### التلخيص (Summarization) + +تم تصميم نماذج المشفر-فك التشفير مثل [BART](model_doc/bart) و [T5](model_doc/t5) لنمط تسلسل إلى تسلسل لمهمة التلخيص. سنشرح كيف يعمل BART في هذا القسم، ثم يمكنك تجربة ضبط T5 في النهاية. + +
+ +1. تتشابه بنية المشفر BART كثيرًا مع BERT وتقبل رمزًا وتمثيلًا موضعيًا للنص. يتم تدريب BART مسبقًا عن طريق إتلاف المُدخلات ثم إعادة بنائه باستخدام فك التشفير. على عكس المشفرات الأخرى ذات استراتيجيات الإتلاف المحددة، يمكن لـ BART تطبيق أي نوع من الإتلاف. ومع ذلك، فإن استراتيجية إتلاف "ملء النص" تعمل بشكل أفضل. في ملء النص، يتم استبدال عدد من امتدادات النص برمز **واحد** [`mask`]. هذا أمر مهم لأن النموذج يجب أن يتنبأ بالرموز المقنعة، ويعلّم النموذج التنبؤ بعدد الرموز المفقودة. يتم تمرير تمثيلات الإدخال والامتدادات المقنعة عبر المشفر لإخراج بعض الحالات المخفية النهائية، ولكن على عكس BERT، لا يضيف BART شبكة تغذية أمامية نهائية في النهاية للتنبؤ بكلمة. + +2. يتم تمرير إخراج المشفر إلى فك التشفير، والذي يجب أن يتنبأ بالرموز المقنعة وأي رموز غير تالفة من ناتج المشفر. يمنح هذا فك التشفير سياقًا إضافيًا للمساعدة في استعادة النص الأصلي. يتم تمرير ناتج فك التشفير إلى رأس نمذجة اللغوية، والذي يجري تحويلًا خطيًا لتحويل الحالات المخفية إلى احتمالات(logits). يتم حساب دالة التكلفة (الخسارة المتقاطعة) بين الاحتمالات logits والتصنيف، وهو مجرد الرمز الذي تم تغيير موضعه إلى اليمين. + +هل أنت مستعد لتجربة التلخيص؟ تحقق من دليل التلخيص الشامل الخاص بنا لمعرفة كيفية ضبط نموذج T5 واستخدامه للاستنتاج! + + + +للحصول على مزيد من المعلومات حول توليد النص، راجع دليل استراتيجيات توليد النص! + + + +### الترجمة (Translation) + +تُعد الترجمة مثالًا آخر على مهام التسلسل إلى التسلسل، مما يعني أنه يمكنك استخدام نموذج المشفر-فك التشفير مثل [BART](model_doc/bart) أو [T5](model_doc/t5) للقيام بذلك. سنشرح كيف يعمل BART في هذا القسم، ثم يمكنك تجربة ضبط T5 في النهاية. + +يتكيف BART مع الترجمة عن طريق إضافة مشفر منفصل يتم تهيئته بشكل عشوائي لتعيين لغة المصدر بمدخلات يمكن فك تشفيرها إلى لغة الهدف. يتم تمرير تمثيلات هذا المشفر الجديد إلى المشفر المسبق التدريب بدلاً من تمثيلات الكلمات الأصلية. يتم تدريب مشفر المصدر عن طريق تحديث مشفر المصدر وتمثيلات التموضع وتمثيلات الإدخال باستخدام دالة التكلفة (الخسارة المتقاطعة) من ناتج النموذج. يتم تجميد معلمات النموذج في هذه الخطوة الأولى، ويتم تدريب جميع معلمات النموذج معًا في الخطوة الثانية. + +تم إصدار نسخة متعددة اللغات من BART، تسمى mBART، مُخصصة للترجمة ومُدرّبة مسبقًا على العديد من اللغات المختلفة. + +هل أنت مستعد لتجربة الترجمة؟ تحقق من دليل الترجمة الشامل الخاص بنا لمعرفة كيفية ضبط نموذج T5 واستخدامه للاستنتاج! + + + + **للحصول على مزيد من المعلومات حول توليد النصوص، راجع دليل [استراتيجيات توليد النصوص](generation_strategies)!** + + diff --git a/docs/source/ar/tokenizer_summary.md b/docs/source/ar/tokenizer_summary.md new file mode 100644 index 00000000000000..109521d24bbb76 --- /dev/null +++ b/docs/source/ar/tokenizer_summary.md @@ -0,0 +1,198 @@ +# ملخص عن المجزئات اللغوية + +[[open-in-colab]] + +في هذه الصفحة، سنتناول بالتفصيل عملية التجزئة. + + + +كما رأينا في [برنامج تعليمي حول المعالجة المسبقة](preprocessing)، فإن تجزئة النص يقسمه إلى كلمات أو +الرموز الفرعية (كلمات جزئية)، والتي يتم بعد ذلك تحويلها إلى معرفات من خلال قائمة بحث. يعد تحويل الكلمات أو الرموز الفرعية إلى معرفات مباشرًا، لذا في هذا الملخص، سنركز على تقسيم النص إلى كلمات أو رموز فرعية (أي تجزئة النص). +وبشكل أكثر تحديدًا، سنلقي نظرة على الأنواع الثلاثة الرئيسية من المُجزئات اللغوية المستخدمة في 🤗 المحولات: [ترميز الأزواج البايتية (BPE)](#byte-pair-encoding)، [WordPiece](#wordpiece)، و [SentencePiece](#sentencepiece)، ونعرض أمثلة +على نوع المُجزئة الذي يستخدمه كل نموذج. + +لاحظ أنه في كل صفحة نموذج، يمكنك الاطلاع على وثائق المُجزئة المرتبط لمعرفة نوع المُجزئ +الذي استخدمه النموذج المُدرب مسبقًا. على سبيل المثال، إذا نظرنا إلى [`BertTokenizer`]، يمكننا أن نرى أن النموذج يستخدم [WordPiece](#wordpiece). 
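وكمثال سريع على ذلك، يمكنك تحميل المُجزّئ المرتبط بنقطة تخزين مُدرّبة مسبقًا والاطلاع على نوعه (اسم نقطة التخزين هنا للتوضيح فقط):

```py
from transformers import AutoTokenizer

# يعيد AutoTokenizer صنف المُجزّئ المناسب لنقطة التخزين؛ وهو بالنسبة لـ BERT مُجزّئ مبني على WordPiece
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
print(type(tokenizer).__name__)
print(tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do."))
```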
+ +## مقدمة + +إن تقسيم النص إلى أجزاء أصغر هو مهمة أصعب مما تبدو، وهناك طرق متعددة للقيام بذلك. +على سبيل المثال، دعنا نلقي نظرة على الجملة `"Don't you love 🤗 Transformers? We sure do."` + + + +يمكن تقسيم هذه الجملة ببساطة عن طريق المسافات، مما سينتج عنه ما يلي:``` + +``` +["Don't", "you", "love", "🤗", "Transformers?", "We", "sure", "do."] +``` + +هذه خطوة أولى منطقية، ولكن إذا نظرنا إلى الرموز `"Transformers?"` و `"do."`، فإننا نلاحظ أن علامات الترقيم مُرفقة بالكلمات `"Transformer"` و `"do"`، وهو أمر ليس مثالي. يجب أن نأخذ علامات الترقيم في الاعتبار حتى لا يضطر النموذج إلى تعلم تمثيل مختلف للكلمة وكل رمز ترقيم مُحتمل قد يليها، الأمر الذي من شأنه أن يزيد بشكل هائل عدد التمثيلات التي يجب على النموذج تعلمها. +مع مراعاة علامات الترقيم، سيُصبح تقسيم نصنا على النحو التالي: + +``` +["Don", "'", "t", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] +``` + +أفضل. ومع ذلك، من غير الملائم كيفية تقسيم الكلمة `"Don't"`. `"Don't"` تعني `"do not"`، لذا سيكون من الأفضل تحليلها على أنها كلمتين مُدمجتين `["Do"، "n't"]`. هنا تبدأ الأمور في التعقيد، وهو جزء من سبب امتلاك كل نموذج لنوّعه الخاص من مُجزّئ النصوص (tokenizer). اعتمادًا على القواعد التي نطبقها لتقسيم النص، يسيتم إنشاء مخرجات مُجزّأة مُختلفة لنفس النص. ولن يؤدي النموذج المُدرب مسبقًا إلى الأداء بشكل صحيح إلا إذا قُدّم له مُدخل تم تقسيمه بنفس القواعد التي تم استخدامها لتقسيم بيانات التدريب الخاصة به. + +يُعد كل من [spaCy](https://spacy.io/) و [Moses](http://www.statmt.org/moses/?n=Development.GetStarted) هما مجزّئي النصوص التي تعتمد على القواعد +الشائعة. عند تطبيقها على مثالنا، فإن *spaCy* و *Moses* ستخرج نّصًا مثل: + +``` +["Do", "n't", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] +``` + +كما يمكنك أن ترى، يتم هنا استخدام التقسيم المكاني والترقيم، وكذلك تقسيم الكلمات القائم على القواعد. يعد التقسيم المكاني والترقيم والتحليل القائم على القواعد كلاهما مثالين على تقسيم الكلمات، والذي يُعرّف بشكل غير مُحدد على أنه تقسيم الجُمل إلى كلمات. في حين أنها الطريقة الأكثر بديهية لتقسيم النصوص إلى أجزاء أصغر، +يمكن أنها تؤدى إلى مشكلات لمجموعات النصوص الضخمة. في هذه الحالة، عادةً ما يؤدي التقسيم المكاني والترقيم +إلى إنشاء مفردات كبيرة جدًا (مجموعة من جميع الكلمات والرموز الفريدة المستخدمة). على سبيل المثال، يستخدم [Transformer XL](model_doc/transfo-xl) التقسيم المكاني والترقيم، مما يؤدي إلى حجم مُفردات يبلغ 267735! + +يفرض حجم المُفردات الكبير هذا على النموذج أن يكون لديه مصفوفة تضمين (embedding matrix) ضخمة كطبقة إدخال وإخراج، مما يؤدي إلى زيادة كل من التعقيد الزمني والذاكرة. بشكل عام، نادرًا ما يكون لدى نماذج المحولات حجم مفردات +أكبر من 50000، خاصة إذا تم تدريبها مسبقًا على لغة واحدة فقط. + +لذا إذا كان التقسيم المكاني و الترقيم البسيط غير مرضٍ، فلماذا لا نقسّم الحروف ببساطة؟ + + + +في حين أن تقسيم الأحرف بسيط للغاية ومن شأنه أن يقلل بشكل كبير من التعقيد الزمني والذاكرة، إلا أنه يجعل من الصعب +على النموذج تعلم تمثيلات المدخلات ذات معنى. على سبيل المثال، يعد تعلم تمثيل مستقل عن السياق للحرف "t" أكثر صعوبة من تعلم تمثيل مستقل عن السياق لكلمة "اليوم". لذلك، غالبًا ما يكون تحليل الأحرف مصحوبًا بفقدان الأداء. لذا للحصول على أفضل ما في العالمين، تستخدم نماذج المحولات نظامًا هجينًا بين تقسيم على مستوى الكلمة وتقسيم علي مستوى الأحرف يسمى **تقسيم الوحدات الفرعية للّغة** (subword tokenization). + +## تقسيم الوحدات الفرعية للّغة (Subword Tokenization) + + + +تعتمد خوارزميات تقسيم الوحدات الفرعية subword على المبدأ القائل بأن الكلمات الشائعة الاستخدام لا ينبغي تقسيمها إلى وحدات فرعية أصغر، ولكن يجب تفكيك الكلمات النادرة إلى رموز فرعية ذات معنى. 
على سبيل المثال، قد يتم اعتبار "annoyingly" +كلمة نادرة ويمكن تحليلها إلى "annoying" و "ly". كل من "annoying" و "ly" كـ subwords مستقلة ستظهر بشكل متكرر أكثر في حين أن معنى "annoyingly" يتم الاحتفاظ به من خلال المعنى المركب لـ "annoying" و "ly". هذا مفيد بشكل خاص في اللغات التلصيقية مثل التركية، حيث يمكنك تشكيل كلمات مُركبة طويلة (تقريبًا) بشكل تعسفي عن طريق ضم الرموز الفرعية معًا. + +يسمح تقسيم subword للنموذج بأن يكون له حجم مفردات معقول مع القدرة على تعلم تمثيلات مستقلة عن السياق ذات معنى. بالإضافة إلى ذلك، يمكّن تقسيم subword النموذج من معالجة الكلمات التي لم يسبق له رؤيتها من قبل، عن طريق تحليلها إلى رموز فرعية معروفة. على سبيل المثال، يقوم المحلل [`~transformers.BertTokenizer`] بتحليل"I have a new GPU!" كما يلي: + +```py +>>> from transformers import BertTokenizer + +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> tokenizer.tokenize("I have a new GPU!") +["i", "have", "a", "new", "gp", "##u", "!"] +``` + +نظرًا لأننا نستخدم نموذجًا غير حساس لحالة الأحرف (uncased model)، فقد تم تحويل الجملة إلى أحرف صغيرة أولاً. يمكننا أن نرى أن الكلمات `["i"، "have"، "a"، "new"]` موجودة في مفردات مُجزّئ النصوص، ولكن الكلمة "gpu" غير موجودة. وبالتالي، يقوم مُجزّئ النصوص بتقسيم "gpu" إلى رموز فرعية معروفة: `["gp" و "##u"]`. يعني "##" أنه يجب ربط بقية الرمز بالرمز السابق، دون مسافة (للترميز أو عكس عملية تقسيم الرموز). + +كمثال آخر، يقوم المحلل [`~transformers.XLNetTokenizer`] بتقسيم نّص مثالنا السابق كما يلي: + +```py +>>> from transformers import XLNetTokenizer + +>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet/xlnet-base-cased") +>>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.") +["▁Don", "'", "t", "▁you", "▁love", "▁"، "🤗"، "▁"، "Transform"، "ers"، "؟"، "▁We"، "▁sure"، "▁do"، "."] +``` +سنعود إلى معنى تلك `"▁"` عندما نلقي نظرة على [SentencePiece](#sentencepiece). كما يمكنك أن ترى، +تم تقسيم الكلمة النادرة "Transformers" إلى الرموز الفرعية الأكثر تكرارًا `"Transform"` و `"ers"`. + +دعنا الآن نلقي نظرة على كيفية عمل خوارزميات تقسيم subword المختلفة. لاحظ أن جميع خوارزميات التقسيم هذه تعتمد على بعض أشكال التدريب الذي يتم عادةً على مجموعة البيانات التي سيتم تدريبها النموذج عليها. + + + +### ترميز الأزواج البايتية (BPE) + +تم تقديم رميز أزواج البايت (BPE) في ورقة بحثية بعنوان [الترجمة الآلية العصبية للكلمات النادرة باستخدام وحدات subword (Sennrich et al.، 2015)](https://arxiv.org/abs/1508.07909). يعتمد BPE على مُجزّئ أولي يقسم بيانات التدريب إلى +كلمات. يمكن أن يكون التحليل المسبق بسيطًا مثل التقسيم المكاني، على سبيل المثال [GPT-2](model_doc/gpt2)، [RoBERTa](model_doc/roberta). تشمل التقسيم الأكثر تقدمًا معتمد على التحليل القائم على القواعد، على سبيل المثال [XLM](model_doc/xlm)، [FlauBERT](model_doc/flaubert) الذي يستخدم Moses لمعظم اللغات، أو [GPT](model_doc/openai-gpt) الذي يستخدم spaCy و ftfy، لحساب تكرار كل كلمة في مجموعة بيانات التدريب. + +بعد التحليل المسبق، يتم إنشاء مجموعة من الكلمات الفريدة وقد تم تحديد تكرار كل كلمة في تم تحديد بيانات التدريب. بعد ذلك، يقوم BPE بإنشاء مفردات أساسية تتكون من جميع الرموز التي تحدث في مجموعة الكلمات الفريدة ويتعلم قواعد الدمج لتشكيل رمز جديد من رمزين من المفردات الأساسية. إنه يفعل ذلك حتى تصل المفردات إلى حجم المفردات المطلوب. لاحظ أن حجم المفردات هو فرط معلمة لتحديد قبل تدريب مُجزّئ النصوص. + +كمثال، دعنا نفترض أنه بعد التقسيم الأولي، تم تحديد مجموعة الكلمات التالية بما في ذلك تكرارها: + +``` +("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5) +``` + +وبالتالي، فإن المفردات الأساسية هي `["b"، "g"، "h"، "n"، "p"، "s"، "u"]`. 
من خلال تقسيم جميع الكلمات إلى رموز من المفردات الأساسية، نحصل على:
+
+```
+("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5)
+```
+
+بعد ذلك، يقوم BPE بعدّ عدد مرات حدوث كل زوج ممكن من الرموز ويختار زوج الرموز الأكثر تكرارًا. في المثال أعلاه، يظهر "h" متبوعًا بـ "u" _10 + 5 = 15_ مرة (10 مرات في حالات ظهور "hug" العشر، و5 مرات في حالات ظهور "hugs" الخمس). ومع ذلك، فإن أكثر أزواج الرموز شيوعًا هو "u" متبوعًا بـ "g"، والذي يظهر _10 + 5 + 5 = 20_ مرة في المجموع. وبالتالي، فإن أول قاعدة دمج يتعلمها مُجزّئ النصوص هي تجميع جميع رموز "u" المتبوعة بـ "g" معًا. بعد ذلك، يُضاف "ug" إلى المفردات. وتصبح مجموعة الكلمات:
+
+```
+("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5)
+```
+
+بعد ذلك، يحدد BPE ثاني أكثر أزواج الرموز شيوعًا، وهو "u" متبوعًا بـ "n"، والذي يظهر 16 مرة. فيُدمج "u" و "n" في "un" ويُضاف إلى المفردات. أما ثالث أكثر أزواج الرموز شيوعًا فهو "h" متبوعًا بـ "ug"، والذي يظهر 15 مرة. ومرة أخرى يُدمج الزوج ويُضاف "hug" إلى المفردات.
+
+في هذه المرحلة، تكون المفردات هي `["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"]` ويمكن تمثيل مجموعة الكلمات الفريدة لدينا كما يلي:
+
+```
+("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5)
+```
+
+بافتراض أن تدريب ترميز أزواج البايت سيتوقف عند هذه النقطة، فسيتم تطبيق قواعد الدمج التي تم تعلمها على الكلمات الجديدة (طالما أن هذه الكلمات الجديدة لا تتضمن رموزًا لم تكن في المفردات الأساسية). على سبيل المثال، سيتم تقسيم كلمة "bug" إلى `["b", "ug"]` ولكن سيتم تقسيم "mug" إلى `["<unk>", "ug"]` نظرًا لأن الرمز "m" غير موجود في المفردات الأساسية. بشكل عام، لا يتم استبدال الأحرف المفردة مثل "m" بالرمز `"<unk>"` لأن بيانات التدريب تتضمن عادةً ظهورًا واحدًا على الأقل لكل حرف، ولكن من المحتمل أن يحدث ذلك مع الرموز الخاصة جدًا مثل الرموز التعبيرية (emojis).
+
+كما ذكرنا سابقًا، فإن حجم المفردات، أي حجم المفردات الأساسية + عدد عمليات الدمج، هو معامل يجب اختياره. على سبيل المثال، لدى [GPT](model_doc/openai-gpt) حجم مفردات يبلغ 40478 لأن لديهم 478 حرفًا أساسيًا واختاروا التوقف عن التدريب بعد 40,000 عملية دمج.
+
+#### ترميز الأزواج البايتية على مستوى البايت
+
+قد تكون المفردات الأساسية التي تتضمن جميع الأحرف الأساسية كبيرة جدًا إذا تم، *على سبيل المثال*، اعتبار جميع أحرف اليونيكود أحرفًا أساسية. لذا، وللحصول على مفردات أساسية أفضل، يستخدم [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) البايتات كمفردات أساسية، وهي حيلة ذكية لإجبار المفردات الأساسية على أن تكون بحجم 256 مع ضمان تضمين كل حرف أساسي في المفردات. ومع بعض القواعد الإضافية للتعامل مع علامات الترقيم، يمكن لمُجزّئ النصوص الخاص بـ GPT2 تقسيم أي نص دون الحاجة إلى الرمز `<unk>`. لدى [GPT-2](model_doc/gpt2) حجم مفردات يبلغ 50257، وهو ما يتوافق مع 256 رمزًا أساسيًا للبايت، ورمز خاص لنهاية النص، والرموز التي تم تعلمها عبر 50000 عملية دمج.
+
+
+
+### WordPiece
+
+WordPiece هي خوارزمية تقسيم الوحدات الفرعية (subword) المستخدمة في [BERT](model_doc/bert) و [DistilBERT](model_doc/distilbert) و [Electra](model_doc/electra). تم توضيح الخوارزمية في [البحث الصوتي الياباني والكوري (Schuster et al., 2012)](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf) وهي مشابهة جدًا لـ BPE. أولاً، يقوم WordPiece بتهيئة المفردات لتشمل كل حرف موجود في بيانات التدريب، ثم يتعلم تدريجيًا عددًا معينًا من قواعد الدمج. وعلى عكس BPE، لا يختار WordPiece زوج الرموز الأكثر تكرارًا، بل الزوج الذي يزيد من احتمال بيانات التدريب إلى أقصى حد بمجرد إضافته إلى المفردات.
+
+لذا، ماذا يعني هذا بالضبط؟ بالإشارة إلى المثال السابق، فإن زيادة احتمال بيانات التدريب إلى أقصى حد تعادل إيجاد زوج الرموز الذي يكون احتماله مقسومًا على حاصل ضرب احتمالي رمزه الأول ورمزه الثاني هو الأكبر بين جميع أزواج الرموز. *على سبيل المثال*، لن يتم دمج `"u"` متبوعًا بـ `"g"` إلا إذا كان احتمال `"ug"` مقسومًا على احتمال `"u"` واحتمال `"g"` أكبر منه لأي زوج آخر من الرموز. بديهيًا، يختلف WordPiece قليلاً عن BPE في أنه يُقيّم ما سيخسره عند دمج رمزين للتأكد من أن الدمج يستحق ذلك.
+
+
+
+### Unigram
+
+Unigram هي خوارزمية تقسيم إلى وحدات فرعية (subword) تم تقديمها في ورقة [Subword Regularization: Improving Neural Network Translation Models with Multiple Subword Candidates (Kudo, 2018)](https://arxiv.org/pdf/1804.10959.pdf). وعلى عكس BPE أو WordPiece، يبدأ Unigram بمفردات أساسية تضم عددًا كبيرًا من الرموز ثم يقلصها تدريجيًا للحصول على مفردات أصغر. يمكن أن تتوافق المفردات الأساسية، على سبيل المثال، مع جميع الكلمات المُقسمة مسبقًا والسلاسل الفرعية الأكثر شيوعًا. لا يُستخدم Unigram مباشرةً لأي من النماذج في المكتبة، ولكنه يُستخدم بالاقتران مع [SentencePiece](#sentencepiece).
+
+في كل خطوة تدريب، تحدد خوارزمية Unigram خسارة (غالبًا ما تُعرَّف على أنها الأرجحية اللوغاريتمية log-likelihood) عبر بيانات التدريب بالنظر إلى المفردات الحالية ونموذج لغة unigram. بعد ذلك، بالنسبة لكل رمز في المفردات، تحسب الخوارزمية مقدار زيادة الخسارة الإجمالية إذا تمت إزالة هذا الرمز من المفردات. ثم يقوم Unigram بإزالة نسبة p (وعادةً ما تكون 10% أو 20%) من الرموز التي تكون زيادة الخسارة عند إزالتها هي الأدنى، *أي* الرموز الأقل تأثيرًا على الخسارة الإجمالية عبر بيانات التدريب. وتتكرر هذه العملية حتى تصل المفردات إلى الحجم المطلوب. وتحتفظ خوارزمية Unigram دائمًا بالأحرف الأساسية بحيث يمكن تقسيم أي كلمة.
+
+نظرًا لأن Unigram لا يعتمد على قواعد الدمج (على عكس BPE و WordPiece)، فإن للخوارزمية عدة طرق لتقسيم نص جديد بعد التدريب. على سبيل المثال، إذا كان مُجزّئ Unigram المُدرب يمتلك المفردات التالية:
+
+```
+["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"],
+```
+
+فيمكن تقسيم `"hugs"` إلى `["hug", "s"]` أو `["h", "ug", "s"]` أو `["h", "u", "g", "s"]`. فأيّها نختار؟ يحفظ Unigram احتمال كل رمز في مجموعة بيانات التدريب بالإضافة إلى حفظ المفردات، بحيث يمكن حساب احتمال كل تقسيم ممكن بعد التدريب. وتختار الخوارزمية ببساطة التقسيم الأكثر احتمالًا عمليًا، ولكنها توفر أيضًا إمكانية أخذ عينات من التقسيمات الممكنة وفقًا لاحتمالاتها.
+
+تُعرَّف هذه الاحتمالات بواسطة الخسارة التي يتم تدريب المُجزّئ عليها. فبافتراض أن بيانات التدريب تتكون من الكلمات \\(x_{1}, \dots, x_{N}\\) وأن مجموعة جميع التقسيمات الممكنة للكلمة \\(x_{i}\\) تُعرَّف بـ \\(S(x_{i})\\)، فإن الخسارة الإجمالية تُعرَّف على النحو التالي:
+
+$$\mathcal{L} = -\sum_{i=1}^{N} \log \left ( \sum_{x \in S(x_{i})} p(x) \right )$$
+
+
+
+### SentencePiece
+
+تشترك جميع خوارزميات التقسيم الموصوفة حتى الآن في المشكلة نفسها: فهي تفترض أن النص المُدخل يستخدم المسافات لفصل الكلمات. ومع ذلك، لا تستخدم جميع اللغات المسافات لفصل الكلمات. أحد الحلول الممكنة هو استخدام معالج مسبق خاص باللغة؛ فعلى سبيل المثال، يستخدم [XLM](model_doc/xlm) معالجات مسبقة محددة للصينية واليابانية والتايلاندية. ولحل هذه المشكلة بشكل أعم، تتعامل ورقة [SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing (Kudo et al., 2018)](https://arxiv.org/pdf/1808.06226.pdf) مع المدخلات كتدفق بيانات خام، وبالتالي تُضمّن المسافة في مجموعة الأحرف التي سيتم استخدامها. ثم تستخدم خوارزمية BPE أو unigram لبناء المفردات المناسبة.
+
+يستخدم [`XLNetTokenizer`] SentencePiece على سبيل المثال، وهو أيضًا سبب تضمين الحرف `"▁"` في المفردات. وعملية فك الترميز باستخدام SentencePiece سهلة للغاية، نظرًا لأنه يمكن دائمًا دمج الرموز معًا واستبدال `"▁"` بمسافة.
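+
+وللتوضيح فقط، إليك مثال مبسّط (توضيحي، وليس جزءًا من واجهة المكتبة) يبيّن فكرة فك الترميز هذه بدمج الرموز ثم استبدال `"▁"` بمسافة:
+
+```py
+# رموز ناتجة عن مُجزّئ يعتمد على SentencePiece (مثل مخرجات XLNetTokenizer أعلاه)
+tokens = ["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."]
+
+# ندمج الرموز ثم نستبدل "▁" بمسافة لاستعادة النص الأصلي
+text = "".join(tokens).replace("▁", " ").strip()
+print(text)  # Don't you love 🤗 Transformers? We sure do.
+```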
+ +تستخدم جميع نماذج المحولات في المكتبة التي تستخدم SentencePiece بالاقتران مع unigram. أمثلة على النماذج +باستخدام SentencePiece هي [ALBERT](model_doc/albert)، [XLNet](model_doc/xlnet)، [Marian](model_doc/marian)، و [T5](model_doc/t5). diff --git a/docs/source/en/_config.py b/docs/source/en/_config.py index f49e4e4731965a..4381def017ddc5 100644 --- a/docs/source/en/_config.py +++ b/docs/source/en/_config.py @@ -11,4 +11,4 @@ "{processor_class}": "FakeProcessorClass", "{model_class}": "FakeModelClass", "{object_class}": "FakeObjectClass", -} +} \ No newline at end of file diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ae632376f9469c..02595f30db2893 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -153,6 +153,8 @@ title: Interoperability with TikToken files - local: modular_transformers title: Modularity in `transformers` + - local: how_to_hack_models + title: Model Hacking (overwriting a class to your usage) title: Developer guides - sections: - local: quantization/overview @@ -177,6 +179,8 @@ title: Optimum - local: quantization/torchao title: TorchAO + - local: quantization/bitnet + title: BitNet - local: quantization/compressed_tensors title: compressed-tensors - local: quantization/contribute @@ -494,6 +498,8 @@ title: MT5 - local: model_doc/mvp title: MVP + - local: model_doc/myt5 + title: myt5 - local: model_doc/nemotron title: Nemotron - local: model_doc/nezha @@ -522,6 +528,8 @@ title: Phi - local: model_doc/phi3 title: Phi-3 + - local: model_doc/phimoe + title: PhiMoE - local: model_doc/phobert title: PhoBERT - local: model_doc/plbart @@ -532,12 +540,8 @@ title: QDQBert - local: model_doc/qwen2 title: Qwen2 - - local: model_doc/qwen2_audio - title: Qwen2Audio - local: model_doc/qwen2_moe title: Qwen2MoE - - local: model_doc/qwen2_vl - title: Qwen2VL - local: model_doc/rag title: RAG - local: model_doc/realm @@ -709,6 +713,8 @@ title: ViTMSN - local: model_doc/yolos title: YOLOS + - local: model_doc/zamba + title: Zamba - local: model_doc/zoedepth title: ZoeDepth title: Vision models @@ -882,6 +888,10 @@ title: Pix2Struct - local: model_doc/pixtral title: Pixtral + - local: model_doc/qwen2_audio + title: Qwen2Audio + - local: model_doc/qwen2_vl + title: Qwen2VL - local: model_doc/sam title: Segment Anything - local: model_doc/siglip @@ -959,4 +969,4 @@ - local: internal/time_series_utils title: Utilities for Time Series title: Internal Helpers - title: API + title: API \ No newline at end of file diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index 543d9fa00b8b5a..de3d056c916f5f 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -962,4 +962,129 @@ tokenizer.chat_template = open("template.jinja").read() As an added bonus, when you write a long, multi-line template in a separate file, line numbers in that file will exactly correspond to line numbers in template parsing or execution errors. This will make it much easier to -identify the source of issues. \ No newline at end of file +identify the source of issues. + +### Writing templates for tools + +Although chat templates do not enforce a specific API for tools (or for anything, really), we recommend +template authors try to stick to a standard API where possible. The whole point of chat templates is to allow code +to be transferable across models, so deviating from the standard tools API means users will have to write +custom code to use tools with your model. 
Sometimes it's unavoidable, but often with clever templating you can +make the standard API work! + +Below, we'll list the elements of the standard API, and give tips on writing templates that will work well with it. + +#### Tool definitions + +Your template should expect that the variable `tools` will either be null (if no tools are passed), or is a list +of JSON schema dicts. Our chat template methods allow users to pass tools as either JSON schema or Python functions, but when +functions are passed, we automatically generate JSON schema and pass that to your template. As a result, the +`tools` variable that your template receives will always be a list of JSON schema. Here is +a sample tool JSON schema: + +```json +{ + "type": "function", + "function": { + "name": "multiply", + "description": "A function that multiplies two numbers", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number", + "description": "The first number to multiply" + }, + "b": { + "type": "number", + "description": "The second number to multiply" + } + }, + "required": ["a", "b"] + } + } +} +``` + +And here is some example code for handling tools in your chat template. Remember, this is just an example for a +specific format - your model will probably need different formatting! + +```text +{%- if tools %} + {%- for tool in tools %} + {{- '' + tool['function']['name'] + '\n' }} + {%- for argument in tool['function']['parameters']['properties'] %} + {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }} + {%- endfor %} + {{- '\n' }} + {%- endif %} +{%- endif %} +``` + +The specific tokens and tool descriptions your template renders should of course be chosen to match the ones your model +was trained with. There is no requirement that your **model** understands JSON schema input, only that your template can translate +JSON schema into your model's format. For example, [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) +was trained with tools defined using Python function headers, but the Command-R tool template accepts JSON schema, +converts types internally and renders the input tools as Python headers. You can do a lot with templates! + +#### Tool calls + +Tool calls, if present, will be a list attached to a message with the "assistant" role. Note that `tool_calls` is +always a list, even though most tool-calling models only support single tool calls at a time, which means +the list will usually only have a single element. Here is a sample message dict containing a tool call: + +```json +{ + "role": "assistant", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "multiply", + "arguments": { + "a": 5, + "b": 6 + } + } + } + ] +} +``` + +And a common pattern for handling them would be something like this: + +```text +{%- if message['role'] == 'assistant' and 'tool_calls' in message %} + {%- for tool_call in message['tool_calls'] %} + {{- '' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n' }} + {%- endif %} + {%- endfor %} +{%- endif %} +``` + +Again, you should render the tool call with the formatting and special tokens that your model expects. + +#### Tool responses + +Tool responses have a simple format: They are a message dict with the "tool" role, a "name" key giving the name +of the called function, and a "content" key containing the result of the tool call. 
Here is a sample tool response: + +```json +{ + "role": "tool", + "name": "multiply", + "content": "30" +} +``` + +You don't need to use all of the keys in the tool response. For example, if your model doesn't expect the function +name to be included in the tool response, then rendering it can be as simple as: + +```text +{%- if message['role'] == 'tool' %} + {{- "" + message['content'] + "" }} +{%- endif %} +``` + +Again, remember that the actual formatting and special tokens are model-specific - you should take a lot of care +to ensure that tokens, whitespace and everything else exactly match the format your model was trained with! \ No newline at end of file diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 06e7e0b8ab3d08..64ded9613716a5 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -408,14 +408,24 @@ For the complete list of the available parameters, refer to the [API documentati ### Speculative Decoding Speculative decoding (also known as assisted decoding) is a modification of the decoding strategies above, that uses an -assistant model (ideally a much smaller one) with the same tokenizer, to generate a few candidate tokens. The main -model then validates the candidate tokens in a single forward pass, which speeds up the decoding process. If -`do_sample=True`, then the token validation with resampling introduced in the -[speculative decoding paper](https://arxiv.org/pdf/2211.17192.pdf) is used. +assistant model (ideally a much smaller one), to generate a few candidate tokens. The main model then validates the candidate +tokens in a single forward pass, which speeds up the decoding process. If `do_sample=True`, then the token validation with +resampling introduced in the [speculative decoding paper](https://arxiv.org/pdf/2211.17192.pdf) is used. +Assisted decoding assumes the main and assistant models have the same tokenizer, otherwise, see Universal Assisted Decoding below. Currently, only greedy search and sampling are supported with assisted decoding, and assisted decoding doesn't support batched inputs. To learn more about assisted decoding, check [this blog post](https://huggingface.co/blog/assisted-generation). +#### Universal Assisted Decoding + +Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers. +To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below). +Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are +in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above. +The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer. +Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings, +to ensure the new tokens include the correct prompt suffix. + To enable assisted decoding, set the `assistant_model` argument with a model. ```python @@ -435,6 +445,26 @@ To enable assisted decoding, set the `assistant_model` argument with a model. ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` +If the main and assistant models have different tokenizers, use Universal Assisted Decoding. 
+ +```python +>>> from transformers import AutoModelForCausalLM, AutoTokenizer + +>>> prompt = "Alice and Bob" +>>> checkpoint = "google/gemma-2-9b" +>>> assistant_checkpoint = "double7/vicuna-68m" + +>>> assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_checkpoint) +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +>>> inputs = tokenizer(prompt, return_tensors="pt") + +>>> model = AutoModelForCausalLM.from_pretrained(checkpoint) +>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) +>>> outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer) +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] +``` + When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness, just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency. @@ -458,6 +488,7 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed to model based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259). + ### DoLa Decoding **D**ecoding by C**o**ntrasting **La**yers (DoLa) is a contrastive decoding strategy to improve the factuality and reduce the diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index 0c6700544ae5c2..7418bbc497e660 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -81,6 +81,9 @@ For now the supported model architectures are the architectures that have been v - Qwen2Moe - Phi3 - Bloom +- Falcon +- StableLM +- GPT2 ## Example usage diff --git a/docs/source/en/how_to_hack_models.md b/docs/source/en/how_to_hack_models.md new file mode 100644 index 00000000000000..411539e104bfc2 --- /dev/null +++ b/docs/source/en/how_to_hack_models.md @@ -0,0 +1,180 @@ + + +# How to Hack Any Transformers Model + +The [🤗 Transformers](https://github.com/huggingface/transformers) library offers a collection of pre-trained models and tools for natural language processing, vision, and beyond. While these models cover a wide range of applications, you might encounter use cases that aren't supported out of the box. Customizing models can unlock new possibilities, such as adding new layers, altering architectures, or optimizing attention mechanisms. This guide will show you how to modify existing Transformers models to fit your specific needs. The great thing is, you don’t have to step away from the Transformers framework to make these changes. You can actually modify models directly in Transformers and still take advantage of features like the [Trainer API](https://huggingface.co/docs/transformers/main/en/main_classes/trainer), [PreTrainedModel](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel), and efficient fine-tuning with tools like [PEFT](https://huggingface.co/docs/peft/index). + +In this guide, we’ll walk you through how to customize existing Transformers models to meet your requirements—without losing the benefits of the ecosystem. + +You'll learn how to: + +- Modify a model's architecture by changing its attention mechanism. +- Apply techniques like Low-Rank Adaptation (LoRA) to specific model components. 
+ +We encourage you to contribute your own hacks and share them here with the community1 + +## Example: Modifying the Attention Mechanism in the Segment Anything Model (SAM) + +The **Segment Anything Model (SAM)** is a state-of-the-art model for image segmentation. In its default implementation, SAM uses a combined query-key-value (`qkv`) projection in its attention mechanism. However, you might want to fine-tune only specific components of the attention mechanism, such as the query (`q`) and value (`v`) projections, to reduce the number of trainable parameters and computational resources required. + +### Motivation + +By splitting the combined `qkv` projection into separate `q`, `k`, and `v` projections, you can apply techniques like **LoRA** (Low-Rank Adaptation) to only the `q` and `v` projections. This approach allows you to: + +- Fine-tune fewer parameters, reducing computational overhead. +- Potentially achieve better performance by focusing on specific components. +- Experiment with different adaptation strategies in the attention mechanism. + +### Implementation + +#### **Step 1: Create a Custom Attention Class** + +Next, subclass the original `SamVisionAttention` class and modify it to have separate `q`, `k`, and `v` projections. + +```python +import torch +import torch.nn as nn +from transformers.models.sam.modeling_sam import SamVisionAttention + +class SamVisionAttentionSplit(SamVisionAttention, nn.Module): + def __init__(self, config, window_size): + super().__init__(config, window_size) + del self.qkv + # Separate q, k, v projections + self.q = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self.k = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self.v = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self._register_load_state_dict_pre_hook(self.split_q_k_v_load_hook) + + def split_q_k_v_load_hook(self, state_dict, prefix, *args): + keys_to_delete = [] + for key in list(state_dict.keys()): + if "qkv." 
in key: + # Split q, k, v from the combined projection + q, k, v = state_dict[key].chunk(3, dim=0) + # Replace with individual q, k, v projections + state_dict[key.replace("qkv.", "q.")] = q + state_dict[key.replace("qkv.", "k.")] = k + state_dict[key.replace("qkv.", "v.")] = v + # Mark the old qkv key for deletion + keys_to_delete.append(key) + + # Remove old qkv keys + for key in keys_to_delete: + del state_dict[key] + + def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor: + batch_size, height, width, _ = hidden_states.shape + qkv_shapes = (batch_size * self.num_attention_heads, height * width, -1) + query = self.q(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + key = self.k(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + value = self.v(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + + attn_weights = (query * self.scale) @ key.transpose(-2, -1) + + if self.use_rel_pos: + attn_weights = self.add_decomposed_rel_pos( + attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width) + ) + + attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1) + attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1) + attn_output = self.proj(attn_output) + + if output_attentions: + outputs = (attn_output, attn_weights) + else: + outputs = (attn_output, None) + return outputs +``` + +**Explanation:** + +- **Separate Projections:** The combined `qkv` projection is removed, and separate `q`, `k`, and `v` linear layers are created. +- **Weight Loading Hook:** The `_split_qkv_load_hook` method splits the pre-trained `qkv` weights into separate `q`, `k`, and `v` weights when loading the model. This ensures compatibility with any pre-trained model. +- **Forward Pass:** Queries, keys, and values are computed separately, and the attention mechanism proceeds as usual. + +#### **Step 2: Replace the Original Attention Class** + +Replace the original `SamVisionAttention` class with your custom class so that the model uses the modified attention mechanism. + +```python +from transformers import SamModel +from transformers.models.sam import modeling_sam + +# Replace the attention class in the modeling_sam module +modeling_sam.SamVisionAttention = SamVisionAttentionSplit + +# Load the pre-trained SAM model +model = SamModel.from_pretrained("facebook/sam-vit-base") +``` + +**Explanation:** + +- **Class Replacement:** By assigning your custom class to `modeling_sam.SamVisionAttention`, any instances of `SamVisionAttention` in the model will use the modified version. Thus when you call `SamModel`, it will use the newly defined `SamVisionAttentionSplit`. +- **Model Loading:** The model is loaded using `from_pretrained`, and the custom attention mechanism is integrated. + +#### **Step 3: Apply LoRA to Specific Projections** + +With separate `q`, `k`, and `v` projections, you can now apply LoRA to specific components, such as the `q` and `v` projections. 
+ +```python +from peft import LoraConfig, get_peft_model + +config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q", "v"], # Apply LoRA to q and v projections + lora_dropout=0.1, + task_type="mask-generation" +) + +# Apply LoRA to the model +model = get_peft_model(model, config) +``` + +**Explanation:** + +- **LoRA Configuration:** The `LoraConfig` specifies the rank `r`, scaling factor `lora_alpha`, target modules (`"q"` and `"v"`), dropout, and task type. +- **Applying LoRA:** The `get_peft_model` function applies LoRA to the specified modules in the model. +- **Parameter Reduction:** By focusing on `q` and `v`, you reduce the number of trainable parameters, leading to faster training and lower memory usage. + +#### **Step 4: Verify the Number of Trainable Parameters** + +It's simple to verify the number of trainable parameters and see what impact your modification had. + +```python +model.print_trainable_parameters() +``` + +**Expected Output:** + +``` +trainable params: 608,256 || all params: 94,343,728 || trainable%: 0.6447 +trainable params: 912,384 || all params: 94,647,856 || trainable%: 0.9640 # with k +``` + +## Contributing Your Own Hacks + +Modifying pre-trained models can open up new avenues for research and application. By understanding and adjusting the internal mechanisms of models like SAM, you can tailor them to your specific needs, optimize performance, and experiment with new ideas. + +If you've developed your own hacks for Transformers models and would like to share them, consider contributing to this doc. + +- **Open a Pull Request:** Share your code changes and improvements directly in the repository. +- **Write Documentation:** Provide clear explanations and examples of your modifications. +- **Engage with the Community:** Discuss your ideas and get feedback from other developers and researchers by opening an issue. \ No newline at end of file diff --git a/docs/source/en/hpo_train.md b/docs/source/en/hpo_train.md index c516c501f88228..49dde04fe60694 100644 --- a/docs/source/en/hpo_train.md +++ b/docs/source/en/hpo_train.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. # Hyperparameter Search using Trainer API -🤗 Transformers provides a [`Trainer`] class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. The [`Trainer`] provides API for hyperparameter search. This doc shows how to enable it in example. +🤗 Transformers provides a [`Trainer`] class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. The [`Trainer`] provides API for hyperparameter search. This doc shows how to enable it in example. ## Hyperparameter Search backend @@ -24,7 +24,7 @@ rendered properly in your Markdown viewer. you should install them before using them as the hyperparameter search backend ```bash -pip install optuna/sigopt/wandb/ray[tune] +pip install optuna/sigopt/wandb/ray[tune] ``` ## How to enable Hyperparameter search in example @@ -112,7 +112,7 @@ Create a [`Trainer`] with your `model_init` function, training arguments, traini ... train_dataset=small_train_dataset, ... eval_dataset=small_eval_dataset, ... compute_metrics=compute_metrics, -... tokenizer=tokenizer, +... processing_class=tokenizer, ... model_init=model_init, ... data_collator=data_collator, ... 
) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 0a5518fd71c840..32a730e6bcfca8 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -256,6 +256,7 @@ Flax), PyTorch, and/or TensorFlow. | [Persimmon](model_doc/persimmon) | ✅ | ❌ | ❌ | | [Phi](model_doc/phi) | ✅ | ❌ | ❌ | | [Phi3](model_doc/phi3) | ✅ | ❌ | ❌ | +| [Phimoe](model_doc/phimoe) | ✅ | ❌ | ❌ | | [PhoBERT](model_doc/phobert) | ✅ | ✅ | ✅ | | [Pix2Struct](model_doc/pix2struct) | ✅ | ❌ | ❌ | | [Pixtral](model_doc/pixtral) | ✅ | ❌ | ❌ | @@ -360,6 +361,7 @@ Flax), PyTorch, and/or TensorFlow. | [XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2) | ✅ | ✅ | ✅ | | [YOLOS](model_doc/yolos) | ✅ | ❌ | ❌ | | [YOSO](model_doc/yoso) | ✅ | ❌ | ❌ | +| [Zamba](model_doc/zamba) | ✅ | ❌ | ❌ | | [ZoeDepth](model_doc/zoedepth) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/llm_tutorial.md b/docs/source/en/llm_tutorial.md index ac6386d85318a6..097d7bf1e9ca38 100644 --- a/docs/source/en/llm_tutorial.md +++ b/docs/source/en/llm_tutorial.md @@ -164,7 +164,7 @@ If not specified in the [`~generation.GenerationConfig`] file, `generate` return By default, and unless specified in the [`~generation.GenerationConfig`] file, `generate` selects the most likely token at each iteration (greedy decoding). Depending on your task, this may be undesirable; creative tasks like chatbots or writing an essay benefit from sampling. On the other hand, input-grounded tasks like audio transcription or translation benefit from greedy decoding. Enable sampling with `do_sample=True`, and you can learn more about this topic in this [blog post](https://huggingface.co/blog/how-to-generate). ```py ->>> # Set seed or reproducibility -- you don't need this unless you want full reproducibility +>>> # Set seed for reproducibility -- you don't need this unless you want full reproducibility >>> from transformers import set_seed >>> set_seed(42) diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md index a2f831f65976ec..3f44569697777b 100755 --- a/docs/source/en/main_classes/quantization.md +++ b/docs/source/en/main_classes/quantization.md @@ -68,3 +68,7 @@ Learn how to quantize models in the [Quantization](../quantization) guide. ## TorchAoConfig [[autodoc]] TorchAoConfig + +## BitNetConfig + +[[autodoc]] BitNetConfig diff --git a/docs/source/en/model_doc/auto.md b/docs/source/en/model_doc/auto.md index ab42c24d83e82d..059312850876d2 100644 --- a/docs/source/en/model_doc/auto.md +++ b/docs/source/en/model_doc/auto.md @@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks. ### FlaxAutoModelForVision2Seq [[autodoc]] FlaxAutoModelForVision2Seq + +### AutoModelForImageTextToText + +[[autodoc]] AutoModelForImageTextToText diff --git a/docs/source/en/model_doc/distilbert.md b/docs/source/en/model_doc/distilbert.md index 844927e71984a9..10f7c2d757a21a 100644 --- a/docs/source/en/model_doc/distilbert.md +++ b/docs/source/en/model_doc/distilbert.md @@ -66,6 +66,53 @@ contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code * predicting the masked tokens correctly (but no next-sentence objective) * a cosine similarity between the hidden states of the student and the teacher model +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. 
See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import DistilBertModel +model = DistilBertModel.from_pretrained("distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa") +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.3.1, OS Ubuntu 20.04) with `float16` and the `distilbert-base-uncased` model with +a MaskedLM head, we saw the following speedups during training and inference. + +#### Training + +| num_training_steps | batch_size | seq_len | is cuda | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | sdpa peak mem (MB) | Mem saving (%) | +|--------------------|------------|---------|---------|----------------------------|---------------------------|-------------|---------------------|--------------------|----------------| +| 100 | 1 | 128 | False | 0.010 | 0.008 | 28.870 | 397.038 | 399.629 | -0.649 | +| 100 | 1 | 256 | False | 0.011 | 0.009 | 20.681 | 412.505 | 412.606 | -0.025 | +| 100 | 2 | 128 | False | 0.011 | 0.009 | 23.741 | 412.213 | 412.606 | -0.095 | +| 100 | 2 | 256 | False | 0.015 | 0.013 | 16.502 | 427.491 | 425.787 | 0.400 | +| 100 | 4 | 128 | False | 0.015 | 0.013 | 13.828 | 427.491 | 425.787 | 0.400 | +| 100 | 4 | 256 | False | 0.025 | 0.022 | 12.882 | 594.156 | 502.745 | 18.182 | +| 100 | 8 | 128 | False | 0.023 | 0.022 | 8.010 | 545.922 | 502.745 | 8.588 | +| 100 | 8 | 256 | False | 0.046 | 0.041 | 12.763 | 983.450 | 798.480 | 23.165 | + +#### Inference + +| num_batches | batch_size | seq_len | is cuda | is half | use mask | Per token latency eager (ms) | Per token latency SDPA (ms) | Speedup (%) | Mem eager (MB) | Mem BT (MB) | Mem saved (%) | +|-------------|------------|---------|---------|---------|----------|-----------------------------|-----------------------------|-------------|----------------|--------------|---------------| +| 50 | 2 | 64 | True | True | True | 0.032 | 0.025 | 28.192 | 154.532 | 155.531 | -0.642 | +| 50 | 2 | 128 | True | True | True | 0.033 | 0.025 | 32.636 | 157.286 | 157.482 | -0.125 | +| 50 | 4 | 64 | True | True | True | 0.032 | 0.026 | 24.783 | 157.023 | 157.449 | -0.271 | +| 50 | 4 | 128 | True | True | True | 0.034 | 0.028 | 19.299 | 162.794 | 162.269 | 0.323 | +| 50 | 8 | 64 | True | True | True | 0.035 | 0.028 | 25.105 | 160.958 | 162.204 | -0.768 | +| 50 | 8 | 128 | True | True | True | 0.052 | 0.046 | 12.375 | 173.155 | 171.844 | 0.763 | +| 50 | 16 | 64 | True | True | True | 0.051 | 0.045 | 12.882 | 172.106 | 171.713 | 0.229 | +| 50 | 16 | 128 | True | True | True | 0.096 | 0.081 | 18.524 | 191.257 | 191.517 | -0.136 | + ## Resources diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index a7e4b4da7f3c5a..99950a2ffd8e93 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -40,7 +40,9 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ - Note the model has not been explicitly 
trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results.
 
-- For better results, we recommend users to use the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history, passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows:
+### Single image inference
+
+For best results, we recommend using the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history; passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows:
 
 ```python
 from transformers import AutoProcessor
 
@@ -75,6 +77,60 @@ print(text_prompt)
 >>> "USER: <image>\nUSER: Describe the image in more details. ASSISTANT:"
 ```
 
+### Batched inference
+
+LLaVa also supports batched inference. Here is how you can do it:
+
+```python
+import requests
+from PIL import Image
+import torch
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+
+# Load the model in half-precision
+model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto")
+processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
+
+# Get two different images
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image_stop = Image.open(requests.get(url, stream=True).raw)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image_cats = Image.open(requests.get(url, stream=True).raw)
+
+# Prepare a batch of two prompts
+conversation_1 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": "What is shown in this image?"},
+        ],
+    },
+]
+
+conversation_2 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": "What is shown in this image?"},
+        ],
+    },
+]
+
+prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True)
+prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)
+prompts = [prompt_1, prompt_2]
+
+# We can simply feed images in the order they have to be used in the text prompt
+inputs = processor(images=[image_stop, image_cats], text=prompts, padding=True, return_tensors="pt").to(model.device, torch.float16)
+
+# Generate
+generate_ids = model.generate(**inputs, max_new_tokens=30)
+processor.batch_decode(generate_ids, skip_special_tokens=True)
+```
+
 - If you want to construct a chat prompt yourself, below is a list of prompt formats accepted by each llava checkpoint:
 
 [llava-interleave models](https://huggingface.co/collections/llava-hf/llava-interleave-668e19a97da0036aad4a2f19) requires the following format:
@@ -99,7 +155,6 @@ For multiple turns conversation:
 "USER: <image>\n<prompt1> ASSISTANT: <answer1>USER: <prompt2> ASSISTANT: <answer2>USER: <prompt3> ASSISTANT:"
 ```
 
-
 ### Using Flash Attention 2
 
 Flash Attention 2 is an even faster, optimized version of the previous optimization, please refer to the [Flash Attention 2 section of performance docs](https://huggingface.co/docs/transformers/perf_infer_gpu_one).
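+
+As a quick illustration of that pointer (a minimal sketch, assuming a supported CUDA GPU and an installed `flash-attn` package), loading the same checkpoint with Flash Attention 2 enabled might look like this:
+
+```python
+import torch
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+
+# Sketch: request Flash Attention 2 explicitly via attn_implementation,
+# keeping half-precision weights as in the examples above.
+model = LlavaForConditionalGeneration.from_pretrained(
+    "llava-hf/llava-1.5-7b-hf",
+    torch_dtype=torch.float16,
+    attn_implementation="flash_attention_2",
+    device_map="auto",
+)
+processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
+```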
diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index 4c041f4e8963ab..b9146fbd33478a 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -166,10 +166,10 @@ LLaVa-Next can perform inference with multiple images as input, where images eit import requests from PIL import Image import torch -from transformers import AutoProcessor, LlavaNextForConditionalGeneration +from transformers import AutoProcessor, AutoModelForImageTextToText # Load the model in half-precision -model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto") +model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto") processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") # Get three different images @@ -246,7 +246,7 @@ We value your feedback to help identify bugs before the full release! Check out Simply change the snippet above with: ```python -from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig +from transformers import AutoModelForImageTextToText, BitsAndBytesConfig # specify how to quantize the model quantization_config = BitsAndBytesConfig( @@ -255,7 +255,7 @@ quantization_config = BitsAndBytesConfig( bnb_4bit_compute_dtype=torch.float16, ) -model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto") +model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto") ``` ### Use Flash-Attention 2 to further speed-up generation @@ -263,9 +263,9 @@ model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-m First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) regarding that package installation. Simply change the snippet above with: ```python -from transformers import LlavaNextForConditionalGeneration +from transformers import AutoModelForImageTextToText -model = LlavaNextForConditionalGeneration.from_pretrained( +model = AutoModelForImageTextToText.from_pretrained( model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, diff --git a/docs/source/en/model_doc/llava_onevision.md b/docs/source/en/model_doc/llava_onevision.md index 717784da738d8c..b6b0a2bfa1d123 100644 --- a/docs/source/en/model_doc/llava_onevision.md +++ b/docs/source/en/model_doc/llava_onevision.md @@ -14,13 +14,13 @@ rendered properly in your Markdown viewer. --> -# LLaVA-Onevision +# LLaVA-OneVision ## Overview -The LLaVA-Onevision model was proposed in [LLaVA-OneVision: Easy Visual Task Transfer](https://arxiv.org/abs/2408.03326) by - LLaVA=Onevision architecture. Taken from the original paper. + LLaVA-OneVision architecture. Taken from the original paper. Tips: @@ -44,7 +43,7 @@ Tips: -- Llava-Onevision uses different number of patches for images and thus has to pad the inputs inside modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if model is in `eval()` mode, otherwise "right-padding". +- Llava-OneVision uses different number of patches for images and thus has to pad the inputs inside modeling code, aside from the padding done when processing the inputs. 
The default setting is "left-padding" if model is in `eval()` mode, otherwise "right-padding". @@ -129,7 +128,7 @@ print(processor.decode(output[0], skip_special_tokens=True)) ### Multi image inference -LLaVa-Onevision can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). For that you have to use checkpoints with an "ov" suffix. Here is how you can do it: +LLaVa-OneVision can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). For that you have to use checkpoints with an "ov" suffix. Here is how you can do it: ```python import requests @@ -200,7 +199,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Video inference -LLaVa-Onevision also can perform inference with videos as input, where video frames are treated as multiple images. Here is how you can do it: +LLaVa-OneVision also can perform inference with videos as input, where video frames are treated as multiple images. Here is how you can do it: ```python import av diff --git a/docs/source/en/model_doc/mamba.md b/docs/source/en/model_doc/mamba.md index 94eb2e2c2d528d..317948331eb102 100644 --- a/docs/source/en/model_doc/mamba.md +++ b/docs/source/en/model_doc/mamba.md @@ -39,8 +39,8 @@ The original code can be found [here](https://github.com/state-spaces/mamba). # Usage -### A simple generation example: -```python +### A simple generation example: +```python from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer import torch @@ -55,7 +55,7 @@ print(tokenizer.batch_decode(out)) ### Peft finetuning The slow version is not very stable for training, and the fast one needs `float32`! -```python +```python from datasets import load_dataset from trl import SFTTrainer from peft import LoraConfig @@ -80,7 +80,7 @@ lora_config = LoraConfig( ) trainer = SFTTrainer( model=model, - tokenizer=tokenizer, + processing_class=tokenizer, args=training_args, peft_config=lora_config, train_dataset=dataset, diff --git a/docs/source/en/model_doc/myt5.md b/docs/source/en/model_doc/myt5.md new file mode 100644 index 00000000000000..c8b46f43512b6e --- /dev/null +++ b/docs/source/en/model_doc/myt5.md @@ -0,0 +1,46 @@ + + +# myt5 + +## Overview + +The myt5 model was proposed in [MYTE: Morphology-Driven Byte Encoding for Better and Fairer Multilingual Language Modeling](https://arxiv.org/pdf/2403.10691.pdf) by Tomasz Limisiewicz, Terra Blevins, Hila Gonen, Orevaoghene Ahia, and Luke Zettlemoyer. +MyT5 (**My**te **T5**) is a multilingual language model based on T5 architecture. +The model uses a **m**orphologically-driven **byte** (**MYTE**) representation described in our paper. +**MYTE** uses codepoints corresponding to morphemes in contrast to characters used in UTF-8 encoding. +As a pre-requisite, we used unsupervised morphological segmentation ([Morfessor](https://aclanthology.org/E14-2006.pdf)) to obtain morpheme inventories for 99 languages. +However, the morphological segmentation step is not needed when using the pre-defined morpheme inventory from the hub (see: [Tomli/myt5-base](https://huggingface.co/Tomlim/myt5-base)). + +The abstract from the paper is the following: + +*A major consideration in multilingual language modeling is how to best represent languages with diverse vocabularies and scripts. 
Although contemporary text encoding methods cover most of the world’s writing systems, they exhibit bias towards the high-resource languages of the Global West. As a result, texts of underrepresented languages tend to be segmented into long sequences of linguistically meaningless units. To address the disparities, we introduce a new paradigm that encodes the same information with segments of consistent size across diverse languages. Our encoding convention (MYTE) is based on morphemes, as their inventories are more balanced across languages than characters, which are used in previous methods. We show that MYTE produces shorter encodings for all 99 analyzed languages, with the most notable improvements for non-European languages and non-Latin scripts. This, in turn, improves multilingual LM performance and diminishes the perplexity gap throughout diverse languages.* + +This model was contributed by [Tomasz Limisiewicz](https://huggingface.co/Tomlim). +The original code can be found [here](https://github.com/tomlimi/MYTE). + +## MyT5Tokenizer + +[[autodoc]] MyT5Tokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## MyT5Tokenizer + +[[autodoc]] MyT5Tokenizer + diff --git a/docs/source/en/model_doc/opt.md b/docs/source/en/model_doc/opt.md index 1b02b888994ecf..c82064bae894f7 100644 --- a/docs/source/en/model_doc/opt.md +++ b/docs/source/en/model_doc/opt.md @@ -110,6 +110,73 @@ Below is an expected speedup diagram that compares pure inference time between t +### Using Scaled Dot Product Attention (SDPA) +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +```python +from transformers import OPTForCausalLM +model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="sdpa") +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (L40S-45GB, PyTorch 2.4.0, OS Debian GNU/Linux 11) using `float16` with +[facebook/opt-350m](https://huggingface.co/facebook/opt-350m), we saw the +following speedups during training and inference. 
+ +### Training + +| batch_size | seq_len | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | sdpa peak mem (MB) | Mem saving (%) | +|--------------:|-----------:|:------------------------------|-----------------------------:|:---------------|:-----------------------|----------------------:|:------------------| +| 1 | 128 | 0.047 | 0.037 | 26.360 | 1474.611 | 1474.32 | 0.019 | +| 1 | 256 | 0.046 | 0.037 | 24.335 | 1498.541 | 1499.49 | -0.063 | +| 1 | 512 | 0.046 | 0.037 | 24.959 | 1973.544 | 1551.35 | 27.215 | +| 1 | 1024 | 0.062 | 0.038 | 65.135 | 4867.113 | 1698.35 | 186.578 | +| 1 | 2048 | 0.230 | 0.039 | 483.933 | 15662.224 | 2715.75 | 476.718 | +| 2 | 128 | 0.045 | 0.037 | 20.455 | 1498.164 | 1499.49 | -0.089 | +| 2 | 256 | 0.046 | 0.037 | 24.027 | 1569.367 | 1551.35 | 1.161 | +| 2 | 512 | 0.045 | 0.037 | 20.965 | 3257.074 | 1698.35 | 91.778 | +| 2 | 1024 | 0.122 | 0.038 | 225.958 | 9054.405 | 2715.75 | 233.403 | +| 2 | 2048 | 0.464 | 0.067 | 593.646 | 30572.058 | 4750.55 | 543.548 | +| 4 | 128 | 0.045 | 0.037 | 21.918 | 1549.448 | 1551.35 | -0.123 | +| 4 | 256 | 0.044 | 0.038 | 18.084 | 2451.768 | 1698.35 | 44.361 | +| 4 | 512 | 0.069 | 0.037 | 84.421 | 5833.180 | 2715.75 | 114.791 | +| 4 | 1024 | 0.262 | 0.062 | 319.475 | 17427.842 | 4750.55 | 266.860 | +| 4 | 2048 | OOM | 0.062 | Eager OOM | OOM | 4750.55 | Eager OOM | +| 8 | 128 | 0.044 | 0.037 | 18.436 | 2049.115 | 1697.78 | 20.694 | +| 8 | 256 | 0.048 | 0.036 | 32.887 | 4222.567 | 2715.75 | 55.484 | +| 8 | 512 | 0.153 | 0.06 | 154.862 | 10985.391 | 4750.55 | 131.245 | +| 8 | 1024 | 0.526 | 0.122 | 330.697 | 34175.763 | 8821.18 | 287.428 | +| 8 | 2048 | OOM | 0.122 | Eager OOM | OOM | 8821.18 | Eager OOM | + +### Inference + +| batch_size | seq_len | Per token latency eager (ms) | Per token latency SDPA (ms) | Speedup (%) | Mem eager (MB) | Mem BT (MB) | Mem saved (%) | +|--------------:|-----------:|--------------------------------:|-------------------------------:|---------------:|------------------:|---------------:|-----------------:| +| 1 | 128 | 11.634 | 8.647 | 34.546 | 717.676 | 717.674 | 0 | +| 1 | 256 | 11.593 | 8.86 | 30.851 | 742.852 | 742.845 | 0.001 | +| 1 | 512 | 11.515 | 8.816 | 30.614 | 798.232 | 799.593 | -0.17 | +| 1 | 1024 | 11.556 | 8.915 | 29.628 | 917.265 | 895.538 | 2.426 | +| 2 | 128 | 12.724 | 11.002 | 15.659 | 762.434 | 762.431 | 0 | +| 2 | 256 | 12.704 | 11.063 | 14.83 | 816.809 | 816.733 | 0.009 | +| 2 | 512 | 12.757 | 10.947 | 16.535 | 917.383 | 918.339 | -0.104 | +| 2 | 1024 | 13.018 | 11.018 | 18.147 | 1162.65 | 1114.81 | 4.291 | +| 4 | 128 | 12.739 | 10.959 | 16.243 | 856.335 | 856.483 | -0.017 | +| 4 | 256 | 12.718 | 10.837 | 17.355 | 957.298 | 957.674 | -0.039 | +| 4 | 512 | 12.813 | 10.822 | 18.393 | 1158.44 | 1158.45 | -0.001 | +| 4 | 1024 | 13.416 | 11.06 | 21.301 | 1653.42 | 1557.19 | 6.18 | +| 8 | 128 | 12.763 | 10.891 | 17.193 | 1036.13 | 1036.51 | -0.036 | +| 8 | 256 | 12.89 | 11.104 | 16.085 | 1236.98 | 1236.87 | 0.01 | +| 8 | 512 | 13.327 | 10.939 | 21.836 | 1642.29 | 1641.78 | 0.031 | +| 8 | 1024 | 15.181 | 11.175 | 35.848 | 2634.98 | 2443.35 | 7.843 | ## OPTConfig diff --git a/docs/source/en/model_doc/phimoe.md b/docs/source/en/model_doc/phimoe.md new file mode 100644 index 00000000000000..d9c9ae4a1831c7 --- /dev/null +++ b/docs/source/en/model_doc/phimoe.md @@ -0,0 +1,118 @@ + + +# PhiMoE + +## Overview + +The PhiMoE model was proposed in [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your 
Phone](https://arxiv.org/abs/2404.14219) by Microsoft. + +### Summary + +The abstract from the Phi-3 paper is the following: + +We introduce phi-3-mini, a 3.8 billion parameter language model trained on 3.3 trillion tokens, whose overall performance, as measured by both academic benchmarks and internal testing, rivals that of models such as Mixtral 8x7B and GPT-3.5 (e.g., phi-3-mini achieves 69% on MMLU and 8.38 on MT-bench), despite being small enough to be deployed on a phone. Our training dataset is a scaled-up version of the one used for phi-2, composed of heavily filtered publicly available web data and synthetic data. The model is also further aligned for robustness, safety, and chat format. We also provide parameter-scaling results with a 7B, 14B models trained for 4.8T tokens, called phi-3-small, phi-3-medium, both significantly more capable than phi-3-mini (e.g., respectively 75%, 78% on MMLU, and 8.7, 8.9 on MT-bench). To enhance multilingual, multimodal, and long-context capabilities, we introduce three models in the phi-3.5 series: phi-3.5-mini, phi-3.5-MoE, and phi-3.5-Vision. The phi-3.5-MoE, a 16 x 3.8B MoE model with 6.6 billion active parameters, achieves superior performance in language reasoning, math, and code tasks compared to other open-source models of similar scale, such as Llama 3.1 and the Mixtral series, and on par with Gemini-1.5-Flash and GPT-4o-mini. Meanwhile, phi-3.5-Vision, a 4.2 billion parameter model derived from phi-3.5-mini, excels in reasoning tasks and is adept at handling both single-image and text prompts, as well as multi-image and text prompts. + +The original code for PhiMoE can be found [here](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct). + +## Usage tips + +- This model is very similar to `Mixtral` with the main difference of [`Phi3LongRoPEScaledRotaryEmbedding`], where they are used to extend the context of the rotary embeddings. The query, key and values are fused, and the MLP's up and gate projection layers are also fused. +- The tokenizer used for this model is identical to the [`LlamaTokenizer`], with the exception of additional tokens. + +## How to use PhiMoE + + + +Phi-3.5-MoE-instruct has been integrated in the development version (4.44.2.dev) of `transformers`. Until the official version is released through `pip`, ensure that you are doing the following: +* When loading the model, ensure that `trust_remote_code=True` is passed as an argument of the `from_pretrained()` function. + +The current `transformers` version can be verified with: `pip list | grep transformers`. + +Examples of required packages: +``` +flash_attn==2.5.8 +torch==2.3.1 +accelerate==0.31.0 +transformers==4.43.0 +``` + + + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline + +torch.random.manual_seed(0) + +model = AutoModelForCausalLM.from_pretrained( + "microsoft/Phi-3.5-MoE-instruct", + device_map="cuda", + torch_dtype="auto", + trust_remote_code=True, +) + +tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct") + +messages = [ + {"role": "system", "content": "You are a helpful AI assistant."}, + {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, + {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. 
Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."}, + {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"}, +] + +pipe = pipeline( + "text-generation", + model=model, + tokenizer=tokenizer, +) + +generation_args = { + "max_new_tokens": 500, + "return_full_text": False, + "temperature": 0.0, + "do_sample": False, +} + +output = pipe(messages, **generation_args) +print(output[0]['generated_text']) +``` + +## PhimoeConfig + +[[autodoc]] PhimoeConfig + + + + +## PhimoeModel + +[[autodoc]] PhimoeModel + - forward + +## PhimoeForCausalLM + +[[autodoc]] PhimoeForCausalLM + - forward + - generate + +## PhimoeForSequenceClassification + +[[autodoc]] PhimoeForSequenceClassification + - forward + + + diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index c21938698db04e..ab604e4521fc73 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -18,69 +18,62 @@ rendered properly in your Markdown viewer. ## Overview -The Pixtral model was released by the Mistral AI team on [vLLM](https://github.com/vllm-project/vllm/pull/8377), where a version of the code can be found! +The Pixtral model was released by the Mistral AI team in a [blog post](https://mistral.ai/news/pixtral-12b/). Pixtral is a multimodal version of [Mistral](mistral), incorporating a 400 million parameter vision encoder trained from scratch. + +The intro from the blog says the following: + +*Pixtral is trained to understand both natural images and documents, achieving 52.5% on the MMMU reasoning benchmark, surpassing a number of larger models. The model shows strong abilities in tasks such as chart and figure understanding, document question answering, multimodal reasoning and instruction following. Pixtral is able to ingest images at their natural resolution and aspect ratio, giving the user flexibility on the number of tokens used to process an image. Pixtral is also able to process any number of images in its long context window of 128K tokens. Unlike previous open-source models, Pixtral does not compromise on text benchmark performance to excel in multimodal tasks.* + + + + Pixtral architecture. Taken from the blog post. Tips: - Pixtral is a multimodal model, taking images and text as input, and producing text as output. -- This model follows the [Llava](llava) family, meaning image embeddings are placed instead of the `[IMG]` token placeholders. The model uses [`PixtralVisionModel`] for its vision encoder, and [`MistralForCausalLM`] for its language decoder. -- The main contribution is the 2d ROPE (rotary postiion embeddings) on the images, and support for arbitrary image sizes (the images are not padded together nor are they resized). -- The format for one or mulitple prompts is the following: +- This model follows the [Llava](llava) architecture. The model uses [`PixtralVisionModel`] for its vision encoder, and [`MistralForCausalLM`] for its language decoder. +- The main contribution is the 2d ROPE (rotary position embeddings) on the images, and support for arbitrary image sizes (the images are not padded together nor are they resized). +- Similar to [Llava](llava), the model internally replaces the `[IMG]` token placeholders by image embeddings from the vision encoder. 
The format for one or multiple prompts is the following: ``` "[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]" ``` -Then, the processor will replace each `[IMG]` token with a number of `[IMG]` token that depends on the height and the width of the image. Each *row* of the image is separated by a `[IMG_BREAK]` token, and each image is separated by a `[IMG_END]` token. +Then, the processor will replace each `[IMG]` token with a number of `[IMG]` tokens that depend on the height and the width of each image. Each *row* of the image is separated by an `[IMG_BREAK]` token, and each image is separated by an `[IMG_END]` token. It's advised to use the `apply_chat_template` method of the processor, which takes care of all of this. See the [usage section](#usage) for more info. This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [ArthurZ](https://huggingface.co/ArthurZ). The original code can be found [here](https://github.com/vllm-project/vllm/pull/8377). ## Usage -Here is an example of how to run it: +At inference time, it's advised to use the processor's `apply_chat_template` method, which correctly formats the prompt for the model: ```python -from transformers import LlavaForConditionalGeneration, AutoProcessor +from transformers import AutoProcessor, LlavaForConditionalGeneration from PIL import Image model_id = "mistral-community/pixtral-12b" -model = LlavaForConditionalGeneration.from_pretrained(model_id).to("cuda") processor = AutoProcessor.from_pretrained(model_id) +model = LlavaForConditionalGeneration.from_pretrained(model_id).to("cuda") -IMG_URLS = [ - "https://picsum.photos/id/237/400/300", - "https://picsum.photos/id/231/200/300", - "https://picsum.photos/id/27/500/500", - "https://picsum.photos/id/17/150/600", +url_dog = "https://picsum.photos/id/237/200/300" +url_mountain = "https://picsum.photos/seed/picsum/200/300" + +chat = [ + { + "role": "user", "content": [ + {"type": "text", "content": "Can this animal"}, + {"type": "image"}, + {"type": "text", "content": "live here?"}, + {"type": "image"} + ] + } ] -PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" -inputs = processor(images=IMG_URLS, text=PROMPT, return_tensors="pt").to("cuda") +prompt = processor.apply_chat_template(chat) +inputs = processor(text=prompt, images=[url_dog, url_mountain], return_tensors="pt").to(model.device) generate_ids = model.generate(**inputs, max_new_tokens=500) output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - -EXPECTED_GENERATION = """ -Describe the images. -Sure, let's break down each image description: - -1. **Image 1:** - - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera. - - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur. - -2. **Image 2:** - - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley. - - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image. - -3. **Image 3:** - - **Description:** A beach scene with waves crashing against the shore. 
There are several people in the water and on the beach, enjoying the waves and the sunset. - - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene. - -4. **Image 4:** - - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers. - - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden. - -Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it. -""" - ``` + ## PixtralVisionConfig [[autodoc]] PixtralVisionConfig diff --git a/docs/source/en/model_doc/qwen2_vl.md b/docs/source/en/model_doc/qwen2_vl.md index 448a462152ee60..7c864b860bd8ea 100644 --- a/docs/source/en/model_doc/qwen2_vl.md +++ b/docs/source/en/model_doc/qwen2_vl.md @@ -14,17 +14,22 @@ rendered properly in your Markdown viewer. --> -# Qwen2_VL - +# Qwen2-VL ## Overview -The [Qwen2_VL](https://qwenlm.github.io/blog/qwen2-vl/) is a major update to our [Qwen-VL](https://arxiv.org/pdf/2308.12966) model from the Qwen team. +The [Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/) model is a major update to [Qwen-VL](https://arxiv.org/pdf/2308.12966) from the Qwen team at Alibaba Research. The abstract from the blog is the following: *This blog introduces Qwen2-VL, an advanced version of the Qwen-VL model that has undergone significant enhancements over the past year. Key improvements include enhanced image comprehension, advanced video understanding, integrated visual agent functionality, and expanded multilingual support. The model architecture has been optimized for handling arbitrary image resolutions through Naive Dynamic Resolution support and utilizes Multimodal Rotary Position Embedding (M-ROPE) to effectively process both 1D textual and multi-dimensional visual data. This updated model demonstrates competitive performance against leading AI systems like GPT-4o and Claude 3.5 Sonnet in vision-related tasks and ranks highly among open-source models in text capabilities. These advancements make Qwen2-VL a versatile tool for various applications requiring robust multimodal processing and reasoning abilities.* + + + Qwen2-VL architecture. Taken from the blog post. + +This model was contributed by [simonJJJ](https://huggingface.co/simonJJJ). ## Usage example @@ -78,8 +83,6 @@ generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(in output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) print(output_text) - - # Video def fetch_video(ele: Dict, nframe_factor=2): if isinstance(ele['video'], str): @@ -130,16 +133,13 @@ output_ids = model.generate(**inputs, max_new_tokens=128) generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)] output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) print(output_text) - ``` - ### Batch Mixed Media Inference The model can batch inputs composed of mixed samples of various types such as images, videos, and text. Here is an example. 
```python - image1 = Image.open("/path/to/image1.jpg") image2 = Image.open("/path/to/image2.jpg") image3 = Image.open("/path/to/image3.jpg") @@ -217,26 +217,30 @@ print(output_text) ### Usage Tips -#### Image Resolution for performance boost +#### Image Resolution trade-off The model supports a wide range of resolution inputs. By default, it uses the native resolution for input, but higher resolutions can enhance performance at the cost of more computation. Users can set the minimum and maximum number of pixels to achieve an optimal configuration for their needs. ```python - min_pixels = 224*224 max_pixels = 2048*2048 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels) +``` + +In case of limited GPU RAM, one can reduce the resolution as follows: +```python +min_pixels = 256*28*28 +max_pixels = 1024*28*28 +processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels) ``` +This ensures each image gets encoded using a number between 256-1024 tokens. The 28 comes from the fact that the model uses a patch size of 14 and a temporal patch size of 2 (14 x 2 = 28). #### Multiple Image Inputs By default, images and video content are directly included in the conversation. When handling multiple images, it's helpful to add labels to the images and videos for better reference. Users can control this behavior with the following settings: - - ```python - conversation = [ { "role": "user", @@ -302,7 +306,6 @@ model = Qwen2VLForConditionalGeneration.from_pretrained( ) ``` - ## Qwen2VLConfig [[autodoc]] Qwen2VLConfig diff --git a/docs/source/en/model_doc/sam.md b/docs/source/en/model_doc/sam.md index 9a16e6255a062d..f45b08c2c23540 100644 --- a/docs/source/en/model_doc/sam.md +++ b/docs/source/en/model_doc/sam.md @@ -93,7 +93,6 @@ masks = processor.image_processor.post_process_masks( ) scores = outputs.iou_scores ``` - ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SAM. diff --git a/docs/source/en/model_doc/zamba.md b/docs/source/en/model_doc/zamba.md new file mode 100644 index 00000000000000..450b68c77d6dac --- /dev/null +++ b/docs/source/en/model_doc/zamba.md @@ -0,0 +1,100 @@ + +# Zamba + +Zamba is a large language model (LLM) trained by Zyphra, and made available under an Apache 2.0 license. Please see the [Zyphra Hugging Face](https://huggingface.co/collections/zyphra/) repository for model weights. + +This model was contributed by [pglo](https://huggingface.co/pglo). + + +## Model details + +Zamba-7B-v1 is a hybrid between state-space models (Specifically [Mamba](https://github.com/state-spaces/mamba)) and transformer, and was trained using next-token prediction. Zamba uses a shared transformer layer after every 6 mamba blocks. It uses the [Mistral v0.1 tokenizer](https://huggingface.co/mistralai/Mistral-7B-v0.1). We came to this architecture after a series of ablations at small scales. Zamba-7B-v1 was pre-trained on 1T tokens of text and code data. + + + +## Quick start + + +### Presequities + +Zamba requires you use `transformers` version 4.46.0 or higher: +```bash +pip install transformers>=4.45.0 +``` + +In order to run optimized Mamba implementations, you first need to install `mamba-ssm` and `causal-conv1d`: +```bash +pip install mamba-ssm causal-conv1d>=1.2.0 +``` +You also have to have the model on a CUDA device. 
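+
+As the note below explains, the optimized kernels can also be disabled with `use_mamba_kernels=False`. A minimal sketch of that fallback, reusing the checkpoint and loading options from the inference example further down (illustrative only):
+
+```python
+import torch
+from transformers import AutoModelForCausalLM
+
+# Fallback when `mamba-ssm` / `causal-conv1d` are not installed:
+# disable the optimized Mamba kernels (slower, see the note below).
+model = AutoModelForCausalLM.from_pretrained(
+    "Zyphra/Zamba-7B-v1",
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    use_mamba_kernels=False,
+)
+```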
+ +You can run the model not using the optimized Mamba kernels, but it is **not** recommended as it will result in significantly lower latencies. In order to do that, you'll need to specify `use_mamba_kernels=False` when loading the model. + + +## Inference + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + +tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1") +model = AutoModelForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1", device_map="auto", torch_dtype=torch.bfloat16) + +input_text = "A funny prompt would be " +input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + +outputs = model.generate(**input_ids, max_new_tokens=100) +print(tokenizer.decode(outputs[0])) +``` + + +## Model card + +The model cards can be found at: +* [Zamba-7B](MODEL_CARD_ZAMBA-7B-v1.md) + + +## Issues +For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/zyphra/zamba-7b) + + +## License + +The model weights are open-sourced via an Apache 2.0 license. + + +## ZambaConfig + +[[autodoc]] ZambaConfig + + +## ZambaModel + +[[autodoc]] ZambaModel + - forward + + +## ZambaForCausalLM + +[[autodoc]] ZambaForCausalLM + - forward + + +## ZambaForSequenceClassification + +[[autodoc]] transformers.ZambaForSequenceClassification + - forward diff --git a/docs/source/en/modular_transformers.md b/docs/source/en/modular_transformers.md index 33d2bb9483482a..1516233ec4d6e1 100644 --- a/docs/source/en/modular_transformers.md +++ b/docs/source/en/modular_transformers.md @@ -52,6 +52,7 @@ For example: reference it (in case of addition) or completely remove it (in case of deletion). - If a class inherits from another, for example: class GemmaModel(LlamaModel):, dependencies are automatically inferred. All submodules will be automatically inferred from the superclass. +- If you define new functions in the `modular` and use them inside classes, the linter will automatically infer the You should be able to write everything (the tokenizer, the image processor, the model, the config) in this `modular` file, and the corresponding files will be created for you. @@ -118,4 +119,79 @@ Additionally, you may find a list of examples here: ## What it is not -It is not a replacement for the modeling code (yet?), and if your model is not based on anything else that ever existed, then you can add a `modeling` file as usual. \ No newline at end of file +It is not a replacement for the modeling code (yet?), and if your model is not based on anything else that ever existed, then you can add a `modeling` file as usual. 
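+
+To make the basic mechanism concrete before the advanced cases below, here is a minimal sketch of what a `modular_*.py` file can look like, based on the `class GemmaModel(LlamaModel)` example given earlier on this page. Import paths and class names are illustrative rather than the actual Gemma modular file; the linter expands such a file into the corresponding `modeling_*.py` and `configuration_*.py`:
+
+```python
+# modular_gemma.py -- illustrative sketch only, not the real Gemma modular file
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaModel
+
+
+class GemmaConfig(LlamaConfig):
+    model_type = "gemma"
+
+
+class GemmaModel(LlamaModel):
+    # Inheriting is enough: submodules and dependencies are inferred from
+    # LlamaModel and copied into the generated modeling file.
+    pass
+
+
+class GemmaForCausalLM(LlamaForCausalLM):
+    pass
+```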
+ + +## Advanced usage + +### Removing attributes and functions +To remove attributes that are not used in your modular model, and that you don't want to see in the unravelled modeling: + +```python +class GemmaModel(LlamaModel): | class GemmaModel(PreTrainedModel): + def __init__(self, config): | def __init__(self, config): + super().__init__(self, eos_token) | super().__init__(config) + del self.embed_tokens | self.padding_idx = config.pad_token_id + | self.vocab_size = config.vocab_size + | + | self.layers = nn.ModuleList( + | [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + | ) + | self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + | self.rotary_emb = LlamaRotaryEmbedding(config=config) + | self.gradient_checkpointing = False + | + | # Initialize weights and apply final processing + | self.post_init() +``` +If you check the original `LlamaModel`, it has a `embed_tokens` which was removed here (as you would expect!) + +Removing a function is pretty similar, you just need to write it with a `raise ValueError("")` to mimick the behaviour you actually want when you remove a parent function in python. + +```python +class GemmaTokenizer(LlamaTokenizer): + ... + + def get_spm_processor(self): + raise AttributeError("Not needed for Gemma") + + def unk_token_length(self): + raise AttributeError("Not needed for Gemma") +``` + +### Define new functions + +If you define a new function in the `modular` file to be used inside a class, say + +```python +def my_new_function(*args, **kwargs): + # Do something here + pass + +class GemmaModel(LlamaModel): + def forward(*args, **kwargs): + # Call the function + example = my_new_function(*args, **kwargs) + # continue here +``` + +the `my_new_function` function (and, recursively, any other new functions called in its body) will be automatically copy-pasted +in the file where it is used. + +### Calling `super()` +We recently shipped a few features that allow you to go from: +```python +class GemmaTokenizer(LlamaTokenizer, PretrainedTokenizerFast): | class GemmaModel(nn.Module): + def __init__(self, eos_token=""): | def __init__(self): + eos_token = AddedToken(eos_token) | eos_token = AddedToken(eos_token) + PretrainedTokenizerFast.__init__(self, eos_token) | super().__init__(eos_token) +``` +This is useful want you **don't** want to unravel the call to `super()`, and you want to differentiate which super init call you are doing! + +### Special naming +We now also support special cases like +```python +class GemmaVisionModel(CLIPModel): + pass +``` +where the name of your class `GemmaVision` is not the same as the modular `Gemma`. This is super useful for composite models. 
\ No newline at end of file diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 346759aa2b2517..cf2dac617ffa3d 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -79,6 +79,7 @@ FlashAttention-2 is currently supported for the following architectures: * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel) * [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel) * [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model) +* [PhiMoE](https://huggingface.co/docs/transformers/model_doc/phimoe#transformers.PhimoeModel) * [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel) * [Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2#transformers.Starcoder2Model) * [Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2Model) @@ -219,6 +220,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel) * [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2) +* [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) @@ -244,9 +246,11 @@ For now, Transformers supports SDPA inference and training for the following arc * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) +* [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) * [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel) * [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model) +* [PhiMoE](https://huggingface.co/docs/transformers/model_doc/phimoe#transformers.PhimoeModel) * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel) * [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel) * [mBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel) diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md index c93d3eafe7005d..f528378bd1b875 100644 --- a/docs/source/en/perf_train_cpu_many.md +++ b/docs/source/en/perf_train_cpu_many.md @@ -138,16 +138,16 @@ Now, run the following command in node0 and **4DDP** will be enabled in node0 an ## Usage with Kubernetes The same distributed training job from the previous section can be deployed to a Kubernetes cluster using the -[Kubeflow PyTorchJob training operator](https://www.kubeflow.org/docs/components/training/pytorch/). +[Kubeflow PyTorchJob training operator](https://www.kubeflow.org/docs/components/training/user-guides/pytorch). 
### Setup This example assumes that you have: -* Access to a Kubernetes cluster with [Kubeflow installed](https://www.kubeflow.org/docs/started/installing-kubeflow/) -* [`kubectl`](https://kubernetes.io/docs/tasks/tools/) installed and configured to access the Kubernetes cluster -* A [Persistent Volume Claim (PVC)](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) that can be used +* Access to a Kubernetes cluster with [Kubeflow installed](https://www.kubeflow.org/docs/started/installing-kubeflow) +* [`kubectl`](https://kubernetes.io/docs/tasks/tools) installed and configured to access the Kubernetes cluster +* A [Persistent Volume Claim (PVC)](https://kubernetes.io/docs/concepts/storage/persistent-volumes) that can be used to store datasets and model files. There are multiple options for setting up the PVC including using an NFS - [storage class](https://kubernetes.io/docs/concepts/storage/storage-classes/) or a cloud storage bucket. + [storage class](https://kubernetes.io/docs/concepts/storage/storage-classes) or a cloud storage bucket. * A Docker container that includes your model training script and all the dependencies needed to run the script. For distributed CPU training jobs, this typically includes PyTorch, Transformers, Intel Extension for PyTorch, Intel oneCCL Bindings for PyTorch, and OpenSSH to communicate between the containers. @@ -176,7 +176,7 @@ PyTorchJob to the cluster. ### PyTorchJob Specification File -The [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/pytorch/) is used to run the distributed +The [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/user-guides/pytorch) is used to run the distributed training job on the cluster. The yaml file for the PyTorchJob defines parameters such as: * The name of the PyTorchJob * The number of replicas (workers) @@ -273,12 +273,13 @@ To run this example, update the yaml based on your training script and the nodes -The CPU resource limits/requests in the yaml are defined in [cpu units](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-cpu) +The CPU resource limits/requests in the yaml are defined in +[cpu units](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-cpu) where 1 CPU unit is equivalent to 1 physical CPU core or 1 virtual core (depending on whether the node is a physical host or a VM). The amount of CPU and memory limits/requests defined in the yaml should be less than the amount of available CPU/memory capacity on a single machine. It is usually a good idea to not use the entire machine's capacity in order to leave some resources for the kubelet and OS. In order to get ["guaranteed"](https://kubernetes.io/docs/concepts/workloads/pods/pod-qos/#guaranteed) -[quality of service](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/) for the worker pods, +[quality of service](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod) for the worker pods, set the same CPU and memory amounts for both the resource limits and requests. @@ -318,4 +319,4 @@ with the job, the PyTorchJob resource can be deleted from the cluster using `kub This guide covered running distributed PyTorch training jobs using multiple CPUs on bare metal and on a Kubernetes cluster. Both cases utilize Intel Extension for PyTorch and Intel oneCCL Bindings for PyTorch for optimal training -performance, and can be used as a template to run your own workload on multiple nodes. 
+performance, and can be used as a template to run your own workload on multiple nodes. \ No newline at end of file diff --git a/docs/source/en/quantization/awq.md b/docs/source/en/quantization/awq.md index 3c94bcca153f74..ca26844edd0294 100644 --- a/docs/source/en/quantization/awq.md +++ b/docs/source/en/quantization/awq.md @@ -230,3 +230,44 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) Note this feature is supported on AMD GPUs. + + +## CPU support + +Recent versions of `autoawq` supports CPU with ipex op optimizations. To get started, first install the latest version of `autoawq` by running: + +```bash +pip install intel-extension-for-pytorch +pip install git+https://github.com/casper-hansen/AutoAWQ.git +``` + +Get started by passing an `AwqConfig()` with `version="ipex"`. + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig + +quantization_config = AwqConfig(version="ipex") + +model = AutoModelForCausalLM.from_pretrained( + "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + quantization_config=quantization_config, + device_map="cpu", +) + +input_ids = torch.randint(0, 100, (1, 128), dtype=torch.long, device="cpu") +output = model(input_ids) +print(output.logits) + +tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ") +input_ids = tokenizer.encode("How to make a cake", return_tensors="pt") +pad_token_id = tokenizer.eos_token_id +output = model.generate(input_ids, do_sample=True, max_length=50, pad_token_id=pad_token_id) +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + + + +Note this feature is supported on Intel CPUs. + + \ No newline at end of file diff --git a/docs/source/en/quantization/bitnet.md b/docs/source/en/quantization/bitnet.md new file mode 100644 index 00000000000000..6bd65e8b53a476 --- /dev/null +++ b/docs/source/en/quantization/bitnet.md @@ -0,0 +1,75 @@ + + +# BitNet + +[BitNet](https://arxiv.org/abs/2402.17764) replaces traditional Linear layers in Multi-Head Attention and Feed-Forward Networks with specialized layers called BitLinear with ternary (or binary in the older version) precision. The BitLinear layers introduced here quantize the weights using ternary precision (with values of -1, 0, and 1) and quantize the activations to 8-bit precision. + + +
+  <img src="..." alt="Alt Text" />
+  <figcaption>The architecture of BitNet with BitLinear layers</figcaption>
+
+ +During training, we start by quantizing the weights into ternary values, using symmetric per tensor quantization. First, we compute the average of the absolute values of the weight matrix and use this as a scale. We then divide the weights by the scale, round the values, constrain them between -1 and 1, and finally rescale them to continue in full precision. + +$$ +scale_w = \frac{1}{\frac{1}{nm} \sum_{ij} |W_{ij}|} +$$ + +$$ +W_q = \text{clamp}_{[-1,1]}(\text{round}(W*scale)) +$$ + +$$ +W_{dequantized} = W_q*scale_w +$$ + +Activations are then quantized to a specified bit-width (e.g., 8-bit) using [absmax](https://arxiv.org/pdf/2208.07339) quantization (symmetric per channel quantization). This involves scaling the activations into a range [−128,127[. The quantization formula is: + +$$ +scale_x = \frac{127}{|X|_{\text{max}, \, \text{dim}=-1}} +$$ + +$$ +X_q = \text{clamp}_{[-128,127]}(\text{round}(X*scale)) +$$ + +$$ +X_{dequantized} = X_q * scale_x +$$ + +To learn more about how we trained, and fine-tuned bitnet models checkout the blogpost [here](https://huggingface.co/blog/1_58_llm_extreme_quantization) + +## Load a BitNet Model from the Hub +BitNet models can't be quantized on the fly—they need to be pre-trained or fine-tuned with the quantization applied (it's a Quantization aware training technique). Once trained, these models are already quantized and available as packed versions on the hub. + +A quantized model can be load : + +```py +from transformers import AutoModelForCausalLM +path = "/path/to/model" +model = AutoModelForCausalLM.from_pretrained(path, device_map="auto") +``` +## Pre-training / Fine-tuning a BitNet Model + +If you're looking to pre-train or fine-tune your own 1.58-bit model using Nanotron, check out this [PR](https://github.com/huggingface/nanotron/pull/180), all you need to get started is there ! + +For fine-tuning, you'll need to convert the model from Hugging Face format to Nanotron format (which has some differences). You can find the conversion steps in this [PR](https://github.com/huggingface/nanotron/pull/174). + +## Kernels + +In our initial version, we chose to use `@torch.compile` to unpack the weights and perform the forward pass. It’s very straightforward to implement and delivers significant speed improvements. We plan to integrate additional optimized kernels in future versions. \ No newline at end of file diff --git a/docs/source/en/quantization/compressed_tensors.md b/docs/source/en/quantization/compressed_tensors.md index f385aae965f662..77b8986bf8ca4c 100644 --- a/docs/source/en/quantization/compressed_tensors.md +++ b/docs/source/en/quantization/compressed_tensors.md @@ -19,15 +19,12 @@ The [`compressed-tensors`](https://github.com/neuralmagic/compressed-tensors) li Some of the supported formats include: 1. `dense` -2. `int-quantized`: INT8 quantized models - - sample [model/config](https://huggingface.co/nm-testing/tinyllama-w8a8-compressed-hf-quantizer) -3. `float-quantized`: FP8 quantized models; currently support E4M3 - - sample [model/config](https://huggingface.co/nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat/tree/main) -4. `pack-quantized`: INT4 or INT8 weight-quantized models, packed into INT32. For INT4, the weights have an INT4 range but are stored as INT8 and then packed into INT32. - - sample [model/config](nm-testing/tinyllama-w4a16-compressed-hf-quantizer) +2. `int-quantized` ([sample](https://huggingface.co/nm-testing/tinyllama-w8a8-compressed-hf-quantizer)): INT8 quantized models +3. 
`float-quantized` ([sample](https://huggingface.co/nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat)): FP8 quantized models; currently support E4M3 +4. `pack-quantized` ([sample](https://huggingface.co/nm-testing/tinyllama-w4a16-compressed-hf-quantizer)): INT4 or INT8 weight-quantized models, packed into INT32. For INT4, the weights have an INT4 range but are stored as INT8 and then packed into INT32. Compressed models can be easily created using [llm-compressor](https://github.com/vllm-project/llm-compressor). -Alternatively models can be created indepedenty and serialized with a compressed tensors config. +Alternatively models can be created independently and serialized with a compressed tensors config. To find existing models on the Hugging Face Model Hub, search for the [`compressed-tensors` tag](https://huggingface.co/models?other=compressed-tensors). @@ -35,7 +32,7 @@ To find existing models on the Hugging Face Model Hub, search for the [`compress - Weight and activation precisions: FP8, INT4, INT8 (for Q/DQ arbitrary precision is allowed for INT) - Quantization scales and zero-points strategies: [tensor, channel, group, block, token](https://github.com/neuralmagic/compressed-tensors/blob/83b2e7a969d70606421a76b9a3d112646077c8de/src/compressed_tensors/quantization/quant_args.py#L43-L52) - Dynamic per-token activation quantization (or any static strategy) - - Sparsity can be + - Sparsity in weights (unstructured or semi-structured like 2:4) can be composed with quantization for extreme compression - Supports quantization of arbitrary modules, not just Linear modules - Targeted support or ignoring of modules by name or class diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index fb1689cce7befe..f6fc66f4b6cb78 100755 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -111,7 +111,7 @@ Load an audio dataset (see the 🤗 Datasets [Quick Start](https://huggingface.c >>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT ``` -You need to make sure the sampling rate of the dataset matches the sampling +You need to make sure the sampling rate of the dataset matches the sampling rate [`facebook/wav2vec2-base-960h`](https://huggingface.co/facebook/wav2vec2-base-960h) was trained on: ```py @@ -174,7 +174,7 @@ If you can't find a model for your use-case, you'll need to finetune a pretraine -Under the hood, the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] classes work together to power the [`pipeline`] you used above. An [AutoClass](./model_doc/auto) is a shortcut that automatically retrieves the architecture of a pretrained model from its name or path. You only need to select the appropriate `AutoClass` for your task and it's associated preprocessing class. +Under the hood, the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] classes work together to power the [`pipeline`] you used above. An [AutoClass](./model_doc/auto) is a shortcut that automatically retrieves the architecture of a pretrained model from its name or path. You only need to select the appropriate `AutoClass` for your task and it's associated preprocessing class. Let's return to the example from the previous section and see how you can use the `AutoClass` to replicate the results of the [`pipeline`]. @@ -485,7 +485,7 @@ Now gather all these classes in [`Trainer`]: ... args=training_args, ... train_dataset=dataset["train"], ... eval_dataset=dataset["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... 
data_collator=data_collator, ... ) # doctest: +SKIP ``` @@ -502,7 +502,7 @@ For tasks - like translation or summarization - that use a sequence-to-sequence -You can customize the training loop behavior by subclassing the methods inside [`Trainer`]. This allows you to customize features such as the loss function, optimizer, and scheduler. Take a look at the [`Trainer`] reference for which methods can be subclassed. +You can customize the training loop behavior by subclassing the methods inside [`Trainer`]. This allows you to customize features such as the loss function, optimizer, and scheduler. Take a look at the [`Trainer`] reference for which methods can be subclassed. The other way to customize the training loop is by using [Callbacks](./main_classes/callback). You can use callbacks to integrate with other libraries and inspect the training loop to report on progress or stop the training early. Callbacks do not modify anything in the training loop itself. To customize something like the loss function, you need to subclass the [`Trainer`] instead. diff --git a/docs/source/en/tasks/asr.md b/docs/source/en/tasks/asr.md index 2ddd972c3d2608..f3e068444ca556 100644 --- a/docs/source/en/tasks/asr.md +++ b/docs/source/en/tasks/asr.md @@ -281,7 +281,7 @@ At this point, only three steps remain: ... args=training_args, ... train_dataset=encoded_minds["train"], ... eval_dataset=encoded_minds["test"], -... tokenizer=processor, +... processing_class=processor, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) @@ -368,4 +368,4 @@ Get the predicted `input_ids` with the highest probability, and use the processo ['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] ``` - \ No newline at end of file + diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md index 4610e86d6a2939..59d6a175da82ba 100644 --- a/docs/source/en/tasks/audio_classification.md +++ b/docs/source/en/tasks/audio_classification.md @@ -98,8 +98,8 @@ Take a look at an example now: There are two fields: -- `audio`: a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. -- `intent_class`: represents the class id of the speaker's intent. +- `audio`: a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. +- `intent_class`: represents the class id of the speaker's intent. To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name to an integer and vice versa: @@ -235,7 +235,7 @@ At this point, only three steps remain: ... args=training_args, ... train_dataset=encoded_minds["train"], ... eval_dataset=encoded_minds["test"], -... tokenizer=feature_extractor, +... processing_class=feature_extractor, ... compute_metrics=compute_metrics, ... ) @@ -321,4 +321,4 @@ Get the class with the highest probability, and use the model's `id2label` mappi 'cash_deposit' ``` - \ No newline at end of file + diff --git a/docs/source/en/tasks/document_question_answering.md b/docs/source/en/tasks/document_question_answering.md index 54c0cd5aef3f3f..d83e025c409019 100644 --- a/docs/source/en/tasks/document_question_answering.md +++ b/docs/source/en/tasks/document_question_answering.md @@ -420,7 +420,7 @@ Finally, bring everything together, and call [`~Trainer.train`]: ... data_collator=data_collator, ... train_dataset=encoded_train_dataset, ... eval_dataset=encoded_test_dataset, -... tokenizer=processor, +... processing_class=processor, ... 
) >>> trainer.train() @@ -489,4 +489,4 @@ which token is at the end of the answer. Both have shape (batch_size, sequence_l >>> processor.tokenizer.decode(encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]) 'lee a. waller' -``` \ No newline at end of file +``` diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 279216443bb6db..514ec3fbfe0b93 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -317,7 +317,7 @@ At this point, only three steps remain: ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... tokenizer=image_processor, +... processing_class=image_processor, ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/en/tasks/image_text_to_text.md b/docs/source/en/tasks/image_text_to_text.md index 74f6a3408bcaf2..261abf947290d1 100644 --- a/docs/source/en/tasks/image_text_to_text.md +++ b/docs/source/en/tasks/image_text_to_text.md @@ -27,22 +27,22 @@ To begin with, there are multiple types of VLMs: - chat fine-tuned models for conversation - instruction fine-tuned models -This guide focuses on inference with an instruction-tuned model. +This guide focuses on inference with an instruction-tuned model. Let's begin installing the dependencies. ```bash -pip install -q transformers accelerate flash_attn +pip install -q transformers accelerate flash_attn ``` -Let's initialize the model and the processor. +Let's initialize the model and the processor. ```python -from transformers import AutoProcessor, Idefics2ForConditionalGeneration +from transformers import AutoProcessor, AutoModelForImageTextToText import torch device = torch.device("cuda") -model = Idefics2ForConditionalGeneration.from_pretrained( +model = AutoModelForImageTextToText.from_pretrained( "HuggingFaceM4/idefics2-8b", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", @@ -51,7 +51,7 @@ model = Idefics2ForConditionalGeneration.from_pretrained( processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b") ``` -This model has a [chat template](./chat_templating) that helps user parse chat outputs. Moreover, the model can also accept multiple images as input in a single conversation or message. We will now prepare the inputs. +This model has a [chat template](./chat_templating) that helps user parse chat outputs. Moreover, the model can also accept multiple images as input in a single conversation or message. We will now prepare the inputs. The image inputs look like the following. @@ -74,7 +74,7 @@ images = [Image.open(requests.get(img_urls[0], stream=True).raw), Image.open(requests.get(img_urls[1], stream=True).raw)] ``` -Below is an example of the chat template. We can feed conversation turns and the last message as an input by appending it at the end of the template. +Below is an example of the chat template. We can feed conversation turns and the last message as an input by appending it at the end of the template. ```python @@ -98,7 +98,7 @@ messages = [ {"type": "image"}, {"type": "text", "text": "And how about this image?"}, ] - }, + }, ] ``` @@ -180,11 +180,11 @@ def model_inference( if acc_text.endswith(""): acc_text = acc_text[:-18] yield acc_text - + thread.join() ``` -Now let's call the `model_inference` function we created and stream the values. +Now let's call the `model_inference` function we created and stream the values. 
```python generator = model_inference( @@ -204,7 +204,7 @@ for value in generator: ## Fit models in smaller hardware -VLMs are often large and need to be optimized to fit on smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements up to 75 percent (if all weights are quantized). However it is no free lunch, since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds up to latency. +VLMs are often large and need to be optimized to fit on smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements up to 75 percent (if all weights are quantized). However it is no free lunch, since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds up to latency. First, install dependencies. @@ -215,18 +215,20 @@ pip install -U quanto bitsandbytes To quantize a model during loading, we need to first create [`QuantoConfig`]. Then load the model as usual, but pass `quantization_config` during model initialization. ```python -from transformers import Idefics2ForConditionalGeneration, AutoTokenizer, QuantoConfig +from transformers import AutoModelForImageTextToText, QuantoConfig model_id = "HuggingFaceM4/idefics2-8b" quantization_config = QuantoConfig(weights="int8") -quantized_model = Idefics2ForConditionalGeneration.from_pretrained(model_id, device_map="cuda", quantization_config=quantization_config) +quantized_model = AutoModelForImageTextToText.from_pretrained( + model_id, device_map="cuda", quantization_config=quantization_config +) ``` -And that's it, we can use the model the same way with no changes. +And that's it, we can use the model the same way with no changes. ## Further Reading Here are some more resources for the image-text-to-text task. -- [Image-text-to-text task page](https://huggingface.co/tasks/image-text-to-text) covers model types, use cases, datasets, and more. +- [Image-text-to-text task page](https://huggingface.co/tasks/image-text-to-text) covers model types, use cases, datasets, and more. - [Vision Language Models Explained](https://huggingface.co/blog/vlms) is a blog post that covers everything about vision language models and supervised fine-tuning using [TRL](https://huggingface.co/docs/trl/en/index). diff --git a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md index f856e35b1740bd..530e92d81f5c0d 100644 --- a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md +++ b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md @@ -19,9 +19,9 @@ rendered properly in your Markdown viewer. Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification for this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between it's outputs and the teacher's outputs, thus making it mimic the behavior. 
It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this. -This guide demonstrates how you can distill a [fine-tuned ViT model](https://huggingface.co/merve/vit-mobilenet-beans-224) (teacher model) to a [MobileNet](https://huggingface.co/google/mobilenet_v2_1.4_224) (student model) using the [Trainer API](https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer) of 🤗 Transformers. +This guide demonstrates how you can distill a [fine-tuned ViT model](https://huggingface.co/merve/vit-mobilenet-beans-224) (teacher model) to a [MobileNet](https://huggingface.co/google/mobilenet_v2_1.4_224) (student model) using the [Trainer API](https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer) of 🤗 Transformers. -Let's install the libraries needed for distillation and evaluating the process. +Let's install the libraries needed for distillation and evaluating the process. ```bash pip install transformers datasets accelerate tensorboard evaluate --upgrade @@ -29,7 +29,7 @@ pip install transformers datasets accelerate tensorboard evaluate --upgrade In this example, we are using the `merve/beans-vit-224` model as teacher model. It's an image classification model, based on `google/vit-base-patch16-224-in21k` fine-tuned on beans dataset. We will distill this model to a randomly initialized MobileNetV2. -We will now load the dataset. +We will now load the dataset. ```python from datasets import load_dataset @@ -37,7 +37,7 @@ from datasets import load_dataset dataset = load_dataset("beans") ``` -We can use an image processor from either of the models, as in this case they return the same output with same resolution. We will use the `map()` method of `dataset` to apply the preprocessing to every split of the dataset. +We can use an image processor from either of the models, as in this case they return the same output with same resolution. We will use the `map()` method of `dataset` to apply the preprocessing to every split of the dataset. ```python from transformers import AutoImageProcessor @@ -93,7 +93,7 @@ class ImageDistilTrainer(Trainer): return (loss, student_output) if return_outputs else loss ``` -We will now login to Hugging Face Hub so we can push our model to the Hugging Face Hub through the `Trainer`. +We will now login to Hugging Face Hub so we can push our model to the Hugging Face Hub through the `Trainer`. ```python from huggingface_hub import notebook_login @@ -101,7 +101,7 @@ from huggingface_hub import notebook_login notebook_login() ``` -Let's set the `TrainingArguments`, the teacher model and the student model. +Let's set the `TrainingArguments`, the teacher model and the student model. 
```python from transformers import AutoModelForImageClassification, MobileNetV2Config, MobileNetV2ForImageClassification @@ -164,7 +164,7 @@ trainer = ImageDistilTrainer( train_dataset=processed_datasets["train"], eval_dataset=processed_datasets["validation"], data_collator=data_collator, - tokenizer=teacher_processor, + processing_class=teacher_processor, compute_metrics=compute_metrics, temperature=5, lambda_param=0.5 diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md index fc63c35425db25..06eb45eda99150 100644 --- a/docs/source/en/tasks/multiple_choice.md +++ b/docs/source/en/tasks/multiple_choice.md @@ -270,7 +270,7 @@ At this point, only three steps remain: ... args=training_args, ... train_dataset=tokenized_swag["train"], ... eval_dataset=tokenized_swag["validation"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index dfad80b949f767..fdc81896bc1924 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -340,7 +340,7 @@ with `pixel_values`, a tensor with `pixel_mask`, and `labels`. [ 0.0741, 0.0741, 0.0741, ..., 0.0741, 0.0741, 0.0741], [ 0.0741, 0.0741, 0.0741, ..., 0.0741, 0.0741, 0.0741], [ 0.0741, 0.0741, 0.0741, ..., 0.0741, 0.0741, 0.0741]], - + [[ 1.6232, 1.6408, 1.6583, ..., 0.8704, 1.0105, 1.1331], [ 1.6408, 1.6583, 1.6758, ..., 0.8529, 0.9930, 1.0980], [ 1.6933, 1.6933, 1.7108, ..., 0.8179, 0.9580, 1.0630], @@ -348,7 +348,7 @@ with `pixel_values`, a tensor with `pixel_mask`, and `labels`. [ 0.2052, 0.2052, 0.2052, ..., 0.2052, 0.2052, 0.2052], [ 0.2052, 0.2052, 0.2052, ..., 0.2052, 0.2052, 0.2052], [ 0.2052, 0.2052, 0.2052, ..., 0.2052, 0.2052, 0.2052]], - + [[ 1.8905, 1.9080, 1.9428, ..., -0.1487, -0.0964, -0.0615], [ 1.9254, 1.9428, 1.9603, ..., -0.1661, -0.1138, -0.0790], [ 1.9777, 1.9777, 1.9951, ..., -0.2010, -0.1138, -0.0790], @@ -569,7 +569,7 @@ Finally, bring everything together, and call [`~transformers.Trainer.train`]: ... args=training_args, ... train_dataset=cppe5["train"], ... eval_dataset=cppe5["validation"], -... tokenizer=image_processor, +... processing_class=image_processor, ... data_collator=collate_fn, ... compute_metrics=eval_compute_metrics_fn, ... ) diff --git a/docs/source/en/tasks/question_answering.md b/docs/source/en/tasks/question_answering.md index 367e35b121164f..998010e67ca95f 100644 --- a/docs/source/en/tasks/question_answering.md +++ b/docs/source/en/tasks/question_answering.md @@ -225,7 +225,7 @@ At this point, only three steps remain: ... args=training_args, ... train_dataset=tokenized_squad["train"], ... eval_dataset=tokenized_squad["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... ) diff --git a/docs/source/en/tasks/sequence_classification.md b/docs/source/en/tasks/sequence_classification.md index 572d6493ba4f32..27516ace1cc345 100644 --- a/docs/source/en/tasks/sequence_classification.md +++ b/docs/source/en/tasks/sequence_classification.md @@ -190,7 +190,7 @@ At this point, only three steps remain: ... args=training_args, ... train_dataset=tokenized_imdb["train"], ... eval_dataset=tokenized_imdb["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... 
) diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md index b79415996ca72e..7d7ecf1fbab6db 100644 --- a/docs/source/en/tasks/summarization.md +++ b/docs/source/en/tasks/summarization.md @@ -214,7 +214,7 @@ At this point, only three steps remain: ... args=training_args, ... train_dataset=tokenized_billsum["train"], ... eval_dataset=tokenized_billsum["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/en/tasks/text-to-speech.md b/docs/source/en/tasks/text-to-speech.md index ad8c43a28e8efc..188d4ea5f9ee68 100644 --- a/docs/source/en/tasks/text-to-speech.md +++ b/docs/source/en/tasks/text-to-speech.md @@ -18,13 +18,13 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -Text-to-speech (TTS) is the task of creating natural-sounding speech from text, where the speech can be generated in multiple -languages and for multiple speakers. Several text-to-speech models are currently available in 🤗 Transformers, such as -[Bark](../model_doc/bark), [MMS](../model_doc/mms), [VITS](../model_doc/vits) and [SpeechT5](../model_doc/speecht5). +Text-to-speech (TTS) is the task of creating natural-sounding speech from text, where the speech can be generated in multiple +languages and for multiple speakers. Several text-to-speech models are currently available in 🤗 Transformers, such as +[Bark](../model_doc/bark), [MMS](../model_doc/mms), [VITS](../model_doc/vits) and [SpeechT5](../model_doc/speecht5). -You can easily generate audio using the `"text-to-audio"` pipeline (or its alias - `"text-to-speech"`). Some models, like Bark, +You can easily generate audio using the `"text-to-audio"` pipeline (or its alias - `"text-to-speech"`). Some models, like Bark, can also be conditioned to generate non-verbal communications such as laughing, sighing and crying, or even add music. -Here's an example of how you would use the `"text-to-speech"` pipeline with Bark: +Here's an example of how you would use the `"text-to-speech"` pipeline with Bark: ```py >>> from transformers import pipeline @@ -34,18 +34,18 @@ Here's an example of how you would use the `"text-to-speech"` pipeline with Bark >>> output = pipe(text) ``` -Here's a code snippet you can use to listen to the resulting audio in a notebook: +Here's a code snippet you can use to listen to the resulting audio in a notebook: ```python >>> from IPython.display import Audio >>> Audio(output["audio"], rate=output["sampling_rate"]) ``` -For more examples on what Bark and other pretrained TTS models can do, refer to our -[Audio course](https://huggingface.co/learn/audio-course/chapter6/pre-trained_models). +For more examples on what Bark and other pretrained TTS models can do, refer to our +[Audio course](https://huggingface.co/learn/audio-course/chapter6/pre-trained_models). -If you are looking to fine-tune a TTS model, the only text-to-speech models currently available in 🤗 Transformers -are [SpeechT5](model_doc/speecht5) and [FastSpeech2Conformer](model_doc/fastspeech2_conformer), though more will be added in the future. SpeechT5 is pre-trained on a combination of speech-to-text and text-to-speech data, allowing it to learn a unified space of hidden representations shared by both text and speech. This means that the same pre-trained model can be fine-tuned for different tasks. Furthermore, SpeechT5 supports multiple speakers through x-vector speaker embeddings. 
+If you are looking to fine-tune a TTS model, the only text-to-speech models currently available in 🤗 Transformers +are [SpeechT5](model_doc/speecht5) and [FastSpeech2Conformer](model_doc/fastspeech2_conformer), though more will be added in the future. SpeechT5 is pre-trained on a combination of speech-to-text and text-to-speech data, allowing it to learn a unified space of hidden representations shared by both text and speech. This means that the same pre-trained model can be fine-tuned for different tasks. Furthermore, SpeechT5 supports multiple speakers through x-vector speaker embeddings. The remainder of this guide illustrates how to: @@ -66,7 +66,7 @@ pip install git+https://github.com/huggingface/transformers.git -To follow this guide you will need a GPU. If you're working in a notebook, run the following line to check if a GPU is available: +To follow this guide you will need a GPU. If you're working in a notebook, run the following line to check if a GPU is available: ```bash !nvidia-smi @@ -90,13 +90,13 @@ We encourage you to log in to your Hugging Face account to upload and share your ## Load the dataset -[VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) is a large-scale multilingual speech corpus consisting of -data sourced from 2009-2020 European Parliament event recordings. It contains labelled audio-transcription data for 15 -European languages. In this guide, we are using the Dutch language subset, feel free to pick another subset. +[VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) is a large-scale multilingual speech corpus consisting of +data sourced from 2009-2020 European Parliament event recordings. It contains labelled audio-transcription data for 15 +European languages. In this guide, we are using the Dutch language subset, feel free to pick another subset. -Note that VoxPopuli or any other automated speech recognition (ASR) dataset may not be the most suitable -option for training TTS models. The features that make it beneficial for ASR, such as excessive background noise, are -typically undesirable in TTS. However, finding top-quality, multilingual, and multi-speaker TTS datasets can be quite +Note that VoxPopuli or any other automated speech recognition (ASR) dataset may not be the most suitable +option for training TTS models. The features that make it beneficial for ASR, such as excessive background noise, are +typically undesirable in TTS. However, finding top-quality, multilingual, and multi-speaker TTS datasets can be quite challenging. Let's load the data: @@ -109,7 +109,7 @@ Let's load the data: 20968 ``` -20968 examples should be sufficient for fine-tuning. SpeechT5 expects audio data to have a sampling rate of 16 kHz, so +20968 examples should be sufficient for fine-tuning. 
SpeechT5 expects audio data to have a sampling rate of 16 kHz, so make sure the examples in the dataset meet this requirement: ```py @@ -118,7 +118,7 @@ dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) ## Preprocess the data -Let's begin by defining the model checkpoint to use and loading the appropriate processor: +Let's begin by defining the model checkpoint to use and loading the appropriate processor: ```py >>> from transformers import SpeechT5Processor @@ -127,7 +127,7 @@ Let's begin by defining the model checkpoint to use and loading the appropriate >>> processor = SpeechT5Processor.from_pretrained(checkpoint) ``` -### Text cleanup for SpeechT5 tokenization +### Text cleanup for SpeechT5 tokenization Start by cleaning up the text data. You'll need the tokenizer part of the processor to process the text: @@ -135,18 +135,18 @@ Start by cleaning up the text data. You'll need the tokenizer part of the proces >>> tokenizer = processor.tokenizer ``` -The dataset examples contain `raw_text` and `normalized_text` features. When deciding which feature to use as the text input, -consider that the SpeechT5 tokenizer doesn't have any tokens for numbers. In `normalized_text` the numbers are written +The dataset examples contain `raw_text` and `normalized_text` features. When deciding which feature to use as the text input, +consider that the SpeechT5 tokenizer doesn't have any tokens for numbers. In `normalized_text` the numbers are written out as text. Thus, it is a better fit, and we recommend using `normalized_text` as input text. -Because SpeechT5 was trained on the English language, it may not recognize certain characters in the Dutch dataset. If -left as is, these characters will be converted to `` tokens. However, in Dutch, certain characters like `à` are +Because SpeechT5 was trained on the English language, it may not recognize certain characters in the Dutch dataset. If +left as is, these characters will be converted to `` tokens. However, in Dutch, certain characters like `à` are used to stress syllables. In order to preserve the meaning of the text, we can replace this character with a regular `a`. -To identify unsupported tokens, extract all unique characters in the dataset using the `SpeechT5Tokenizer` which -works with characters as tokens. To do this, write the `extract_all_chars` mapping function that concatenates -the transcriptions from all examples into one string and converts it to a set of characters. -Make sure to set `batched=True` and `batch_size=-1` in `dataset.map()` so that all transcriptions are available at once for +To identify unsupported tokens, extract all unique characters in the dataset using the `SpeechT5Tokenizer` which +works with characters as tokens. To do this, write the `extract_all_chars` mapping function that concatenates +the transcriptions from all examples into one string and converts it to a set of characters. +Make sure to set `batched=True` and `batch_size=-1` in `dataset.map()` so that all transcriptions are available at once for the mapping function. ```py @@ -168,8 +168,8 @@ the mapping function. >>> tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()} ``` -Now you have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer. -To identify any unsupported characters in the dataset, you can take the difference between these two sets. 
The resulting +Now you have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer. +To identify any unsupported characters in the dataset, you can take the difference between these two sets. The resulting set will contain the characters that are in the dataset but not in the tokenizer. ```py @@ -177,7 +177,7 @@ set will contain the characters that are in the dataset but not in the tokenizer {' ', 'à', 'ç', 'è', 'ë', 'í', 'ï', 'ö', 'ü'} ``` -To handle the unsupported characters identified in the previous step, define a function that maps these characters to +To handle the unsupported characters identified in the previous step, define a function that maps these characters to valid tokens. Note that spaces are already replaced by `▁` in the tokenizer and don't need to be handled separately. ```py @@ -206,9 +206,9 @@ Now that you have dealt with special characters in the text, it's time to shift ### Speakers -The VoxPopuli dataset includes speech from multiple speakers, but how many speakers are represented in the dataset? To -determine this, we can count the number of unique speakers and the number of examples each speaker contributes to the dataset. -With a total of 20,968 examples in the dataset, this information will give us a better understanding of the distribution of +The VoxPopuli dataset includes speech from multiple speakers, but how many speakers are represented in the dataset? To +determine this, we can count the number of unique speakers and the number of examples each speaker contributes to the dataset. +With a total of 20,968 examples in the dataset, this information will give us a better understanding of the distribution of speakers and examples in the data. ```py @@ -236,9 +236,9 @@ By plotting a histogram you can get a sense of how much data there is for each s Speakers histogram -The histogram reveals that approximately one-third of the speakers in the dataset have fewer than 100 examples, while -around ten speakers have more than 500 examples. To improve training efficiency and balance the dataset, we can limit -the data to speakers with between 100 and 400 examples. +The histogram reveals that approximately one-third of the speakers in the dataset have fewer than 100 examples, while +around ten speakers have more than 500 examples. To improve training efficiency and balance the dataset, we can limit +the data to speakers with between 100 and 400 examples. ```py >>> def select_speaker(speaker_id): @@ -248,14 +248,14 @@ the data to speakers with between 100 and 400 examples. >>> dataset = dataset.filter(select_speaker, input_columns=["speaker_id"]) ``` -Let's check how many speakers remain: +Let's check how many speakers remain: ```py >>> len(set(dataset["speaker_id"])) 42 ``` -Let's see how many examples are left: +Let's see how many examples are left: ```py >>> len(dataset) @@ -264,18 +264,18 @@ Let's see how many examples are left: You are left with just under 10,000 examples from approximately 40 unique speakers, which should be sufficient. -Note that some speakers with few examples may actually have more audio available if the examples are long. However, -determining the total amount of audio for each speaker requires scanning through the entire dataset, which is a +Note that some speakers with few examples may actually have more audio available if the examples are long. 
However, +determining the total amount of audio for each speaker requires scanning through the entire dataset, which is a time-consuming process that involves loading and decoding each audio file. As such, we have chosen to skip this step here. ### Speaker embeddings -To enable the TTS model to differentiate between multiple speakers, you'll need to create a speaker embedding for each example. +To enable the TTS model to differentiate between multiple speakers, you'll need to create a speaker embedding for each example. The speaker embedding is an additional input into the model that captures a particular speaker's voice characteristics. -To generate these speaker embeddings, use the pre-trained [spkrec-xvect-voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb) -model from SpeechBrain. +To generate these speaker embeddings, use the pre-trained [spkrec-xvect-voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb) +model from SpeechBrain. -Create a function `create_speaker_embedding()` that takes an input audio waveform and outputs a 512-element vector +Create a function `create_speaker_embedding()` that takes an input audio waveform and outputs a 512-element vector containing the corresponding speaker embedding. ```py @@ -301,17 +301,17 @@ containing the corresponding speaker embedding. ... return speaker_embeddings ``` -It's important to note that the `speechbrain/spkrec-xvect-voxceleb` model was trained on English speech from the VoxCeleb -dataset, whereas the training examples in this guide are in Dutch. While we believe that this model will still generate +It's important to note that the `speechbrain/spkrec-xvect-voxceleb` model was trained on English speech from the VoxCeleb +dataset, whereas the training examples in this guide are in Dutch. While we believe that this model will still generate reasonable speaker embeddings for our Dutch dataset, this assumption may not hold true in all cases. -For optimal results, we recommend training an X-vector model on the target speech first. This will ensure that the model +For optimal results, we recommend training an X-vector model on the target speech first. This will ensure that the model is better able to capture the unique voice characteristics present in the Dutch language. ### Processing the dataset -Finally, let's process the data into the format the model expects. Create a `prepare_dataset` function that takes in a -single example and uses the `SpeechT5Processor` object to tokenize the input text and load the target audio into a log-mel spectrogram. +Finally, let's process the data into the format the model expects. Create a `prepare_dataset` function that takes in a +single example and uses the `SpeechT5Processor` object to tokenize the input text and load the target audio into a log-mel spectrogram. It should also add the speaker embeddings as an additional input. ```py @@ -363,8 +363,8 @@ The labels should be a log-mel spectrogram with 80 mel bins. Log-mel spectrogram with 80 mel bins -Side note: If you find this spectrogram confusing, it may be due to your familiarity with the convention of placing low frequencies -at the bottom and high frequencies at the top of a plot. However, when plotting spectrograms as an image using the matplotlib library, +Side note: If you find this spectrogram confusing, it may be due to your familiarity with the convention of placing low frequencies +at the bottom and high frequencies at the top of a plot. 
However, when plotting spectrograms as an image using the matplotlib library, the y-axis is flipped and the spectrograms appear upside down. Now apply the processing function to the entire dataset. This will take between 5 and 10 minutes. @@ -373,7 +373,7 @@ Now apply the processing function to the entire dataset. This will take between >>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names) ``` -You'll see a warning saying that some examples in the dataset are longer than the maximum input length the model can handle (600 tokens). +You'll see a warning saying that some examples in the dataset are longer than the maximum input length the model can handle (600 tokens). Remove those examples from the dataset. Here we go even further and to allow for larger batch sizes we remove anything over 200 tokens. ```py @@ -387,7 +387,7 @@ Remove those examples from the dataset. Here we go even further and to allow for 8259 ``` -Next, create a basic train/test split: +Next, create a basic train/test split: ```py >>> dataset = dataset.train_test_split(test_size=0.1) @@ -395,8 +395,8 @@ Next, create a basic train/test split: ### Data collator -In order to combine multiple examples into a batch, you need to define a custom data collator. This collator will pad shorter sequences with padding -tokens, ensuring that all examples have the same length. For the spectrogram labels, the padded portions are replaced with the special value `-100`. This special value +In order to combine multiple examples into a batch, you need to define a custom data collator. This collator will pad shorter sequences with padding +tokens, ensuring that all examples have the same length. For the spectrogram labels, the padded portions are replaced with the special value `-100`. This special value instructs the model to ignore that part of the spectrogram when calculating the spectrogram loss. ```py @@ -437,18 +437,18 @@ instructs the model to ignore that part of the spectrogram when calculating the ... return batch ``` -In SpeechT5, the input to the decoder part of the model is reduced by a factor 2. In other words, it throws away every -other timestep from the target sequence. The decoder then predicts a sequence that is twice as long. Since the original -target sequence length may be odd, the data collator makes sure to round the maximum length of the batch down to be a +In SpeechT5, the input to the decoder part of the model is reduced by a factor 2. In other words, it throws away every +other timestep from the target sequence. The decoder then predicts a sequence that is twice as long. Since the original +target sequence length may be odd, the data collator makes sure to round the maximum length of the batch down to be a multiple of 2. -```py +```py >>> data_collator = TTSDataCollatorWithPadding(processor=processor) ``` ## Train the model -Load the pre-trained model from the same checkpoint as you used for loading the processor: +Load the pre-trained model from the same checkpoint as you used for loading the processor: ```py >>> from transformers import SpeechT5ForTextToSpeech @@ -458,11 +458,11 @@ Load the pre-trained model from the same checkpoint as you used for loading the The `use_cache=True` option is incompatible with gradient checkpointing. Disable it for training. -```py +```py >>> model.config.use_cache = False ``` -Define the training arguments. Here we are not computing any evaluation metrics during the training process. Instead, we'll +Define the training arguments. 
Here we are not computing any evaluation metrics during the training process. Instead, we'll only look at the loss: ```python @@ -501,19 +501,19 @@ Instantiate the `Trainer` object and pass the model, dataset, and data collator ... train_dataset=dataset["train"], ... eval_dataset=dataset["test"], ... data_collator=data_collator, -... tokenizer=processor, +... processing_class=processor, ... ) ``` -And with that, you're ready to start training! Training will take several hours. Depending on your GPU, -it is possible that you will encounter a CUDA "out-of-memory" error when you start training. In this case, you can reduce +And with that, you're ready to start training! Training will take several hours. Depending on your GPU, +it is possible that you will encounter a CUDA "out-of-memory" error when you start training. In this case, you can reduce the `per_device_train_batch_size` incrementally by factors of 2 and increase `gradient_accumulation_steps` by 2x to compensate. ```py >>> trainer.train() ``` -To be able to use your checkpoint with a pipeline, make sure to save the processor with the checkpoint: +To be able to use your checkpoint with a pipeline, make sure to save the processor with the checkpoint: ```py >>> processor.save_pretrained("YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl") @@ -530,8 +530,8 @@ Push the final model to the 🤗 Hub: ### Inference with a pipeline Great, now that you've fine-tuned a model, you can use it for inference! -First, let's see how you can use it with a corresponding pipeline. Let's create a `"text-to-speech"` pipeline with your -checkpoint: +First, let's see how you can use it with a corresponding pipeline. Let's create a `"text-to-speech"` pipeline with your +checkpoint: ```py >>> from transformers import pipeline @@ -545,14 +545,14 @@ Pick a piece of text in Dutch you'd like narrated, e.g.: >>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!" ``` -To use SpeechT5 with the pipeline, you'll need a speaker embedding. Let's get it from an example in the test dataset: +To use SpeechT5 with the pipeline, you'll need a speaker embedding. Let's get it from an example in the test dataset: ```py >>> example = dataset["test"][304] >>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) ``` -Now you can pass the text and speaker embeddings to the pipeline, and it will take care of the rest: +Now you can pass the text and speaker embeddings to the pipeline, and it will take care of the rest: ```py >>> forward_params = {"speaker_embeddings": speaker_embeddings} @@ -567,40 +567,40 @@ You can then listen to the result: ```py >>> from IPython.display import Audio ->>> Audio(output['audio'], rate=output['sampling_rate']) +>>> Audio(output['audio'], rate=output['sampling_rate']) ``` ### Run inference manually -You can achieve the same inference results without using the pipeline, however, more steps will be required. +You can achieve the same inference results without using the pipeline, however, more steps will be required. -Load the model from the 🤗 Hub: +Load the model from the 🤗 Hub: ```py >>> model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_ACCOUNT/speecht5_finetuned_voxpopuli_nl") ``` -Pick an example from the test dataset to obtain a speaker embedding. +Pick an example from the test dataset to obtain a speaker embedding. -```py +```py >>> example = dataset["test"][304] >>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) ``` Define the input text and tokenize it. 
-```py +```py >>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!" >>> inputs = processor(text=text, return_tensors="pt") ``` -Create a spectrogram with your model: +Create a spectrogram with your model: ```py >>> spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings) ``` -Visualize the spectrogram, if you'd like to: +Visualize the spectrogram, if you'd like to: ```py >>> plt.figure() @@ -623,15 +623,15 @@ Finally, use the vocoder to turn the spectrogram into sound. >>> Audio(speech.numpy(), rate=16000) ``` -In our experience, obtaining satisfactory results from this model can be challenging. The quality of the speaker -embeddings appears to be a significant factor. Since SpeechT5 was pre-trained with English x-vectors, it performs best +In our experience, obtaining satisfactory results from this model can be challenging. The quality of the speaker +embeddings appears to be a significant factor. Since SpeechT5 was pre-trained with English x-vectors, it performs best when using English speaker embeddings. If the synthesized speech sounds poor, try using a different speaker embedding. -Increasing the training duration is also likely to enhance the quality of the results. Even so, the speech clearly is Dutch instead of English, and it does +Increasing the training duration is also likely to enhance the quality of the results. Even so, the speech clearly is Dutch instead of English, and it does capture the voice characteristics of the speaker (compare to the original audio in the example). -Another thing to experiment with is the model's configuration. For example, try using `config.reduction_factor = 1` to +Another thing to experiment with is the model's configuration. For example, try using `config.reduction_factor = 1` to see if this improves the results. -Finally, it is essential to consider ethical considerations. Although TTS technology has numerous useful applications, it -may also be used for malicious purposes, such as impersonating someone's voice without their knowledge or consent. Please +Finally, it is essential to consider ethical considerations. Although TTS technology has numerous useful applications, it +may also be used for malicious purposes, such as impersonating someone's voice without their knowledge or consent. Please use TTS judiciously and responsibly. diff --git a/docs/source/en/tasks/token_classification.md b/docs/source/en/tasks/token_classification.md index 444d8421727d80..b93dd0cbe26d97 100644 --- a/docs/source/en/tasks/token_classification.md +++ b/docs/source/en/tasks/token_classification.md @@ -296,7 +296,7 @@ At this point, only three steps remain: ... args=training_args, ... train_dataset=tokenized_wnut["train"], ... eval_dataset=tokenized_wnut["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md index a4b544fe68a320..426ba1c340fb81 100644 --- a/docs/source/en/tasks/translation.md +++ b/docs/source/en/tasks/translation.md @@ -221,7 +221,7 @@ At this point, only three steps remain: ... args=training_args, ... train_dataset=tokenized_books["train"], ... eval_dataset=tokenized_books["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... 
) diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md index 15b3b7a969effb..c268de1786bdc5 100644 --- a/docs/source/en/tasks/video_classification.md +++ b/docs/source/en/tasks/video_classification.md @@ -61,7 +61,7 @@ Start by loading a subset of the [UCF-101 dataset](https://www.crcv.ucf.edu/data After the subset has been downloaded, you need to extract the compressed archive: -```py +```py >>> import tarfile >>> with tarfile.open(file_path) as t: @@ -106,13 +106,13 @@ UCF101_subset/ You can then count the number of total videos. -```py +```py >>> import pathlib >>> dataset_root_path = "UCF101_subset" >>> dataset_root_path = pathlib.Path(dataset_root_path) ``` -```py +```py >>> video_count_train = len(list(dataset_root_path.glob("train/*/*.avi"))) >>> video_count_val = len(list(dataset_root_path.glob("val/*/*.avi"))) >>> video_count_test = len(list(dataset_root_path.glob("test/*/*.avi"))) @@ -120,7 +120,7 @@ You can then count the number of total videos. >>> print(f"Total videos: {video_total}") ``` -```py +```py >>> all_video_file_paths = ( ... list(dataset_root_path.glob("train/*/*.avi")) ... + list(dataset_root_path.glob("val/*/*.avi")) @@ -148,9 +148,9 @@ For the validation and evaluation splits, you wouldn't want to have video clips Next up, you will derive the set of labels present in the dataset. Also, create two dictionaries that'll be helpful when initializing the model: * `label2id`: maps the class names to integers. -* `id2label`: maps the integers to class names. +* `id2label`: maps the integers to class names. -```py +```py >>> class_labels = sorted({str(path).split("/")[2] for path in all_video_file_paths}) >>> label2id = {label: i for i, label in enumerate(class_labels)} >>> id2label = {i: label for label, i in label2id.items()} @@ -166,7 +166,7 @@ There are 10 unique classes. For each class, there are 30 videos in the training Instantiate a video classification model from a pretrained checkpoint and its associated image processor. The model's encoder comes with pre-trained parameters, and the classification head is randomly initialized. The image processor will come in handy when writing the preprocessing pipeline for our dataset. -```py +```py >>> from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification >>> model_ckpt = "MCG-NJU/videomae-base" @@ -191,13 +191,13 @@ You should probably TRAIN this model on a down-stream task to be able to use it The warning is telling us we are throwing away some weights (e.g. the weights and bias of the `classifier` layer) and randomly initializing some others (the weights and bias of a new `classifier` layer). This is expected in this case, because we are adding a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do. -**Note** that [this checkpoint](https://huggingface.co/MCG-NJU/videomae-base-finetuned-kinetics) leads to better performance on this task as the checkpoint was obtained by fine-tuning on a similar downstream task having considerable domain overlap. You can check out [this checkpoint](https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset) which was obtained by fine-tuning `MCG-NJU/videomae-base-finetuned-kinetics`. 
+**Note** that [this checkpoint](https://huggingface.co/MCG-NJU/videomae-base-finetuned-kinetics) leads to better performance on this task as the checkpoint was obtained by fine-tuning on a similar downstream task having considerable domain overlap. You can check out [this checkpoint](https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset) which was obtained by fine-tuning `MCG-NJU/videomae-base-finetuned-kinetics`. ## Prepare the datasets for training -For preprocessing the videos, you will leverage the [PyTorchVideo library](https://pytorchvideo.org/). Start by importing the dependencies we need. +For preprocessing the videos, you will leverage the [PyTorchVideo library](https://pytorchvideo.org/). Start by importing the dependencies we need. -```py +```py >>> import pytorchvideo.data >>> from pytorchvideo.transforms import ( @@ -218,7 +218,7 @@ For preprocessing the videos, you will leverage the [PyTorchVideo library](https ... ) ``` -For the training dataset transformations, use a combination of uniform temporal subsampling, pixel normalization, random cropping, and random horizontal flipping. For the validation and evaluation dataset transformations, keep the same transformation chain except for random cropping and horizontal flipping. To learn more about the details of these transformations check out the [official documentation of PyTorchVideo](https://pytorchvideo.org). +For the training dataset transformations, use a combination of uniform temporal subsampling, pixel normalization, random cropping, and random horizontal flipping. For the validation and evaluation dataset transformations, keep the same transformation chain except for random cropping and horizontal flipping. To learn more about the details of these transformations check out the [official documentation of PyTorchVideo](https://pytorchvideo.org). Use the `image_processor` associated with the pre-trained model to obtain the following information: @@ -243,9 +243,9 @@ Start by defining some constants. >>> clip_duration = num_frames_to_sample * sample_rate / fps ``` -Now, define the dataset-specific transformations and the datasets respectively. Starting with the training set: +Now, define the dataset-specific transformations and the datasets respectively. Starting with the training set: -```py +```py >>> train_transform = Compose( ... [ ... ApplyTransformToKey( @@ -272,9 +272,9 @@ Now, define the dataset-specific transformations and the datasets respectively. ... ) ``` -The same sequence of workflow can be applied to the validation and evaluation sets: +The same sequence of workflow can be applied to the validation and evaluation sets: -```py +```py >>> val_transform = Compose( ... [ ... ApplyTransformToKey( @@ -306,7 +306,7 @@ The same sequence of workflow can be applied to the validation and evaluation se ... ) ``` -**Note**: The above dataset pipelines are taken from the [official PyTorchVideo example](https://pytorchvideo.org/docs/tutorial_classification#dataset). We're using the [`pytorchvideo.data.Ucf101()`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.Ucf101) function because it's tailored for the UCF-101 dataset. Under the hood, it returns a [`pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.LabeledVideoDataset) object. `LabeledVideoDataset` class is the base class for all things video in the PyTorchVideo dataset. 
So, if you want to use a custom dataset not supported off-the-shelf by PyTorchVideo, you can extend the `LabeledVideoDataset` class accordingly. Refer to the `data` API [documentation to](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html) learn more. Also, if your dataset follows a similar structure (as shown above), then using the `pytorchvideo.data.Ucf101()` should work just fine. +**Note**: The above dataset pipelines are taken from the [official PyTorchVideo example](https://pytorchvideo.org/docs/tutorial_classification#dataset). We're using the [`pytorchvideo.data.Ucf101()`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.Ucf101) function because it's tailored for the UCF-101 dataset. Under the hood, it returns a [`pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.LabeledVideoDataset) object. `LabeledVideoDataset` class is the base class for all things video in the PyTorchVideo dataset. So, if you want to use a custom dataset not supported off-the-shelf by PyTorchVideo, you can extend the `LabeledVideoDataset` class accordingly. Refer to the `data` API [documentation to](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html) learn more. Also, if your dataset follows a similar structure (as shown above), then using the `pytorchvideo.data.Ucf101()` should work just fine. You can access the `num_videos` argument to know the number of videos in the dataset. @@ -315,9 +315,9 @@ You can access the `num_videos` argument to know the number of videos in the dat # (300, 30, 75) ``` -## Visualize the preprocessed video for better debugging +## Visualize the preprocessed video for better debugging -```py +```py >>> import imageio >>> import numpy as np >>> from IPython.display import Image @@ -330,7 +330,7 @@ You can access the `num_videos` argument to know the number of videos in the dat >>> def create_gif(video_tensor, filename="sample.gif"): ... """Prepares a GIF from a video tensor. -... +... ... The video tensor is expected to have the following shape: ... (num_frames, num_channels, height, width). ... """ @@ -357,14 +357,14 @@ You can access the `num_videos` argument to know the number of videos in the dat Person playing basketball -## Train the model +## Train the model Leverage [`Trainer`](https://huggingface.co/docs/transformers/main_classes/trainer) from 🤗 Transformers for training the model. To instantiate a `Trainer`, you need to define the training configuration and an evaluation metric. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to configure the training. It requires an output folder name, which will be used to save the checkpoints of the model. It also helps sync all the information in the model repository on 🤗 Hub. Most of the training arguments are self-explanatory, but one that is quite important here is `remove_unused_columns=False`. This one will drop any features not used by the model's call function. By default it's `True` because usually it's ideal to drop unused feature columns, making it easier to unpack inputs into the model's call function. But, in this case, you need the unused features ('video' in particular) in order to create `pixel_values` (which is a mandatory key our model expects in its inputs). 
-```py +```py >>> from transformers import TrainingArguments, Trainer >>> model_name = model_ckpt.split("/")[-1] @@ -388,7 +388,7 @@ Most of the training arguments are self-explanatory, but one that is quite impor ... ) ``` -The dataset returned by `pytorchvideo.data.Ucf101()` doesn't implement the `__len__` method. As such, we must define `max_steps` when instantiating `TrainingArguments`. +The dataset returned by `pytorchvideo.data.Ucf101()` doesn't implement the `__len__` method. As such, we must define `max_steps` when instantiating `TrainingArguments`. Next, you need to define a function to compute the metrics from the predictions, which will use the `metric` you'll load now. The only preprocessing you have to do is to take the argmax of our predicted logits: @@ -409,7 +409,7 @@ In the [VideoMAE paper](https://arxiv.org/abs/2203.12602), the authors use the f Also, define a `collate_fn`, which will be used to batch examples together. Each batch consists of 2 keys, namely `pixel_values` and `labels`. -```py +```py >>> def collate_fn(examples): ... # permute to (num_frames, num_channels, height, width) ... pixel_values = torch.stack( @@ -421,13 +421,13 @@ Also, define a `collate_fn`, which will be used to batch examples together. Each Then you just pass all of this along with the datasets to `Trainer`: -```py +```py >>> trainer = Trainer( ... model, ... args, ... train_dataset=train_dataset, ... eval_dataset=val_dataset, -... tokenizer=image_processor, +... processing_class=image_processor, ... compute_metrics=compute_metrics, ... data_collator=collate_fn, ... ) @@ -437,7 +437,7 @@ You might wonder why you passed along the `image_processor` as a tokenizer when Now fine-tune our model by calling the `train` method: -```py +```py >>> train_results = trainer.train() ``` @@ -453,7 +453,7 @@ Great, now that you have fine-tuned a model, you can use it for inference! Load a video for inference: -```py +```py >>> sample_test_video = next(iter(test_dataset)) ``` @@ -507,10 +507,10 @@ Now, pass your input to the model and return the `logits`: >>> logits = run_inference(trained_model, sample_test_video["video"]) ``` -Decoding the `logits`, we get: +Decoding the `logits`, we get: -```py +```py >>> predicted_class_idx = logits.argmax(-1).item() >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) # Predicted class: BasketballDunk -``` \ No newline at end of file +``` diff --git a/docs/source/en/tasks/visual_question_answering.md b/docs/source/en/tasks/visual_question_answering.md index c45f12dbc1e7a8..7083d8c98b932e 100644 --- a/docs/source/en/tasks/visual_question_answering.md +++ b/docs/source/en/tasks/visual_question_answering.md @@ -18,14 +18,14 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -Visual Question Answering (VQA) is the task of answering open-ended questions based on an image. -The input to models supporting this task is typically a combination of an image and a question, and the output is an +Visual Question Answering (VQA) is the task of answering open-ended questions based on an image. +The input to models supporting this task is typically a combination of an image and a question, and the output is an answer expressed in natural language. Some noteworthy use case examples for VQA include: * Accessibility applications for visually impaired individuals. * Education: posing questions about visual materials presented in lectures or textbooks. VQA can also be utilized in interactive museum exhibits or historical sites. 
-* Customer service and e-commerce: VQA can enhance user experience by letting users ask questions about products. +* Customer service and e-commerce: VQA can enhance user experience by letting users ask questions about products. * Image retrieval: VQA models can be used to retrieve images with specific characteristics. For example, the user can ask "Is there a dog?" to find all images with dogs from a set of images. In this guide you'll learn how to: @@ -36,15 +36,15 @@ In this guide you'll learn how to: ## Fine-tuning ViLT -ViLT model incorporates text embeddings into a Vision Transformer (ViT), allowing it to have a minimal design for -Vision-and-Language Pre-training (VLP). This model can be used for several downstream tasks. For the VQA task, a classifier -head is placed on top (a linear layer on top of the final hidden state of the `[CLS]` token) and randomly initialized. +ViLT model incorporates text embeddings into a Vision Transformer (ViT), allowing it to have a minimal design for +Vision-and-Language Pre-training (VLP). This model can be used for several downstream tasks. For the VQA task, a classifier +head is placed on top (a linear layer on top of the final hidden state of the `[CLS]` token) and randomly initialized. Visual Question Answering is thus treated as a **classification problem**. -More recent models, such as BLIP, BLIP-2, and InstructBLIP, treat VQA as a generative task. Later in this guide we -illustrate how to use them for zero-shot VQA inference. +More recent models, such as BLIP, BLIP-2, and InstructBLIP, treat VQA as a generative task. Later in this guide we +illustrate how to use them for zero-shot VQA inference. -Before you begin, make sure you have all the necessary libraries installed. +Before you begin, make sure you have all the necessary libraries installed. ```bash pip install -q transformers datasets @@ -67,15 +67,15 @@ Let's define the model checkpoint as a global variable. ## Load the data -For illustration purposes, in this guide we use a very small sample of the annotated visual question answering `Graphcore/vqa` dataset. +For illustration purposes, in this guide we use a very small sample of the annotated visual question answering `Graphcore/vqa` dataset. You can find the full dataset on [🤗 Hub](https://huggingface.co/datasets/Graphcore/vqa). -As an alternative to the [`Graphcore/vqa` dataset](https://huggingface.co/datasets/Graphcore/vqa), you can download the -same data manually from the official [VQA dataset page](https://visualqa.org/download.html). If you prefer to follow the +As an alternative to the [`Graphcore/vqa` dataset](https://huggingface.co/datasets/Graphcore/vqa), you can download the +same data manually from the official [VQA dataset page](https://visualqa.org/download.html). If you prefer to follow the tutorial with your custom data, check out how to [Create an image dataset](https://huggingface.co/docs/datasets/image_dataset#loading-script) -guide in the 🤗 Datasets documentation. +guide in the 🤗 Datasets documentation. 
-Let's load the first 200 examples from the validation split and explore the dataset's features: +Let's load the first 200 examples from the validation split and explore the dataset's features: ```python >>> from datasets import load_dataset @@ -104,20 +104,20 @@ Let's take a look at an example to understand the dataset's features: 0.30000001192092896]}} ``` -The features relevant to the task include: +The features relevant to the task include: * `question`: the question to be answered from the image * `image_id`: the path to the image the question refers to * `label`: the annotations -We can remove the rest of the features as they won't be necessary: +We can remove the rest of the features as they won't be necessary: -```py +```py >>> dataset = dataset.remove_columns(['question_type', 'question_id', 'answer_type']) ``` -As you can see, the `label` feature contains several answers to the same question (called `ids` here) collected by different human annotators. -This is because the answer to a question can be subjective. In this case, the question is "where is he looking?". Some people -annotated this with "down", others with "at table", another one with "skateboard", etc. +As you can see, the `label` feature contains several answers to the same question (called `ids` here) collected by different human annotators. +This is because the answer to a question can be subjective. In this case, the question is "where is he looking?". Some people +annotated this with "down", others with "at table", another one with "skateboard", etc. Take a look at the image and consider which answer would you give: @@ -132,14 +132,14 @@ Take a look at the image and consider which answer would you give: VQA Image Example -Due to the questions' and answers' ambiguity, datasets like this are treated as a multi-label classification problem (as -multiple answers are possibly valid). Moreover, rather than just creating a one-hot encoded vector, one creates a +Due to the questions' and answers' ambiguity, datasets like this are treated as a multi-label classification problem (as +multiple answers are possibly valid). Moreover, rather than just creating a one-hot encoded vector, one creates a soft encoding, based on the number of times a certain answer appeared in the annotations. -For instance, in the example above, because the answer "down" is selected way more often than other answers, it has a -score (called `weight` in the dataset) of 1.0, and the rest of the answers have scores < 1.0. +For instance, in the example above, because the answer "down" is selected way more often than other answers, it has a +score (called `weight` in the dataset) of 1.0, and the rest of the answers have scores < 1.0. -To later instantiate the model with an appropriate classification head, let's create two dictionaries: one that maps +To later instantiate the model with an appropriate classification head, let's create two dictionaries: one that maps the label name to an integer and vice versa: ```py @@ -150,10 +150,10 @@ the label name to an integer and vice versa: >>> unique_labels = list(set(flattened_labels)) >>> label2id = {label: idx for idx, label in enumerate(unique_labels)} ->>> id2label = {idx: label for label, idx in label2id.items()} +>>> id2label = {idx: label for label, idx in label2id.items()} ``` -Now that we have the mappings, we can replace the string answers with their ids, and flatten the dataset for a more convenient further preprocessing. 
+Now that we have the mappings, we can replace the string answers with their ids, and flatten the dataset for a more convenient further preprocessing. ```python >>> def replace_ids(inputs): @@ -172,21 +172,21 @@ Now that we have the mappings, we can replace the string answers with their ids, ## Preprocessing data -The next step is to load a ViLT processor to prepare the image and text data for the model. +The next step is to load a ViLT processor to prepare the image and text data for the model. [`ViltProcessor`] wraps a BERT tokenizer and ViLT image processor into a convenient single processor: -```py +```py >>> from transformers import ViltProcessor >>> processor = ViltProcessor.from_pretrained(model_checkpoint) ``` -To preprocess the data we need to encode the images and questions using the [`ViltProcessor`]. The processor will use -the [`BertTokenizerFast`] to tokenize the text and create `input_ids`, `attention_mask` and `token_type_ids` for the text data. +To preprocess the data we need to encode the images and questions using the [`ViltProcessor`]. The processor will use +the [`BertTokenizerFast`] to tokenize the text and create `input_ids`, `attention_mask` and `token_type_ids` for the text data. As for images, the processor will leverage [`ViltImageProcessor`] to resize and normalize the image, and create `pixel_values` and `pixel_mask`. -All these preprocessing steps are done under the hood, we only need to call the `processor`. However, we still need to -prepare the target labels. In this representation, each element corresponds to a possible answer (label). For correct answers, the element holds +All these preprocessing steps are done under the hood, we only need to call the `processor`. However, we still need to +prepare the target labels. In this representation, each element corresponds to a possible answer (label). For correct answers, the element holds their respective score (weight), while the remaining elements are set to zero. The following function applies the `processor` to the images and questions and formats the labels as described above: @@ -197,13 +197,13 @@ The following function applies the `processor` to the images and questions and f >>> def preprocess_data(examples): ... image_paths = examples['image_id'] ... images = [Image.open(image_path) for image_path in image_paths] -... texts = examples['question'] +... texts = examples['question'] ... encoding = processor(images, texts, padding="max_length", truncation=True, return_tensors="pt") ... for k, v in encoding.items(): ... encoding[k] = v.squeeze() - + ... targets = [] ... for labels, scores in zip(examples['label.ids'], examples['label.weights']): @@ -211,15 +211,15 @@ The following function applies the `processor` to the images and questions and f ... for label, score in zip(labels, scores): ... target[label] = score - + ... targets.append(target) ... encoding["labels"] = targets - + ... return encoding ``` -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.map`] function. You can speed up `map` by +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. At this point, feel free to remove the columns you don't need. ```py @@ -241,7 +241,7 @@ As a final step, create a batch of examples using [`DefaultDataCollator`]: ## Train the model -You’re ready to start training your model now! 
Load ViLT with [`ViltForQuestionAnswering`]. Specify the number of labels +You’re ready to start training your model now! Load ViLT with [`ViltForQuestionAnswering`]. Specify the number of labels along with the label mappings: ```py @@ -282,14 +282,14 @@ At this point, only three steps remain: ... args=training_args, ... data_collator=data_collator, ... train_dataset=processed_dataset, -... tokenizer=processor, +... processing_class=processor, ... ) ``` 3. Call [`~Trainer.train`] to finetune your model. ```py ->>> trainer.train() +>>> trainer.train() ``` Once training is completed, share your model to the Hub with the [`~Trainer.push_to_hub`] method to share your final model on the 🤗 Hub: @@ -309,7 +309,7 @@ way to try out your fine-tuned model for inference is to use it in a [`Pipeline` >>> pipe = pipeline("visual-question-answering", model="MariaK/vilt_finetuned_200") ``` -The model in this guide has only been trained on 200 examples, so don't expect a lot from it. Let's see if it at least +The model in this guide has only been trained on 200 examples, so don't expect a lot from it. Let's see if it at least learned something from the data and take the first example from the dataset to illustrate inference: ```py @@ -352,13 +352,13 @@ Predicted answer: down ## Zero-shot VQA -The previous model treated VQA as a classification task. Some recent models, such as BLIP, BLIP-2, and InstructBLIP approach -VQA as a generative task. Let's take [BLIP-2](../model_doc/blip-2) as an example. It introduced a new visual-language pre-training -paradigm in which any combination of pre-trained vision encoder and LLM can be used (learn more in the [BLIP-2 blog post](https://huggingface.co/blog/blip-2)). -This enables achieving state-of-the-art results on multiple visual-language tasks including visual question answering. +The previous model treated VQA as a classification task. Some recent models, such as BLIP, BLIP-2, and InstructBLIP approach +VQA as a generative task. Let's take [BLIP-2](../model_doc/blip-2) as an example. It introduced a new visual-language pre-training +paradigm in which any combination of pre-trained vision encoder and LLM can be used (learn more in the [BLIP-2 blog post](https://huggingface.co/blog/blip-2)). +This enables achieving state-of-the-art results on multiple visual-language tasks including visual question answering. -Let's illustrate how you can use this model for VQA. First, let's load the model. Here we'll explicitly send the model to a -GPU, if available, which we didn't need to do earlier when training, as [`Trainer`] handles this automatically: +Let's illustrate how you can use this model for VQA. First, let's load the model. 
Here we'll explicitly send the model to a +GPU, if available, which we didn't need to do earlier when training, as [`Trainer`] handles this automatically: ```py >>> from transformers import AutoProcessor, Blip2ForConditionalGeneration @@ -370,9 +370,9 @@ GPU, if available, which we didn't need to do earlier when training, as [`Traine >>> model.to(device) ``` -The model takes image and text as input, so let's use the exact same image/question pair from the first example in the VQA dataset: +The model takes image and text as input, so let's use the exact same image/question pair from the first example in the VQA dataset: -```py +```py >>> example = dataset[0] >>> image = Image.open(example['image_id']) >>> question = example['question'] @@ -381,7 +381,7 @@ The model takes image and text as input, so let's use the exact same image/quest To use BLIP-2 for visual question answering task, the textual prompt has to follow a specific format: `Question: {} Answer:`. ```py ->>> prompt = f"Question: {question} Answer:" +>>> prompt = f"Question: {question} Answer:" ``` Now we need to preprocess the image/prompt with the model's processor, pass the processed input through the model, and decode the output: @@ -392,10 +392,9 @@ Now we need to preprocess the image/prompt with the model's processor, pass the >>> generated_ids = model.generate(**inputs, max_new_tokens=10) >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() >>> print(generated_text) -"He is looking at the crowd" +"He is looking at the crowd" ``` -As you can see, the model recognized the crowd, and the direction of the face (looking down), however, it seems to miss -the fact the crowd is behind the skater. Still, in cases where acquiring human-annotated datasets is not feasible, this +As you can see, the model recognized the crowd, and the direction of the face (looking down), however, it seems to miss +the fact the crowd is behind the skater. Still, in cases where acquiring human-annotated datasets is not feasible, this approach can quickly produce useful results. 
- diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 812c5fe1a2a89c..f9ea3337699444 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -81,7 +81,7 @@ trainer = Trainer( args=training_args, train_dataset=dataset["train"], eval_dataset=dataset["test"], - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) @@ -153,7 +153,7 @@ from transformers import TrainerCallback class EarlyStoppingCallback(TrainerCallback): def __init__(self, num_steps=10): self.num_steps = num_steps - + def on_step_end(self, args, state, control, **kwargs): if state.global_step >= self.num_steps: return {"should_training_stop": True} @@ -171,7 +171,7 @@ trainer = Trainer( args=training_args, train_dataset=dataset["train"], eval_dataset=dataset["test"], - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, callback=[EarlyStoppingCallback()], @@ -289,7 +289,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_config(config).to(0) trainer = trl.SFTTrainer( - model=model, + model=model, args=args, train_dataset=train_dataset, dataset_text_field='text', @@ -327,7 +327,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_config(config).to(0) trainer = trl.SFTTrainer( - model=model, + model=model, args=args, train_dataset=train_dataset, dataset_text_field='text', @@ -370,7 +370,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_config(config).to(0) trainer = trl.SFTTrainer( - model=model, + model=model, args=args, train_dataset=train_dataset, dataset_text_field='text', @@ -419,8 +419,8 @@ The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. ## LOMO optimizer -The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195). -They both consist of an efficient full-parameter fine-tuning method. These optimizers fuse the gradient computation and the parameter update in one step to reduce memory usage. Supported optimizers for LOMO are `"lomo"` and `"adalomo"`. First either install LOMO from pypi `pip install lomo-optim` or install it from source with `pip install git+https://github.com/OpenLMLab/LOMO.git`. +The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195). +They both consist of an efficient full-parameter fine-tuning method. These optimizers fuse the gradient computation and the parameter update in one step to reduce memory usage. Supported optimizers for LOMO are `"lomo"` and `"adalomo"`. First either install LOMO from pypi `pip install lomo-optim` or install it from source with `pip install git+https://github.com/OpenLMLab/LOMO.git`. 
@@ -457,7 +457,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0) trainer = trl.SFTTrainer( - model=model, + model=model, args=args, train_dataset=train_dataset, dataset_text_field='text', @@ -579,8 +579,8 @@ To use Accelerate with [`Trainer`], run the [`accelerate.config`](https://huggin ```yml -compute_environment: LOCAL_MACHINE -distributed_type: MULTI_GPU +compute_environment: LOCAL_MACHINE +distributed_type: MULTI_GPU downcast_bf16: 'no' gpu_ids: all machine_rank: 0 #change rank as per the node @@ -654,8 +654,8 @@ use_cpu: false ```yml -compute_environment: LOCAL_MACHINE -deepspeed_config: +compute_environment: LOCAL_MACHINE +deepspeed_config: gradient_accumulation_steps: 1 gradient_clipping: 0.7 offload_optimizer_device: cpu diff --git a/docs/source/es/tasks/asr.md b/docs/source/es/tasks/asr.md index 7d3133af472f64..41e9f82e35f7f1 100644 --- a/docs/source/es/tasks/asr.md +++ b/docs/source/es/tasks/asr.md @@ -276,7 +276,7 @@ En este punto, solo quedan tres pasos: ... args=training_args, ... train_dataset=encoded_minds["train"], ... eval_dataset=encoded_minds["test"], -... tokenizer=processor.feature_extractor, +... processing_class=processor.feature_extractor, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/es/tasks/image_classification.md b/docs/source/es/tasks/image_classification.md index 4e3696c505b030..1bea46884202fe 100644 --- a/docs/source/es/tasks/image_classification.md +++ b/docs/source/es/tasks/image_classification.md @@ -160,7 +160,7 @@ Al llegar a este punto, solo quedan tres pasos: ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... tokenizer=image_processor, +... processing_class=image_processor, ... ) >>> trainer.train() diff --git a/docs/source/es/tasks/multiple_choice.md b/docs/source/es/tasks/multiple_choice.md index 959416f149c357..32df3401d737de 100644 --- a/docs/source/es/tasks/multiple_choice.md +++ b/docs/source/es/tasks/multiple_choice.md @@ -225,7 +225,7 @@ En este punto, solo quedan tres pasos: ... args=training_args, ... train_dataset=tokenized_swag["train"], ... eval_dataset=tokenized_swag["validation"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), ... ) diff --git a/docs/source/es/tasks/question_answering.md b/docs/source/es/tasks/question_answering.md index ca43aac9ae9e7a..42a6e4b6e1bc4e 100644 --- a/docs/source/es/tasks/question_answering.md +++ b/docs/source/es/tasks/question_answering.md @@ -195,7 +195,7 @@ En este punto, solo quedan tres pasos: ... args=training_args, ... train_dataset=tokenized_squad["train"], ... eval_dataset=tokenized_squad["validation"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... ) diff --git a/docs/source/es/tasks/summarization.md b/docs/source/es/tasks/summarization.md index e6a9532f660387..c9060cba6b771d 100644 --- a/docs/source/es/tasks/summarization.md +++ b/docs/source/es/tasks/summarization.md @@ -155,7 +155,7 @@ En este punto, solo faltan tres pasos: ... args=training_args, ... train_dataset=tokenized_billsum["train"], ... eval_dataset=tokenized_billsum["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... 
) diff --git a/docs/source/es/trainer.md b/docs/source/es/trainer.md index 57fcaa62900572..dab83e9a9d9ebc 100644 --- a/docs/source/es/trainer.md +++ b/docs/source/es/trainer.md @@ -14,7 +14,7 @@ rendered properly in your Markdown viewer. --> -# El Trainer +# El Trainer El [`Trainer`] es un bucle completo de entrenamiento y evaluación para modelos de PyTorch implementado en la biblioteca Transformers. Solo necesitas pasarle las piezas necesarias para el entrenamiento (modelo, tokenizador, conjunto de datos, función de evaluación, hiperparámetros de entrenamiento, etc.), y la clase [`Trainer`] se encarga del resto. Esto facilita comenzar a entrenar más rápido sin tener que escribir manualmente tu propio bucle de entrenamiento. Pero al mismo tiempo, [`Trainer`] es muy personalizable y ofrece una gran cantidad de opciones de entrenamiento para que puedas adaptarlo a tus necesidades exactas de entrenamiento. @@ -79,7 +79,7 @@ trainer = Trainer( args=training_args, train_dataset=dataset["train"], eval_dataset=dataset["test"], - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) @@ -151,7 +151,7 @@ from transformers import TrainerCallback class EarlyStoppingCallback(TrainerCallback): def __init__(self, num_steps=10): self.num_steps = num_steps - + def on_step_end(self, args, state, control, **kwargs): if state.global_step >= self.num_steps: return {"should_training_stop": True} @@ -169,7 +169,7 @@ trainer = Trainer( args=training_args, train_dataset=dataset["train"], eval_dataset=dataset["test"], - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, callback=[EarlyStoppingCallback()], @@ -265,8 +265,8 @@ Para usar Accelerate con [`Trainer`], ejecuta el comando [`accelerate.config`](h ```yml -compute_environment: LOCAL_MACHINE -distributed_type: MULTI_GPU +compute_environment: LOCAL_MACHINE +distributed_type: MULTI_GPU downcast_bf16: 'no' gpu_ids: all machine_rank: 0 #change rank as per the node @@ -337,8 +337,8 @@ use_cpu: false ```yml -compute_environment: LOCAL_MACHINE -deepspeed_config: +compute_environment: LOCAL_MACHINE +deepspeed_config: gradient_accumulation_steps: 1 gradient_clipping: 0.7 offload_optimizer_device: cpu @@ -406,4 +406,4 @@ accelerate launch --num_processes=2 \ --overwrite_output_dir ``` -Consulta el tutorial [Lanzamiento de tus scripts con Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch) para obtener más información sobre `accelerate_launch` y las configuraciones personalizadas. \ No newline at end of file +Consulta el tutorial [Lanzamiento de tus scripts con Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch) para obtener más información sobre `accelerate_launch` y las configuraciones personalizadas. diff --git a/docs/source/fr/quicktour.md b/docs/source/fr/quicktour.md index df0233ae82aabc..3cc2a8c5faac76 100644 --- a/docs/source/fr/quicktour.md +++ b/docs/source/fr/quicktour.md @@ -169,7 +169,7 @@ Si vous ne parvenez pas à trouver un modèle adapté à votre cas d'utilisation -Les classes [`AutoModelForSequenceClassification`] et [`AutoTokenizer`] fonctionnent ensemble pour créer un [`pipeline`] comme celui que vous avez utilisé ci-dessus. Une [AutoClass](./model_doc/auto) est un raccourci qui récupère automatiquement l'architecture d'un modèle pré-entraîné à partir de son nom ou de son emplacement. 
Il vous suffit de sélectionner l'`AutoClass` appropriée à votre tâche et la classe de prétraitement qui lui est associée. +Les classes [`AutoModelForSequenceClassification`] et [`AutoTokenizer`] fonctionnent ensemble pour créer un [`pipeline`] comme celui que vous avez utilisé ci-dessus. Une [AutoClass](./model_doc/auto) est un raccourci qui récupère automatiquement l'architecture d'un modèle pré-entraîné à partir de son nom ou de son emplacement. Il vous suffit de sélectionner l'`AutoClass` appropriée à votre tâche et la classe de prétraitement qui lui est associée. Reprenons l'exemple de la section précédente et voyons comment vous pouvez utiliser l'`AutoClass` pour reproduire les résultats du [`pipeline`]. @@ -479,7 +479,7 @@ Maintenant, rassemblez tous ces éléments dans un [`Trainer`] : ... args=training_args, ... train_dataset=dataset["train"], ... eval_dataset=dataset["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... ) # doctest: +SKIP ``` @@ -496,7 +496,7 @@ Pour les tâches - comme la traduction ou la génération de résumé - qui util -Vous pouvez personnaliser le comportement de la boucle d'apprentissage en redéfinissant les méthodes à l'intérieur de [`Trainer`]. Cela vous permet de personnaliser des caractéristiques telles que la fonction de perte, l'optimiseur et le planificateur. Consultez la documentation de [`Trainer`] pour savoir quelles méthodes peuvent être redéfinies. +Vous pouvez personnaliser le comportement de la boucle d'apprentissage en redéfinissant les méthodes à l'intérieur de [`Trainer`]. Cela vous permet de personnaliser des caractéristiques telles que la fonction de perte, l'optimiseur et le planificateur. Consultez la documentation de [`Trainer`] pour savoir quelles méthodes peuvent être redéfinies. L'autre moyen de personnaliser la boucle d'apprentissage est d'utiliser les [Callbacks](./main_classes/callback). Vous pouvez utiliser les callbacks pour intégrer d'autres bibliothèques et inspecter la boucle d'apprentissage afin de suivre la progression ou d'arrêter l'apprentissage plus tôt. Les callbacks ne modifient rien dans la boucle d'apprentissage elle-même. Pour personnaliser quelque chose comme la fonction de perte, vous devez redéfinir le [`Trainer`] à la place. diff --git a/docs/source/ja/hpo_train.md b/docs/source/ja/hpo_train.md index 85da3616f80e1c..90591daf8b204f 100644 --- a/docs/source/ja/hpo_train.md +++ b/docs/source/ja/hpo_train.md @@ -24,7 +24,7 @@ rendered properly in your Markdown viewer. これらを使用する前に、ハイパーパラメーター検索バックエンドをインストールする必要があります。 ```bash -pip install optuna/sigopt/wandb/ray[tune] +pip install optuna/sigopt/wandb/ray[tune] ``` ## How to enable Hyperparameter search in example @@ -119,7 +119,7 @@ Wandbについては、[object_parameter](https://docs.wandb.ai/guides/sweeps/co ... train_dataset=small_train_dataset, ... eval_dataset=small_eval_dataset, ... compute_metrics=compute_metrics, -... tokenizer=tokenizer, +... processing_class=tokenizer, ... model_init=model_init, ... data_collator=data_collator, ... 
) @@ -142,9 +142,3 @@ Wandbについては、[object_parameter](https://docs.wandb.ai/guides/sweeps/co ## Hyperparameter search For DDP finetune 現在、DDP(Distributed Data Parallel)のためのハイパーパラメーター検索は、Optuna と SigOpt に対して有効になっています。ランクゼロプロセスのみが検索トライアルを生成し、他のランクに引数を渡します。 - - - - - - diff --git a/docs/source/ja/model_doc/auto.md b/docs/source/ja/model_doc/auto.md index d4baaf70e6fd48..492c46c79ea905 100644 --- a/docs/source/ja/model_doc/auto.md +++ b/docs/source/ja/model_doc/auto.md @@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel) ### FlaxAutoModelForVision2Seq [[autodoc]] FlaxAutoModelForVision2Seq + +### AutoModelForImageTextToText + +[[autodoc]] AutoModelForImageTextToText diff --git a/docs/source/ja/quicktour.md b/docs/source/ja/quicktour.md index 0e20d1eee9743c..e03dea33cbd189 100644 --- a/docs/source/ja/quicktour.md +++ b/docs/source/ja/quicktour.md @@ -516,7 +516,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], ... args=training_args, ... train_dataset=dataset["train"], ... eval_dataset=dataset["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... ) # doctest: +SKIP ``` diff --git a/docs/source/ja/tasks/asr.md b/docs/source/ja/tasks/asr.md index 9226f5b414fdfd..ebefeba831a03e 100644 --- a/docs/source/ja/tasks/asr.md +++ b/docs/source/ja/tasks/asr.md @@ -148,7 +148,7 @@ MInDS-14 データセットのサンプリング レートは 8000kHz です ( ... return batch ``` -データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.Dataset.map`] 関数を使用します。 `num_proc` パラメータを使用してプロセスの数を増やすことで、`map` を高速化できます。 [`~datasets.Dataset.remove_columns`] メソッドを使用して、不要な列を削除します。 +データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.Dataset.map`] 関数を使用します。 `num_proc` パラメータを使用してプロセスの数を増やすことで、`map` を高速化できます。 [`~datasets.Dataset.remove_columns`] メソッドを使用して、不要な列を削除します。 ```py >>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4) @@ -281,7 +281,7 @@ MInDS-14 データセットのサンプリング レートは 8000kHz です ( ... args=training_args, ... train_dataset=encoded_minds["train"], ... eval_dataset=encoded_minds["test"], -... tokenizer=processor, +... processing_class=processor, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/ja/tasks/audio_classification.md b/docs/source/ja/tasks/audio_classification.md index d32050072f962e..aa38d12d4ef0cf 100644 --- a/docs/source/ja/tasks/audio_classification.md +++ b/docs/source/ja/tasks/audio_classification.md @@ -233,7 +233,7 @@ MInDS-14 データセットのサンプリング レートは 8000khz です ( ... args=training_args, ... train_dataset=encoded_minds["train"], ... eval_dataset=encoded_minds["test"], -... tokenizer=feature_extractor, +... processing_class=feature_extractor, ... compute_metrics=compute_metrics, ... ) @@ -320,4 +320,4 @@ MInDS-14 データセットのサンプリング レートは 8000khz です ( 'cash_deposit' ``` - \ No newline at end of file + diff --git a/docs/source/ja/tasks/document_question_answering.md b/docs/source/ja/tasks/document_question_answering.md index 847ec8441ccf76..f07cc6dff28eae 100644 --- a/docs/source/ja/tasks/document_question_answering.md +++ b/docs/source/ja/tasks/document_question_answering.md @@ -364,7 +364,7 @@ end_index 18 自分で実装したい場合は、[質問応答の章](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) を確認してください。 インスピレーションを得るためにハグフェイスコースの。 -## Train +## Train おめでとう!このガイドの最も難しい部分を無事にナビゲートできたので、独自のモデルをトレーニングする準備が整いました。 トレーニングには次の手順が含まれます。 @@ -423,7 +423,7 @@ end_index 18 ... data_collator=data_collator, ... train_dataset=encoded_train_dataset, ... eval_dataset=encoded_test_dataset, -... tokenizer=processor, +... 
processing_class=processor, ... ) >>> trainer.train() diff --git a/docs/source/ja/tasks/image_classification.md b/docs/source/ja/tasks/image_classification.md index 2202dc3a4f6498..013dfc286dce63 100644 --- a/docs/source/ja/tasks/image_classification.md +++ b/docs/source/ja/tasks/image_classification.md @@ -323,7 +323,7 @@ food["test"].set_transform(preprocess_val) ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... tokenizer=image_processor, +... processing_class=image_processor, ... compute_metrics=compute_metrics, ... ) @@ -551,4 +551,3 @@ Epoch 5/5 - diff --git a/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md b/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md index 30c0dbbf063040..1079121c6062bc 100644 --- a/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md +++ b/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md @@ -165,7 +165,7 @@ trainer = ImageDistilTrainer( train_dataset=processed_datasets["train"], eval_dataset=processed_datasets["validation"], data_collator=data_collator, - tokenizer=teacher_extractor, + processing_class=teacher_extractor, compute_metrics=compute_metrics, temperature=5, lambda_param=0.5 diff --git a/docs/source/ja/tasks/multiple_choice.md b/docs/source/ja/tasks/multiple_choice.md index 98e258f161b712..075a7a2cb76455 100644 --- a/docs/source/ja/tasks/multiple_choice.md +++ b/docs/source/ja/tasks/multiple_choice.md @@ -271,7 +271,7 @@ tokenized_swag = swag.map(preprocess_function, batched=True) ... args=training_args, ... train_dataset=tokenized_swag["train"], ... eval_dataset=tokenized_swag["validation"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/ja/tasks/object_detection.md b/docs/source/ja/tasks/object_detection.md index 1b1bfb3f8158a4..31e8effa54b224 100644 --- a/docs/source/ja/tasks/object_detection.md +++ b/docs/source/ja/tasks/object_detection.md @@ -371,7 +371,7 @@ DETR モデルをトレーニングできる「ラベル」。画像プロセッ ... args=training_args, ... data_collator=collate_fn, ... train_dataset=cppe5["train"], -... tokenizer=image_processor, +... processing_class=image_processor, ... ) >>> trainer.train() diff --git a/docs/source/ja/tasks/question_answering.md b/docs/source/ja/tasks/question_answering.md index b039272f45e80a..9217c211e6f973 100644 --- a/docs/source/ja/tasks/question_answering.md +++ b/docs/source/ja/tasks/question_answering.md @@ -227,7 +227,7 @@ pip install transformers datasets evaluate ... args=training_args, ... train_dataset=tokenized_squad["train"], ... eval_dataset=tokenized_squad["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... ) diff --git a/docs/source/ja/tasks/summarization.md b/docs/source/ja/tasks/summarization.md index 74152d5dbdaa5b..6784696e6c95a3 100644 --- a/docs/source/ja/tasks/summarization.md +++ b/docs/source/ja/tasks/summarization.md @@ -216,7 +216,7 @@ pip install transformers datasets evaluate rouge_score ... args=training_args, ... train_dataset=tokenized_billsum["train"], ... eval_dataset=tokenized_billsum["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... 
) diff --git a/docs/source/ja/tasks/text-to-speech.md b/docs/source/ja/tasks/text-to-speech.md index b302a19a0d5818..669d15730e24f8 100644 --- a/docs/source/ja/tasks/text-to-speech.md +++ b/docs/source/ja/tasks/text-to-speech.md @@ -125,7 +125,7 @@ dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) >>> processor = SpeechT5Processor.from_pretrained(checkpoint) ``` -### Text cleanup for SpeechT5 tokenization +### Text cleanup for SpeechT5 tokenization まずはテキストデータをクリーンアップすることから始めます。テキストを処理するには、プロセッサのトークナイザー部分が必要です。 @@ -442,7 +442,7 @@ SpeechT5 では、モデルのデコーダ部分への入力が 2 分の 1 に ターゲット シーケンスの長さが奇数である可能性がある場合、データ照合機能はバッチの最大長を切り捨てて、 2の倍数。 -```py +```py >>> data_collator = TTSDataCollatorWithPadding(processor=processor) ``` @@ -458,7 +458,7 @@ SpeechT5 では、モデルのデコーダ部分への入力が 2 分の 1 に `use_cache=True`オプションは、勾配チェックポイントと互換性がありません。トレーニングのために無効にします。 -```py +```py >>> model.config.use_cache = False ``` @@ -501,7 +501,7 @@ SpeechT5 では、モデルのデコーダ部分への入力が 2 分の 1 に ... train_dataset=dataset["train"], ... eval_dataset=dataset["test"], ... data_collator=data_collator, -... tokenizer=processor, +... processing_class=processor, ... ) ``` これで、トレーニングを開始する準備が整いました。トレーニングには数時間かかります。 GPU に応じて、 @@ -567,7 +567,7 @@ SpeechT5 では、モデルのデコーダ部分への入力が 2 分の 1 に ```py >>> from IPython.display import Audio ->>> Audio(output['audio'], rate=output['sampling_rate']) +>>> Audio(output['audio'], rate=output['sampling_rate']) ``` ### Run inference manually @@ -583,14 +583,14 @@ SpeechT5 では、モデルのデコーダ部分への入力が 2 分の 1 に テスト データセットから例を選択して、スピーカーの埋め込みを取得します。 -```py +```py >>> example = dataset["test"][304] >>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) ``` 入力テキストを定義し、トークン化します。 -```py +```py >>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!" >>> inputs = processor(text=text, return_tensors="pt") ``` diff --git a/docs/source/ja/tasks/token_classification.md b/docs/source/ja/tasks/token_classification.md index a7f5097f685918..4389aeacb5644b 100644 --- a/docs/source/ja/tasks/token_classification.md +++ b/docs/source/ja/tasks/token_classification.md @@ -295,7 +295,7 @@ pip install transformers datasets evaluate seqeval ... args=training_args, ... train_dataset=tokenized_wnut["train"], ... eval_dataset=tokenized_wnut["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/ja/tasks/translation.md b/docs/source/ja/tasks/translation.md index eb67604f9e1e6e..7fa45eac9cdb68 100644 --- a/docs/source/ja/tasks/translation.md +++ b/docs/source/ja/tasks/translation.md @@ -220,7 +220,7 @@ pip install transformers datasets evaluate sacrebleu ... args=training_args, ... train_dataset=tokenized_books["train"], ... eval_dataset=tokenized_books["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... 
) diff --git a/docs/source/ja/tasks/video_classification.md b/docs/source/ja/tasks/video_classification.md index ecfae843f2ae37..741356a6f5789e 100644 --- a/docs/source/ja/tasks/video_classification.md +++ b/docs/source/ja/tasks/video_classification.md @@ -61,7 +61,7 @@ pip install -q pytorchvideo transformers evaluate サブセットをダウンロードした後、圧縮アーカイブを抽出する必要があります。 -```py +```py >>> import tarfile >>> with tarfile.open(file_path) as t: @@ -127,7 +127,7 @@ UCF101_subset/ * `id2label`: 整数をクラス名にマッピングします。 -```py +```py >>> class_labels = sorted({str(path).split("/")[2] for path in all_video_file_paths}) >>> label2id = {label: i for i, label in enumerate(class_labels)} >>> id2label = {i: label for label, i in label2id.items()} @@ -143,7 +143,7 @@ UCF101_subset/ 事前トレーニングされたチェックポイントとそれに関連する画像プロセッサからビデオ分類モデルをインスタンス化します。モデルのエンコーダーには事前トレーニングされたパラメーターが付属しており、分類ヘッドはランダムに初期化されます。画像プロセッサは、データセットの前処理パイプラインを作成するときに役立ちます。 -```py +```py >>> from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification >>> model_ckpt = "MCG-NJU/videomae-base" @@ -175,7 +175,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it ビデオの前処理には、[PyTorchVideo ライブラリ](https://pytorchvideo.org/) を利用します。まず、必要な依存関係をインポートします。 -```py +```py >>> import pytorchvideo.data >>> from pytorchvideo.transforms import ( @@ -224,7 +224,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it 次に、データセット固有の変換とデータセットをそれぞれ定義します。トレーニングセットから始めます: -```py +```py >>> train_transform = Compose( ... [ ... ApplyTransformToKey( @@ -254,7 +254,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it 同じ一連のワークフローを検証セットと評価セットに適用できます。 -```py +```py >>> val_transform = Compose( ... [ ... ApplyTransformToKey( @@ -297,9 +297,9 @@ You should probably TRAIN this model on a down-stream task to be able to use it # (300, 30, 75) ``` -## Visualize the preprocessed video for better debugging +## Visualize the preprocessed video for better debugging -```py +```py >>> import imageio >>> import numpy as np >>> from IPython.display import Image @@ -312,7 +312,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it >>> def create_gif(video_tensor, filename="sample.gif"): ... """Prepares a GIF from a video tensor. -... +... ... The video tensor is expected to have the following shape: ... (num_frames, num_channels, height, width). ... """ @@ -339,13 +339,13 @@ You should probably TRAIN this model on a down-stream task to be able to use it Person playing basketball -## Train the model +## Train the model 🤗 Transformers の [`Trainer`](https://huggingface.co/docs/transformers/main_classes/trainer) をモデルのトレーニングに利用します。 `Trainer`をインスタンス化するには、トレーニング構成と評価メトリクスを定義する必要があります。最も重要なのは [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments) で、これはトレーニングを構成するためのすべての属性を含むクラスです。モデルのチェックポイントを保存するために使用される出力フォルダー名が必要です。また、🤗 Hub 上のモデル リポジトリ内のすべての情報を同期するのにも役立ちます。 トレーニング引数のほとんどは一目瞭然ですが、ここで非常に重要なのは`remove_unused_columns=False`です。これにより、モデルの呼び出し関数で使用されない機能が削除されます。デフォルトでは`True`です。これは、通常、未使用の特徴列を削除し、モデルの呼び出し関数への入力を解凍しやすくすることが理想的であるためです。ただし、この場合、`pixel_values` (モデルが入力で期待する必須キーです) を作成するには、未使用の機能 (特に`video`) が必要です。 -```py +```py >>> from transformers import TrainingArguments, Trainer >>> model_name = model_ckpt.split("/")[-1] @@ -391,7 +391,7 @@ def compute_metrics(eval_pred): また、サンプルをまとめてバッチ処理するために使用される `collat​​e_fn` を定義します。各バッチは、`pixel_values` と `labels` という 2 つのキーで構成されます。 -```py +```py >>> def collate_fn(examples): ... 
# permute to (num_frames, num_channels, height, width) ... pixel_values = torch.stack( @@ -403,13 +403,13 @@ def compute_metrics(eval_pred): 次に、これらすべてをデータセットとともに`Trainer`に渡すだけです。 -```py +```py >>> trainer = Trainer( ... model, ... args, ... train_dataset=train_dataset, ... eval_dataset=val_dataset, -... tokenizer=image_processor, +... processing_class=image_processor, ... compute_metrics=compute_metrics, ... data_collator=collate_fn, ... ) @@ -419,7 +419,7 @@ def compute_metrics(eval_pred): 次に、`train` メソッドを呼び出してモデルを微調整します。 -```py +```py >>> train_results = trainer.train() ``` @@ -435,7 +435,7 @@ def compute_metrics(eval_pred): 推論のためにビデオをロードします。 -```py +```py >>> sample_test_video = next(iter(test_dataset)) ``` @@ -491,7 +491,7 @@ def compute_metrics(eval_pred): `logits` をデコードすると、次のようになります。 -```py +```py >>> predicted_class_idx = logits.argmax(-1).item() >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) # Predicted class: BasketballDunk diff --git a/docs/source/ja/tasks/visual_question_answering.md b/docs/source/ja/tasks/visual_question_answering.md index f6c2989693708b..3231cba5e3af71 100644 --- a/docs/source/ja/tasks/visual_question_answering.md +++ b/docs/source/ja/tasks/visual_question_answering.md @@ -110,7 +110,7 @@ Dataset({ 残りの機能は必要ないので削除できます。 -```py +```py >>> dataset = dataset.remove_columns(['question_type', 'question_id', 'answer_type']) ``` @@ -150,7 +150,7 @@ Dataset({ >>> unique_labels = list(set(flattened_labels)) >>> label2id = {label: idx for idx, label in enumerate(unique_labels)} ->>> id2label = {idx: label for label, idx in label2id.items()} +>>> id2label = {idx: label for label, idx in label2id.items()} ``` マッピングができたので、文字列の回答をその ID に置き換え、さらに前処理をより便利にするためにデータセットをフラット化することができます。 @@ -175,7 +175,7 @@ Dataset({ 次のステップでは、ViLT プロセッサをロードして、モデルの画像データとテキスト データを準備します。 [`ViltProcessor`] は、BERT トークナイザーと ViLT 画像プロセッサを便利な単一プロセッサにラップします。 -```py +```py >>> from transformers import ViltProcessor >>> processor = ViltProcessor.from_pretrained(model_checkpoint) @@ -197,13 +197,13 @@ Dataset({ >>> def preprocess_data(examples): ... image_paths = examples['image_id'] ... images = [Image.open(image_path) for image_path in image_paths] -... texts = examples['question'] +... texts = examples['question'] ... encoding = processor(images, texts, padding="max_length", truncation=True, return_tensors="pt") ... for k, v in encoding.items(): ... encoding[k] = v.squeeze() - + ... targets = [] ... for labels, scores in zip(examples['label.ids'], examples['label.weights']): @@ -211,11 +211,11 @@ Dataset({ ... for label, score in zip(labels, scores): ... target[label] = score - + ... targets.append(target) ... encoding["labels"] = targets - + ... return encoding ``` @@ -284,14 +284,14 @@ Dataset({ ... args=training_args, ... data_collator=data_collator, ... train_dataset=processed_dataset, -... tokenizer=processor, +... processing_class=processor, ... ) ``` 3. 
[`~Trainer.train`] を呼び出してモデルを微調整します。 ```py ->>> trainer.train() +>>> trainer.train() ``` トレーニングが完了したら、 [`~Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、🤗 ハブで最終モデルを共有します。 @@ -376,7 +376,7 @@ GPU (利用可能な場合)。これは [`Trainer`] が自動的に処理する モデルは画像とテキストを入力として受け取るため、VQA データセットの最初の例とまったく同じ画像と質問のペアを使用してみましょう。 -```py +```py >>> example = dataset[0] >>> image = Image.open(example['image_id']) >>> question = example['question'] @@ -386,7 +386,7 @@ GPU (利用可能な場合)。これは [`Trainer`] が自動的に処理する ```py ->>> prompt = f"Question: {question} Answer:" +>>> prompt = f"Question: {question} Answer:" ``` 次に、モデルのプロセッサで画像/プロンプトを前処理し、処理された入力をモデルに渡し、出力をデコードする必要があります。 @@ -397,7 +397,7 @@ GPU (利用可能な場合)。これは [`Trainer`] が自動的に処理する >>> generated_ids = model.generate(**inputs, max_new_tokens=10) >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() >>> print(generated_text) -"He is looking at the crowd" +"He is looking at the crowd" ``` ご覧のとおり、モデルは群衆と顔の向き (下を向いている) を認識しましたが、見逃しているようです。 diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index eafd389994ad52..43ff0017c8f842 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -135,8 +135,10 @@ title: 커뮤니티 리소스 - local: troubleshooting title: 문제 해결 - - local: in_translation - title: (번역중) Interoperability with GGUF files + - local: gguf + title: GGUF 파일들과의 상호 운용성 + - local: modular_transformers + title: transformers에서의 모듈성 title: (번역중) 개발자 가이드 - sections: - local: in_translation @@ -270,44 +272,44 @@ - sections: - local: main_classes/agent title: 에이전트와 도구 - - local: in_translation - title: (번역중) Auto Classes + - local: model_doc/auto + title: 자동 클래스 - local: in_translation title: (번역중) Backbones - - local: in_translation - title: (번역중) Callbacks - - local: in_translation - title: (번역중) Configuration - - local: in_translation - title: (번역중) Data Collator - - local: in_translation - title: (번역중) Keras callbacks - - local: in_translation - title: (번역중) Logging - - local: in_translation - title: (번역중) Models - - local: in_translation - title: (번역중) Text Generation - - local: in_translation - title: (번역중) ONNX + - local: main_classes/callback + title: 콜백 + - local: main_classes/configuration + title: 구성 + - local: main_classes/data_collator + title: 데이터 콜레이터 + - local: main_classes/keras_callbacks + title: 케라스 콜백 + - local: main_classes/logging + title: 로깅 + - local: main_classes/model + title: Models + - local: main_classes/text_generation + title: 텍스트 생성 + - local: main_classes/onnx + title: ONNX - local: in_translation title: (번역중) Optimization - local: in_translation - title: (번역중) Model outputs - - local: in_translation + title: 모델 출력 + - local: main_classes/output title: (번역중) Pipelines - local: in_translation title: (번역중) Processors - - local: in_translation - title: (번역중) Quantization + - local: main_classes/quantization + title: 양자화 - local: in_translation title: (번역중) Tokenizer - - local: in_translation - title: (번역중) Trainer + - local: main_classes/trainer + title: Trainer - local: deepspeed title: DeepSpeed - - local: in_translation - title: (번역중) Feature Extractor + - local: main_classes/feature_extractor + title: 특성 추출기 - local: in_translation title: (번역중) Image Processor title: (번역중) 메인 클래스 @@ -316,8 +318,8 @@ sections: - local: in_translation title: (번역중) ALBERT - - local: in_translation - title: (번역중) BART + - local: model_doc/bart + title: BART - local: in_translation title: (번역중) BARThez - local: in_translation @@ -328,14 +330,14 @@ title: (번역중) BertGeneration - local: in_translation title: 
(번역중) BertJapanese - - local: in_translation - title: (번역중) Bertweet + - local: model_doc/bertweet + title: Bertweet - local: in_translation title: (번역중) BigBird - local: in_translation title: (번역중) BigBirdPegasus - - local: in_translation - title: (번역중) BioGpt + - local: model_doc/biogpt + title: BioGpt - local: in_translation title: (번역중) Blenderbot - local: in_translation @@ -352,6 +354,8 @@ title: (번역중) CANINE - local: in_translation title: (번역중) CodeGen + - local: model_doc/cohere + title: Cohere - local: in_translation title: (번역중) ConvBERT - local: in_translation @@ -360,10 +364,12 @@ title: (번역중) CPMANT - local: in_translation title: (번역중) CTRL - - local: in_translation - title: (번역중) DeBERTa - - local: in_translation - title: (번역중) DeBERTa-v2 + - local: model_doc/dbrx + title: DBRX + - local: model_doc/deberta + title: DeBERTa + - local: model_doc/deberta-v2 + title: DeBERTa-v2 - local: in_translation title: (번역중) DialoGPT - local: in_translation @@ -378,8 +384,8 @@ title: (번역중) ERNIE - local: in_translation title: (번역중) ErnieM - - local: in_translation - title: (번역중) ESM + - local: model_doc/esm + title: ESM - local: in_translation title: (번역중) FLAN-T5 - local: in_translation @@ -392,14 +398,16 @@ title: (번역중) FSMT - local: in_translation title: (번역중) Funnel Transformer - - local: in_translation - title: (번역중) GPT + - local: model_doc/gemma + title: Gemma + - local: model_doc/openai-gpt + title: GPT - local: in_translation title: (번역중) GPT Neo - local: in_translation title: (번역중) GPT NeoX - - local: in_translation - title: (번역중) GPT NeoX Japanese + - local: model_doc/gpt_neox_japanese + title: GPT NeoX Japanese - local: in_translation title: (번역중) GPT-J - local: in_translation @@ -422,6 +430,8 @@ title: LLaMA - local: model_doc/llama2 title: LLaMA2 + - local: model_doc/llama3 + title: LLaMA3 - local: in_translation title: (번역중) Longformer - local: in_translation @@ -430,6 +440,10 @@ title: (번역중) LUKE - local: in_translation title: (번역중) M2M100 + - local: model_doc/mamba + title: Mamba + - local: model_doc/mamba2 + title: Mamba2 - local: in_translation title: (번역중) MarianMT - local: in_translation @@ -442,6 +456,8 @@ title: (번역중) MegatronBERT - local: in_translation title: (번역중) MegatronGPT2 + - local: model_doc/mistral + title: Mistral - local: in_translation title: (번역중) mLUKE - local: in_translation @@ -476,8 +492,8 @@ title: (번역중) ProphetNet - local: in_translation title: (번역중) QDQBert - - local: in_translation - title: (번역중) RAG + - local: model_doc/rag + title: RAG(검색 증강 생성) - local: in_translation title: (번역중) REALM - local: in_translation @@ -589,12 +605,12 @@ title: (번역중) ResNet - local: in_translation title: (번역중) SegFormer - - local: in_translation - title: (번역중) Swin Transformer - - local: in_translation - title: (번역중) Swin Transformer V2 - - local: in_translation - title: (번역중) Swin2SR + - local: model_doc/swin + title: Swin Transformer + - local: model_doc/swinv2 + title: Swin Transformer V2 + - local: model_doc/swin2sr + title: Swin2SR - local: in_translation title: (번역중) Table Transformer - local: in_translation @@ -606,8 +622,8 @@ - local: in_translation title: (번역중) VideoMAE - local: in_translation - title: (번역중) Vision Transformer (ViT) - - local: in_translation + title: Vision Transformer (ViT) + - local: model_doc/vit title: (번역중) ViT Hybrid - local: in_translation title: (번역중) ViTMAE @@ -661,16 +677,18 @@ title: (번역중) ALIGN - local: in_translation title: (번역중) AltCLIP - - local: in_translation - title: (번역중) BLIP + - local: model_doc/blip + title: BLIP - 
local: in_translation title: (번역중) BLIP-2 - local: in_translation title: (번역중) BridgeTower + - local: model_doc/chameleon + title: Chameleon - local: in_translation title: (번역중) Chinese-CLIP - - local: in_translation - title: (번역중) CLIP + - local: model_doc/clip + title: CLIP - local: in_translation title: (번역중) CLIPSeg - local: in_translation @@ -705,6 +723,8 @@ title: (번역중) OneFormer - local: in_translation title: (번역중) OWL-ViT + - local: model_doc/paligemma + title: PaliGemma - local: in_translation title: (번역중) Perceiver - local: in_translation @@ -734,40 +754,46 @@ sections: - local: in_translation title: (번역중) Decision Transformer - - local: in_translation - title: (번역중) Trajectory Transformer + - local: model_doc/trajectory_transformer + title: 궤적 트랜스포머 title: (번역중) 강화학습 모델 - isExpanded: false sections: - - local: in_translation - title: (번역중) Informer - - local: in_translation - title: (번역중) Time Series Transformer - title: (번역중) 시계열 모델 + - local: model_doc/autoformer + title: Autoformer + - local: model_doc/informer + title: Informer + - local: model_doc/patchtsmixer + title: PatchTSMixer + - local: model_doc/patchtst + title: PatchTST + - local: model_doc/time_series_transformer + title: 시계열 트랜스포머 + title: 시계열 모델 - isExpanded: false sections: - - local: in_translation - title: (번역중) Graphormer - title: (번역중) Graph models + - local: model_doc/graphormer + title: Graphormer + title: 그래프 모델 title: (번역중) 모델 - sections: - - local: in_translation - title: (번역중) Custom Layers and Utilities - - local: in_translation - title: (번역중) Utilities for pipelines - - local: in_translation - title: (번역중) Utilities for Tokenizers + - local: internal/modeling_utils + title: 사용자 정의 레이어 및 유틸리티 + - local: internal/pipelines_utils + title: 파이프라인을 위한 유틸리티 + - local: internal/tokenization_utils + title: 토크나이저를 위한 유틸리티 - local: in_translation title: (번역중) Utilities for Trainer - - local: in_translation - title: (번역중) Utilities for Generation - - local: in_translation - title: (번역중) Utilities for Image Processors - - local: in_translation - title: (번역중) Utilities for Audio processing - - local: in_translation - title: (번역중) General Utilities - - local: in_translation - title: (번역중) Utilities for Time Series + - local: internal/generation_utils + title: 생성을 위한 유틸리티 + - local: internal/image_processing_utils + title: 이미지 프로세서를 위한 유틸리티 + - local: internal/audio_utils + title: 오디오 처리를 위한 유틸리티 + - local: internal/file_utils + title: 일반 유틸리티 + - local: internal/time_series_utils + title: 시계열을 위한 유틸리티 title: (번역중) Internal Helpers - title: (번역중) API \ No newline at end of file + title: (번역중) API diff --git a/docs/source/ko/gguf.md b/docs/source/ko/gguf.md new file mode 100644 index 00000000000000..03bd7c08bb2691 --- /dev/null +++ b/docs/source/ko/gguf.md @@ -0,0 +1,100 @@ + + +# GGUF와 Transformers의 상호작용 [[gguf-and-interaction-with-transformers]] + +GGUF 파일 형식은 [GGML](https://github.com/ggerganov/ggml)과 그에 의존하는 다른 라이브러리, 예를 들어 매우 인기 있는 [llama.cpp](https://github.com/ggerganov/llama.cpp)이나 [whisper.cpp](https://github.com/ggerganov/whisper.cpp)에서 추론을 위한 모델을 저장하는데 사용됩니다. + +이 파일 형식은 [Hugging Face Hub](https://huggingface.co/docs/hub/en/gguf)에서 지원되며, 파일 내의 텐서와 메타데이터를 신속하게 검사할 수 있는 기능을 제공합니다. + +이 형식은 "단일 파일 형식(single-file-format)"으로 설계되었으며, 하나의 파일에 설정 속성, 토크나이저 어휘, 기타 속성뿐만 아니라 모델에서 로드되는 모든 텐서가 포함됩니다. 이 파일들은 파일의 양자화 유형에 따라 다른 형식으로 제공됩니다. 다양한 양자화 유형에 대한 간략한 설명은 [여기](https://huggingface.co/docs/hub/en/gguf#quantization-types)에서 확인할 수 있습니다. 
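허브에서 제공되는 GGUF 파일은 다른 허브 파일과 마찬가지로 로컬로 내려받아 직접 살펴볼 수 있습니다. 아래는 `huggingface_hub`의 `hf_hub_download`를 사용하는 간단한 스케치이며, 저장소 이름과 파일 이름은 이 문서 뒷부분의 사용 예시와 같은 값을 가정한 것입니다.

```python
from huggingface_hub import hf_hub_download

# 저장소와 파일 이름은 설명을 위한 예시 값입니다.
gguf_path = hf_hub_download(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q6_K.gguf",
)
print(gguf_path)  # 로컬 캐시에 저장된 GGUF 파일의 경로
```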
+ +## Transformers 내 지원 [[support-within-transformers]] + +`transformers` 내에서 `gguf` 파일을 로드할 수 있는 기능을 추가하여 GGUF 모델의 추가 학습/미세 조정을 제공한 후 `ggml` 생태계에서 다시 사용할 수 있도록 `gguf` 파일로 변환하는 기능을 제공합니다. 모델을 로드할 때 먼저 FP32로 역양자화한 후, PyTorch에서 사용할 수 있도록 가중치를 로드합니다. + +> [!NOTE] +> 지원은 아직 초기 단계에 있으며, 다양한 양자화 유형과 모델 아키텍처에 대해 이를 강화하기 위한 기여를 환영합니다. + +현재 지원되는 모델 아키텍처와 양자화 유형은 다음과 같습니다: + +### 지원되는 양자화 유형 [[supported-quantization-types]] + +초기에 지원되는 양자화 유형은 Hub에서 공유된 인기 있는 양자화 파일에 따라 결정되었습니다. + +- F32 +- F16 +- BF16 +- Q4_0 +- Q4_1 +- Q5_0 +- Q5_1 +- Q8_0 +- Q2_K +- Q3_K +- Q4_K +- Q5_K +- Q6_K +- IQ1_S +- IQ1_M +- IQ2_XXS +- IQ2_XS +- IQ2_S +- IQ3_XXS +- IQ3_S +- IQ4_XS +- IQ4_NL + +> [!NOTE] +> GGUF 역양자화를 지원하려면 `gguf>=0.10.0` 설치가 필요합니다. + +### 지원되는 모델 아키텍처 [[supported-model-architectures]] + +현재 지원되는 모델 아키텍처는 Hub에서 매우 인기가 많은 아키텍처들로 제한되어 있습니다: + +- LLaMa +- Mistral +- Qwen2 +- Qwen2Moe +- Phi3 +- Bloom + +## 사용 예시 [[example-usage]] + +`transformers`에서 `gguf` 파일을 로드하려면 `from_pretrained` 메소드에 `gguf_file` 인수를 지정해야 합니다. 동일한 파일에서 토크나이저와 모델을 로드하는 방법은 다음과 같습니다: + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" +filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf" + +tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename) +model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename) +``` + +이제 PyTorch 생태계에서 모델의 양자화되지 않은 전체 버전에 접근할 수 있으며, 다른 여러 도구들과 결합하여 사용할 수 있습니다. + +`gguf` 파일로 다시 변환하려면 llama.cpp의 [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py)를 사용하는 것을 권장합니다. + +위의 스크립트를 완료하여 모델을 저장하고 다시 `gguf`로 내보내는 방법은 다음과 같습니다: + +```python +tokenizer.save_pretrained('directory') +model.save_pretrained('directory') + +!python ${path_to_llama_cpp}/convert-hf-to-gguf.py ${directory} +``` diff --git a/docs/source/ko/hpo_train.md b/docs/source/ko/hpo_train.md index 58bacd55ff75e1..c0982db5e093b5 100644 --- a/docs/source/ko/hpo_train.md +++ b/docs/source/ko/hpo_train.md @@ -24,7 +24,7 @@ rendered properly in your Markdown viewer. 하이퍼파라미터 탐색 백엔드로 사용하기 전에 아래의 명령어를 사용하여 라이브러리들을 설치하세요. ```bash -pip install optuna/sigopt/wandb/ray[tune] +pip install optuna/sigopt/wandb/ray[tune] ``` ## 예제에서 하이퍼파라미터 탐색을 활성화하는 방법 [[how-to-enable-hyperparameter-search-in-example]] @@ -100,7 +100,7 @@ wandb의 경우, 해당 [object_parameter](https://docs.wandb.ai/guides/sweeps/c ... train_dataset=small_train_dataset, ... eval_dataset=small_eval_dataset, ... compute_metrics=compute_metrics, -... tokenizer=tokenizer, +... processing_class=tokenizer, ... model_init=model_init, ... data_collator=data_collator, ... ) diff --git a/docs/source/ko/internal/audio_utils.md b/docs/source/ko/internal/audio_utils.md new file mode 100644 index 00000000000000..811f7c0866bd50 --- /dev/null +++ b/docs/source/ko/internal/audio_utils.md @@ -0,0 +1,39 @@ + + +# `FeatureExtractors`를 위한 유틸리티 [[utilities-for-featureextractors]] + +이 페이지는 오디오 [`FeatureExtractor`]가 *단시간 푸리에 변환(Short Time Fourier Transform)* 또는 *로그 멜 스펙트로그램(log mel spectrogram)*과 같은 일반적인 알고리즘을 사용하여 원시 오디오에서 특수한 특성을 계산하는 데 사용할 수 있는 유틸리티 함수들을 나열합니다. + +이 함수들 대부분은 라이브러리 내 오디오 처리 코드를 연구할 때에만 유용합니다. 
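아래는 이 모듈의 변환 함수를 직접 호출해 보는 최소한의 스케치입니다. 기본 인자만 사용한다고 가정하며, 정확한 시그니처는 아래 API 문서를 기준으로 확인하세요.

```python
from transformers.audio_utils import hertz_to_mel, mel_to_hertz

# 4000Hz를 멜(mel) 스케일 값으로 변환한 뒤 다시 Hz로 되돌립니다.
mel_value = hertz_to_mel(4000.0)
hertz_value = mel_to_hertz(mel_value)
print(mel_value, hertz_value)  # 왕복 변환이므로 hertz_value는 약 4000.0이 됩니다.
```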
+ +## 오디오 변환 [[transformers.audio_utils.hertz_to_mel]] + +[[autodoc]] audio_utils.hertz_to_mel + +[[autodoc]] audio_utils.mel_to_hertz + +[[autodoc]] audio_utils.mel_filter_bank + +[[autodoc]] audio_utils.optimal_fft_length + +[[autodoc]] audio_utils.window_function + +[[autodoc]] audio_utils.spectrogram + +[[autodoc]] audio_utils.power_to_db + +[[autodoc]] audio_utils.amplitude_to_db diff --git a/docs/source/ko/internal/file_utils.md b/docs/source/ko/internal/file_utils.md new file mode 100644 index 00000000000000..7effad44baa170 --- /dev/null +++ b/docs/source/ko/internal/file_utils.md @@ -0,0 +1,50 @@ + + +# 일반 유틸리티 (General Utilities) [[general-utilities]] + +이 페이지는 `utils.py` 파일에 있는 Transformers의 일반 유틸리티 함수들을 나열합니다. + +이 함수들 대부분은 라이브러리의 일반적인 코드를 연구할 때만 유용합니다. + + +## Enums 및 namedtuples [[transformers.utils.ExplicitEnum]] + +[[autodoc]] utils.ExplicitEnum + +[[autodoc]] utils.PaddingStrategy + +[[autodoc]] utils.TensorType + +## 특수 데코레이터 (Special Decorators) [[transformers.add_start_docstrings]] + +[[autodoc]] utils.add_start_docstrings + +[[autodoc]] utils.add_start_docstrings_to_model_forward + +[[autodoc]] utils.add_end_docstrings + +[[autodoc]] utils.add_code_sample_docstrings + +[[autodoc]] utils.replace_return_docstrings + +## 특수 속성 (Special Properties) [[transformers.utils.cached_property]] + +[[autodoc]] utils.cached_property + +## 기타 유틸리티 [[transformers.utils._LazyModule]] + +[[autodoc]] utils._LazyModule diff --git a/docs/source/ko/internal/generation_utils.md b/docs/source/ko/internal/generation_utils.md new file mode 100644 index 00000000000000..a66411f71e9ee8 --- /dev/null +++ b/docs/source/ko/internal/generation_utils.md @@ -0,0 +1,413 @@ + + +# 생성을 위한 유틸리티 [[utilities-for-generation]] + +이 페이지는 [`~generation.GenerationMixin.generate`]에서 사용되는 모든 유틸리티 함수들을 나열합니다. + +## 출력을 생성하기 (Generate Outputs) [[generate-outputs]] + +[`~generation.GenerationMixin.generate`]의 출력은 [`~utils.ModelOutput`]의 하위 클래스의 인스턴스입니다. 이 출력은 [`~generation.GenerationMixin.generate`]에서 반환되는 모든 정보를 포함하는 데이터 구조체이며, 튜플 또는 딕셔너리로도 사용할 수 있습니다. + +다음은 예시입니다: + +```python +from transformers import GPT2Tokenizer, GPT2LMHeadModel + +tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") +model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") + +inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt") +generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True) +``` + +`generation_output` 객체는 [`~generation.GenerateDecoderOnlyOutput`]입니다. 아래 문서에서 확인할 수 있듯이, 이 클래스는 다음과 같은 속성을 가지고 있습니다: + +- `sequences`: 생성된 토큰 시퀀스 +- `scores` (옵션): 각 생성 단계에서 언어 모델링 헤드의 예측 점수 +- `hidden_states` (옵션): 각 생성 단계에서 모델의 은닉 상태 +- `attentions` (옵션): 각 생성 단계에서 모델의 어텐션 가중치 + +`output_scores=True`를 전달했기 때문에 `scores`는 포함되어 있지만, `output_hidden_states=True` 또는 `output_attentions=True`를 전달하지 않았으므로 `hidden_states`와 `attentions`는 포함되지 않았습니다. + +각 속성은 일반적으로 접근할 수 있으며, 모델이 해당 속성을 반환하지 않았다면 `None`이 반환됩니다. 예를 들어, `generation_output.scores`는 언어 모델링 헤드에서 생성된 모든 예측 점수를 포함하고 있으며, `generation_output.attentions`는 `None`입니다. + +`generation_output` 객체를 튜플로 사용할 경우, `None` 값이 아닌 속성만 포함됩니다. 예를 들어, `loss`와 `logits`라는 두 요소가 포함된 경우: + +```python +generation_output[:2] +``` + +위 코드는 `(generation_output.sequences, generation_output.scores)` 튜플을 반환합니다. + +`generation_output` 객체를 딕셔너리로 사용할 경우, `None` 값이 아닌 속성만 포함됩니다. 예를 들어, `sequences`와 `scores`라는 두 개의 키를 가질 수 있습니다. + +여기서는 모든 출력 유형을 문서화합니다. 
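아래는 위 예제의 `generation_output` 객체에서 각 속성에 접근하는 모습을 보여주는 간단한 스케치입니다.

```python
# 위 예제에서 얻은 `generation_output`을 계속 사용한다고 가정합니다.
print(generation_output.sequences.shape)     # 생성된 토큰 ID 시퀀스
print(len(generation_output.scores))         # 생성 단계 수만큼의 점수 튜플
print(generation_output.hidden_states)       # output_hidden_states=True를 전달하지 않았으므로 None
print(generation_output["sequences"].shape)  # 딕셔너리 방식의 접근도 가능합니다
```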
+ + +### PyTorch [[transformers.generation.GenerateDecoderOnlyOutput]] + +[[autodoc]] generation.GenerateDecoderOnlyOutput + +[[autodoc]] generation.GenerateEncoderDecoderOutput + +[[autodoc]] generation.GenerateBeamDecoderOnlyOutput + +[[autodoc]] generation.GenerateBeamEncoderDecoderOutput + +### TensorFlow [[transformers.generation.TFGreedySearchEncoderDecoderOutput]] + +[[autodoc]] generation.TFGreedySearchEncoderDecoderOutput + +[[autodoc]] generation.TFGreedySearchDecoderOnlyOutput + +[[autodoc]] generation.TFSampleEncoderDecoderOutput + +[[autodoc]] generation.TFSampleDecoderOnlyOutput + +[[autodoc]] generation.TFBeamSearchEncoderDecoderOutput + +[[autodoc]] generation.TFBeamSearchDecoderOnlyOutput + +[[autodoc]] generation.TFBeamSampleEncoderDecoderOutput + +[[autodoc]] generation.TFBeamSampleDecoderOnlyOutput + +[[autodoc]] generation.TFContrastiveSearchEncoderDecoderOutput + +[[autodoc]] generation.TFContrastiveSearchDecoderOnlyOutput + +### FLAX [[transformers.generation.FlaxSampleOutput]] + +[[autodoc]] generation.FlaxSampleOutput + +[[autodoc]] generation.FlaxGreedySearchOutput + +[[autodoc]] generation.FlaxBeamSearchOutput + +## LogitsProcessor [[logitsprocessor]] + +[`LogitsProcessor`]는 생성 중 언어 모델 헤드의 예측 점수를 수정하는 데 사용됩니다. + +### PyTorch [[transformers.AlternatingCodebooksLogitsProcessor]] + +[[autodoc]] AlternatingCodebooksLogitsProcessor + - __call__ + +[[autodoc]] ClassifierFreeGuidanceLogitsProcessor + - __call__ + +[[autodoc]] EncoderNoRepeatNGramLogitsProcessor + - __call__ + +[[autodoc]] EncoderRepetitionPenaltyLogitsProcessor + - __call__ + +[[autodoc]] EpsilonLogitsWarper + - __call__ + +[[autodoc]] EtaLogitsWarper + - __call__ + +[[autodoc]] ExponentialDecayLengthPenalty + - __call__ + +[[autodoc]] ForcedBOSTokenLogitsProcessor + - __call__ + +[[autodoc]] ForcedEOSTokenLogitsProcessor + - __call__ + +[[autodoc]] HammingDiversityLogitsProcessor + - __call__ + +[[autodoc]] InfNanRemoveLogitsProcessor + - __call__ + +[[autodoc]] LogitNormalization + - __call__ + +[[autodoc]] LogitsProcessor + - __call__ + +[[autodoc]] LogitsProcessorList + - __call__ + +[[autodoc]] MinLengthLogitsProcessor + - __call__ + +[[autodoc]] MinNewTokensLengthLogitsProcessor + - __call__ + +[[autodoc]] MinPLogitsWarper + - __call__ + +[[autodoc]] NoBadWordsLogitsProcessor + - __call__ + +[[autodoc]] NoRepeatNGramLogitsProcessor + - __call__ + +[[autodoc]] PrefixConstrainedLogitsProcessor + - __call__ + +[[autodoc]] RepetitionPenaltyLogitsProcessor + - __call__ + +[[autodoc]] SequenceBiasLogitsProcessor + - __call__ + +[[autodoc]] SuppressTokensAtBeginLogitsProcessor + - __call__ + +[[autodoc]] SuppressTokensLogitsProcessor + - __call__ + +[[autodoc]] TemperatureLogitsWarper + - __call__ + +[[autodoc]] TopKLogitsWarper + - __call__ + +[[autodoc]] TopPLogitsWarper + - __call__ + +[[autodoc]] TypicalLogitsWarper + - __call__ + +[[autodoc]] UnbatchedClassifierFreeGuidanceLogitsProcessor + - __call__ + +[[autodoc]] WhisperTimeStampLogitsProcessor + - __call__ + +[[autodoc]] WatermarkLogitsProcessor + - __call__ + + +### TensorFlow [[transformers.TFForcedBOSTokenLogitsProcessor]] + +[[autodoc]] TFForcedBOSTokenLogitsProcessor + - __call__ + +[[autodoc]] TFForcedEOSTokenLogitsProcessor + - __call__ + +[[autodoc]] TFForceTokensLogitsProcessor + - __call__ + +[[autodoc]] TFLogitsProcessor + - __call__ + +[[autodoc]] TFLogitsProcessorList + - __call__ + +[[autodoc]] TFLogitsWarper + - __call__ + +[[autodoc]] TFMinLengthLogitsProcessor + - __call__ + +[[autodoc]] TFNoBadWordsLogitsProcessor + - __call__ 
+ +[[autodoc]] TFNoRepeatNGramLogitsProcessor + - __call__ + +[[autodoc]] TFRepetitionPenaltyLogitsProcessor + - __call__ + +[[autodoc]] TFSuppressTokensAtBeginLogitsProcessor + - __call__ + +[[autodoc]] TFSuppressTokensLogitsProcessor + - __call__ + +[[autodoc]] TFTemperatureLogitsWarper + - __call__ + +[[autodoc]] TFTopKLogitsWarper + - __call__ + +[[autodoc]] TFTopPLogitsWarper + - __call__ + +### FLAX [[transformers.FlaxForcedBOSTokenLogitsProcessor]] + +[[autodoc]] FlaxForcedBOSTokenLogitsProcessor + - __call__ + +[[autodoc]] FlaxForcedEOSTokenLogitsProcessor + - __call__ + +[[autodoc]] FlaxForceTokensLogitsProcessor + - __call__ + +[[autodoc]] FlaxLogitsProcessor + - __call__ + +[[autodoc]] FlaxLogitsProcessorList + - __call__ + +[[autodoc]] FlaxLogitsWarper + - __call__ + +[[autodoc]] FlaxMinLengthLogitsProcessor + - __call__ + +[[autodoc]] FlaxSuppressTokensAtBeginLogitsProcessor + - __call__ + +[[autodoc]] FlaxSuppressTokensLogitsProcessor + - __call__ + +[[autodoc]] FlaxTemperatureLogitsWarper + - __call__ + +[[autodoc]] FlaxTopKLogitsWarper + - __call__ + +[[autodoc]] FlaxTopPLogitsWarper + - __call__ + +[[autodoc]] FlaxWhisperTimeStampLogitsProcessor + - __call__ + +## StoppingCriteria [[transformers.StoppingCriteria]] + +[`StoppingCriteria`]는 생성이 언제 멈출지를 결정하는 데 사용됩니다 (EOS 토큰 외). 이 기능은 PyTorch 구현에만 제공됩니다. + +[[autodoc]] StoppingCriteria + - __call__ + +[[autodoc]] StoppingCriteriaList + - __call__ + +[[autodoc]] MaxLengthCriteria + - __call__ + +[[autodoc]] MaxTimeCriteria + - __call__ + +[[autodoc]] StopStringCriteria + - __call__ + +[[autodoc]] EosTokenCriteria + - __call__ + +## Constraint [[transformers.Constraint]] + +[`Constraint`]는 생성 출력에 특정 토큰이나 시퀀스를 강제로 포함시키는 데 사용됩니다. 이 기능은 PyTorch 구현에만 제공됩니다. + +[[autodoc]] Constraint + +[[autodoc]] PhrasalConstraint + +[[autodoc]] DisjunctiveConstraint + +[[autodoc]] ConstraintListState + +## 빔 검색 (BeamSearch) [[transformers.BeamScorer]] + +[[autodoc]] BeamScorer + - process + - finalize + +[[autodoc]] BeamSearchScorer + - process + - finalize + +[[autodoc]] ConstrainedBeamSearchScorer + - process + - finalize + +## 스트리머 (Streamers) [[transformers.TextStreamer]] + +[[autodoc]] TextStreamer + +[[autodoc]] TextIteratorStreamer + +## 캐시 (Caches) [[transformers.Cache]] + +[[autodoc]] Cache + - update + +[[autodoc]] CacheConfig + - update + +[[autodoc]] QuantizedCacheConfig + - validate + +[[autodoc]] DynamicCache + - update + - get_seq_length + - reorder_cache + - to_legacy_cache + - from_legacy_cache + +[[autodoc]] QuantizedCache + - update + - get_seq_length + +[[autodoc]] QuantoQuantizedCache + +[[autodoc]] HQQQuantizedCache + +[[autodoc]] SinkCache + - update + - get_seq_length + - reorder_cache + +[[autodoc]] OffloadedCache + - update + - prefetch_layer + - evict_previous_layer + +[[autodoc]] StaticCache + - update + - get_seq_length + - reset + +[[autodoc]] OffloadedStaticCache + - update + - get_seq_length + - reset + +[[autodoc]] HybridCache + - update + - get_seq_length + - reset + +[[autodoc]] SlidingWindowCache + - update + - reset + +[[autodoc]] EncoderDecoderCache + - get_seq_length + - to_legacy_cache + - from_legacy_cache + - reset + - reorder_cache + +[[autodoc]] MambaCache + - update_conv_state + - update_ssm_state + - reset + +## 워터마크 유틸리티 (Watermark Utils) [[transformers.WatermarkDetector]] + +[[autodoc]] WatermarkDetector + - __call__ diff --git a/docs/source/ko/internal/image_processing_utils.md b/docs/source/ko/internal/image_processing_utils.md new file mode 100644 index 00000000000000..cd32935132af87 --- /dev/null 
+++ b/docs/source/ko/internal/image_processing_utils.md @@ -0,0 +1,47 @@ + + +# 이미지 프로세서를 위한 유틸리티 [[utilities-for-image-processors]] + +이 페이지는 이미지 프로세서에서 사용되는 유틸리티 함수들을 나열하며, 주로 이미지를 처리하기 위한 함수 기반의 변환 작업들을 다룹니다. + +이 함수들 대부분은 라이브러리의 이미지 프로세서 코드를 연구할 때만 유용합니다. + +## 이미지 변환 [[transformers.image_transforms.center_crop]] + +[[autodoc]] image_transforms.center_crop + +[[autodoc]] image_transforms.center_to_corners_format + +[[autodoc]] image_transforms.corners_to_center_format + +[[autodoc]] image_transforms.id_to_rgb + +[[autodoc]] image_transforms.normalize + +[[autodoc]] image_transforms.pad + +[[autodoc]] image_transforms.rgb_to_id + +[[autodoc]] image_transforms.rescale + +[[autodoc]] image_transforms.resize + +[[autodoc]] image_transforms.to_pil_image + +## ImageProcessingMixin [[transformers.ImageProcessingMixin]] + +[[autodoc]] image_processing_utils.ImageProcessingMixin diff --git a/docs/source/ko/internal/modeling_utils.md b/docs/source/ko/internal/modeling_utils.md new file mode 100644 index 00000000000000..76cc09db292acd --- /dev/null +++ b/docs/source/ko/internal/modeling_utils.md @@ -0,0 +1,83 @@ + + +# 사용자 정의 레이어 및 유틸리티 [[custom-layers-and-utilities]] + +이 페이지는 라이브러리에서 사용되는 사용자 정의 레이어와 모델링을 위한 유틸리티 함수들을 나열합니다. + +이 함수들 대부분은 라이브러리 내의 모델 코드를 연구할 때만 유용합니다. + + +## PyTorch 사용자 정의 모듈 [[transformers.Conv1D]] + +[[autodoc]] pytorch_utils.Conv1D + +[[autodoc]] modeling_utils.PoolerStartLogits + - forward + +[[autodoc]] modeling_utils.PoolerEndLogits + - forward + +[[autodoc]] modeling_utils.PoolerAnswerClass + - forward + +[[autodoc]] modeling_utils.SquadHeadOutput + +[[autodoc]] modeling_utils.SQuADHead + - forward + +[[autodoc]] modeling_utils.SequenceSummary + - forward + +## PyTorch 헬퍼(helper) 함수 [[transformers.apply_chunking_to_forward]] + +[[autodoc]] pytorch_utils.apply_chunking_to_forward + +[[autodoc]] pytorch_utils.find_pruneable_heads_and_indices + +[[autodoc]] pytorch_utils.prune_layer + +[[autodoc]] pytorch_utils.prune_conv1d_layer + +[[autodoc]] pytorch_utils.prune_linear_layer + +## TensorFlow 사용자 정의 레이어 [[transformers.modeling_tf_utils.TFConv1D]] + +[[autodoc]] modeling_tf_utils.TFConv1D + +[[autodoc]] modeling_tf_utils.TFSequenceSummary + +## TensorFlow 손실 함수 [[transformers.modeling_tf_utils.TFCausalLanguageModelingLoss]] + +[[autodoc]] modeling_tf_utils.TFCausalLanguageModelingLoss + +[[autodoc]] modeling_tf_utils.TFMaskedLanguageModelingLoss + +[[autodoc]] modeling_tf_utils.TFMultipleChoiceLoss + +[[autodoc]] modeling_tf_utils.TFQuestionAnsweringLoss + +[[autodoc]] modeling_tf_utils.TFSequenceClassificationLoss + +[[autodoc]] modeling_tf_utils.TFTokenClassificationLoss + +## TensorFlow 도우미 함수 [[transformers.modeling_tf_utils.get_initializer]] + +[[autodoc]] modeling_tf_utils.get_initializer + +[[autodoc]] modeling_tf_utils.keras_serializable + +[[autodoc]] modeling_tf_utils.shape_list diff --git a/docs/source/ko/internal/pipelines_utils.md b/docs/source/ko/internal/pipelines_utils.md new file mode 100644 index 00000000000000..e0e64913145541 --- /dev/null +++ b/docs/source/ko/internal/pipelines_utils.md @@ -0,0 +1,43 @@ + + +# 파이프라인을 위한 유틸리티 [[utilities-for-pipelines]] + +이 페이지는 라이브러리에서 파이프라인을 위해 제공하는 모든 유틸리티 함수들을 나열합니다. + +이 함수들 대부분은 라이브러리 내 모델의 코드를 연구할 때만 유용합니다. 
+ +## 인자 처리 [[transformers.pipelines.ArgumentHandler]] + +[[autodoc]] pipelines.ArgumentHandler + +[[autodoc]] pipelines.ZeroShotClassificationArgumentHandler + +[[autodoc]] pipelines.QuestionAnsweringArgumentHandler + +## 데이터 형식 [[transformers.PipelineDataFormat]] + +[[autodoc]] pipelines.PipelineDataFormat + +[[autodoc]] pipelines.CsvPipelineDataFormat + +[[autodoc]] pipelines.JsonPipelineDataFormat + +[[autodoc]] pipelines.PipedPipelineDataFormat + +## 유틸리티 [[transformers.pipelines.PipelineException]] + +[[autodoc]] pipelines.PipelineException diff --git a/docs/source/ko/internal/time_series_utils.md b/docs/source/ko/internal/time_series_utils.md new file mode 100644 index 00000000000000..5729924575b873 --- /dev/null +++ b/docs/source/ko/internal/time_series_utils.md @@ -0,0 +1,29 @@ + + +# 시계열 유틸리티 [[time-series-utilities]] + +이 페이지는 시계열 기반 모델에서 사용할 수 있는 유틸리티 함수와 클래스들을 나열합니다. + +이 함수들 대부분은 시계열 모델의 코드를 연구하거나 분포 출력 클래스의 컬렉션에 추가하려는 경우에만 유용합니다. + +## 분포 출력 (Distributional Output) [[transformers.time_series_utils.NormalOutput]] + +[[autodoc]] time_series_utils.NormalOutput + +[[autodoc]] time_series_utils.StudentTOutput + +[[autodoc]] time_series_utils.NegativeBinomialOutput diff --git a/docs/source/ko/internal/tokenization_utils.md b/docs/source/ko/internal/tokenization_utils.md new file mode 100644 index 00000000000000..b5b69910479a2f --- /dev/null +++ b/docs/source/ko/internal/tokenization_utils.md @@ -0,0 +1,39 @@ + + +# 토크나이저를 위한 유틸리티 [[utilities-for-tokenizers]] + +이 페이지는 토크나이저에서 사용되는 모든 유틸리티 함수들을 나열하며, 주로 [`PreTrainedTokenizer`]와 [`PreTrainedTokenizerFast`] 사이의 공통 메소드를 구현하는 [`~tokenization_utils_base.PreTrainedTokenizerBase`] 클래스와 [`~tokenization_utils_base.SpecialTokensMixin`]을 다룹니다. + +이 함수들 대부분은 라이브러리의 토크나이저 코드를 연구할 때만 유용합니다. + +## PreTrainedTokenizerBase [[transformers.PreTrainedTokenizerBase]] + +[[autodoc]] tokenization_utils_base.PreTrainedTokenizerBase + - __call__ + - all + +## SpecialTokensMixin [[transformers.SpecialTokensMixin]] + +[[autodoc]] tokenization_utils_base.SpecialTokensMixin + +## Enums 및 namedtuples [[transformers.tokenization_utils_base.TruncationStrategy]] + +[[autodoc]] tokenization_utils_base.TruncationStrategy + +[[autodoc]] tokenization_utils_base.CharSpan + +[[autodoc]] tokenization_utils_base.TokenSpan diff --git a/docs/source/ko/main_classes/callback.md b/docs/source/ko/main_classes/callback.md new file mode 100644 index 00000000000000..1f280a412989e7 --- /dev/null +++ b/docs/source/ko/main_classes/callback.md @@ -0,0 +1,133 @@ + + +# 콜백 [[callbacks]] + +콜백은 PyTorch [`Trainer`]의 반복 학습 동작을 사용자 정의할 수 있는 객체입니다 +(이 기능은 TensorFlow에서는 아직 구현되지 않았습니다). 콜백은 반복 학습의 상태를 +검사하여 (진행 상황 보고, TensorBoard 또는 기타 머신 러닝 플랫폼에 로그 남기기 등) +결정(예: 조기 종료)을 내릴 수 있습니다. + +콜백은 [`TrainerControl`] 객체를 반환하는 것 외에는 반복 학습에서 어떤 것도 변경할 수 없는 +"읽기 전용" 코드 조각입니다. 반복 학습에 변경이 필요한 사용자 정의 작업이 필요한 경우, +[`Trainer`]를 서브클래스로 만들어 필요한 메소드들을 오버라이드해야 합니다 (예시는 [trainer](trainer)를 참조하세요). + +기본적으로 `TrainingArguments.report_to`는 `"all"`로 설정되어 있으므로, [`Trainer`]는 다음 콜백을 사용합니다. + +- [`DefaultFlowCallback`]는 로그, 저장, 평가에 대한 기본 동작을 처리합니다. +- [`PrinterCallback`] 또는 [`ProgressCallback`]는 진행 상황을 표시하고 로그를 출력합니다 + ([`TrainingArguments`]를 통해 tqdm을 비활성화하면 첫 번째 콜백이 사용되고, 그렇지 않으면 두 번째가 사용됩니다). +- [`~integrations.TensorBoardCallback`]는 TensorBoard가 (PyTorch >= 1.4 + 또는 tensorboardX를 통해) 접근 가능하면 사용됩니다. +- [`~integrations.WandbCallback`]는 [wandb](https://www.wandb.com/)가 설치되어 있으면 + 사용됩니다. +- [`~integrations.CometCallback`]는 [comet_ml](https://www.comet.com/site/)이 설치되어 있으면 사용됩니다. 
+- [`~integrations.MLflowCallback`]는 [mlflow](https://www.mlflow.org/)가 설치되어 있으면 사용됩니다. +- [`~integrations.NeptuneCallback`]는 [neptune](https://neptune.ai/)이 설치되어 있으면 사용됩니다. +- [`~integrations.AzureMLCallback`]는 [azureml-sdk](https://pypi.org/project/azureml-sdk/)가 설치되어 + 있으면 사용됩니다. +- [`~integrations.CodeCarbonCallback`]는 [codecarbon](https://pypi.org/project/codecarbon/)이 설치되어 + 있으면 사용됩니다. +- [`~integrations.ClearMLCallback`]는 [clearml](https://github.com/allegroai/clearml)이 설치되어 있으면 사용됩니다. +- [`~integrations.DagsHubCallback`]는 [dagshub](https://dagshub.com/)이 설치되어 있으면 사용됩니다. +- [`~integrations.FlyteCallback`]는 [flyte](https://flyte.org/)가 설치되어 있으면 사용됩니다. +- [`~integrations.DVCLiveCallback`]는 [dvclive](https://dvc.org/doc/dvclive)가 설치되어 있으면 사용됩니다. + +패키지가 설치되어 있지만 해당 통합 기능을 사용하고 싶지 않다면, `TrainingArguments.report_to`를 사용하고자 하는 통합 기능 목록으로 변경할 수 있습니다 (예: `["azure_ml", "wandb"]`). + +콜백을 구현하는 주요 클래스는 [`TrainerCallback`]입니다. 이 클래스는 [`Trainer`]를 +인스턴스화하는 데 사용된 [`TrainingArguments`]를 가져오고, 해당 Trainer의 내부 상태를 +[`TrainerState`]를 통해 접근할 수 있으며, [`TrainerControl`]을 통해 반복 학습에서 일부 +작업을 수행할 수 있습니다. + + +## 사용 가능한 콜백 [[available-callbacks]] + +라이브러리에서 사용 가능한 [`TrainerCallback`] 목록은 다음과 같습니다: + +[[autodoc]] integrations.CometCallback + - setup + +[[autodoc]] DefaultFlowCallback + +[[autodoc]] PrinterCallback + +[[autodoc]] ProgressCallback + +[[autodoc]] EarlyStoppingCallback + +[[autodoc]] integrations.TensorBoardCallback + +[[autodoc]] integrations.WandbCallback + - setup + +[[autodoc]] integrations.MLflowCallback + - setup + +[[autodoc]] integrations.AzureMLCallback + +[[autodoc]] integrations.CodeCarbonCallback + +[[autodoc]] integrations.NeptuneCallback + +[[autodoc]] integrations.ClearMLCallback + +[[autodoc]] integrations.DagsHubCallback + +[[autodoc]] integrations.FlyteCallback + +[[autodoc]] integrations.DVCLiveCallback + - setup + +## TrainerCallback [[trainercallback]] + +[[autodoc]] TrainerCallback + +여기 PyTorch [`Trainer`]와 함께 사용자 정의 콜백을 등록하는 예시가 있습니다: + +```python +class MyCallback(TrainerCallback): + "A callback that prints a message at the beginning of training" + + def on_train_begin(self, args, state, control, **kwargs): + print("Starting training") + + +trainer = Trainer( + model, + args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + callbacks=[MyCallback], # 우리는 콜백 클래스를 이 방식으로 전달하거나 그것의 인스턴스(MyCallback())를 전달할 수 있습니다 +) +``` + +또 다른 콜백을 등록하는 방법은 `trainer.add_callback()`을 호출하는 것입니다: + +```python +trainer = Trainer(...) +trainer.add_callback(MyCallback) +# 다른 방법으로는 콜백 클래스의 인스턴스를 전달할 수 있습니다 +trainer.add_callback(MyCallback()) +``` + +## TrainerState [[trainerstate]] + +[[autodoc]] TrainerState + +## TrainerControl [[trainercontrol]] + +[[autodoc]] TrainerControl diff --git a/docs/source/ko/main_classes/configuration.md b/docs/source/ko/main_classes/configuration.md new file mode 100644 index 00000000000000..6c6278a0258560 --- /dev/null +++ b/docs/source/ko/main_classes/configuration.md @@ -0,0 +1,28 @@ + + +# 구성[[configuration]] + +기본 클래스 [`PretrainedConfig`]는 로컬 파일이나 디렉토리, 또는 라이브러리에서 제공하는 사전 학습된 모델 구성(HuggingFace의 AWS S3 저장소에서 다운로드됨)으로부터 구성을 불러오거나 저장하는 공통 메서드를 구현합니다. 각 파생 구성 클래스는 모델별 특성을 구현합니다. + +모든 구성 클래스에 존재하는 공통 속성은 다음과 같습니다: `hidden_size`, `num_attention_heads`, `num_hidden_layers`. 텍스트 모델은 추가로 `vocab_size`를 구현합니다. 
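아래는 공통 속성에 접근하고 구성을 저장한 뒤 다시 불러오는 과정을 보여주는 간단한 예시입니다. 모델 이름과 저장 경로는 설명을 위한 가정입니다.

```python
from transformers import AutoConfig

# 사전 학습된 모델의 구성을 불러옵니다 (모델 이름은 예시입니다).
config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
print(config.hidden_size, config.num_attention_heads, config.num_hidden_layers, config.vocab_size)

# 일부 속성을 변경한 구성을 로컬 디렉토리에 저장하고 다시 불러올 수 있습니다.
config.num_hidden_layers = 6
config.save_pretrained("./my-bert-config")
reloaded_config = AutoConfig.from_pretrained("./my-bert-config")
```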
+ + +## PretrainedConfig[[transformers.PretrainedConfig]] + +[[autodoc]] PretrainedConfig + - push_to_hub + - all diff --git a/docs/source/ko/main_classes/data_collator.md b/docs/source/ko/main_classes/data_collator.md new file mode 100644 index 00000000000000..0e677c7d8979b8 --- /dev/null +++ b/docs/source/ko/main_classes/data_collator.md @@ -0,0 +1,66 @@ + + +# 데이터 콜레이터(Data Collator)[[data-collator]] + +데이터 콜레이터는 데이터셋 요소들의 리스트를 입력으로 사용하여 배치를 형성하는 객체입니다. 이러한 요소들은 `train_dataset` 또는 `eval_dataset의` 요소들과 동일한 타입 입니다. 배치를 구성하기 위해, 데이터 콜레이터는 (패딩과 같은) 일부 처리를 적용할 수 있습니다. [`DataCollatorForLanguageModeling`]과 같은 일부 콜레이터는 형성된 배치에 (무작위 마스킹과 같은) 일부 무작위 데이터 증강도 적용합니다. 사용 예시는 [예제 스크립트](../examples)나 [예제 노트북](../notebooks)에서 찾을 수 있습니다. + + +## 기본 데이터 콜레이터[[transformers.default_data_collator]] + +[[autodoc]] data.data_collator.default_data_collator + +## DefaultDataCollator[[transformers.DefaultDataCollator]] + +[[autodoc]] data.data_collator.DefaultDataCollator + +## DataCollatorWithPadding[[transformers.DataCollatorWithPadding]] + +[[autodoc]] data.data_collator.DataCollatorWithPadding + +## DataCollatorForTokenClassification[[transformers.DataCollatorForTokenClassification]] + +[[autodoc]] data.data_collator.DataCollatorForTokenClassification + +## DataCollatorForSeq2Seq[[transformers.DataCollatorForSeq2Seq]] + +[[autodoc]] data.data_collator.DataCollatorForSeq2Seq + +## DataCollatorForLanguageModeling[[transformers.DataCollatorForLanguageModeling]] + +[[autodoc]] data.data_collator.DataCollatorForLanguageModeling + - numpy_mask_tokens + - tf_mask_tokens + - torch_mask_tokens + +## DataCollatorForWholeWordMask[[transformers.DataCollatorForWholeWordMask]] + +[[autodoc]] data.data_collator.DataCollatorForWholeWordMask + - numpy_mask_tokens + - tf_mask_tokens + - torch_mask_tokens + +## DataCollatorForPermutationLanguageModeling[[transformers.DataCollatorForPermutationLanguageModeling]] + +[[autodoc]] data.data_collator.DataCollatorForPermutationLanguageModeling + - numpy_mask_tokens + - tf_mask_tokens + - torch_mask_tokens + +## DataCollatorWithFlatteningtransformers.DataCollatorWithFlattening + +[[autodoc]] data.data_collator.DataCollatorWithFlattening + diff --git a/docs/source/ko/main_classes/feature_extractor.md b/docs/source/ko/main_classes/feature_extractor.md new file mode 100644 index 00000000000000..7c667424c4ff95 --- /dev/null +++ b/docs/source/ko/main_classes/feature_extractor.md @@ -0,0 +1,39 @@ + + +# 특성 추출기 [[feature-extractor]] + +특성 추출기는 오디오 또는 비전 모델을 위한 입력 특성을 준비하는 역할을 합니다. 여기에는 시퀀스에서 특성을 추출하는 작업(예를 들어, 오디오 파일을 전처리하여 Log-Mel 스펙트로그램 특성을 생성하는 것), 이미지에서 특성을 추출하는 작업(예를 들어, 이미지 파일을 자르는 것)이 포함됩니다. 뿐만 아니라 패딩, 정규화 및 NumPy, PyTorch, TensorFlow 텐서로의 변환도 포함됩니다. 
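아래는 오디오 특성 추출기의 일반적인 사용 흐름을 보여주는 간단한 스케치입니다. 모델 이름은 설명을 위한 가정이며, 실제로는 원시 오디오 파형을 입력으로 사용합니다.

```python
import numpy as np
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# 1초 분량의 무음을 예시 입력으로 사용합니다 (16kHz 샘플링 가정).
raw_audio = np.zeros(16000, dtype=np.float32)
inputs = feature_extractor(raw_audio, sampling_rate=16000, padding=True, return_tensors="pt")
print(inputs["input_values"].shape)
```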
+ + +## FeatureExtractionMixin [[transformers.FeatureExtractionMixin]] + +[[autodoc]] feature_extraction_utils.FeatureExtractionMixin + - from_pretrained + - save_pretrained + +## SequenceFeatureExtractor [[transformers.SequenceFeatureExtractor]] + +[[autodoc]] SequenceFeatureExtractor + - pad + +## BatchFeature [[transformers.BatchFeature]] + +[[autodoc]] BatchFeature + +## ImageFeatureExtractionMixin [[transformers.ImageFeatureExtractionMixin]] + +[[autodoc]] image_utils.ImageFeatureExtractionMixin diff --git a/docs/source/ko/main_classes/keras_callbacks.md b/docs/source/ko/main_classes/keras_callbacks.md new file mode 100644 index 00000000000000..25d5ea3e40083a --- /dev/null +++ b/docs/source/ko/main_classes/keras_callbacks.md @@ -0,0 +1,27 @@ + + +# 케라스 콜백[[keras-callbacks]] + +케라스로 트랜스포머 모델을 학습할 때, 일반적인 작업을 자동화하기 위한 라이브러리 전용 콜백들을 사용 할 수 있습니다. + +## KerasMetricCallback[[transformers.KerasMetricCallback]] + +[[autodoc]] KerasMetricCallback + +## PushToHubCallback[[transformers.PushToHubCallback]] + +[[autodoc]] PushToHubCallback diff --git a/docs/source/ko/main_classes/logging.md b/docs/source/ko/main_classes/logging.md new file mode 100644 index 00000000000000..55e1a21c7bd57b --- /dev/null +++ b/docs/source/ko/main_classes/logging.md @@ -0,0 +1,108 @@ + + +# 로깅 [[logging]] + +🤗 트랜스포머는 중앙 집중식 로깅 시스템을 제공하여 라이브러리의 출력 레벨을 쉽게 설정할 수 있습니다. + +현재 라이브러리의 기본 출력 레벨은 `WARNING`으로 설정되어 있습니다. + +출력 레벨을 변경하려면 직접적인 설정 메서드를 사용할 수 있습니다. 예를 들어, 출력 레벨을 INFO 수준으로 변경하는 방법은 다음과 같습니다. + +```python +import transformers + +transformers.logging.set_verbosity_info() +``` + +환경 변수 `TRANSFORMERS_VERBOSITY`를 사용하여 기본 출력 레벨을 재정의할 수도 있습니다. 이를 `debug`, `info`, `warning`, `error`, `critical`, `fatal` 중 하나로 설정할 수 있습니다. 예를 들어 다음과 같습니다. + +```bash +TRANSFORMERS_VERBOSITY=error ./myprogram.py +``` + +또한, 일부 `warnings`는 환경 변수 `TRANSFORMERS_NO_ADVISORY_WARNINGS`를 1과 같은 true 값으로 설정하여 비활성화할 수 있습니다. 이렇게 하면 [`logger.warning_advice`]를 사용하여 기록된 경고가 비활성화됩니다. 예를 들어 다음과 같습니다. + +```bash +TRANSFORMERS_NO_ADVISORY_WARNINGS=1 ./myprogram.py +``` + +다음은 라이브러리와 동일한 로거를 자신의 모듈이나 스크립트에서 사용하는 방법에 대한 예시입니다. + +```python +from transformers.utils import logging + +logging.set_verbosity_info() +logger = logging.get_logger("transformers") +logger.info("INFO") +logger.warning("WARN") +``` + + +이 로깅 모듈의 모든 메서드는 아래에 문서화되어 있으며, 주요 메서드는 현재 로거의 출력 수준을 가져오는 [`logging.get_verbosity`]와 원하는 출력 수준으로 설정하는 [`logging.set_verbosity`] 입니다. 출력 수준은 (가장 적은 출력에서 가장 많은 출력 순으로) 다음과 같으며, 해당 수준에 대응하는 정수 값은 괄호 안에 표시됩니다. + +- `transformers.logging.CRITICAL` 또는 `transformers.logging.FATAL` (정숫값, 50): 가장 심각한 오류만 보고합니다. +- `transformers.logging.ERROR` (정숫값, 40): 오류만 보고합니다. +- `transformers.logging.WARNING` 또는 `transformers.logging.WARN` (정숫값, 30): 오류와 경고만 보고합니다. 이는 라이브러리에서 기본으로 사용되는 수준입니다. +- `transformers.logging.INFO` (정숫값, 20): 오류, 경고, 그리고 기본적인 정보를 보고합니다. +- `transformers.logging.DEBUG` (정숫값, 10): 모든 정보를 보고합니다. + +기본적으로 모델 다운로드 중에는 `tqdm` 진행 표시줄이 표시됩니다. [`logging.disable_progress_bar`]와 [`logging.enable_progress_bar`]를 사용하여 이 동작을 숨기거나 다시 표시할 수 있습니다. + +## `logging` vs `warnings`[[transformers.utils.logging.captureWarnings]] + +Python에는 종종 함께 사용되는 두 가지 로깅 시스템이 있습니다. 위에서 설명한 `logging`과 `warnings`입니다. `warnings`는 특정 범주로 경고를 세분화할 수 있습니다. 예를 들어, 이미 더 이상 사용되지 않는 기능이나 경로에 대해 `FutureWarning`이 사용되고, 곧 사용 중단될 기능을 알리기 위해 `DeprecationWarning`이 사용됩니다. + +트랜스포머 라이브러리에서는 두 시스템 모두를 사용합니다. `logging`의 `captureWarnings` 메서드를 활용하고 이를 조정하여 위에서 설명한 출력 수준 설정자들을 통해 이러한 경고 메시지들을 관리할 수 있도록 합니다. + +라이브러리 개발자는 다음과 같은 지침을 따르는 것이 좋습니다. + +- `warnings`는 라이브러리 개발자와 `transformers`에 의존하는 라이브러리 개발자들에게 유리합니다. 
+- `logging`은 일반적인 프로젝트 라이브러리 개발자보다는, 라이브러리를 사용하는 최종 사용자들에게 유리할 것입니다. + +아래에서 `captureWarnings` 메소드에 대한 참고 사항을 확인할 수 있습니다. + +[[autodoc]] logging.captureWarnings + +## 기본 설정자 [[transformers.utils.logging.set_verbosity_error]] + +[[autodoc]] logging.set_verbosity_error + +[[autodoc]] logging.set_verbosity_warning + +[[autodoc]] logging.set_verbosity_info + +[[autodoc]] logging.set_verbosity_debug + +## 기타 함수 [[transformers.utils.logging.get_verbosity]] + +[[autodoc]] logging.get_verbosity + +[[autodoc]] logging.set_verbosity + +[[autodoc]] logging.get_logger + +[[autodoc]] logging.enable_default_handler + +[[autodoc]] logging.disable_default_handler + +[[autodoc]] logging.enable_explicit_format + +[[autodoc]] logging.reset_format + +[[autodoc]] logging.enable_progress_bar + +[[autodoc]] logging.disable_progress_bar diff --git a/docs/source/ko/main_classes/model.md b/docs/source/ko/main_classes/model.md new file mode 100644 index 00000000000000..71a9768deee1ec --- /dev/null +++ b/docs/source/ko/main_classes/model.md @@ -0,0 +1,68 @@ + + +# 모델 + +기본 클래스 [`PreTrainedModel`], [`TFPreTrainedModel`], [`FlaxPreTrainedModel`]는 로컬 파일과 디렉토리로부터 모델을 로드하고 저장하거나 또는 (허깅페이스 AWS S3 리포지토리로부터 다운로드된) 라이브러리에서 제공하는 사전 훈련된 모델 설정을 로드하고 저장하는 것을 지원하는 기본 메소드를 구현하였습니다. + +[`PreTrainedModel`]과 [`TFPreTrainedModel`]은 또한 모든 모델들을 공통적으로 지원하는 메소드 여러개를 구현하였습니다: + +- 새 토큰이 단어장에 추가될 때, 입력 토큰 임베딩의 크기를 조정합니다. +- 모델의 어텐션 헤드를 가지치기합니다. + +각 모델에 공통인 다른 메소드들은 다음의 클래스에서 정의됩니다. +- [`~modeling_utils.ModuleUtilsMixin`](파이토치 모델용) +- 텍스트 생성을 위한 [`~modeling_tf_utils.TFModuleUtilsMixin`](텐서플로 모델용) +- [`~generation.GenerationMixin`](파이토치 모델용) +- [`~generation.FlaxGenerationMixin`](Flax/JAX 모델용) + +## PreTrainedModel + +[[autodoc]] PreTrainedModel + - push_to_hub + - all + +사용자 정의 모델은 초고속 초기화(superfast init)가 특정 모델에 적용될 수 있는지 여부를 결정하는 `_supports_assign_param_buffer`도 포함해야 합니다. +`test_save_and_load_from_pretrained` 실패 시, 모델이 `_supports_assign_param_buffer`를 필요로 하는지 확인하세요. +필요로 한다면 `False`로 설정하세요. + +## ModuleUtilsMixin + +[[autodoc]] modeling_utils.ModuleUtilsMixin + +## TFPreTrainedModel + +[[autodoc]] TFPreTrainedModel + - push_to_hub + - all + +## TFModelUtilsMixin + +[[autodoc]] modeling_tf_utils.TFModelUtilsMixin + +## FlaxPreTrainedModel + +[[autodoc]] FlaxPreTrainedModel + - push_to_hub + - all + +## 허브에 저장하기 + +[[autodoc]] utils.PushToHubMixin + +## 공유된 체크포인트 + +[[autodoc]] modeling_utils.load_sharded_checkpoint diff --git a/docs/source/ko/main_classes/onnx.md b/docs/source/ko/main_classes/onnx.md new file mode 100644 index 00000000000000..510dab9df2f933 --- /dev/null +++ b/docs/source/ko/main_classes/onnx.md @@ -0,0 +1,50 @@ + + +# 🤗 Transformers 모델을 ONNX로 내보내기[[exporting--transformers-models-to-onnx]] + +🤗 트랜스포머는 `transformers.onnx` 패키지를 제공하며, 이 패키지는 설정 객체를 활용하여 모델 체크포인트를 ONNX 그래프로 변환할 수 있게 합니다. + +🤗 Transformers에 대한 자세한 내용은 [이 가이드](../serialization)를 참조하세요. + +## ONNX 설정[[onnx-configurations]] + +내보내려는(export) 모델 아키텍처의 유형에 따라 상속받아야 할 세 가지 추상 클래스를 제공합니다: + +* 인코더 기반 모델은 [`~onnx.config.OnnxConfig`]을 상속받습니다. +* 디코더 기반 모델은 [`~onnx.config.OnnxConfigWithPast`]을 상속받습니다. +* 인코더-디코더 기반 모델은 [`~onnx.config.OnnxSeq2SeqConfigWithPast`]을 상속받습니다. 
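+
+참고로, 아래는 이러한 설정 클래스를 사용해 체크포인트를 ONNX 그래프로 내보내는 과정을 보여주는 간단한 스케치입니다. `distilbert-base-uncased` 체크포인트와 `DistilBertOnnxConfig`는 설명을 위한 가정이며, 실제로는 내보내려는 모델에 맞는 설정 클래스를 사용해야 합니다(`onnx` 패키지가 설치되어 있다고 가정합니다).
+
+```python
+from pathlib import Path
+
+from transformers import AutoModel, AutoTokenizer
+from transformers.models.distilbert import DistilBertOnnxConfig
+from transformers.onnx import export
+
+ckpt = "distilbert-base-uncased"  # 설명을 위한 예시 체크포인트
+model = AutoModel.from_pretrained(ckpt)
+tokenizer = AutoTokenizer.from_pretrained(ckpt)
+
+# 모델 설정으로부터 ONNX 내보내기 설정 객체를 만듭니다
+onnx_config = DistilBertOnnxConfig(model.config)
+
+# 토크나이저, 모델, 설정 객체를 사용해 ONNX 그래프를 생성합니다
+onnx_inputs, onnx_outputs = export(
+    tokenizer, model, onnx_config, onnx_config.default_onnx_opset, Path("model.onnx")
+)
+```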
+ +### OnnxConfig[[transformers.onnx.OnnxConfig]] + +[[autodoc]] onnx.config.OnnxConfig + +### OnnxConfigWithPast[[transformers.onnx.OnnxConfigWithPast]] + +[[autodoc]] onnx.config.OnnxConfigWithPast + +### OnnxSeq2SeqConfigWithPast[[OnnxSeq2SeqConfigWithPast]] + +[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast + +## ONNX 특징[[onnx-features]] + +각 ONNX 설정은 다양한 유형의 토폴로지나 작업에 대해 모델을 내보낼 수 있게(exporting) 해주는 _features_ 세트와 연관되어 있습니다. + +### FeaturesManager[[transformers.onnx.FeaturesManager]] + +[[autodoc]] onnx.features.FeaturesManager + diff --git a/docs/source/ko/main_classes/output.md b/docs/source/ko/main_classes/output.md new file mode 100644 index 00000000000000..e65a2c2c35906e --- /dev/null +++ b/docs/source/ko/main_classes/output.md @@ -0,0 +1,314 @@ + + +# 모델 출력[[model-outputs]] + +모든 모델에는 [`~utils.ModelOutput`]의 서브클래스의 인스턴스인 모델 출력이 있습니다. 이들은 +모델에서 반환되는 모든 정보를 포함하는 데이터 구조이지만 튜플이나 딕셔너리로도 사용할 수 있습니다. + +예제를 통해 살펴보겠습니다: + +```python +from transformers import BertTokenizer, BertForSequenceClassification +import torch + +tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased") + +inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") +labels = torch.tensor([1]).unsqueeze(0) # 배치 크기 1 +outputs = model(**inputs, labels=labels) +``` + +`outputs` 객체는 [`~modeling_outputs.SequenceClassifierOutput`]입니다. +아래 해당 클래스의 문서에서 볼 수 있듯이, `loss`(선택적), `logits`, `hidden_states`(선택적) 및 `attentions`(선택적) 항목이 있습니다. 여기에서는 `labels`를 전달했기 때문에 `loss`가 있지만 `hidden_states`와 `attentions`가 없는데, 이는 `output_hidden_states=True` 또는 `output_attentions=True`를 전달하지 않았기 때문입니다. + + + +`output_hidden_states=True`를 전달할 때 `outputs.hidden_states[-1]`가 `outputs.last_hidden_state`와 정확히 일치할 것으로 예상할 수 있습니다. +하지만 항상 그런 것은 아닙니다. 일부 모델은 마지막 은닉 상태가 반환될 때 정규화를 적용하거나 다른 후속 프로세스를 적용합니다. + + + + +일반적으로 사용할 때와 동일하게 각 속성들에 접근할 수 있으며, 모델이 해당 속성을 반환하지 않은 경우 `None`이 반환됩니다. 예시에서는 `outputs.loss`는 모델에서 계산한 손실이고 `outputs.attentions`는 `None`입니다. + +`outputs` 객체를 튜플로 간주할 때는 `None` 값이 없는 속성만 고려합니다. +예시에서는 `loss`와 `logits`라는 두 개의 요소가 있습니다. 그러므로, + +```python +outputs[:2] +``` + +는 `(outputs.loss, outputs.logits)` 튜플을 반환합니다. + +`outputs` 객체를 딕셔너리로 간주할 때는 `None` 값이 없는 속성만 고려합니다. +예시에는 `loss`와 `logits`라는 두 개의 키가 있습니다. + +여기서부터는 두 가지 이상의 모델 유형에서 사용되는 일반 모델 출력을 다룹니다. 구체적인 출력 유형은 해당 모델 페이지에 문서화되어 있습니다. 
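+
+위의 예제에서 만든 `outputs` 객체가 있다고 가정하면, 아래와 같이 속성, 딕셔너리 키, 튜플 변환 방식 모두로 같은 값에 접근할 수 있습니다.
+
+```python
+# 속성으로 접근합니다 (반환되지 않은 항목은 None)
+print(outputs.loss, outputs.attentions)
+
+# 딕셔너리처럼 접근합니다 (None이 아닌 항목만 키로 노출됩니다)
+print(list(outputs.keys()))     # ['loss', 'logits']
+print(outputs["logits"].shape)  # torch.Size([1, 2])
+
+# None이 아닌 항목만 포함하는 튜플로 변환합니다
+loss, logits = outputs.to_tuple()
+```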
+ +## ModelOutput[[transformers.utils.ModelOutput]] + +[[autodoc]] utils.ModelOutput + - to_tuple + +## BaseModelOutput[[transformers.BaseModelOutput]] + +[[autodoc]] modeling_outputs.BaseModelOutput + +## BaseModelOutputWithPooling[[transformers.modeling_outputs.BaseModelOutputWithPooling]] + +[[autodoc]] modeling_outputs.BaseModelOutputWithPooling + +## BaseModelOutputWithCrossAttentions[[transformers.modeling_outputs.BaseModelOutputWithCrossAttentions]] + +[[autodoc]] modeling_outputs.BaseModelOutputWithCrossAttentions + +## BaseModelOutputWithPoolingAndCrossAttentions[[transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions]] + +[[autodoc]] modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions + +## BaseModelOutputWithPast[[transformers.modeling_outputs.BaseModelOutputWithPast]] + +[[autodoc]] modeling_outputs.BaseModelOutputWithPast + +## BaseModelOutputWithPastAndCrossAttentions[[transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions]] + +[[autodoc]] modeling_outputs.BaseModelOutputWithPastAndCrossAttentions + +## Seq2SeqModelOutput[[transformers.modeling_outputs.Seq2SeqModelOutput]] + +[[autodoc]] modeling_outputs.Seq2SeqModelOutput + +## CausalLMOutput[[transformers.modeling_outputs.CausalLMOutput]] + +[[autodoc]] modeling_outputs.CausalLMOutput + +## CausalLMOutputWithCrossAttentions[[transformers.modeling_outputs.CausalLMOutputWithCrossAttentions]] + +[[autodoc]] modeling_outputs.CausalLMOutputWithCrossAttentions + +## CausalLMOutputWithPast[[transformers.modeling_outputs.CausalLMOutputWithPast]] + +[[autodoc]] modeling_outputs.CausalLMOutputWithPast + +## MaskedLMOutput[[transformers.modeling_outputs.MaskedLMOutput]] + +[[autodoc]] modeling_outputs.MaskedLMOutput + +## Seq2SeqLMOutput[[transformers.modeling_outputs.Seq2SeqLMOutput]] + +[[autodoc]] modeling_outputs.Seq2SeqLMOutput + +## NextSentencePredictorOutput[[transformers.modeling_outputs.NextSentencePredictorOutput]] + +[[autodoc]] modeling_outputs.NextSentencePredictorOutput + +## SequenceClassifierOutput[[transformers.modeling_outputs.SequenceClassifierOutput]] + +[[autodoc]] modeling_outputs.SequenceClassifierOutput + +## Seq2SeqSequenceClassifierOutput[[transformers.modeling_outputs.Seq2SeqSequenceClassifierOutput]] + +[[autodoc]] modeling_outputs.Seq2SeqSequenceClassifierOutput + +## MultipleChoiceModelOutput[[transformers.modeling_outputs.MultipleChoiceModelOutput]] + +[[autodoc]] modeling_outputs.MultipleChoiceModelOutput + +## TokenClassifierOutput[[transformers.modeling_outputs.TokenClassifierOutput]] + +[[autodoc]] modeling_outputs.TokenClassifierOutput + +## QuestionAnsweringModelOutput[[transformers.modeling_outputs.QuestionAnsweringModelOutput]] + +[[autodoc]] modeling_outputs.QuestionAnsweringModelOutput + +## Seq2SeqQuestionAnsweringModelOutput[[transformers.modeling_outputs.Seq2SeqQuestionAnsweringModelOutput]] + +[[autodoc]] modeling_outputs.Seq2SeqQuestionAnsweringModelOutput + +## Seq2SeqSpectrogramOutput[[transformers.modeling_outputs.Seq2SeqSpectrogramOutput]] + +[[autodoc]] modeling_outputs.Seq2SeqSpectrogramOutput + +## SemanticSegmenterOutput[[transformers.modeling_outputs.SemanticSegmenterOutput]] + +[[autodoc]] modeling_outputs.SemanticSegmenterOutput + +## ImageClassifierOutput[[transformers.modeling_outputs.ImageClassifierOutput]] + +[[autodoc]] modeling_outputs.ImageClassifierOutput + +## ImageClassifierOutputWithNoAttention[[transformers.modeling_outputs.ImageClassifierOutputWithNoAttention]] + +[[autodoc]] 
modeling_outputs.ImageClassifierOutputWithNoAttention + +## DepthEstimatorOutput[[transformers.modeling_outputs.DepthEstimatorOutput]] + +[[autodoc]] modeling_outputs.DepthEstimatorOutput + +## Wav2Vec2BaseModelOutput[[transformers.modeling_outputs.Wav2Vec2BaseModelOutput]] + +[[autodoc]] modeling_outputs.Wav2Vec2BaseModelOutput + +## XVectorOutput[[transformers.modeling_outputs.XVectorOutput]] + +[[autodoc]] modeling_outputs.XVectorOutput + +## Seq2SeqTSModelOutput[[transformers.modeling_outputs.Seq2SeqTSModelOutput]] + +[[autodoc]] modeling_outputs.Seq2SeqTSModelOutput + +## Seq2SeqTSPredictionOutput[[transformers.modeling_outputs.Seq2SeqTSPredictionOutput]] + +[[autodoc]] modeling_outputs.Seq2SeqTSPredictionOutput + +## SampleTSPredictionOutput[[transformers.modeling_outputs.SampleTSPredictionOutput]] + +[[autodoc]] modeling_outputs.SampleTSPredictionOutput + +## TFBaseModelOutput[[transformers.modeling_outputs.TFBaseModelOutput]] + +[[autodoc]] modeling_tf_outputs.TFBaseModelOutput + +## TFBaseModelOutputWithPooling[[transformers.modeling_tf_outputs.TFBaseModelOutputWithPooling]] + +[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPooling + +## TFBaseModelOutputWithPoolingAndCrossAttentions[[transformers.modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions]] + +[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions + +## TFBaseModelOutputWithPast[[transformers.modeling_tf_outputs.TFBaseModelOutputWithPast]] + +[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPast + +## TFBaseModelOutputWithPastAndCrossAttentions[[transformers.modeling_tf_outputs.TFBaseModelOutputWithPastAndCrossAttentions]] + +[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPastAndCrossAttentions + +## TFSeq2SeqModelOutput[[transformers.modeling_tf_outputs.TFSeq2SeqModelOutput]] + +[[autodoc]] modeling_tf_outputs.TFSeq2SeqModelOutput + +## TFCausalLMOutput[[transformers.modeling_tf_outputs.TFCausalLMOutput]] + +[[autodoc]] modeling_tf_outputs.TFCausalLMOutput + +## TFCausalLMOutputWithCrossAttentions[[transformers.modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions]] + +[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions + +## TFCausalLMOutputWithPast[[transformers.modeling_tf_outputs.TFCausalLMOutputWithPast]] + +[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithPast + +## TFMaskedLMOutput[[transformers.modeling_tf_outputs.TFMaskedLMOutput]] + +[[autodoc]] modeling_tf_outputs.TFMaskedLMOutput + +## TFSeq2SeqLMOutput[[transformers.modeling_tf_outputs.TFSeq2SeqLMOutput]] + +[[autodoc]] modeling_tf_outputs.TFSeq2SeqLMOutput + +## TFNextSentencePredictorOutput[[transformers.modeling_tf_outputs.TFNextSentencePredictorOutput]] + +[[autodoc]] modeling_tf_outputs.TFNextSentencePredictorOutput + +## TFSequenceClassifierOutput[[transformers.modeling_tf_outputs.TFSequenceClassifierOutput]] + +[[autodoc]] modeling_tf_outputs.TFSequenceClassifierOutput + +## TFSeq2SeqSequenceClassifierOutput[[transformers.modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput]] + +[[autodoc]] modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput + +## TFMultipleChoiceModelOutput[[transformers.modeling_tf_outputs.TFMultipleChoiceModelOutput]] + +[[autodoc]] modeling_tf_outputs.TFMultipleChoiceModelOutput + +## TFTokenClassifierOutput[[transformers.modeling_tf_outputs.TFTokenClassifierOutput]] + +[[autodoc]] modeling_tf_outputs.TFTokenClassifierOutput + +## TFQuestionAnsweringModelOutput[[transformers.modeling_tf_outputs.TFQuestionAnsweringModelOutput]] + +[[autodoc]] 
modeling_tf_outputs.TFQuestionAnsweringModelOutput + +## TFSeq2SeqQuestionAnsweringModelOutput[[transformers.modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput]] + +[[autodoc]] modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput + +## FlaxBaseModelOutput[[transformers.modeling_flax_outputs.FlaxBaseModelOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutput + +## FlaxBaseModelOutputWithPast[[transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPast]] + +[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPast + +## FlaxBaseModelOutputWithPooling[[transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPooling]] + +[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPooling + +## FlaxBaseModelOutputWithPastAndCrossAttentions[[transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions]] + +[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions + +## FlaxSeq2SeqModelOutput[[transformers.modeling_flax_outputs.FlaxSeq2SeqModelOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqModelOutput + +## FlaxCausalLMOutputWithCrossAttentions[[transformers.modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions]] + +[[autodoc]] modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions + +## FlaxMaskedLMOutput[[transformers.modeling_flax_outputs.FlaxMaskedLMOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxMaskedLMOutput + +## FlaxSeq2SeqLMOutput[[transformers.modeling_flax_outputs.FlaxSeq2SeqLMOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqLMOutput + +## FlaxNextSentencePredictorOutput[[transformers.modeling_flax_outputs.FlaxNextSentencePredictorOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxNextSentencePredictorOutput + +## FlaxSequenceClassifierOutput[[transformers.modeling_flax_outputs.FlaxSequenceClassifierOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxSequenceClassifierOutput + +## FlaxSeq2SeqSequenceClassifierOutput[[transformers.modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput + +## FlaxMultipleChoiceModelOutput[[transformers.modeling_flax_outputs.FlaxMultipleChoiceModelOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxMultipleChoiceModelOutput + +## FlaxTokenClassifierOutput[[transformers.modeling_flax_outputs.FlaxTokenClassifierOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxTokenClassifierOutput + +## FlaxQuestionAnsweringModelOutput[[transformers.modeling_flax_outputs.FlaxQuestionAnsweringModelOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxQuestionAnsweringModelOutput + +## FlaxSeq2SeqQuestionAnsweringModelOutput[[transformers.modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput]] + +[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput diff --git a/docs/source/ko/main_classes/quantization.md b/docs/source/ko/main_classes/quantization.md new file mode 100644 index 00000000000000..b1d1730d28d00b --- /dev/null +++ b/docs/source/ko/main_classes/quantization.md @@ -0,0 +1,71 @@ + + +# 양자화[[quantization]] + + + +양자화 기법은 가중치와 활성화를 8비트 정수(int8)와 같은 더 낮은 정밀도의 데이터 타입으로 표현함으로써 메모리와 계산 비용을 줄입니다. 이를 통해 일반적으로는 메모리에 올릴 수 없는 더 큰 모델을 로드할 수 있고, 추론 속도를 높일 수 있습니다. Transformers는 AWQ와 GPTQ 양자화 알고리즘을 지원하며, bitsandbytes를 통해 8비트와 4비트 양자화를 지원합니다. +Transformers에서 지원되지 않는 양자화 기법들은 [`HfQuantizer`] 클래스를 통해 추가될 수 있습니다. + + + +모델을 양자화하는 방법은 이 [양자화](../quantization) 가이드를 통해 배울 수 있습니다. 
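+
+참고로, 아래는 이러한 설정 클래스 중 하나인 [`BitsAndBytesConfig`]로 모델을 4비트로 로드하는 간단한 예시입니다. `facebook/opt-350m` 체크포인트는 설명을 위한 가정이며, `bitsandbytes`가 설치된 CUDA 환경을 가정합니다.
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+
+# 4비트 양자화 방식을 지정합니다
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    "facebook/opt-350m",
+    quantization_config=quantization_config,
+    device_map="auto",
+)
+```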
+ + + +## QuantoConfig[[transformers.QuantoConfig]] + +[[autodoc]] QuantoConfig + +## AqlmConfig[[transformers.AqlmConfig]] + +[[autodoc]] AqlmConfig + +## AwqConfig[[transformers.AwqConfig]] + +[[autodoc]] AwqConfig + +## EetqConfig[[transformers.EetqConfig]] +[[autodoc]] EetqConfig + +## GPTQConfig[[transformers.GPTQConfig]] + +[[autodoc]] GPTQConfig + +## BitsAndBytesConfig[[#transformers.BitsAndBytesConfig]] + +[[autodoc]] BitsAndBytesConfig + +## HfQuantizer[[transformers.quantizers.HfQuantizer]] + +[[autodoc]] quantizers.base.HfQuantizer + +## HqqConfig[[transformers.HqqConfig]] + +[[autodoc]] HqqConfig + +## FbgemmFp8Config[[transformers.FbgemmFp8Config]] + +[[autodoc]] FbgemmFp8Config + +## CompressedTensorsConfig[[transformers.CompressedTensorsConfig]] + +[[autodoc]] CompressedTensorsConfig + +## TorchAoConfig[[transformers.TorchAoConfig]] + +[[autodoc]] TorchAoConfig diff --git a/docs/source/ko/main_classes/text_generation.md b/docs/source/ko/main_classes/text_generation.md new file mode 100644 index 00000000000000..917d995e6ec15c --- /dev/null +++ b/docs/source/ko/main_classes/text_generation.md @@ -0,0 +1,56 @@ + + +# 생성 [[generation]] + +각 프레임워크에는 해당하는 `GenerationMixin` 클래스에서 구현된 텍스트 생성을 위한 generate 메소드가 있습니다: + +- PyTorch에서는 [`~generation.GenerationMixin.generate`]가 [`~generation.GenerationMixin`]에 구현되어 있습니다. +- TensorFlow에서는 [`~generation.TFGenerationMixin.generate`]가 [`~generation.TFGenerationMixin`]에 구현되어 있습니다. +- Flax/JAX에서는 [`~generation.FlaxGenerationMixin.generate`]가 [`~generation.FlaxGenerationMixin`]에 구현되어 있습니다. + +사용하는 프레임워크에 상관없이, generate 메소드는 [`~generation.GenerationConfig`] 클래스 인스턴스로 매개변수화 할 수 있습니다. generate 메소드의 동작을 제어하는 모든 생성 매개변수 목록을 확인하려면 이 클래스를 참조하세요. + +모델의 생성 설정을 어떻게 확인하고, 기본값이 무엇인지, 매개변수를 어떻게 임시로 변경하는지, 그리고 사용자 지정 생성 설정을 만들고 저장하는 방법을 배우려면 [텍스트 생성 전략 가이드](../generation_strategies)를 참조하세요. 이 가이드는 토큰 스트리밍과 같은 관련 기능을 사용하는 방법도 설명합니다. + +## GenerationConfig [[transformers.GenerationConfig]] + +[[autodoc]] generation.GenerationConfig + - from_pretrained + - from_model_config + - save_pretrained + - update + - validate + - get_generation_mode + +[[autodoc]] generation.WatermarkingConfig + +## GenerationMixin [[transformers.GenerationMixin]] + +[[autodoc]] generation.GenerationMixin + - generate + - compute_transition_scores + +## TFGenerationMixin [[transformers.TFGenerationMixin]] + +[[autodoc]] generation.TFGenerationMixin + - generate + - compute_transition_scores + +## FlaxGenerationMixin [[transformers.FlaxGenerationMixin]] + +[[autodoc]] generation.FlaxGenerationMixin + - generate diff --git a/docs/source/ko/main_classes/trainer.md b/docs/source/ko/main_classes/trainer.md new file mode 100644 index 00000000000000..23eda74a8bd669 --- /dev/null +++ b/docs/source/ko/main_classes/trainer.md @@ -0,0 +1,52 @@ + + +# Trainer [[trainer]] + +[`Trainer`] 클래스는 PyTorch에서 완전한 기능(feature-complete)의 훈련을 위한 API를 제공하며, 다중 GPU/TPU에서의 분산 훈련, [NVIDIA GPU](https://nvidia.github.io/apex/), [AMD GPU](https://rocm.docs.amd.com/en/latest/rocm.html)를 위한 혼합 정밀도, 그리고 PyTorch의 [`torch.amp`](https://pytorch.org/docs/stable/amp.html)를 지원합니다. [`Trainer`]는 모델의 훈련 방식을 커스터마이즈할 수 있는 다양한 옵션을 제공하는 [`TrainingArguments`] 클래스와 함께 사용됩니다. 이 두 클래스는 함께 완전한 훈련 API를 제공합니다. + +[`Seq2SeqTrainer`]와 [`Seq2SeqTrainingArguments`]는 [`Trainer`]와 [`TrainingArguments`] 클래스를 상속하며, 요약이나 번역과 같은 시퀀스-투-시퀀스 작업을 위한 모델 훈련에 적합하게 조정되어 있습니다. + + + +[`Trainer`] 클래스는 🤗 Transformers 모델에 최적화되어 있으며, 다른 모델과 함께 사용될 때 예상치 못한 동작을 하게 될 수 있습니다. 자신만의 모델을 사용할 때는 다음을 확인하세요: + +- 모델은 항상 튜플이나 [`~utils.ModelOutput`]의 서브클래스를 반환해야 합니다. 
+- 모델은 `labels` 인자가 제공되면 손실을 계산할 수 있고, 모델이 튜플을 반환하는 경우 그 손실이 튜플의 첫 번째 요소로 반환되어야 합니다. +- 모델은 여러 개의 레이블 인자를 수용할 수 있어야 하며, [`Trainer`]에게 이름을 알리기 위해 [`TrainingArguments`]에서 `label_names`를 사용하지만, 그 중 어느 것도 `"label"`로 명명되어서는 안 됩니다. + + + +## Trainer [[transformers.Trainer]] + +[[autodoc]] Trainer + - all + +## Seq2SeqTrainer [[transformers.Seq2SeqTrainer]] + +[[autodoc]] Seq2SeqTrainer + - evaluate + - predict + +## TrainingArguments [[transformers.TrainingArguments]] + +[[autodoc]] TrainingArguments + - all + +## Seq2SeqTrainingArguments [[transformers.Seq2SeqTrainingArguments]] + +[[autodoc]] Seq2SeqTrainingArguments + - all diff --git a/docs/source/ko/model_doc/auto.md b/docs/source/ko/model_doc/auto.md new file mode 100644 index 00000000000000..cda00adc33a663 --- /dev/null +++ b/docs/source/ko/model_doc/auto.md @@ -0,0 +1,375 @@ + + +# Auto 클래스[[auto-classes]] + +많은 경우, 사용하려는 아키텍처는 `from_pretrained()` 메소드에서 제공하는 사전 훈련된 모델의 이름이나 경로로부터 유추할 수 있습니다. AutoClasses는 이 작업을 위해 존재하며, 사전 학습된 모델 가중치/구성/단어사전에 대한 이름/경로를 제공하면 자동으로 관련 모델을 가져오도록 도와줍니다. + +[`AutoConfig`], [`AutoModel`], [`AutoTokenizer`] 중 하나를 인스턴스화하면 해당 아키텍처의 클래스를 직접 생성합니다. 예를 들어, + + +```python +model = AutoModel.from_pretrained("google-bert/bert-base-cased") +``` + +위 코드는 [`BertModel`]의 인스턴스인 모델을 생성합니다. + +각 작업에 대해 하나의 `AutoModel` 클래스가 있으며, 각각의 백엔드(PyTorch, TensorFlow 또는 Flax)에 해당하는 클래스가 존재합니다. + +## 자동 클래스 확장[[extending-the-auto-classes]] + +각 자동 클래스는 사용자의 커스텀 클래스로 확장될 수 있는 메소드를 가지고 있습니다. 예를 들어, `NewModel`이라는 커스텀 모델 클래스를 정의했다면, `NewModelConfig`를 준비한 후 다음과 같이 자동 클래스에 추가할 수 있습니다: + +```python +from transformers import AutoConfig, AutoModel + +AutoConfig.register("new-model", NewModelConfig) +AutoModel.register(NewModelConfig, NewModel) +``` + +이후에는 일반적으로 자동 클래스를 사용하는 것처럼 사용할 수 있습니다! + + + +만약 `NewModelConfig`가 [`~transformers.PretrainedConfig`]의 서브클래스라면, 해당 `model_type` 속성이 등록할 때 사용하는 키(여기서는 `"new-model"`)와 동일하게 설정되어 있는지 확인하세요. + +마찬가지로, `NewModel`이 [`PreTrainedModel`]의 서브클래스라면, 해당 `config_class` 속성이 등록할 때 사용하는 클래스(여기서는 `NewModelConfig`)와 동일하게 설정되어 있는지 확인하세요. + + + +## AutoConfig[[transformers.AutoConfig]] + +[[autodoc]] AutoConfig + +## AutoTokenizer[[transformers.AutoTokenizer]] + +[[autodoc]] AutoTokenizer + +## AutoFeatureExtractor[[transformers.AutoFeatureExtractor]] + +[[autodoc]] AutoFeatureExtractor + +## AutoImageProcessor[[transformers.AutoImageProcessor]] + +[[autodoc]] AutoImageProcessor + +## AutoProcessor[[transformers.AutoProcessor]] + +[[autodoc]] AutoProcessor + +## 일반적인 모델 클래스[[generic-model-classes]] + +다음 자동 클래스들은 특정 헤드 없이 기본 모델 클래스를 인스턴스화하는 데 사용할 수 있습니다. + +### AutoModel[[transformers.AutoModel]] + +[[autodoc]] AutoModel + +### TFAutoModel[[transformers.TFAutoModel]] + +[[autodoc]] TFAutoModel + +### FlaxAutoModel[[transformers.FlaxAutoModel]] + +[[autodoc]] FlaxAutoModel + +## 일반적인 사전 학습 클래스[[generic-pretraining-classes]] + +다음 자동 클래스들은 사전 훈련 헤드가 포함된 모델을 인스턴스화하는 데 사용할 수 있습니다. + +### AutoModelForPreTraining[[transformers.AutoModelForPreTraining]] + +[[autodoc]] AutoModelForPreTraining + +### TFAutoModelForPreTraining[[transformers.TFAutoModelForPreTraining]] + +[[autodoc]] TFAutoModelForPreTraining + +### FlaxAutoModelForPreTraining[[transformers.FlaxAutoModelForPreTraining]] + +[[autodoc]] FlaxAutoModelForPreTraining + +## 자연어 처리[[natural-language-processing]] + +다음 자동 클래스들은 아래의 자연어 처리 작업에 사용할 수 있습니다. 
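+
+예를 들어, 아래는 [`AutoModelForCausalLM`]과 [`AutoTokenizer`]를 사용해 체크포인트 이름만으로 인과적 언어 모델을 불러와 텍스트를 생성하는 간단한 예시입니다(`gpt2` 체크포인트는 설명을 위한 예시입니다).
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModelForCausalLM.from_pretrained("gpt2")
+
+inputs = tokenizer("Hello, my dog is", return_tensors="pt")
+# 프롬프트를 자기회귀적으로 이어서 생성합니다
+outputs = model.generate(**inputs, max_new_tokens=10)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```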
+ +### AutoModelForCausalLM[[transformers.AutoModelForCausalLM]] + +[[autodoc]] AutoModelForCausalLM + +### TFAutoModelForCausalLM[[transformers.TFAutoModelForCausalLM]] + +[[autodoc]] TFAutoModelForCausalLM + +### FlaxAutoModelForCausalLM[[transformers.FlaxAutoModelForCausalLM]] + +[[autodoc]] FlaxAutoModelForCausalLM + +### AutoModelForMaskedLM[[transformers.AutoModelForMaskedLM]] + +[[autodoc]] AutoModelForMaskedLM + +### TFAutoModelForMaskedLM[[transformers.TFAutoModelForMaskedLM]] + +[[autodoc]] TFAutoModelForMaskedLM + +### FlaxAutoModelForMaskedLM[[transformers.FlaxAutoModelForMaskedLM]] + +[[autodoc]] FlaxAutoModelForMaskedLM + +### AutoModelForMaskGeneration[[transformers.AutoModelForMaskGeneration]] + +[[autodoc]] AutoModelForMaskGeneration + +### TFAutoModelForMaskGeneration[[transformers.TFAutoModelForMaskGeneration]] + +[[autodoc]] TFAutoModelForMaskGeneration + +### AutoModelForSeq2SeqLM[[transformers.AutoModelForSeq2SeqLM]] + +[[autodoc]] AutoModelForSeq2SeqLM + +### TFAutoModelForSeq2SeqLM[[transformers.TFAutoModelForSeq2SeqLM]] + +[[autodoc]] TFAutoModelForSeq2SeqLM + +### FlaxAutoModelForSeq2SeqLM[[transformers.FlaxAutoModelForSeq2SeqLM]] + +[[autodoc]] FlaxAutoModelForSeq2SeqLM + +### AutoModelForSequenceClassification[[transformers.AutoModelForSequenceClassification]] + +[[autodoc]] AutoModelForSequenceClassification + +### TFAutoModelForSequenceClassification[[transformers.TFAutoModelForSequenceClassification]] + +[[autodoc]] TFAutoModelForSequenceClassification + +### FlaxAutoModelForSequenceClassification[[transformers.FlaxAutoModelForSequenceClassification]] + +[[autodoc]] FlaxAutoModelForSequenceClassification + +### AutoModelForMultipleChoice[[transformers.AutoModelForMultipleChoice]] + +[[autodoc]] AutoModelForMultipleChoice + +### TFAutoModelForMultipleChoice[[transformers.TFAutoModelForMultipleChoice]] + +[[autodoc]] TFAutoModelForMultipleChoice + +### FlaxAutoModelForMultipleChoice[[transformers.FlaxAutoModelForMultipleChoice]] + +[[autodoc]] FlaxAutoModelForMultipleChoice + +### AutoModelForNextSentencePrediction[[transformers.AutoModelForNextSentencePrediction]] + +[[autodoc]] AutoModelForNextSentencePrediction + +### TFAutoModelForNextSentencePrediction[[transformers.TFAutoModelForNextSentencePrediction]] + +[[autodoc]] TFAutoModelForNextSentencePrediction + +### FlaxAutoModelForNextSentencePrediction[[transformers.FlaxAutoModelForNextSentencePrediction]] + +[[autodoc]] FlaxAutoModelForNextSentencePrediction + +### AutoModelForTokenClassification[[transformers.AutoModelForTokenClassification]] + +[[autodoc]] AutoModelForTokenClassification + +### TFAutoModelForTokenClassification[[transformers.TFAutoModelForTokenClassification]] + +[[autodoc]] TFAutoModelForTokenClassification + +### FlaxAutoModelForTokenClassification[[transformers.FlaxAutoModelForTokenClassification]] + +[[autodoc]] FlaxAutoModelForTokenClassification + +### AutoModelForQuestionAnswering[[transformers.AutoModelForQuestionAnswering]] + +[[autodoc]] AutoModelForQuestionAnswering + +### TFAutoModelForQuestionAnswering[[transformers.TFAutoModelForQuestionAnswering]] + +[[autodoc]] TFAutoModelForQuestionAnswering + +### FlaxAutoModelForQuestionAnswering[[transformers.FlaxAutoModelForQuestionAnswering]] + +[[autodoc]] FlaxAutoModelForQuestionAnswering + +### AutoModelForTextEncoding[[transformers.AutoModelForTextEncoding]] + +[[autodoc]] AutoModelForTextEncoding + +### TFAutoModelForTextEncoding[[transformers.TFAutoModelForTextEncoding]] + +[[autodoc]] TFAutoModelForTextEncoding + +## 컴퓨터 
비전[[computer-vision]] + +다음 자동 클래스들은 아래의 컴퓨터 비전 작업에 사용할 수 있습니다. + +### AutoModelForDepthEstimation[[transformers.AutoModelForDepthEstimation]] + +[[autodoc]] AutoModelForDepthEstimation + +### AutoModelForImageClassification[[transformers.AutoModelForImageClassification]] + +[[autodoc]] AutoModelForImageClassification + +### TFAutoModelForImageClassification[[transformers.TFAutoModelForImageClassification]] + +[[autodoc]] TFAutoModelForImageClassification + +### FlaxAutoModelForImageClassification[[transformers.FlaxAutoModelForImageClassification]] + +[[autodoc]] FlaxAutoModelForImageClassification + +### AutoModelForVideoClassification[[transformers.AutoModelForVideoClassification]] + +[[autodoc]] AutoModelForVideoClassification + +### AutoModelForKeypointDetection[[transformers.AutoModelForKeypointDetection]] + +[[autodoc]] AutoModelForKeypointDetection + +### AutoModelForMaskedImageModeling[[transformers.AutoModelForMaskedImageModeling]] + +[[autodoc]] AutoModelForMaskedImageModeling + +### TFAutoModelForMaskedImageModeling[[transformers.TFAutoModelForMaskedImageModeling]] + +[[autodoc]] TFAutoModelForMaskedImageModeling + +### AutoModelForObjectDetection[[transformers.AutoModelForObjectDetection]] + +[[autodoc]] AutoModelForObjectDetection + +### AutoModelForImageSegmentation[[transformers.AutoModelForImageSegmentation]] + +[[autodoc]] AutoModelForImageSegmentation + +### AutoModelForImageToImage[[transformers.AutoModelForImageToImage]] + +[[autodoc]] AutoModelForImageToImage + +### AutoModelForSemanticSegmentation[[transformers.AutoModelForSemanticSegmentation]] + +[[autodoc]] AutoModelForSemanticSegmentation + +### TFAutoModelForSemanticSegmentation[[transformers.TFAutoModelForSemanticSegmentation]] + +[[autodoc]] TFAutoModelForSemanticSegmentation + +### AutoModelForInstanceSegmentation[[transformers.AutoModelForInstanceSegmentation]] + +[[autodoc]] AutoModelForInstanceSegmentation + +### AutoModelForUniversalSegmentation[[transformers.AutoModelForUniversalSegmentation]] + +[[autodoc]] AutoModelForUniversalSegmentation + +### AutoModelForZeroShotImageClassification[[transformers.AutoModelForZeroShotImageClassification]] + +[[autodoc]] AutoModelForZeroShotImageClassification + +### TFAutoModelForZeroShotImageClassification[[transformers.TFAutoModelForZeroShotImageClassification]] + +[[autodoc]] TFAutoModelForZeroShotImageClassification + +### AutoModelForZeroShotObjectDetection[[transformers.AutoModelForZeroShotObjectDetection]] + +[[autodoc]] AutoModelForZeroShotObjectDetection + +## 오디오[[audio]] + +다음 자동 클래스들은 아래의 오디오 작업에 사용할 수 있습니다. 
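+
+예를 들어, 아래는 [`AutoFeatureExtractor`]와 [`AutoModelForAudioClassification`]을 사용하는 오디오 분류의 간단한 스케치입니다. `superb/wav2vec2-base-superb-ks` 체크포인트와 임의의 16kHz 파형 입력은 설명을 위한 가정입니다.
+
+```python
+import torch
+from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
+
+feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ks")
+model = AutoModelForAudioClassification.from_pretrained("superb/wav2vec2-base-superb-ks")
+
+# 실제로는 16kHz로 샘플링된 오디오 파형을 사용합니다 (여기서는 임의의 값으로 대체)
+waveform = torch.randn(16000).numpy()
+inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
+
+with torch.no_grad():
+    logits = model(**inputs).logits
+predicted_id = logits.argmax(-1).item()
+print(model.config.id2label[predicted_id])
+```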
+ +### AutoModelForAudioClassification[[transformers.AutoModelForAudioClassification]] + +[[autodoc]] AutoModelForAudioClassification + +### TFAutoModelForAudioClassification[[transformers.TFAutoModelForAudioClassification]] + +[[autodoc]] TFAutoModelForAudioClassification + +### AutoModelForAudioFrameClassification[[transformers.AutoModelForAudioFrameClassification]] + +[[autodoc]] AutoModelForAudioFrameClassification + +### AutoModelForCTC[[transformers.AutoModelForCTC]] + +[[autodoc]] AutoModelForCTC + +### AutoModelForSpeechSeq2Seq[[transformers.AutoModelForSpeechSeq2Seq]] + +[[autodoc]] AutoModelForSpeechSeq2Seq + +### TFAutoModelForSpeechSeq2Seq[[transformers.TFAutoModelForSpeechSeq2Seq]] + +[[autodoc]] TFAutoModelForSpeechSeq2Seq + +### FlaxAutoModelForSpeechSeq2Seq[[transformers.FlaxAutoModelForSpeechSeq2Seq]] + +[[autodoc]] FlaxAutoModelForSpeechSeq2Seq + +### AutoModelForAudioXVector[[transformers.AutoModelForAudioXVector]] + +[[autodoc]] AutoModelForAudioXVector + +### AutoModelForTextToSpectrogram[[transformers.AutoModelForTextToSpectrogram]] + +[[autodoc]] AutoModelForTextToSpectrogram + +### AutoModelForTextToWaveform[[transformers.AutoModelForTextToWaveform]] + +[[autodoc]] AutoModelForTextToWaveform + +## 멀티모달[[multimodal]] + +다음 자동 클래스들은 아래의 멀티모달 작업에 사용할 수 있습니다. + +### AutoModelForTableQuestionAnswering[[transformers.AutoModelForTableQuestionAnswering]] + +[[autodoc]] AutoModelForTableQuestionAnswering + +### TFAutoModelForTableQuestionAnswering[[transformers.TFAutoModelForTableQuestionAnswering]] + +[[autodoc]] TFAutoModelForTableQuestionAnswering + +### AutoModelForDocumentQuestionAnswering[[transformers.AutoModelForDocumentQuestionAnswering]] + +[[autodoc]] AutoModelForDocumentQuestionAnswering + +### TFAutoModelForDocumentQuestionAnswering[[transformers.TFAutoModelForDocumentQuestionAnswering]] + +[[autodoc]] TFAutoModelForDocumentQuestionAnswering + +### AutoModelForVisualQuestionAnswering[[transformers.AutoModelForVisualQuestionAnswering]] + +[[autodoc]] AutoModelForVisualQuestionAnswering + +### AutoModelForVision2Seq[[transformers.AutoModelForVision2Seq]] + +[[autodoc]] AutoModelForVision2Seq + +### TFAutoModelForVision2Seq[[transformers.TFAutoModelForVision2Seq]] + +[[autodoc]] TFAutoModelForVision2Seq + +### FlaxAutoModelForVision2Seq[[transformers.FlaxAutoModelForVision2Seq]] + +[[autodoc]] FlaxAutoModelForVision2Seq diff --git a/docs/source/ko/model_doc/autoformer.md b/docs/source/ko/model_doc/autoformer.md new file mode 100644 index 00000000000000..e8534ac95c327a --- /dev/null +++ b/docs/source/ko/model_doc/autoformer.md @@ -0,0 +1,50 @@ + + +# Autoformer[[autoformer]] + +## 개요[[overview]] + +The Autoformer 모델은 Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long가 제안한 [오토포머: 장기 시계열 예측을 위한 자기상관 분해 트랜스포머](https://arxiv.org/abs/2106.13008) 라는 논문에서 소개 되었습니다. + +이 모델은 트랜스포머를 심층 분해 아키텍처로 확장하여, 예측 과정에서 추세와 계절성 요소를 점진적으로 분해할 수 있습니다. + +해당 논문의 초록입니다: + +*예측 시간을 연장하는 것은 극한 기상 조기 경보 및 장기 에너지 소비 계획과 같은 실제 응용 프로그램에 중요한 요구 사항입니다. 본 논문은 시계열의 장기 예측 문제를 연구합니다. 기존의 트랜스포머 기반 모델들은 장거리 종속성을 발견하기 위해 다양한 셀프 어텐션 메커니즘을 채택합니다. 그러나 장기 미래의 복잡한 시간적 패턴으로 인해 모델이 신뢰할 수 있는 종속성을 찾기 어렵습니다. 또한, 트랜스포머는 긴 시계열의 효율성을 위해 점별 셀프 어텐션의 희소 버전을 채택해야 하므로 정보 활용의 병목 현상이 발생합니다. 우리는 트랜스포머를 넘어서 자기상관 메커니즘을 갖춘 새로운 분해 아키텍처인 Autoformer를 설계했습니다. 우리는 시계열 분해의 전처리 관행을 깨고 이를 심층 모델의 기본 내부 블록으로 혁신했습니다. 이 설계는 Autoformer에 복잡한 시계열에 대한 점진적 분해 능력을 부여합니다. 또한, 확률 과정 이론에서 영감을 받아 시계열의 주기성을 기반으로 자기상관 메커니즘을 설계했으며, 이는 하위 시계열 수준에서 종속성 발견과 표현 집계를 수행합니다. 자기상관은 효율성과 정확도 면에서 셀프 어텐션를 능가합니다. 
장기 예측에서 Autoformer는 에너지, 교통, 경제, 날씨, 질병 등 5가지 실용적 응용 분야를 포괄하는 6개 벤치마크에서 38%의 상대적 개선으로 최첨단 정확도를 달성했습니다.* + +이 모델은 [elisim](https://huggingface.co/elisim)과 [kashif](https://huggingface.co/kashif)에 의해 기여 되었습니다. +원본 코드는 [이곳](https://github.com/thuml/Autoformer)에서 확인할 수 있습니다. + +## 자료[[resources]] + +시작하는 데 도움이 되는 Hugging Face와 community 자료 목록(🌎로 표시됨) 입니다. 여기에 포함될 자료를 제출하고 싶으시다면 PR(Pull Request)를 열어주세요. 리뷰 해드리겠습니다! 자료는 기존 자료를 복제하는 대신 새로운 내용을 담고 있어야 합니다. + +- HuggingFace 블로그에서 Autoformer 포스트를 확인하세요: [네, 그렇습니다, 트랜스포머는 시계열 예측에 효과적입니다(+ Autoformer)](https://huggingface.co/blog/autoformer) + +## AutoformerConfig[[transformers.AutoformerConfig]] + +[[autodoc]] AutoformerConfig + +## AutoformerModel[[transformers.AutoformerModel]] + +[[autodoc]] AutoformerModel + - forward + +## AutoformerForPrediction[[transformers.AutoformerForPrediction]] + +[[autodoc]] AutoformerForPrediction + - forward diff --git a/docs/source/ko/model_doc/bart.md b/docs/source/ko/model_doc/bart.md new file mode 100644 index 00000000000000..0f7636b3cc95c2 --- /dev/null +++ b/docs/source/ko/model_doc/bart.md @@ -0,0 +1,217 @@ + + +# BART[[bart]] + +
+ +## 개요 [[overview]] + +Bart 모델은 2019년 10월 29일 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, Luke Zettlemoyer가 발표한 [BART: 자연어 생성, 번역, 이해를 위한 잡음 제거 seq2seq 사전 훈련](https://arxiv.org/abs/1910.13461)이라는 논문에서 소개되었습니다. + +논문의 초록에 따르면, + +- Bart는 양방향 인코더(BERT와 유사)와 왼쪽에서 오른쪽으로 디코딩하는 디코더(GPT와 유사)를 사용하는 표준 seq2seq/기계 번역 아키텍처를 사용합니다. +- 사전 훈련 작업은 원래 문장의 순서를 무작위로 섞고, 텍스트의 일부 구간을 단일 마스크 토큰으로 대체하는 새로운 인필링(in-filling) 방식을 포함합니다. +- BART는 특히 텍스트 생성을 위한 미세 조정에 효과적이지만 이해 작업에도 잘 작동합니다. GLUE와 SQuAD에서 비슷한 훈련 리소스로 RoBERTa의 성능과 일치하며, 추상적 대화, 질의응답, 요약 작업 등에서 최대 6 ROUGE 점수의 향상을 보이며 새로운 최고 성능을 달성했습니다. + +이 모델은 [sshleifer](https://huggingface.co/sshleifer)에 의해 기여 되었습니다. 저자의 코드는 [이곳](https://github.com/pytorch/fairseq/tree/master/examples/bart)에서 확인할 수 있습니다. + +## 사용 팁:[[usage-tips]] + +- BART는 절대 위치 임베딩을 사용하는 모델이므로 일반적으로 입력을 왼쪽보다는 오른쪽에 패딩하는 것이 좋습니다. +- 인코더와 디코더가 있는 seq2seq 모델입니다. 인코더에는 손상된 토큰이(corrupted tokens) 입력되고, 디코더에는 원래 토큰이 입력됩니다(단, 일반적인 트랜스포머 디코더처럼 미래 단어를 숨기는 마스크가 있습니다). 사전 훈련 작업에서 인코더에 적용되는 변환들의 구성은 다음과 같습니다: + +사전 훈련 작업에서 인코더에 적용되는 변환들의 구성은 다음과 같습니다: + + * 무작위 토큰 마스킹 (BERT 처럼) + * 무작위 토큰 삭제 + * k개 토큰의 범위를 단일 마스크 토큰으로 마스킹 (0개 토큰의 범위는 마스크 토큰의 삽입을 의미) + * 문장 순서 뒤섞기 + * 특정 토큰에서 시작하도록 문서 회전 + +## 구현 노트[[implementation-notes]] + +- Bart는 시퀀스 분류에 `token_type_ids`를 사용하지 않습니다. 적절하게 나누기 위해서 [`BartTokenizer`]나 + [`~BartTokenizer.encode`]를 사용합니다. +- [`BartModel`]의 정방향 전달은 `decoder_input_ids`가 전달되지 않으면 `decoder_input_ids`를 자동으로 생성할 것입니다. 이는 다른 일부 모델링 API와 다른 점입니다. 이 기능의 일반적인 사용 사례는 마스크 채우기(mask filling)입니다. +- 모델 예측은 `forced_bos_token_id=0`일 때 기존 구현과 동일하게 작동하도록 의도되었습니다. 하지만, [`fairseq.encode`]에 전달하는 문자열이 공백으로 시작할 때만 이 기능이 작동합니다. +- [`~generation.GenerationMixin.generate`]는 요약과 같은 조건부 생성 작업에 사용되어야 합니다. 자세한 내용은 해당 문서의 예제를 참조하세요. +- *facebook/bart-large-cnn* 가중치를 로드하는 모델은 `mask_token_id`가 없거나, 마스크 채우기 작업을 수행할 수 없습니다. + +## 마스크 채우기[[mask-filling]] + +`facebook/bart-base`와 `facebook/bart-large` 체크포인트는 멀티 토큰 마스크를 채우는데 사용될 수 있습니다. + +```python +from transformers import BartForConditionalGeneration, BartTokenizer + +model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0) +tok = BartTokenizer.from_pretrained("facebook/bart-large") +example_english_phrase = "UN Chief Says There Is No in Syria" +batch = tok(example_english_phrase, return_tensors="pt") +generated_ids = model.generate(batch["input_ids"]) +assert tok.batch_decode(generated_ids, skip_special_tokens=True) == [ + "UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria" +] +``` + +## 자료[[resources]] + +BART를 시작하는 데 도움이 되는 Hugging Face와 community 자료 목록(🌎로 표시됨) 입니다. 여기에 포함될 자료를 제출하고 싶으시다면 PR(Pull Request)를 열어주세요. 리뷰 해드리겠습니다! 자료는 기존 자료를 복제하는 대신 새로운 내용을 담고 있어야 합니다. + + + +- [분산형 학습: 🤗 Transformers와 Amazon SageMaker를 이용하여 요약하기 위한 BART/T5 학습](https://huggingface.co/blog/sagemaker-distributed-training-seq2seq)에 대한 블로그 포스트. +- [blurr를 이용하여 fastai로 요약하기 위한 BART를 미세 조정](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb)하는 방법에 대한 노트북. 🌎 +- [Trainer 클래스를 사용하여 두 가지 언어로 요약하기 위한 BART 미세 조정](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)하는 방법에 대한 노트북. 🌎 +- 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)에서는 [`BartForConditionalGeneration`]이 지원됩니다. 
+- [`TFBartForConditionalGeneration`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)에서 지원됩니다. +- 이 [예제 스크립트에서는](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization)[`FlaxBartForConditionalGeneration`]이 지원됩니다. +- Hugging Face `datasets` 객체를 활용하여 [`BartForConditionalGeneration`]을 학습시키는 방법의 예는 이 [포럼 토론](https://discuss.huggingface.co/t/train-bart-for-conditional-generation-e-g-summarization/1904)에서 찾을 수 있습니다. +- 🤗 Hugging Face 코스의 [요약](https://huggingface.co/course/chapter7/5?fw=pt#summarization)장. +- [요약 작업 가이드](../tasks/summarization) + + + +- [`BartForConditionalGeneration`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)을 참고하세요. +- [`TFBartForConditionalGeneration`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)을 참고하세요. +- [`FlaxBartForConditionalGeneration`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb)을 참고 하세요. +- 🤗 Hugging Face 코스의 [마스크 언어 모델링](https://huggingface.co/course/chapter7/3?fw=pt) 챕터. +- [마스크 언어 모델링 작업 가이드](../tasks/masked_language_modeling) + + + +- [Seq2SeqTrainer를 이용하여 인도어를 영어로 번역하는 mBART를 미세 조정](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)하는 방법에 대한 가이드. 🌎 +- [`BartForConditionalGeneration`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)을 참고하세요. +- [`TFBartForConditionalGeneration`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/translation)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)을 참고 하세요. +- [번역 작업 가이드](../tasks/translation) + +추가적으로 볼 것들: +- [텍스트 분류 작업 가이드](../tasks/sequence_classification) +- [질문 답변 작업 가이드](../tasks/question_answering) +- [인과적 언어 모델링 작업 가이드](../tasks/language_modeling) +- 이 [논문](https://arxiv.org/abs/2010.13002)은 [증류된 체크포인트](https://huggingface.co/models?search=distilbart)에 대해 설명합니다. 
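+
+참고로, 아래는 요약용으로 미세 조정된 체크포인트로 조건부 생성을 수행하는 간단한 예시입니다. `facebook/bart-large-cnn` 체크포인트와 생성 인자 값들은 설명을 위한 예시이며, 실제 작업에 맞게 조정할 수 있습니다.
+
+```python
+from transformers import BartForConditionalGeneration, BartTokenizer
+
+model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+
+article = "New York (CNN) The city announced on Tuesday that ..."  # 요약할 긴 기사
+inputs = tokenizer(article, max_length=1024, truncation=True, return_tensors="pt")
+
+# 빔 서치로 요약을 생성합니다
+summary_ids = model.generate(inputs["input_ids"], num_beams=4, min_length=10, max_length=60)
+print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
+```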
+ +## BartConfig[[transformers.BartConfig]] + +[[autodoc]] BartConfig + - all + +## BartTokenizer[[transformers.BartTokenizer]] + +[[autodoc]] BartTokenizer + - all + +## BartTokenizerFast[[transformers.BartTokenizerFast]] + +[[autodoc]] BartTokenizerFast + - all + + + + + +## BartModel[[transformers.BartModel]] + +[[autodoc]] BartModel + - forward + +## BartForConditionalGeneration[[transformers.BartForConditionalGeneration]] + +[[autodoc]] BartForConditionalGeneration + - forward + +## BartForSequenceClassification[[transformers.BartForSequenceClassification]] + +[[autodoc]] BartForSequenceClassification + - forward + +## BartForQuestionAnswering[[transformers.BartForQuestionAnswering]] + +[[autodoc]] BartForQuestionAnswering + - forward + +## BartForCausalLM[[transformers.BartForCausalLM]] + +[[autodoc]] BartForCausalLM + - forward + + + + +## TFBartModel[[transformers.TFBartModel]] + +[[autodoc]] TFBartModel + - call + +## TFBartForConditionalGeneration[[transformers.TFBartForConditionalGeneration]] + +[[autodoc]] TFBartForConditionalGeneration + - call + +## TFBartForSequenceClassification[[transformers.TFBartForSequenceClassification]] + +[[autodoc]] TFBartForSequenceClassification + - call + + + + +## FlaxBartModel[[transformers.FlaxBartModel]] + +[[autodoc]] FlaxBartModel + - __call__ + - encode + - decode + +## FlaxBartForConditionalGeneration[[transformers.FlaxBartForConditionalGeneration]] + +[[autodoc]] FlaxBartForConditionalGeneration + - __call__ + - encode + - decode + +## FlaxBartForSequenceClassification[[transformers.FlaxBartForSequenceClassification]] + +[[autodoc]] FlaxBartForSequenceClassification + - __call__ + - encode + - decode + +## FlaxBartForQuestionAnswering[[transformers.FlaxBartForQuestionAnswering]] + +[[autodoc]] FlaxBartForQuestionAnswering + - __call__ + - encode + - decode + +## FlaxBartForCausalLM[[transformers.FlaxBartForCausalLM]] + +[[autodoc]] FlaxBartForCausalLM + - __call__ + + + + + diff --git a/docs/source/ko/model_doc/bertweet.md b/docs/source/ko/model_doc/bertweet.md new file mode 100644 index 00000000000000..7a46087d0a8ebf --- /dev/null +++ b/docs/source/ko/model_doc/bertweet.md @@ -0,0 +1,67 @@ + + +# BERTweet [[bertweet]] + +## 개요 [[overview]] + +BERTweet 모델은 Dat Quoc Nguyen, Thanh Vu, Anh Tuan Nguyen에 의해 [BERTweet: A pre-trained language model for English Tweets](https://www.aclweb.org/anthology/2020.emnlp-demos.2.pdf) 에서 제안되었습니다. + +해당 논문의 초록 : + +*영어 트윗을 위한 최초의 공개 대규모 사전 학습된 언어 모델인 BERTweet을 소개합니다. +BERTweet은 BERT-base(Devlin et al., 2019)와 동일한 아키텍처를 가지고 있으며, RoBERTa 사전 학습 절차(Liu et al., 2019)를 사용하여 학습되었습니다. +실험 결과, BERTweet은 강력한 기준 모델인 RoBERTa-base 및 XLM-R-base(Conneau et al., 2020)의 성능을 능가하여 세 가지 트윗 NLP 작업(품사 태깅, 개체명 인식, 텍스트 분류)에서 이전 최신 모델보다 더 나은 성능을 보여주었습니다.* + +이 모델은 [dqnguyen](https://huggingface.co/dqnguyen) 께서 기여하셨습니다. 원본 코드는 [여기](https://github.com/VinAIResearch/BERTweet).에서 확인할 수 있습니다. + + +## 사용 예시 [[usage-example]] + +```python +>>> import torch +>>> from transformers import AutoModel, AutoTokenizer + +>>> bertweet = AutoModel.from_pretrained("vinai/bertweet-base") + +>>> # 트랜스포머 버전 4.x 이상 : +>>> tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False) + +>>> # 트랜스포머 버전 3.x 이상: +>>> # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base") + +>>> # 입력된 트윗은 이미 정규화되었습니다! +>>> line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:" + +>>> input_ids = torch.tensor([tokenizer.encode(line)]) + +>>> with torch.no_grad(): +... 
features = bertweet(input_ids) # Models outputs are now tuples + +>>> # With TensorFlow 2.0+: +>>> # from transformers import TFAutoModel +>>> # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base") +``` + + + +이 구현은 토큰화 방법을 제외하고는 BERT와 동일합니다. API 참조 정보는 [BERT 문서](bert) 를 참조하세요. + + + +## Bertweet 토큰화(BertweetTokenizer) [[transformers.BertweetTokenizer]] + +[[autodoc]] BertweetTokenizer diff --git a/docs/source/ko/model_doc/biogpt.md b/docs/source/ko/model_doc/biogpt.md new file mode 100644 index 00000000000000..4b24af8493b9d5 --- /dev/null +++ b/docs/source/ko/model_doc/biogpt.md @@ -0,0 +1,109 @@ + + +# BioGPT [[biogpt]] + +## 개요 [[overview]] + +BioGPT는 Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon, Tie-Yan Liu에 의해 [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) 에서 제안된 모델입니다. BioGPT는 생물의학 텍스트 생성과 마이닝을 위해 도메인에 특화된 생성형 사전 학습 트랜스포머 언어 모델입니다. BioGPT는 트랜스포머 언어 모델 구조를 따르며, 1,500만 개의 PubMed 초록을 이용해 처음부터 학습되었습니다. + +논문의 초록은 다음과 같습니다: + +*생물의학 분야에서 사전 학습된 언어 모델은 일반 자연어 처리 분야에서의 성공에 영감을 받아 점점 더 많은 주목을 받고 있습니다. 일반 언어 분야에서 사전 학습된 언어 모델의 두 가지 주요 계통인 BERT(및 그 변형)와 GPT(및 그 변형) 중 첫 번째는 생물의학 분야에서 BioBERT와 PubMedBERT와 같이 광범위하게 연구되었습니다. 이들은 다양한 분류 기반의 생물의학 작업에서 큰 성공을 거두었지만, 생성 능력의 부족은 그들의 적용 범위를 제한했습니다. 본 논문에서는 대규모 생물의학 문헌을 사전 학습한 도메인 특화 생성형 트랜스포머 언어 모델인 BioGPT를 제안합니다. 우리는 6개의 생물의학 자연어 처리 작업에서 BioGPT를 평가한 결과, 대부분의 작업에서 이전 모델보다 우수한 성능을 보였습니다. 특히, BC5CDR, KD-DTI, DDI 엔드-투-엔드 관계 추출 작업에서 각각 44.98%, 38.42%, 40.76%의 F1 점수를 기록하였으며, PubMedQA에서 78.2%의 정확도를 달성해 새로운 기록을 세웠습니다. 또한 텍스트 생성에 대한 사례 연구는 생물의학 용어에 대한 유창한 설명을 생성하는 데 있어 BioGPT의 장점을 더욱 입증했습니다.* + +이 모델은 [kamalkraj](https://huggingface.co/kamalkraj)에 의해 기여되었습니다. 원본 코드는 [여기](https://github.com/microsoft/BioGPT)에서 찾을 수 있습니다. + +## 사용 팁 [[usage-tips]] + +- BioGPT는 절대적 위치 임베딩(absolute position embedding)을 사용하므로, 입력을 왼쪽이 아닌 오른쪽에서 패딩하는 것이 권장됩니다. +- BioGPT는 인과적 언어 모델링(Casual Langague Modeling, CLM) 목표로 학습되었기 때문에, 다음 토큰을 예측하는 데 강력한 성능을 보입니다. 이 기능을 활용하여 BioGPT는 구문적으로 일관된 텍스트를 생성할 수 있으며, 예시 스크립트 `run_generation.py`에서 이를 확인할 수 있습니다. +- 이 모델은 `past_key_values`(PyTorch 용)를 입력으로 받을 수 있는데, 이는 이전에 계산된 키/값 어텐션 쌍입니다. 이 값을 사용하면 텍스트 생성 중 이미 계산된 값을 다시 계산하지 않도록 할 수 있습니다. PyTorch에서 `past_key_values` 인수는 BioGptForCausalLM.forward() 메소드에서 자세히 설명되어 있습니다. + +### Scaled Dot Product Attention(SDPA) 사용 [[using-scaled-dot-product-attention-sdpa]] + +PyTorch는 `torch.nn.functional`의 일부로 스케일된 점곱 어텐션(SDPA) 연산자를 기본적으로 포함합니다. 이 함수는 입력과 사용 중인 하드웨어에 따라 여러 구현을 적용할 수 있습니다. 자세한 내용은 [공식 문서](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 또는 [GPU 추론](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) 페이지를 참조하세요. + +`torch>=2.1.1`에서 구현이 가능한 경우 SDPA는 기본적으로 사용되며, `attn_implementation="sdpa"`를 `from_pretrained()`에서 설정하여 SDPA 사용을 명시적으로 요청할 수 있습니다. + +``` +from transformers import BioGptForCausalLM +model = BioGptForCausalLM.from_pretrained("microsoft/biogpt", attn_implementation="sdpa", torch_dtype=torch.float16) +``` + +NVIDIA GeForce RTX 2060-8GB, PyTorch 2.3.1, Ubuntu 20.04 환경에서 `float16` 및 CausalLM 헤드가 있는 `microsoft/biogpt` 모델로 로컬 벤치마크를 수행한 결과, 훈련 중 다음과 같은 속도 향상을 확인했습니다. + +최적의 속도 향상을 위해 모델을 반정밀도(예: `torch.float16` 또는 `torch.bfloat16`)로 로드하는 것이 좋습니다. 
+ +| num_training_steps | batch_size | seq_len | is cuda | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | sdpa peak mem (MB) | Mem saving (%) | +|--------------------|------------|---------|---------|----------------------------|---------------------------|-------------|---------------------|--------------------|----------------| +| 100 | 1 | 128 | False | 0.038 | 0.031 | 21.301 | 1601.862 | 1601.497 | 0.023 | +| 100 | 1 | 256 | False | 0.039 | 0.034 | 15.084 | 1624.944 | 1625.296 | -0.022 | +| 100 | 2 | 128 | False | 0.039 | 0.033 | 16.820 | 1624.567 | 1625.296 | -0.045 | +| 100 | 2 | 256 | False | 0.065 | 0.059 | 10.255 | 1672.164 | 1672.164 | 0.000 | +| 100 | 4 | 128 | False | 0.062 | 0.058 | 6.998 | 1671.435 | 1672.164 | -0.044 | +| 100 | 4 | 256 | False | 0.113 | 0.100 | 13.316 | 2350.179 | 1848.435 | 27.144 | +| 100 | 8 | 128 | False | 0.107 | 0.098 | 9.883 | 2098.521 | 1848.435 | 13.530 | +| 100 | 8 | 256 | False | 0.222 | 0.196 | 13.413 | 3989.980 | 2986.492 | 33.601 | + +NVIDIA GeForce RTX 2060-8GB, PyTorch 2.3.1, Ubuntu 20.04 환경에서 `float16` 및 AutoModel 헤드가 있는 `microsoft/biogpt` 모델로 추론 중 다음과 같은 속도 향상을 확인했습니다. + +| num_batches | batch_size | seq_len | is cuda | is half | use mask | Per token latency eager (ms) | Per token latency SDPA (ms) | Speedup (%) | Mem eager (MB) | Mem BT (MB) | Mem saved (%) | +|-------------|------------|---------|---------|---------|----------|------------------------------|-----------------------------|-------------|----------------|--------------|---------------| +| 50 | 1 | 64 | True | True | True | 0.115 | 0.098 | 17.392 | 716.998 | 716.998 | 0.000 | +| 50 | 1 | 128 | True | True | True | 0.115 | 0.093 | 24.640 | 730.916 | 730.916 | 0.000 | +| 50 | 2 | 64 | True | True | True | 0.114 | 0.096 | 19.204 | 730.900 | 730.900 | 0.000 | +| 50 | 2 | 128 | True | True | True | 0.117 | 0.095 | 23.529 | 759.262 | 759.262 | 0.000 | +| 50 | 4 | 64 | True | True | True | 0.113 | 0.096 | 18.325 | 759.229 | 759.229 | 0.000 | +| 50 | 4 | 128 | True | True | True | 0.186 | 0.178 | 4.289 | 816.478 | 816.478 | 0.000 | + + +## 리소스 [[resources]] + +- [인과적 언어 모델링 작업 가이드](../tasks/language_modeling) + +## BioGptConfig [[transformers.BioGptConfig]] + +[[autodoc]] BioGptConfig + + +## BioGptTokenizer [[transformers.BioGptTokenizer]] + +[[autodoc]] BioGptTokenizer + - save_vocabulary + + +## BioGptModel [[transformers.BioGptModel]] + +[[autodoc]] BioGptModel + - forward + + +## BioGptForCausalLM [[transformers.BioGptForCausalLM]] + +[[autodoc]] BioGptForCausalLM + - forward + + +## BioGptForTokenClassification [[transformers.BioGptForTokenClassification]] + +[[autodoc]] BioGptForTokenClassification + - forward + + +## BioGptForSequenceClassification [[transformers.BioGptForSequenceClassification]] + +[[autodoc]] BioGptForSequenceClassification + - forward \ No newline at end of file diff --git a/docs/source/ko/model_doc/blip.md b/docs/source/ko/model_doc/blip.md new file mode 100644 index 00000000000000..27e085315b8911 --- /dev/null +++ b/docs/source/ko/model_doc/blip.md @@ -0,0 +1,136 @@ + + +# BLIP[[blip]] + +## 개요[[overview]] + +BLIP 모델은 Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi의 [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 논문에서 제안되었습니다. 
+ +BLIP은 여러 멀티모달 작업을 수행할 수 있는 모델입니다: + +- 시각 질문 응답 (Visual Question Answering, VQA) +- 이미지-텍스트 검색 (이미지-텍스트 매칭) +- 이미지 캡셔닝 + +논문의 초록은 다음과 같습니다: + +*비전-언어 사전 학습(Vision-Language Pre-training, VLP)은 다양한 비전-언어 작업의 성능을 크게 향상시켰습니다. 하지만, 대부분의 기존 사전 학습 모델들은 이해 기반 작업이나 생성 기반 작업 중 하나에서만 뛰어난 성능을 발휘합니다. 또한 성능 향상은 주로 웹에서 수집한 노이즈가 많은 이미지-텍스트 쌍으로 데이터셋의 규모를 키우는 방식으로 이루어졌는데, 이는 최적의 지도 학습 방식이라고 보기 어렵습니다. 본 논문에서는 BLIP이라는 새로운 VLP 프레임워크를 제안합니다. 이 프레임워크는 비전-언어 이해 및 생성 작업 모두에 유연하게 적용될 수 있습니다. BLIP는 캡셔너가 합성 캡션을 생성하고 필터가 노이즈 캡션을 제거하는 부트스트래핑 방법을 통해 웹 데이터의 노이즈를 효과적으로 활용합니다. 우리는 이미지-텍스트 검색(Recall@1에서 +2.7%), 이미지 캡셔닝(CIDEr에서 +2.8%), 그리고 VQA(VQA 점수에서 +1.6%)와 같은 다양한 비전-언어 작업에서 최신 성과를 달성했습니다. 또한 BLIP은 제로샷 방식으로 비디오-언어 작업에 직접 전이될 때도 강력한 일반화 능력을 보여줍니다. 이 논문의 코드, 모델, 데이터셋은 공개되었습니다.* + +![BLIP.gif](https://cdn-uploads.huggingface.co/production/uploads/1670928184033-62441d1d9fdefb55a0b7d12c.gif) + +이 모델은 [ybelkada](https://huggingface.co/ybelkada)가 기여했습니다. +원본 코드는 [여기](https://github.com/salesforce/BLIP)에서 찾을 수 있습니다. + +## 자료[[resources]] + +- [Jupyter notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb): 사용자 정의 데이터셋에서 BLIP를 이미지 캡셔닝으로 미세 조정하는 방법 + +## BlipConfig[[transformers.BlipConfig]] + +[[autodoc]] BlipConfig + - from_text_vision_configs + +## BlipTextConfig[[transformers.BlipTextConfig]] + +[[autodoc]] BlipTextConfig + +## BlipVisionConfig[[transformers.BlipVisionConfig]] + +[[autodoc]] BlipVisionConfig + +## BlipProcessor[[transformers.BlipProcessor]] + +[[autodoc]] BlipProcessor + +## BlipImageProcessor[[transformers.BlipImageProcessor]] + +[[autodoc]] BlipImageProcessor + - preprocess + + + + +## BlipModel[[transformers.BlipModel]] + +`BlipModel`은 향후 버전에서 더 이상 지원되지 않을 예정입니다. 목적에 따라 `BlipForConditionalGeneration`, `BlipForImageTextRetrieval` 또는 `BlipForQuestionAnswering`을 사용하십시오. 
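+
+참고로, 아래는 [`BlipForConditionalGeneration`]을 사용한 이미지 캡셔닝의 간단한 예시입니다. `Salesforce/blip-image-captioning-base` 체크포인트와 예시 이미지 URL은 설명을 위한 가정입니다.
+
+```python
+import requests
+from PIL import Image
+from transformers import BlipProcessor, BlipForConditionalGeneration
+
+processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# 이미지에 대한 캡션을 생성합니다
+inputs = processor(images=image, return_tensors="pt")
+out = model.generate(**inputs, max_new_tokens=20)
+print(processor.decode(out[0], skip_special_tokens=True))
+```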
+ +[[autodoc]] BlipModel + - forward + - get_text_features + - get_image_features + +## BlipTextModel[[transformers.BlipTextModel]] + +[[autodoc]] BlipTextModel + - forward + +## BlipVisionModel[[transformers.BlipVisionModel]] + +[[autodoc]] BlipVisionModel + - forward + +## BlipForConditionalGeneration[[transformers.BlipForConditionalGeneration]] + +[[autodoc]] BlipForConditionalGeneration + - forward + +## BlipForImageTextRetrieval[[transformers.BlipForImageTextRetrieval]] + +[[autodoc]] BlipForImageTextRetrieval + - forward + +## BlipForQuestionAnswering[[transformers.BlipForQuestionAnswering]] + +[[autodoc]] BlipForQuestionAnswering + - forward + + + + +## TFBlipModel[[transformers.TFBlipModel]] + +[[autodoc]] TFBlipModel + - call + - get_text_features + - get_image_features + +## TFBlipTextModel[[transformers.TFBlipTextModel]] + +[[autodoc]] TFBlipTextModel + - call + +## TFBlipVisionModel[[transformers.TFBlipVisionModel]] + +[[autodoc]] TFBlipVisionModel + - call + +## TFBlipForConditionalGeneration[[transformers.TFBlipForConditionalGeneration]] + +[[autodoc]] TFBlipForConditionalGeneration + - call + +## TFBlipForImageTextRetrieval[[transformers.TFBlipForImageTextRetrieval]] + +[[autodoc]] TFBlipForImageTextRetrieval + - call + +## TFBlipForQuestionAnswering[[transformers.TFBlipForQuestionAnswering]] + +[[autodoc]] TFBlipForQuestionAnswering + - call + + diff --git a/docs/source/ko/model_doc/chameleon.md b/docs/source/ko/model_doc/chameleon.md new file mode 100644 index 00000000000000..14a18a09765bd4 --- /dev/null +++ b/docs/source/ko/model_doc/chameleon.md @@ -0,0 +1,186 @@ + + +# Chameleon [[chameleon]] + +## 개요 [[overview]] + +Chameleon 모델은 META AI Chameleon 팀의 논문 [Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://arxiv.org/abs/2405.09818v1)에서 제안되었습니다. Chameleon은 벡터 양자화를 사용하여 이미지를 토큰화함으로써 멀티모달 출력을 생성할 수 있는 비전-언어 모델입니다. 이 모델은 교차된 형식을 포함한 이미지와 텍스트를 입력으로 받으며, 텍스트 응답을 생성합니다. 이미지 생성 모듈은 아직 공개되지 않았습니다. + +논문의 초록은 다음과 같습니다: + +*우리는 이미지와 텍스트를 임의의 순서로 이해하고 생성할 수 있는 early-fusion 토큰 기반의 혼합 모달(mixed-modal) 모델의 일종인 Chameleon을 소개합니다. 우리는 초기부터 안정적인 훈련 접근법, 정렬 방법, 그리고 early-fusion, 토큰 기반, 혼합 모달 설정에 맞춘 아키텍처 매개변수를 제시합니다. 이 모델들은 시각적 질문 응답, 이미지 캡션 생성, 텍스트 생성, 이미지 생성, 장문 혼합 모달 생성 등 포괄적인 작업 범위에서 평가되었습니다. Chameleon은 단일 모델에서 이미지 캡션 생성 작업에서의 최첨단 성능을 포함한 광범위하고 일반적으로 적용 가능한 능력을 보여주며, 텍스트 전용 작업에서 Llama-2를 능가하면서 Mixtral 8x7B와 Gemini-Pro와 같은 모델들 사이에서도 경쟁력을 갖추고 있습니다. 그리고 상당한 성능의 이미지 생성도 수행합니다. 또한 프롬프트나 출력에 이미지와 텍스트의 혼합 시퀀스가 포함된 새로운 장문 혼합 모달 생성 평가에서, 인간의 판단에 따르면 Gemini Pro와 GPT-4V를 포함한 훨씬 더 큰 모델의 성능과 동등하거나 이를 능가합니다. Chameleon은 완전한 멀티모달 문서의 통합 모델링에서 중요한 발전을 보여줍니다.* + + + +Chameleon은 이미지를 이산적인 토큰으로 변환하기 위해 벡터 양자화 모듈을 통합합니다. 이는 자기회귀 transformer를 사용한 이미지 생성을 가능하게 합니다. 원본 논문에서 가져왔습니다. + +이 모델은 [joaogante](https://huggingface.co/joaogante)와 [RaushanTurganbay](https://huggingface.co/RaushanTurganbay)가 기여했습니다. 원본 코드는 [여기](https://github.com/facebookresearch/chameleon)에서 찾을 수 있습니다. + +## 사용 팁 [[usage-tips]] + +- 더 정확한 결과를 위해, 배치 생성 시 `padding_side="left"`를 사용하는 것을 권장합니다. 생성하기 전에 `processor.tokenizer.padding_side = "left"`로 설정하십시오. + +- Chameleon은 안전성 정렬을 위해 튜닝되었음을 유의하십시오. 모델이 응답을 거부하는 경우, 열린 질문보다는 더 구체적으로 질문을 해보세요. + +- Chameleon은 채팅 형식으로 생성하므로, 생성된 텍스트는 항상 "assistant's turn"으로 표시됩니다. 프로세서를 호출할 때 `return_for_text_completion=True`를 전달하여 텍스트 완성 생성을 활성화할 수 있습니다. + +> [!NOTE] +> Transformers에서의 Chameleon 구현은 이미지 임베딩을 병합할 위치를 나타내기 위해 특별한 이미지 토큰을 사용합니다. 특별한 이미지 토큰을 위해 새로운 토큰을 추가하지 않고 예약된 토큰 중 하나인 ``를 사용했습니다. 올바른 생성을 위해 프롬프트에서 이미지가 임베딩될 위치에 ``를 추가해야 합니다. 
+ +## 사용 예제 [[usage-example]] + +### 단일 이미지 추론 [[single-image-inference]] + +Chameleon은 게이티드(gated) 모델이므로 Hugging Face Hub에 대한 액세스 권한이 있고 토큰으로 로그인했는지 확인하세요. 다음은 모델을 로드하고 반정밀도(`torch.bfloat16`)로 추론하는 방법입니다: + +```python +from transformers import ChameleonProcessor, ChameleonForConditionalGeneration +import torch +from PIL import Image +import requests + +processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") +model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16, device_map="cuda") + +# 이미지와 텍스트 프롬프트 준비 +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +prompt = "이 이미지에서 무엇을 보나요?" + +inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, dtype=torch.bfloat16) + +# 프롬프트를 자기회귀적으로 완성 +output = model.generate(**inputs, max_new_tokens=50) +print(processor.decode(output[0], skip_special_tokens=True)) +``` + +### 다중 이미지 추론 [[multi-image-inference]] + +Chameleon은 여러 이미지를 입력으로 받아들이며, 이미지들은 동일한 프롬프트에 속하거나 다른 프롬프트에 속할 수 있습니다(배치 추론에서). 다음은 그 방법입니다: + +```python +from transformers import ChameleonProcessor, ChameleonForConditionalGeneration +import torch +from PIL import Image +import requests + +processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") + +model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16, device_map="cuda") + +# 세 가지 다른 이미지 가져오기 +url = "https://www.ilankelman.org/stopsigns/australia.jpg" +image_stop = Image.open(requests.get(url, stream=True).raw) + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image_cats = Image.open(requests.get(url, stream=True).raw) + +url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg" +image_snowman = Image.open(requests.get(url, stream=True).raw) + +# 배치된 프롬프트 준비: 첫 번째는 다중 이미지 프롬프트이고 두 번째는 단일 이미지 프롬프트입니다 +prompts = [ + "이 이미지들은 무엇이 공통점인가요?", + "이 이미지에 무엇이 나타나 있나요?" +] + +# 이미지들을 텍스트 프롬프트에서 사용되어야 하는 순서대로 입력할 수 있습니다 +# 각 "" 토큰은 하나의 이미지를 사용하며, 다음 "" 토큰은 다음 이미지를 사용합니다 +inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(device="cuda", dtype=torch.bfloat16) + +# 생성 +generate_ids = model.generate(**inputs, max_new_tokens=50) +processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) +``` + +## 모델 최적화 [[model-optimization]] + +### Bitsandbytes를 사용한 양자화 [[quantization-using-bitsandbytes]] + +모델은 8비트 또는 4비트로 로드할 수 있으며, 이는 원본 모델의 성능을 유지하면서 메모리 요구 사항을 크게 줄여줍니다. 먼저 bitsandbytes를 설치하고(`pip install bitsandbytes`), 라이브러리가 지원하는 GPU/가속기를 사용 중인지 확인하십시오. + + + +bitsandbytes는 CUDA 이외의 여러 백엔드를 지원하도록 리팩터링되고 있습니다. 현재 ROCm(AMD GPU) 및 Intel CPU 구현이 성숙 단계이며, Intel XPU는 진행 중이고 Apple Silicon 지원은 Q4/Q1에 예상됩니다. 설치 지침 및 최신 백엔드 업데이트는 [이 링크](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend)를 방문하세요. + +전체 공개 전에 버그를 식별하는 데 도움이 되는 피드백을 환영합니다! 자세한 내용과 피드백은 [이 문서](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends)를 확인하세요. 
+ + + +위의 코드 스니펫을 다음과 같이 변경하면 됩니다: + +```python +from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig + +# 모델 양자화 방식 지정 +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", quantization_config=quantization_config, device_map="cuda") +``` + +### Flash-Attention 2와 SDPA를 사용하여 생성 속도 향상 [[use-flash-attention-2-and-sdpa-to-further-speed-up-generation]] + +이 모델은 최적화를 위해 Flash-Attention 2와 PyTorch의 [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)를 모두 지원합니다. SDPA는 모델을 로드할 때 기본 옵션입니다. Flash Attention 2로 전환하려면 먼저 flash-attn을 설치해야 합니다. 해당 패키지 설치에 대해서는 [원본 리포지토리](https://github.com/Dao-AILab/flash-attention)를 참고하십시오. 위의 코드 스니펫을 다음과 같이 변경하면 됩니다: + +```python +from transformers import ChameleonForConditionalGeneration + +model_id = "facebook/chameleon-7b" +model = ChameleonForConditionalGeneration.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + attn_implementation="flash_attention_2" +).to(0) +``` + +## ChameleonConfig [[transformers.ChameleonConfig]] + +[[autodoc]] ChameleonConfig + +## ChameleonVQVAEConfig [[transformers.ChameleonVQVAEConfig]] + +[[autodoc]] ChameleonVQVAEConfig + +## ChameleonProcessor [[transformers.ChameleonProcessor]] + +[[autodoc]] ChameleonProcessor + +## ChameleonImageProcessor [[transformers.ChameleonImageProcessor]] + +[[autodoc]] ChameleonImageProcessor + - preprocess + +## ChameleonVQVAE [[transformers.ChameleonVQVAE]] + +[[autodoc]] ChameleonVQVAE + - forward + +## ChameleonModel [[transformers.ChameleonModel]] + +[[autodoc]] ChameleonModel + - forward + +## ChameleonForConditionalGeneration [[transformers.ChameleonForConditionalGeneration]] + +[[autodoc]] ChameleonForConditionalGeneration + - forward diff --git a/docs/source/ko/model_doc/clip.md b/docs/source/ko/model_doc/clip.md new file mode 100644 index 00000000000000..5c19a31e10e658 --- /dev/null +++ b/docs/source/ko/model_doc/clip.md @@ -0,0 +1,318 @@ + + +# CLIP[[clip]] + +## 개요[[overview]] + +CLIP 모델은 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, +Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever가 제안한 [자연어 지도(supervision)를 통한 전이 가능한 시각 모델 학습](https://arxiv.org/abs/2103.00020)라는 논문에서 소개되었습니다. CLIP(Contrastive Language-Image Pre-Training)은 다양한 이미지와 텍스트 쌍으로 훈련된 신경망 입니다. GPT-2와 3의 제로샷 능력과 유사하게, 해당 작업에 직접적으로 최적화하지 않고도 주어진 이미지에 대해 가장 관련성 있는 텍스트 스니펫을 예측하도록 자연어로 지시할 수 있습니다. + +해당 논문의 초록입니다. + +*최신 컴퓨터 비전 시스템은 미리 정해진 고정된 객체 카테고리 집합을 예측하도록 훈련됩니다. 이러한 제한된 형태의 지도는 다른 시각적 개념을 지정하기 위해 추가적인 라벨링된 데이터가 필요하므로 그 일반성과 사용성을 제한합니다. 이미지 원시 텍스트에서 직접 학습하는 것은 훨씬 더 광범위한 지도 소스를 활용하는 아주 좋은 대안입니다. 이미지와 캡션을 맞추는 간단한 사전 학습 작업이, 인터넷에서 수집한 4억 쌍의 이미지-텍스트 데이터셋에서 SOTA 수준의 이미지 표현을 처음부터 효율적이고 확장 가능하게 학습하는 방법임을 확인할 수 있습니다. 사전 훈련 후, 자연어는 학습된 시각적 개념을 참조하거나 새로운 개념을 설명하는 데 사용되어 모델의 하위 작업으로의 제로샷 전이를 가능하게 합니다. 해당 논문에서는 OCR, 비디오 내 행동 인식, 지리적 위치 파악, 그리고 많은 종류의 세밀한 객체 분류 등 30개 이상의 다양한 기존 컴퓨터 비전 데이터셋에 대한 벤치마킹을 통해 이 접근 방식의 성능을 연구합니다. 이 모델은 대부분의 작업에 대해 의미 있게 전이되며, 종종 데이터셋별 훈련 없이도 완전 지도 학습 기준선과 경쟁력 있는 성능을 보입니다. 예를 들어, ImageNet에서 원래 ResNet-50의 정확도를 제로샷으로 일치시키는데, 이는 ResNet-50이 훈련된 128만 개의 훈련 예제를 전혀 사용할 필요가 없었습니다. 코드 및 사전 훈련된 모델 가중치는 이 https URL에서 공개합니다.* + +이 모델은 [valhalla](https://huggingface.co/valhalla)에 의해 기여되었습니다. 
+원본 코드는 [이곳](https://github.com/openai/CLIP)에서 확인할 수 있습니다. + +## 사용 팁과 예시[[usage-tips-and-example]] + +CLIP은 멀티모달 비전 밒 언어 모델입니다. 이미지-텍스트 유사도 계산과 제로샷 이미지 분류에 사용될 수 있습니다. CLIP은 ViT와 유사한 트랜스포머를 사용하여 시각적 특징을 추출하고, 인과적 언어 모델을 사용하여 텍스트 특징을 추출합니다. 그 후 텍스트와 시각적 특징 모두 동일한 차원의 잠재(latent) 공간으로 투영됩니다. 투영된 이미지와 텍스트 특징 사이의 내적이 유사도 점수로 사용됩니다. + +트랜스포머 인코더에 이미지를 입력하기 위해, 각 이미지는 고정 크기의 겹치지 않는 패치들의 시퀀스로 분할되고, 이후 선형 임베딩됩니다. [CLS]토큰이 전체 이미지의 표현으로 추가됩니다. 저자들은 또한 절대 위치 임베딩을 추가하고, 결과로 나온 벡터 시퀀스를 표준 트랜스포머 인토더에 입력합니다. [`CLIPImageProcessor`]는 모델을 위해 이미지를 리사이즈(또는 재스캐일링)하고 정규화하는데 사용될 수 있습니다. + +[`CLIPTokenizer`]는 텍스트를 인코딩하는데 사용됩니다. [`CLIPProcessor`]는 [`CLIPImageProcessor`]와 [`CLIPTokenizer`]를 하나의 인스턴스로 감싸서 텍스트를 인코딩하고 이미지를 준비하는데 모두 사용됩니다. + +다음 예시는 [`CLIPProcessor`]와 [`CLIPModel`]을 사용하여 이미지-텍스트 유사도 점수를 얻는 방법을 보여줍니다. + + +```python +>>> from PIL import Image +>>> import requests + +>>> from transformers import CLIPProcessor, CLIPModel + +>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") +>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) + +>>> outputs = model(**inputs) +>>> logits_per_image = outputs.logits_per_image # 이미지-텍스트 유사성 점수 +>>> probs = logits_per_image.softmax(dim=1) # 확률을 레이블링 하기위해서 소프트맥스를 취합니다. +``` + + +### CLIP과 플래시 어텐션2 결합[[combining-clip-and-flash-attention-2]] + +먼저 최신버전의 플래시 어텐션2를 설치합니다. + +```bash +pip install -U flash-attn --no-build-isolation +``` + +플래시 어텐션2와 호환되는 하드웨어를 가지고 있는지 확인하세요. 이에 대한 자세한 내용은 flash-attn 리포지토리의 공식문서에서 확인할 수 있습니다. 또한 모델을 반정밀도(`torch.float16`)로 로드하는 것을 잊지 마세요. + + + +작은 배치 크기를 사용할 때, 플래시 어텐션을 사용하면 모델이 느려지는 것을 느낄 수 있습니다.아래의 [플래시 어텐션과 SDPA를 사용한 예상 속도 향상](#Expected-speedups-with-Flash-Attention-and-SDPA) 섹션을 참조하여 적절한 어텐션 구현을 선택하세요. + + + +플래시 어텐션2를 사용해서 모델을 로드하고 구동하기 위해서 다음 스니펫을 참고하세요: + +```python +>>> import torch +>>> import requests +>>> from PIL import Image + +>>> from transformers import CLIPProcessor, CLIPModel + +>>> device = "cuda" +>>> torch_dtype = torch.float16 + +>>> model = CLIPModel.from_pretrained( +... "openai/clip-vit-base-patch32", +... attn_implementation="flash_attention_2", +... device_map=device, +... torch_dtype=torch_dtype, +... ) +>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) +>>> inputs.to(device) + +>>> with torch.no_grad(): +... with torch.autocast(device): +... outputs = model(**inputs) + +>>> logits_per_image = outputs.logits_per_image # 이미지-텍스트 유사성 점수 +>>> probs = logits_per_image.softmax(dim=1) # 확률을 레이블링 하기위해서 소프트맥스를 취합니다. +>>> print(probs) +tensor([[0.9946, 0.0052]], device='cuda:0', dtype=torch.float16) +``` + + +### 스케일된 내적 어텐션 (Scaled dot-product Attention(SDPA)) 사용하기[[using-scaled-dot-product-attention-sdpa]] + +파이토치는 `torch.nn.functional`의 일부로 네이티브 스케일된 내적 어텐션(SPDA) 연산자를 포함하고 있습니다. 이 함수는 입력과 사용 중인 하드웨어에 따라 적용될 수 있는 여러 구현을 포함합니다. 자세한 정보는 [공식문서](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)나 [GPU 추론](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) 페이지를 참조하세요. 
+ +`torch>=2.1.1`에서는 구현이 가능할 때 SDPA가 기본적으로 사용되지만, `from_pretrained()` 함수에서 `attn_implementation="sdpa"`를 설정하여 SDPA를 명시적으로 사용하도록 요청할 수도 있습니다. + +```python +from transformers import CLIPModel + +model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", torch_dtype=torch.float16, attn_implementation="sdpa") +``` + +최고의 속도향상을 위해서, 반정밀도로 모델을 로드하는 것을 추천합니다. (예를들면 `torch.float16` 또는 `torch.bfloat16`). + +### 플래시 어텐션과 스케일된 내적 어텐션(SDPA)으로 인해 예상되는 속도향상[[expected-speedups-with-flash-attention-and-sdpa]] + +로컬 벤치마크(NVIDIA A10G, PyTorch 2.3.1+cu121)에서 `float16`을 사용하여 `"openai/clip-vit-large-patch14"` 체크포인트로 추론을 수행했을 때, 다음과 같은 속도 향상을 확인 했습니다. +[코드](https://gist.github.com/qubvel/ac691a54e54f9fae8144275f866a7ff8): + +#### CLIPTextModel[[cliptextmodel]] + +| Num text labels | Eager (s/iter) | FA2 (s/iter) | FA2 speedup | SDPA (s/iter) | SDPA speedup | +|------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:| +| 4 | 0.009 | 0.012 | 0.737 | 0.007 | 1.269 | +| 16 | 0.009 | 0.014 | 0.659 | 0.008 | 1.187 | +| 32 | 0.018 | 0.021 | 0.862 | 0.016 | 1.142 | +| 64 | 0.034 | 0.034 | 1.001 | 0.03 | 1.163 | +| 128 | 0.063 | 0.058 | 1.09 | 0.054 | 1.174 | + +![clip_text_model_viz_3](https://github.com/user-attachments/assets/e9826b43-4e66-4f4c-952b-af4d90bd38eb) + +#### CLIPVisionModel[[clipvisionmodel]] + +| Image batch size | Eager (s/iter) | FA2 (s/iter) | FA2 speedup | SDPA (s/iter) | SDPA speedup | +|-------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:| +| 1 | 0.016 | 0.013 | 1.247 | 0.012 | 1.318 | +| 4 | 0.025 | 0.021 | 1.198 | 0.021 | 1.202 | +| 16 | 0.093 | 0.075 | 1.234 | 0.075 | 1.24 | +| 32 | 0.181 | 0.147 | 1.237 | 0.146 | 1.241 | + +![clip_image_model_viz_3](https://github.com/user-attachments/assets/50a36206-e3b9-4adc-ac8e-926b8b071d63) + +#### CLIPModel[[clipmodel]] + +| Image batch size | Num text labels | Eager (s/iter) | FA2 (s/iter) | FA2 speedup | SDPA (s/iter) | SDPA speedup | +|-------------------:|------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:| +| 1 | 4 | 0.025 | 0.026 | 0.954 | 0.02 | 1.217 | +| 1 | 16 | 0.026 | 0.028 | 0.918 | 0.02 | 1.287 | +| 1 | 64 | 0.042 | 0.046 | 0.906 | 0.036 | 1.167 | +| 4 | 4 | 0.028 | 0.033 | 0.849 | 0.024 | 1.189 | +| 4 | 16 | 0.034 | 0.035 | 0.955 | 0.029 | 1.169 | +| 4 | 64 | 0.059 | 0.055 | 1.072 | 0.05 | 1.179 | +| 16 | 4 | 0.096 | 0.088 | 1.091 | 0.078 | 1.234 | +| 16 | 16 | 0.102 | 0.09 | 1.129 | 0.083 | 1.224 | +| 16 | 64 | 0.127 | 0.11 | 1.157 | 0.105 | 1.218 | +| 32 | 4 | 0.185 | 0.159 | 1.157 | 0.149 | 1.238 | +| 32 | 16 | 0.19 | 0.162 | 1.177 | 0.154 | 1.233 | +| 32 | 64 | 0.216 | 0.181 | 1.19 | 0.176 | 1.228 | + +## 자료[[resources]] + +CLIP을 시작하는 데 도움이 되는 Hugging Face와 community 자료 목록(🌎로 표시됨) 입니다. + +- [원격 센싱 (인공위성) 이미지와 캡션을 가지고 CLIP 미세조정하기](https://huggingface.co/blog/fine-tune-clip-rsicd): +[RSICD dataset](https://github.com/201528014227051/RSICD_optimal)을 가지고 CLIP을 미세조정 하는 방법과 데이터 증강에 대한 성능 비교에 대한 블로그 포스트 +- 이 [예시 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/contrastive-image-text)는 [COCO dataset](https://cocodataset.org/#home)를 이용한 사전학습된 비전과 텍스트와 인코더를 사용해서 CLIP같은 비전-텍스트 듀얼 모델을 어떻게 학습시키는지 보여줍니다. 
+ + + +- 사전학습된 CLIP모델을 이미지 캡셔닝을 위한 빔서치 추론에 어떻게 활용하는지에 관한 [노트북](https://colab.research.google.com/drive/1tuoAC5F4sC7qid56Z0ap-stR3rwdk0ZV?usp=sharing) + +**이미지 검색** + +- 사전학습된 CLIP모델과 MRR(Mean Reciprocal Rank) 점수 연산을 사용한 이미지 검색에 대한 [노트북](https://colab.research.google.com/drive/1bLVwVKpAndpEDHqjzxVPr_9nGrSbuOQd?usp=sharing). 🌎 +- 이미지 검색과 유사성 점수에 대해 보여주는 [노트북](https://colab.research.google.com/github/deep-diver/image_search_with_natural_language/blob/main/notebooks/Image_Search_CLIP.ipynb). 🌎 +- Multilingual CLIP를 사용해서 이미지와 텍스트를 어떻게 같은 벡터 공간에 매핑 시키는지에 대한 [노트북](https://colab.research.google.com/drive/1xO-wC_m_GNzgjIBQ4a4znvQkvDoZJvH4?usp=sharing). 🌎 +- [Unsplash](https://unsplash.com)와 [TMDB](https://www.themoviedb.org/) 데이터셋을 활용한 의미론적(semantic) 이미지 검색에서 CLIP을 구동하는 방법에 대한 [노트북](https://colab.research.google.com/github/vivien000/clip-demo/blob/master/clip.ipynb#scrollTo=uzdFhRGqiWkR). 🌎 + +**설명 가능성** + +- 입력 토큰과 이미지 조각(segment) 사이의 유사성을 시각화 시키는 방법에 대한 [노트북](https://colab.research.google.com/github/hila-chefer/Transformer-MM-Explainability/blob/main/CLIP_explainability.ipynb). 🌎 + +여기에 포함될 자료를 제출하고 싶으시다면 PR(Pull Request)를 열어주세요. 리뷰 해드리겠습니다! 자료는 기존 자료를 복제하는 대신 새로운 내용을 담고 있어야 합니다. + +## CLIPConfig[[transformers.CLIPConfig]] + +[[autodoc]] CLIPConfig + - from_text_vision_configs + +## CLIPTextConfig[[transformers.CLIPTextConfig]] + +[[autodoc]] CLIPTextConfig + +## CLIPVisionConfig[[transformers.CLIPVisionConfig]] + +[[autodoc]] CLIPVisionConfig + +## CLIPTokenizer[[transformers.CLIPTokenizer]] + +[[autodoc]] CLIPTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## CLIPTokenizerFast[[transformers.CLIPTokenizerFast]] + +[[autodoc]] CLIPTokenizerFast + +## CLIPImageProcessor[[transformers.CLIPImageProcessor]] + +[[autodoc]] CLIPImageProcessor + - preprocess + +## CLIPFeatureExtractor[[transformers.CLIPFeatureExtractor]] + +[[autodoc]] CLIPFeatureExtractor + +## CLIPProcessor[[transformers.CLIPProcessor]] + +[[autodoc]] CLIPProcessor + + + + +## CLIPModel[[transformers.CLIPModel]] + +[[autodoc]] CLIPModel + - forward + - get_text_features + - get_image_features + +## CLIPTextModel[[transformers.CLIPTextModel]] + +[[autodoc]] CLIPTextModel + - forward + +## CLIPTextModelWithProjection[[transformers.CLIPTextModelWithProjection]] + +[[autodoc]] CLIPTextModelWithProjection + - forward + +## CLIPVisionModelWithProjection[[transformers.CLIPVisionModelWithProjection]] + +[[autodoc]] CLIPVisionModelWithProjection + - forward + +## CLIPVisionModel[[transformers.CLIPVisionModel]] + +[[autodoc]] CLIPVisionModel + - forward + +## CLIPForImageClassification[[transformers.CLIPForImageClassification]] + +[[autodoc]] CLIPForImageClassification + - forward + + + + +## TFCLIPModel[[transformers.TFCLIPModel]] + +[[autodoc]] TFCLIPModel + - call + - get_text_features + - get_image_features + +## TFCLIPTextModel[[transformers.TFCLIPTextModel]] + +[[autodoc]] TFCLIPTextModel + - call + +## TFCLIPVisionModel[[transformers.TFCLIPVisionModel]] + +[[autodoc]] TFCLIPVisionModel + - call + + + + +## FlaxCLIPModel[[transformers.FlaxCLIPModel]] + +[[autodoc]] FlaxCLIPModel + - __call__ + - get_text_features + - get_image_features + +## FlaxCLIPTextModel[[transformers.FlaxCLIPTextModel]] + +[[autodoc]] FlaxCLIPTextModel + - __call__ + +## FlaxCLIPTextModelWithProjection[[transformers.FlaxCLIPTextModelWithProjection]] + +[[autodoc]] FlaxCLIPTextModelWithProjection + - __call__ + +## FlaxCLIPVisionModel[[transformers.FlaxCLIPVisionModel]] + 
+[[autodoc]] FlaxCLIPVisionModel + - __call__ + + + diff --git a/docs/source/ko/model_doc/cohere.md b/docs/source/ko/model_doc/cohere.md new file mode 100644 index 00000000000000..c577764623f496 --- /dev/null +++ b/docs/source/ko/model_doc/cohere.md @@ -0,0 +1,139 @@ +# Cohere[[cohere]] + +## 개요[[overview]] + +The Cohere Command-R 모델은 Cohere팀이 [Command-R: 프로덕션 규모의 검색 증강 생성](https://txt.cohere.com/command-r/)라는 블로그 포스트에서 소개 되었습니다. + +논문 초록: + +*Command-R은 기업의 프로덕션 규모 AI를 가능하게 하기 위해 RAG(검색 증강 생성)와 도구 사용을 목표로 하는 확장 가능한 생성 모델입니다. 오늘 우리는 대규모 프로덕션 워크로드를 목표로 하는 새로운 LLM인 Command-R을 소개합니다. Command-R은 높은 효율성과 강력한 정확성의 균형을 맞추는 '확장 가능한' 모델 카테고리를 대상으로 하여, 기업들이 개념 증명을 넘어 프로덕션 단계로 나아갈 수 있게 합니다.* + +*Command-R은 검색 증강 생성(RAG)이나 외부 API 및 도구 사용과 같은 긴 문맥 작업에 최적화된 생성 모델입니다. 이 모델은 RAG 애플리케이션을 위한 최고 수준의 통합을 제공하고 기업 사용 사례에서 뛰어난 성능을 발휘하기 위해 우리의 업계 선도적인 Embed 및 Rerank 모델과 조화롭게 작동하도록 설계되었습니다. 기업이 대규모로 구현할 수 있도록 만들어진 모델로서, Command-R은 다음과 같은 특징을 자랑합니다: +- RAG 및 도구 사용에 대한 강력한 정확성 +- 낮은 지연 시간과 높은 처리량 +- 더 긴 128k 컨텍스트와 낮은 가격 +- 10개의 주요 언어에 걸친 강력한 기능 +- 연구 및 평가를 위해 HuggingFace에서 사용 가능한 모델 가중치 + +모델 체크포인트는 [이곳](https://huggingface.co/CohereForAI/c4ai-command-r-v01)에서 확인하세요. +이 모델은 [Saurabh Dash](https://huggingface.co/saurabhdash)과 [Ahmet Üstün](https://huggingface.co/ahmetustun)에 의해 기여 되었습니다. Hugging Face에서 이 코드의 구현은 [GPT-NeoX](https://github.com/EleutherAI/gpt-neox)에 기반하였습니다. + +## 사용 팁[[usage-tips]] + + + +Hub에 업로드된 체크포인트들은 `torch_dtype = 'float16'`을 사용합니다. +이는 `AutoModel` API가 체크포인트를 `torch.float32`에서 `torch.float16`으로 변환하는 데 사용됩니다. + +온라인 가중치의 `dtype`은 `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`를 사용하여 모델을 초기화할 때 `torch_dtype="auto"`를 사용하지 않는 한 대부분 무관합니다. 그 이유는 모델이 먼저 다운로드되고(온라인 체크포인트의 `dtype` 사용), 그 다음 `torch`의 기본 `dtype`으로 변환되며(이때 `torch.float32`가 됨), 마지막으로 config에 `torch_dtype`이 제공된 경우 이를 사용하기 때문입니다. + +모델을 `float16`으로 훈련하는 것은 권장되지 않으며 `nan`을 생성하는 것으로 알려져 있습니다. 따라서 모델은 `bfloat16`으로 훈련해야 합니다. + +모델과 토크나이저는 다음과 같이 로드할 수 있습니다: + +```python +# pip install transformers +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_id = "CohereForAI/c4ai-command-r-v01" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id) + +# Format message with the command-r chat template +messages = [{"role": "user", "content": "Hello, how are you?"}] +input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") +## <|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> + +gen_tokens = model.generate( + input_ids, + max_new_tokens=100, + do_sample=True, + temperature=0.3, + ) + +gen_text = tokenizer.decode(gen_tokens[0]) +print(gen_text) +``` + +- Flash Attention 2를 `attn_implementation="flash_attention_2"`를 통해 사용할 때는, `from_pretrained` 클래스 메서드에 `torch_dtype`을 전달하지 말고 자동 혼합 정밀도 훈련(Automatic Mixed-Precision training)을 사용하세요. `Trainer`를 사용할 때는 단순히 `fp16` 또는 `bf16`을 `True`로 지정하면 됩니다. 그렇지 않은 경우에는 `torch.autocast`를 사용하고 있는지 확인하세요. 이는 Flash Attention이 `fp16`와 `bf16` 데이터 타입만 지원하기 때문에 필요합니다. + +## 리소스[[resources]] + +Command-R을 시작하는 데 도움이 되는 Hugging Face와 community 자료 목록(🌎로 표시됨) 입니다. 여기에 포함될 자료를 제출하고 싶으시다면 PR(Pull Request)를 열어주세요. 리뷰 해드리겠습니다! 자료는 기존 자료를 복제하는 대신 새로운 내용을 담고 있어야 합니다. 
+ + + + +FP16 모델 로딩 +```python +# pip install transformers +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_id = "CohereForAI/c4ai-command-r-v01" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id) + +# command-r 챗 템플릿으로 메세지 형식을 정하세요 +messages = [{"role": "user", "content": "Hello, how are you?"}] +input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") +## <|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> + +gen_tokens = model.generate( + input_ids, + max_new_tokens=100, + do_sample=True, + temperature=0.3, + ) + +gen_text = tokenizer.decode(gen_tokens[0]) +print(gen_text) +``` + +bitsandbytes 라이브러리를 이용해서 4bit 양자화된 모델 로딩 +```python +# pip install transformers bitsandbytes accelerate +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig + +bnb_config = BitsAndBytesConfig(load_in_4bit=True) + +model_id = "CohereForAI/c4ai-command-r-v01" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) + +gen_tokens = model.generate( + input_ids, + max_new_tokens=100, + do_sample=True, + temperature=0.3, + ) + +gen_text = tokenizer.decode(gen_tokens[0]) +print(gen_text) +``` + + +## CohereConfig[[transformers.CohereConfig]] + +[[autodoc]] CohereConfig + +## CohereTokenizerFast[[transformers.CohereTokenizerFast]] + +[[autodoc]] CohereTokenizerFast + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - update_post_processor + - save_vocabulary + +## CohereModel[[transformers.CohereModel]] + +[[autodoc]] CohereModel + - forward + + +## CohereForCausalLM[[transformers.CohereForCausalLM]] + +[[autodoc]] CohereForCausalLM + - forward + + diff --git a/docs/source/ko/model_doc/dbrx.md b/docs/source/ko/model_doc/dbrx.md new file mode 100644 index 00000000000000..4b33e95700691e --- /dev/null +++ b/docs/source/ko/model_doc/dbrx.md @@ -0,0 +1,125 @@ + + +# DBRX[[dbrx]] + +## 개요[[overview]] + +DBRX는 [트랜스포머 기반의](https://www.isattentionallyouneed.com/) 다음 토큰을 예측하는 디코더 전용 LLM 모델입니다. +총 132B 매개변수를 가진 *세밀한* 전문가 혼합(MoE) 아키텍처를 사용하며, 이 중 36B 매개변수가 입력마다 활성화됩니다. +12T 토큰의 텍스트와 코드 데이터로 사전 학습되었습니다. + +Mixtral-8x7B와 Grok-1과 같은 다른 공개 MoE 모델들과 비교했을 때, DBRX는 더 많은 수의 작은 전문가들을 사용하는 세밀한 구조를 가지고 있습니다. DBRX는 16개의 전문가 중 4개를 선택하는 반면, Mixtral-8x7B와 Grok-1은 8개의 전문가 중 2개를 선택합니다. + +이는 65배 더 많은 전문가 조합을 가능하게 하며, 이를 통해 모델의 품질이 향상되는 것을 발견했습니다. +DBRX는 회전 위치 인코딩(RoPE), 게이트 선형 유닛(GLU), 그룹 쿼리 어텐션(GQA)을 사용합니다. +BPE 기반 모델이며 [tiktoken](https://github.com/openai/tiktoken) 저장소에 설명된 GPT-4 토크나이저를 사용합니다. +이러한 선택들은 철저한 평가와 스케일링 실험을 기반으로 이루어졌습니다. + +DBRX는 신중하게 선별된 12T 토큰의 데이터로 사전 학습되었으며, 최대 문맥 길이는 32K 토큰입니다. +이 데이터는 토큰 대비 MPT 계열 모델 학습에 사용된 데이터보다 최소 2배 이상 더 좋은 것으로 추정됩니다. +이 새로운 데이터셋은 데이터 처리를 위한 Apache Spark™와 Databricks 노트북, 그리고 데이터 관리와 거버넌스를 위한 Unity Catalog를 포함한 Databricks 도구 전체를 활용하여 개발되었습니다. +우리는 사전 학습을 위해 커리큘럼 학습을 사용했으며, 학습 중 데이터 믹스를 변경하는 방식이 모델 품질을 상당히 개선한다는 것을 발견했습니다. + + +DBRX Instruct와 DBRX Base에 대한 더 자세한 정보는 이 [기술 블로그 포스트](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm)에서 확인할 수 있습니다. + +이 모델은 [eitan-turok](https://huggingface.co/eitanturok)와 [abhi-db](https://huggingface.co/abhi-db)가 기여했습니다. 원본 코드는 [이곳](https://github.com/databricks/dbrx-instruct)에서 찾을 수 있지만, 최신 버전이 아닐 수 있습니다. 
+ +## 사용 예[[usage-examples]] + +`generate()` 메소드는 DBRX를 사용하여 텍스트를 생성하는 데 사용될 수 있습니다. 표준 어텐션 구현, 플래시 어텐션, PyTorch의 스케일된 내적 어텐션(Scaled Dot-Product Attention)을 사용하여 생성할 수 있습니다. 후자의 두 어텐션 구현 방식은 처리 속도를 크게 높여줍니다. + +```python +from transformers import DbrxForCausalLM, AutoTokenizer +import torch + +tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", token="YOUR_HF_TOKEN") +model = DbrxForCausalLM.from_pretrained( + "databricks/dbrx-instruct", + device_map="auto", + torch_dtype=torch.bfloat16, + token="YOUR_HF_TOKEN", + ) + +input_text = "What does it take to build a great LLM?" +messages = [{"role": "user", "content": input_text}] +input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda") + +outputs = model.generate(**input_ids, max_new_tokens=200) +print(tokenizer.decode(outputs[0])) +``` + +`pip install flash-attn`를 통해 플래시 어텐션을 설치하면, 더 빠른 생성이 가능합니다. (플래시 어텐션에 대한 HuggingFace 문서는 [이곳](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2)에서 확인할 수 있습니다.) + + +```python +from transformers import DbrxForCausalLM, AutoTokenizer +import torch + +tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", token="YOUR_HF_TOKEN") +model = DbrxForCausalLM.from_pretrained( + "databricks/dbrx-instruct", + device_map="auto", + torch_dtype=torch.bfloat16, + token="YOUR_HF_TOKEN", + attn_implementation="flash_attention_2", + ) + +input_text = "What does it take to build a great LLM?" +messages = [{"role": "user", "content": input_text}] +input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda") + +outputs = model.generate(**input_ids, max_new_tokens=200) +print(tokenizer.decode(outputs[0])) +``` + +PyTorch의 스케일된 내적 어텐션을 사용하여도 더 빠른 생성이 가능합니다. (스케일된 내적 어텐션에 대한 HuggingFace 문서는 [이곳](https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)에서 확인할 수 있습니다.) + + +```python +from transformers import DbrxForCausalLM, AutoTokenizer +import torch + +tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", token="YOUR_HF_TOKEN") +model = DbrxForCausalLM.from_pretrained( + "databricks/dbrx-instruct", + device_map="auto", + torch_dtype=torch.bfloat16, + token="YOUR_HF_TOKEN", + attn_implementation="sdpa", + ) + +input_text = "What does it take to build a great LLM?" +messages = [{"role": "user", "content": input_text}] +input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda") + +outputs = model.generate(**input_ids, max_new_tokens=200) +print(tokenizer.decode(outputs[0])) +``` + +## DbrxConfig[[transformers.DbrxConfig]] + +[[autodoc]] DbrxConfig + + +## DbrxModel[[transformers.DbrxModel]] + +[[autodoc]] DbrxModel + - forward + + +## DbrxForCausalLM[[transformers.DbrxForCausalLM]] + +[[autodoc]] DbrxForCausalLM + - forward + diff --git a/docs/source/ko/model_doc/deberta-v2.md b/docs/source/ko/model_doc/deberta-v2.md new file mode 100644 index 00000000000000..2e590ad8a5abcf --- /dev/null +++ b/docs/source/ko/model_doc/deberta-v2.md @@ -0,0 +1,147 @@ + + +# DeBERTa-v2 + +## 개요 + + +DeBERTa 모델은 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen이 작성한 [DeBERTa: 분리된 어텐션을 활용한 디코딩 강화 BERT](https://arxiv.org/abs/2006.03654)이라는 논문에서 제안되었습니다. 이 모델은 2018년 Google이 발표한 BERT 모델과 2019년 Facebook이 발표한 RoBERTa 모델을 기반으로 합니다. 
+DeBERTa는 RoBERTa에서 사용된 데이터의 절반만을 사용하여 분리된(disentangled) 어텐션과 향상된 마스크 디코더 학습을 통해 RoBERTa를 개선했습니다. + +논문의 초록은 다음과 같습니다: + +*사전 학습된 신경망 언어 모델의 최근 발전은 많은 자연어 처리(NLP) 작업의 성능을 크게 향상시켰습니다. 본 논문에서는 두 가지 새로운 기술을 사용하여 BERT와 RoBERTa 모델을 개선한 새로운 모델 구조인 DeBERTa를 제안합니다. 첫 번째는 분리된 어텐션 메커니즘으로, 각 단어가 내용과 위치를 각각 인코딩하는 두 개의 벡터로 표현되며, 단어들 간의 어텐션 가중치는 내용과 상대적 위치에 대한 분리된 행렬을 사용하여 계산됩니다. 두 번째로, 모델 사전 학습을 위해 마스킹된 토큰을 예측하는 출력 소프트맥스 층을 대체하는 향상된 마스크 디코더가 사용됩니다. 우리는 이 두 가지 기술이 모델 사전 학습의 효율성과 다운스트림 작업의 성능을 크게 향상시킨다는 것을 보여줍니다. RoBERTa-Large와 비교했을 때, 절반의 학습 데이터로 학습된 DeBERTa 모델은 광범위한 NLP 작업에서 일관되게 더 나은 성능을 보여주며, MNLI에서 +0.9%(90.2% vs 91.1%), SQuAD v2.0에서 +2.3%(88.4% vs 90.7%), RACE에서 +3.6%(83.2% vs 86.8%)의 성능 향상을 달성했습니다. DeBERTa 코드와 사전 학습된 모델은 https://github.com/microsoft/DeBERTa 에서 공개될 예정입니다.* + + +다음 정보들은 [원본 구현 저장소](https://github.com/microsoft/DeBERTa)에서 보실 수 있습니다. DeBERTa v2는 DeBERTa의 두번째 모델입니다. +DeBERTa v2는 SuperGLUE 단일 모델 제출에 사용된 1.5B 모델을 포함하며, 인간 기준점(베이스라인) 89.8점 대비 89.9점을 달성했습니다. 저자의 +[블로그](https://www.microsoft.com/en-us/research/blog/microsoft-deberta-surpasses-human-performance-on-the-superglue-benchmark/)에서 더 자세한 정보를 확인할 수 있습니다. + +v2의 새로운 점: + +- **어휘(Vocabulary)** v2에서는 학습 데이터로부터 구축된 128K 크기의 새로운 어휘를 사용하도록 토크나이저가 변경되었습니다. GPT2 기반 토크나이저 대신, 이제는 [센텐스피스 기반](https://github.com/google/sentencepiece) 토크나이저를 사용합니다. +- **nGiE[n그램 유도(Induced) 입력 인코딩]** DeBERTa-v2 모델은 입력 토큰들의 지역적 의존성을 더 잘 학습하기 위해 첫 번째 트랜스포머 층과 함께 추가적인 합성곱 층을 사용합니다. +- **어텐션 층에서 위치 투영 행렬과 내용 투영 행렬 공유** 이전 실험들을 기반으로, 이는 성능에 영향을 주지 않으면서 매개변수를 절약할 수 있습니다. +- **상대적 위치를 인코딩하기 위한 버킷 적용** DeBERTa-v2 모델은 T5와 유사하게 상대적 위치를 인코딩하기 위해 로그 버킷을 사용합니다. +- **900M 모델 & 1.5B 모델** 900M과 1.5B, 두 가지 추가 모델 크기가 제공되며, 이는 다운스트림 작업의 성능을 크게 향상시킵니다. + +[DeBERTa](https://huggingface.co/DeBERTa) 모델의 텐서플로 2.0 구현은 [kamalkraj](https://huggingface.co/kamalkraj)가 기여했습니다. 원본 코드는 [이곳](https://github.com/microsoft/DeBERTa)에서 확인하실 수 있습니다. 
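+
+## 사용 예시
+
+아래는 센텐스피스 기반 토크나이저와 사전 학습된 DeBERTa-v2 모델을 불러와 문장의 은닉 상태를 추출하는 최소한의 예시입니다. 체크포인트 이름(`microsoft/deberta-v2-xlarge`)은 설명을 위한 예시이며, 실행하려면 `sentencepiece` 패키지가 설치되어 있어야 합니다.
+
+```python
+from transformers import AutoTokenizer, AutoModel
+
+# 센텐스피스 기반 토크나이저와 사전 학습된 모델 로드
+tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v2-xlarge")
+model = AutoModel.from_pretrained("microsoft/deberta-v2-xlarge")
+
+# 입력을 토큰화한 뒤 마지막 은닉 상태의 크기를 확인합니다
+inputs = tokenizer("DeBERTa-v2 uses a 128K SentencePiece vocabulary.", return_tensors="pt")
+outputs = model(**inputs)
+print(outputs.last_hidden_state.shape)  # (배치 크기, 시퀀스 길이, 은닉 차원)
+```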
+ +## 자료 + +- [텍스트 분류 작업 가이드](../tasks/sequence_classification) +- [토큰 분류 작업 가이드](../tasks/token_classification) +- [질의응답 작업 가이드](../tasks/question_answering) +- [마스크 언어 모델링 작업 가이드](../tasks/masked_language_modeling) +- [다중 선택 작업 가이드](../tasks/multiple_choice) + +## DebertaV2Config + +[[autodoc]] DebertaV2Config + +## DebertaV2Tokenizer + +[[autodoc]] DebertaV2Tokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## DebertaV2TokenizerFast + +[[autodoc]] DebertaV2TokenizerFast + - build_inputs_with_special_tokens + - create_token_type_ids_from_sequences + + + + +## DebertaV2Model + +[[autodoc]] DebertaV2Model + - forward + +## DebertaV2PreTrainedModel + +[[autodoc]] DebertaV2PreTrainedModel + - forward + +## DebertaV2ForMaskedLM + +[[autodoc]] DebertaV2ForMaskedLM + - forward + +## DebertaV2ForSequenceClassification + +[[autodoc]] DebertaV2ForSequenceClassification + - forward + +## DebertaV2ForTokenClassification + +[[autodoc]] DebertaV2ForTokenClassification + - forward + +## DebertaV2ForQuestionAnswering + +[[autodoc]] DebertaV2ForQuestionAnswering + - forward + +## DebertaV2ForMultipleChoice + +[[autodoc]] DebertaV2ForMultipleChoice + - forward + + + + +## TFDebertaV2Model + +[[autodoc]] TFDebertaV2Model + - call + +## TFDebertaV2PreTrainedModel + +[[autodoc]] TFDebertaV2PreTrainedModel + - call + +## TFDebertaV2ForMaskedLM + +[[autodoc]] TFDebertaV2ForMaskedLM + - call + +## TFDebertaV2ForSequenceClassification + +[[autodoc]] TFDebertaV2ForSequenceClassification + - call + +## TFDebertaV2ForTokenClassification + +[[autodoc]] TFDebertaV2ForTokenClassification + - call + +## TFDebertaV2ForQuestionAnswering + +[[autodoc]] TFDebertaV2ForQuestionAnswering + - call + +## TFDebertaV2ForMultipleChoice + +[[autodoc]] TFDebertaV2ForMultipleChoice + - call + + + diff --git a/docs/source/ko/model_doc/deberta.md b/docs/source/ko/model_doc/deberta.md new file mode 100644 index 00000000000000..6f82bed033a0e0 --- /dev/null +++ b/docs/source/ko/model_doc/deberta.md @@ -0,0 +1,152 @@ + + +# DeBERTa[[deberta]] + +## 개요[[overview]] + + +DeBERTa 모델은 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen이 작성한 [DeBERTa: 분리된 어텐션을 활용한 디코딩 강화 BERT](https://arxiv.org/abs/2006.03654)이라는 논문에서 제안되었습니다. 이 모델은 2018년 Google이 발표한 BERT 모델과 2019년 Facebook이 발표한 RoBERTa 모델을 기반으로 합니다. +DeBERTa는 RoBERTa에서 사용된 데이터의 절반만을 사용하여 분리된(disentangled) 어텐션과 향상된 마스크 디코더 학습을 통해 RoBERTa를 개선했습니다. + +논문의 초록은 다음과 같습니다: + +*사전 학습된 신경망 언어 모델의 최근 발전은 많은 자연어 처리(NLP) 작업의 성능을 크게 향상시켰습니다. 본 논문에서는 두 가지 새로운 기술을 사용하여 BERT와 RoBERTa 모델을 개선한 새로운 모델 구조인 DeBERTa를 제안합니다. 첫 번째는 분리된 어텐션 메커니즘으로, 각 단어가 내용과 위치를 각각 인코딩하는 두 개의 벡터로 표현되며, 단어들 간의 어텐션 가중치는 내용과 상대적 위치에 대한 분리된 행렬을 사용하여 계산됩니다. 두 번째로, 모델 사전 학습을 위해 마스킹된 토큰을 예측하는 출력 소프트맥스 층을 대체하는 향상된 마스크 디코더가 사용됩니다. 우리는 이 두 가지 기술이 모델 사전 학습의 효율성과 다운스트림 작업의 성능을 크게 향상시킨다는 것을 보여줍니다. RoBERTa-Large와 비교했을 때, 절반의 학습 데이터로 학습된 DeBERTa 모델은 광범위한 NLP 작업에서 일관되게 더 나은 성능을 보여주며, MNLI에서 +0.9%(90.2% vs 91.1%), SQuAD v2.0에서 +2.3%(88.4% vs 90.7%), RACE에서 +3.6%(83.2% vs 86.8%)의 성능 향상을 달성했습니다. DeBERTa 코드와 사전 학습된 모델은 https://github.com/microsoft/DeBERTa 에서 공개될 예정입니다.* + +[DeBERTa](https://huggingface.co/DeBERTa) 모델의 텐서플로 2.0 구현은 [kamalkraj](https://huggingface.co/kamalkraj)가 기여했습니다. 원본 코드는 [이곳](https://github.com/microsoft/DeBERTa)에서 확인하실 수 있습니다. + +## 리소스[[resources]] + + +DeBERTa를 시작하는 데 도움이 되는 Hugging Face와 community 자료 목록(🌎로 표시됨) 입니다. 여기에 포함될 자료를 제출하고 싶으시다면 PR(Pull Request)를 열어주세요. 리뷰해 드리겠습니다! 자료는 기존 자료를 복제하는 대신 새로운 내용을 담고 있어야 합니다. 
+ + + + +- DeBERTa와 [DeepSpeed를 이용해서 대형 모델 학습을 가속시키는](https://huggingface.co/blog/accelerate-deepspeed) 방법에 대한 포스트. +- DeBERTa와 [머신러닝으로 한층 향상된 고객 서비스](https://huggingface.co/blog/supercharge-customer-service-with-machine-learning)에 대한 블로그 포스트. +- [`DebertaForSequenceClassification`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)에서 지원됩니다. +- [`TFDebertaForSequenceClassification`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)에서 지원됩니다. +- [텍스트 분류 작업 가이드](../tasks/sequence_classification) + + + +- [`DebertaForTokenClassification`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)에서 지원합니다. +- [`TFDebertaForTokenClassification`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)에서 지원합니다. +- 🤗 Hugging Face 코스의 [토큰 분류](https://huggingface.co/course/chapter7/2?fw=pt) 장. +- 🤗 Hugging Face 코스의 [BPE(Byte-Pair Encoding) 토큰화](https://huggingface.co/course/chapter6/5?fw=pt) 장. +- [토큰 분류 작업 가이드](../tasks/token_classification) + + + +- [`DebertaForMaskedLM`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)에서 지원합니다. +- [`TFDebertaForMaskedLM`]은 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)에서 지원합니다. +- 🤗 Hugging Face 코스의 [마스크 언어 모델링](https://huggingface.co/course/chapter7/3?fw=pt) 장. +- [마스크 언어 모델링 작업 가이드](../tasks/masked_language_modeling) + + + +- [`DebertaForQuestionAnswering`]은 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)에서 지원합니다. +- [`TFDebertaForQuestionAnswering`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)에서 지원합니다. +- 🤗 Hugging Face 코스의 [질의응답(Question answering)](https://huggingface.co/course/chapter7/7?fw=pt) 장. 
+- [질의응답 작업 가이드](../tasks/question_answering) + +## DebertaConfig[[transformers.DebertaConfig]] + +[[autodoc]] DebertaConfig + +## DebertaTokenizer[[transformers.DebertaTokenizer]] + +[[autodoc]] DebertaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## DebertaTokenizerFast[[transformers.DebertaTokenizerFast]] + +[[autodoc]] DebertaTokenizerFast + - build_inputs_with_special_tokens + - create_token_type_ids_from_sequences + + + + +## DebertaModel[[transformers.DebertaModel]] + +[[autodoc]] DebertaModel + - forward + +## DebertaPreTrainedModel[[transformers.DebertaPreTrainedModel]] + +[[autodoc]] DebertaPreTrainedModel + +## DebertaForMaskedLM[[transformers.DebertaForMaskedLM]] + +[[autodoc]] DebertaForMaskedLM + - forward + +## DebertaForSequenceClassification[[transformers.DebertaForSequenceClassification]] + +[[autodoc]] DebertaForSequenceClassification + - forward + +## DebertaForTokenClassification[[transformers.DebertaForTokenClassification]] + +[[autodoc]] DebertaForTokenClassification + - forward + +## DebertaForQuestionAnswering[[transformers.DebertaForQuestionAnswering]] + +[[autodoc]] DebertaForQuestionAnswering + - forward + + + + +## TFDebertaModel[[transformers.TFDebertaModel]] + +[[autodoc]] TFDebertaModel + - call + +## TFDebertaPreTrainedModel[[transformers.TFDebertaPreTrainedModel]] + +[[autodoc]] TFDebertaPreTrainedModel + - call + +## TFDebertaForMaskedLM[[transformers.TFDebertaForMaskedLM]] + +[[autodoc]] TFDebertaForMaskedLM + - call + +## TFDebertaForSequenceClassification[[transformers.TFDebertaForSequenceClassification]] + +[[autodoc]] TFDebertaForSequenceClassification + - call + +## TFDebertaForTokenClassification[[transformers.TFDebertaForTokenClassification]] + +[[autodoc]] TFDebertaForTokenClassification + - call + +## TFDebertaForQuestionAnswering[[transformers.TFDebertaForQuestionAnswering]] + +[[autodoc]] TFDebertaForQuestionAnswering + - call + + + + diff --git a/docs/source/ko/model_doc/esm.md b/docs/source/ko/model_doc/esm.md new file mode 100644 index 00000000000000..6ea1191dabe0ba --- /dev/null +++ b/docs/source/ko/model_doc/esm.md @@ -0,0 +1,115 @@ + + +# ESM [[esm]] + +## 개요 [[overview]] + +이 페이지는 Meta AI의 Fundamental AI Research 팀에서 제공하는 Transformer 단백질 언어 모델에 대한 코드와 사전 훈련된 가중치를 제공합니다. 여기에는 최첨단인 ESMFold와 ESM-2, 그리고 이전에 공개된 ESM-1b와 ESM-1v가 포함됩니다. Transformer 단백질 언어 모델은 Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, Rob Fergus의 논문 [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118)에서 소개되었습니다. 이 논문의 첫 번째 버전은 2019년에 [출판 전 논문](https://www.biorxiv.org/content/10.1101/622803v1?versioned=true) 형태로 공개되었습니다. + +ESM-2는 다양한 구조 예측 작업에서 테스트된 모든 단일 시퀀스 단백질 언어 모델을 능가하며, 원자 수준의 구조 예측을 가능하게 합니다. 이 모델은 Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives의 논문 [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902)에서 공개되었습니다. + +이 논문에서 함께 소개된 ESMFold는 ESM-2 스템을 사용하며, 최첨단의 정확도로 단백질 접힘 구조를 예측할 수 있는 헤드를 갖추고 있습니다. [AlphaFold2](https://www.nature.com/articles/s41586-021-03819-2)와 달리, 이는 대형 사전 훈련된 단백질 언어 모델 스템의 토큰 임베딩에 의존하며, 추론 시 다중 시퀀스 정렬(MSA) 단계를 수행하지 않습니다. 
이는 ESMFold 체크포인트가 완전히 "독립적"이며, 예측을 위해 알려진 단백질 시퀀스와 구조의 데이터베이스, 그리고 그와 관련 외부 쿼리 도구를 필요로 하지 않는다는 것을 의미합니다. 그리고 그 결과, 훨씬 빠릅니다. + +"Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences"의 초록은 다음과 같습니다: + +*인공지능 분야에서는 대규모의 데이터와 모델 용량을 갖춘 비지도 학습의 조합이 표현 학습과 통계적 생성에서 주요한 발전을 이끌어냈습니다. 생명 과학에서는 시퀀싱 기술의 성장이 예상되며, 자연 시퀀스 다양성에 대한 전례 없는 데이터가 나올 것으로 기대됩니다. 진화적 단계에서 볼 때, 단백질 언어 모델링은 생물학을 위한 예측 및 생성 인공지능을 향한 논리적인 단계에 있습니다. 이를 위해 우리는 진화적 다양성을 아우르는 2억 5천만 개의 단백질 시퀀스에서 추출한 860억 개의 아미노산에 대해 심층 컨텍스트 언어 모델을 비지도 학습으로 훈련합니다. 그 결과 모델은 그 표현에서 생물학적 속성에 대한 정보를 포함합니다. 이 표현은 시퀀스 데이터만으로 학습됩니다. 학습된 표현 공간은 아미노산의 생화학적 특성 수준에서부터 단백질의 원거리 상동성까지 구조를 반영하는 다중 규모의 조직을 가지고 있습니다. 이 표현에는 2차 및 3차 구조에 대한 정보가 인코딩되어 있으며, 선형 전사에 의해 식별 될 수 있습니다. 표현 학습은 돌연변이에 의한 효과와 2차 구조의 최첨단 지도 예측을 가능하게 하고, 넓은 범위의 접촉 부위 예측을 위한 최첨단 특징을 향상시킵니다.* + +"Language models of protein sequences at the scale of evolution enable accurate structure prediction"의 초록은 다음과 같습니다: + +*대형 언어 모델은 최근 규모가 커짐에 따라 긴급한 기능을 개발하여 단순한 패턴 매칭을 넘어 더 높은 수준의 추론을 수행하고 생생한 이미지와 텍스트를 생성하는 것으로 나타났습니다. 더 작은 규모에서 훈련된 단백질 시퀀스의 언어 모델이 연구되었지만, 그들이 규모가 커짐에 따라 생물학에 대해 무엇을 배우는지는 거의 알려져 있지 않습니다. 이 연구에서 우리는 현재까지 평가된 가장 큰 150억 개의 매개변수를 가진 모델을 훈련합니다. 우리는 모델이 규모가 커짐에 따라 단일 아미노산의 해상도로 단백질의 3차원 구조를 예측할 수 있는 정보를 학습한다는 것을 발견했습니다. 우리는 개별 단백질 시퀀스로부터 직접 고정밀 원자 수준의 엔드-투-엔드 구조 예측을 하기 위한 ESMFold를 제시합니다. ESMFold는 언어 모델에 잘 이해되는 낮은 퍼플렉서티를 가진 시퀀스에 대해 AlphaFold2와 RoseTTAFold와 유사한 정확도를 가지고 있습니다. ESMFold의 추론은 AlphaFold2보다 한 자릿수 빠르며, 메타게놈 단백질의 구조적 공간을 실용적인 시간 내에 탐색할 수 있게 합니다.* + +원본 코드는 [여기](https://github.com/facebookresearch/esm)에서 찾을 수 있으며, Meta AI의 Fundamental AI Research 팀에서 개발되었습니다. ESM-1b, ESM-1v, ESM-2는 [jasonliu](https://huggingface.co/jasonliu)와 [Matt](https://huggingface.co/Rocketknight1)에 의해 HuggingFace에 기여되었습니다. + +ESMFold는 [Matt](https://huggingface.co/Rocketknight1)와 [Sylvain](https://huggingface.co/sgugger)에 의해 HuggingFace에 기여되었으며, 이 과정에서 많은 도움을 준 Nikita Smetanin, Roshan Rao, Tom Sercu에게 큰 감사를 드립니다! + +## 사용 팁 [[usage-tips]] + +- ESM 모델은 마스크드 언어 모델링(MLM) 목표로 훈련되었습니다. +- HuggingFace의 ESMFold 포트는 [openfold](https://github.com/aqlaboratory/openfold) 라이브러리의 일부를 사용합니다. `openfold` 라이브러리는 Apache License 2.0에 따라 라이선스가 부여됩니다. 
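+
+## 사용 예시 [[usage-example]]
+
+위 팁에서 언급했듯이 ESM 모델은 마스크드 언어 모델링(MLM) 목표로 학습되었습니다. 아래는 작은 ESM-2 체크포인트로 단백질 시퀀스의 한 위치를 가린 뒤 해당 아미노산을 예측하는 간단한 스케치입니다. 체크포인트 이름(`facebook/esm2_t6_8M_UR50D`)과 시퀀스, 마스킹 위치는 설명을 위한 가정입니다.
+
+```python
+from transformers import AutoTokenizer, EsmForMaskedLM
+import torch
+
+model_name = "facebook/esm2_t6_8M_UR50D"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = EsmForMaskedLM.from_pretrained(model_name)
+
+# 예시용 단백질 시퀀스 (아미노산 문자열)
+sequence = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"
+inputs = tokenizer(sequence, return_tensors="pt")
+
+# 한 위치를 <mask> 토큰으로 교체한 뒤 모델이 원래 아미노산을 예측하도록 합니다
+masked_index = 10
+inputs["input_ids"][0, masked_index] = tokenizer.mask_token_id
+
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+predicted_id = logits[0, masked_index].argmax(dim=-1).item()
+print(tokenizer.convert_ids_to_tokens(predicted_id))
+```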
+ +## 리소스 [[resources]] + +- [텍스트 분류 작업 가이드](../tasks/sequence_classification) +- [토큰 분류 작업 가이드](../tasks/token_classification) +- [마스킹드 언어 모델링 작업 가이드](../tasks/masked_language_modeling) + +## EsmConfig [[transformers.EsmConfig]] + +[[autodoc]] EsmConfig + - all + +## EsmTokenizer [[transformers.EsmTokenizer]] + +[[autodoc]] EsmTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + + + + +## EsmModel [[transformers.EsmModel]] + +[[autodoc]] EsmModel + - forward + +## EsmForMaskedLM [[transformers.EsmForMaskedLM]] + +[[autodoc]] EsmForMaskedLM + - forward + +## EsmForSequenceClassification [[transformers.EsmForSequenceClassification]] + +[[autodoc]] EsmForSequenceClassification + - forward + +## EsmForTokenClassification [[transformers.EsmForTokenClassification]] + +[[autodoc]] EsmForTokenClassification + - forward + +## EsmForProteinFolding [[transformers.EsmForProteinFolding]] + +[[autodoc]] EsmForProteinFolding + - forward + + + + +## TFEsmModel [[transformers.TFEsmModel]] + +[[autodoc]] TFEsmModel + - call + +## TFEsmForMaskedLM [[transformers.TFEsmForMaskedLM]] + +[[autodoc]] TFEsmForMaskedLM + - call + +## TFEsmForSequenceClassification [[transformers.TFEsmForSequenceClassification]] + +[[autodoc]] TFEsmForSequenceClassification + - call + +## TFEsmForTokenClassification [[transformers.TFEsmForTokenClassification]] + +[[autodoc]] TFEsmForTokenClassification + - call + + + diff --git a/docs/source/ko/model_doc/gemma.md b/docs/source/ko/model_doc/gemma.md new file mode 100644 index 00000000000000..25fe6f1c772950 --- /dev/null +++ b/docs/source/ko/model_doc/gemma.md @@ -0,0 +1,76 @@ + + +# Gemma [[gemma]] + +## 개요 [[overview]] + +Gemma 모델은 Google의 Gemma 팀이 작성한 [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/)에서 제안되었습니다. + +Gemma 모델은 6조 토큰으로 학습되었으며, 2b와 7b의 두 가지 버전으로 출시되었습니다. + +논문의 초록은 다음과 같습니다: + +*이 연구는 언어 이해, 추론 및 안전성에 대한 학술 벤치마크에서 뛰어난 성능을 보이는 새로운 오픈 언어 모델 계열인 Gemma를 소개합니다. 우리는 두 가지 크기(20억 및 70억 매개변수)의 모델을 출시하며, 사전 학습된 체크포인트와 미세 조정된 체크포인트를 모두 제공합니다. Gemma는 18개의 텍스트 기반 작업 중 11개에서 유사한 크기의 오픈 모델을 능가하며, 우리는 모델 개발에 대한 상세한 설명과 함께 안전성과 책임 측면에 대한 종합적인 평가를 제공합니다. 우리는 LLM의 책임감 있는 공개가 최첨단 모델의 안전성을 향상시키고 다음 세대의 LLM 혁신을 가능하게 하는 데 중요하다고 믿습니다.* + +팁: + +- 원본 체크포인트는 변환 스크립트 `src/transformers/models/gemma/convert_gemma_weights_to_hf.py`를 사용하여 변환할 수 있습니다. + +이 모델은 [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), [Sanchit Gandhi](https://huggingface.co/sanchit-gandhi), [Pedro Cuenca](https://huggingface.co/pcuenq)가 기여했습니다. 
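+
+## 사용 예시 [[usage-example]]
+
+아래는 변환된(또는 Hub에서 내려받은) 체크포인트로 텍스트를 생성하는 최소한의 예시입니다. `google/gemma-2b`는 예시로 사용한 체크포인트 이름이며, 모델을 내려받으려면 Hub에서 접근 조건에 동의하고 토큰으로 로그인해야 할 수 있습니다. `device_map="auto"`를 사용하려면 `accelerate` 패키지가 필요합니다.
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+model_id = "google/gemma-2b"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
+
+# 프롬프트를 토큰화한 뒤 이어지는 토큰을 생성합니다
+inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=20)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```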
+ +## GemmaConfig [[transformers.GemmaConfig]] + +[[autodoc]] GemmaConfig + +## GemmaTokenizer [[transformers.GemmaTokenizer]] + +[[autodoc]] GemmaTokenizer + + +## GemmaTokenizerFast [[transformers.GemmaTokenizerFast]] + +[[autodoc]] GemmaTokenizerFast + +## GemmaModel [[transformers.GemmaModel]] + +[[autodoc]] GemmaModel + - forward + +## GemmaForCausalLM [[transformers.GemmaForCausalLM]] + +[[autodoc]] GemmaForCausalLM + - forward + +## GemmaForSequenceClassification [[transformers.GemmaForSequenceClassification]] + +[[autodoc]] GemmaForSequenceClassification + - forward + +## GemmaForTokenClassification [[transformers.GemmaForTokenClassification]] + +[[autodoc]] GemmaForTokenClassification + - forward + +## FlaxGemmaModel [[transformers.FlaxGemmaModel]] + +[[autodoc]] FlaxGemmaModel + - __call__ + +## FlaxGemmaForCausalLM [[transformers.FlaxGemmaForCausalLM]] + +[[autodoc]] FlaxGemmaForCausalLM + - __call__ diff --git a/docs/source/ko/model_doc/gpt_neox_japanese.md b/docs/source/ko/model_doc/gpt_neox_japanese.md new file mode 100644 index 00000000000000..13fb656dd50e94 --- /dev/null +++ b/docs/source/ko/model_doc/gpt_neox_japanese.md @@ -0,0 +1,76 @@ + + +# GPT-NeoX-Japanese [[gpt-neox-japanese]] + +## 개요 [[overview]] + + +일본어를 위한 자동회귀 언어 모델인 GPT-NeoX-Japanese를 소개합니다. 이 모델은 [https://github.com/EleutherAI/gpt-neox](https://github.com/EleutherAI/gpt-neox)에서 학습되었습니다. 일본어는 많은 어휘와 히라가나, 가타카나, 한자의 조합으로 이루어진 독특한 언어입니다. 이러한 일본어의 독특한 구조를 해결하기 위해 [특수 서브워드 토크나이저](https://github.com/tanreinama/Japanese-BPEEncoder_V2)를 사용했습니다. 이 유용한 토크나이저를 오픈소스로 제공해 준 *tanreinama*에게 매우 감사드립니다. + +이 모델은 Google의 [PaLM](https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html) 연구 권장 사항을 따르며, 트랜스포머 블록에서 편향 파라미터를 제거하여 모델 성능을 향상시켰습니다. 자세한 내용은 [이 기사](https://medium.com/ml-abeja/training-a-better-gpt-2-93b157662ae4)를 참조하세요. + +모델 개발은 [ABEJA, Inc.](https://www.abejainc.com/)의 [신야 오타니](https://github.com/SO0529), [타카요시 마카베](https://github.com/spider-man-tm), [안주 아로라](https://github.com/Anuj040), [쿄 하토리](https://github.com/go5paopao)에 의해 주도되었습니다. 이 모델 개발 활동에 대한 자세한 내용은 [여기](https://tech-blog.abeja.asia/entry/abeja-gpt-project-202207)를 참조하세요. + + + +### 사용 예시 [[usage-example]] + +`generate()` 메서드를 사용하면 GPT NeoX Japanese 모델을 통해 텍스트를 생성할 수 있습니다. + +```python +>>> from transformers import GPTNeoXJapaneseForCausalLM, GPTNeoXJapaneseTokenizer + +>>> model = GPTNeoXJapaneseForCausalLM.from_pretrained("abeja/gpt-neox-japanese-2.7b") +>>> tokenizer = GPTNeoXJapaneseTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b") + +>>> prompt = "人とAIが協調するためには、" + +>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids + +>>> gen_tokens = model.generate( +... input_ids, +... do_sample=True, +... temperature=0.9, +... max_length=100, +... 
) +>>> gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] + +>>> print(gen_text) +人とAIが協調するためには、AIと人が共存し、AIを正しく理解する必要があります。 +``` + +## 자료 [[resources]] + +- [일상 언어 모델링 작업 가이드 ](../tasks/language_modeling) + +## GPTNeoXJapanese 설정 (GPTNeoXJapaneseConfig) [[transformers.GPTNeoXJapaneseConfig]] + +[[autodoc]] GPTNeoXJapaneseConfig + +## GPTNeoXJapanese토큰화 (GPTNeoXJapaneseTokenizer) [[transformers.GPTNeoXJapaneseTokenizer]] + +[[autodoc]] GPTNeoXJapaneseTokenizer + +## GPTNeoXJapaneseModel [[transformers.GPTNeoXJapaneseModel]] + +[[autodoc]] GPTNeoXJapaneseModel + - forward + +## 일상 LLM 을 위한 GPTNeoXJapanese(GPTNeoXJapaneseForCausalLM) [[transformers.GPTNeoXJapaneseForCausalLM]] + +[[autodoc]] GPTNeoXJapaneseForCausalLM + - forward diff --git a/docs/source/ko/model_doc/graphormer.md b/docs/source/ko/model_doc/graphormer.md new file mode 100644 index 00000000000000..fc6aa2c9e2f7ee --- /dev/null +++ b/docs/source/ko/model_doc/graphormer.md @@ -0,0 +1,52 @@ + + +# Graphormer[[graphormer]] + + + +이 모델은 유지 보수 모드로만 운영되며, 코드를 변경하는 새로운 PR(Pull Request)은 받지 않습니다. +이 모델을 실행하는 데 문제가 발생한다면, 이 모델을 지원하는 마지막 버전인 v4.40.2를 다시 설치해 주세요. 다음 명령어를 실행하여 재설치할 수 있습니다: `pip install -U transformers==4.40.2`. + + + +## 개요[[overview]] + +Graphormer 모델은 Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu가 제안한 [트랜스포머가 그래프 표현에 있어서 정말 약할까?](https://arxiv.org/abs/2106.05234) 라는 논문에서 소개되었습니다. Graphormer는 그래프 트랜스포머 모델입니다. 텍스트 시퀀스 대신 그래프에서 계산을 수행할 수 있도록 수정되었으며, 전처리와 병합 과정에서 임베딩과 관심 특성을 생성한 후 수정된 어텐션을 사용합니다. + +해당 논문의 초록입니다: + +*트랜스포머 아키텍처는 자연어 처리와 컴퓨터 비전 등 많은 분야에서 지배적인 선택을 받고 있는 아키텍처 입니다. 그러나 그래프 수준 예측 리더보드 상에서는 주류 GNN 변형모델들에 비해 경쟁력 있는 성능을 달성하지 못했습니다. 따라서 트랜스포머가 그래프 표현 학습에서 어떻게 잘 수행될 수 있을지는 여전히 미스터리였습니다. 본 논문에서는 Graphormer를 제시함으로써 이 미스터리를 해결합니다. Graphormer는 표준 트랜스포머 아키텍처를 기반으로 구축되었으며, 특히 최근의 OpenGraphBenchmark Large-Scale Challenge(OGB-LSC)의 광범위한 그래프 표현 학습 작업에서 탁월한 결과를 얻을 수 있었습니다. 그래프에서 트랜스포머를 활용하는데 핵심은 그래프의 구조적 정보를 모델에 효과적으로 인코딩하는 것입니다. 이를 위해 우리는 Graphormer가 그래프 구조 데이터를 더 잘 모델링할 수 있도록 돕는 몇 가지 간단하면서도 효과적인 구조적 인코딩 방법을 제안합니다. 또한, 우리는 Graphormer의 표현을 수학적으로 특성화하고, 그래프의 구조적 정보를 인코딩하는 우리의 방식으로 많은 인기 있는 GNN 변형모델들이 Graphormer의 특수한 경우로 포함될 수 있음을 보여줍니다.* + +이 모델은 [clefourrier](https://huggingface.co/clefourrier)가 기여했습니다. 원본 코드는 [이곳](https://github.com/microsoft/Graphormer)에서 확인할 수 있습니다. + +## 사용 팁[[usage-tips]] + +이 모델은 큰 그래프(100개 이상의 노드개수/엣지개수)에서는 메모리 사용량이 폭발적으로 증가하므로 잘 작동하지 않습니다. 대안으로 배치 크기를 줄이거나, RAM을 늘리거나 또는 algos_graphormer.pyx 파일의 `UNREACHABLE_NODE_DISTANCE` 매개변수를 줄이는 방법도 있지만, 700개 이상의 노드개수/엣지개수를 처리하기에는 여전히 어려울 것입니다. + +이 모델은 토크나이저를 사용하지 않고, 대신 훈련 중에 특별한 콜레이터(collator)를 사용합니다. + +## GraphormerConfig[[transformers.GraphormerConfig]] + +[[autodoc]] GraphormerConfig + +## GraphormerModel[[transformers.GraphormerModel]] + +[[autodoc]] GraphormerModel + - forward + +## GraphormerForGraphClassification[[transformers.GraphormerForGraphClassification]] + +[[autodoc]] GraphormerForGraphClassification + - forward diff --git a/docs/source/ko/model_doc/informer.md b/docs/source/ko/model_doc/informer.md new file mode 100644 index 00000000000000..03722545a5625e --- /dev/null +++ b/docs/source/ko/model_doc/informer.md @@ -0,0 +1,55 @@ + + +# Informer[[informer]] + +## 개요[[overview]] + +The Informer 모델은 Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, Wancai Zhang가 제안한 [Informer: 장기 시퀀스 시계열 예측(LSTF)을 위한 더욱 효율적인 트랜스포머(Beyond Efficient Transformer)](https://arxiv.org/abs/2012.07436)라는 논문에서 소개되었습니다. 
+ +이 방법은 확률적 어텐션 메커니즘을 도입하여 "게으른" 쿼리가 아닌 "활성" 쿼리를 선택하고, 희소 트랜스포머를 제공하여 기존 어텐션의 이차적 계산 및 메모리 요구사항을 완화합니다. + +해당 논문의 초록입니다: + +*실제로 많은 응용프로그램에서는 장기 시퀀스 시계열 예측(LSTF)을 필요로 합니다. LSTF는 출력 - 입력 간 정확한 장기 의존성 결합도를 포착해내는 높은 예측 능력을 모델에 요구합니다. 최근 연구들은 예측 능력을 향상시킬 수 있는 트랜스포머의 잠재력을 보여주고 있습니다. 그러나, 트랜스포머를 LSTF에 직접 적용하지 못하도록 막는 몇 심각한 문제점들이 있습니다. 예로, 이차 시간 복잡도, 높은 메모리 사용량, 인코더-디코더 아키텍처의 본질적 한계를 들 수 있습니다. 이러한 문제를 해결하기 위해 LSTF를 위한 효율적인 트랜스포머 기반 모델인 Informer를 설계했습니다. + +Informer의 세가지 독특한 특성: +(i) ProbSparse 셀프 어텐션 메커니즘으로, 시간 복잡도와 메모리 사용량에서 O(L logL)를 달성하며 시퀀스 의존성 정렬에서 비교 가능한 성능을 보입니다. +(ii) 셀프 어텐션 증류는 계단식 레이어 입력을 반으로 줄여 지배적인 어텐션을 강조하고 극단적으로 긴 입력 시퀀스를 효율적으로 처리합니다. +(iii) 생성 스타일 디코더는 개념적으로 단순하지만 장기 시계열 시퀀스를 단계별 방식이 아닌 한 번의 전방 연산으로 예측하여 장기 시퀀스 예측의 추론 속도를 크게 향상시킵니다. 4개의 대규모 데이터셋에 걸친 광범위한 실험은 Informer가 기존 방법들을 크게 능가하며 LSTF 문제에 새로운 해결책을 제공함을 보여줍니다.* + +이 모델은 [elisim](https://huggingface.co/elisim)와 [kashif](https://huggingface.co/kashif)가 기여했습니다. +원본 코드는 [이곳](https://github.com/zhouhaoyi/Informer2020)에서 확인할 수 있습니다. + +## 자료[[resources]] + +시작하는 데 도움이 되는 Hugging Face와 community 자료 목록(🌎로 표시됨) 입니다. 여기에 포함될 자료를 제출하고 싶으시다면 PR(Pull Request)를 열어주세요. 리뷰 해드리겠습니다! 자료는 기존 자료를 복제하는 대신 새로운 내용을 담고 있어야 합니다. + +- HuggingFace 블로그에서 Informer 포스트를 확인하세요: [Informer를 활용한 다변량 확률적 시계열 예측](https://huggingface.co/blog/informer) + +## InformerConfig[[transformers.InformerConfig]] + +[[autodoc]] InformerConfig + +## InformerModel[[transformers.InformerModel]] + +[[autodoc]] InformerModel + - forward + +## InformerForPrediction[[transformers.InformerForPrediction]] + +[[autodoc]] InformerForPrediction + - forward \ No newline at end of file diff --git a/docs/source/ko/model_doc/llama3.md b/docs/source/ko/model_doc/llama3.md new file mode 100644 index 00000000000000..ca819bfcabaf88 --- /dev/null +++ b/docs/source/ko/model_doc/llama3.md @@ -0,0 +1,80 @@ + + +# Llama3[[llama3]] + +```py3 +import transformers +import torch + +model_id = "meta-llama/Meta-Llama-3-8B" + +pipeline = transformers.pipeline("text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto") +pipeline("Hey how are you doing today?") +``` + +## 개요[[overview]] + +라마3 모델은 Meta AI 팀이 제안한 [메타 라마3 소개: 현재까지 가장 유능한 공개 가능 LLM](https://ai.meta.com/blog/meta-llama-3/)에서 소개되었습니다. + +해당 블로그 포스트의 초록입니다: + +*오늘, 광범위한 사용을 위해 이용 가능한 라마의 차세대 모델인 메타 라마3의 첫 두 모델을 공유하게 되어 기쁩니다. 이번 출시는 8B와 70B 매개변수를 가진 사전 훈련 및 지시 미세 조정된 언어 모델을 특징으로 하며, 광범위한 사용 사례를 지원할 수 있습니다. 라마의 이 차세대 모델은 다양한 산업 벤치마크에서 최첨단의 성능을 보여주며, 개선된 추론 능력을 포함한 새로운 기능을 제공합니다. 우리는 이것들이 단연코 해당 클래스에서 최고의 오픈 소스 모델이라고 믿습니다. 오랜 개방적 접근 방식을 지지하며, 우리는 라마3를 커뮤니티 기여자들에게 맡기고 있습니다. 애플리케이션에서 개발자 도구, 평가, 추론 최적화 등에 이르기까지 AI 스택 전반에 걸친 다음 혁신의 물결을 촉발하길 희망합니다. 여러분이 무엇을 만들지 기대하며 여러분의 피드백을 고대합니다.* + +라마3 모델의 모든 체크포인트는 [이곳](https://huggingface.co/models?search=llama3)에서 확인하세요. +원본 코드는 [이곳](https://github.com/meta-llama/llama3)에서 확인할 수 있습니다. + +## 사용 팁[[usage-tips]] + + + +`라마3` 모델들은 `bfloat16`를 사용하여 훈련되었지만, 원래의 추론은 `float16`을 사용합니다. Hub에 업로드된 체크포인트들은 `torch_dtype = 'float16'`을 사용하는데, 이는 `AutoModel` API가 체크포인트를 `torch.float32`에서 `torch.float16`으로 변환하는데 이용됩니다. + + `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`를 사용하여 모델을 초기화할 때, 온라인 가중치의 `dtype`는 `torch_dtype="auto"`를 사용하지 않는 한 대부분 무관합니다. 그 이유는 모델이 먼저 다운로드되고(온라인 체크포인트의 `dtype`를 사용), 그 다음 `torch`의 `dtype`으로 변환되어(`torch.float32`가 됨), 마지막으로 config에 `torch_dtype`이 제공된 경우 가중치가 사용되기 때문입니다. + +`float16`으로 모델을 훈련하는 것은 권장되지 않으며 `nan`을 생성하는 것으로 알려져 있습니다. 따라서 모든 모델은 `bfloat16`으로 훈련되어야 합니다. 
+ + + +팁: + +- 라마3 모델을 위한 가중치는 [이 폼](https://ai.meta.com/resources/models-and-libraries/llama-downloads/)을 채우면서 얻어져야 합니다. +- 아키텍처는 라마2와 정확히 같습니다. +- 토크나이저는 [tiktoken](https://github.com/openai/tiktoken) (sentencepiece 구현에 기반한 라마2 와는 다르게)에 기반한 BPE 모델입니다. tiktoken 기반 토크나이저가 sebtencepiece 기반 방식과 다른점은 입력 토큰이 vocab에 이미 존재할 때 BPE 병합 룰을 무시하고 싱글 토큰으로 토크나이징한다는 점에서 가장 큰 차이를 보입니다. 자세히 말하면 `"hugging"`이 vocab에 존재하고 기존에 병합이 존재하지 않으면, `["hug","ging"]` 처럼 두 토큰으로 더 작은 단위의 단어를 가지는 것이 아니라, 하나의 토큰만을 자동으로 리턴하는 것을 의미합니다. +- 기본 모델은 패딩 토큰이 없다는 것을 의미하는 `pad_id = -1`을 사용합니다. 같은 로직을 사용할 수 없으니 `tokenizer.add_special_tokens({"pad_token":""})`를 사용하여 토큰을 추가하고 임베딩 크기도 확실히 조정해야 합니다. `model.config.pad_token_id`도 설정이 필요합니다. 모델의 `embed_tokens` 레이어는 `self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.config.padding_idx)`로 초기화되며, 패딩 토큰을 인코딩하는 것이 0(zero)를 출력하게 할 것인지 그래서 초기화가 추천될때 이를 통화시킬 것인지를 정하게 합니다. +- 원본 체크포인트는 이 [변환 스크립트](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py)를 이용해서 변환 가능합니다. 스크립트는 다음 명령어로 호출할 수 있습니다: + + ```bash + python src/transformers/models/llama/convert_llama_weights_to_hf.py \ + --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path --llama_version 3 + ``` + +- 변환 후, 모델과 토크나이저는 다음을 통해 로드된다. + + ```python + from transformers import AutoModelForCausalLM, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("/output/path") + model = AutoModelForCausalLM.from_pretrained("/output/path") + ``` + + 이 스크립트를 실행시키려면 모델 전체를 float16 정밀도로 호스팅할 수 있는 충분한 메인메모리가 필요하다는 점을 유의하세요. 가장 큰 버전이 여러 체크포인트로 나뉘어 있더라도, 각 체크포인트가 모델의 가중치 일부를 포함하고 있기 때문에 이를 모두 RAM에 로드해야 합니다. 75B 모델을 예로 들면 대략 145GB의 RAM이 필요합니다. + +- `attn_implementation="flash_attention_2"`를 통해서 플래시 어텐션2를 사용할 때, `from_pretrained` 클래스 메서드에 `torch_dtype`를 전달하지 말고 자동 혼합 정밀도(Automatic Mixed-Precision) 학습을 사용하세요. `Trainer`를 사용할 때는 단순히 `fp16` 또는 `bf16`을 `True`로 설정하면 됩니다. 그렇지 않으면 반드시 `torch.autocast`를 사용해야 합니다. 플래시 어텐션은 `fp16`과 `bf16` 데이터 유형만 지원하기 때문입니다. + +## 자료[[resources]] + +[라마2](./llama2) 문서 페이지에서는 이미 수 많은 멋지고 유익한 자료들을 제공하고 있습니다. 이곳에 라마3에 대한 새로운 자료를 더해주실 컨트리뷰터들을 초대합니다! 🤗 diff --git a/docs/source/ko/model_doc/mamba.md b/docs/source/ko/model_doc/mamba.md new file mode 100644 index 00000000000000..404b9b06b41353 --- /dev/null +++ b/docs/source/ko/model_doc/mamba.md @@ -0,0 +1,105 @@ + + +# 맘바[[mamba]] + +## 개요[[overview]] + +맘바(Mamba) 모델은 Albert Gu, Tri Dao가 제안한 [맘바: 선택적 상태 공간을 이용한 선형 시간 시퀀스 모델링](https://arxiv.org/abs/2312.00752)라는 논문에서 소개 되었습니다. + +이 모델은 `state-space-models`을 기반으로 한 새로운 패러다임 아키텍처입니다. 직관적인 이해를 얻고 싶다면 [이곳](https://srush.github.io/annotated-s4/)을 참고 하세요. + +해당 논문의 초록입니다: + +*현재 딥러닝에서 흥미로운 응용 프로그램을 구동하는 대부분의 기초 모델들은 거의 보편적으로 트랜스포머 아키텍처와 그 핵심 어텐션 모듈을 기반으로 합니다. 선형 어텐션, 게이트된 컨볼루션과 순환 모델, 구조화된 상태 공간 모델(SSM) 등 많은 준이차시간(subquadratic-time) 아키텍처가 긴 시퀀스에 대한 트랜스포머의 계산 비효율성을 해결하기 위해 개발되었지만, 언어와 같은 중요한 양식에서는 어텐션만큼 성능을 내지 못했습니다. 우리는 이러한 모델의 주요 약점이 내용 기반 추론을 수행하지 못한다는 점임을 알고 몇 가지를 개선했습니다. 첫째, SSM 매개변수를 입력의 함수로 만드는 것만으로도 이산 모달리티(discrete modalities)의 약점을 해결할 수 있어, 현재 토큰에 따라 시퀀스 길이 차원을 따라 정보를 선택적으로 전파하거나 잊을 수 있게 합니다. 둘째, 이러한 변경으로 효율적인 컨볼루션을 사용할 수 없게 되었지만, 우리는 순환 모드에서 하드웨어를 인식하는 병렬 알고리즘을 설계했습니다. 우리는 이러한 선택적 SSM을 어텐션이나 MLP 블록도 없는 단순화된 종단간 신경망 아키텍처인 맘바에 통합시켰습니다. 맘바는 빠른 추론(트랜스포머보다 5배 높은 처리량)과 시퀀스 길이에 대한 선형 확장성을 누리며, 백만 길이 시퀀스까지 실제 데이터에서 성능이 향상됩니다. 일반적인 시퀀스 모델 백본으로서 맘바는 언어, 오디오, 유전체학과 같은 여러 양식에서 최첨단 성능을 달성합니다. 
언어 모델링에서 우리의 맘바-3B 모델은 같은 크기의 트랜스포머를 능가하고 두 배 크기의 트랜스포머와 맞먹는 성능을 보이며, 사전 훈련과 다운스트림 평가 모두에서 성능을 나타납니다.* + +팁: + +- 맘바는 고전적인 트랜스포머와 견줄 만한 새로운 `상태 공간 모델` 아키텍처입니다. 이는 구조화된 상태 공간 모델의 발전 선상에 있으며, [플래시어텐션](https://github.com/Dao-AILab/flash-attention)의 정신을 따르는 효율적인 하드웨어 인식 설계와 구현을 특징으로 합니다. +- 맘바는 `어텐션` 레이어와 동등한 `믹서(mixer)` 레이어를 쌓습니다. `맘바`의 핵심 로직은 `MambaMixer` 클래스에 있습니다. +- 두 가지 구현이 공존합니다: 하나는 최적화되어 빠른 cuda커널을 사용하고, 다른 하나는 단순하지만 모든 장치에서 실행할 수 있습니다! +- 현재 구현은 원본 cuda커널을 활용합니다: 맘바를 위한 플래시 어텐션의 역할을 하는 것은 [`mamba-ssm`](https://github.com/state-spaces/mamba)와 [`causal_conv1d`](https://github.com/Dao-AILab/causal-conv1d) 저장소에 호스팅되어 있습니다. 하드웨어가 지원한다면 반드시 설치하세요! +- cuda 커널을 최적화하는 방향 보다는, 단순하지만 모든 장치에서 실행가능하도록하는 방향인 '단순구현'의 성능을 빠르게 향상시키는 기여를 더 환영하고 있습니다. 🤗 + +이 모델은 [ArthurZ](https://huggingface.co/ArthurZ)에 의해 기여되었습니다. +원본 코드는 [이곳](https://github.com/state-spaces/mamba)에서 확인할 수 있습니다. + +# 사용 + +### 간단한 생성 예제 + +```python +from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer +import torch + +tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf") +model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf") +input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"] + +out = model.generate(input_ids, max_new_tokens=10) +print(tokenizer.batch_decode(out)) +``` + +### Peft 파인튜닝 +느린 버전은 학습에서 아주 안정적이진 않습니다. 빠른 버전은 `float32`가 필요합니다! + +```python +from datasets import load_dataset +from trl import SFTTrainer +from peft import LoraConfig +from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments +model_id = "state-spaces/mamba-130m-hf" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id) +dataset = load_dataset("Abirate/english_quotes", split="train") +training_args = TrainingArguments( + output_dir="./results", + num_train_epochs=3, + per_device_train_batch_size=4, + logging_dir='./logs', + logging_steps=10, + learning_rate=2e-3 +) +lora_config = LoraConfig( + r=8, + target_modules=["x_proj", "embeddings", "in_proj", "out_proj"], + task_type="CAUSAL_LM", + bias="none" +) +trainer = SFTTrainer( + model=model, + tokenizer=tokenizer, + args=training_args, + peft_config=lora_config, + train_dataset=dataset, + dataset_text_field="quote", +) +trainer.train() +``` + +## MambaConfig + +[[autodoc]] MambaConfig + +## MambaModel + +[[autodoc]] MambaModel + - forward + +## MambaLMHeadModel + +[[autodoc]] MambaForCausalLM + - forward diff --git a/docs/source/ko/model_doc/mamba2.md b/docs/source/ko/model_doc/mamba2.md new file mode 100644 index 00000000000000..b023914bc1276a --- /dev/null +++ b/docs/source/ko/model_doc/mamba2.md @@ -0,0 +1,111 @@ + + +# 맘바2[[mamba-2]] + +## 개요[[overview]] + +맘바2 모델은 Tri Dao, Albert Gu가 제안한 [트랜스포머는 SSM이다: 구조화된 상태 공간 이중성을 통한 일반화된 모델과 효율적인 알고리즘](https://arxiv.org/abs/2405.21060)라는 논문에서 소개되었습니다. 맘바2는 맘바1과 유사한 상태 공간 모델로, 단순화된 아키텍처에서 더 나은 성능을 보입니다. + +해당 논문의 초록입니다: + +*트랜스포머는 언어 모델링에서 딥러닝 성공의 주요 아키텍처였지만, 맘바와 같은 상태 공간 모델(SSM)이 최근 소규모 혹은 중간 규모에서 트랜스포머와 대등하거나 더 나은 성능을 보이는 것으로 나타났습니다. 우리는 이러한 모델 계열들이 실제로 매우 밀접하게 연관되어 있음을 파악했습니다. 그리고 구조화된 준분리(semiseparable) 행렬 중 연구가 잘 이루어진 클래스의 다양한 분해를 통해 연결된 SSM과 어텐션 변형 사이의 풍부한 이론적 연결 프레임워크를 개발했습니다. 상태 공간 이중성(SSD) 프레임워크를 통해 맘바1의 선택적 SSM을 개선한 새로운 아키텍처를 설계할 수 있었고, 트랜스포머와 경쟁력을 유지하면서도 속도는 2~8배 더 빠른 성능을 냅니다.* + +팁: + +이 버전은 맘바2 구현을 지원해야 하며, 특히 Mistral AI의 [Mamba-2 codestral](https://huggingface.co/mistralai/Mamba-Codestral-7B-v0.1)을 지원합니다. 특히, mamba 2 codestral은 8개의 `groups`로 출시되었는데, 이는 어텐션 기반 모델의 KV 헤드 수와 유사하다고 판단 가능합니다. 
+ +이 모델은 `torch_forward`와 `cuda_kernels_forward`라는 두 가지 다른 전방 패스를 가집니다. `cuda_kernels_forward`는 환경에서 cuda 커널을 찾으면 이를 사용하며, prefill에서는 더 느립니다. 즉, 높은 CPU 오버헤드로 인해 "웜업 실행"이 필요하기 때문입니다. 관련 내용은 [이곳](https://github.com/state-spaces/mamba/issues/389#issuecomment-2171755306)과 [이곳](https://github.com/state-spaces/mamba/issues/355#issuecomment-2147597457)을 참고하세요. + +컴파일 없이는 `torch_forward` 구현이 3~4배 빠릅니다. 또한, 이 모델에는 위치 임베딩이 없지만 `attention_mask`와 배치 생성의 경우 두 곳에서 은닉 상태(hidden state)를 마스킹하는 특정 로직이 있습니다. 관련 내용은 [이곳](https://github.com/state-spaces/mamba/issues/66#issuecomment-1863563829)을 참고하세요. + +이로인해 맘바2 커널의 재구현과 함께 배치 생성 및 캐시된 생성에서 약간의 차이가 예상됩니다. 또한 cuda 커널 또는 torch forward가 제공하는 결과가 약간 다를 것으로 예상됩니다. SSM 알고리즘은 텐서 수축에 크게 의존하는데, 이는 matmul과 동등하지만 연산 순서가 약간 다르며, 이로 인해 더 작은 정밀도에서 차이가 더 커집니다. + +또 다른 참고사항으로, 패딩 토큰에 해당하는 은닉 상태(hidden state)의 종료는 두 곳에서 이루어지며 주로 왼쪽 패딩으로 테스트되었습니다. 오른쪽 패딩은 노이즈를 전파하므로 만족스러운 결과를 보장하지 않습니다. `tokenizer.padding_side = "left"`를 사용하면 올바른 패딩 방향을 사용할 수 있습니다. + +이 모델은 [Molbap](https://huggingface.co/Molbap)이 기여했으며, [Anton Vlasjuk](https://github.com/vasqu)의 큰 도움을 받았습니다. +원본 코드는 [이곳](https://github.com/state-spaces/mamba)에서 확인할 수 있습니다. + + +# 사용 + +### 간단한 생성 예: +```python +from transformers import Mamba2Config, Mamba2ForCausalLM, AutoTokenizer +import torch +model_id = 'mistralai/Mamba-Codestral-7B-v0.1' +tokenizer = AutoTokenizer.from_pretrained(model_id, revision='refs/pr/9', from_slow=True, legacy=False) +model = Mamba2ForCausalLM.from_pretrained(model_id, revision='refs/pr/9') +input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"] + +out = model.generate(input_ids, max_new_tokens=10) +print(tokenizer.batch_decode(out)) +``` + +이곳은 미세조정을 위한 초안 스크립트입니다: +```python +from trl import SFTTrainer +from peft import LoraConfig +from transformers import AutoTokenizer, Mamba2ForCausalLM, TrainingArguments +model_id = 'mistralai/Mamba-Codestral-7B-v0.1' +tokenizer = AutoTokenizer.from_pretrained(model_id, revision='refs/pr/9', from_slow=True, legacy=False) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "left" #왼쪽 패딩으로 설정 + +model = Mamba2ForCausalLM.from_pretrained(model_id, revision='refs/pr/9') +dataset = load_dataset("Abirate/english_quotes", split="train") +# CUDA 커널없이는, 배치크기 2가 80GB 장치를 하나 차지합니다. +# 하지만 정확도는 감소합니다. +# 실험과 시도를 환영합니다! 
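+# 참고: 위의 load_dataset() 사용을 위해서는 `from datasets import load_dataset` 임포트가 필요합니다.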
+training_args = TrainingArguments( + output_dir="./results", + num_train_epochs=3, + per_device_train_batch_size=2, + logging_dir='./logs', + logging_steps=10, + learning_rate=2e-3 +) +lora_config = LoraConfig( + r=8, + target_modules=["embeddings", "in_proj", "out_proj"], + task_type="CAUSAL_LM", + bias="none" +) +trainer = SFTTrainer( + model=model, + tokenizer=tokenizer, + args=training_args, + peft_config=lora_config, + train_dataset=dataset, + dataset_text_field="quote", +) +trainer.train() +``` + + +## Mamba2Config + +[[autodoc]] Mamba2Config + +## Mamba2Model + +[[autodoc]] Mamba2Model + - forward + +## Mamba2LMHeadModel + +[[autodoc]] Mamba2ForCausalLM + - forward diff --git a/docs/source/ko/model_doc/mistral.md b/docs/source/ko/model_doc/mistral.md new file mode 100644 index 00000000000000..4c9f2d7a77d4a2 --- /dev/null +++ b/docs/source/ko/model_doc/mistral.md @@ -0,0 +1,233 @@ + + +# Mistral[[mistral]] + +## 개요[[overview]] + +미스트랄은 Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed가 작성한 [이 블로그 포스트](https://mistral.ai/news/announcing-mistral-7b/)에서 소개되었습니다. + +블로그 포스트의 서두는 다음과 같습니다: + +*미스트랄 AI팀은 현존하는 언어 모델 중 크기 대비 가장 강력한 미스트랄7B를 출시하게 되어 자랑스럽습니다.* + +미스트랄-7B는 [mistral.ai](https://mistral.ai/)에서 출시한 첫 번째 대규모 언어 모델(LLM)입니다. + +### 아키텍처 세부사항[[architectural-details]] + +미스트랄-7B는 다음과 같은 구조적 특징을 가진 디코더 전용 트랜스포머입니다: + +- 슬라이딩 윈도우 어텐션: 8k 컨텍스트 길이와 고정 캐시 크기로 훈련되었으며, 이론상 128K 토큰의 어텐션 범위를 가집니다. +- GQA(Grouped Query Attention): 더 빠른 추론이 가능하고 더 작은 크기의 캐시를 사용합니다. +- 바이트 폴백(Byte-fallback) BPE 토크나이저: 문자들이 절대 어휘 목록 외의 토큰으로 매핑되지 않도록 보장합니다. + +더 자세한 내용은 [출시 블로그 포스트](https://mistral.ai/news/announcing-mistral-7b/)를 참조하세요. + +### 라이선스[[license]] + +`미스트랄-7B`는 아파치 2.0 라이선스로 출시되었습니다. + +## 사용 팁[[usage-tips]] + +미스트랄 AI팀은 다음 3가지 체크포인트를 공개했습니다: + +- 기본 모델인 [미스트랄-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)은 인터넷 규모의 데이터에서 다음 토큰을 예측하도록 사전 훈련되었습니다. +- 지시 조정 모델인 [미스트랄-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)은 지도 미세 조정(SFT)과 직접 선호도 최적화(DPO)를 사용한 채팅에 최적화된 기본 모델입니다. +- 개선된 지시 조정 모델인 [미스트랄-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)는 v1을 개선한 버전입니다. + +기본 모델은 다음과 같이 사용할 수 있습니다: + +```python +>>> from transformers import AutoModelForCausalLM, AutoTokenizer + +>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + +>>> prompt = "My favourite condiment is" + +>>> model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda") +>>> model.to(device) + +>>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True) +>>> tokenizer.batch_decode(generated_ids)[0] +"My favourite condiment is to ..." +``` + +지시 조정 모델은 다음과 같이 사용할 수 있습니다: + +```python +>>> from transformers import AutoModelForCausalLM, AutoTokenizer + +>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") + +>>> messages = [ +... {"role": "user", "content": "What is your favourite condiment?"}, +... {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. 
It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"}, +... {"role": "user", "content": "Do you have mayonnaise recipes?"} +... ] + +>>> model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda") + +>>> generated_ids = model.generate(model_inputs, max_new_tokens=100, do_sample=True) +>>> tokenizer.batch_decode(generated_ids)[0] +"Mayonnaise can be made as follows: (...)" +``` + +지시 조정 모델은 입력이 올바른 형식으로 준비되도록 [채팅 템플릿](../chat_templating)을 적용해야 합니다. + +## 플래시 어텐션을 이용한 미스트랄 속도향상[[speeding-up-mistral-by-using-flash-attention]] + +위의 코드 스니펫들은 어떤 최적화 기법도 사용하지 않은 추론 과정을 보여줍니다. 하지만 모델 내부에서 사용되는 어텐션 메커니즘의 더 빠른 구현인 [플래시 어텐션2](../perf_train_gpu_one.md#flash-attention-2)을 활용하면 모델의 속도를 크게 높일 수 있습니다. + +먼저, 슬라이딩 윈도우 어텐션 기능을 포함하는 플래시 어텐션2의 최신 버전을 설치해야 합니다. + +```bash +pip install -U flash-attn --no-build-isolation +``` + +하드웨어와 플래시 어텐션2의 호환여부를 확인하세요. 이에 대한 자세한 내용은 [플래시 어텐션 저장소](https://github.com/Dao-AILab/flash-attention)의 공식 문서에서 확인할 수 있습니다. 또한 모델을 반정밀도(예: `torch.float16`)로 불러와야합니다. + +플래시 어텐션2를 사용하여 모델을 불러오고 실행하려면 아래 코드 스니펫을 참조하세요: + +```python +>>> import torch +>>> from transformers import AutoModelForCausalLM, AutoTokenizer + +>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, attn_implementation="flash_attention_2", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + +>>> prompt = "My favourite condiment is" + +>>> model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda") +>>> model.to(device) + +>>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True) +>>> tokenizer.batch_decode(generated_ids)[0] +"My favourite condiment is to (...)" +``` + +### 기대하는 속도 향상[[expected-speedups]] + +다음은 `mistralai/Mistral-7B-v0.1` 체크포인트를 사용한 트랜스포머의 기본 구현과 플래시 어텐션2 버전 모델 사이의 순수 추론 시간을 비교한 예상 속도 향상 다이어그램입니다. + +
+ +
+ +### 슬라이딩 윈도우 어텐션[[sliding-window-attention]] + +현재 구현은 슬라이딩 윈도우 어텐션 메커니즘과 메모리 효율적인 캐시 관리 기능을 지원합니다. 슬라이딩 윈도우 어텐션을 활성화하려면, 슬라이딩 윈도우 어텐션과 호환되는`flash-attn`(`>=2.3.0`)버전을 사용하면 됩니다. + +또한 플래시 어텐션2 모델은 더 메모리 효율적인 캐시 슬라이싱 메커니즘을 사용합니다. 미스트랄 모델의 공식 구현에서 권장하는 롤링 캐시 메커니즘을 따라, 캐시 크기를 고정(`self.config.sliding_window`)으로 유지하고, `padding_side="left"`인 경우에만 배치 생성(batch generation)을 지원하며, 현재 토큰의 절대 위치를 사용해 위치 임베딩을 계산합니다. + +## 양자화로 미스트랄 크기 줄이기[[shrinking-down-mistral-using-quantization]] + +미스트랄 모델은 70억 개의 파라미터를 가지고 있어, 절반의 정밀도(float16)로 약 14GB의 GPU RAM이 필요합니다. 각 파라미터가 2바이트로 저장되기 때문입니다. 하지만 [양자화](../quantization.md)를 사용하면 모델 크기를 줄일 수 있습니다. 모델을 4비트(즉, 파라미터당 반 바이트)로 양자화하면 약 3.5GB의 RAM만 필요합니다. + +모델을 양자화하는 것은 `quantization_config`를 모델에 전달하는 것만큼 간단합니다. 아래에서는 BitsAndBytes 양자화를 사용하지만, 다른 양자화 방법은 [이 페이지](../quantization.md)를 참고하세요: + +```python +>>> import torch +>>> from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +>>> # specify how to quantize the model +>>> quantization_config = BitsAndBytesConfig( +... load_in_4bit=True, +... bnb_4bit_quant_type="nf4", +... bnb_4bit_compute_dtype="torch.float16", +... ) + +>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", quantization_config=True, device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") + +>>> prompt = "My favourite condiment is" + +>>> messages = [ +... {"role": "user", "content": "What is your favourite condiment?"}, +... {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"}, +... {"role": "user", "content": "Do you have mayonnaise recipes?"} +... ] + +>>> model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda") + +>>> generated_ids = model.generate(model_inputs, max_new_tokens=100, do_sample=True) +>>> tokenizer.batch_decode(generated_ids)[0] +"The expected output" +``` + +이 모델은 [Younes Belkada](https://huggingface.co/ybelkada)와 [Arthur Zucker](https://huggingface.co/ArthurZ)가 기여했습니다. +원본 코드는 [이곳](https://github.com/mistralai/mistral-src)에서 확인할 수 있습니다. + +## 리소스[[resources]] + +미스트랄을 시작하는 데 도움이 되는 Hugging Face와 community 자료 목록(🌎로 표시됨) 입니다. 여기에 포함될 자료를 제출하고 싶으시다면 PR(Pull Request)를 열어주세요. 리뷰해 드리겠습니다! 자료는 기존 자료를 복제하는 대신 새로운 내용을 담고 있어야 합니다. + + + +- 미스트랄-7B의 지도형 미세조정(SFT)을 수행하는 데모 노트북은 [이곳](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Mistral/Supervised_fine_tuning_(SFT)_of_an_LLM_using_Hugging_Face_tooling.ipynb)에서 확인할 수 있습니다. 🌎 +- 2024년에 Hugging Face 도구를 사용해 LLM을 미세 조정하는 방법에 대한 [블로그 포스트](https://www.philschmid.de/fine-tune-llms-in-2024-with-trl). 🌎 +- Hugging Face의 [정렬(Alignment) 핸드북](https://github.com/huggingface/alignment-handbook)에는 미스트랄-7B를 사용한 지도형 미세 조정(SFT) 및 직접 선호 최적화(DPO)를 수행하기 위한 스크립트와 레시피가 포함되어 있습니다. 여기에는 단일 GPU에서 QLoRa 및 다중 GPU를 사용한 전체 미세 조정을 위한 스크립트가 포함되어 있습니다. 
+- [인과적 언어 모델링 작업 가이드](../tasks/language_modeling) + +## MistralConfig[[transformers.MistralConfig]] + +[[autodoc]] MistralConfig + +## MistralModel[[transformers.MistralModel]] + +[[autodoc]] MistralModel + - forward + +## MistralForCausalLM[[transformers.MistralForCausalLM]] + +[[autodoc]] MistralForCausalLM + - forward + +## MistralForSequenceClassification[[transformers.MistralForSequenceClassification]] + +[[autodoc]] MistralForSequenceClassification + - forward + +## MistralForTokenClassification[[transformers.MistralForTokenClassification]] + +[[autodoc]] MistralForTokenClassification + - forward + +## FlaxMistralModel[[transformers.FlaxMistralModel]] + +[[autodoc]] FlaxMistralModel + - __call__ + +## FlaxMistralForCausalLM[[transformers.FlaxMistralForCausalLM]] + +[[autodoc]] FlaxMistralForCausalLM + - __call__ + +## TFMistralModel[[transformers.TFMistralModel]] + +[[autodoc]] TFMistralModel + - call + +## TFMistralForCausalLM[[transformers.TFMistralForCausalLM]] + +[[autodoc]] TFMistralForCausalLM + - call + +## TFMistralForSequenceClassification[[transformers.TFMistralForSequenceClassification]] + +[[autodoc]] TFMistralForSequenceClassification + - call \ No newline at end of file diff --git a/docs/source/ko/model_doc/openai-gpt.md b/docs/source/ko/model_doc/openai-gpt.md new file mode 100644 index 00000000000000..7e8f57c743bf71 --- /dev/null +++ b/docs/source/ko/model_doc/openai-gpt.md @@ -0,0 +1,149 @@ + + +# OpenAI GPT [[openai-gpt]] + +
+ +Models + + +Spaces + +
+ +## 개요 [[overview]] + +OpenAI GPT 모델은 Alec Radford, Karthik Narasimhan, Tim Salimans, Ilya Sutskever가 작성한 [Improving Language Understanding by Generative Pre-Training](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf) 논문에서 제안되었습니다. 이는 Toronto Book Corpus와 같은 장기 의존성을 가진 대규모 말뭉치를 사용하여 언어 모델링으로 사전 학습된 인과적(단방향) 트랜스포머입니다. + +논문의 초록은 다음과 같습니다: + +*자연어 이해는 텍스트 함의, 질문 응답, 의미 유사성 평가, 문서 분류와 같은 다양한 작업을 포함합니다. 비록 대규모의 레이블이 없는 텍스트 말뭉치가 풍부하기는 하지만, 이러한 특정 작업에 대한 학습을 위한 레이블된 데이터는 부족하여 판별적으로 학습된 모델이 적절하게 성능을 발휘하기 어렵습니다. 우리는 다양한 레이블이 없는 텍스트 말뭉치에 대한 언어 모델의 생성적 사전 학습을 수행하고, 각 특정 과제에 대한 판별적 미세 조정을 수행함으로써 이러한 과제에서 큰 성과를 달성할 수 있음을 보여줍니다. 이전 접근 방식과 달리, 우리는 모델 아키텍처에 최소한의 변화를 요구하면서 효과적인 전이를 달성하기 위해 미세 조정 중에 과제 인식 입력 변환(task-aware input transformation)을 사용합니다. 우리는 자연어 이해를 위한 다양한 벤치마크에서 우리의 접근 방식의 효과를 입증합니다. 우리의 general task-agnostic 모델은 각 과제에 특별히 설계된 아키텍처를 사용하는 판별적으로 학습된 모델보다 뛰어나며, 연구된 12개 과제 중 9개 부문에서 최첨단 성능(state of the art)을 크게 향상시킵니다.* + +[Write With Transformer](https://transformer.huggingface.co/doc/gpt)는 Hugging Face가 만든 웹 애플리케이션으로, 여러 모델의 생성 능력을 보여주며 그 중에는 GPT도 포함되어 있습니다. + +이 모델은 [thomwolf](https://huggingface.co/thomwolf)에 의해 기여되었으며, 원본 코드는 [여기](https://github.com/openai/finetune-transformer-lm)에서 확인할 수 있습니다. + +## 사용 팁 [[usage-tips]] + +- GPT는 절대 위치 임베딩을 사용하는 모델이므로 입력을 일반적으로 왼쪽보다는 오른쪽에 패딩하는 것이 권장됩니다. +- GPT는 인과 언어 모델링(Causal Language Modeling, CLM) 목표로 학습되었기 때문에 시퀀스에서 다음 토큰을 예측하는 데 강력한 성능을 보여줍니다. 이를 활용하면 *run_generation.py* 예제 스크립트에서 볼 수 있듯이 GPT-2는 구문적으로 일관된 텍스트를 생성할 수 있습니다. + +참고: + +*OpenAI GPT* 논문의 원래 토큰화 과정을 재현하려면 `ftfy`와 `SpaCy`를 설치해야 합니다: + +```bash +pip install spacy ftfy==4.4.3 +python -m spacy download en +``` + +`ftfy`와 `SpaCy`를 설치하지 않으면 [`OpenAIGPTTokenizer`]는 기본적으로 BERT의 `BasicTokenizer`를 사용한 후 Byte-Pair Encoding을 통해 토큰화합니다(대부분의 사용에 문제가 없으니 걱정하지 마세요). + +## 리소스 [[resources]] + +OpenAI GPT를 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티(🌎 표시) 리소스 목록입니다. 여기에 리소스를 추가하고 싶다면, Pull Request를 열어주시면 검토하겠습니다! 리소스는 기존 리소스를 복제하지 않고 새로운 것을 보여주는 것이 좋습니다. + + + +- [SetFit을 사용하여 텍스트 분류에서 OpenAI GPT-3을 능가하는 방법](https://www.philschmid.de/getting-started-setfit) 블로그 게시물. +- 추가 자료: [텍스트 분류 과제 가이드](../tasks/sequence_classification) + + + +- [Hugging Face와 함께 비영어 GPT-2 모델을 미세 조정하는 방법](https://www.philschmid.de/fine-tune-a-non-english-gpt-2-model-with-huggingface) 블로그. +- GPT-2와 함께 [Transformers를 사용한 언어 생성의 다양한 디코딩 방법](https://huggingface.co/blog/how-to-generate)에 대한 블로그. +- [Scratch에서 CodeParrot 🦜을 훈련하는 방법](https://huggingface.co/blog/codeparrot), 대규모 GPT-2 모델에 대한 블로그. +- GPT-2와 함께 [TensorFlow 및 XLA를 사용한 더 빠른 텍스트 생성](https://huggingface.co/blog/tf-xla-generate)에 대한 블로그. +- [Megatron-LM으로 언어 모델을 훈련하는 방법](https://huggingface.co/blog/megatron-training)에 대한 블로그. +- [좋아하는 아티스트의 스타일로 가사를 생성하도록 GPT2를 미세 조정하는 방법](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb)에 대한 노트북. 🌎 +- [좋아하는 트위터 사용자의 스타일로 트윗을 생성하도록 GPT2를 미세 조정하는 방법](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb)에 대한 노트북. 🌎 +- 🤗 Hugging Face 코스의 [인과 언어 모델링](https://huggingface.co/course/en/chapter7/6?fw=pt#training-a-causal-language-model-from-scratch) 장. 
+- [`OpenAIGPTLMHeadModel`]은 [인과 언어 모델링 예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling), [텍스트 생성 예제 스크립트](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-generation/run_generation.py) 및 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)에 의해 지원됩니다. +- [`TFOpenAIGPTLMHeadModel`]은 [인과 언어 모델링 예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_clmpy) 및 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)에 의해 지원됩니다. +- 추가 자료: [인과 언어 모델링 과제 가이드](../tasks/language_modeling) + + + +- [Byte-Pair Encoding 토큰화](https://huggingface.co/course/en/chapter6/5)에 대한 강의 자료. + +## OpenAIGPTConfig [[transformers.OpenAIGPTConfig]] + +[[autodoc]] OpenAIGPTConfig + +## OpenAIGPTTokenizer [[transformers.OpenAIGPTTokenizer]] + +[[autodoc]] OpenAIGPTTokenizer + - save_vocabulary + +## OpenAIGPTTokenizerFast [[transformers.OpenAIGPTTokenizerFast]] + +[[autodoc]] OpenAIGPTTokenizerFast + +## OpenAI specific outputs [[transformers.models.openai.modeling_openai.OpenAIGPTDoubleHeadsModelOutput]] + +[[autodoc]] models.openai.modeling_openai.OpenAIGPTDoubleHeadsModelOutput + +[[autodoc]] models.openai.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput + + + + +## OpenAIGPTModel [[transformers.OpenAIGPTModel]] + +[[autodoc]] OpenAIGPTModel + - forward + +## OpenAIGPTLMHeadModel [[transformers.OpenAIGPTLMHeadModel]] + +[[autodoc]] OpenAIGPTLMHeadModel + - forward + +## OpenAIGPTDoubleHeadsModel [[transformers.OpenAIGPTDoubleHeadsModel]] + +[[autodoc]] OpenAIGPTDoubleHeadsModel + - forward + +## OpenAIGPTForSequenceClassification [[transformers.OpenAIGPTForSequenceClassification]] + +[[autodoc]] OpenAIGPTForSequenceClassification + - forward + + + + +## TFOpenAIGPTModel [[transformers.TFOpenAIGPTModel]] + +[[autodoc]] TFOpenAIGPTModel + - call + +## TFOpenAIGPTLMHeadModel [[transformers.TFOpenAIGPTLMHeadModel]] + +[[autodoc]] TFOpenAIGPTLMHeadModel + - call + +## TFOpenAIGPTDoubleHeadsModel [[transformers.TFOpenAIGPTDoubleHeadsModel]] + +[[autodoc]] TFOpenAIGPTDoubleHeadsModel + - call + +## TFOpenAIGPTForSequenceClassification [[transformers.TFOpenAIGPTForSequenceClassification]] + +[[autodoc]] TFOpenAIGPTForSequenceClassification + - call + + + diff --git a/docs/source/ko/model_doc/paligemma.md b/docs/source/ko/model_doc/paligemma.md new file mode 100644 index 00000000000000..9f5b4808685509 --- /dev/null +++ b/docs/source/ko/model_doc/paligemma.md @@ -0,0 +1,80 @@ + + +# PaliGemma[[paligemma]] + +## 개요[[overview]] + +PaliGemma 모델은 구글이 제안한 [PaliGemma – Google의 최첨단 오픈 비전 언어 모델](https://huggingface.co/blog/paligemma)에서 소개 되었습니다. PaliGemma는 [SigLIP](siglip) 비전 인코더와 [Gemma](gemma) 언어 인코더로 구성된 3B 규모의 비전-언어 모델로, 두 인코더가 멀티모달 선형 프로젝션으로 연결되어 있습니다. 이 모델은 이미지를 고정된 수의 VIT토큰으로 분할하고 이를 선택적 프롬프트 앞에 추가 하며, 모든 이미지 토큰과 입력 텍스트 토큰에 대해 전체 블록 어텐션을 사용하는 특징을 가지고 있습니다. + +PaliGemma는 224x224, 448x448, 896x896의 3가지 해상도로 제공되며, 3개의 기본 모델과 55개의 다양한 작업에 대해 미세 조정된 버전, 그리고 2개의 혼합 모델이 있습니다. + + + + PaliGemma 아키텍처 블로그 포스트. + +이 모델은 [Molbap](https://huggingface.co/Molbap)에 의해 기여 되었습니다. 
+ +## 사용 팁[[usage-tips]] + +PaliGemma의 추론은 다음처럼 수행됩니다: + +```python +from transformers import AutoProcessor, PaliGemmaForConditionalGeneration + +model_id = "google/paligemma-3b-mix-224" +model = PaliGemmaForConditionalGeneration.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +prompt = "What is on the flower?" +image_file = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg?download=true" +raw_image = Image.open(requests.get(image_file, stream=True).raw) +inputs = processor(raw_image, prompt, return_tensors="pt") +output = model.generate(**inputs, max_new_tokens=20) + +print(processor.decode(output[0], skip_special_tokens=True)[len(prompt):]) +``` + +- PaliGemma는 대화용으로 설계되지 않았으며, 특정 사용 사례에 대해 미세 조정할 때 가장 잘 작동합니다. PaliGemma를 미세 조정할 수 있는 몇 가지 하위 작업에는 이미지 캡셔닝, 시각적 질문 답변(VQA), 오브젝트 디텍션, 참조 표현 분할 및 문서 이해가 포함됩니다. +- 모델에 필요한 이미지, 텍스트 및 선택적 레이블을 준비하는데 `PaliGemmaProcessor`를 사용할 수 있습니다. PaliGemma 모델을 미세 조정할 때는, 프로세서에 `suffix`인자를 전달하여 다음 처럼 모델의 `labels`를 생성할 수 있습니다: + +```python +prompt = "What is on the flower?" +answer = "a bee" +inputs = processor(images=raw_image, text=prompt, suffix=answer, return_tensors="pt") +``` + +## 자료[[resources]] + +PaliGemma를 시작하는 데 도움이 되는 Hugging Face와 community 자료 목록(🌎로 표시됨) 입니다.여기에 포함될 자료를 제출하고 싶으시다면 PR(Pull Request)를 열어주세요. 리뷰 해드리겠습니다! 자료는 기존 자료를 복제하는 대신 새로운 내용을 담고 있어야 합니다. + +- PaliGemma의 모든 기능을 소개하는 블로그 포스트는 [이곳](https://huggingface.co/blog/paligemma)에서 찾을 수 있습니다. 🌎 +- Trainer API를 사용하여 VQA(Visual Question Answering)를 위해 PaliGemma를 미세 조정하는 방법과 추론에 대한 데모 노트북은 [이곳](https://github.com/huggingface/notebooks/tree/main/examples/paligemma)에서 찾을 수 있습니다. 🌎 +- 사용자 정의 데이터셋(영수증 이미지 -> JSON)에 대해 PaliGemma를 미세 조정하는 방법과 추론에 대한 데모 노트북은 [이곳](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/PaliGemma)에서 찾을 수 있습니다. 🌎 + +## PaliGemmaConfig[[transformers.PaliGemmaConfig]] + +[[autodoc]] PaliGemmaConfig + +## PaliGemmaProcessor[[transformers.PaliGemmaProcessor]] + +[[autodoc]] PaliGemmaProcessor + +## PaliGemmaForConditionalGeneration[[transformers.PaliGemmaForConditionalGeneration]] + +[[autodoc]] PaliGemmaForConditionalGeneration + - forward diff --git a/docs/source/ko/model_doc/patchtsmixer.md b/docs/source/ko/model_doc/patchtsmixer.md new file mode 100644 index 00000000000000..e1463eb85de3e5 --- /dev/null +++ b/docs/source/ko/model_doc/patchtsmixer.md @@ -0,0 +1,95 @@ + + +# PatchTSMixer[[patchtsmixer]] + +## 개요[[overview]] + +PatchTSMixer 모델은 Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam이 제안한 [TSMixer: 다변량 시계열 예측을 위한 경량 MLP-Mixer 모델](https://arxiv.org/pdf/2306.09364.pdf)이라는 논문에서 소개되었습니다. + + +PatchTSMixer는 MLP-Mixer 아키텍처를 기반으로 한 경량 시계열 모델링 접근법입니다. 허깅페이스 구현에서는 PatchTSMixer의 기능을 제공하여 패치, 채널, 숨겨진 특성 간의 경량 혼합을 쉽게 수행하여 효과적인 다변량 시계열 모델링을 가능하게 합니다. 또한 간단한 게이트 어텐션부터 사용자 정의된 더 복잡한 셀프 어텐션 블록까지 다양한 어텐션 메커니즘을 지원합니다. 이 모델은 사전 훈련될 수 있으며 이후 예측, 분류, 회귀와 같은 다양한 다운스트림 작업에 사용될 수 있습니다. + + +해당 논문의 초록입니다: + +*TSMixer는 패치 처리된 시계열의 다변량 예측 및 표현 학습을 위해 설계된 다층 퍼셉트론(MLP) 모듈로만 구성된 경량 신경망 아키텍처입니다. 우리의 모델은 컴퓨터 비전 분야에서 MLP-Mixer 모델의 성공에서 영감을 받았습니다. 우리는 Vision MLP-Mixer를 시계열에 적용하는 데 따르는 과제를 보여주고, 정확도를 향상시키기 위해 경험적으로 검증된 구성 요소들을 도입합니다. 여기에는 계층 구조 및 채널 상관관계와 같은 시계열 특성을 명시적으로 모델링하기 위해 MLP-Mixer 백본에 온라인 조정 헤드를 부착하는 새로운 설계 패러다임이 포함됩니다. 또한 기존 패치 채널 혼합 방법의 일반적인 문제인 노이즈가 있는 채널 상호작용을 효과적으로 처리하고 다양한 데이터셋에 걸쳐 일반화하기 위한 하이브리드 채널 모델링 접근법을 제안합니다. 추가로, 중요한 특성을 우선시하기 위해 백본에 간단한 게이트 주의 메커니즘을 도입합니다. 이러한 경량 구성 요소들을 통합함으로써, 우리는 단순한 MLP 구조의 학습 능력을 크게 향상시켜 최소한의 컴퓨팅 사용으로 복잡한 트랜스포머 모델들을 능가하는 성능을 달성합니다. 
더욱이, TSMixer의 모듈식 설계는 감독 학습과 마스크 자기 감독 학습 방법 모두와 호환되어 시계열 기초 모델의 유망한 구성 요소가 됩니다. TSMixer는 예측 작업에서 최첨단 MLP 및 트랜스포머 모델들을 상당한 차이(8-60%)로 능가합니다. 또한 최신의 강력한 Patch-Transformer 모델 벤치마크들을 메모리와 실행 시간을 크게 줄이면서(2-3배) 성능 면에서도 앞섭니다(1-2%).* + +이 모델은 [ajati](https://huggingface.co/ajati), [vijaye12](https://huggingface.co/vijaye12), +[gsinthong](https://huggingface.co/gsinthong), [namctin](https://huggingface.co/namctin), +[wmgifford](https://huggingface.co/wmgifford), [kashif](https://huggingface.co/kashif)가 기여했습니다. + +## 사용 예[[usage-example]] + +아래의 코드 스니펫은 PatchTSMixer 모델을 무작위로 초기화하는 방법을 보여줍니다. +PatchTSMixer 모델은 [Trainer API](../trainer.md)와 호환됩니다. + +```python + +from transformers import PatchTSMixerConfig, PatchTSMixerForPrediction +from transformers import Trainer, TrainingArguments, + + +config = PatchTSMixerConfig(context_length = 512, prediction_length = 96) +model = PatchTSMixerForPrediction(config) +trainer = Trainer(model=model, args=training_args, + train_dataset=train_dataset, + eval_dataset=valid_dataset) +trainer.train() +results = trainer.evaluate(test_dataset) +``` + +## 사용 팁[[usage-tips]] + +이 모델은 시계열 분류와 시계열 회귀에도 사용될 수 있습니다. 각각[`PatchTSMixerForTimeSeriesClassification`]와 [`PatchTSMixerForRegression`] 클래스를 참조하세요. + +## 자료[[resources]] + +- PatchTSMixer를 자세히 설명하는 블로그 포스트는 여기에서 찾을 수 있습니다 [이곳](https://huggingface.co/blog/patchtsmixer). 이 블로그는 Google Colab에서도 열어볼 수 있습니다. + +## PatchTSMixerConfig[[transformers.PatchTSMixerConfig]] + +[[autodoc]] PatchTSMixerConfig + + +## PatchTSMixerModel[[transformers.PatchTSMixerModel]] + +[[autodoc]] PatchTSMixerModel + - forward + + +## PatchTSMixerForPrediction[[transformers.PatchTSMixerForPrediction]] + +[[autodoc]] PatchTSMixerForPrediction + - forward + + +## PatchTSMixerForTimeSeriesClassification[[transformers.PatchTSMixerForTimeSeriesClassification]] + +[[autodoc]] PatchTSMixerForTimeSeriesClassification + - forward + + +## PatchTSMixerForPretraining[[transformers.PatchTSMixerForPretraining]] + +[[autodoc]] PatchTSMixerForPretraining + - forward + + +## PatchTSMixerForRegression[[transformers.PatchTSMixerForRegression]] + +[[autodoc]] PatchTSMixerForRegression + - forward \ No newline at end of file diff --git a/docs/source/ko/model_doc/patchtst.md b/docs/source/ko/model_doc/patchtst.md new file mode 100644 index 00000000000000..fc9b0eb51e4a42 --- /dev/null +++ b/docs/source/ko/model_doc/patchtst.md @@ -0,0 +1,76 @@ + + +# PatchTST[[patchtst]] + +## 개요[[overview]] + +The PatchTST 모델은 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam이 제안한 [시계열 하나가 64개의 단어만큼 가치있다: 트랜스포머를 이용한 장기예측](https://arxiv.org/abs/2211.14730)라는 논문에서 소개되었습니다. + +이 모델은 고수준에서 시계열을 주어진 크기의 패치로 벡터화하고, 결과로 나온 벡터 시퀀스를 트랜스포머를 통해 인코딩한 다음 적절한 헤드를 통해 예측 길이의 예측을 출력합니다. 모델은 다음 그림과 같이 도식화됩니다: + +![모델](https://github.com/namctin/transformers/assets/8100/150af169-29de-419a-8d98-eb78251c21fa) + +해당 논문의 초록입니다: + +*우리는 다변량 시계열 예측과 자기 감독 표현 학습을 위한 효율적인 트랜스포머 기반 모델 설계를 제안합니다. 이는 두 가지 주요 구성 요소를 기반으로 합니다: + +(i) 시계열을 하위 시리즈 수준의 패치로 분할하여 트랜스포머의 입력 토큰으로 사용 +(ii) 각 채널이 모든 시리즈에 걸쳐 동일한 임베딩과 트랜스포머 가중치를 공유하는 단일 단변량 시계열을 포함하는 채널 독립성. 패칭 설계는 자연스럽게 세 가지 이점을 가집니다: + - 지역적 의미 정보가 임베딩에 유지됩니다; + - 동일한 룩백 윈도우에 대해 어텐션 맵의 계산과 메모리 사용량이 제곱으로 감소합니다 + - 모델이 더 긴 과거를 참조할 수 있습니다. + 우리의 채널 독립적 패치 시계열 트랜스포머(PatchTST)는 최신 트랜스포머 기반 모델들과 비교했을 때 장기 예측 정확도를 크게 향상시킬 수 있습니다. 또한 모델을 자기지도 사전 훈련 작업에 적용하여, 대규모 데이터셋에 대한 지도 학습을 능가하는 아주 뛰어난 미세 조정 성능을 달성했습니다. 
한 데이터셋에서 마스크된 사전 훈련 표현을 다른 데이터셋으로 전이하는 것도 최고 수준의 예측 정확도(SOTA)를 산출했습니다.* + +이 모델은 [namctin](https://huggingface.co/namctin), [gsinthong](https://huggingface.co/gsinthong), [diepi](https://huggingface.co/diepi), [vijaye12](https://huggingface.co/vijaye12), [wmgifford](https://huggingface.co/wmgifford), [kashif](https://huggingface.co/kashif)에 의해 기여 되었습니다. 원본코드는 [이곳](https://github.com/yuqinie98/PatchTST)에서 확인할 수 있습니다. + +## 사용 팁[[usage-tips]] + +이 모델은 시계열 분류와 시계열 회귀에도 사용될 수 있습니다. 각각 [`PatchTSTForClassification`]와 [`PatchTSTForRegression`] 클래스를 참조하세요. + +## 자료[[resources]] + +- PatchTST를 자세히 설명하는 블로그 포스트는 [이곳](https://huggingface.co/blog/patchtst)에서 찾을 수 있습니다. +이 블로그는 Google Colab에서도 열어볼 수 있습니다. + +## PatchTSTConfig[[transformers.PatchTSTConfig]] + +[[autodoc]] PatchTSTConfig + +## PatchTSTModel[[transformers.PatchTSTModel]] + +[[autodoc]] PatchTSTModel + - forward + +## PatchTSTForPrediction[[transformers.PatchTSTForPrediction]] + +[[autodoc]] PatchTSTForPrediction + - forward + +## PatchTSTForClassification[[transformers.PatchTSTForClassification]] + +[[autodoc]] PatchTSTForClassification + - forward + +## PatchTSTForPretraining[[transformers.PatchTSTForPretraining]] + +[[autodoc]] PatchTSTForPretraining + - forward + +## PatchTSTForRegression[[transformers.PatchTSTForRegression]] + +[[autodoc]] PatchTSTForRegression + - forward diff --git a/docs/source/ko/model_doc/rag.md b/docs/source/ko/model_doc/rag.md new file mode 100644 index 00000000000000..0ae88cc7ac5e3e --- /dev/null +++ b/docs/source/ko/model_doc/rag.md @@ -0,0 +1,100 @@ + + +# RAG(검색 증강 생성) [[rag]] + +
+ +Models + +
+ +## 개요 [[overview]] + +검색 증강 생성(Retrieval-augmented generation, "RAG") 모델은 사전 훈련된 밀집 검색(DPR)과 시퀀스-투-시퀀스 모델의 장점을 결합합니다. RAG 모델은 문서를 검색하고, 이를 시퀀스-투-시퀀스 모델에 전달한 다음, 주변화(marginalization)를 통해 출력을 생성합니다. 검색기와 시퀀스-투-시퀀스 모듈은 사전 훈련된 모델로 초기화되며, 함께 미세 조정되어 검색과 생성 모두 다운스트림 작업(모델을 특정 태스크에 적용하는 것)에 적응할 수 있게 합니다. + +이 모델은 Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela의 논문 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401)를 기반으로 합니다. + +논문의 초록은 다음과 같습니다. + +*대규모 사전 훈련 언어 모델들은 그들의 매개변수에 사실적 지식을 저장하고 있으며, 다운스트림 NLP 작업에 대해 미세 조정될 때 최첨단 결과를 달성합니다. 그러나 지식에 접근하고 정확하게 조작하는 능력은 여전히 제한적이며, 따라서 지식 집약적 작업에서 그들의 성능은 작업별 아키텍처에 비해 뒤떨어집니다. 또한, 그들의 결정에 대한 근거를 제공하고 세계 지식을 업데이트하는 것은 여전히 열린 연구 문제로 남아 있습니다. 명시적 비매개변수 메모리에 대한 미분 가능한 접근 메커니즘을 가진 사전 훈련 모델은 이 문제를 극복할 수 있지만, 지금까지는 추출적 다운스트림 작업에 대해서만 연구되었습니다. 우리는 언어 생성을 위해 사전 훈련된 매개변수 및 비매개변수 메모리를 결합하는 모델인 검색 증강 생성(RAG)에 대한 일반적인 목적의 미세 조정 방법을 탐구합니다. 우리는 매개변수 메모리가 사전 훈련된 시퀀스-투-시퀀스 모델이고 비매개변수 메모리가 사전 훈련된 신경 검색기로 접근되는 위키피디아의 밀집 벡터 인덱스인 RAG 모델을 소개합니다. 우리는 생성된 전체 시퀀스에 걸쳐 동일한 검색된 구절을 조건으로 하는 RAG 공식과 토큰별로 다른 구절을 사용할 수 있는 RAG 공식을 비교합니다. 우리는 광범위한 지식 집약적 NLP 작업에 대해 모델을 미세 조정하고 평가하며, 매개변수 시퀀스-투-시퀀스 모델과 작업별 검색-추출 아키텍처를 능가하여 세 가지 개방형 도메인 QA 작업에서 최첨단 성능을 달성합니다. 언어 생성 작업의 경우, RAG 모델이 최첨단 매개변수 전용 시퀀스-투-시퀀스 기준선보다 더 구체적이고, 다양하며, 사실적인 언어를 생성한다는 것을 발견했습니다.* + +이 모델은 [ola13](https://huggingface.co/ola13)에 의해 기여되었습니다. + +## 사용 팁 [[usage-tips]] + +검색 증강 생성(Retrieval-augmented generation, "RAG") 모델은 사전 훈련된 밀집 검색(DPR)과 시퀀스-투-시퀀스 모델의 강점을 결합합니다. RAG 모델은 문서를 검색하고, 이를 시퀀스-투-시퀀스 모델에 전달한 다음, 주변화(marginalization)를 통해 출력을 생성합니다. 검색기와 시퀀스-투-시퀀스 모듈은 사전 훈련된 모델로 초기화되며, 함께 미세 조정됩니다. 이를 통해 검색과 생성 모두 다운스트림 작업에 적응할 수 있게 됩니다. + +## RagConfig [[transformers.RagConfig]] + +[[autodoc]] RagConfig + +## RagTokenizer [[transformers.RagTokenizer]] + +[[autodoc]] RagTokenizer + +## Rag specific outputs [[transformers.models.rag.modeling_rag.RetrievAugLMMarginOutput]] + +[[autodoc]] models.rag.modeling_rag.RetrievAugLMMarginOutput + +[[autodoc]] models.rag.modeling_rag.RetrievAugLMOutput + +## RagRetriever [[transformers.RagRetriever]] + +[[autodoc]] RagRetriever + + + + +## RagModel [[transformers.RagModel]] + +[[autodoc]] RagModel + - forward + +## RagSequenceForGeneration [[transformers.RagSequenceForGeneration]] + +[[autodoc]] RagSequenceForGeneration + - forward + - generate + +## RagTokenForGeneration [[transformers.RagTokenForGeneration]] + +[[autodoc]] RagTokenForGeneration + - forward + - generate + + + + +## TFRagModel [[transformers.TFRagModel]] + +[[autodoc]] TFRagModel + - call + +## TFRagSequenceForGeneration [[transformers.TFRagSequenceForGeneration]] + +[[autodoc]] TFRagSequenceForGeneration + - call + - generate + +## TFRagTokenForGeneration [[transformers.TFRagTokenForGeneration]] + +[[autodoc]] TFRagTokenForGeneration + - call + - generate + + + diff --git a/docs/source/ko/model_doc/swin.md b/docs/source/ko/model_doc/swin.md new file mode 100644 index 00000000000000..6919d7e9978e07 --- /dev/null +++ b/docs/source/ko/model_doc/swin.md @@ -0,0 +1,95 @@ + + +# Swin Transformer [[swin-transformer]] + +## 개요 [[overview]] + +Swin Transformer는 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo가 제안한 논문 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030)에서 소개되었습니다. + +논문의 초록은 다음과 같습니다: + +*이 논문은 Swin Transformer라는 새로운 비전 트랜스포머를 소개합니다. 
이 모델은 컴퓨터 비전에서 범용 백본(backbone)으로 사용될 수 있습니다. 트랜스포머를 언어에서 비전으로 적용할 때의 어려움은 두 분야 간의 차이에서 비롯되는데, 예를 들어 시각적 객체의 크기가 크게 변동하며, 이미지의 픽셀 해상도가 텍스트의 단어에 비해 매우 높다는 점이 있습니다. 이러한 차이를 해결하기 위해, 우리는 'Shifted Windows'를 이용해 표현을 계산하는 계층적 트랜스포머를 제안합니다. Shifted Windows 방식은 겹치지 않는 로컬 윈도우에서 self-attention 계산을 제한하여 효율성을 높이는 동시에 윈도우 간 연결을 가능하게 합니다. 이 계층적 구조는 다양한 크기의 패턴을 모델링할 수 있는 유연성을 제공하며, 이미지 크기에 비례한 선형 계산 복잡성을 가지고 있습니다. Swin Transformer의 이러한 특징들은 이미지 분류(Imagenet-1K에서 87.3의 top-1 정확도) 및 객체 검출(COCO test-dev에서 58.7의 박스 AP, 51.1의 마스크 AP)과 같은 밀집 예측 작업, 의미적 분할(ADE20K val에서 53.5의 mIoU)과 같은 광범위한 비전 작업에 적합합니다. 이 모델은 COCO에서 이전 최고 성능을 박스 AP에서 +2.7, 마스크 AP에서 +2.6, ADE20K에서 mIoU에서 +3.2를 초과하는 성과를 보여주며, 트랜스포머 기반 모델이 비전 백본으로서의 잠재력을 입증했습니다. 계층적 설계와 Shifted Windows 방식은 순수 MLP 아키텍처에도 유리하게 작용합니다.* + + + + Swin Transformer 아키텍처. 원본 논문에서 발췌. + +이 모델은 [novice03](https://huggingface.co/novice03)이 기여하였습니다. Tensorflow 버전은 [amyeroberts](https://huggingface.co/amyeroberts)가 기여했습니다. 원본 코드는 [여기](https://github.com/microsoft/Swin-Transformer)에서 확인할 수 있습니다. + +## 사용 팁 [[usage-tips]] + +- Swin은 입력의 높이와 너비가 `32`로 나누어질 수 있으면 어떤 크기든 지원할 수 있도록 패딩을 추가합니다. +- Swin은 *백본*으로 사용할 수 있습니다. `output_hidden_states = True`로 설정하면, `hidden_states`와 `reshaped_hidden_states`를 모두 출력합니다. `reshaped_hidden_states`는 `(batch, num_channels, height, width)` 형식을 가지며, 이는 `(batch_size, sequence_length, num_channels)` 형식과 다릅니다. + +## 리소스 [[resources]] + +Swin Transformer의 사용을 도울 수 있는 Hugging Face 및 커뮤니티(🌎로 표시)의 공식 자료 목록입니다. + + + +- [`SwinForImageClassification`]은 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)을 통해 지원됩니다. +- 관련 자료: [이미지 분류 작업 가이드](../tasks/image_classification) + +또한: + +- [`SwinForMaskedImageModeling`]은 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining)를 통해 지원됩니다. + +새로운 자료를 추가하고 싶으시다면, 언제든지 Pull Request를 열어주세요! 저희가 검토해 드릴게요. 이때, 추가하는 자료는 기존 자료와 중복되지 않고 새로운 내용을 보여주는 자료여야 합니다. + +## SwinConfig [[transformers.SwinConfig]] + +[[autodoc]] SwinConfig + + + + +## SwinModel [[transformers.SwinModel]] + +[[autodoc]] SwinModel + - forward + +## SwinForMaskedImageModeling [[transformers.SwinForMaskedImageModeling]] + +[[autodoc]] SwinForMaskedImageModeling + - forward + +## SwinForImageClassification [[transformers.SwinForImageClassification]] + +[[autodoc]] transformers.SwinForImageClassification + - forward + + + + +## TFSwinModel [[transformers.TFSwinModel]] + +[[autodoc]] TFSwinModel + - call + +## TFSwinForMaskedImageModeling [[transformers.TFSwinForMaskedImageModeling]] + +[[autodoc]] TFSwinForMaskedImageModeling + - call + +## TFSwinForImageClassification [[transformers.TFSwinForImageClassification]] + +[[autodoc]] transformers.TFSwinForImageClassification + - call + + + \ No newline at end of file diff --git a/docs/source/ko/model_doc/swin2sr.md b/docs/source/ko/model_doc/swin2sr.md new file mode 100644 index 00000000000000..931298b9593c63 --- /dev/null +++ b/docs/source/ko/model_doc/swin2sr.md @@ -0,0 +1,59 @@ + + +# Swin2SR [[swin2sr]] + +## 개요 [[overview]] + +Swin2SR 모델은 Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte가 제안한 논문 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345)에서 소개되었습니다. +Swin2SR은 [SwinIR](https://github.com/JingyunLiang/SwinIR/) 모델을 개선하고자 [Swin Transformer v2](swinv2) 레이어를 도입함으로써, 훈련 불안정성, 사전 훈련과 미세 조정 간의 해상도 차이, 그리고 데이터 의존성 문제를 완화시킵니다. 
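+
+참고로, 다음은 변환된 Swin2SR 체크포인트로 이미지 초해상도(업스케일링) 추론을 수행하는 최소한의 스케치입니다. 체크포인트 이름(`caidas/swin2SR-classical-sr-x2-64`)과 예시 이미지 URL은 가정이므로, 환경에 맞게 바꿔 사용하세요:
+
+```python
+import numpy as np
+import requests
+import torch
+from PIL import Image
+from transformers import AutoImageProcessor, Swin2SRForImageSuperResolution
+
+# 예시 체크포인트 (2배 업스케일링용이라고 가정)
+checkpoint = "caidas/swin2SR-classical-sr-x2-64"
+processor = AutoImageProcessor.from_pretrained(checkpoint)
+model = Swin2SRForImageSuperResolution.from_pretrained(checkpoint)
+
+# 예시 저해상도 이미지 - 원하는 이미지로 교체하세요
+url = "https://huggingface.co/spaces/jjourney1125/swin2sr/resolve/main/samples/butterfly.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# 모델이 복원한 이미지(reconstruction)를 0~1 범위로 잘라낸 뒤 uint8 이미지로 변환합니다
+output = outputs.reconstruction.squeeze().clamp(0, 1).numpy()
+output = np.moveaxis(output, 0, -1)
+upscaled_image = Image.fromarray((output * 255.0).round().astype(np.uint8))
+```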
+ +논문의 초록은 다음과 같습니다: + +*압축은 스트리밍 서비스, 가상 현실, 비디오 게임과 같은 대역폭이 제한된 시스템을 통해 이미지와 영상을 효율적으로 전송하고 저장하는 데 중요한 역할을 합니다. 하지만 압축은 필연적으로 원본 정보의 손실과 아티팩트를 초래하며, 이는 시각적 품질을 심각하게 저하시킬 수 있습니다. 이러한 이유로, 압축된 이미지의 품질 향상은 활발한 연구 주제가 되고 있습니다. 현재 대부분의 최첨단 이미지 복원 방법은 합성곱 신경망을 기반으로 하지만, SwinIR과 같은 트랜스포머 기반 방법들도 이 작업에서 인상적인 성능을 보여주고 있습니다. 이번 논문에서는 Swin Transformer V2를 사용해 SwinIR을 개선하여 이미지 초해상도 작업, 특히 압축된 입력 시나리오에서 성능을 향상시키고자 합니다. 이 방법을 통해 트랜스포머 비전 모델을 훈련할 때 발생하는 주요 문제들, 예를 들어 훈련 불안정성, 사전 훈련과 미세 조정 간 해상도 차이, 그리고 데이터 의존성을 해결할 수 있습니다. 우리는 JPEG 압축 아티팩트 제거, 이미지 초해상도(클래식 및 경량), 그리고 압축된 이미지 초해상도라는 세 가지 대표적인 작업에서 실험을 수행했습니다. 실험 결과, 우리의 방법인 Swin2SR은 SwinIR의 훈련 수렴성과 성능을 향상시킬 수 있으며, "AIM 2022 Challenge on Super-Resolution of Compressed Image and Video"에서 상위 5위 솔루션으로 선정되었습니다.* + + + + Swin2SR 아키텍처. 원본 논문에서 발췌. + +이 모델은 [nielsr](https://huggingface.co/nielsr)가 기여하였습니다. +원본 코드는 [여기](https://github.com/mv-lab/swin2sr)에서 확인할 수 있습니다. + +## 리소스 [[resources]] + +Swin2SR demo notebook은 [여기](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Swin2SR)에서 확인할 수 있습니다. + +SwinSR을 활용한 image super-resolution demo space는 [여기](https://huggingface.co/spaces/jjourney1125/swin2sr)에서 확인할 수 있습니다. + +## Swin2SRImageProcessor [[transformers.Swin2SRImageProcessor]] + +[[autodoc]] Swin2SRImageProcessor + - preprocess + +## Swin2SRConfig [[transformers.Swin2SRConfig]] + +[[autodoc]] Swin2SRConfig + +## Swin2SRModel [[transformers.Swin2SRModel]] + +[[autodoc]] Swin2SRModel + - forward + +## Swin2SRForImageSuperResolution [[transformers.Swin2SRForImageSuperResolution]] + +[[autodoc]] Swin2SRForImageSuperResolution + - forward diff --git a/docs/source/ko/model_doc/swinv2.md b/docs/source/ko/model_doc/swinv2.md new file mode 100644 index 00000000000000..3bc420a292ad7b --- /dev/null +++ b/docs/source/ko/model_doc/swinv2.md @@ -0,0 +1,63 @@ + + +# Swin Transformer V2 [[swin-transformer-v2]] + +## 개요 [[overview]] + +Swin Transformer V2는 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo가 제안한 논문 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883)에서 소개되었습니다. + +논문의 초록은 다음과 같습니다: + +*대규모 NLP 모델들은 언어 작업에서의 성능을 크게 향상하며, 성능이 포화하는 징후를 보이지 않습니다. 또한, 사람과 유사한 few-shot 학습 능력을 보여줍니다. 이 논문은 대규모 모델을 컴퓨터 비전 분야에서 탐구하고자 합니다. 대형 비전 모델을 훈련하고 적용하는 데 있어 세 가지 주요 문제를 다룹니다: 훈련 불안정성, 사전 학습과 파인튜닝 간의 해상도 차이, 그리고 레이블이 달린 데이터에 대한 높은 요구입니다. 세 가지 주요 기법을 제안합니다: 1) 훈련 안정성을 개선하기 위한 residual-post-norm 방법과 cosine attention의 결합; 2) 저해상도 이미지로 사전 학습된 모델을 고해상도 입력으로 전이할 수 있는 log-spaced continuous position bias 방법; 3) 레이블이 달린 방대한 이미지의 필요성을 줄이기 위한 self-supervised 사전 학습 방법인 SimMIM입니다. 이러한 기법들을 통해 30억 개의 파라미터를 가진 Swin Transformer V2 모델을 성공적으로 훈련하였으며, 이는 현재까지 가장 크고 고밀도의 비전 모델로, 최대 1,536×1,536 해상도의 이미지를 다룰 수 있습니다. 이 모델은 ImageNet-V2 이미지 분류, COCO 객체 탐지, ADE20K 의미론적 분할, Kinetics-400 비디오 행동 분류 등 네 가지 대표적인 비전 작업에서 새로운 성능 기록을 세웠습니다. 또한, 우리의 훈련은 Google의 billion-level 비전 모델과 비교해 40배 적은 레이블이 달린 데이터와 40배 적은 훈련 시간으로 이루어졌다는 점에서 훨씬 더 효율적입니다.* + +이 모델은 [nandwalritik](https://huggingface.co/nandwalritik)이 기여하였습니다. +원본 코드는 [여기](https://github.com/microsoft/Swin-Transformer)에서 확인할 수 있습니다. + +## 리소스 [[resources]] + +Swin Transformer v2의 사용을 도울 수 있는 Hugging Face 및 커뮤니티(🌎로 표시)의 공식 자료 목록입니다. + + + + +- [`Swinv2ForImageClassification`]은 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)을 통해 지원됩니다. 
+- 관련 자료: [이미지 분류 작업 가이드](../tasks/image_classification) + +또한: + +- [`Swinv2ForMaskedImageModeling`]는 이 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining)를 통해 지원됩니다. + +새로운 자료를 추가하고 싶으시다면, 언제든지 Pull Request를 열어주세요! 저희가 검토해 드릴게요. 이때, 추가하는 자료는 기존 자료와 중복되지 않고 새로운 내용을 보여주는 자료여야 합니다. + +## Swinv2Config [[transformers.Swinv2Config]] + +[[autodoc]] Swinv2Config + +## Swinv2Model [[transformers.Swinv2Model]] + +[[autodoc]] Swinv2Model + - forward + +## Swinv2ForMaskedImageModeling [[transformers.Swinv2ForMaskedImageModeling]] + +[[autodoc]] Swinv2ForMaskedImageModeling + - forward + +## Swinv2ForImageClassification [[transformers.Swinv2ForImageClassification]] + +[[autodoc]] transformers.Swinv2ForImageClassification + - forward diff --git a/docs/source/ko/model_doc/time_series_transformer.md b/docs/source/ko/model_doc/time_series_transformer.md new file mode 100644 index 00000000000000..2473b0e143b750 --- /dev/null +++ b/docs/source/ko/model_doc/time_series_transformer.md @@ -0,0 +1,62 @@ + + +# 시계열 트랜스포머[[time-series-transformer]] + +## 개요[[overview]] + +이 시계열 트랜스포머 모델은 시계열 예측을 위한 기본적인 인코더-디코더 구조의 트랜스포머 입니다. +이 모델은 [kashif](https://huggingface.co/kashif)에 의해 기여되었습니다. + +## 사용 팁[[usage-tips]] + +- 다른 라이브러리의 모델들과 마찬가지로, [`TimeSeriesTransformerModel`]은 상단에 헤드가 없는 기본적인 트랜스포머 입니다. [`TimeSeriesTransformerForPrediction`]은 상단에 분포 헤드를 추가하여 시계열 예측에 사용할 수 있습니다. 이 모델은 이른바 확률적 예측 모델이며, 포인트 예측 모델이 아닙니다. 즉 샘플링할 수 있는 분포를 학습하며, 값을 직접 출력 하지는 않습니다. +- [`TimeSeriesTransformerForPrediction`]은 두개의 블록으로 구성되어 있습니다. 인코더는 `context_length`의 시계열 값을 입력(`past_values`라고 부름)으로 받아들이며, 디코더는 미래의 `prediction_length`만큼 시계열 값을 예측합니다(`future_values`라고 부름). 학습중에는 모델에 `past_values` 와 `future_values`쌍을 모델에 제공해야 합니다. +- 가공하지 않은 `past_values` 와 `future_values` 쌍 외에도, 일반적으로 모델에 추가적인 특징을 제공합니다. 다음은 그 특징들에 대해 소개합니다: + - `past_time_features`: 모델이 `past_values`에 추가할 시간적 특성. 이는 트랜스포머 인코더의 "위치 인코딩" 역할을 합니다. + 예를 들어 "월의 일", "연도의 월" 등을 스칼라 값으로 (그리고 벡터로 쌓아서) 나타냅니다. + 예시: 특정 시계열 값이 8월 11일에 기록되었다면, [11, 8]을 시간 특성 벡터로 사용할 수 있습니다 (11은 "월의 일", 8은 "연도의 월"). + - `future_time_features`: 모델이 `future_values`에 추가할 시간적 특성. 이는 트랜스포머 디코더의 "위치 인코딩" 역할을 합니다. + 예를 들어 "월의 일", "연도의 월" 등을 스칼라 값으로 (그리고 벡터로 쌓아서) 나타냅니다. + 예: 특정 시계열 값이 8월 11일에 얻어졌다면, [11, 8]을 시간 특성 벡터로 사용할 수 있습니다 (11은 "월의 일", 8은 "연도의 월"). + - `static_categorical_features`: 시간에 따라 변하지 않는 범주형 특성 (즉, 모든 `past_values`와 `future_values`에 대해 동일한 값을 가짐). + 예를 들어 특정 시계열을 식별하는 매장 ID나 지역 ID가 있습니다. + 이러한 특성은 모든 데이터 포인트(미래의 데이터 포인트 포함)에 대해 알려져 있어야 합니다. + - `static_real_features`: 시간에 따라 변하지 않는 실수값 특성 (즉, 모든 `past_values`와 `future_values`에 대해 동일한 값을 가짐). + 예를 들어 시계열 값을 가진 제품의 이미지 표현 (시계열이 신발 판매에 관한 것이라면 "신발" 사진의 [ResNet](resnet) 임베딩 처럼)이 있습니다. + 이러한 특성은 모든 데이터 포인트(미래의 데이터 포인트 포함)에 대해 알려져 있어야 합니다. +- 이 모델은 기계 번역을 위한 트랜스포머 훈련과 유사하게 "교사 강제(teacher-forcing)" 방식으로 훈련됩니다. 즉, 훈련 중에 `future_values`를 디코더의 입력으로 오른쪽으로 한 위치 이동시키고, `past_values`의 마지막 값을 앞에 붙입니다. 각 시간 단계에서 모델은 다음 타겟을 예측해야 합니다. 따라서 훈련 설정은 언어를 위한 GPT 모델과 유사하지만, `decoder_start_token_id` 개념이 없습니다 (우리는 단순히 컨텍스트의 마지막 값을 디코더의 초기 입력으로 사용합니다). +- 추론 시에는 `past_values`의 최종 값을 디코더의 입력으로 제공합니다. 그 다음, 모델에서 샘플링하여 다음 시간 단계에서의 예측을 만들고, 이를 디코더에 공급하여 다음 예측을 만듭니다 (자기회귀 생성이라고도 함). + +## 자료 [[resources]] + +시작하는 데 도움이 되는 Hugging Face와 community 자료 목록(🌎로 표시됨) 입니다. 여기에 포함될 자료를 제출하고 싶으시다면 PR(Pull Request)를 열어주세요. 리뷰 해드리겠습니다! 자료는 기존 자료를 복제하는 대신 새로운 내용을 담고 있어야 합니다. 
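+
+위의 사용 팁에서 설명한 입력(`past_values`, `past_time_features`, 정적 특성 등)을 모델에 어떻게 전달하는지 감을 잡을 수 있도록, 사전 훈련된 체크포인트로 확률적 예측을 수행하는 간단한 스케치를 먼저 덧붙입니다. 아래의 체크포인트와 예시 배치 데이터셋 이름은 설명을 위한 가정이므로, 실제로는 자신의 데이터로 같은 형태의 텐서를 준비해야 합니다:
+
+```python
+import torch
+from huggingface_hub import hf_hub_download
+from transformers import TimeSeriesTransformerForPrediction
+
+# 예시: 미리 준비된 배치(past_values, past_time_features 등)를 내려받습니다
+file = hf_hub_download(
+    repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+)
+batch = torch.load(file)
+
+model = TimeSeriesTransformerForPrediction.from_pretrained(
+    "huggingface/time-series-transformer-tourism-monthly"
+)
+
+# 추론 시에는 future_values 없이 generate()로 미래 값을 자기회귀적으로 샘플링합니다
+outputs = model.generate(
+    past_values=batch["past_values"],
+    past_time_features=batch["past_time_features"],
+    past_observed_mask=batch["past_observed_mask"],
+    static_categorical_features=batch["static_categorical_features"],
+    static_real_features=batch["static_real_features"],
+    future_time_features=batch["future_time_features"],
+)
+
+# 샘플 차원(dim=1)에 대해 평균을 내어 포인트 예측을 얻습니다
+mean_prediction = outputs.sequences.mean(dim=1)
+```
+
+`outputs.sequences`에는 여러 개의 샘플 경로가 담겨 있으므로, 평균 대신 중앙값이나 분위수를 계산해 불확실성 구간을 얻을 수도 있습니다.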
+ +- HuggingFace 블로그에서 시계열 트랜스포머 포스트를 확인하세요: [🤗 트랜스포머와 확률적 시계열 예측](https://huggingface.co/blog/time-series-transformers) + +## TimeSeriesTransformerConfig[[transformers.TimeSeriesTransformerConfig]] + +[[autodoc]] TimeSeriesTransformerConfig + +## TimeSeriesTransformerModel[[transformers.TimeSeriesTransformerModel]] + +[[autodoc]] TimeSeriesTransformerModel + - forward + +## TimeSeriesTransformerForPrediction[[transformers.TimeSeriesTransformerForPrediction]] + +[[autodoc]] TimeSeriesTransformerForPrediction + - forward diff --git a/docs/source/ko/model_doc/trajectory_transformer.md b/docs/source/ko/model_doc/trajectory_transformer.md new file mode 100644 index 00000000000000..ac279eff044196 --- /dev/null +++ b/docs/source/ko/model_doc/trajectory_transformer.md @@ -0,0 +1,49 @@ + + +# 궤적 트랜스포머[[trajectory-transformer]] + + + + +이 모델은 유지 보수 모드로만 운영되며, 코드를 변경하는 새로운 PR(Pull Request)은 받지 않습니다. +이 모델을 실행하는 데 문제가 발생한다면, 이 모델을 지원하는 마지막 버전인 v4.30.0를 다시 설치해 주세요. 다음 명령어를 실행하여 재설치할 수 있습니다: `pip install -U transformers==4.30.0`. + + + +## 개요[[overview]] + +Trajectory Transformer 모델은 Michael Janner, Qiyang Li, Sergey Levine이 제안한 [하나의 커다란 시퀀스 모델링 문제로서의 오프라인 강화학습](https://arxiv.org/abs/2106.02039)라는 논문에서 소개되었습니다. + +해당 논문의 초록입니다: + +*강화학습(RL)은 일반적으로 마르코프 속성을 활용하여 시간에 따라 문제를 인수분해하면서 정적 정책이나 단일 단계 모델을 추정하는 데 중점을 둡니다. 하지만 우리는 RL을 높은 보상 시퀀스로 이어지는 행동 시퀀스를 생성하는 것을 목표로 하는 일반적인 시퀀스 모델링 문제로 볼 수도 있습니다. 이러한 관점에서, 자연어 처리와 같은 다른 도메인에서 잘 작동하는 고용량 시퀀스 예측 모델이 RL 문제에도 효과적인 해결책을 제공할 수 있는지 고려해 볼 만합니다. 이를 위해 우리는 RL을 시퀀스 모델링의 도구로 어떻게 다룰 수 있는지 탐구하며, 트랜스포머 아키텍처를 사용하여 궤적에 대한 분포를 모델링하고 빔 서치를 계획 알고리즘으로 재활용합니다. RL을 시퀀스 모델링 문제로 프레임화하면 다양한 설계 결정이 단순화되어, 오프라인 RL 알고리즘에서 흔히 볼 수 있는 많은 구성 요소를 제거할 수 있습니다. 우리는 이 접근 방식의 유연성을 장기 동역학 예측, 모방 학습, 목표 조건부 RL, 오프라인 RL에 걸쳐 입증합니다. 더 나아가, 이 접근 방식을 기존의 모델 프리 알고리즘과 결합하여 희소 보상, 장기 과제에서 최신 계획기(planner)를 얻을 수 있음을 보여줍니다.* + +이 모델은 [CarlCochet](https://huggingface.co/CarlCochet)에 의해 기여되었습니다. +원본 코드는 [이곳](https://github.com/jannerm/trajectory-transformer)에서 확인할 수 있습니다. + +## 사용 팁[[usage-tips]] + +이 트랜스포머는 심층 강화학습에 사용됩니다. 사용하려면 이전의 모든 타임스텝에서의 행동, 상태, 보상으로부터 시퀀스를 생성해야 합니다. 이 모델은 이 모든 요소를 함께 하나의 큰 시퀀스(궤적)로 취급합니다. + +## TrajectoryTransformerConfig[[transformers.TrajectoryTransformerConfig]] + +[[autodoc]] TrajectoryTransformerConfig + +## TrajectoryTransformerModel[[transformers.TrajectoryTransformerModel]] + +[[autodoc]] TrajectoryTransformerModel + - forward diff --git a/docs/source/ko/model_doc/vit.md b/docs/source/ko/model_doc/vit.md new file mode 100644 index 00000000000000..5f3eb3342718b1 --- /dev/null +++ b/docs/source/ko/model_doc/vit.md @@ -0,0 +1,172 @@ + + +# Vision Transformer (ViT) [[vision-transformer-vit]] + +## 개요 [[overview]] + +Vision Transformer (ViT) 모델은 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby가 제안한 논문 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)에서 소개되었습니다. 이는 Transformer 인코더를 ImageNet에서 성공적으로 훈련시킨 첫 번째 논문으로, 기존의 잘 알려진 합성곱 신경망(CNN) 구조와 비교해 매우 우수한 결과를 달성했습니다. + +논문의 초록은 다음과 같습니다: + +*Transformer 아키텍처는 자연어 처리 작업에서 사실상 표준으로 자리 잡았으나, 컴퓨터 비전 분야에서의 적용은 여전히 제한적입니다. 비전에서 어텐션 메커니즘은 종종 합성곱 신경망(CNN)과 결합하여 사용되거나, 전체 구조를 유지하면서 합성곱 신경망의 특정 구성 요소를 대체하는 데 사용됩니다. 우리는 이러한 CNN 의존성이 필요하지 않으며, 이미지 패치를 순차적으로 입력받는 순수한 Transformer가 이미지 분류 작업에서 매우 우수한 성능을 발휘할 수 있음을 보여줍니다. 
대규모 데이터로 사전 학습된 후, ImageNet, CIFAR-100, VTAB 등 다양한 중소형 이미지 인식 벤치마크에 적용하면 Vision Transformer(ViT)는 최신 합성곱 신경망과 비교해 매우 우수한 성능을 발휘하면서도 훈련에 필요한 계산 자원을 상당히 줄일 수 있습니다.* + + + + ViT 아키텍처. 원본 논문에서 발췌. + +원래의 Vision Transformer에 이어, 여러 후속 연구들이 진행되었습니다: + + +- [DeiT](deit) (Data-efficient Image Transformers) (Facebook AI 개발). DeiT 모델은 distilled vision transformers입니다. + DeiT의 저자들은 더 효율적으로 훈련된 ViT 모델도 공개했으며, 이는 [`ViTModel`] 또는 [`ViTForImageClassification`]에 바로 사용할 수 있습니다. 여기에는 3가지 크기로 4개의 변형이 제공됩니다: *facebook/deit-tiny-patch16-224*, *facebook/deit-small-patch16-224*, *facebook/deit-base-patch16-224* and *facebook/deit-base-patch16-384*. 그리고 모델에 이미지를 준비하려면 [`DeiTImageProcessor`]를 사용해야 한다는 점에 유의하십시오. + +- [BEiT](beit) (BERT pre-training of Image Transformers) (Microsoft Research 개발). BEiT 모델은 BERT (masked image modeling)에 영감을 받고 VQ-VAE에 기반한 self-supervised 방법을 이용하여 supervised pre-trained vision transformers보다 더 우수한 성능을 보입니다. + +- DINO (Vision Transformers의 self-supervised 훈련을 위한 방법) (Facebook AI 개발). DINO 방법으로 훈련된 Vision Transformer는 학습되지 않은 상태에서도 객체를 분할할 수 있는 합성곱 신경망에서는 볼 수 없는 매우 흥미로운 능력을 보여줍니다. DINO 체크포인트는 [hub](https://huggingface.co/models?other=dino)에서 찾을 수 있습니다. + +- [MAE](vit_mae) (Masked Autoencoders) (Facebook AI 개발). Vision Transformer를 비대칭 인코더-디코더 아키텍처를 사용하여 마스크된 패치의 높은 비율(75%)에서 픽셀 값을 재구성하도록 사전 학습함으로써, 저자들은 이 간단한 방법이 미세 조정 후 supervised 방식의 사전 학습을 능가한다는 것을 보여주었습니다. + +이 모델은 [nielsr](https://huggingface.co/nielsr)에 의해 기여되었습니다. 원본 코드(JAX로 작성됨)은 [여기](https://github.com/google-research/vision_transformer)에서 확인할 수 있습니다. + + +참고로, 우리는 Ross Wightman의 [timm 라이브러리](https://github.com/rwightman/pytorch-image-models)에서 JAX에서 PyTorch로 변환된 가중치를 다시 변환했습니다. 모든 공로는 그에게 돌립니다! + +## 사용 팁 [[usage-tips]] + +- Transformer 인코더에 이미지를 입력하기 위해, 각 이미지는 고정 크기의 겹치지 않는 패치들로 분할된 후 선형 임베딩됩니다. 전체 이미지를 대표하는 [CLS] 토큰이 추가되어, 분류에 사용할 수 있습니다. 저자들은 또한 절대 위치 임베딩을 추가하여, 결과적으로 생성된 벡터 시퀀스를 표준 Transformer 인코더에 입력합니다. +- Vision Transformer는 모든 이미지가 동일한 크기(해상도)여야 하므로, [ViTImageProcessor]를 사용하여 이미지를 모델에 맞게 리사이즈(또는 리스케일)하고 정규화할 수 있습니다. +- 사전 학습이나 미세 조정 시 사용된 패치 해상도와 이미지 해상도는 각 체크포인트의 이름에 반영됩니다. 예를 들어, `google/vit-base-patch16-224`는 패치 해상도가 16x16이고 미세 조정 해상도가 224x224인 기본 크기 아키텍처를 나타냅니다. 모든 체크포인트는 [hub](https://huggingface.co/models?search=vit)에서 확인할 수 있습니다. +- 사용할 수 있는 체크포인트는 (1) [ImageNet-21k](http://www.image-net.org/) (1,400만 개의 이미지와 21,000개의 클래스)에서만 사전 학습되었거나, 또는 (2) [ImageNet](http://www.image-net.org/challenges/LSVRC/2012/) (ILSVRC 2012, 130만 개의 이미지와 1,000개의 클래스)에서 추가로 미세 조정된 경우입니다. +- Vision Transformer는 224x224 해상도로 사전 학습되었습니다. 미세 조정 시, 사전 학습보다 더 높은 해상도를 사용하는 것이 유리한 경우가 많습니다 ([(Touvron et al., 2019)](https://arxiv.org/abs/1906.06423), [(Kolesnikovet al., 2020)](https://arxiv.org/abs/1912.11370). 더 높은 해상도로 미세 조정하기 위해, 저자들은 원본 이미지에서의 위치에 따라 사전 학습된 위치 임베딩의 2D 보간(interpolation)을 수행합니다. +- 최고의 결과는 supervised 방식의 사전 학습에서 얻어졌으며, 이는 NLP에서는 해당되지 않는 경우가 많습니다. 저자들은 마스크된 패치 예측(마스크된 언어 모델링에서 영감을 받은 self-supervised 사전 학습 목표)을 사용한 실험도 수행했습니다. 이 접근 방식으로 더 작은 ViT-B/16 모델은 ImageNet에서 79.9%의 정확도를 달성하였으며, 이는 처음부터 학습한 것보다 2% 개선된 결과이지만, 여전히 supervised 사전 학습보다 4% 낮습니다. + +### Scaled Dot Product Attention (SDPA) 사용하기 [[using-scaled-dot-product-attention-sdpa]] + +PyTorch는 `torch.nn.functional`의 일부로서 native scaled dot-product attention (SDPA) 연산자를 포함하고 있습니다. 이 함수는 입력 및 사용 중인 하드웨어에 따라 여러 구현 방식을 적용할 수 있습니다.자세한 내용은 [공식 문서](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)나 [GPU 추론](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) 페이지를 참조하십시오. 
+ +SDPA는 `torch>=2.1.1`에서 구현이 가능한 경우 기본적으로 사용되지만, `from_pretrained()`에서 `attn_implementation="sdpa"`로 설정하여 SDPA를 명시적으로 요청할 수도 있습니다. + +``` +from transformers import ViTForImageClassification +model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +최적의 속도 향상을 위해 모델을 반정밀도(예: `torch.float16` 또는 `torch.bfloat16`)로 로드하는 것을 권장합니다. + +로컬 벤치마크(A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04)에서 `float32`와 `google/vit-base-patch16-224` 모델을 사용한 추론 시, 다음과 같은 속도 향상을 확인했습니다. + +| Batch size | Average inference time (ms), eager mode | Average inference time (ms), sdpa model | Speed up, Sdpa / Eager (x) | +|--------------|-------------------------------------------|-------------------------------------------|------------------------------| +| 1 | 7 | 6 | 1.17 | +| 2 | 8 | 6 | 1.33 | +| 4 | 8 | 6 | 1.33 | +| 8 | 8 | 6 | 1.33 | + +## 리소스 [[resources]] + +ViT의 추론 및 커스텀 데이터에 대한 미세 조정과 관련된 데모 노트북은 [여기](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer)에서 확인할 수 있습니다. Hugging Face에서 공식적으로 제공하는 자료와 커뮤니티(🌎로 표시된) 자료 목록은 ViT를 시작하는 데 도움이 될 것입니다. 이 목록에 포함될 자료를 제출하고 싶다면 Pull Request를 열어 주시면 검토하겠습니다. 새로운 내용을 설명하는 자료가 가장 이상적이며, 기존 자료를 중복하지 않도록 해주십시오. + +`ViTForImageClassification` 은 다음에서 지원됩니다: + + +- [Hugging Face Transformers로 ViT를 이미지 분류에 맞게 미세 조정하는 방법](https://huggingface.co/blog/fine-tune-vit)에 대한 블로그 포스트 +- [Hugging Face Transformers와 `Keras`를 사용한 이미지 분류](https://www.philschmid.de/image-classification-huggingface-transformers-keras)에 대한 블로그 포스트 +- [Hugging Face Transformers를 사용한 이미지 분류 미세 조정](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb)에 대한 노트북 +- [Hugging Face Trainer로 CIFAR-10에서 Vision Transformer 미세 조정](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb)에 대한 노트북 +- [PyTorch Lightning으로 CIFAR-10에서 Vision Transformer 미세 조정](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb)에 대한 노트북 + +⚗️ 최적화 + +- [Optimum을 사용한 양자화를 통해 Vision Transformer(ViT) 가속](https://www.philschmid.de/optimizing-vision-transformer)에 대한 블로그 포스트 + +⚡️ 추론 + +- [Google Brain의 Vision Transformer(ViT) 빠른 데모](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Quick_demo_of_HuggingFace_version_of_Vision_Transformer_inference.ipynb)에 대한 노트북 + +🚀 배포 + +- [TF Serving으로 Hugging Face에서 Tensorflow Vision 모델 배포](https://huggingface.co/blog/tf-serving-vision)에 대한 블로그 포스트 +- [Vertex AI에서 Hugging Face ViT 배포](https://huggingface.co/blog/deploy-vertex-ai)에 대한 블로그 포스트 +- [TF Serving을 사용하여 Kubernetes에서 Hugging Face ViT 배포](https://huggingface.co/blog/deploy-tfserving-kubernetes)에 대한 블로그 포스트 + +## ViTConfig [[transformers.ViTConfig]] + +[[autodoc]] ViTConfig + +## ViTFeatureExtractor [[transformers.ViTFeatureExtractor]] + +[[autodoc]] ViTFeatureExtractor + - __call__ + +## ViTImageProcessor [[transformers.ViTImageProcessor]] + +[[autodoc]] ViTImageProcessor + - preprocess + +## ViTImageProcessorFast [[transformers.ViTImageProcessorFast]] + +[[autodoc]] ViTImageProcessorFast + - preprocess + + + + +## ViTModel [[transformers.ViTModel]] + +[[autodoc]] ViTModel + - forward + +## ViTForMaskedImageModeling [[transformers.ViTForMaskedImageModeling]] + +[[autodoc]] ViTForMaskedImageModeling + - forward + +## ViTForImageClassification 
[[transformers.ViTForImageClassification]] + +[[autodoc]] ViTForImageClassification + - forward + + + + +## TFViTModel [[transformers.TFViTModel]] + +[[autodoc]] TFViTModel + - call + +## TFViTForImageClassification [[transformers.TFViTForImageClassification]] + +[[autodoc]] TFViTForImageClassification + - call + + + + +## FlaxVitModel [[transformers.FlaxViTModel]] + +[[autodoc]] FlaxViTModel + - __call__ + +## FlaxViTForImageClassification [[transformers.FlaxViTForImageClassification]] + +[[autodoc]] FlaxViTForImageClassification + - __call__ + + + \ No newline at end of file diff --git a/docs/source/ko/modular_transformers.md b/docs/source/ko/modular_transformers.md new file mode 100644 index 00000000000000..096dd4f13a9889 --- /dev/null +++ b/docs/source/ko/modular_transformers.md @@ -0,0 +1,96 @@ +# 모듈식 트랜스포머 [[modular-transformers]] + +`transformers`는 opinionated(자기 의견이 강한) 프레임워크이며, 우리의 철학은 다음의 [개념 가이드](./philosophy)에 정의되어 있습니다. + +이 철학의 핵심은 라이브러리의 [단일 모델, 단일 파일](https://huggingface.co/blog/transformers-design-philosophy) 측면에서 잘 나타납니다. 이 구성 요소의 단점은 파일 간에 구성 요소의 상속과 임포트 가능성을 제한한다는 것입니다. + +그 결과, 모델 구성 요소가 여러 파일에 걸쳐 반복되는 경향이 있습니다. `transformers`에는 모델 수만큼 많은 어텐션 레이어가 정의되어 있으며, 그 중 상당수는 서로 동일합니다. 안타깝게도, 수정과 변경 사항이 코드의 특정 부분에 적용되면서 독립적인 구현들이 서로 분기되는 경향이 있습니다. + +이 문제를 적절히 해결하기 위해, 우리는 라이브러리 전체에 "복사본"의 개념을 도입했습니다. 코드가 다른 코드의 복사본임을 나타내는 주석을 추가함으로써, CI 및 로컬 명령을 통해 복사본이 분기되지 않도록 강제할 수 있습니다. 그러나 복잡성이 낮더라도 이는 종종 매우 번거로운 작업입니다. + +마지막으로, 이 방식은 우리가 줄이고자 하는 상당한 오버헤드를 모델 기여 과정에 추가하게 됩니다. 이 접근 방식은 종종 모델 기여에 모델링 코드(~1,000줄), 프로세서(~500줄), 테스트, 문서 등을 추가해야 합니다. 모델 기여 PR은 대부분 3,000~5,000줄 이상의 코드를 추가하며, 이 중 많은 부분이 보일러플레이트(boilerplate) 코드입니다. + +이는 기여의 장벽을 높이며, 모듈식 트랜스포머를 통해 우리는 이러한 장벽을 훨씬 더 수용 가능한 수준으로 낮추고자 합니다. + +## 무엇인가요 [[what-is-it]] + +모듈식 트랜스포머는 모델 폴더에 "모듈식" 파일의 개념을 도입합니다. 이 모듈식 파일은 일반적으로 모델링/프로세싱 파일에서 허용되지 않는 코드를 허용하며, 이는 인접한 모델로부터의 임포트와 클래스 간의 상속을 허용합니다. + +이 모듈식 파일은 각각의 별도의 모듈에서 정의되었을 모델, 프로세서 및 구성 클래스를 정의합니다. + +마지막으로, 이 기능은 모듈식 파일을 "풀어내어" 단일 모델, 단일 파일 디렉토리 구조로 변환하는 새로운 `linter`를 도입합니다. 이 파일들은 스크립트가 실행될 때마다 자동으로 생성되며, 기여해야 할 내용을 모듈식 파일, 그리고 기여된 모델과 다른 모델 간의 차이점으로만 줄여줍니다. + +모델 사용자는 단일 파일 인터페이스를 임포트하고 사용하게 되므로, 여기에는 변화가 없을 것입니다. 이를 통해 간단한 기여를 가능하게 하면서도 우리의 철학을 유지하는 양쪽의 장점을 결합하고자 합니다. + +따라서 이는 `# Copied from` 마커의 대체품이며, 이전에 기여된 모델은 앞으로 몇 달 내에 새로운 모듈식 트랜스포머 형식으로 전환될 예정입니다. + +### 자세한 내용 [[details]] + +“linter”는 상속 구조를 풀어서 모듈화된 파일로부터 모든 단일 파일을 생성하며, Python 사용자들에게는 그 과정이 보이지 않도록 동작합니다. 현재 linter는 **단일** 수준의 상속만을 평탄화합니다. + +예를 들어: +- 구성 클래스가 다른 클래스를 상속하고 인자를 추가/삭제하는 경우, 생성된 파일은 직접 참조(추가의 경우)하거나 완전히 제거합니다(삭제의 경우). +- 클래스가 다른 클래스를 상속하는 경우, 예를 들어 class GemmaModel(LlamaModel): 의 경우, 종속성이 자동으로 추론됩니다. 모든 서브모듈은 슈퍼클래스로부터 자동으로 추론됩니다. + +토크나이저, 이미지 프로세서, 모델, 구성 등을 이 `modular` 파일에 모두 작성할 수 있으며, 해당 파일들이 자동으로 생성됩니다. + +### 시행 [[enforcement]] + +[TODO] 우리는 새로운 테스트를 도입하여 생성된 콘텐츠가 `modular_xxxx.py`에 있는 내용과 일치하는지 확인합니다. + +### 예시 [[examples]] + +여기 BERT와 RoBERTa의 간단한 예가 있습니다. 두 모델은 밀접하게 관련되어 있으며, 모델 구현의 차이는 임베딩 레이어의 변경에서만 있습니다. + +모델을 완전히 재정의하는 대신, `modular_roberta.py` 파일은 모델링 및 구성 클래스를 위해 다음과 같이 생겼습니다. (예시를 위해, 토크나이저는 매우 다르므로 일단 무시합니다.) 
+ +```python +from torch import nn +from ..bert.configuration_bert import BertConfig +from ..bert.modeling_bert import ( + BertModel, + BertEmbeddings, + BertForMaskedLM +) + +# RoBERTa 구성은 BERT의 구성과 동일합니다 +class RobertaConfig(BertConfig): + model_type = 'roberta' + +# 여기서 패딩 ID 차이를 강조하기 위해 임베딩을 재정의하고, 위치 임베딩을 재정의합니다 +class RobertaEmbeddings(BertEmbeddings): + def __init__(self, config): + super().__init__(config()) + + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + +# RoBERTa 모델은 임베딩 레이어를 제외하면 BERT 모델과 동일합니다. +# 위에서 임베딩을 재정의했으므로, 여기서는 추가 작업이 필요 없습니다 +class RobertaModel(BertModel): + def __init__(self, config): + super().__init__(config) + self.embeddings = RobertaEmbeddings(config) + +# 헤드는 이제 내부에서 올바른 `RobertaModel`을 재정의하기만 하면 됩니다 +class RobertaForMaskedLM(BertForMaskedLM): + def __init__(self, config): + super().__init__(config) + self.model = RobertaModel(config) +``` + +정의한 종속성을 사용하지 않으면 다음과 같은 오류가 발생합니다: + +```bash +ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should be used + when you define `BertModel`, as it is one of it's direct dependencies. Make sure + you use it in the `__init__` function. +``` + +또한, 다음에서 예시 목록을 찾을 수 있습니다: + +## 무엇이 아닌가요 [[what-it-is-not]] + +(아직은?) 모델링 코드를 대체하는 것은 아닙니다. 그리고 여러분의 모델이 지금까지 존재했던 다른 어떤 것에도 기반하지 않는다면, 기존과 같이 `modeling` 파일을 추가할 수 있습니다. diff --git a/docs/source/ko/quicktour.md b/docs/source/ko/quicktour.md index 0dc4887b8894b3..06f44e6fd2970c 100644 --- a/docs/source/ko/quicktour.md +++ b/docs/source/ko/quicktour.md @@ -486,7 +486,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], ... args=training_args, ... train_dataset=dataset["train"], ... eval_dataset=dataset["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... ) # doctest: +SKIP ``` @@ -554,4 +554,4 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], ## 다음 단계는 무엇인가요? [[whats-next]] -🤗 Transformers 둘러보기를 모두 읽으셨다면, 가이드를 살펴보고 더 구체적인 것을 수행하는 방법을 알아보세요. 이를테면 커스텀 모델 구축하는 방법, 과업에 알맞게 모델을 미세조정하는 방법, 스크립트로 모델 훈련하는 방법 등이 있습니다. 🤗 Transformers 핵심 개념에 대해 더 알아보려면 커피 한 잔 들고 개념 가이드를 살펴보세요! \ No newline at end of file +🤗 Transformers 둘러보기를 모두 읽으셨다면, 가이드를 살펴보고 더 구체적인 것을 수행하는 방법을 알아보세요. 이를테면 커스텀 모델 구축하는 방법, 과업에 알맞게 모델을 미세조정하는 방법, 스크립트로 모델 훈련하는 방법 등이 있습니다. 🤗 Transformers 핵심 개념에 대해 더 알아보려면 커피 한 잔 들고 개념 가이드를 살펴보세요! diff --git a/docs/source/ko/tasks/asr.md b/docs/source/ko/tasks/asr.md index 2247537678abea..d1e4a5e1d91983 100644 --- a/docs/source/ko/tasks/asr.md +++ b/docs/source/ko/tasks/asr.md @@ -20,7 +20,7 @@ rendered properly in your Markdown viewer. -자동 음성 인식(Automatic Speech Recognition, ASR)은 음성 신호를 텍스트로 변환하여 음성 입력 시퀀스를 텍스트 출력에 매핑합니다. +자동 음성 인식(Automatic Speech Recognition, ASR)은 음성 신호를 텍스트로 변환하여 음성 입력 시퀀스를 텍스트 출력에 매핑합니다. Siri와 Alexa와 같은 가상 어시스턴트는 ASR 모델을 사용하여 일상적으로 사용자를 돕고 있으며, 회의 중 라이브 캡션 및 메모 작성과 같은 유용한 사용자 친화적 응용 프로그램도 많이 있습니다. 이 가이드에서 소개할 내용은 아래와 같습니다: @@ -50,7 +50,7 @@ Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에 ## MInDS-14 데이터 세트 가져오기[[load-minds-14-dataset]] -먼저, 🤗 Datasets 라이브러리에서 [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) 데이터 세트의 일부분을 가져오세요. +먼저, 🤗 Datasets 라이브러리에서 [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) 데이터 세트의 일부분을 가져오세요. 이렇게 하면 전체 데이터 세트에 대한 훈련에 시간을 들이기 전에 모든 것이 작동하는지 실험하고 검증할 수 있습니다. ```py @@ -198,7 +198,7 @@ MInDS-14 데이터 세트의 샘플링 레이트는 8000kHz이므로([데이터 ## 평가하기[[evaluate]] -훈련 중에 평가 지표를 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다. 
🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리를 사용하면 평가 방법을 빠르게 불러올 수 있습니다. +훈련 중에 평가 지표를 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다. 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리를 사용하면 평가 방법을 빠르게 불러올 수 있습니다. 이 작업에서는 [단어 오류율(Word Error Rate, WER)](https://huggingface.co/spaces/evaluate-metric/wer) 평가 지표를 가져옵니다. (평가 지표를 불러오고 계산하는 방법은 🤗 Evaluate [둘러보기](https://huggingface.co/docs/evaluate/a_quick_tour)를 참조하세요): @@ -285,7 +285,7 @@ MInDS-14 데이터 세트의 샘플링 레이트는 8000kHz이므로([데이터 ... args=training_args, ... train_dataset=encoded_minds["train"], ... eval_dataset=encoded_minds["test"], -... tokenizer=processor.feature_extractor, +... processing_class=processor.feature_extractor, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) @@ -372,4 +372,4 @@ MInDS-14 데이터 세트의 샘플링 레이트는 8000kHz이므로([데이터 ['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] ``` - \ No newline at end of file + diff --git a/docs/source/ko/tasks/audio_classification.md b/docs/source/ko/tasks/audio_classification.md index 73932100b0cb3a..936b4eb1989827 100644 --- a/docs/source/ko/tasks/audio_classification.md +++ b/docs/source/ko/tasks/audio_classification.md @@ -235,7 +235,7 @@ MinDS-14 데이터 세트의 샘플링 속도는 8000khz이므로(이 정보는 ... args=training_args, ... train_dataset=encoded_minds["train"], ... eval_dataset=encoded_minds["test"], -... tokenizer=feature_extractor, +... processing_class=feature_extractor, ... compute_metrics=compute_metrics, ... ) @@ -321,4 +321,4 @@ For a more in-depth example of how to finetune a model for audio classification, 'cash_deposit' ``` - \ No newline at end of file + diff --git a/docs/source/ko/tasks/document_question_answering.md b/docs/source/ko/tasks/document_question_answering.md index 3d943ab96e6765..6c2d04f4ee8598 100644 --- a/docs/source/ko/tasks/document_question_answering.md +++ b/docs/source/ko/tasks/document_question_answering.md @@ -18,8 +18,8 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -문서 시각적 질의 응답(Document Visual Question Answering)이라고도 하는 -문서 질의 응답(Document Question Answering)은 문서 이미지에 대한 질문에 답변을 주는 태스크입니다. +문서 시각적 질의 응답(Document Visual Question Answering)이라고도 하는 +문서 질의 응답(Document Question Answering)은 문서 이미지에 대한 질문에 답변을 주는 태스크입니다. 이 태스크를 지원하는 모델의 입력은 일반적으로 이미지와 질문의 조합이고, 출력은 자연어로 된 답변입니다. 이러한 모델은 텍스트, 단어의 위치(바운딩 박스), 이미지 등 다양한 모달리티를 활용합니다. 이 가이드는 다음 내용을 설명합니다: @@ -72,7 +72,7 @@ pip install -q pytesseract ## 데이터 불러오기 [[load-the-data]] -이 가이드에서는 🤗 Hub에서 찾을 수 있는 전처리된 DocVQA의 작은 샘플을 사용합니다. +이 가이드에서는 🤗 Hub에서 찾을 수 있는 전처리된 DocVQA의 작은 샘플을 사용합니다. DocVQA의 전체 데이터 세트를 사용하고 싶다면, [DocVQA homepage](https://rrc.cvc.uab.es/?ch=17)에 가입 후 다운로드 할 수 있습니다. 전체 데이터 세트를 다운로드 했다면, 이 가이드를 계속 진행하기 위해 [🤗 dataset에 파일을 가져오는 방법](https://huggingface.co/docs/datasets/loading#local-and-remote-files)을 확인하세요. ```py @@ -124,9 +124,9 @@ DatasetDict({ >>> updated_dataset = updated_dataset.filter(lambda x: len(x["words"]) + len(x["question"].split()) < 512) ``` -이 시점에서 이 데이터 세트의 OCR 특성도 제거해 보겠습니다. OCR 특성은 다른 모델을 미세 조정하기 위한 것으로, 이 가이드에서 사용하는 모델의 입력 요구 사항과 일치하지 않기 때문에 이 특성을 사용하기 위해서는 일부 처리가 필요합니다. +이 시점에서 이 데이터 세트의 OCR 특성도 제거해 보겠습니다. OCR 특성은 다른 모델을 미세 조정하기 위한 것으로, 이 가이드에서 사용하는 모델의 입력 요구 사항과 일치하지 않기 때문에 이 특성을 사용하기 위해서는 일부 처리가 필요합니다. 대신, 원본 데이터에 [`LayoutLMv2Processor`]를 사용하여 OCR 및 토큰화를 모두 수행할 수 있습니다. -이렇게 하면 모델이 요구하는 입력을 얻을 수 있습니다. +이렇게 하면 모델이 요구하는 입력을 얻을 수 있습니다. 이미지를 수동으로 처리하려면, [`LayoutLMv2` model documentation](../model_doc/layoutlmv2)에서 모델이 요구하는 입력 포맷을 확인해보세요. 
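아래는 위 설명을 보여주기 위한 최소한의 스케치입니다. 체크포인트 이름(`microsoft/layoutlmv2-base-uncased`)과 이미지 경로, 질문 문자열은 예시를 위한 가정이므로, 실제로는 이 가이드에서 사용하는 체크포인트와 데이터를 그대로 사용하면 됩니다.

```py
>>> from PIL import Image
>>> from transformers import LayoutLMv2Processor

>>> # 가정: 예시용 체크포인트와 입력입니다
>>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> image = Image.open("document.png").convert("RGB")
>>> question = "What is the invoice number?"

>>> # 프로세서가 OCR(pytesseract)과 토큰화를 한 번에 수행하여 모델이 요구하는 입력을 만듭니다
>>> encoding = processor(image, question, truncation=True, return_tensors="pt")
>>> sorted(encoding.keys())  # input_ids, attention_mask, token_type_ids, bbox, image 등이 포함됩니다
```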
```py @@ -186,7 +186,7 @@ DatasetDict({ ### 텍스트 데이터 전처리 [[preprocessing-text-data]] 이미지에 OCR을 적용했으면 데이터 세트의 텍스트 부분을 모델에 맞게 인코딩해야 합니다. -이 인코딩에는 이전 단계에서 가져온 단어와 박스를 토큰 수준의 `input_ids`, `attention_mask`, `token_type_ids` 및 `bbox`로 변환하는 작업이 포함됩니다. +이 인코딩에는 이전 단계에서 가져온 단어와 박스를 토큰 수준의 `input_ids`, `attention_mask`, `token_type_ids` 및 `bbox`로 변환하는 작업이 포함됩니다. 텍스트를 전처리하려면 프로세서의 `tokenizer`가 필요합니다. ```py @@ -197,8 +197,8 @@ DatasetDict({ 레이블 추가를 위해서, 먼저 더 큰 리스트(단어 리스트)에서 하위 리스트(단어로 분할된 답변)을 찾을 수 있는 헬퍼 함수를 정의합니다. -이 함수는 `words_list`와 `answer_list`, 이렇게 두 리스트를 입력으로 받습니다. -그런 다음 `words_list`를 반복하여 `words_list`의 현재 단어(words_list[i])가 `answer_list`의 첫 번째 단어(answer_list[0])와 같은지, +이 함수는 `words_list`와 `answer_list`, 이렇게 두 리스트를 입력으로 받습니다. +그런 다음 `words_list`를 반복하여 `words_list`의 현재 단어(words_list[i])가 `answer_list`의 첫 번째 단어(answer_list[0])와 같은지, 현재 단어에서 시작해 `answer_list`와 같은 길이만큼의 `words_list`의 하위 리스트가 `answer_list`와 일치하는지 확인합니다. 이 조건이 참이라면 일치하는 항목을 발견했음을 의미하며, 함수는 일치 항목, 시작 인덱스(idx) 및 종료 인덱스(idx + len(answer_list) - 1)를 기록합니다. 일치하는 항목이 두 개 이상 발견되면 함수는 첫 번째 항목만 반환합니다. 일치하는 항목이 없다면 함수는 (`None`, 0, 0)을 반환합니다. @@ -349,7 +349,7 @@ end_index 18 ## 훈련 [[train]] -축하합니다! 이 가이드의 가장 어려운 부분을 성공적으로 처리했으니 이제 나만의 모델을 훈련할 준비가 되었습니다. +축하합니다! 이 가이드의 가장 어려운 부분을 성공적으로 처리했으니 이제 나만의 모델을 훈련할 준비가 되었습니다. 훈련은 다음과 같은 단계로 이루어져 있습니다: * 전처리에서의 동일한 체크포인트를 사용하기 위해 [`AutoModelForDocumentQuestionAnswering`]으로 모델을 가져옵니다. * [`TrainingArguments`]로 훈련 하이퍼파라미터를 정합니다. @@ -406,7 +406,7 @@ end_index 18 ... data_collator=data_collator, ... train_dataset=encoded_train_dataset, ... eval_dataset=encoded_test_dataset, -... tokenizer=processor, +... processing_class=processor, ... ) >>> trainer.train() @@ -421,7 +421,7 @@ end_index 18 ## 추론 [[inference]] -이제 LayoutLMv2 모델을 미세 조정하고 🤗 Hub에 업로드했으니 추론에도 사용할 수 있습니다. +이제 LayoutLMv2 모델을 미세 조정하고 🤗 Hub에 업로드했으니 추론에도 사용할 수 있습니다. 추론을 위해 미세 조정된 모델을 사용해 보는 가장 간단한 방법은 [`Pipeline`]을 사용하는 것 입니다. 예를 들어 보겠습니다: @@ -473,4 +473,4 @@ end_index 18 >>> processor.tokenizer.decode(encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]) 'lee a. waller' -``` \ No newline at end of file +``` diff --git a/docs/source/ko/tasks/image_classification.md b/docs/source/ko/tasks/image_classification.md index 91ff3a9ca9b848..4955bd6cdf8108 100644 --- a/docs/source/ko/tasks/image_classification.md +++ b/docs/source/ko/tasks/image_classification.md @@ -157,7 +157,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에 과적합을 방지하고 모델을 보다 견고하게 만들기 위해 데이터 세트의 훈련 부분에 데이터 증강을 추가합니다. 여기서 Keras 전처리 레이어로 훈련 데이터에 대한 변환(데이터 증강 포함)과 -검증 데이터에 대한 변환(중앙 크로핑, 크기 조정, 정규화만)을 정의합니다. +검증 데이터에 대한 변환(중앙 크로핑, 크기 조정, 정규화만)을 정의합니다. `tf.image` 또는 다른 원하는 라이브러리를 사용할 수 있습니다. ```py @@ -241,7 +241,7 @@ food["test"].set_transform(preprocess_val) ## 평가[[evaluate]] 훈련 중에 평가 지표를 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다. -🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리로 평가 방법을 빠르게 가져올 수 있습니다. 이 작업에서는 +🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리로 평가 방법을 빠르게 가져올 수 있습니다. 이 작업에서는 [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) 평가 지표를 가져옵니다. (🤗 Evaluate [빠른 둘러보기](https://huggingface.co/docs/evaluate/a_quick_tour)를 참조하여 평가 지표를 가져오고 계산하는 방법에 대해 자세히 알아보세요): ```py @@ -317,7 +317,7 @@ food["test"].set_transform(preprocess_val) ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... tokenizer=image_processor, +... processing_class=image_processor, ... compute_metrics=compute_metrics, ... 
) @@ -404,7 +404,7 @@ TensorFlow에서 모델을 미세 조정하려면 다음 단계를 따르세요: ``` 예측에서 정확도를 계산하고 모델을 🤗 Hub로 푸시하려면 [Keras callbacks](../main_classes/keras_callbacks)를 사용하세요. -`compute_metrics` 함수를 [KerasMetricCallback](../main_classes/keras_callbacks#transformers.KerasMetricCallback)에 전달하고, +`compute_metrics` 함수를 [KerasMetricCallback](../main_classes/keras_callbacks#transformers.KerasMetricCallback)에 전달하고, [PushToHubCallback](../main_classes/keras_callbacks#transformers.PushToHubCallback)을 사용하여 모델을 업로드합니다: ```py diff --git a/docs/source/ko/tasks/multiple_choice.md b/docs/source/ko/tasks/multiple_choice.md index 607bc047479ce1..f2755da4a8bf37 100644 --- a/docs/source/ko/tasks/multiple_choice.md +++ b/docs/source/ko/tasks/multiple_choice.md @@ -270,7 +270,7 @@ tokenized_swag = swag.map(preprocess_function, batched=True) ... args=training_args, ... train_dataset=tokenized_swag["train"], ... eval_dataset=tokenized_swag["validation"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/ko/tasks/object_detection.md b/docs/source/ko/tasks/object_detection.md index 2b92d7edb59ff7..e027ad65a9ada8 100644 --- a/docs/source/ko/tasks/object_detection.md +++ b/docs/source/ko/tasks/object_detection.md @@ -361,7 +361,7 @@ DatasetDict({ ... args=training_args, ... data_collator=collate_fn, ... train_dataset=cppe5["train"], -... tokenizer=image_processor, +... processing_class=image_processor, ... ) >>> trainer.train() diff --git a/docs/source/ko/tasks/question_answering.md b/docs/source/ko/tasks/question_answering.md index cebd9e1a78a4b0..8309dd7d753244 100644 --- a/docs/source/ko/tasks/question_answering.md +++ b/docs/source/ko/tasks/question_answering.md @@ -223,7 +223,7 @@ pip install transformers datasets evaluate ... args=training_args, ... train_dataset=tokenized_squad["train"], ... eval_dataset=tokenized_squad["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... ) diff --git a/docs/source/ko/tasks/sequence_classification.md b/docs/source/ko/tasks/sequence_classification.md index b9812e63b0631e..11dae1a965a4f2 100644 --- a/docs/source/ko/tasks/sequence_classification.md +++ b/docs/source/ko/tasks/sequence_classification.md @@ -190,7 +190,7 @@ tokenized_imdb = imdb.map(preprocess_function, batched=True) ... args=training_args, ... train_dataset=tokenized_imdb["train"], ... eval_dataset=tokenized_imdb["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/ko/tasks/summarization.md b/docs/source/ko/tasks/summarization.md index 501aaae7268121..a2b2b1fbc95498 100644 --- a/docs/source/ko/tasks/summarization.md +++ b/docs/source/ko/tasks/summarization.md @@ -223,7 +223,7 @@ Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에 ... args=training_args, ... train_dataset=tokenized_billsum["train"], ... eval_dataset=tokenized_billsum["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... 
) diff --git a/docs/source/ko/tasks/token_classification.md b/docs/source/ko/tasks/token_classification.md index e32a18e1ee0a04..a65503092cee1d 100644 --- a/docs/source/ko/tasks/token_classification.md +++ b/docs/source/ko/tasks/token_classification.md @@ -107,7 +107,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에 >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") ``` -위의 예제 `tokens` 필드를 보면 입력이 이미 토큰화된 것처럼 보입니다. 그러나 실제로 입력은 아직 토큰화되지 않았으므로 단어를 하위 단어로 토큰화하기 위해 `is_split_into_words=True`를 설정해야 합니다. 예제로 확인합니다: +위의 예제 `tokens` 필드를 보면 입력이 이미 토큰화된 것처럼 보입니다. 그러나 실제로 입력은 아직 토큰화되지 않았으므로 단어를 하위 단어로 토큰화하기 위해 `is_split_into_words=True`를 설정해야 합니다. 예제로 확인합니다: ```py >>> example = wnut["train"][0] @@ -294,7 +294,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에 ... args=training_args, ... train_dataset=tokenized_wnut["train"], ... eval_dataset=tokenized_wnut["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) @@ -405,8 +405,8 @@ TensorFlow에서 모델을 파인 튜닝하려면, 먼저 옵티마이저 함수 -토큰 분류를 위한 모델을 파인 튜닝하는 자세한 예제는 다음 -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb) +토큰 분류를 위한 모델을 파인 튜닝하는 자세한 예제는 다음 +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb) 또는 [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)를 참조하세요. diff --git a/docs/source/ko/tasks/translation.md b/docs/source/ko/tasks/translation.md index 88dbb405e13693..5b4eaaa6125a11 100644 --- a/docs/source/ko/tasks/translation.md +++ b/docs/source/ko/tasks/translation.md @@ -221,7 +221,7 @@ pip install transformers datasets evaluate sacrebleu ... args=training_args, ... train_dataset=tokenized_books["train"], ... eval_dataset=tokenized_books["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) diff --git a/docs/source/ko/tasks/video_classification.md b/docs/source/ko/tasks/video_classification.md index f18ef918fa956e..10569083c09f85 100644 --- a/docs/source/ko/tasks/video_classification.md +++ b/docs/source/ko/tasks/video_classification.md @@ -61,7 +61,7 @@ pip install -q pytorchvideo transformers evaluate ``` 데이터 세트의 하위 집합이 다운로드 되면, 압축된 파일의 압축을 해제해야 합니다: -```py +```py >>> import tarfile >>> with tarfile.open(file_path) as t: @@ -124,9 +124,9 @@ UCF101_subset/ 그 다음으로, 데이터 세트에 존재하는 라벨을 추출합니다. 또한, 모델을 초기화할 때 도움이 될 딕셔너리(dictionary data type)를 생성합니다. * `label2id`: 클래스 이름을 정수에 매핑합니다. -* `id2label`: 정수를 클래스 이름에 매핑합니다. +* `id2label`: 정수를 클래스 이름에 매핑합니다. -```py +```py >>> class_labels = sorted({str(path).split("/")[2] for path in all_video_file_paths}) >>> label2id = {label: i for i, label in enumerate(class_labels)} >>> id2label = {i: label for label, i in label2id.items()} @@ -142,7 +142,7 @@ UCF101_subset/ 사전 훈련된 체크포인트와 체크포인트에 연관된 이미지 프로세서를 사용하여 영상 분류 모델을 인스턴스화합니다. 모델의 인코더에는 미리 학습된 매개변수가 제공되며, 분류 헤드(데이터를 분류하는 마지막 레이어)는 무작위로 초기화됩니다. 데이터 세트의 전처리 파이프라인을 작성할 때는 이미지 프로세서가 유용합니다. -```py +```py >>> from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification >>> model_ckpt = "MCG-NJU/videomae-base" @@ -174,7 +174,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it 영상 전처리를 위해 [PyTorchVideo 라이브러리](https://pytorchvideo.org/)를 활용할 것입니다. 필요한 종속성을 가져오는 것으로 시작하세요. 
-```py +```py >>> import pytorchvideo.data >>> from pytorchvideo.transforms import ( @@ -223,7 +223,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it 이제 데이터 세트에 특화된 전처리(transform)과 데이터 세트 자체를 정의합니다. 먼저 훈련 데이터 세트로 시작합니다: -```py +```py >>> train_transform = Compose( ... [ ... ApplyTransformToKey( @@ -252,7 +252,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it 같은 방식의 작업 흐름을 검증과 평가 세트에도 적용할 수 있습니다. -```py +```py >>> val_transform = Compose( ... [ ... ApplyTransformToKey( @@ -296,7 +296,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it ## 더 나은 디버깅을 위해 전처리 영상 시각화하기[[visualize-the-preprocessed-video-for-better-debugging]] -```py +```py >>> import imageio >>> import numpy as np >>> from IPython.display import Image @@ -309,7 +309,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it >>> def create_gif(video_tensor, filename="sample.gif"): ... """Prepares a GIF from a video tensor. -... +... ... The video tensor is expected to have the following shape: ... (num_frames, num_channels, height, width). ... """ @@ -336,13 +336,13 @@ You should probably TRAIN this model on a down-stream task to be able to use it Person playing basketball -## 모델 훈련하기[[train-the-model]] +## 모델 훈련하기[[train-the-model]] 🤗 Transformers의 [`Trainer`](https://huggingface.co/docs/transformers/main_classes/trainer)를 사용하여 모델을 훈련시켜보세요. `Trainer`를 인스턴스화하려면 훈련 설정과 평가 지표를 정의해야 합니다. 가장 중요한 것은 [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments)입니다. 이 클래스는 훈련을 구성하는 모든 속성을 포함하며, 훈련 중 체크포인트를 저장할 출력 폴더 이름을 필요로 합니다. 또한 🤗 Hub의 모델 저장소의 모든 정보를 동기화하는 데 도움이 됩니다. 대부분의 훈련 인수는 따로 설명할 필요는 없습니다. 하지만 여기에서 중요한 인수는 `remove_unused_columns=False` 입니다. 이 인자는 모델의 호출 함수에서 사용되지 않는 모든 속성 열(columns)을 삭제합니다. 기본값은 일반적으로 True입니다. 이는 사용되지 않는 기능 열을 삭제하는 것이 이상적이며, 입력을 모델의 호출 함수로 풀기(unpack)가 쉬워지기 때문입니다. 하지만 이 경우에는 `pixel_values`(모델의 입력으로 필수적인 키)를 생성하기 위해 사용되지 않는 기능('video'가 특히 그렇습니다)이 필요합니다. 따라서 remove_unused_columns을 False로 설정해야 합니다. -```py +```py >>> from transformers import TrainingArguments, Trainer >>> model_name = model_ckpt.split("/")[-1] @@ -387,7 +387,7 @@ def compute_metrics(eval_pred): 또한, 예제를 묶어서 배치를 형성하는 `collate_fn`을 정의해야합니다. 각 배치는 `pixel_values`와 `labels`라는 2개의 키로 구성됩니다. -```py +```py >>> def collate_fn(examples): ... # permute to (num_frames, num_channels, height, width) ... pixel_values = torch.stack( @@ -399,13 +399,13 @@ def compute_metrics(eval_pred): 그런 다음 이 모든 것을 데이터 세트와 함께 `Trainer`에 전달하기만 하면 됩니다: -```py +```py >>> trainer = Trainer( ... model, ... args, ... train_dataset=train_dataset, ... eval_dataset=val_dataset, -... tokenizer=image_processor, +... processing_class=image_processor, ... compute_metrics=compute_metrics, ... data_collator=collate_fn, ... ) @@ -415,7 +415,7 @@ def compute_metrics(eval_pred): `train` 메소드를 호출하여 모델을 미세 조정하세요: -```py +```py >>> train_results = trainer.train() ``` @@ -429,7 +429,7 @@ def compute_metrics(eval_pred): 좋습니다. 이제 미세 조정된 모델을 추론하는 데 사용할 수 있습니다. 
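참고로, 아래는 미세 조정된 모델을 `pipeline`으로 바로 사용해 보는 간단한 스케치입니다. 모델 저장소 이름과 영상 경로는 예시를 위한 가정이므로, 실제로 허브에 푸시한 이름과 가지고 있는 파일 경로로 바꿔 주세요. (영상 디코딩을 위해 `decord` 같은 추가 패키지가 필요할 수 있습니다.)

```py
>>> from transformers import pipeline

>>> # 가정: 위에서 허브에 푸시했다고 가정한 체크포인트 이름입니다
>>> video_cls = pipeline("video-classification", model="my-user/videomae-base-finetuned-ucf101-subset")
>>> video_cls("UCF101_subset/test/BasketballDunk/sample.avi")  # 상위 예측 라벨과 점수 목록을 반환합니다
```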
추론에 사용할 영상을 불러오세요: -```py +```py >>> sample_test_video = next(iter(test_dataset)) ``` @@ -485,7 +485,7 @@ def compute_metrics(eval_pred): `logits`을 디코딩하면, 우리는 다음 결과를 얻을 수 있습니다: -```py +```py >>> predicted_class_idx = logits.argmax(-1).item() >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) # Predicted class: BasketballDunk diff --git a/docs/source/ko/tasks/visual_question_answering.md b/docs/source/ko/tasks/visual_question_answering.md index f8560b14f9b8a1..9bc87c071e62b1 100644 --- a/docs/source/ko/tasks/visual_question_answering.md +++ b/docs/source/ko/tasks/visual_question_answering.md @@ -35,7 +35,7 @@ VQA의 주요 사용 사례는 다음과 같습니다: ## ViLT 미세 조정 [[finetuning-vilt]] ViLT는 Vision Transformer (ViT) 내에 텍스트 임베딩을 포함하여 비전/자연어 사전훈련(VLP; Vision-and-Language Pretraining)을 위한 기본 디자인을 제공합니다. -ViLT 모델은 비전 트랜스포머(ViT)에 텍스트 임베딩을 넣어 비전/언어 사전훈련(VLP; Vision-and-Language Pre-training)을 위한 기본적인 디자인을 갖췄습니다. 이 모델은 여러 다운스트림 작업에 사용할 수 있습니다. VQA 태스크에서는 (`[CLS]` 토큰의 최종 은닉 상태 위에 선형 레이어인) 분류 헤더가 있으며 무작위로 초기화됩니다. +ViLT 모델은 비전 트랜스포머(ViT)에 텍스트 임베딩을 넣어 비전/언어 사전훈련(VLP; Vision-and-Language Pre-training)을 위한 기본적인 디자인을 갖췄습니다. 이 모델은 여러 다운스트림 작업에 사용할 수 있습니다. VQA 태스크에서는 (`[CLS]` 토큰의 최종 은닉 상태 위에 선형 레이어인) 분류 헤더가 있으며 무작위로 초기화됩니다. 따라서 여기에서 시각적 질의응답은 **분류 문제**로 취급됩니다. 최근의 BLIP, BLIP-2, InstructBLIP와 같은 모델들은 VQA를 생성형 작업으로 간주합니다. 가이드의 후반부에서는 이런 모델들을 사용하여 제로샷 VQA 추론을 하는 방법에 대해 설명하겠습니다. @@ -104,7 +104,7 @@ Dataset({ 나머지 특성들은 필요하지 않기 때문에 삭제해도 됩니다: -```py +```py >>> dataset = dataset.remove_columns(['question_type', 'question_id', 'answer_type']) ``` @@ -137,7 +137,7 @@ Dataset({ >>> unique_labels = list(set(flattened_labels)) >>> label2id = {label: idx for idx, label in enumerate(unique_labels)} ->>> id2label = {idx: label for label, idx in label2id.items()} +>>> id2label = {idx: label for label, idx in label2id.items()} ``` 이제 매핑이 완료되었으므로 문자열 답변을 해당 id로 교체하고, 데이터세트의 더 편리한 후처리를 위해 편평화 할 수 있습니다. @@ -159,10 +159,10 @@ Dataset({ ## 데이터 전처리 [[preprocessing-data]] -다음 단계는 모델을 위해 이미지와 텍스트 데이터를 준비하기 위해 ViLT 프로세서를 가져오는 것입니다. +다음 단계는 모델을 위해 이미지와 텍스트 데이터를 준비하기 위해 ViLT 프로세서를 가져오는 것입니다. [`ViltProcessor`]는 BERT 토크나이저와 ViLT 이미지 프로세서를 편리하게 하나의 프로세서로 묶습니다: -```py +```py >>> from transformers import ViltProcessor >>> processor = ViltProcessor.from_pretrained(model_checkpoint) @@ -181,13 +181,13 @@ Dataset({ >>> def preprocess_data(examples): ... image_paths = examples['image_id'] ... images = [Image.open(image_path) for image_path in image_paths] -... texts = examples['question'] +... texts = examples['question'] ... encoding = processor(images, texts, padding="max_length", truncation=True, return_tensors="pt") ... for k, v in encoding.items(): ... encoding[k] = v.squeeze() - + ... targets = [] ... for labels, scores in zip(examples['label.ids'], examples['label.weights']): @@ -195,11 +195,11 @@ Dataset({ ... for label, score in zip(labels, scores): ... target[label] = score - + ... targets.append(target) ... encoding["labels"] = targets - + ... return encoding ``` @@ -264,14 +264,14 @@ Dataset({ ... args=training_args, ... data_collator=data_collator, ... train_dataset=processed_dataset, -... tokenizer=processor, +... processing_class=processor, ... ) ``` 3. 
[`~Trainer.train`]을 호출하여 모델을 미세 조정하세요: ```py ->>> trainer.train() +>>> trainer.train() ``` 훈련이 완료되면, [`~Trainer.push_to_hub`] 메소드를 사용하여 🤗 Hub에 모델을 공유하세요: @@ -349,7 +349,7 @@ Predicted answer: down 모델은 이미지와 텍스트를 입력으로 받으므로, VQA 데이터세트의 첫 번째 예제에서와 동일한 이미지/질문 쌍을 사용해 보겠습니다: -```py +```py >>> example = dataset[0] >>> image = Image.open(example['image_id']) >>> question = example['question'] @@ -358,7 +358,7 @@ Predicted answer: down BLIP-2를 시각적 질의응답 작업에 사용하려면 텍스트 프롬프트가 `Question: {} Answer:` 형식을 따라야 합니다. ```py ->>> prompt = f"Question: {question} Answer:" +>>> prompt = f"Question: {question} Answer:" ``` 이제 모델의 프로세서로 이미지/프롬프트를 전처리하고, 처리된 입력을 모델을 통해 전달하고, 출력을 디코드해야 합니다: @@ -369,7 +369,7 @@ BLIP-2를 시각적 질의응답 작업에 사용하려면 텍스트 프롬프 >>> generated_ids = model.generate(**inputs, max_new_tokens=10) >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() >>> print(generated_text) -"He is looking at the crowd" +"He is looking at the crowd" ``` 보시다시피 모델은 군중을 인식하고, 얼굴의 방향(아래쪽을 보고 있음)을 인식했지만, 군중이 스케이터 뒤에 있다는 사실을 놓쳤습니다. 그러나 사람이 직접 라벨링한 데이터셋을 얻을 수 없는 경우에, 이 접근법은 빠르게 유용한 결과를 생성할 수 있습니다. diff --git a/docs/source/pt/tasks/sequence_classification.md b/docs/source/pt/tasks/sequence_classification.md index e7776894f874cb..a2e6865c92e5b6 100644 --- a/docs/source/pt/tasks/sequence_classification.md +++ b/docs/source/pt/tasks/sequence_classification.md @@ -134,7 +134,7 @@ Nesse ponto, restam apenas três passos: ... args=training_args, ... train_dataset=tokenized_imdb["train"], ... eval_dataset=tokenized_imdb["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... ) @@ -213,4 +213,4 @@ Chame o método [`fit`](https://keras.io/api/models/model_training_apis/#fit-met Para obter um exemplo mais aprofundado de como executar o fine-tuning de um modelo para classificação de texto, dê uma olhada nesse [notebook utilizando PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb) ou nesse [notebook utilizando TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). -
\ No newline at end of file + diff --git a/docs/source/pt/tasks/token_classification.md b/docs/source/pt/tasks/token_classification.md index d4d6bf4dd906ee..45ce0d87429c58 100644 --- a/docs/source/pt/tasks/token_classification.md +++ b/docs/source/pt/tasks/token_classification.md @@ -193,7 +193,7 @@ Nesse ponto, restam apenas três passos: ... args=training_args, ... train_dataset=tokenized_wnut["train"], ... eval_dataset=tokenized_wnut["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... ) @@ -269,4 +269,4 @@ Chame o método [`fit`](https://keras.io/api/models/model_training_apis/#fit-met Para obter um exemplo mais aprofundado de como executar o fine-tuning de um modelo para classificação de tokens, dê uma olhada nesse [notebook utilizando PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb) ou nesse [notebook utilizando TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). - \ No newline at end of file + diff --git a/docs/source/te/quicktour.md b/docs/source/te/quicktour.md index 96ac046cf615ad..67e530f35f3294 100644 --- a/docs/source/te/quicktour.md +++ b/docs/source/te/quicktour.md @@ -142,7 +142,7 @@ label: NEGATIVE, with score: 0.5309 ``` - + ముందుగా శిక్షణ పొందిన మోడల్‌ను లోడ్ చేయడానికి [`AutoModelForSequenceClassification`] మరియు [`AutoTokenizer`]ని ఉపయోగించండి మరియు దాని అనుబంధిత టోకెనైజర్ (తదుపరి విభాగంలో `AutoClass`పై మరిన్ని): ```py @@ -154,7 +154,7 @@ label: NEGATIVE, with score: 0.5309 ముందుగా శిక్షణ పొందిన మోడల్‌ను లోడ్ చేయడానికి [`TFAutoModelForSequenceClassification`] మరియు [`AutoTokenizer`]ని ఉపయోగించండి మరియు దాని అనుబంధిత టోకెనైజర్ (తదుపరి విభాగంలో `TFAutoClass`పై మరిన్ని): - + ```py >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification @@ -329,7 +329,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], మీ మోడల్ చక్కగా ట్యూన్ చేయబడిన తర్వాత, మీరు దానిని [`PreTrainedModel.save_pretrained`]ని ఉపయోగించి దాని టోకెనైజర్‌తో సేవ్ చేయవచ్చు: - + ```py >>> pt_save_directory = "./pt_save_pretrained" >>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT @@ -344,7 +344,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], మీ మోడల్ చక్కగా ట్యూన్ చేయబడిన తర్వాత, మీరు దానిని [`TFPreTrainedModel.save_pretrained`]ని ఉపయోగించి దాని టోకెనైజర్‌తో సేవ్ చేయవచ్చు: - + ```py >>> tf_save_directory = "./tf_save_pretrained" >>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT @@ -395,7 +395,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], [`AutoModel.from_config`]తో మీ అనుకూల కాన్ఫిగరేషన్ నుండి మోడల్‌ను సృష్టించండి: - + ```py >>> from transformers import AutoModel @@ -404,7 +404,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], [`TFAutoModel.from_config`]తో మీ అనుకూల కాన్ఫిగరేషన్ నుండి మోడల్‌ను సృష్టించండి: - + ```py >>> from transformers import TFAutoModel @@ -465,7 +465,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], ``` ఆపై దానిని [`~datasets.Dataset.map`]తో మొత్తం డేటాసెట్‌లో వర్తింపజేయండి: - + ```py >>> dataset = dataset.map(tokenize_dataset, batched=True) ``` @@ -488,7 +488,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], ... args=training_args, ... train_dataset=dataset["train"], ... eval_dataset=dataset["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... 
) # doctest: +SKIP ``` diff --git a/docs/source/zh/hpo_train.md b/docs/source/zh/hpo_train.md index 182940c359bb44..907be0a21fa8ae 100644 --- a/docs/source/zh/hpo_train.md +++ b/docs/source/zh/hpo_train.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. # 使用Trainer API进行超参数搜索 -🤗 Transformers库提供了一个优化过的[`Trainer`]类,用于训练🤗 Transformers模型,相比于手动编写自己的训练循环,这更容易开始训练。[`Trainer`]提供了超参数搜索的API。本文档展示了如何在示例中启用它。 +🤗 Transformers库提供了一个优化过的[`Trainer`]类,用于训练🤗 Transformers模型,相比于手动编写自己的训练循环,这更容易开始训练。[`Trainer`]提供了超参数搜索的API。本文档展示了如何在示例中启用它。 ## 超参数搜索后端 @@ -25,7 +25,7 @@ rendered properly in your Markdown viewer. 在使用它们之前,您应该先安装它们作为超参数搜索后端。 ```bash -pip install optuna/sigopt/wandb/ray[tune] +pip install optuna/sigopt/wandb/ray[tune] ``` ## 如何在示例中启用超参数搜索 @@ -115,7 +115,7 @@ Optuna提供了多目标HPO。您可以在`hyperparameter_search`中传递`direc ... train_dataset=small_train_dataset, ... eval_dataset=small_eval_dataset, ... compute_metrics=compute_metrics, -... tokenizer=tokenizer, +... processing_class=tokenizer, ... model_init=model_init, ... data_collator=data_collator, ... ) @@ -136,4 +136,4 @@ Optuna提供了多目标HPO。您可以在`hyperparameter_search`中传递`direc ``` ## 针对DDP微调的超参数搜索 -目前,Optuna和Sigopt已启用针对DDP的超参数搜索。只有rank-zero进程会进行超参数搜索并将参数传递给其他进程。 \ No newline at end of file +目前,Optuna和Sigopt已启用针对DDP的超参数搜索。只有rank-zero进程会进行超参数搜索并将参数传递给其他进程。 diff --git a/docs/source/zh/quicktour.md b/docs/source/zh/quicktour.md index 9760a697698246..acc59539712820 100644 --- a/docs/source/zh/quicktour.md +++ b/docs/source/zh/quicktour.md @@ -476,7 +476,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], ... args=training_args, ... train_dataset=dataset["train"], ... eval_dataset=dataset["test"], -... tokenizer=tokenizer, +... processing_class=tokenizer, ... data_collator=data_collator, ... ) # doctest: +SKIP ``` diff --git a/docs/source/zh/tasks/asr.md b/docs/source/zh/tasks/asr.md index b4366d720404ac..a4fdbd308e4bad 100644 --- a/docs/source/zh/tasks/asr.md +++ b/docs/source/zh/tasks/asr.md @@ -298,7 +298,7 @@ Wav2Vec2 分词器仅训练了大写字符,因此您需要确保文本与分 ... args=training_args, ... train_dataset=encoded_minds["train"], ... eval_dataset=encoded_minds["test"], -... tokenizer=processor, +... processing_class=processor, ... data_collator=data_collator, ... compute_metrics=compute_metrics, ... ) @@ -389,4 +389,4 @@ Wav2Vec2 分词器仅训练了大写字符,因此您需要确保文本与分 ['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] ``` - \ No newline at end of file + diff --git a/examples/legacy/seq2seq/finetune_trainer.py b/examples/legacy/seq2seq/finetune_trainer.py index e269bc2474eca5..5ede86ee082265 100755 --- a/examples/legacy/seq2seq/finetune_trainer.py +++ b/examples/legacy/seq2seq/finetune_trainer.py @@ -302,7 +302,7 @@ def main(): tokenizer, data_args, model.config.decoder_start_token_id, training_args.tpu_num_cores ), compute_metrics=compute_metrics_fn, - tokenizer=tokenizer, + processing_class=tokenizer, ) all_metrics = {} diff --git a/examples/modular-transformers/modeling_dummy.py b/examples/modular-transformers/modeling_dummy.py index c67787fbd8a69e..b5b1fc6aec85e6 100644 --- a/examples/modular-transformers/modeling_dummy.py +++ b/examples/modular-transformers/modeling_dummy.py @@ -4,7 +4,6 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_xxx.py file directly. 
One of our CI enforces this # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 - import math from typing import List, Optional, Tuple, Union @@ -881,9 +880,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( diff --git a/examples/modular-transformers/modeling_my_new_model2.py b/examples/modular-transformers/modeling_my_new_model2.py index 5484b3890fbdc4..49cdd274162092 100644 --- a/examples/modular-transformers/modeling_my_new_model2.py +++ b/examples/modular-transformers/modeling_my_new_model2.py @@ -758,9 +758,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 61df8ce990e66c..009a1f6372433a 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -394,7 +394,7 @@ def compute_metrics(eval_pred): train_dataset=raw_datasets["train"] if training_args.do_train else None, eval_dataset=raw_datasets["eval"] if training_args.do_eval else None, compute_metrics=compute_metrics, - tokenizer=feature_extractor, + processing_class=feature_extractor, ) # Training diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 01ed9938c4e96d..0a9789426c2c46 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -396,7 +396,7 @@ def val_transforms(example_batch): train_dataset=dataset["train"] if training_args.do_train else None, eval_dataset=dataset["validation"] if training_args.do_eval else None, compute_metrics=compute_metrics, - tokenizer=image_processor, + processing_class=image_processor, data_collator=collate_fn, ) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 4ec195f925df5f..46863cbbf1ce3e 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -364,7 +364,7 @@ def preprocess_images(examples): args=training_args, train_dataset=ds["train"] if training_args.do_train else None, eval_dataset=ds["validation"] if training_args.do_eval else None, - tokenizer=image_processor, + processing_class=image_processor, data_collator=collate_fn, ) diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index b6c0ca899e87c1..3912c693440192 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -443,7 +443,7 @@ def preprocess_images(examples): args=training_args, 
train_dataset=ds["train"] if training_args.do_train else None, eval_dataset=ds["validation"] if training_args.do_eval else None, - tokenizer=image_processor, + processing_class=image_processor, data_collator=collate_fn, ) diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 22152c6fada726..aeb78f95d28878 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -445,7 +445,7 @@ def main(): args=training_args, train_dataset=dataset["train"] if training_args.do_train else None, eval_dataset=dataset["validation"] if training_args.do_eval else None, - tokenizer=image_processor, + processing_class=image_processor, data_collator=collate_fn, compute_metrics=compute_metrics, ) diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 74fae865c944b6..656571eb37e40e 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -586,7 +586,7 @@ def compute_metrics(eval_preds): args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it. data_collator=default_data_collator, compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index eb670ae44ecaf3..154fc1518384e4 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -793,7 +793,7 @@ def compute_metrics(eval_preds): args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it. 
data_collator=default_data_collator, compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 34cefa833f8c11..d021318ae065d9 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -622,7 +622,7 @@ def compute_metrics(eval_preds): args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, preprocess_logits_for_metrics=preprocess_logits_for_metrics diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 237a1ddbd08857..0a207b80479ce2 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -519,7 +519,7 @@ def group_texts(examples): args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, ) diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 6dc5f97641d2f5..ac5db5f6b02727 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -440,7 +440,7 @@ def compute_metrics(eval_predictions): args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 74d87bc5d18cf5..0aea1a11c14ca9 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -488,7 +488,7 @@ def main(): args=training_args, train_dataset=dataset["train"] if training_args.do_train else None, eval_dataset=dataset["validation"] if training_args.do_eval else None, - tokenizer=image_processor, + processing_class=image_processor, data_collator=collate_fn, compute_metrics=eval_compute_metrics_fn, ) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index ac6fcd249b0190..bb0a6455926197 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -640,7 +640,7 @@ def compute_metrics(p: EvalPrediction): train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, eval_examples=eval_examples if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 23c9080f20da0c..b3d9ee1e9c7934 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@
-666,7 +666,7 @@ def compute_metrics(p: EvalPrediction): train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, eval_examples=eval_examples if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index d4e1844f3adf2c..7cf50cf94a03b0 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -663,7 +663,7 @@ def post_processing_function( train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, eval_examples=eval_examples if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, post_process_function=post_processing_function, diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 9d3c29ed8c6449..4c119dcbb4a450 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -403,7 +403,7 @@ def preprocess_batch(example_batch, transforms: A.Compose): train_dataset=dataset["train"] if training_args.do_train else None, eval_dataset=dataset["validation"] if training_args.do_eval else None, compute_metrics=compute_metrics, - tokenizer=image_processor, + processing_class=image_processor, data_collator=default_data_collator, ) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 62166012c7a785..ff5da5ed49ad68 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -751,7 +751,7 @@ def compute_metrics(pred): compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, - tokenizer=processor, + processing_class=processor, preprocess_logits_for_metrics=preprocess_logits_for_metrics, ) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 144ae301669323..66a75ca5d09269 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -747,7 +747,7 @@ def compute_metrics(pred): compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, - tokenizer=processor, + processing_class=processor, ) # 8. 
Finally, we can start training diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index a83502ed800d99..8740ec5f88fa65 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -569,7 +569,7 @@ def compute_metrics(pred): args=training_args, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, - tokenizer=feature_extractor, + processing_class=feature_extractor, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index a4d6f616b70324..9a25d944053ee2 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -677,7 +677,7 @@ def compute_metrics(eval_preds): args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index 9fd11e05b164d8..a440a48110aa41 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -674,7 +674,7 @@ def compute_metrics(p: EvalPrediction): train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, ) diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 8292389bf88ae6..4284fdf12f80a2 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -531,7 +531,7 @@ def compute_metrics(p: EvalPrediction): train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, ) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 1c6c85f4baa0b2..6578e96dc9c585 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -393,7 +393,7 @@ def compute_metrics(p: EvalPrediction): train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, ) diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index b8ef21d6eceb0a..ef1c0ac917b767 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -567,7 +567,7 @@ def compute_metrics(p): args=training_args, train_dataset=train_dataset if 
training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index c23cf8a4310349..4e164010185ea5 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -597,7 +597,7 @@ def compute_metrics(eval_preds): args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) diff --git a/i18n/README_ru.md b/i18n/README_ru.md index 759acdbb912771..ebab1236ca8542 100644 --- a/i18n/README_ru.md +++ b/i18n/README_ru.md @@ -77,7 +77,7 @@ limitations under the License. ## Онлайн демонстрация -Большинство наших моделей можно протестировать непосредственно на их страницах с [сайта](https://huggingface.co/models). Мы также предлагаем [привтаный хостинг моделей, контроль версий и API для выводов](https://huggingface.co/pricing) для публичных и частных моделей. +Большинство наших моделей можно протестировать непосредственно на их страницах с [сайта](https://huggingface.co/models). Мы также предлагаем [приватный хостинг моделей, контроль версий и API для выводов](https://huggingface.co/pricing) для публичных и частных моделей. Вот несколько примеров: diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 078e4d0e4abdee..ab829c6894c0f9 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -115,7 +115,6 @@ "data.metrics": [], "data.processors": [], "debug_utils": [], - "deepspeed": [], "dependency_versions_check": [], "dependency_versions_table": [], "dynamic_module_utils": [], @@ -607,6 +606,7 @@ "MusicgenMelodyDecoderConfig", ], "models.mvp": ["MvpConfig", "MvpTokenizer"], + "models.myt5": ["MyT5Tokenizer"], "models.nemotron": ["NemotronConfig"], "models.nllb": [], "models.nllb_moe": ["NllbMoeConfig"], @@ -654,6 +654,7 @@ "models.persimmon": ["PersimmonConfig"], "models.phi": ["PhiConfig"], "models.phi3": ["Phi3Config"], + "models.phimoe": ["PhimoeConfig"], "models.phobert": ["PhobertTokenizer"], "models.pix2struct": [ "Pix2StructConfig", @@ -846,6 +847,7 @@ "models.xmod": ["XmodConfig"], "models.yolos": ["YolosConfig"], "models.yoso": ["YosoConfig"], + "models.zamba": ["ZambaConfig"], "models.zoedepth": ["ZoeDepthConfig"], "onnx": [], "pipelines": [ @@ -966,6 +968,7 @@ "utils.quantization_config": [ "AqlmConfig", "AwqConfig", + "BitNetConfig", "BitsAndBytesConfig", "CompressedTensorsConfig", "EetqConfig", @@ -1405,6 +1408,7 @@ "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", + "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING", "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", "MODEL_FOR_KEYPOINT_DETECTION_MAPPING", @@ -1446,6 +1450,7 @@ "AutoModelForDocumentQuestionAnswering", "AutoModelForImageClassification", "AutoModelForImageSegmentation", + "AutoModelForImageTextToText", "AutoModelForImageToImage", "AutoModelForInstanceSegmentation", "AutoModelForKeypointDetection", @@ -3022,6 +3027,14 @@ "Phi3PreTrainedModel", ] ) + _import_structure["models.phimoe"].extend( + [ + "PhimoeForCausalLM", + 
"PhimoeForSequenceClassification", + "PhimoeModel", + "PhimoePreTrainedModel", + ] + ) _import_structure["models.pix2struct"].extend( [ "Pix2StructForConditionalGeneration", @@ -3750,6 +3763,14 @@ "YosoPreTrainedModel", ] ) + _import_structure["models.zamba"].extend( + [ + "ZambaForCausalLM", + "ZambaForSequenceClassification", + "ZambaModel", + "ZambaPreTrainedModel", + ] + ) _import_structure["models.zoedepth"].extend( [ "ZoeDepthForDepthEstimation", @@ -5439,6 +5460,7 @@ MusicgenMelodyDecoderConfig, ) from .models.mvp import MvpConfig, MvpTokenizer + from .models.myt5 import MyT5Tokenizer from .models.nemotron import NemotronConfig from .models.nllb_moe import NllbMoeConfig from .models.nougat import NougatProcessor @@ -5495,6 +5517,7 @@ ) from .models.phi import PhiConfig from .models.phi3 import Phi3Config + from .models.phimoe import PhimoeConfig from .models.phobert import PhobertTokenizer from .models.pix2struct import ( Pix2StructConfig, @@ -5719,6 +5742,7 @@ from .models.xmod import XmodConfig from .models.yolos import YolosConfig from .models.yoso import YosoConfig + from .models.zamba import ZambaConfig from .models.zoedepth import ZoeDepthConfig # Pipelines @@ -5846,6 +5870,7 @@ from .utils.quantization_config import ( AqlmConfig, AwqConfig, + BitNetConfig, BitsAndBytesConfig, CompressedTensorsConfig, EetqConfig, @@ -6251,6 +6276,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_KEYPOINT_DETECTION_MAPPING, @@ -6292,6 +6318,7 @@ AutoModelForDocumentQuestionAnswering, AutoModelForImageClassification, AutoModelForImageSegmentation, + AutoModelForImageTextToText, AutoModelForImageToImage, AutoModelForInstanceSegmentation, AutoModelForKeypointDetection, @@ -7545,6 +7572,12 @@ Phi3Model, Phi3PreTrainedModel, ) + from .models.phimoe import ( + PhimoeForCausalLM, + PhimoeForSequenceClassification, + PhimoeModel, + PhimoePreTrainedModel, + ) from .models.pix2struct import ( Pix2StructForConditionalGeneration, Pix2StructPreTrainedModel, @@ -8110,6 +8143,12 @@ YosoModel, YosoPreTrainedModel, ) + from .models.zamba import ( + ZambaForCausalLM, + ZambaForSequenceClassification, + ZambaModel, + ZambaPreTrainedModel, + ) from .models.zoedepth import ( ZoeDepthForDepthEstimation, ZoeDepthPreTrainedModel, diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index 0b82b17dcde0a0..4e4a1ee26c12d7 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -9,14 +9,16 @@ from packaging import version from .configuration_utils import PretrainedConfig -from .utils import is_hqq_available, is_quanto_available, is_torchdynamo_compiling, logging +from .utils import ( + is_hqq_available, + is_optimum_quanto_available, + is_quanto_available, + is_torchdynamo_compiling, + logging, +) +from .utils.deprecation import deprecate_kwarg -if is_quanto_available(): - quanto_version = version.parse(importlib.metadata.version("quanto")) - if quanto_version >= version.parse("0.2.0"): - from quanto import AffineQuantizer, MaxOptimizer, qint2, qint4 - if is_hqq_available(): from hqq.core.quantize import Quantizer as HQQQuantizer @@ -62,16 +64,27 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # TODO: deprecate this function in favor of `cache_position` raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.") + # Deprecate in favor of 
max-cache-shape because we want to be specifc by what we mean with "max_length" + # Prev some cache objects didn't have "max_length" (SlidingWindowCache or SinkCache) because the cache object technically handles + # infinite amount of tokens. In the codebase what we really need to check is the max capacity of certain cache instances, so + # we change naming to be more explicit def get_max_length(self) -> Optional[int]: - """Returns the maximum sequence length of the cached states, if there is any.""" - raise NotImplementedError("Make sure to implement `get_max_length` in a subclass.") + logger.warning_once( + "`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. " + "Calling `get_max_cache()` will raise error from v4.48" + ) + return self.get_max_cache_shape() + + def get_max_cache_shape(self) -> Optional[int]: + """Returns the maximum sequence length (i.e. max capacity) of the cache object""" + raise NotImplementedError("Make sure to implement `get_max_cache_shape` in a subclass.") def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int: """Given the sequence length of the new inputs, returns the usable length of the cache.""" # Cache without size limit -> all cache is usable # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache # length, we will need to evict part of the cache (and thus not all cache is usable) - max_length = self.get_max_length() + max_length = self.get_max_cache_shape() previous_seq_length = self.get_seq_length(layer_idx) if max_length is not None and previous_seq_length + new_seq_length > max_length: return max_length - new_seq_length @@ -360,15 +373,12 @@ class DynamicCache(Cache): ``` """ + @deprecate_kwarg("num_hidden_layers", version="4.47.0") def __init__(self, num_hidden_layers: Optional[int] = None) -> None: super().__init__() - if num_hidden_layers is None: - self.key_cache: List[torch.Tensor] = [] - self.value_cache: List[torch.Tensor] = [] - else: - self.key_cache: List[torch.Tensor] = [[] for _ in range(num_hidden_layers)] - self.value_cache: List[torch.Tensor] = [[] for _ in range(num_hidden_layers)] self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen + self.key_cache: List[torch.Tensor] = [] + self.value_cache: List[torch.Tensor] = [] def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]: """ @@ -424,11 +434,13 @@ def update( # Update the cache if len(self.key_cache) <= layer_idx: + # There may be skipped layers, fill them with empty lists + for _ in range(len(self.key_cache), layer_idx): + self.key_cache.append([]) + self.value_cache.append([]) self.key_cache.append(key_states) self.value_cache.append(value_states) - # content on layer cache can be a tensor and checking not tensor causes errors - # so we explicitly check for the empty list - elif self.key_cache[layer_idx] == []: + elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors self.key_cache[layer_idx] = key_states self.value_cache[layer_idx] = value_states else: @@ -440,12 +452,16 @@ def update( def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: """Returns the sequence length of the cached states. 
A layer index can be optionally passed.""" # TODO: deprecate this function in favor of `cache_position` - if len(self.key_cache) <= layer_idx or (len(self.key_cache) > layer_idx and self.key_cache[layer_idx] == []): - return 0 - return self.key_cache[layer_idx].shape[-2] + is_empty_layer = ( + len(self.key_cache) == 0 # no cache in any layer + or len(self.key_cache) <= layer_idx # skipped `layer_idx` and hasn't run a layer with cache after it + or len(self.key_cache[layer_idx]) == 0 # the layer has no cache + ) + layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0 + return layer_seq_length - def get_max_length(self) -> Optional[int]: - """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length.""" + def get_max_cache_shape(self) -> Optional[int]: + """Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length.""" return None def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: @@ -457,12 +473,13 @@ def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: return legacy_cache @classmethod + @deprecate_kwarg("num_hidden_layers", version="4.47.0") def from_legacy_cache( cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, num_hidden_layers: int = None ) -> "DynamicCache": """Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for backward compatibility.""" - cache = cls(num_hidden_layers) + cache = cls() if past_key_values is not None: for layer_idx in range(len(past_key_values)): key_states, value_states = past_key_values[layer_idx] @@ -485,12 +502,15 @@ def crop(self, max_length: int): self.key_cache[idx] = self.key_cache[idx][..., :max_length, :] self.value_cache[idx] = self.value_cache[idx][..., :max_length, :] - def batch_split(self, full_batch_size: int, split_size: int, num_hidden_layers: int) -> List["DynamicCache"]: + @deprecate_kwarg("num_hidden_layers", version="4.47.0") + def batch_split( + self, full_batch_size: int, split_size: int, num_hidden_layers: int = None + ) -> List["DynamicCache"]: """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by `_split_model_inputs()` in `generation.utils`""" out = [] for i in range(0, full_batch_size, split_size): - current_split = DynamicCache(num_hidden_layers) + current_split = DynamicCache() current_split._seen_tokens = self._seen_tokens current_split.key_cache = [tensor[i : i + split_size] for tensor in self.key_cache] current_split.value_cache = [tensor[i : i + split_size] for tensor in self.value_cache] @@ -498,10 +518,11 @@ def batch_split(self, full_batch_size: int, split_size: int, num_hidden_layers: return out @classmethod - def from_batch_splits(cls, splits: List["DynamicCache"], num_hidden_layers: int) -> "DynamicCache": + @deprecate_kwarg("num_hidden_layers", version="4.47.0") + def from_batch_splits(cls, splits: List["DynamicCache"], num_hidden_layers: int = None) -> "DynamicCache": """This is the opposite of the above `batch_split()` method. 
This will be used by `stack_model_outputs` in `generation.utils`""" - cache = cls(num_hidden_layers) + cache = cls() for idx in range(len(splits[0])): key_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []] value_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []] @@ -617,7 +638,9 @@ def update( self._seen_tokens += key_states.shape[-2] # Update the cache - if len(self.key_cache) <= layer_idx: + if len(self.key_cache) < layer_idx: + raise ValueError("OffloadedCache does not support model usage where layers are skipped. Use DynamicCache.") + elif len(self.key_cache) == layer_idx: self.key_cache.append(key_states) self.value_cache.append(value_states) self.original_device.append(key_states.device) @@ -676,7 +699,9 @@ def update( if layer_idx == 0: self._seen_tokens += key_states.shape[-2] - if len(self.key_cache) <= layer_idx: + if len(self.key_cache) < layer_idx: + raise ValueError("QuantizedCache does not support model usage where layers are skipped. Use DynamicCache.") + elif len(self.key_cache) == layer_idx: self._quantized_key_cache.append(self._quantize(key_states.contiguous(), axis=self.axis_key)) self._quantized_value_cache.append(self._quantize(value_states.contiguous(), axis=self.axis_value)) self.key_cache.append(torch.zeros(0, dtype=key_states.dtype, device=key_states.device)) @@ -754,12 +779,20 @@ class QuantoQuantizedCache(QuantizedCache): def __init__(self, cache_config: CacheConfig) -> None: super().__init__(cache_config) - quanto_version = version.parse(importlib.metadata.version("quanto")) - if quanto_version < version.parse("0.2.0"): - raise ImportError( - f"You need quanto package version to be greater or equal than 0.2.0 to use `QuantoQuantizedCache`. Detected version {quanto_version}. " - f"Please upgrade quanto with `pip install -U quanto`" + + if is_optimum_quanto_available(): + from optimum.quanto import MaxOptimizer, qint2, qint4 + elif is_quanto_available(): + logger.warning_once( + "Importing from quanto will be deprecated in v4.47. Please install optimum-quanto instead `pip install optimum-quanto`" ) + quanto_version = version.parse(importlib.metadata.version("quanto")) + if quanto_version < version.parse("0.2.0"): + raise ImportError( + f"You need quanto package version to be greater or equal than 0.2.0 to use `QuantoQuantizedCache`. Detected version {quanto_version}. " + f"Since quanto will be deprecated, please install optimum-quanto instead with `pip install -U optimum-quanto`" + ) + from quanto import MaxOptimizer, qint2, qint4 if self.nbits not in [2, 4]: raise ValueError(f"`nbits` for `quanto` backend has to be one of [`2`, `4`] but got {self.nbits}") @@ -776,8 +809,21 @@ def __init__(self, cache_config: CacheConfig) -> None: self.optimizer = MaxOptimizer() # hardcode as it's the only one for per-channel quantization def _quantize(self, tensor, axis): - scale, zeropoint = self.optimizer(tensor, self.qtype.bits, axis, self.q_group_size) - qtensor = AffineQuantizer.apply(tensor, self.qtype, axis, self.q_group_size, scale, zeropoint) + # We have two different API since in optimum-quanto, we don't use AffineQuantizer anymore + if is_optimum_quanto_available(): + from optimum.quanto import quantize_weight + + qtensor = quantize_weight(tensor, self.qtype, axis, self.q_group_size) + return qtensor + elif is_quanto_available(): + logger.warning_once( + "Importing from quanto will be deprecated in v4.47. 
Please install optimum-quanto instead `pip install optimum-quanto`" + ) + from quanto import AffineQuantizer + + scale, zeropoint = self.optimizer(tensor, self.qtype.bits, axis, self.q_group_size) + qtensor = AffineQuantizer.apply(tensor, self.qtype, axis, self.q_group_size, scale, zeropoint) + return qtensor def _dequantize(self, qtensor): @@ -879,6 +925,8 @@ class SinkCache(Cache): ``` """ + is_sliding = True + def __init__(self, window_length: int, num_sink_tokens: int) -> None: super().__init__() self.key_cache: List[torch.Tensor] = [] @@ -932,8 +980,8 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: return 0 return self.key_cache[layer_idx].shape[-2] - def get_max_length(self) -> Optional[int]: - """Returns the maximum sequence length of the cached states.""" + def get_max_cache_shape(self) -> Optional[int]: + """Returns the maximum sequence length of the cache object, in case of SinkCache it is the window length.""" return self.window_length def update( @@ -1185,8 +1233,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # TODO: deprecate this function in favor of `cache_position` return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum() - def get_max_length(self) -> Optional[int]: - """Returns the maximum sequence length of the cached states.""" + def get_max_cache_shape(self) -> Optional[int]: return self.max_cache_len def reset(self): @@ -1250,6 +1297,8 @@ class SlidingWindowCache(StaticCache): ``` """ + is_sliding = True + # TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well. def __init__( self, @@ -1261,7 +1310,6 @@ def __init__( max_batch_size: Optional[int] = None, layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, ) -> None: - super().__init__() if not hasattr(config, "sliding_window") or config.sliding_window is None: raise ValueError( "Setting `cache_implementation` to 'sliding_window' requires the model config supporting " @@ -1326,9 +1374,8 @@ def update( return k_out, v_out - def get_max_length(self) -> Optional[int]: - # in theory there is no limit because the sliding window size is fixed no matter how long the sentence is - return None + def get_max_cache_shape(self) -> Optional[int]: + return self.max_cache_len def reset(self): for layer_idx in range(len(self.key_cache)): @@ -1408,12 +1455,12 @@ def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: @classmethod def from_legacy_cache( - cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, num_hidden_layers: int = None + cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ) -> "EncoderDecoderCache": """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" cache = cls( - self_attention_cache=DynamicCache(num_hidden_layers), - cross_attention_cache=DynamicCache(num_hidden_layers), + self_attention_cache=DynamicCache(), + cross_attention_cache=DynamicCache(), ) if past_key_values is not None: for layer_idx in range(len(past_key_values)): @@ -1471,14 +1518,12 @@ def crop(self, maximum_length: int): self.check_dynamic_cache(self.crop.__name__) self.self_attention_cache.crop(maximum_length) - def batch_split( - self, full_batch_size: int, split_size: int, num_hidden_layers: int - ) -> "List[EncoderDecoderCache]": + def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]": """Split the current instance into a list of `DynamicCache` by the batch size. 
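# Editor's illustration (not part of the diff): requesting a quantized KV cache with the
# quanto backend now expects `optimum-quanto` to be installed (`pip install optimum-quanto`).
# `model` and `tokenizer` are assumed to be already loaded; the kwargs follow the documented
# `cache_implementation="quantized"` API.
inputs = tokenizer("The best thing about quantized KV caches is", return_tensors="pt").to(model.device)
out = model.generate(
    **inputs,
    max_new_tokens=20,
    cache_implementation="quantized",
    cache_config={"backend": "quanto", "nbits": 4},
)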
This will be used by `_split_model_inputs()` in `generation.utils`""" self.check_dynamic_cache(self.batch_split.__name__) - self_attention_cache = self.self_attention_cache.batch_split(full_batch_size, split_size, num_hidden_layers) - cross_attention_cache = self.cross_attention_cache.batch_split(full_batch_size, split_size, num_hidden_layers) + self_attention_cache = self.self_attention_cache.batch_split(full_batch_size, split_size) + cross_attention_cache = self.cross_attention_cache.batch_split(full_batch_size, split_size) out = [] for self_attn, cross_attn in zip(self_attention_cache, cross_attention_cache): @@ -1486,11 +1531,11 @@ def batch_split( return out @classmethod - def from_batch_splits(cls, splits: List["EncoderDecoderCache"], num_hidden_layers: int) -> "EncoderDecoderCache": + def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache": """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in `generation.utils`""" - self_attention_cache = DynamicCache(num_hidden_layers) - cross_attention_cache = DynamicCache(num_hidden_layers) + self_attention_cache = DynamicCache() + cross_attention_cache = DynamicCache() for idx in range(len(splits[0])): layer_keys = torch.cat([current.self_attention_cache.key_cache[idx] for current in splits], dim=0) layer_values = torch.cat([current.self_attention_cache.value_cache[idx] for current in splits], dim=0) @@ -1679,9 +1724,7 @@ def update( k_out.shape[2], ) - def get_max_length(self) -> Optional[int]: - # in theory there is no limit because the sliding window size is fixed - # no matter how long the sentence is + def get_max_cache_shape(self) -> Optional[int]: return self.max_cache_len def get_seq_length(self, layer_idx: Optional[int] = 0): @@ -2012,7 +2055,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # TODO(gante): Remove this. return self._seen_tokens - def get_max_length(self) -> Optional[int]: + def get_max_cache_shape(self) -> Optional[int]: """Returns the maximum sequence length of the cached states.""" return self.max_cache_len diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 92e5425e95e706..3ea48f6ab7f707 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -380,11 +380,14 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: non_default_generation_parameters = self._get_non_default_generation_parameters() if len(non_default_generation_parameters) > 0: - raise ValueError( + # TODO (joao): this should be an exception if the user has modified the loaded config. See #33886 + warnings.warn( "Some non-default generation parameters are set in the model config. These should go into either a) " "`model.generation_config` (as opposed to `model.config`); OR b) a GenerationConfig file " - "(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) " - f"\nNon-default generation parameters: {str(non_default_generation_parameters)}" + "(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model)." + "This warning will become an exception in the future." 
+ f"\nNon-default generation parameters: {str(non_default_generation_parameters)}", + UserWarning, ) os.makedirs(save_directory, exist_ok=True) @@ -1033,7 +1036,7 @@ def _get_non_default_generation_parameters(self) -> Dict[str, Any]: if decoder_config is not self: default_config = decoder_config.__class__() else: - decoder_config = None + default_config = None # If it is a composite model, we want to check the subconfig that will be used for generation self_decoder_config = self if decoder_attribute_name is None else getattr(self, decoder_attribute_name) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index a4fd90f2bfe473..92371415918150 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -345,7 +345,6 @@ def converted(self, vocab: Dict[str, int] = None, merges: List[Tuple[str, str]] ) ) - add_prefix_space = False add_prefix_space = getattr(self.original_tokenizer, "add_prefix_space", False) tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.ByteLevel() diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py deleted file mode 100644 index 6fd22d8c5cbaf3..00000000000000 --- a/src/transformers/deepspeed.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Integration with Deepspeed - kept for backward compatiblity, if you plan to make any edit, make sure to modify the file -in `integrations/deepspeed` instead. - -Check: https://github.com/huggingface/transformers/pull/25599 -""" - -import warnings - - -warnings.warn( - "transformers.deepspeed module is deprecated and will be removed in a future version. 
Please import deepspeed modules directly from transformers.integrations", - FutureWarning, -) - -# Backward compatibility imports, to make sure all those objects can be found in integrations/deepspeed -from .integrations.deepspeed import ( # noqa - HfDeepSpeedConfig, - HfTrainerDeepSpeedConfig, - deepspeed_config, - deepspeed_init, - deepspeed_load_checkpoint, - deepspeed_optim_sched, - is_deepspeed_available, - is_deepspeed_zero3_enabled, - set_hf_deepspeed_config, - unset_hf_deepspeed_config, -) diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index b2e9d251a6bd01..f3cde8180c1bd4 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -237,10 +237,10 @@ def to(self, *args, **kwargs) -> "BatchFeature": # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor` for k, v in self.items(): # check if v is a floating point - if torch.is_floating_point(v): + if isinstance(v, torch.Tensor) and torch.is_floating_point(v): # cast and send to device new_data[k] = v.to(*args, **kwargs) - elif device is not None: + elif isinstance(v, torch.Tensor) and device is not None: new_data[k] = v.to(device=device) else: new_data[k] = v diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 54b3f709a9d899..a4c8f79ae925d1 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -16,6 +16,7 @@ import copy from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple +import numpy as np import torch from ..cache_utils import DynamicCache @@ -25,6 +26,7 @@ if TYPE_CHECKING: from ..modeling_utils import PreTrainedModel + from ..tokenization_utils_base import PreTrainedTokenizerBase from .configuration_utils import GenerationConfig @@ -156,6 +158,7 @@ def __init__( # Prepare generation-related options. self.logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() self.generation_config = copy.deepcopy(generation_config) + self.generation_config.return_dict_in_generate = True self.generation_config.output_scores = True self.generation_config.assistant_confidence_threshold = self.assistant_confidence_threshold @@ -258,6 +261,303 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F self.num_assistant_tokens = max(1.0, self.num_assistant_tokens - 1.0) +class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator): + """ + `CandidateGenerator` class to be used for Universal Assisted Generation (UAD): assisted generation with different tokenizers + for the assistant and main models. This class generates candidates through the use of a smaller + model. + + The main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are + in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above. + The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer. + Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings, + to ensure the new tokens include the correct prompt suffix. 
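# Editor's illustration (not part of the diff): the re-encoding step described above is a
# decode/re-encode round trip between the two tokenizers (see
# `convert_source_tokens_to_target_tokens` further below); `target_tokenizer`,
# `assistant_tokenizer` and `input_ids` are assumed to already exist.
text = target_tokenizer.batch_decode(input_ids, skip_special_tokens=True)
assistant_ids = assistant_tokenizer(text, add_special_tokens=True, return_tensors="pt")["input_ids"]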
+ + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + assistant_model (`PreTrainedModel`): + The model to be used for generating candidates. This model should be smaller than the main model. + target_tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for the target model. + assistant_tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for the assistant model. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + model_kwargs (`Dict`): + The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant + model as well. + inputs_tensor (`torch.Tensor`, *optional*): + The model input tensor. In encoder-decoder models, this is the encoder input. + """ + + def __init__( + self, + input_ids: torch.LongTensor, + assistant_model: "PreTrainedModel", + target_tokenizer: "PreTrainedTokenizerBase", + assistant_tokenizer: "PreTrainedTokenizerBase", + generation_config: "GenerationConfig", + model_kwargs: Dict, + inputs_tensor: Optional[torch.Tensor] = None, + logits_processor: "LogitsProcessorList" = None, + ): + super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) + + self.target_tokenizer = target_tokenizer + self.assistant_tokenizer = assistant_tokenizer + self.prev_tokens = None + self.prev_assistant_ids = None + self.target_lookbehind = 10 + self.assistant_lookbehind = 10 + + @staticmethod + def _get_longest_diag_dict(input_matrix, nonzero_idx): + """ + Calculates the length of the longest diagonal sequence in a given matrix. + Args: + input_matrix (torch.Tensor): The input matrix. + nonzero_idx (torch.Tensor): The indices of the non-zero elements in the matrix. + Returns: + dict: A dictionary where the keys are the indices of the non-zero elements and the values are the lengths of the longest diagonal sequences starting from those indices. + """ + + visited = set() + diags = {} + for idx in nonzero_idx: + start_idx = torch.clone(idx) + tuple_start_idx = tuple(start_idx.tolist()) + + if tuple_start_idx in visited: + continue + + visited.add(tuple_start_idx) + cur_diag_len = 1 + start_idx += 1 + while start_idx[0] < input_matrix.shape[0] and start_idx[1] < input_matrix.shape[1]: + tuple_start_idx = tuple(start_idx.tolist()) + visited.add(tuple_start_idx) + + if input_matrix[start_idx[0], start_idx[1]] == 1: + cur_diag_len += 1 + start_idx += 1 + else: + break + + diags[idx] = cur_diag_len + return diags + + @staticmethod + def _get_longest_diag_index(input_matrix): + """ + Returns the start index and length of the longest diagonal in the given input. + Args: + input_matrix (numpy.ndarray): The input matrix. + Returns: + tuple: A tuple containing the start index and length of the longest diagonal. 
+ """ + + diags = AssistedCandidateGeneratorDifferentTokenizers._get_longest_diag_dict( + input_matrix, input_matrix.nonzero() + ) + diags_values = list(diags.values()) + diags_keys = list(diags.keys()) + best_diag = np.argmax(diags_values) + diag_start_index = diags_keys[best_diag] + diag_start_length = diags_values[best_diag] + return diag_start_index, diag_start_length + + @staticmethod + def _get_tokens_diag(prompt, prompt_plus_new_tokens): + """ + Input: + prompt: 2D array of shape (batch_size, prompt_length), represents the original prompt tokens + prompt_plus_new_tokens: 2D array of shape (batch_size, prompt_length), represents the suffix of the original prompt, with additional new tokens. + Output: + discrepancy_length: int, represents the number of tokens that need to be replaced from prompt + new_tokens_only: 2D array of shape (batch_size, new_token_length), represents the new tokens that are not in prompt + discrepancy_only: 2D array of shape (batch_size, discrepancy_length), represents the new tokens that are in prompt but not in prompt_plus_new_tokens + """ + compare_mat = prompt_plus_new_tokens.T == prompt + if not torch.is_tensor(compare_mat): + compare_mat = torch.tensor(compare_mat) + + compare_mat_int = compare_mat.to(int) + + if not compare_mat_int.any().item(): + # empty intersection between prompt and prompt_plus_new_tokens + return None, None, None + + longest_location, longest_diag_length = AssistedCandidateGeneratorDifferentTokenizers._get_longest_diag_index( + compare_mat_int + ) + new_token_start_index = longest_location[0] + longest_diag_length + discrepancy_with_old = longest_location[1] + longest_diag_length + discrepancy_length = (prompt.shape[1] - discrepancy_with_old).item() + new_tokens_only = prompt_plus_new_tokens[:, new_token_start_index + discrepancy_length :] + discrepancy_only = prompt_plus_new_tokens[ + :, new_token_start_index : new_token_start_index + discrepancy_length + ] + return discrepancy_length, new_tokens_only, discrepancy_only + + def convert_source_tokens_to_target_tokens( + self, + input_ids, + source_tokenizer, + destination_tokenizer, + ): + """ + Convert token IDs from one tokenizer to another. + Args: + input_ids: The input token IDs. + source_tokenizer: The source tokenizer. + destination_tokenizer: The destination tokenizer. + Returns: + The converted token IDs. + """ + text = source_tokenizer.batch_decode(input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + dest_ids = destination_tokenizer(text, add_special_tokens=True, return_tensors="pt")["input_ids"] + return dest_ids.to(input_ids.device) + + def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: + """ + Fetches the candidates to be tried for the current input. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + + Return: + `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be + assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, + vocabulary_size)` containing the logits associated to each candidate. 
+ """ + max_new_tokens = int(self.num_assistant_tokens) + if max_new_tokens == 0: + return input_ids, None + + input_ids = input_ids.to(self.assistant_model.device) + convert_kwargs = { + "source_tokenizer": self.target_tokenizer, + "destination_tokenizer": self.assistant_tokenizer, + } + remove_from_pkv = 0 + + # Since re-encoding the tokens may result in tokenization discrepancies, we use 2 look behind values + # (one for each conversion) which mark where to start looking for the overlap between the + # source and target encodings, to ensure the new tokens include the correct prompt suffix. + if self.prev_tokens is not None and self.prev_target_ids.shape[1] > self.target_lookbehind: + # input_ids contains all target prompt input ids and some new target input ids + start_index_in_target_window = self.prev_target_ids.shape[1] - self.target_lookbehind + + new_assistant_ids = self.convert_source_tokens_to_target_tokens( + input_ids[:, start_index_in_target_window:], **convert_kwargs + ) + prompt_use_length = new_assistant_ids.shape[1] + prompt_use = self.prev_assistant_ids[:, -prompt_use_length:] + + discrepancy_length, new_tokens_only, discrepancy_only = ( + AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt_use, new_assistant_ids) + ) + assistant_input_ids = self.prev_assistant_ids + + if new_tokens_only is not None: + if discrepancy_length > 0 and discrepancy_only.shape[1] > 0: + if discrepancy_length == discrepancy_only.shape[1]: + assistant_input_ids[:, -discrepancy_length:] = discrepancy_only + + elif discrepancy_length > discrepancy_only.shape[1]: + discrepancy_length_diff = discrepancy_length - discrepancy_only.shape[1] + assistant_input_ids = assistant_input_ids[:, :-discrepancy_length_diff] + assistant_input_ids[:, -discrepancy_only.shape[1] :] = discrepancy_only + + remove_from_pkv = discrepancy_length + + if new_tokens_only.shape[1] > 0: + assistant_input_ids = torch.cat([assistant_input_ids, new_tokens_only], dim=-1) + else: + # edge case: in case of no intersection between prompt and new_assistant_ids + assistant_input_ids = torch.cat([assistant_input_ids, new_assistant_ids], dim=-1) + + else: + assistant_input_ids = self.convert_source_tokens_to_target_tokens(input_ids, **convert_kwargs) + self.prev_target_ids = input_ids + + self.prev_assistant_ids = assistant_input_ids + new_cur_len = assistant_input_ids.shape[-1] + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + + # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length + # (which implicitly contains the number of accepted candidates from the previous round) + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + if has_past_key_values: + new_cache_size = new_cur_len - 1 - remove_from_pkv + self.assistant_kwargs["past_key_values"] = _crop_past_key_values( + self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 + ) # the assistant does not have the token after the last match, hence the -1 + + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder + ) + self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) + + # 2. Forecast next N tokens using the assistant model. 
+ assistant_generation_kwargs = { + self.input_ids_key: assistant_input_ids, + "min_new_tokens": min_new_tokens, + "max_new_tokens": max_new_tokens, + "generation_config": self.generation_config, + "logits_processor": self.logits_processor, + } + + self.assistant_kwargs.pop("attention_mask", None) + + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + + num_prev_assistant = self.prev_assistant_ids.shape[1] + start_assistant_look_index = num_prev_assistant - self.assistant_lookbehind + + new_target_ids_from_window = self.convert_source_tokens_to_target_tokens( + assistant_output.sequences[:, start_assistant_look_index:], + source_tokenizer=self.assistant_tokenizer, + destination_tokenizer=self.target_tokenizer, + ) + target_prompt_use_length = new_target_ids_from_window.shape[1] + + target_prompt_use = input_ids[:, -target_prompt_use_length:] + + _, target_new_tokens_only, _ = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + target_prompt_use, new_target_ids_from_window + ) + + new_target_ids = input_ids + + if target_new_tokens_only is not None: + if target_new_tokens_only.shape[1] > 0: + new_target_ids = torch.cat([new_target_ids, target_new_tokens_only], dim=-1) + else: + # edge case: in case of no intersection between prompt and new_target_ids + new_target_ids = torch.cat([new_target_ids, new_target_ids_from_window], dim=-1) + + self.prev_target_ids = input_ids + + if hasattr(self.generation_config, "max_length"): + new_target_ids = new_target_ids[:, : self.generation_config.max_length] + + # 3. Update variables for the next round of candidate generation + self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values + self.prev_tokens = assistant_output.sequences + + # 4. Prepare variables for output + if input_ids.shape[1] >= new_target_ids.shape[1]: + return input_ids, None + + return new_target_ids, None + + class PromptLookupCandidateGenerator(CandidateGenerator): """ `CandidateGenerator` class to be used for prompt lookup generation. 
This class generates candidates by looking up @@ -425,6 +725,16 @@ def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_en model_kwargs[mask_key] = mask[:, :mask_length_diff] elif mask_length_diff > 0: model_kwargs[mask_key] = torch.cat([mask, mask.new_ones((mask.shape[0], mask_length_diff))], dim=-1) + + if "cross_attention_mask" in model_kwargs: + # Mllama case is special and has another mask for cross attention model + cross_mask = model_kwargs["cross_attention_mask"] + if mask_length_diff < 0: + model_kwargs["cross_attention_mask"] = cross_mask[:, :mask_length_diff] + elif mask_length_diff > 0: + new_mask = cross_mask[:, -1:, :, :].repeat(1, mask_length_diff, 1, 1) + model_kwargs["cross_attention_mask"] = torch.cat([cross_mask, new_mask], dim=1) + return model_kwargs diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 661c8b579af91f..5da4878513eb22 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -35,6 +35,7 @@ ) from ..configuration_utils import PretrainedConfig from ..integrations.deepspeed import is_deepspeed_zero3_enabled +from ..integrations.fsdp import is_fsdp_managed_module from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput from ..pytorch_utils import isin_mps_friendly from ..tokenization_utils import ExtensionsTrie @@ -42,6 +43,7 @@ ModelOutput, is_accelerate_available, is_hqq_available, + is_optimum_quanto_available, is_quanto_available, is_torchdynamo_compiling, logging, @@ -50,6 +52,7 @@ from .beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer from .candidate_generator import ( AssistedCandidateGenerator, + AssistedCandidateGeneratorDifferentTokenizers, CandidateGenerator, PromptLookupCandidateGenerator, _crop_past_key_values, @@ -350,47 +353,69 @@ def prepare_inputs_for_generation( attention_mask: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, cache_position: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - use_cache: bool = True, - num_logits_to_keep: Optional[int] = None, **kwargs, ): """ Prepare the model inputs for generation. In includes operations like computing the 4D attention mask or slicing inputs given the existing cache. - See the documentation in the used model for the arguments (different models might have different requirements - for e.g. `past_key_values`). Should work as is for most LLMs. + See the forward pass in the model documentation for expected arguments (different models might have different + requirements for e.g. `past_key_values`). This function should work as is for most LLMs. """ + + # 1. Handle BC: + model_inputs = {} + # - some models don't have `Cache` support (which implies they don't expect `cache_position` in `forward`) + if self._supports_cache_class: + model_inputs["cache_position"] = cache_position + # - `cache_position` was not a mandatory input in `prepare_inputs_for_generation` for those models, and this + # function may be called outside of `generate`. Handle most use cases by creating `cache_position` on the fly + # (this alternative is not as robust as calling `generate` and letting it create `cache_position`) + elif cache_position is None: + past_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + cache_position = torch.arange(past_length, input_ids.shape[1], dtype=torch.long, device=input_ids.device) + + # 2. 
Generic cache-dependent input preparation # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here if past_key_values is not None: + model_inputs["past_key_values"] = past_key_values if inputs_embeds is not None: # Exception 1 input_ids = input_ids[:, -cache_position.shape[0] :] elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) input_ids = input_ids[:, cache_position] - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s - # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the - # decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, - # `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - + # 3. Prepare base model inputs # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + model_inputs["input_ids"] = None + model_inputs["inputs_embeds"] = inputs_embeds else: - # The clone here is for the same reason as for `position_ids`. - model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} + # `clone` calls in this function ensure a consistent stride. See #32227 + model_inputs["input_ids"] = input_ids.clone(memory_format=torch.contiguous_format) + model_inputs["inputs_embeds"] = None + # 4. Create missing `position_ids` on the fly + if ( + attention_mask is not None + and kwargs.get("position_ids") is None + and "position_ids" in set(inspect.signature(self.forward).parameters.keys()) + ): + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + kwargs["position_ids"] = position_ids # placed in kwargs for further processing (see below) + + # 5. Slice model inputs if it's an input that should have the same length as `input_ids` + for model_input_name in ["position_ids", "token_type_ids"]: + model_input = kwargs.get(model_input_name) + if model_input is not None: + if past_key_values: + model_input = model_input[:, -input_ids.shape[1] :] + model_input = model_input.clone(memory_format=torch.contiguous_format) + model_inputs[model_input_name] = model_input + + # 6. 
Create 4D attention mask is we are using a `StaticCache` (important for performant compiled forward pass) if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: if model_inputs["inputs_embeds"] is not None: batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape @@ -416,25 +441,20 @@ def prepare_inputs_for_generation( attention_mask = causal_mask_creation_function( attention_mask, sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), + target_length=past_key_values.get_max_cache_shape(), dtype=self.get_output_embeddings().weight.dtype, device=device, cache_position=cache_position, batch_size=batch_size, ) + if attention_mask is not None: + model_inputs["attention_mask"] = attention_mask - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep + # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + model_inputs[key] = value - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) return model_inputs def _prepare_model_inputs( @@ -599,7 +619,7 @@ def _prepare_encoder_decoder_kwargs_for_generation( model_input_name = model_input_name if model_input_name is not None else self.main_input_name encoder_kwargs["return_dict"] = True encoder_kwargs[model_input_name] = inputs_tensor - model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs) + model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs) # type: ignore return model_kwargs @@ -769,11 +789,15 @@ def _get_candidate_generator( inputs_tensor: torch.Tensor, assistant_model: "PreTrainedModel", logits_processor: LogitsProcessorList, + target_tokenizer: "PreTrainedTokenizerBase", + assistant_tokenizer: "PreTrainedTokenizerBase", model_kwargs: Dict, ) -> CandidateGenerator: """ Returns the candidate generator to be used in `assisted_generation` """ + different_tokenizers = all(v is not None for v in (assistant_model, target_tokenizer, assistant_tokenizer)) + if generation_config.prompt_lookup_num_tokens is not None: candidate_generator = PromptLookupCandidateGenerator( eos_token_id=generation_config._eos_token_tensor, @@ -781,6 +805,17 @@ def _get_candidate_generator( max_matching_ngram_size=generation_config.max_matching_ngram_size, max_length=generation_config.max_length, ) + elif different_tokenizers: + candidate_generator = AssistedCandidateGeneratorDifferentTokenizers( + input_ids=input_ids, + assistant_model=assistant_model, + generation_config=generation_config, + model_kwargs=model_kwargs, + inputs_tensor=inputs_tensor, + logits_processor=logits_processor, + target_tokenizer=target_tokenizer, + assistant_tokenizer=assistant_tokenizer, + ) else: candidate_generator = AssistedCandidateGenerator( input_ids=input_ids, @@ -836,12 +871,19 @@ def _get_logits_processor( generation_config.encoder_repetition_penalty is not None and generation_config.encoder_repetition_penalty != 1.0 ): - processors.append( - EncoderRepetitionPenaltyLogitsProcessor( - penalty=generation_config.encoder_repetition_penalty, - encoder_input_ids=encoder_input_ids, + if len(encoder_input_ids.shape) == 2: + processors.append( + EncoderRepetitionPenaltyLogitsProcessor( + penalty=generation_config.encoder_repetition_penalty, + encoder_input_ids=encoder_input_ids, + ) + ) + else: + warnings.warn( + "Passing 
`encoder_repetition_penalty` requires some form of `input_ids` to be passed to " + "`generate`, ignoring the argument.", + UserWarning, ) - ) if generation_config.repetition_penalty is not None and generation_config.repetition_penalty != 1.0: processors.append(RepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty)) if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0: @@ -850,12 +892,19 @@ ( generation_config.encoder_no_repeat_ngram_size is not None and generation_config.encoder_no_repeat_ngram_size > 0 ): - processors.append( - EncoderNoRepeatNGramLogitsProcessor( - generation_config.encoder_no_repeat_ngram_size, - encoder_input_ids, + if len(encoder_input_ids.shape) == 2: + processors.append( + EncoderNoRepeatNGramLogitsProcessor( + generation_config.encoder_no_repeat_ngram_size, + encoder_input_ids, + ) + ) + else: + warnings.warn( + "Passing `encoder_no_repeat_ngram_size` requires some form of `input_ids` to be passed to " + "`generate`, ignoring the argument.", + UserWarning, ) - ) if generation_config.bad_words_ids is not None: processors.append( NoBadWordsLogitsProcessor( @@ -1218,7 +1267,7 @@ def _validate_model_class(self): f"names: {terminations_with_generation_support}." ) - def _validate_assistant(self, assistant_model): + def _validate_assistant(self, assistant_model, tokenizer, assistant_tokenizer): if assistant_model is None: return @@ -1234,8 +1283,19 @@ def _validate_assistant(self, assistant_model): "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper." ) - if not self.config.get_text_config().vocab_size == assistant_model.config.get_text_config().vocab_size: - raise ValueError("Make sure the main and assistant model use the same tokenizer") + doc_reference = ( + "(see https://huggingface.co/docs/transformers/en/generation_strategies#universal-assisted-decoding)" + ) + if self.config.get_text_config().vocab_size == assistant_model.config.get_text_config().vocab_size: + if assistant_tokenizer is not None: + raise ValueError( + f"`assistant_tokenizer` is not required when the main and assistant models use the same tokenizer. Please omit `assistant_tokenizer` from `generate()` {doc_reference}." + ) + else: + if tokenizer is None or assistant_tokenizer is None: + raise ValueError( + f"The main and assistant models have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` {doc_reference}." + ) def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): """Validates model kwargs for generation. Generate argument typos will also be caught here.""" @@ -1580,7 +1640,11 @@ def _supports_default_dynamic_cache(self) -> bool: order to save memory (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed for `HybridMambaAttentionDynamicCache`).
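# Editor's illustration (not part of the diff): with `_validate_assistant` above, assisted
# generation across different tokenizers requires passing both tokenizers to `generate()`.
# The checkpoint names below are placeholders, not real models.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("target-checkpoint")         # hypothetical
assistant = AutoModelForCausalLM.from_pretrained("assistant-checkpoint")  # hypothetical, smaller
tok = AutoTokenizer.from_pretrained("target-checkpoint")
assistant_tok = AutoTokenizer.from_pretrained("assistant-checkpoint")

inputs = tok("Alice and Bob", return_tensors="pt")
out = model.generate(
    **inputs,
    assistant_model=assistant,
    tokenizer=tok,                      # target tokenizer
    assistant_tokenizer=assistant_tok,  # required when the vocabularies differ
)
print(tok.batch_decode(out, skip_special_tokens=True))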
""" - return self._supports_cache_class and "jamba" not in self.__class__.__name__.lower() + return ( + self._supports_cache_class + and "jamba" not in self.__class__.__name__.lower() + and "zamba" not in self.__class__.__name__.lower() + ) def _prepare_cache_for_generation( self, @@ -1674,10 +1738,10 @@ def _prepare_cache_for_generation( ) cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend] - if cache_config.backend == "quanto" and not is_quanto_available(): + if cache_config.backend == "quanto" and not (is_optimum_quanto_available() or is_quanto_available()): raise ImportError( - "You need to install `quanto` in order to use KV cache quantization with quanto backend. " - "Please install it via with `pip install quanto`" + "You need to install optimum-quanto in order to use KV cache quantization with optimum-quanto backend. " + "Please install it via with `pip install optimum-quanto`" ) elif cache_config.backend == "HQQ" and not is_hqq_available(): raise ImportError( @@ -1692,11 +1756,10 @@ def _prepare_cache_for_generation( # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that # keeps copying the cache thus using much more memory else: - num_hidden_layers = self.config.get_text_config().num_hidden_layers model_kwargs[cache_name] = ( - DynamicCache(num_hidden_layers) + DynamicCache() if not requires_cross_attention_cache - else EncoderDecoderCache(DynamicCache(num_hidden_layers), DynamicCache(num_hidden_layers)) + else EncoderDecoderCache(DynamicCache(), DynamicCache()) ) def _supports_num_logits_to_keep(self) -> bool: @@ -1851,9 +1914,9 @@ def generate( for constrained generation conditioned on the prefix, as described in [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904). synced_gpus (`bool`, *optional*): - Whether to continue running the while loop until max_length. Unless overridden this flag will be set to - `True` under DeepSpeed ZeRO Stage 3 multiple GPUs environment to avoid hanging if one GPU finished - generating before other GPUs. Otherwise it'll be set to `False`. + Whether to continue running the while loop until max_length. Unless overridden, this flag will be set + to `True` if using `FullyShardedDataParallel` or DeepSpeed ZeRO Stage 3 with multiple GPUs to avoid + deadlocking if one GPU finishes generating before other GPUs. Otherwise, defaults to `False`. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model @@ -1888,19 +1951,19 @@ def generate( - [`~generation.GenerateEncoderDecoderOutput`], - [`~generation.GenerateBeamEncoderDecoderOutput`] """ + # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call self._validate_model_class() tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria + assistant_tokenizer = kwargs.pop("assistant_tokenizer", None) # only used for assisted generation + generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) self._validate_model_kwargs(model_kwargs.copy()) - self._validate_assistant(assistant_model) + self._validate_assistant(assistant_model, tokenizer, assistant_tokenizer) # 2. 
Set generation parameters if not already defined if synced_gpus is None: - if is_deepspeed_zero3_enabled() and dist.get_world_size() > 1: - synced_gpus = True - else: - synced_gpus = False + synced_gpus = (is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)) and dist.get_world_size() > 1 logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() @@ -2075,6 +2138,8 @@ def generate( inputs_tensor=inputs_tensor, assistant_model=assistant_model, logits_processor=logits_processor, + target_tokenizer=tokenizer, + assistant_tokenizer=assistant_tokenizer, model_kwargs=model_kwargs, ) @@ -2432,7 +2497,8 @@ def _dola_decoding( generation_config ([`~generation.GenerationConfig`]): The generation configuration to be used as parametrization of the decoding method. synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. @@ -2635,7 +2701,8 @@ def _contrastive_search( generation_config ([`~generation.GenerationConfig`]): The generation configuration to be used as parametrization of the decoding method. synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. @@ -3038,7 +3105,8 @@ def _sample( generation_config ([`~generation.GenerationConfig`]): The generation configuration to be used as parametrization of the decoding method. synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. @@ -3240,7 +3308,8 @@ def _beam_search( generation_config ([`~generation.GenerationConfig`]): The generation configuration to be used as parametrization of the decoding method. synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). model_kwargs: Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. @@ -3518,7 +3587,8 @@ def _group_beam_search( generation_config ([`~generation.GenerationConfig`]): The generation configuration to be used as parametrization of the decoding method. 
synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). model_kwargs: Additional model specific kwargs that will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. @@ -3807,7 +3877,8 @@ def _constrained_beam_search( generation_config ([`~generation.GenerationConfig`]): The generation configuration to be used as parametrization of the decoding method. synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). model_kwargs: Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. @@ -4045,7 +4116,8 @@ def _assisted_decoding( generation_config ([`~generation.GenerationConfig`]): The generation configuration to be used as parametrization of the decoding method. synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. @@ -4103,7 +4175,7 @@ def _assisted_decoding( # 1. Fetch candidate sequences from a `CandidateGenerator` candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids) - candidate_input_ids = candidate_input_ids.to(self.device) + if candidate_logits is not None: candidate_logits = candidate_logits.to(self.device) diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index 4b5548fffb4154..d03ff7004f2b6c 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -138,7 +138,14 @@ def __init__(self, dataclass_types: Union[DataClassType, Iterable[DataClassType] @staticmethod def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): - field_name = f"--{field.name}" + # Long-option strings are conventionally separated by hyphens rather + # than underscores, e.g., "--long-format" rather than "--long_format". + # Argparse converts hyphens to underscores so that the destination + # string is a valid attribute name. `HfArgumentParser` should do the same. + long_options = [f"--{field.name}"] + if "_" in field.name: + long_options.append(f"--{field.name.replace('_', '-')}") + kwargs = field.metadata.copy() # field.metadata is not used at all by Data Classes, # it is provided as a third-party extension mechanism. @@ -198,11 +205,11 @@ def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): if field.type is bool or (field.default is not None and field.default is not dataclasses.MISSING): # Default value is False if we have no default when of type bool.
default = False if field.default is dataclasses.MISSING else field.default - # This is the value that will get picked if we don't include --field_name in any way + # This is the value that will get picked if we don't include --{field.name} in any way kwargs["default"] = default - # This tells argparse we accept 0 or 1 value after --field_name + # This tells argparse we accept 0 or 1 value after --{field.name} kwargs["nargs"] = "?" - # This is the value that will get picked if we do --field_name (without value) + # This is the value that will get picked if we do --{field.name} (without value) kwargs["const"] = True elif isclass(origin_type) and issubclass(origin_type, list): kwargs["type"] = field.type.__args__[0] @@ -219,7 +226,7 @@ def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): kwargs["default"] = field.default_factory() else: kwargs["required"] = True - parser.add_argument(field_name, *aliases, **kwargs) + parser.add_argument(*long_options, *aliases, **kwargs) # Add a complement `no_*` argument for a boolean field AFTER the initial field has already been added. # Order is important for arguments with the same destination! @@ -227,7 +234,13 @@ def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): # here and we do not need those changes/additional keys. if field.default is True and (field.type is bool or field.type == Optional[bool]): bool_kwargs["default"] = False - parser.add_argument(f"--no_{field.name}", action="store_false", dest=field.name, **bool_kwargs) + parser.add_argument( + f"--no_{field.name}", + f"--no-{field.name.replace('_', '-')}", + action="store_false", + dest=field.name, + **bool_kwargs, + ) def _add_dataclass_arguments(self, dtype: DataClassType): if hasattr(dtype, "_argument_group_name"): diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 00bbcf2d060fe9..093e0af29844e4 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -21,9 +21,16 @@ "awq": [ "fuse_awq_modules", "post_init_awq_exllama_modules", + "post_init_awq_ipex_modules", "replace_quantization_scales", "replace_with_awq_linear", ], + "bitnet": [ + "BitLinear", + "pack_weights", + "replace_with_bitnet_linear", + "unpack_weights", + ], "bitsandbytes": [ "dequantize_and_replace", "get_keys_to_not_convert", @@ -47,6 +54,7 @@ ], "eetq": ["replace_with_eetq_linear"], "fbgemm_fp8": ["FbgemmFp8Linear", "replace_with_fbgemm_fp8_linear"], + "fsdp": ["is_fsdp_managed_module"], "ggml": [ "GGUF_CONFIG_MAPPING", "GGUF_TENSOR_MAPPING", @@ -115,9 +123,16 @@ from .awq import ( fuse_awq_modules, post_init_awq_exllama_modules, + post_init_awq_ipex_modules, replace_quantization_scales, replace_with_awq_linear, ) + from .bitnet import ( + BitLinear, + pack_weights, + replace_with_bitnet_linear, + unpack_weights, + ) from .bitsandbytes import ( dequantize_and_replace, get_keys_to_not_convert, @@ -141,6 +156,7 @@ ) from .eetq import replace_with_eetq_linear from .fbgemm_fp8 import FbgemmFp8Linear, replace_with_fbgemm_fp8_linear + from .fsdp import is_fsdp_managed_module from .ggml import ( GGUF_CONFIG_MAPPING, GGUF_TENSOR_MAPPING, diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 18e1931d070d6a..c860ea1f53744b 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -13,9 +13,13 @@ # limitations under the License. 
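# Editor's illustration (not part of the diff): with the `HfArgumentParser` change above,
# hyphenated long options are accepted alongside the underscore form, and boolean fields with
# a True default also get a hyphenated `--no-...` complement. The field names are examples only.
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class ExampleArguments:
    learning_rate: float = field(default=1e-4)
    do_eval: bool = field(default=True)


parser = HfArgumentParser(ExampleArguments)
(args,) = parser.parse_args_into_dataclasses(["--learning-rate", "3e-5", "--no-do-eval"])
print(args.learning_rate, args.do_eval)  # 3e-05 False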
"AWQ (Activation aware Weight Quantization) integration file" +import importlib + +from packaging import version + from ..activations import ACT2FN from ..modeling_utils import PreTrainedModel -from ..utils import is_auto_awq_available, is_torch_available, logging +from ..utils import is_auto_awq_available, is_ipex_available, is_torch_available, logging from ..utils.quantization_config import ( AwqBackendPackingMethod, AwqConfig, @@ -145,6 +149,10 @@ def replace_with_awq_linear( target_cls = WQLinear_ExllamaV2 else: raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}") + elif quantization_config.version == AWQLinearVersion.IPEX: + from awq.modules.linear.gemm_ipex import WQLinear_IPEX + + target_cls = WQLinear_IPEX else: raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}") else: @@ -266,8 +274,11 @@ def fuse_awq_modules(model, quantization_config): # Replace layer norms _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm) - # Replace MLP layers - _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP) + # Replace MLP layers if awq version is not ipex. + if quantization_config.version != "ipex": + _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP) + else: + logger.info("The IPEX version AWQ does not support fuse mlp for now.") # Replace attention layers attention_has_been_fused = _fuse_awq_attention_layers( @@ -389,6 +400,12 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na elif isinstance(q_proj, WQLinear_GEMM): linear_target_cls = WQLinear_GEMM cat_dim = 1 + elif is_ipex_available() and version.parse(importlib.metadata.version("autoawq")) > version.parse("0.2.6"): + from awq.modules.linear import WQLinear_IPEX + + if isinstance(q_proj, WQLinear_IPEX): + linear_target_cls = WQLinear_IPEX + cat_dim = 1 else: raise ValueError("Unsupported q_proj type: {type(q_proj)}") @@ -466,3 +483,16 @@ def post_init_awq_exllama_modules(model, exllama_config): raise ValueError(f"Unrecognized Exllama version: {exllama_config['version']}") return model + + +def post_init_awq_ipex_modules(model): + """ + Runs post init for IPEX layers which performs: + - Weights packing, reordering and repacking + """ + + from awq.modules.linear.gemm_ipex import ipex_post_init + + model = ipex_post_init(model) + + return model diff --git a/src/transformers/integrations/bitnet.py b/src/transformers/integrations/bitnet.py new file mode 100644 index 00000000000000..3386bdcb43b27c --- /dev/null +++ b/src/transformers/integrations/bitnet.py @@ -0,0 +1,286 @@ +from ..utils import is_accelerate_available, is_torch_available, logging + + +if is_accelerate_available(): + from accelerate import init_empty_weights + +if is_torch_available(): + import torch + import torch.nn as nn + import torch.nn.functional as F + +logger = logging.get_logger(__name__) + + +# the weights are ternary so can be represented with 2 bits, and they are packed in uint8 tensors, hence the number of values per item is 4 +VALUES_PER_ITEM = 4 + + +def pack_weights(quantized_weights: torch.Tensor) -> torch.Tensor: + """ + Packs a tensor of quantized weights into a compact format using 2 bits per value. + + Parameters: + ----------- + quantized_weights : torch.Tensor + A tensor containing ternary quantized weights with values in {-1, 0, 1}. These values are adjusted to + {0, 1, 2} before being packed. 
+ + Returns: + -------- + torch.Tensor + A packed tensor where each element stores 4 quantized values (each using 2 bits) in an 8-bit format. + """ + + original_shape = quantized_weights.shape + + row_dim = (original_shape[0] + VALUES_PER_ITEM - 1) // VALUES_PER_ITEM + + if len(original_shape) == 1: + packed_tensor_shape = (row_dim,) + else: + packed_tensor_shape = (row_dim, *original_shape[1:]) + + quantized_weights += 1 + packed = torch.zeros(packed_tensor_shape, device=quantized_weights.device, dtype=torch.uint8) + unpacked = quantized_weights.to(torch.uint8) + + it = min(VALUES_PER_ITEM, (original_shape[0] // row_dim) + 1) + for i in range(it): + start = i * row_dim + end = min(start + row_dim, original_shape[0]) + packed[: (end - start)] |= unpacked[start:end] << 2 * i + + return packed + + +@torch.compile +def unpack_weights(packed: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: + """ + Unpacks a tensor of quantized weights that were stored in a packed format using 2 bits per value. + + Parameters: + ----------- + packed : torch.Tensor + A tensor containing packed weights where each element represents 4 quantized values (using 2 bits per value). + dtype : torch.dtype + The dtype of the returned Tensor + Returns: + -------- + torch.Tensor + A tensor of unpacked weights, where each value is converted from its packed 2-bit representation. + + Example: + -------- + packed = torch.tensor([[0b10100001, 0b00011000], + [0b10010000, 0b00001010]], dtype=torch.uint8) + + # Unpack the values + unpacked = unpack_weights(packed) + + # Resulting unpacked tensor + print(unpacked) + # Output: tensor([[ 0, -1], + [-1, 1], + [-1, 1], + [-1, 1], + [ 1, 0], + [ 0, -1], + [ 1, -1], + [ 1, -1]]) + + Explanation of the example: + --------------------------- + Let's take the first value for example 0b10100001, we we will only focus on the first column, + because every element is unpacked across the first dimension + - First 2 bits: `01` → 0 at [0][0] + - Second 2 bits: `00` → -1 at [0][2] + - Third 2 bits: `10` → 1 at [0][4] + - Fourth 2 bits: `10` → 1 at [0][6] + the second value of the same row (0b10010000) will give the values for [0][1], [0][3], [0][5], [0][7] + + We subtract 1 because during the packing process, it's easier to work with values like 0, 1, and 2. To make this possible, + we add 1 to the original ternary weights (which are typically -1, 0, and 1) when packing them. When unpacking, we reverse + this by subtracting 1 to restore the original ternary values. 
+ """ + packed_shape = packed.shape + + if len(packed_shape) == 1: + original_row_dim = packed_shape[0] * VALUES_PER_ITEM + unpacked_shape = (original_row_dim,) + else: + original_row_dim = packed_shape[0] * VALUES_PER_ITEM + unpacked_shape = (original_row_dim, *packed_shape[1:]) + + unpacked = torch.zeros(unpacked_shape, device=packed.device, dtype=torch.uint8) + + for i in range(VALUES_PER_ITEM): + start = i * packed_shape[0] + end = start + packed_shape[0] + mask = 3 << (2 * i) + unpacked[start:end] = (packed & mask) >> (2 * i) + + return unpacked.to(dtype) - 1 + + +class BitLinear(nn.Module): + def __init__(self, in_features: int, out_features: int, bias: bool, device=None, dtype=None): + super().__init__() + self.dtype = dtype + self.register_buffer( + "weight", + torch.zeros( + (out_features // VALUES_PER_ITEM, in_features), + dtype=torch.uint8, + device=device, + ), + ) + self.register_buffer( + "weight_scale", + torch.ones( + (1), + dtype=dtype, + device=device, + ), + ) + if bias: + self.register_buffer("bias", torch.zeros((out_features), dtype=dtype, device=device)) + else: + self.bias = None + + @torch.compile + def activation_quant(self, input, num_bits=8): + """ + Activation function : Performs symmetric, per-token quantization on the input activations. + Parameters: + ----------- + x : torch.Tensor + Input activations to be quantized. + num_bits : int, optional (default=8) + Number of bits to use for quantization, determining the quantization range. + + Returns: + -------- + result : torch.Tensor + Quantized activation tensor, with values mapped to an `int8` range. + scale : torch.Tensor + The per-channel scaling factors used to quantize the tensor. + """ + Qn = -(2 ** (num_bits - 1)) + Qp = 2 ** (num_bits - 1) - 1 + scale = Qp / input.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5) + result = (input * scale).round().clamp(Qn, Qp) + return result.to(torch.int8), scale + + @torch.compile + def post_quant_process(self, input, input_scale, weight_scale): + out = input / (input_scale * weight_scale) + return out + + def forward(self, input): + w = self.weight + w_quant = unpack_weights(w, dtype=self.dtype) + input_quant, input_scale = self.activation_quant(input) + y = F.linear(input_quant.to(self.dtype), w_quant) + y = self.post_quant_process(y, self.weight_scale, input_scale) + if self.bias is not None: + y += self.bias.view(1, -1).expand_as(y) + return y + + +def _replace_with_bitnet_linear( + model, + modules_to_not_convert=None, + current_key_name=None, + quantization_config=None, + has_been_replaced=False, + pre_quantized=False, +): + """ + Private method that wraps the recursion for module replacement. + + Returns the converted model and a boolean that indicates if the conversion has been successfull or not. 
+    """
+
+    if current_key_name is None:
+        current_key_name = []
+
+    for name, module in model.named_children():
+        if current_key_name is None:
+            current_key_name = []
+        current_key_name.append(name)
+
+        # Check if the current key is not in the `modules_to_not_convert`
+        if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
+            with init_empty_weights():
+                if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
+                    in_features = module.in_features
+                    out_features = module.out_features
+                    model._modules[name] = BitLinear(
+                        in_features=in_features,
+                        out_features=out_features,
+                        bias=module.bias is not None,
+                        device=module.weight.device,
+                        dtype=module.weight.dtype,
+                    )
+                    has_been_replaced = True
+                    model._modules[name].requires_grad_(False)
+
+        if len(list(module.children())) > 0:
+            _, has_been_replaced = _replace_with_bitnet_linear(
+                module,
+                modules_to_not_convert=modules_to_not_convert,
+                current_key_name=current_key_name,
+                quantization_config=quantization_config,
+                has_been_replaced=has_been_replaced,
+            )
+        # Remove the last key for recursion
+        current_key_name.pop(-1)
+    return model, has_been_replaced
+
+
+def replace_with_bitnet_linear(
+    model,
+    modules_to_not_convert=None,
+    current_key_name=None,
+    quantization_config=None,
+    pre_quantized=False,
+):
+    """
+    A helper function to replace all `torch.nn.Linear` modules by `BitLinear` modules.
+
+    The function will be run recursively and replace all `torch.nn.Linear` modules except for the `lm_head` that should
+    be kept as a `torch.nn.Linear` module. The replacement is done under `init_empty_weights` context manager so no
+    CPU/GPU memory is required to run this function. Each weight will be quantized along the channel.
+
+    Parameters:
+        model (`torch.nn.Module`):
+            Input model or `torch.nn.Module` as the function is run recursively.
+        modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
+            Names of the modules to not convert into `BitLinear`. In practice we keep the `lm_head` in full precision
+            for numerical stability reasons.
+        current_key_name (`List[`str`]`, *optional*):
+            An array to track the current key of the recursion. This is used to check whether the current key (part of
+            it) is not in the list of modules to not convert (for instance, modules that are offloaded to `cpu` or
+            `disk`).
+    """
+    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert
+    if quantization_config and quantization_config.modules_to_not_convert is not None:
+        modules_to_not_convert.extend(quantization_config.modules_to_not_convert)
+    modules_to_not_convert = list(set(modules_to_not_convert))
+    model, has_been_replaced = _replace_with_bitnet_linear(
+        model,
+        modules_to_not_convert,
+        current_key_name,
+        quantization_config,
+        pre_quantized=pre_quantized,
+    )
+
+    if not has_been_replaced:
+        logger.warning(
+            "You are loading your model using bitnet but no linear modules were found in your model."
+            " Please double check your model architecture, or submit an issue on github if you think this is"
+            " a bug."
+ ) + + return model diff --git a/src/transformers/integrations/executorch.py b/src/transformers/integrations/executorch.py index 4adfcc39a4b1fa..c0adff386f6312 100644 --- a/src/transformers/integrations/executorch.py +++ b/src/transformers/integrations/executorch.py @@ -114,6 +114,56 @@ def forward(self, input_ids: torch.Tensor, cache_position: torch.Tensor): ) return outs.logits + @staticmethod + def generate( + exported_program: torch.export.ExportedProgram, prompt_token_ids: torch.Tensor, max_new_tokens: int + ) -> torch.Tensor: + """ + Generate a sequence of tokens using an exported program. + + This util function is designed to test exported models by simulating the generation process. + It processes the input prompt tokens sequentially (no parallel prefill). + This generate function is not intended to replace the original `generate` method, and the support + for leveraging the original `generate` is potentially planed! + + Args: + exported_program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`. + prompt_token_ids (`torch.Tensor`): Tensor representing the input prompt token IDs. + max_new_tokens (`int`): Maximum number of new tokens to generate. Note that the total generation + length is limited by both `max_new_tokens` and the model's cache size. + + Returns: + torch.Tensor: A tensor containing the generated sequence of token IDs, including the original prompt tokens. + """ + prompt_token_len = prompt_token_ids.shape[-1] + max_generation_length = prompt_token_len + max_new_tokens + for buffer_name, buffer in exported_program.named_buffers(): + if buffer_name.startswith("static_cache.key_cache"): + max_cache_len = buffer.shape[2] + max_generation_length = min(max_generation_length, max_cache_len) + break + + response_tokens = [] + for input_pos in range(min(max_generation_length, prompt_token_len)): + result = exported_program.module().forward( + input_ids=prompt_token_ids[:, input_pos : input_pos + 1], + cache_position=torch.tensor([input_pos], dtype=torch.long), + ) + response_tokens.append(prompt_token_ids[0][input_pos].item()) + + current_token = torch.argmax(result[:, -1, :], dim=-1).item() + response_tokens.append(current_token) + + while len(response_tokens) < max_generation_length: + result = exported_program.module().forward( + input_ids=torch.tensor([[current_token]], dtype=torch.long), + cache_position=torch.tensor([len(response_tokens)], dtype=torch.long), + ) + current_token = torch.argmax(result[:, -1, :], dim=-1).item() + response_tokens.append(current_token) + + return torch.tensor([response_tokens], dtype=torch.long) + def convert_and_export_with_cache( model: PreTrainedModel, diff --git a/src/transformers/integrations/fsdp.py b/src/transformers/integrations/fsdp.py new file mode 100644 index 00000000000000..7bcb11fe74e4d6 --- /dev/null +++ b/src/transformers/integrations/fsdp.py @@ -0,0 +1,33 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
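# Illustrative sketch (not part of the patch): round-tripping ternary weights through the
# `pack_weights` / `unpack_weights` helpers added in bitnet.py above. Assumes the exports added to
# `transformers.integrations` in this PR and a torch build where `torch.compile` works (the
# unpacking helper is compiled). Shapes are toy values.
import torch

from transformers.integrations import pack_weights, unpack_weights

# Ternary weights in {-1, 0, 1}; out_features (first dim) is a multiple of 4, so the
# four-values-per-uint8 packing divides evenly.
w = torch.randint(-1, 2, (8, 16), dtype=torch.int8)

packed = pack_weights(w.clone())  # clone() because pack_weights shifts its input to {0, 1, 2} in place
print(packed.shape, packed.dtype)  # torch.Size([2, 16]) torch.uint8, i.e. 4x smaller along dim 0

restored = unpack_weights(packed, dtype=torch.int8)
assert torch.equal(restored, w)  # lossless when the leading dim divides evenly by 4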
+from __future__ import annotations + +from typing import TYPE_CHECKING + +from ..utils import is_torch_available + + +if TYPE_CHECKING: + from torch import nn + + +def is_fsdp_managed_module(module: nn.Module) -> bool: + if not is_torch_available(): + return False + + import torch.distributed.fsdp + + return isinstance(module, torch.distributed.fsdp.FullyShardedDataParallel) or getattr( + module, "_is_fsdp_managed_module", False + ) diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index d7a548791e8a70..cc317b18b052ed 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -82,10 +82,15 @@ "qwen2moe": { "token_embd": "model.embed_tokens", "blk": "model.layers", - "ffn_up": "mlp.up_proj", - "ffn_down": "mlp.down_proj", - "ffn_gate": "mlp.gate_proj", + "ffn_up_exps": "mlp.experts", + "ffn_up_shexp": "mlp.shared_expert.up_proj", + "ffn_down_exps": "mlp.experts", + "ffn_down_shexp": "mlp.shared_expert.down_proj", "ffn_norm": "post_attention_layernorm", + "ffn_gate_inp.weight": "mlp.gate.weight", + "ffn_gate_exps": "mlp.experts", + "ffn_gate_shexp": "mlp.shared_expert.gate_proj", + "ffn_gate_inp_shexp": "mlp.shared_expert_gate", "attn_norm": "input_layernorm", "attn_q": "self_attn.q_proj", "attn_v": "self_attn.v_proj", @@ -120,6 +125,57 @@ "output.weight": "lm_head.weight", "output_norm": "transformer.ln_f", }, + "falcon7b": { + "token_embd": "word_embeddings", + "blk": "h", + "ffn_up": "mlp.dense_h_to_4h", + "ffn_down": "mlp.dense_4h_to_h", + "attn_norm": "input_layernorm", + "attn_qkv": "self_attention.query_key_value", + "attn_output": "self_attention.dense", + ".output.": ".lm_head.", + "output_norm": "ln_f", + }, + "falcon40b": { + "token_embd": "word_embeddings", + "blk": "h", + "ffn_up": "mlp.dense_h_to_4h", + "ffn_down": "mlp.dense_4h_to_h", + ".attn_norm.": ".ln_mlp.", + "attn_norm_2": "ln_attn", + "attn_qkv": "self_attention.query_key_value", + "attn_output": "self_attention.dense", + ".output.": ".lm_head.", + "output_norm": "ln_f", + }, + "stablelm": { + "token_embd": "model.embed_tokens", + "blk": "model.layers", + "ffn_up": "mlp.up_proj", + "ffn_down": "mlp.down_proj", + "ffn_gate": "mlp.gate_proj", + "ffn_norm": "post_attention_layernorm", + "attn_norm": "input_layernorm", + "attn_q": "self_attn.q_proj", + "attn_v": "self_attn.v_proj", + "attn_k": "self_attn.k_proj", + "attn_output": "self_attn.o_proj", + "output.weight": "lm_head.weight", + "output_norm": "model.norm", + }, + "gpt2": { + "token_embd": "transformer.wte", + "blk": "transformer.h", + "position_embd": "transformer.wpe", + "output_norm": "transformer.ln_f", + "attn_norm": "ln_1", + "attn_qkv": "attn.c_attn", + "attn_output.weight": "attn.c_proj.weight", + "attn_output.bias": "attn.c_proj.bias", + "ffn_norm": "ln_2", + "ffn_up": "mlp.c_fc", + "ffn_down": "mlp.c_proj", + }, } @@ -177,6 +233,20 @@ "attention.head_count_kv": "num_key_value_heads", "attention.layer_norm_rms_epsilon": "rms_norm_eps", "vocab_size": "vocab_size", + "expert_count": "num_experts", + "expert_used_count": "num_experts_per_tok", + }, + "falcon": { + "context_length": "max_position_embeddings", + "block_count": "num_hidden_layers", + "feed_forward_length": "intermediate_size", + "embedding_length": "hidden_size", + "rope.dimension_count": None, + "rope.freq_base": "rope_theta", + "attention.head_count": "num_attention_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_rms_epsilon": "rms_norm_eps", + "vocab_size": "vocab_size", }, 
"tokenizer": { "ggml.bos_token_id": "bos_token_id", @@ -203,6 +273,25 @@ "vocab_size": "vocab_size", "attention.layer_norm_epsilon": "layer_norm_epsilon", }, + "stablelm": { + "context_length": "max_position_embeddings", + "block_count": "num_hidden_layers", + "feed_forward_length": "intermediate_size", + "embedding_length": "hidden_size", + "rope.dimension_count": None, + "attention.head_count": "num_attention_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_epsilon": "layer_norm_eps", + "vocab_size": "vocab_size", + }, + "gpt2": { + "block_count": "n_layer", + "context_length": "n_ctx", + "embedding_length": "n_embd", + "feed_forward_length": "feed_forward_length", + "attention.head_count": "n_head", + "attention.layer_norm_epsilon": "layer_norm_epsilon", + }, } GGUF_TOKENIZER_MAPPING = { @@ -512,7 +601,7 @@ def converted(self) -> Tokenizer: return tokenizer -class GGUFBloomConverter(GPT2Converter): +class GGUFGPTConverter(GPT2Converter): def __init__(self, tokenizer_dict): self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict) self.additional_kwargs = {} @@ -529,7 +618,10 @@ def converted(self) -> Tokenizer: "qwen2": GGUFQwen2Converter, "qwen2_moe": GGUFQwen2Converter, "phi3": GGUFPhi3Converter, - "bloom": GGUFBloomConverter, + "bloom": GGUFGPTConverter, + "falcon": GGUFGPTConverter, + "stablelm": GGUFGPTConverter, + "gpt2": GGUFGPTConverter, } diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 40298f9c6fc77b..5f0ac55d0eb5fd 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -915,7 +915,7 @@ def on_train_end(self, args, state, control, model=None, tokenizer=None, **kwarg if self._log_model.is_enabled and self._initialized and state.is_world_process_zero: from ..trainer import Trainer - fake_trainer = Trainer(args=args, model=model, tokenizer=tokenizer) + fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer) with tempfile.TemporaryDirectory() as temp_dir: fake_trainer.save_model(temp_dir) metadata = ( @@ -2112,7 +2112,7 @@ def on_train_end(self, args, state, control, **kwargs): from transformers.trainer import Trainer if self._log_model is True: - fake_trainer = Trainer(args=args, model=kwargs.get("model"), tokenizer=kwargs.get("tokenizer")) + fake_trainer = Trainer(args=args, model=kwargs.get("model"), processing_class=kwargs.get("tokenizer")) name = "best" if args.load_best_model_at_end else "last" output_dir = os.path.join(args.output_dir, name) fake_trainer.save_model(output_dir) diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index aa0b181540c7ed..bd0ca16f865f4c 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -11,10 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import importlib import inspect import warnings from typing import Any, Dict, List, Optional, Union +from packaging import version + from ..utils import ( check_peft_version, find_adapter_config_file, @@ -77,6 +80,7 @@ def load_adapter( offload_index: Optional[int] = None, peft_config: Dict[str, Any] = None, adapter_state_dict: Optional[Dict[str, "torch.Tensor"]] = None, + low_cpu_mem_usage: bool = False, adapter_kwargs: Optional[Dict[str, Any]] = None, ) -> None: """ @@ -129,12 +133,27 @@ def load_adapter( adapter_state_dict (`Dict[str, torch.Tensor]`, *optional*): The state dict of the adapter to load. This argument is used in case users directly pass PEFT state dicts + low_cpu_mem_usage (`bool`, *optional*, defaults to `False`): + Reduce memory usage while loading the PEFT adapter. This should also speed up the loading process. + Requires PEFT version 0.13.0 or higher. adapter_kwargs (`Dict[str, Any]`, *optional*): Additional keyword arguments passed along to the `from_pretrained` method of the adapter config and `find_adapter_config_file` method. """ check_peft_version(min_version=MIN_PEFT_VERSION) + # peft only supports low_cpu_mem_usage starting from v0.13.0 + peft_load_kwargs = {} + if low_cpu_mem_usage: + min_version_lcmu = "0.13.0" + if version.parse(importlib.metadata.version("peft")) >= version.parse(min_version_lcmu): + peft_load_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage + else: + raise ValueError( + "The version of PEFT you are using does not support `low_cpu_mem_usage` yet, " + f"please install PEFT >= {min_version_lcmu}." + ) + adapter_name = adapter_name if adapter_name is not None else "default" if adapter_kwargs is None: adapter_kwargs = {} @@ -192,7 +211,7 @@ def load_adapter( ) # Create and add fresh new adapters into the model. - inject_adapter_in_model(peft_config, self, adapter_name) + inject_adapter_in_model(peft_config, self, adapter_name, **peft_load_kwargs) if not self._hf_peft_config_loaded: self._hf_peft_config_loaded = True @@ -211,7 +230,9 @@ def load_adapter( processed_adapter_state_dict[new_key] = value # Load state dict - incompatible_keys = set_peft_model_state_dict(self, processed_adapter_state_dict, adapter_name) + incompatible_keys = set_peft_model_state_dict( + self, processed_adapter_state_dict, adapter_name, **peft_load_kwargs + ) if incompatible_keys is not None: # check only for unexpected keys diff --git a/src/transformers/integrations/quanto.py b/src/transformers/integrations/quanto.py index 67fe9166d334e5..27b32de63bfe55 100644 --- a/src/transformers/integrations/quanto.py +++ b/src/transformers/integrations/quanto.py @@ -12,12 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..utils import is_torch_available +from ..utils import is_optimum_quanto_available, is_quanto_available, is_torch_available, logging if is_torch_available(): import torch +logger = logging.get_logger(__name__) + def replace_with_quanto_layers( model, @@ -45,7 +47,14 @@ def replace_with_quanto_layers( should not be passed by the user. """ from accelerate import init_empty_weights - from quanto import QLayerNorm, QLinear, qfloat8, qint2, qint4, qint8 + + if is_optimum_quanto_available(): + from optimum.quanto import QLayerNorm, QLinear, qfloat8, qint2, qint4, qint8 + elif is_quanto_available(): + logger.warning_once( + "Importing from quanto will be deprecated in v4.47. 
Please install optimum-quanto instead `pip install optimum-quanto`" + ) + from quanto import QLayerNorm, QLinear, qfloat8, qint2, qint4, qint8 w_mapping = {"float8": qfloat8, "int8": qint8, "int4": qint4, "int2": qint2} a_mapping = {None: None, "float8": qfloat8, "int8": qint8} diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py index acabf94d954645..57cd971dd79a53 100644 --- a/src/transformers/modelcard.py +++ b/src/transformers/modelcard.py @@ -874,13 +874,17 @@ def extract_hyperparameters_from_trainer(trainer): if total_eval_batch_size != hyperparameters["eval_batch_size"]: hyperparameters["total_eval_batch_size"] = total_eval_batch_size - if trainer.args.adafactor: - hyperparameters["optimizer"] = "Adafactor" - else: - hyperparameters["optimizer"] = ( - f"Adam with betas=({trainer.args.adam_beta1},{trainer.args.adam_beta2}) and" - f" epsilon={trainer.args.adam_epsilon}" - ) + if trainer.args.optim: + optimizer_name = trainer.args.optim + optimizer_args = trainer.args.optim_args if trainer.args.optim_args else "No additional optimizer arguments" + + if "adam" in optimizer_name.lower(): + hyperparameters["optimizer"] = ( + f"Use {optimizer_name} with betas=({trainer.args.adam_beta1},{trainer.args.adam_beta2}) and" + f" epsilon={trainer.args.adam_epsilon} and optimizer_args={optimizer_args}" + ) + else: + hyperparameters["optimizer"] = f"Use {optimizer_name} and the args are:\n{optimizer_args}" hyperparameters["lr_scheduler_type"] = trainer.args.lr_scheduler_type.value if trainer.args.warmup_ratio != 0.0: diff --git a/src/transformers/modeling_attn_mask_utils.py b/src/transformers/modeling_attn_mask_utils.py index 08eeaf9765920b..4319c021cb2bc3 100755 --- a/src/transformers/modeling_attn_mask_utils.py +++ b/src/transformers/modeling_attn_mask_utils.py @@ -281,7 +281,7 @@ def _ignore_causal_mask_sdpa( elif sliding_window is None or key_value_length < sliding_window: if len(attention_mask.shape) == 4: return False - elif (is_training or not is_tracing) and torch.all(attention_mask == 1): + elif not is_tracing and torch.all(attention_mask == 1): if query_length == 1 or key_value_length == query_length: # For query_length == 1, causal attention and bi-directional attention are the same. ignore_causal_mask = True diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py index 44e61825dd9cd6..da961c6060e499 100644 --- a/src/transformers/modeling_flash_attention_utils.py +++ b/src/transformers/modeling_flash_attention_utils.py @@ -267,7 +267,8 @@ def _flash_attention_forward( # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage. 
# Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach - elif position_ids is not None and not (torch.diff(position_ids, dim=-1) >= 0).all() and query_length != 1: + # Note: the `torch.diff(...)` condition is last to use short-circuit and avoid the cuda synchronization it incurs during inference (query_length == 1 always) + elif position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all(): batch_size = query_states.size(0) query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids( query_states, key_states, value_states, position_ids diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index c2e06624e15714..b1d7b896085476 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -14,7 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +import re +from typing import Dict, Optional import numpy as np from tqdm import tqdm @@ -99,8 +100,20 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): if "qwen2moe" in architecture: updated_architecture = "qwen2_moe" - if architecture not in GGUF_SUPPORTED_ARCHITECTURES: - raise ValueError(f"Architecture {architecture} not supported") + model_size = "" + # extract the number of params from file name as architectures can differ ; + # eg. for falcon : `...falcon-7b-...` + if "falcon" in architecture: + gguf_file_name = gguf_checkpoint_path.split("/")[-1].lower() + m = re.search(r"-\d+b-", gguf_file_name) # regex to catch `-7b-` + if m is None: + raise ValueError( + f"From file name, cannot determine the number of parameters for {architecture} architecture" + ) + model_size = m.group().strip("-") # only keeps `7b` + + if architecture + model_size not in GGUF_SUPPORTED_ARCHITECTURES: + raise ValueError(f"Architecture {architecture + model_size} not supported") # List all key-value pairs in a columnized format for gguf_key, field in reader.fields.items(): @@ -146,17 +159,9 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): ) if return_tensors: - tensor_key_mapping = GGUF_TO_TRANSFORMERS_MAPPING["tensors"][architecture] + tensor_key_mapping = GGUF_TO_TRANSFORMERS_MAPPING["tensors"][architecture + model_size] for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."): - renamed_tensor_name = tensor.name - - for tensor_name_mapping in GGUF_TO_TRANSFORMERS_MAPPING["tensors"]: - if tensor_name_mapping in renamed_tensor_name: - renamed_tensor_name = renamed_tensor_name.replace( - tensor_name_mapping, GGUF_TO_TRANSFORMERS_MAPPING["tensors"][tensor_name_mapping] - ) - name = tensor.name weights = dequantize(tensor.data, tensor.tensor_type) @@ -169,6 +174,15 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): elif ".attn_k." 
in name: weights = reverse_permute_weights(weights, num_heads, num_kv_heads) + if architecture == "qwen2moe": + if "_exp" in name: + split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping) + continue + if "ffn_gate_inp_shexp" in name: + # for compatibility tensor shared_expert_gate must be (1, 2048) dim, + # quantized one is (2048) + weights = np.expand_dims(weights, axis=0) + if architecture == "bloom" and "attn_qkv" in name: num_heads = parsed_parameters["config"]["n_head"] n_embed = parsed_parameters["config"]["hidden_size"] @@ -177,6 +191,23 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): else: weights = reverse_reshape_bias(weights, num_heads, n_embed) + if architecture == "gpt2": + if ( + "attn_qkv.weight" in name + or "ffn_down.weight" in name + or "ffn_up.weight" in name + or "attn_output.weight" in name + ): + # Original transpose implementation + # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L2060-L2061 + weights = weights.T + if name == "output.weight": + # output.weight has conflicts with attn_output.weight in name checking + # we have to explicitly check that name is exactly output.weight + name = "lm_head.weight" + parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) + continue + for tensor_name in tensor_key_mapping: if tensor_name in name: name = name.replace(tensor_name, tensor_key_mapping[tensor_name]) @@ -225,3 +256,27 @@ def reverse_reshape_bias(weights: np.ndarray, n_head: int, n_embed: int): qkv_bias = np.stack([q_bias, k_bias, v_bias], axis=1).flatten() return qkv_bias + + +def split_moe_expert_tensor( + weights: np.ndarray, parsed_parameters: Dict[str, Dict], name: str, tensor_key_mapping: dict +): + # Original merge implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022 + exp_name = "" + if "ffn_gate_exps" in name: + exp_name = "gate_proj" + elif "ffn_down_exps" in name: + exp_name = "down_proj" + elif "ffn_up_exps" in name: + exp_name = "up_proj" + else: + raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.") + for tensor_name in tensor_key_mapping: + if tensor_name in name: + name = name.replace(tensor_name, tensor_key_mapping[tensor_name]) + w_counter = parsed_parameters["config"].get("num_experts", 60) + for i in range(0, w_counter): + temp_name = name.replace(".weight", f".{i}.{exp_name}.weight") + exp_weight = weights[i] + parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight)) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index e7aa1ceb921329..c617420a5896de 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -251,7 +251,7 @@ def _compute_longrope_parameters( device (`torch.device`): The device to use for initialization of the inverse frequencies. seq_len (`int`, *optional*): - The current sequence length. Unused for this type of RoPE. + The current sequence length. rope_kwargs (`Dict`, *optional*): BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. Returns: @@ -279,8 +279,11 @@ def _compute_longrope_parameters( # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two # values to compute the default attention scaling factor, instead of using `factor`. 
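# Illustrative sketch (not part of the patch): what `split_moe_expert_tensor` above does with a fused
# qwen2_moe expert tensor. GGUF stores all experts stacked on the leading axis, while transformers
# expects one tensor per expert under `mlp.experts.{i}.<proj>.weight`. Shapes and the layer prefix
# are toy values.
import numpy as np
import torch

num_experts, intermediate_size, hidden_size = 4, 32, 16
merged = np.random.randn(num_experts, intermediate_size, hidden_size).astype(np.float32)

tensors = {}
prefix = "model.layers.0.mlp.experts"  # target prefix after the tensor-name mapping
for i in range(num_experts):
    tensors[f"{prefix}.{i}.up_proj.weight"] = torch.from_numpy(np.copy(merged[i]))

print(len(tensors), tensors[f"{prefix}.0.up_proj.weight"].shape)  # 4 torch.Size([32, 16])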
if hasattr(config, "original_max_position_embeddings"): + if seq_len and seq_len < config.original_max_position_embeddings: + expanded_max_position_embeddings = config.original_max_position_embeddings + else: + expanded_max_position_embeddings = config.max_position_embeddings max_position_embeddings = config.original_max_position_embeddings - expanded_max_position_embeddings = config.max_position_embeddings factor = expanded_max_position_embeddings / max_position_embeddings else: max_position_embeddings = config.max_position_embeddings @@ -360,13 +363,23 @@ def _compute_llama3_parameters( } -def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None): +def _check_received_keys( + rope_type: str, + received_keys: set, + required_keys: set, + optional_keys: Optional[set] = None, + ignore_keys: Optional[set] = None, +): """Compare the received keys in `config.rope_scaling` against the expected and optional keys""" # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present if "type" in received_keys: received_keys -= {"type"} required_keys.add("rope_type") + # Some models need to store model-specific keys, and we don't want to throw warning at them + if ignore_keys is not None: + received_keys -= ignore_keys + missing_keys = required_keys - received_keys if missing_keys: raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}") @@ -379,47 +392,47 @@ def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") -def _validate_default_rope_parameters(config: PretrainedConfig): +def _validate_default_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): rope_scaling = config.rope_scaling rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type"} received_keys = set(rope_scaling.keys()) - _check_received_keys(rope_type, received_keys, required_keys) + _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) -def _validate_linear_scaling_rope_parameters(config: PretrainedConfig): +def _validate_linear_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): rope_scaling = config.rope_scaling rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor"} received_keys = set(rope_scaling.keys()) - _check_received_keys(rope_type, received_keys, required_keys) + _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") -def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig): +def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): rope_scaling = config.rope_scaling rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor"} # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` optional_keys = {"original_max_position_embeddings"} received_keys = set(rope_scaling.keys()) - 
_check_received_keys(rope_type, received_keys, required_keys, optional_keys) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") -def _validate_yarn_parameters(config: PretrainedConfig): +def _validate_yarn_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): rope_scaling = config.rope_scaling rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor"} optional_keys = {"attention_factor", "beta_fast", "beta_slow"} received_keys = set(rope_scaling.keys()) - _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: @@ -444,14 +457,14 @@ def _validate_yarn_parameters(config: PretrainedConfig): ) -def _validate_longrope_parameters(config: PretrainedConfig): +def _validate_longrope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): rope_scaling = config.rope_scaling rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "short_factor", "long_factor"} # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} received_keys = set(rope_scaling.keys()) - _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -494,12 +507,12 @@ def _validate_longrope_parameters(config: PretrainedConfig): ) -def _validate_llama3_parameters(config: PretrainedConfig): +def _validate_llama3_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): rope_scaling = config.rope_scaling rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"} received_keys = set(rope_scaling.keys()) - _check_received_keys(rope_type, received_keys, required_keys) + _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: @@ -541,7 +554,7 @@ def _validate_llama3_parameters(config: PretrainedConfig): } -def rope_config_validation(config: PretrainedConfig): +def rope_config_validation(config: PretrainedConfig, ignore_keys: Optional[set] = None): """ Validate the RoPE config arguments, given a `PretrainedConfig` object """ @@ -553,7 +566,7 @@ def rope_config_validation(config: PretrainedConfig): rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default")) validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) if validation_fn is not None: - validation_fn(config) + validation_fn(config, ignore_keys=ignore_keys) else: 
logger.warning( f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 73cd7db80d6acd..cb0d743b0a90ae 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -544,6 +544,7 @@ def load_state_dict( checkpoint_file: Union[str, os.PathLike], is_quantized: bool = False, map_location: Optional[Union[str, torch.device]] = None, + weights_only: bool = True, ): """ Reads a PyTorch checkpoint file, returning properly formatted errors if they arise. @@ -580,7 +581,7 @@ def load_state_dict( and is_zipfile(checkpoint_file) ): extra_args = {"mmap": True} - weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {} + weights_only_kwarg = {"weights_only": weights_only} if is_torch_greater_or_equal_than_1_13 else {} return torch.load( checkpoint_file, map_location=map_location, @@ -617,6 +618,9 @@ def set_initialized_submodules(model, state_dict_keys): not_initialized_submodules = {} for module_name, module in model.named_modules(): loaded_keys = {k.replace(f"{module_name}.", "") for k in state_dict_keys if k.startswith(f"{module_name}.")} + # When checking if the root module is loaded all state_dict_keys must be used. + if module_name == "": + loaded_keys = set(state_dict_keys) if loaded_keys.issuperset(module.state_dict()): module._is_hf_initialized = True else: @@ -1495,7 +1499,7 @@ def _from_config(cls, config, **kwargs): torch_dtype (`torch.dtype`, *optional*): Override the default `torch.dtype` and load the model under this dtype. """ - torch_dtype = kwargs.pop("torch_dtype", None) + torch_dtype = kwargs.pop("torch_dtype", torch.get_default_dtype()) use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False) # override default dtype if needed @@ -2045,7 +2049,10 @@ def _get_no_split_modules(self, device_map: str): return list(_no_split_modules) def resize_token_embeddings( - self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + mean_resizing: bool = True, ) -> nn.Embedding: """ Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`. @@ -2065,11 +2072,19 @@ def resize_token_embeddings( `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc + mean_resizing (`bool`): + Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and + covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`. + + Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models, + where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the + old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings. + Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html Return: `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. 
""" - model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) if new_num_tokens is None and pad_to_multiple_of is None: return model_embeds @@ -2092,9 +2107,11 @@ def resize_token_embeddings( return model_embeds - def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None): + def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean_resizing=True): old_embeddings = self.get_input_embeddings() - new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of) + new_embeddings = self._get_resized_embeddings( + old_embeddings, new_num_tokens, pad_to_multiple_of, mean_resizing + ) if hasattr(old_embeddings, "_hf_hook"): hook = old_embeddings._hf_hook add_hook_to_module(new_embeddings, hook) @@ -2117,9 +2134,9 @@ def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None): if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings: old_lm_head = self.get_output_embeddings() if isinstance(old_lm_head, torch.nn.Embedding): - new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens) + new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens, mean_resizing=mean_resizing) else: - new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens) + new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens, mean_resizing=mean_resizing) if hasattr(old_lm_head, "_hf_hook"): hook = old_lm_head._hf_hook add_hook_to_module(new_lm_head, hook) @@ -2134,6 +2151,7 @@ def _get_resized_embeddings( old_embeddings: nn.Embedding, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + mean_resizing: bool = True, ) -> nn.Embedding: """ Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly @@ -2156,6 +2174,14 @@ def _get_resized_embeddings( `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc + mean_resizing (`bool`): + Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and + covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`. + + Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models, + where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the + old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings. + Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html Return: @@ -2214,8 +2240,32 @@ def _get_resized_embeddings( dtype=old_embeddings.weight.dtype, ) - # initialize all new embeddings (in particular added tokens) - self._init_weights(new_embeddings) + if new_num_tokens > old_num_tokens and not mean_resizing: + # initialize new embeddings (in particular added tokens) with a mean of 0 and std equals `config.initializer_range`. 
+ self._init_weights(new_embeddings) + + elif new_num_tokens > old_num_tokens and mean_resizing: + # initialize new embeddings (in particular added tokens). The new embeddings will be initialized + # from a multivariate normal distribution that has old embeddings' mean and covariance. + # as described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html + logger.warning_once( + "The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. " + "As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. " + "To disable this, use `mean_resizing=False`" + ) + + added_num_tokens = new_num_tokens - old_num_tokens + if is_deepspeed_zero3_enabled() and not is_quantized: + import deepspeed + + with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None): + self._init_added_embeddings_weights_with_mean( + old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens + ) + else: + self._init_added_embeddings_weights_with_mean( + old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens + ) # Copy token embeddings from the previous weights @@ -2255,7 +2305,11 @@ def _get_resized_embeddings( return old_embeddings def _get_resized_lm_head( - self, old_lm_head: nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False + self, + old_lm_head: nn.Linear, + new_num_tokens: Optional[int] = None, + transposed: Optional[bool] = False, + mean_resizing: bool = True, ) -> nn.Linear: """ Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized @@ -2272,6 +2326,14 @@ def _get_resized_lm_head( `torch.nn.Linear` module of the model without doing anything. transposed (`bool`, *optional*, defaults to `False`): Whether `old_lm_head` is transposed or not. If True `old_lm_head.size()` is `lm_head_dim, vocab_size` else `vocab_size, lm_head_dim`. + mean_resizing (`bool`): + Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and + covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`. + + Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models, + where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the + old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings. + Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html Return: `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if `new_num_tokens` is @@ -2318,8 +2380,40 @@ def _get_resized_lm_head( dtype=old_lm_head.weight.dtype, ) - # initialize new lm head (in particular added tokens) - self._init_weights(new_lm_head) + if new_num_tokens > old_num_tokens and not mean_resizing: + # initialize new embeddings (in particular added tokens) with a mean of 0 and std equals `config.initializer_range`. + self._init_weights(new_lm_head) + + elif new_num_tokens > old_num_tokens and mean_resizing: + # initialize new lm_head weights (in particular added tokens). The new lm_head weights + # will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. 
+ # as described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html + logger.warning_once( + "The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. " + "As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. " + "To disable this, use `mean_resizing=False`" + ) + + added_num_tokens = new_num_tokens - old_num_tokens + if is_deepspeed_zero3_enabled() and not is_quantized: + import deepspeed + + params = [old_lm_head.weight] + if has_new_lm_head_bias: + params += [old_lm_head.bias] + with deepspeed.zero.GatheredParameters(params, modifier_rank=None): + self._init_added_lm_head_weights_with_mean( + old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens, transposed + ) + if has_new_lm_head_bias: + self._init_added_lm_head_bias_with_mean(old_lm_head, new_lm_head, added_num_tokens) + + else: + self._init_added_lm_head_weights_with_mean( + old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens, transposed + ) + if has_new_lm_head_bias: + self._init_added_lm_head_bias_with_mean(old_lm_head, new_lm_head, added_num_tokens) num_tokens_to_copy = min(old_num_tokens, new_num_tokens) @@ -2338,6 +2432,62 @@ def _get_resized_lm_head( return new_lm_head + def _init_added_embeddings_weights_with_mean( + self, old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens + ): + old_embeddings_weight = old_embeddings.weight.data.to(torch.float32) + mean_embeddings = torch.mean(old_embeddings_weight, axis=0) + old_centered_embeddings = old_embeddings_weight - mean_embeddings + covariance = old_centered_embeddings.T @ old_centered_embeddings / old_num_tokens + + # Check if the covariance is positive definite. + eigenvalues = torch.linalg.eigvals(covariance) + is_covariance_psd = bool( + (covariance == covariance.T).all() and not torch.is_complex(eigenvalues) and (eigenvalues > 0).all() + ) + if is_covariance_psd: + # If covariances is positive definite, a distribution can be created. and we can sample new weights from it. + distribution = torch.distributions.multivariate_normal.MultivariateNormal( + mean_embeddings, covariance_matrix=1e-9 * covariance + ) + new_embeddings.weight.data[-1 * added_num_tokens :, :] = distribution.sample( + sample_shape=(added_num_tokens,) + ).to(old_embeddings.weight.dtype) + else: + # Otherwise, just initialize with the mean. because distribtion will not be created. + new_embeddings.weight.data[-1 * added_num_tokens :, :] = ( + mean_embeddings[None, :].repeat(added_num_tokens, 1).to(old_embeddings.weight.dtype) + ) + + def _init_added_lm_head_weights_with_mean( + self, + old_lm_head, + new_lm_head, + old_lm_head_dim, + old_num_tokens, + added_num_tokens, + transposed=False, + ): + if transposed: + # Transpose to the desired shape for the function. + new_lm_head.weight.data = new_lm_head.weight.data.T + old_lm_head.weight.data = old_lm_head.weight.data.T + + # The same initilization logic as Embeddings. + self._init_added_embeddings_weights_with_mean( + old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens + ) + + if transposed: + # Transpose again to the correct shape. 
+ new_lm_head.weight.data = new_lm_head.weight.data.T + old_lm_head.weight.data = old_lm_head.weight.data.T + + def _init_added_lm_head_bias_with_mean(self, old_lm_head, new_lm_head, added_num_tokens): + bias_mean = torch.mean(old_lm_head.bias.data, axis=0, dtype=torch.float32) + bias_std = torch.std(old_lm_head.bias.data, axis=0).to(torch.float32) + new_lm_head.bias.data[-1 * added_num_tokens :].normal_(mean=bias_mean, std=1e-9 * bias_std) + def _copy_lm_head_original_to_resized( self, new_lm_head, old_lm_head, num_tokens_to_copy, transposed, has_new_lm_head_bias ): @@ -3006,6 +3156,7 @@ def from_pretrained( token: Optional[Union[str, bool]] = None, revision: str = "main", use_safetensors: bool = None, + weights_only: bool = True, **kwargs, ) -> "PreTrainedModel": r""" @@ -3193,6 +3344,11 @@ def from_pretrained( Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors` is not installed, it will be set to `False`. + weights_only (`bool`, *optional*, defaults to `True`): + Indicates whether unpickler should be restricted to loading only tensors, primitive types, + dictionaries and any types added via torch.serialization.add_safe_globals(). + When set to False, we can load wrapper tensor subclass weights. + kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., `output_attentions=True`). Behaves differently depending on whether a `config` is provided or @@ -3477,7 +3633,7 @@ def from_pretrained( # Force-set to `True` for more mem efficiency if low_cpu_mem_usage is None: low_cpu_mem_usage = True - logger.warning("`low_cpu_mem_usage` was None, now set to True since model is quantized.") + logger.warning("`low_cpu_mem_usage` was None, now default to True since model is quantized.") is_quantized = hf_quantizer is not None # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the @@ -3828,7 +3984,7 @@ def from_pretrained( if from_pt: if not is_sharded and state_dict is None: # Time to load the checkpoint - state_dict = load_state_dict(resolved_archive_file) + state_dict = load_state_dict(resolved_archive_file, weights_only=weights_only) # set dtype to instantiate the model under: # 1. If torch_dtype is not None, we use that dtype @@ -3849,7 +4005,7 @@ def from_pretrained( elif not is_sharded: torch_dtype = get_state_dict_dtype(state_dict) else: - one_state_dict = load_state_dict(resolved_archive_file[0]) + one_state_dict = load_state_dict(resolved_archive_file[0], weights_only=weights_only) torch_dtype = get_state_dict_dtype(one_state_dict) del one_state_dict # free CPU memory logger.info( @@ -3891,6 +4047,10 @@ def from_pretrained( logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config())] + init_contexts elif low_cpu_mem_usage: + if not is_accelerate_available(): + raise ImportError( + f"Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" + ) init_contexts.append(init_empty_weights()) config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained. 
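# Illustrative sketch (not part of the patch): the new `mean_resizing` flag documented above. The
# checkpoint name is a placeholder; any causal LM resized after adding tokens behaves the same way.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "your-org/your-causal-lm"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

tokenizer.add_tokens(["<special_1>", "<special_2>"])

# Default: new rows are drawn from a multivariate normal fitted to the old embeddings (their mean and
# covariance), so next-token distributions are barely perturbed by the added vocabulary.
model.resize_token_embeddings(len(tokenizer), mean_resizing=True)

# Previous behaviour: plain `_init_weights` initialization (zero mean, `config.initializer_range` std).
# model.resize_token_embeddings(len(tokenizer), mean_resizing=False)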
@@ -4045,6 +4205,7 @@ def from_pretrained( hf_quantizer=hf_quantizer, keep_in_fp32_modules=keep_in_fp32_modules, gguf_path=gguf_path, + weights_only=weights_only, ) # make sure token embedding weights are still tied if needed @@ -4150,6 +4311,7 @@ def _load_pretrained_model( hf_quantizer=None, keep_in_fp32_modules=None, gguf_path=None, + weights_only=True, ): is_safetensors = False is_quantized = hf_quantizer is not None @@ -4507,7 +4669,9 @@ def _find_mismatched_keys( and hf_quantizer.quantization_config.quant_type == "int4_weight_only" ): map_location = torch.device([d for d in device_map.values() if d not in ["cpu", "disk"]][0]) - state_dict = load_state_dict(shard_file, is_quantized=is_quantized, map_location=map_location) + state_dict = load_state_dict( + shard_file, is_quantized=is_quantized, map_location=map_location, weights_only=weights_only + ) # Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not # matching the weights in the model. @@ -4660,6 +4824,7 @@ def _load_pretrained_model_low_mem( start_prefix="", hf_quantizer=None, pretrained_model_name_or_path=None, + weights_only=True, ): """ This is an experimental function that loads the model using ~1.x model size CPU memory @@ -4680,7 +4845,7 @@ def _load_pretrained_model_low_mem( """ _move_model_to_meta(model, loaded_state_dict_keys, start_prefix) - state_dict = load_state_dict(resolved_archive_file) + state_dict = load_state_dict(resolved_archive_file, weights_only=weights_only) expected_keys = loaded_state_dict_keys # plug for missing expected_keys. TODO: replace with proper keys error_msgs = _load_state_dict_into_meta_model( model, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index e47a4ed9c342e4..804957c0a551ae 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -168,6 +168,7 @@ musicgen, musicgen_melody, mvp, + myt5, nemotron, nllb, nllb_moe, @@ -190,6 +191,7 @@ persimmon, phi, phi3, + phimoe, phobert, pix2struct, pixtral, @@ -280,5 +282,6 @@ xmod, yolos, yoso, + zamba, zoedepth, ) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 84e368379e292a..7e39a5f0f1182f 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -1024,15 +1024,15 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 - self.position_embeddings = self.position_embedding.weight.unsqueeze(0) - num_positions = self.position_embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: - return self.position_embeddings + return self.position_embedding(self.position_ids) - class_pos_embed = self.position_embeddings[:, :1] - patch_pos_embed = self.position_embeddings[:, 1:] + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] dim = embeddings.shape[-1] diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 3bb2b8e9d4c199..2ee0541a1a71b8 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -74,6 +74,7 @@ 
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING", + "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", "MODEL_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", @@ -119,6 +120,7 @@ "AutoModelWithLMHead", "AutoModelForZeroShotImageClassification", "AutoModelForZeroShotObjectDetection", + "AutoModelForImageTextToText", ] try: @@ -238,6 +240,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_KEYPOINT_DETECTION_MAPPING, @@ -279,6 +282,7 @@ AutoModelForDocumentQuestionAnswering, AutoModelForImageClassification, AutoModelForImageSegmentation, + AutoModelForImageTextToText, AutoModelForImageToImage, AutoModelForInstanceSegmentation, AutoModelForKeypointDetection, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6d55f87d60ac8e..17219570684d53 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -210,6 +210,7 @@ ("persimmon", "PersimmonConfig"), ("phi", "PhiConfig"), ("phi3", "Phi3Config"), + ("phimoe", "PhimoeConfig"), ("pix2struct", "Pix2StructConfig"), ("pixtral", "PixtralVisionConfig"), ("plbart", "PLBartConfig"), @@ -310,6 +311,7 @@ ("xmod", "XmodConfig"), ("yolos", "YolosConfig"), ("yoso", "YosoConfig"), + ("zamba", "ZambaConfig"), ("zoedepth", "ZoeDepthConfig"), ] ) @@ -495,6 +497,7 @@ ("musicgen", "MusicGen"), ("musicgen_melody", "MusicGen Melody"), ("mvp", "MVP"), + ("myt5", "myt5"), ("nat", "NAT"), ("nemotron", "Nemotron"), ("nezha", "Nezha"), @@ -520,6 +523,7 @@ ("persimmon", "Persimmon"), ("phi", "Phi"), ("phi3", "Phi3"), + ("phimoe", "Phimoe"), ("phobert", "PhoBERT"), ("pix2struct", "Pix2Struct"), ("pixtral", "Pixtral"), @@ -628,6 +632,7 @@ ("xmod", "X-MOD"), ("yolos", "YOLOS"), ("yoso", "YOSO"), + ("zamba", "Zamba"), ("zoedepth", "ZoeDepth"), ] ) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index dca0c08aa90957..98d679ef09c75b 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -351,7 +351,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): # If we don't find the feature extractor class in the feature extractor config, let's try the model config. 
if feature_extractor_class is None and feature_extractor_auto_map is None: if not isinstance(config, PretrainedConfig): - config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) # It could be in `config.feature_extractor_type`` feature_extractor_class = getattr(config, "feature_extractor_type", None) if hasattr(config, "auto_map") and "AutoFeatureExtractor" in config.auto_map: diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index f404c92780c39f..ef40798484ef41 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -431,7 +431,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # If we don't find the image processor class in the image processor config, let's try the model config. if image_processor_class is None and image_processor_auto_map is None: if not isinstance(config, PretrainedConfig): - config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) # It could be in `config.image_processor_type`` image_processor_class = getattr(config, "image_processor_type", None) if hasattr(config, "auto_map") and "AutoImageProcessor" in config.auto_map: diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 6e730e848db755..aa0d59de52ff4c 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -198,6 +198,7 @@ ("persimmon", "PersimmonModel"), ("phi", "PhiModel"), ("phi3", "Phi3Model"), + ("phimoe", "PhimoeModel"), ("pixtral", "PixtralVisionModel"), ("plbart", "PLBartModel"), ("poolformer", "PoolFormerModel"), @@ -283,6 +284,7 @@ ("xmod", "XmodModel"), ("yolos", "YolosModel"), ("yoso", "YosoModel"), + ("zamba", "ZambaModel"), ] ) @@ -518,6 +520,7 @@ ("persimmon", "PersimmonForCausalLM"), ("phi", "PhiForCausalLM"), ("phi3", "Phi3ForCausalLM"), + ("phimoe", "PhimoeForCausalLM"), ("plbart", "PLBartForCausalLM"), ("prophetnet", "ProphetNetForCausalLM"), ("qdqbert", "QDQBertLMHeadModel"), @@ -544,6 +547,7 @@ ("xlm-roberta-xl", "XLMRobertaXLForCausalLM"), ("xlnet", "XLNetLMHeadModel"), ("xmod", "XmodForCausalLM"), + ("zamba", "ZambaForCausalLM"), ] ) @@ -753,6 +757,32 @@ ] ) +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict( + [ + ("blip", "BlipForConditionalGeneration"), + ("blip-2", "Blip2ForConditionalGeneration"), + ("chameleon", "ChameleonForConditionalGeneration"), + ("fuyu", "FuyuForCausalLM"), + ("git", "GitForCausalLM"), + ("idefics", "IdeficsForVisionText2Text"), + ("idefics2", "Idefics2ForConditionalGeneration"), + ("idefics3", "Idefics3ForConditionalGeneration"), + ("instructblip", "InstructBlipForConditionalGeneration"), + ("kosmos-2", "Kosmos2ForConditionalGeneration"), + ("llava", "LlavaForConditionalGeneration"), + ("llava_next", "LlavaNextForConditionalGeneration"), + ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), + ("mllama", "MllamaForConditionalGeneration"), + ("paligemma", "PaliGemmaForConditionalGeneration"), + ("pix2struct", "Pix2StructForConditionalGeneration"), + ("pixtral", "LlavaForConditionalGeneration"), + ("qwen2_vl", "Qwen2VLForConditionalGeneration"), + ("udop", 
"UdopForConditionalGeneration"), + ("vipllava", "VipLlavaForConditionalGeneration"), + ("vision-encoder-decoder", "VisionEncoderDecoderModel"), + ] +) + MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( [ # Model for Masked LM mapping @@ -948,6 +978,7 @@ ("persimmon", "PersimmonForSequenceClassification"), ("phi", "PhiForSequenceClassification"), ("phi3", "Phi3ForSequenceClassification"), + ("phimoe", "PhimoeForSequenceClassification"), ("plbart", "PLBartForSequenceClassification"), ("qdqbert", "QDQBertForSequenceClassification"), ("qwen2", "Qwen2ForSequenceClassification"), @@ -971,6 +1002,7 @@ ("xlnet", "XLNetForSequenceClassification"), ("xmod", "XmodForSequenceClassification"), ("yoso", "YosoForSequenceClassification"), + ("zamba", "ZambaForSequenceClassification"), ] ) @@ -1413,6 +1445,9 @@ CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES ) MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES) +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES +) MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES ) @@ -1707,6 +1742,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass): AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling") +class AutoModelForImageTextToText(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING + + +AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling") + + class AutoModelForAudioClassification(_BaseAutoModelClass): _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index c894840c6ad229..c1f23bc1cb3f18 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -99,6 +99,7 @@ ("trocr", "TrOCRProcessor"), ("tvlt", "TvltProcessor"), ("tvp", "TvpProcessor"), + ("udop", "UdopProcessor"), ("unispeech", "Wav2Vec2Processor"), ("unispeech-sat", "Wav2Vec2Processor"), ("video_llava", "VideoLlavaProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 6a5cba11f0949f..8c3a7a82a60a51 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -322,6 +322,7 @@ ("musicgen", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), ("musicgen_melody", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)), + ("myt5", ("MyT5Tokenizer", None)), ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ( "nllb", @@ -389,6 +390,7 @@ ), ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("phobert", ("PhobertTokenizer", None)), ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), ("pixtral", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), @@ -554,6 +556,13 @@ "AlbertTokenizerFast" 
if is_tokenizers_available() else None, ), ), + ( + "zamba", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), ] ) diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index 3102ada542d57d..ea420482379d7a 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -263,7 +263,7 @@ def forward( value, attention_mask, query_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal, ) @@ -1721,6 +1721,8 @@ def generate( kwargs_fine[key] = value # 1. Generate from the semantic model + if "generation_config" in kwargs_semantic: + kwargs_semantic.pop("generation_config") semantic_output = self.semantic.generate( input_ids, history_prompt=history_prompt, @@ -1729,6 +1731,8 @@ def generate( ) # 2. Generate from the coarse model + if "generation_config" in kwargs_coarse: + kwargs_coarse.pop("generation_config") coarse_output = self.coarse_acoustics.generate( semantic_output, history_prompt=history_prompt, @@ -1746,6 +1750,8 @@ def generate( output_lengths = output_lengths // coarse_generation_config.n_coarse_codebooks # 3. "generate" from the fine model + if "generation_config" in kwargs_fine: + kwargs_fine.pop("generation_config") output = self.fine_acoustics.generate( coarse_output, history_prompt=history_prompt, diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 2e4e6dcaeb2d11..822be354fb9da0 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -400,7 +400,7 @@ def forward( value_states, attention_mask, q_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -2189,32 +2189,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. 
input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index b62746da5c6f15..6b05fa648158a6 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1394,34 +1394,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=True, **model_kwargs - ): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 8496d1f6072f02..db4a378577562a 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -991,27 +991,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 10d71c417a7aaf..732e9e7aff5741 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -380,7 +380,7 @@ def __init__( do_lower_case=False, never_split=None, normalize_text=True, - mecab_dic: Optional[str] = "ipadic", + mecab_dic: Optional[str] = "unidic_lite", mecab_option: Optional[str] = None, ): 
""" diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 41045cb5f0001f..958d192fa03dbc 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -2606,28 +2606,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index 7ad1dcbd661c32..6bc80bc04959b6 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -47,7 +47,8 @@ _CONFIG_FOR_DOC = "BioGptConfig" -# Copied from transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding with OPT->BioGpt +# copied from transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding with OPT->BioGpt +# TODO @ArthurZucker bring copied from back class BioGptLearnedPositionalEmbedding(nn.Embedding): """ This module learns positional embeddings up to a fixed maximum size. 
@@ -801,37 +802,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, attention_mask, inputs_embeds=None, past_key_values=None, **kwargs - ): - # only last tokens for inputs_ids if past is defined in kwargs - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - } - ) - - return model_inputs - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 4ea5926d854c98..ae37f546e510db 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -1576,32 +1576,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. 
input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 3e378f483a317a..93298c4e80e55b 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -1528,32 +1528,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index cd96b46ab1d26f..78e1aa58ef0443 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -19,9 +19,25 @@ from typing import List, Optional, Union from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +class BlipProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": {}, + } class BlipProcessor(ProcessorMixin): @@ -51,84 +67,53 @@ def __init__(self, image_processor, tokenizer, **kwargs): def __call__( self, images: ImageInput = None, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_token_type_ids: bool = False, - return_length: bool = False, - verbose: bool = True, - return_tensors: 
Optional[Union[str, TensorType]] = None, - **kwargs, + text: Optional[Union[str, List[str], TextInput, PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[BlipProcessorKwargs], ) -> BatchEncoding: """ This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. Please refer to the docstring of the above two methods for more information. + Args: + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. """ if images is None and text is None: raise ValueError("You have to specify either images or text.") - # Get only text - if images is None: - self.current_processor = self.tokenizer - text_encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, - **kwargs, - ) - return text_encoding - - # add pixel_values - encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + text_encoding = None + # add pixel_values encoding. If we also have text_encoding, update image encoding and return it. + # else, return the text encoding. 
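From the caller's side nothing changes with the refactor continued below: tokenizer arguments such as `padding` are validated and routed through `BlipProcessorKwargs` / `_merge_kwargs` instead of an explicit argument list. An illustrative call, using a real BLIP checkpoint and a COCO sample image:

import requests
from PIL import Image
from transformers import BlipProcessor

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Same user-facing call as before; text kwargs now flow through BlipProcessorKwargs.
inputs = processor(images=image, text="a photo of", return_tensors="pt", padding=True)
print(inputs.keys())  # pixel_values, input_ids, attention_mask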
+ output_kwargs = self._merge_kwargs( + BlipProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) if text is not None: - text_encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, - **kwargs, - ) - else: - text_encoding = None - - if text_encoding is not None: - encoding_image_processor.update(text_encoding) - - return encoding_image_processor + text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) + if images is not None: + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) + + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + return encoding_image_processor + + return text_encoding def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index e879b41eb15643..606aadc1eab45f 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -18,22 +18,38 @@ from typing import List, Optional, Union +from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import ( AddedToken, BatchEncoding, - PaddingStrategy, PreTokenizedInput, TextInput, - TruncationStrategy, ) -from ...utils import TensorType, logging +from ...utils import logging logger = logging.get_logger(__name__) +class Blip2ProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": {}, + } + + class Blip2Processor(ProcessorMixin): r""" Constructs a BLIP-2 processor which wraps a BLIP image processor and an OPT/T5 tokenizer into a single processor. 
@@ -67,58 +83,44 @@ def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): def __call__( self, images: ImageInput = None, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_token_type_ids: bool = False, - return_length: bool = False, - verbose: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + text: Optional[Union[str, List[str], TextInput, PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[Blip2ProcessorKwargs], ) -> BatchEncoding: """ This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. Please refer to the docstring of the above two methods for more information. + Args: + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. 
""" if images is None and text is None: raise ValueError("You have to specify either images or text.") - - # Get only text - if images is None: - self.current_processor = self.tokenizer - text_encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, - **kwargs, - ) - return text_encoding - - # add pixel_values - encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) - + output_kwargs = self._merge_kwargs( + Blip2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + # BC for explicit return_tensors + if "return_tensors" in output_kwargs["common_kwargs"]: + return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) + else: + return_tensors = None + encoding = BatchFeature(tensor_type=return_tensors) if text is not None: if isinstance(text, str): text = [text] @@ -126,24 +128,10 @@ def __call__( raise ValueError("Invalid input text. Please provide a string, or a list of strings") text_encoding = {} - _text_encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=None, # hardcode "None" here for prepending image tokens - **kwargs, - ) + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None) + output_kwargs["text_kwargs"]["return_tensors"] = return_tensors # if we know how many query tokens, expand text inside processor. We need this hacky manipulation # because BLIP expects image tokens to be at the beginning even before BOS token @@ -164,14 +152,14 @@ def __call__( ) # cast to desired return tensors type - text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors) - else: - text_encoding = None - - if text_encoding is not None: - encoding_image_processor.update(text_encoding) - - return encoding_image_processor + encoding.update(BatchEncoding(text_encoding, tensor_type=return_tensors)) + # add pixel_values encoding. If we also have text_encoding, update image encoding and return it. + # else, return the text encoding. 
+ + if images is not None: + image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"]) + encoding.update(image_encoding) + return encoding # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index a5ac476c99b3db..75f8e5830f44bd 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -620,9 +620,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -765,7 +763,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -889,6 +887,8 @@ def prepare_inputs_for_generation( use_cache=True, **kwargs, ): + # Overwriten because of the fixed-shape attention mask creation + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index c7b32de0d08e17..9a900acf500c1e 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -295,15 +295,15 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 - self.position_embeddings = self.position_embedding.weight.unsqueeze(0) - num_positions = self.position_embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: - return self.position_embeddings + return self.position_embedding(self.position_ids) - class_pos_embed = self.position_embeddings[:, :1] - patch_pos_embed = self.position_embeddings[:, 1:] + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] dim = embeddings.shape[-1] diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 7718c3bf833fec..177eb12051654d 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -16,11 +16,29 @@ Processor class for BridgeTower. 
""" -from typing import List, Optional, Union +from typing import List, Union -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": { + "do_normalize": True, + "do_center_crop": True, + }, + } class BridgeTowerProcessor(ProcessorMixin): @@ -50,21 +68,9 @@ def __call__( self, images, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + audio=None, + videos=None, + **kwargs: Unpack[BridgeTowerProcessorKwargs], ) -> BatchEncoding: """ This method uses [`BridgeTowerImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -72,28 +78,14 @@ def __call__( Please refer to the docstring of the above two methods for more information. 
""" - encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, + output_kwargs = self._merge_kwargs( + BridgeTowerProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) # add pixel_values + pixel_mask - encoding_image_processor = self.image_processor( - images, return_tensors=return_tensors, do_normalize=True, do_center_crop=True, **kwargs - ) + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) encoding.update(encoding_image_processor) return encoding diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index 95540f96d3b6f6..32e8a0af2ba2d1 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -1674,27 +1674,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 10286ffefd6777..fd76c0b1152267 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -346,7 +346,7 @@ def forward( attn_weights = attn_weights + causal_mask # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1).to(query_states.dtype) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) attn_output = torch.matmul(attn_weights, value_states) @@ -1278,9 +1278,7 @@ def forward( use_cache = False if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if pixel_values is not None and inputs_embeds is not 
None: raise ValueError( @@ -1404,7 +1402,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 393d5784bb440a..dffa9028af4ffe 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -200,15 +200,15 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 - self.position_embeddings = self.position_embedding.weight.unsqueeze(0) - num_positions = self.position_embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: - return self.position_embeddings + return self.position_embedding(self.position_ids) - class_pos_embed = self.position_embeddings[:, :1] - patch_pos_embed = self.position_embeddings[:, 1:] + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] dim = embeddings.shape[-1] diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 370f17f479650a..f946f828eec639 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -208,15 +208,15 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 - self.position_embeddings = self.position_embedding.weight.unsqueeze(0) - num_positions = self.position_embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: - return self.position_embeddings + return self.position_embedding(self.position_ids) - class_pos_embed = self.position_embeddings[:, :1] - patch_pos_embed = self.position_embeddings[:, 1:] + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] dim = embeddings.shape[-1] diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 90520524fa8843..8ff7f1cd96a0d2 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -175,15 +175,15 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 - self.position_embeddings = self.position_embedding.weight.unsqueeze(0) - num_positions = self.position_embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: - return 
self.position_embeddings + return self.position_embedding(self.position_ids) - class_pos_embed = self.position_embeddings[:, :1] - patch_pos_embed = self.position_embeddings[:, 1:] + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] dim = embeddings.shape[-1] diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index f438226064ec2d..a94667481540bd 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -1367,6 +1367,8 @@ def _prepare_model_inputs( def prepare_inputs_for_generation( self, input_ids, past_key_values=None, inputs_embeds=None, conditioning_embeds=None, **kwargs ): + # Overwritten: has `conditioning_embeds`-related logic + input_ids_length = input_ids.shape[-1] token_type_ids = kwargs.get("token_type_ids", None) # only last token for inputs_ids if past is defined in kwargs diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 32c265c421d93e..478745b2c59ea4 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -459,9 +459,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -608,7 +606,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -721,78 +719,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - # Copied from transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - attention_mask=None, - token_type_ids=None, - position_ids=None, - past_key_values=None, - inputs_embeds=None, - cache_position=None, - use_cache=True, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -input_ids.shape[1] :] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as 
otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. - model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.transformer._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - ) - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings_to_model_forward(CODEGEN_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 7301f434f7fb29..a5d3721f5bdb03 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -46,7 +46,6 @@ add_start_docstrings_to_model_forward, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -843,9 +842,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -978,7 +975,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1169,13 +1166,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, 
-num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) logits = logits * self.logit_scale loss = None diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py index 964d0bbfd1456b..5507c8082f1c05 100755 --- a/src/transformers/models/cpmant/modeling_cpmant.py +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -849,18 +849,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - def prepare_inputs_for_generation(self, input_ids, **kwargs): - input_ids = input_ids.int() - # save the memory usage of dummy attention mask - if "attention_mask" in kwargs: - kwargs["attention_mask"] = torch.zeros(1, 1) - - return { - "input_ids": input_ids, - "use_cache": kwargs["use_cache"], - "past_key_values": kwargs.get("past_key_values", None), - } - def _reorder_cache(self, past_key_values, beam_idx): past_key_values = [list(each) if each is not None else each for each in past_key_values] for key_value_layer in past_key_values: diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index 6d921621d47dcb..1d382a81141ff1 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -521,22 +521,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs): - # only last tokens for inputs_ids if past is defined in kwargs - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache} - @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( @@ -628,6 +612,24 @@ def forward( attentions=transformer_outputs.attentions, ) + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs): + # Overwritten -- inputs_embeds not working properly + + # only last tokens for inputs_ids if past is defined in kwargs + if past_key_values is not None: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache} + @staticmethod def _reorder_cache( past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index dd2a676b26c27f..590509eaf9057c 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -26,6 +26,7 @@ from ...activations import ACT2FN from 
...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_outputs import ( BaseModelOutput, CausalLMOutput, @@ -594,7 +595,7 @@ def forward( value_states, attention_mask, q_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -826,7 +827,7 @@ def forward( hidden_states = self.layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for layer in self.layers: if output_hidden_states: @@ -836,8 +837,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index fcddeab7a595ea..e62c11943866a3 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -996,27 +996,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index d197722f5b18f0..ef81e43d0294f0 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -991,9 +991,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -1138,7 +1136,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = 
past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 46e00787baf618..f380c3c3b48139 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -523,14 +523,14 @@ def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=N def forward(self, pixel_values, pixel_mask): if pixel_mask is None: raise ValueError("No pixel mask provided") - y_embed = pixel_mask.cumsum(1, dtype=torch.float32) - x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + y_embed = pixel_mask.cumsum(1, dtype=pixel_values.dtype) + x_embed = pixel_mask.cumsum(2, dtype=pixel_values.dtype) if self.normalize: eps = 1e-6 y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale - dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float() + dim_t = torch.arange(self.embedding_dim, dtype=pixel_values.dtype, device=pixel_values.device) dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) pos_x = x_embed[:, :, :, None] / dim_t @@ -580,11 +580,14 @@ def build_position_encoding(config): def multi_scale_deformable_attention( - value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor + value: Tensor, + value_spatial_shapes: Union[Tensor, List[Tuple]], + sampling_locations: Tensor, + attention_weights: Tensor, ) -> Tensor: batch_size, _, num_heads, hidden_dim = value.shape _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape - value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1) + value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level_id, (height, width) in enumerate(value_spatial_shapes): @@ -660,29 +663,6 @@ def __init__(self, config: DeformableDetrConfig, num_heads: int, n_points: int): self.disable_custom_kernels = config.disable_custom_kernels - self._reset_parameters() - - def _reset_parameters(self): - nn.init.constant_(self.sampling_offsets.weight.data, 0.0) - default_dtype = torch.get_default_dtype() - thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * (2.0 * math.pi / self.n_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = ( - (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) - .view(self.n_heads, 1, 1, 2) - .repeat(1, self.n_levels, self.n_points, 1) - ) - for i in range(self.n_points): - grid_init[:, :, i, :] *= i + 1 - with torch.no_grad(): - self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) - nn.init.constant_(self.attention_weights.weight.data, 0.0) - nn.init.constant_(self.attention_weights.bias.data, 0.0) - nn.init.xavier_uniform_(self.value_proj.weight.data) - nn.init.constant_(self.value_proj.bias.data, 0.0) - nn.init.xavier_uniform_(self.output_proj.weight.data) - nn.init.constant_(self.output_proj.bias.data, 0.0) - def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): return tensor if position_embeddings is None else tensor + position_embeddings @@ -695,6 +675,7 @@ def forward( position_embeddings: 
Optional[torch.Tensor] = None, reference_points=None, spatial_shapes=None, + spatial_shapes_list=None, level_start_index=None, output_attentions: bool = False, ): @@ -704,7 +685,8 @@ def forward( batch_size, num_queries, _ = hidden_states.shape batch_size, sequence_length, _ = encoder_hidden_states.shape - if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + total_elements = sum(height * width for height, width in spatial_shapes_list) + if total_elements != sequence_length: raise ValueError( "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" ) @@ -739,9 +721,11 @@ def forward( else: raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") - if self.disable_custom_kernels: + if self.disable_custom_kernels or MultiScaleDeformableAttention is None: # PyTorch implementation - output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + output = multi_scale_deformable_attention( + value, spatial_shapes_list, sampling_locations, attention_weights + ) else: try: # custom kernel @@ -755,7 +739,9 @@ def forward( ) except Exception: # PyTorch implementation - output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + output = multi_scale_deformable_attention( + value, spatial_shapes_list, sampling_locations, attention_weights + ) output = self.output_proj(output) return output, attention_weights @@ -900,6 +886,7 @@ def forward( position_embeddings: torch.Tensor = None, reference_points=None, spatial_shapes=None, + spatial_shapes_list=None, level_start_index=None, output_attentions: bool = False, ): @@ -932,6 +919,7 @@ def forward( position_embeddings=position_embeddings, reference_points=reference_points, spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, level_start_index=level_start_index, output_attentions=output_attentions, ) @@ -997,6 +985,7 @@ def forward( position_embeddings: Optional[torch.Tensor] = None, reference_points=None, spatial_shapes=None, + spatial_shapes_list=None, level_start_index=None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, @@ -1048,6 +1037,7 @@ def forward( position_embeddings=position_embeddings, reference_points=reference_points, spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, level_start_index=level_start_index, output_attentions=output_attentions, ) @@ -1088,7 +1078,27 @@ def _init_weights(self, module): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) elif isinstance(module, DeformableDetrMultiscaleDeformableAttention): - module._reset_parameters() + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + 
nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 @@ -1219,6 +1229,7 @@ def forward( attention_mask=None, position_embeddings=None, spatial_shapes=None, + spatial_shapes_list=None, level_start_index=None, valid_ratios=None, output_attentions=None, @@ -1260,7 +1271,8 @@ def forward( hidden_states = inputs_embeds hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) + spatial_shapes_tuple = tuple(spatial_shapes_list) + reference_points = self.get_reference_points(spatial_shapes_tuple, valid_ratios, device=inputs_embeds.device) encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -1275,6 +1287,7 @@ def forward( position_embeddings, reference_points, spatial_shapes, + spatial_shapes_list, level_start_index, output_attentions, ) @@ -1285,6 +1298,7 @@ def forward( position_embeddings=position_embeddings, reference_points=reference_points, spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, level_start_index=level_start_index, output_attentions=output_attentions, ) @@ -1341,6 +1355,7 @@ def forward( position_embeddings=None, reference_points=None, spatial_shapes=None, + spatial_shapes_list=None, level_start_index=None, valid_ratios=None, output_attentions=None, @@ -1416,6 +1431,7 @@ def forward( position_embeddings, reference_points_input, spatial_shapes, + spatial_shapes_list, level_start_index, encoder_hidden_states, encoder_attention_mask, @@ -1428,6 +1444,7 @@ def forward( encoder_hidden_states=encoder_hidden_states, reference_points=reference_points_input, spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, level_start_index=level_start_index, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, @@ -1589,7 +1606,7 @@ def get_proposal_pos_embed(self, proposals): temperature = 10000 scale = 2 * math.pi - dim_t = torch.arange(num_pos_feats, dtype=torch.int64, device=proposals.device).float() + dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device) dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) # batch_size, num_queries, 4 proposals = proposals.sigmoid() * scale @@ -1720,7 +1737,9 @@ def forward( source = self.input_proj[level](features[-1][0]) else: source = self.input_proj[level](sources[-1]) - mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] + mask = nn.functional.interpolate(pixel_mask[None].to(pixel_values.dtype), size=source.shape[-2:]).to( + torch.bool + )[0] pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) sources.append(source) masks.append(mask) @@ -1735,11 +1754,11 @@ def forward( source_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] - spatial_shapes = [] + spatial_shapes_list = [] for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): batch_size, num_channels, height, width = source.shape spatial_shape = (height, width) - 
spatial_shapes.append(spatial_shape) + spatial_shapes_list.append(spatial_shape) source = source.flatten(2).transpose(1, 2) mask = mask.flatten(1) pos_embed = pos_embed.flatten(2).transpose(1, 2) @@ -1750,7 +1769,7 @@ def forward( source_flatten = torch.cat(source_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) - spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) + spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1) @@ -1762,6 +1781,7 @@ def forward( attention_mask=mask_flatten, position_embeddings=lvl_pos_embed_flatten, spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, level_start_index=level_start_index, valid_ratios=valid_ratios, output_attentions=output_attentions, @@ -1819,6 +1839,7 @@ def forward( encoder_attention_mask=mask_flatten, reference_points=reference_points, spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, level_start_index=level_start_index, valid_ratios=valid_ratios, output_attentions=output_attentions, diff --git a/src/transformers/models/deprecated/mctct/modeling_mctct.py b/src/transformers/models/deprecated/mctct/modeling_mctct.py index becba11c16fcda..3cbf2cc0bf0dcd 100755 --- a/src/transformers/models/deprecated/mctct/modeling_mctct.py +++ b/src/transformers/models/deprecated/mctct/modeling_mctct.py @@ -24,6 +24,7 @@ from ....activations import ACT2FN from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward from ....integrations.deepspeed import is_deepspeed_zero3_enabled +from ....integrations.fsdp import is_fsdp_managed_module from ....modeling_attn_mask_utils import _prepare_4d_attention_mask from ....modeling_outputs import BaseModelOutput, CausalLMOutput from ....modeling_utils import ( @@ -579,7 +580,7 @@ def forward( f"but it is for {head_mask.size()[0]}." 
) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) @@ -588,8 +589,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( encoder_layer.__call__, diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index e80e3c41d22cb6..36e35594b3d3c6 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -29,6 +29,7 @@ from ...activations import get_activation from ...configuration_utils import PretrainedConfig from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa from ...modeling_outputs import ( BaseModelOutput, MaskedLMOutput, @@ -38,7 +39,12 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...pytorch_utils import ( + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + is_torch_greater_or_equal_than_2_2, + prune_linear_layer, +) from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -329,6 +335,86 @@ def reshape(x: torch.Tensor) -> torch.Tensor: return (attn_output,) +class DistilBertSdpaAttention(MultiHeadSelfAttention): + def __init__(self, config: PretrainedConfig): + super().__init__(config=config) + self.dropout_prob = config.attention_dropout + self.require_contiguous_qkv = not is_torch_greater_or_equal_than_2_2 + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, ...]: + """ + Parameters: + query: torch.tensor(bs, seq_length, dim) + key: torch.tensor(bs, seq_length, dim) + value: torch.tensor(bs, seq_length, dim) + mask: torch.tensor(bs, seq_length) + + Returns: + weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, + seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True` + """ + if output_attentions or head_mask is not None: + logger.warning_once( + "DistilBertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support" + " `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying" + " the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be" + ' removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + query, + key, + value, + mask, + head_mask, + output_attentions, + ) + + batch_size, _, _ = query.size() + dim_per_head = self.dim // self.n_heads + + def shape(x: torch.Tensor) -> torch.Tensor: + """separate heads""" + return x.view(batch_size, -1, self.n_heads, dim_per_head).transpose(1, 2) + + def unshape(x: torch.Tensor) -> torch.Tensor: + """group heads""" + return x.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * dim_per_head) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + + # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom + # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0. + # Reference: https://github.com/pytorch/pytorch/issues/112577 + if self.require_contiguous_qkv and q.device.type == "cuda" and mask is not None: + q = q.contiguous() + k = k.contiguous() + v = v.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + q, + k, + v, + attn_mask=mask, + dropout_p=self.dropout_prob if self.training else 0.0, + is_causal=False, + ) + + attn_output = unshape(attn_output) + attn_output = self.out_lin(attn_output) + + return (attn_output,) + + class FFN(nn.Module): def __init__(self, config: PretrainedConfig): super().__init__() @@ -353,6 +439,7 @@ def ff_chunk(self, input: torch.Tensor) -> torch.Tensor: DISTILBERT_ATTENTION_CLASSES = { "eager": MultiHeadSelfAttention, "flash_attention_2": DistilBertFlashAttention2, + "sdpa": DistilBertSdpaAttention, } @@ -503,6 +590,7 @@ class DistilBertPreTrainedModel(PreTrainedModel): base_model_prefix = "distilbert" supports_gradient_checkpointing = True _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module: nn.Module): """Initialize the weights.""" @@ -589,6 +677,7 @@ def __init__(self, config: PretrainedConfig): self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self._use_sdpa = config._attn_implementation == "sdpa" # Initialize weights and apply final processing self.post_init() @@ -689,6 +778,7 @@ def forward( device = input_ids.device if input_ids is not None else inputs_embeds.device + head_mask_is_none = head_mask is None # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) @@ -700,6 +790,11 @@ def forward( if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) + if self._use_sdpa and head_mask_is_none and not output_attentions: + attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embeddings.dtype, tgt_len=input_shape[1] + ) + return self.transformer( x=embeddings, attn_mask=attention_mask, diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index daf6e7d1dfe4ab..9552d323ac57c0 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -19,8 +19,15 @@ import re import warnings from contextlib import contextmanager +from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, 
Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class DonutProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} class DonutProcessor(ProcessorMixin): @@ -63,7 +70,14 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): self.current_processor = self.image_processor self._in_target_context_manager = False - def __call__(self, *args, **kwargs): + def __call__( + self, + images: ImageInput = None, + text: Optional[Union[str, List[str], TextInput, PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[DonutProcessorKwargs], + ): """ When used in normal mode, this method forwards all its arguments to AutoImageProcessor's [`~AutoImageProcessor.__call__`] and returns its output. If used in the context @@ -72,28 +86,29 @@ def __call__(self, *args, **kwargs): """ # For backward compatibility if self._in_target_context_manager: - return self.current_processor(*args, **kwargs) - - images = kwargs.pop("images", None) - text = kwargs.pop("text", None) - if len(args) > 0: - images = args[0] - args = args[1:] + return self.current_processor(images, text, **kwargs) if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") + output_kwargs = self._merge_kwargs( + DonutProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: - inputs = self.image_processor(images, *args, **kwargs) + inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None: - encodings = self.tokenizer(text, **kwargs) + encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) if text is None: return inputs elif images is None: return encodings else: - inputs["labels"] = encodings["input_ids"] + inputs["labels"] = encodings["input_ids"] # for BC + inputs["input_ids"] = encodings["input_ids"] return inputs def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index a200d716d451e2..0ce2f8e6985a0a 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -1652,28 +1652,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM._reorder_cache def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () diff --git 
a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index db65f6e5250f8d..359a4eabcf7b3d 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -670,13 +670,12 @@ def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs ): decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) - decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None input_dict = { "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, + "decoder_attention_mask": decoder_inputs.get("attention_mask"), "decoder_input_ids": decoder_inputs["input_ids"], "encoder_outputs": encoder_outputs, - "past_key_values": decoder_inputs["past_key_values"], + "past_key_values": decoder_inputs.get("past_key_values"), "use_cache": use_cache, } return input_dict diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 6d81c97da02302..1ab6d44faa1ce8 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -1200,35 +1200,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=True, **model_kwargs - ): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel._reorder_cache def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 6928831f0187fb..f48accab44bfc2 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -964,9 +964,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -1139,7 +1137,7 @@ def _update_causal_mask( min_dtype = torch.finfo(dtype).min batch_size, sequence_length, _ = input_tensor.shape if using_static_cache: - target_length = 
past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] diff --git a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 011197d9854273..a1954f9d9be474 100644 --- a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -650,9 +650,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) @@ -769,6 +767,8 @@ def prepare_inputs_for_generation( attention_mask: Optional[torch.LongTensor] = None, **kwargs, ): + # Overwitten -- uses `cache_params` as opposed to `past_key_values` + if use_cache: # `cache_position` should have been initialized in `generate` if cache_position is None: diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index ef1501e780350d..07f3bfad332b23 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -663,6 +663,8 @@ def set_output_embeddings(self, new_embeddings): self.pred_layer.proj = new_embeddings def prepare_inputs_for_generation(self, input_ids, **kwargs): + # Overwritten -- uses a language id + mask_token_id = self.config.mask_token_id lang_id = self.config.lang_id diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index b4fdfd5fc5676d..42a06029284800 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -800,7 +800,7 @@ def _init_weights(self, module): std = 1.0 if self.config.initializer_std is None else self.config.initializer_std nn.init.normal_(module.word_embeddings.weight, std=std) if module.word_embeddings.padding_idx is not None: - module.word_embeddings.weight.data[module.padding_idx].zero_() + module.word_embeddings.weight.data[module.word_embeddings.padding_idx].zero_() class FunnelClassificationHead(nn.Module): diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 3ab61c522eff41..e170803cccab70 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma/modular_gemma.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_gemma.py file directly. 
One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. # @@ -143,3 +143,6 @@ def __init__( tie_word_embeddings=tie_word_embeddings, **kwargs, ) + + +__all__ = ["GemmaConfig"] diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index c6070a3d96b6d2..ff206a470bc3fa 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma/modular_gemma.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_gemma.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. # @@ -43,7 +43,6 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -755,9 +754,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -894,7 +891,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1081,13 +1078,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: @@ -1322,3 +1314,6 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +__all__ = ["GemmaModel", "GemmaForCausalLM", "GemmaForSequenceClassification", "GemmaForTokenClassification"] diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index ca89b6cf2a6da8..7130a30dc9be58 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -14,8 
+14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import math -from typing import List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +import sentencepiece as spm import torch import torch.utils.checkpoint from torch import nn @@ -27,7 +28,8 @@ from ...modeling_flash_attention_utils import _flash_attention_forward from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from ...pytorch_utils import ALL_LAYERNORM_LAYERS -from ...utils import is_torchdynamo_compiling, logging +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging from ..llama.modeling_llama import ( LlamaDecoderLayer, LlamaFlashAttention2, @@ -38,6 +40,15 @@ apply_rotary_pos_emb, repeat_kv, ) +from ..llama.tokenization_llama import LlamaTokenizer + + +if TYPE_CHECKING: + from ...tokenization_utils_base import TextInput + +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} + +SPIECE_UNDERLINE = "▁" logger = logging.get_logger(__name__) @@ -164,6 +175,162 @@ def __init__( ) +class GemmaTokenizer(LlamaTokenizer, PreTrainedTokenizer): + """ + Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is + no padding token in the original model. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The end of sequence token. + pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. + sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + add_bos_token (`bool`, *optional*, defaults to `True`): + Whether or not to add an `bos_token` at the start of sequences. + add_eos_token (`bool`, *optional*, defaults to `False`): + Whether or not to add an `eos_token` at the end of sequences. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + use_default_system_prompt (`bool`, *optional*, defaults to `False`): + Whether or not the default system prompt for Gemma should be used. 
+ spaces_between_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to add spaces between special tokens. + """ + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + clean_up_tokenization_spaces=False, + use_default_system_prompt=False, + spaces_between_special_tokens=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token + + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.use_default_system_prompt = use_default_system_prompt + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + PreTrainedTokenizer.__init__( + self, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + sp_model_kwargs=sp_model_kwargs, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + use_default_system_prompt=use_default_system_prompt, + spaces_between_special_tokens=spaces_between_special_tokens, + **kwargs, + ) + + def get_spm_processor(self): + raise AttributeError("Not needed for Gemma") + + def unk_token_length(self): + raise AttributeError("Not needed for Gemma") + + def tokenize(self, text: "TextInput", **kwargs) -> List[str]: + """ + Args: + text: TextInput + Simply calls PreTrainedTokenizer's method + """ + return PreTrainedTokenizer.tokenize(self, text, **kwargs) + + def _tokenize(self, text, **kwargs): + """ + Args: + text: TextInput + Returns a tokenized string. The Gemma tokenizer never adds a prefix space. 
+ """ + return self.sp_model.encode(text, out_type=str) + + def _decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + spaces_between_special_tokens: bool = False, + **kwargs, + ) -> str: + sub_texts = [] + current_sub_text = [] + for ids in token_ids: + if skip_special_tokens and ids in self.all_special_ids: + continue + if ids in self._added_tokens_decoder: + if current_sub_text: + sub_texts.append(self.sp_model.decode(current_sub_text)) + sub_texts.append(self._added_tokens_decoder[ids].content) + current_sub_text = [] + else: + current_sub_text.append(ids) + if current_sub_text: + sub_texts.append(self.sp_model.decode(current_sub_text)) + + if spaces_between_special_tokens: + sub_texts = " ".join(sub_texts) + else: + sub_texts = "".join(sub_texts) + + return sub_texts.replace(SPIECE_UNDERLINE, " ") + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self._added_tokens_encoder: + out_string += self.sp_model.decode(current_sub_tokens) + token + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + out_string += self.sp_model.decode(current_sub_tokens) + return out_string + + class GemmaRMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-6): super().__init__() @@ -667,9 +834,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -833,13 +998,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: @@ -881,3 +1041,13 @@ def __init__(self, config): super().__init__(config) self.model = GemmaModel(config) self.post_init() + + +__all__ = [ + "GemmaConfig", + "GemmaTokenizer", + "GemmaModel", + "GemmaForCausalLM", + "GemmaForSequenceClassification", + "GemmaForTokenClassification", +] diff --git a/src/transformers/models/gemma/tokenization_gemma.py b/src/transformers/models/gemma/tokenization_gemma.py index 09e779478c0ea0..ff0d1d034c2238 100644 --- a/src/transformers/models/gemma/tokenization_gemma.py +++ b/src/transformers/models/gemma/tokenization_gemma.py @@ -1,5 +1,12 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma/modular_gemma.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_gemma.py file directly. One of our CI enforces this. 
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +19,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -"""Tokenization classes for Gemma.""" - import os from shutil import copyfile from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple @@ -26,7 +30,7 @@ if TYPE_CHECKING: - pass + from ...tokenization_utils_base import TextInput logger = logging.get_logger(__name__) @@ -110,7 +114,6 @@ def __init__( self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.use_default_system_prompt = use_default_system_prompt - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @@ -121,85 +124,60 @@ def __init__( pad_token=pad_token, add_bos_token=add_bos_token, add_eos_token=add_eos_token, - sp_model_kwargs=self.sp_model_kwargs, + sp_model_kwargs=sp_model_kwargs, clean_up_tokenization_spaces=clean_up_tokenization_spaces, use_default_system_prompt=use_default_system_prompt, spaces_between_special_tokens=spaces_between_special_tokens, **kwargs, ) - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.__getstate__ def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None state["sp_model_proto"] = self.sp_model.serialized_model_proto() return state - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.__setstate__ def __setstate__(self, d): self.__dict__ = d self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.LoadFromSerializedProto(self.sp_model_proto) @property - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.vocab_size def vocab_size(self): """Returns vocab size""" return self.sp_model.get_piece_size() - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab def get_vocab(self): """Returns vocab as a dict""" vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab + def tokenize(self, text: "TextInput", **kwargs) -> List[str]: + """ + Args: + text: TextInput + Simply calls PreTrainedTokenizer's method + """ + return super().tokenize(text, **kwargs) + def _tokenize(self, text, **kwargs): """ + Args: + text: TextInput Returns a tokenized string. The Gemma tokenizer never adds a prefix space. 
""" return self.sp_model.encode(text, out_type=str) - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_token_to_id def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" return self.sp_model.piece_to_id(token) - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_id_to_token def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" token = self.sp_model.IdToPiece(index) return token - def _decode( - self, - token_ids: List[int], - skip_special_tokens: bool = False, - spaces_between_special_tokens: bool = False, - **kwargs, - ) -> str: - sub_texts = [] - current_sub_text = [] - for ids in token_ids: - if skip_special_tokens and ids in self.all_special_ids: - continue - if ids in self._added_tokens_decoder: - if current_sub_text: - sub_texts.append(self.sp_model.decode(current_sub_text)) - sub_texts.append(self._added_tokens_decoder[ids].content) - current_sub_text = [] - else: - current_sub_text.append(ids) - if current_sub_text: - sub_texts.append(self.sp_model.decode(current_sub_text)) - - if spaces_between_special_tokens: - sub_texts = " ".join(sub_texts) - else: - sub_texts = "".join(sub_texts) - - return sub_texts.replace(SPIECE_UNDERLINE, " ") - def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" current_sub_tokens = [] @@ -214,7 +192,6 @@ def convert_tokens_to_string(self, tokens): out_string += self.sp_model.decode(current_sub_tokens) return out_string - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.save_vocabulary def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. 
@@ -242,7 +219,6 @@ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) return (out_vocab_file,) - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] @@ -254,7 +230,6 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): return output - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: @@ -292,7 +267,6 @@ def get_special_tokens_mask( + eos_token_id ) - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: @@ -325,3 +299,35 @@ def create_token_type_ids_from_sequences( output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) return output + + def _decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + spaces_between_special_tokens: bool = False, + **kwargs, + ) -> str: + sub_texts = [] + current_sub_text = [] + for ids in token_ids: + if skip_special_tokens and ids in self.all_special_ids: + continue + if ids in self._added_tokens_decoder: + if current_sub_text: + sub_texts.append(self.sp_model.decode(current_sub_text)) + sub_texts.append(self._added_tokens_decoder[ids].content) + current_sub_text = [] + else: + current_sub_text.append(ids) + if current_sub_text: + sub_texts.append(self.sp_model.decode(current_sub_text)) + + if spaces_between_special_tokens: + sub_texts = " ".join(sub_texts) + else: + sub_texts = "".join(sub_texts) + + return sub_texts.replace(SPIECE_UNDERLINE, " ") + + +__all__ = ["GemmaTokenizer"] diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index 44f96efb6df3f9..74976bdd340f41 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma2/modular_gemma2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_gemma2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. 
# diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index c52b7b82e13d61..0b99aa59c65b41 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma2/modular_gemma2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_gemma2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. # @@ -42,7 +42,6 @@ add_start_docstrings_to_model_forward, is_flash_attn_greater_or_equal, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -769,9 +768,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -883,7 +880,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if isinstance(past_key_values, HybridCache): - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1] @@ -1058,10 +1055,6 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) if self.config.final_logit_softcapping is not None: @@ -1069,8 +1062,6 @@ def forward( logits = torch.tanh(logits) logits = logits * self.config.final_logit_softcapping - # TODO: remove the float() operation in v4.46 - logits = logits.float() loss = None if labels is not None: # Upcast to float if we need to compute the loss to avoid potential precision issues @@ -1110,7 +1101,8 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): - """Different from the base `prepare_inputs_for_generation` because of `HybridCache`.""" + # Overwritten: has a special cache type, `HybridCache` + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special 
slicing of input_ids, so we don't need to do it here @@ -1154,7 +1146,7 @@ def prepare_inputs_for_generation( attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( attention_mask, sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), + target_length=past_key_values.get_max_cache_shape(), dtype=self.lm_head.weight.dtype, device=device, cache_position=cache_position, diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index ff53955716e69f..c0f76dbe5bfcbd 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -31,7 +31,6 @@ is_flash_attn_2_available, is_flash_attn_greater_or_equal, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, ) from ..gemma.modeling_gemma import ( @@ -606,9 +605,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -721,7 +718,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if isinstance(past_key_values, HybridCache): - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1] @@ -800,10 +797,6 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) if self.config.final_logit_softcapping is not None: @@ -811,8 +804,6 @@ def forward( logits = torch.tanh(logits) logits = logits * self.config.final_logit_softcapping - # TODO: remove the float() operation in v4.46 - logits = logits.float() loss = None if labels is not None: # Upcast to float if we need to compute the loss to avoid potential precision issues @@ -852,7 +843,8 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): - """Different from the base `prepare_inputs_for_generation` because of `HybridCache`.""" + # Overwritten: has a special cache type, `HybridCache` + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here @@ -896,7 +888,7 @@ def prepare_inputs_for_generation( attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( attention_mask, sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), + target_length=past_key_values.get_max_cache_shape(), dtype=self.lm_head.weight.dtype, device=device, cache_position=cache_position, diff --git a/src/transformers/models/git/modeling_git.py 
b/src/transformers/models/git/modeling_git.py index cf8edfe474880f..c7f9ceafe19452 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -650,15 +650,15 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 - self.position_embeddings = self.position_embedding.weight.unsqueeze(0) - num_positions = self.position_embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: - return self.position_embeddings + return self.position_embedding(self.position_ids) - class_pos_embed = self.position_embeddings[:, :1] - patch_pos_embed = self.position_embeddings[:, 1:] + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] dim = embeddings.shape[-1] diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index e99f4b126246d8..b0c0f2c378b4ff 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -1235,53 +1235,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - token_type_ids = kwargs.get("token_type_ids", None) - # Omit tokens covered by past_key_values - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -input_ids.shape[1] :] - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - else: - position_ids = None - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - } - ) - - return model_inputs - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1441,52 +1394,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - def prepare_inputs_for_generation(self, input_ids, inputs_embeds=None, past_key_values=None, **kwargs): - token_type_ids = kwargs.get("token_type_ids", None) - # Omit tokens covered by past_key_values - if past_key_values: - past_length = 
past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -input_ids.shape[1] :] - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - else: - position_ids = None - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids.contiguous()} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - } - ) - return model_inputs - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) def forward( diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py index 90e83f0d35a351..795b5ce067298f 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py +++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py @@ -97,8 +97,8 @@ def __init__( **kwargs, ): super().__init__( - vocab_file, - merges_file, + vocab_file=vocab_file, + merges_file=merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, bos_token=bos_token, diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index ca1c03fcd9f911..5326c7b907d4b1 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -1059,6 +1059,8 @@ def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): + # Overwritten -- `past_key_values` with uncommon shape + token_type_ids = kwargs.get("token_type_ids", None) # Omit tokens covered by past_key_values if past_key_values: diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 234f0f6f10dbb3..7bba7608e6c187 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -674,9 +674,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -822,7 +820,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device 
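Several forward passes in this diff (Gemma2, GPT-Neo, GPT-NeoX, GPT-J, Granite, Idefics) keep the same exclusive-or guard on `input_ids`/`inputs_embeds` and only shorten its error message. As a quick standalone illustration of why the `^` test accepts exactly one of the two inputs, here is a hedged sketch; `validate_model_inputs` is a made-up helper name, not a transformers API.

```python
# Standalone illustration of the `(input_ids is None) ^ (inputs_embeds is not None)` guard.
def validate_model_inputs(input_ids=None, inputs_embeds=None):
    # XOR is False (no error) exactly when one and only one of the two inputs is given.
    if (input_ids is None) ^ (inputs_embeds is not None):
        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")


validate_model_inputs(input_ids=[1, 2, 3])          # ok: only ids
validate_model_inputs(inputs_embeds=[[0.1, 0.2]])   # ok: only embeddings
try:
    validate_model_inputs()                         # neither given -> raises
except ValueError as err:
    print(err)
```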
sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -936,77 +934,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - def prepare_inputs_for_generation( - self, - input_ids, - attention_mask=None, - token_type_ids=None, - position_ids=None, - past_key_values=None, - inputs_embeds=None, - cache_position=None, - use_cache=True, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -input_ids.shape[1] :] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. 
- model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.transformer._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - ) - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 60552106d61702..f4636db0a97b44 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -876,9 +876,7 @@ def forward( use_cache = use_cache if use_cache is not None else self.config.use_cache if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -1019,7 +1017,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py index c79e6d9ada15d3..7fafa440d05113 100644 --- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py +++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py @@ -105,8 +105,8 @@ def __init__( **kwargs, ): super().__init__( - vocab_file, - merges_file, + vocab_file=vocab_file, + merges_file=merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, bos_token=bos_token, diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index 2fdb730e7ca1b3..b618f531e52f66 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -603,9 +603,7 @@ def forward( use_cache = use_cache if use_cache is not None else self.config.use_cache if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.embed_in(input_ids) @@ 
-723,7 +721,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index f6fe90fc6c5618..5c80485823c10b 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -746,9 +746,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -917,7 +915,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1065,78 +1063,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - # Copied from transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - attention_mask=None, - token_type_ids=None, - position_ids=None, - past_key_values=None, - inputs_embeds=None, - cache_position=None, - use_cache=True, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -input_ids.shape[1] :] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. 
- model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.transformer._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - ) - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index ff6cf73cef4e3f..0eb27d452f08d2 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -766,9 +766,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index ebb74176094a05..ebdea826fa0450 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -997,9 +997,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -1388,74 +1386,6 @@ def forward( router_logits=outputs.router_logits, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - output_router_logits=False, - cache_position=None, - position_ids=None, - use_cache=True, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is 
Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. - model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - ) - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - "output_router_logits": output_router_logits, - } - ) - return model_inputs - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index d5882c63a97b31..8c6c99846689b2 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -600,11 +600,14 @@ def build_position_encoding(config): # Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention def multi_scale_deformable_attention( - value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor + value: Tensor, + value_spatial_shapes: Union[Tensor, List[Tuple]], + sampling_locations: Tensor, + attention_weights: Tensor, ) -> Tensor: batch_size, _, num_heads, hidden_dim = value.shape _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape - value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1) + value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level_id, (height, width) in enumerate(value_spatial_shapes): @@ -681,29 +684,6 @@ def __init__(self, config: GroundingDinoConfig, 
num_heads: int, n_points: int): self.disable_custom_kernels = config.disable_custom_kernels - self._reset_parameters() - - def _reset_parameters(self): - nn.init.constant_(self.sampling_offsets.weight.data, 0.0) - default_dtype = torch.get_default_dtype() - thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * (2.0 * math.pi / self.n_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = ( - (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) - .view(self.n_heads, 1, 1, 2) - .repeat(1, self.n_levels, self.n_points, 1) - ) - for i in range(self.n_points): - grid_init[:, :, i, :] *= i + 1 - with torch.no_grad(): - self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) - nn.init.constant_(self.attention_weights.weight.data, 0.0) - nn.init.constant_(self.attention_weights.bias.data, 0.0) - nn.init.xavier_uniform_(self.value_proj.weight.data) - nn.init.constant_(self.value_proj.bias.data, 0.0) - nn.init.xavier_uniform_(self.output_proj.weight.data) - nn.init.constant_(self.output_proj.bias.data, 0.0) - def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): return tensor if position_embeddings is None else tensor + position_embeddings @@ -716,6 +696,7 @@ def forward( position_embeddings: Optional[torch.Tensor] = None, reference_points=None, spatial_shapes=None, + spatial_shapes_list=None, level_start_index=None, output_attentions: bool = False, ): @@ -725,6 +706,7 @@ def forward( batch_size, num_queries, _ = hidden_states.shape batch_size, sequence_length, _ = encoder_hidden_states.shape + # Ignore copy if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: raise ValueError( "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" @@ -760,7 +742,7 @@ def forward( else: raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") - if self.disable_custom_kernels: + if self.disable_custom_kernels or MultiScaleDeformableAttention is None: # PyTorch implementation output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) else: @@ -1526,7 +1508,27 @@ def _init_weights(self, module): nn.init.uniform_(module.row_embeddings.weight) nn.init.uniform_(module.column_embeddings.weight) elif isinstance(module, GroundingDinoMultiscaleDeformableAttention): - module._reset_parameters() + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) elif isinstance(module, GroundingDinoBiMultiHeadAttention): nn.init.xavier_uniform_(module.vision_proj.weight) module.vision_proj.bias.data.fill_(0) diff --git 
a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index da79c2894877b4..ad21d768e35c80 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -25,6 +25,7 @@ from ...activations import ACT2FN from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel from ...utils import ( @@ -664,7 +665,7 @@ def forward( value_states, attention_mask, q_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -968,7 +969,7 @@ def forward( hidden_states = self.layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for layer in self.layers: if output_hidden_states: @@ -978,8 +979,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, @@ -1055,7 +1056,7 @@ def forward( hidden_states = hidden_states + position_embeddings hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for layer in self.layers: if output_hidden_states: @@ -1065,8 +1066,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index c43ba3e9a6b74a..02de8d61ae204c 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -183,51 +183,6 @@ def expand_inputs_for_generation( return input_ids, model_kwargs -def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs): - token_type_ids = kwargs.get("token_type_ids", None) - cache_position = kwargs.get("cache_position", None) - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - if past_key_values is not None: - if input_ids.shape[1] != cache_position.shape[0]: - input_ids = input_ids[:, cache_position] - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -input_ids.shape[1] :] - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not 
None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - pixel_values = kwargs.get("pixel_values", None) - image_encoder_embeddings = kwargs.get("image_encoder_embeddings", None) - perceiver_embeddings = kwargs.get("perceiver_embeddings", None) - image_attention_mask = kwargs.get("image_attention_mask", None) - interpolate_pos_encoding = kwargs.get("interpolate_pos_encoding", False) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "cache_position": cache_position, - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - "pixel_values": pixel_values, - "image_encoder_embeddings": image_encoder_embeddings, - "perceiver_embeddings": perceiver_embeddings, - "image_attention_mask": image_attention_mask, - "interpolate_pos_encoding": interpolate_pos_encoding, - } - - def freeze_model(model, module_exceptions=[]): mapping = { "LayerNorm": nn.LayerNorm, @@ -1172,9 +1127,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -1212,11 +1165,9 @@ def forward( # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids[:, -seq_length:] elif position_ids is None: - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0) + position_ids = cache_position.unsqueeze(0) if (pixel_values, image_encoder_embeddings, perceiver_embeddings).count(None) != 2: raise ValueError( @@ -1449,7 +1400,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1686,7 +1637,9 @@ def forward( labels = labels.to(logits.device) # Shift so that tokens < n predict n if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:].to(logits.device) + # we use the input attention mask to shift the logits and labels, because it is 2D. 
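The Idefics hunks in this area replace the plain `attention_mask[..., 1:]` shift with a crop to the last `logits.shape[1] - 1` columns before masking the shifted logits and labels. The following is a small PyTorch sketch of that shift-and-mask pattern on toy tensors; `compute_shifted_loss` is a hypothetical helper, not the model's actual loss code.

```python
# Hedged sketch of the shift-and-mask loss pattern; toy shapes and values.
import torch
import torch.nn.functional as F


def compute_shifted_loss(logits, labels, attention_mask):
    # Crop the 2D mask so it lines up with the shifted logits (handles prefix tuning,
    # where the mask can be longer than the logits).
    shift_mask = attention_mask[:, -(logits.shape[1] - 1):]
    shift_logits = logits[..., :-1, :][shift_mask != 0]
    shift_labels = labels[..., 1:][shift_mask != 0]
    return F.cross_entropy(shift_logits.float(), shift_labels)


batch, seq_len, vocab = 2, 5, 11
logits = torch.randn(batch, seq_len, vocab)
labels = torch.randint(0, vocab, (batch, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 1, 1], [0, 0, 1, 1, 1]])  # second row is left-padded
print(compute_shifted_loss(logits, labels, attention_mask))
```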
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous() shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous() else: @@ -1709,19 +1662,57 @@ def forward( image_hidden_states=outputs.image_hidden_states, ) - def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + position_ids=None, + pixel_values=None, + image_hidden_states=None, + use_cache=None, + cache_position=None, + **kwargs, + ): + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + if past_key_values is not None: + if input_ids.shape[1] != cache_position.shape[0]: + input_ids = input_ids[:, cache_position] + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + position_ids = position_ids.clone(memory_format=torch.contiguous_format) + + model_inputs = {} image_hidden_states = kwargs.pop("image_hidden_states", None) if image_hidden_states is not None: if self.config.use_resampler: - kwargs["perceiver_embeddings"] = image_hidden_states + model_inputs["perceiver_embeddings"] = image_hidden_states else: - kwargs["image_encoder_embeddings"] = image_hidden_states - kwargs["pixel_values"] = None - inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs) - unwanted_kwargs = ["token_type_ids"] - for kwarg in unwanted_kwargs: - inputs.pop(kwarg, None) - return inputs + model_inputs["image_encoder_embeddings"] = image_hidden_states + pixel_values = None + + model_inputs.update( + { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "cache_position": cache_position, + "position_ids": position_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "image_attention_mask": kwargs.get("image_attention_mask", None), + "interpolate_pos_encoding": kwargs.get("interpolate_pos_encoding", False), + } + ) + + return model_inputs @staticmethod def _expand_inputs_for_generation( diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 8e9e196764f923..3406ab2226e08b 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -16,13 +16,21 @@ Processor class for IDEFICS. 
""" -from typing import Callable, List, Optional, Union +from typing import Callable, Dict, List, Optional, Union from urllib.parse import urlparse from ...feature_extraction_utils import BatchFeature -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy +from ...processing_utils import ( + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + TextKwargs, + Unpack, + _validate_images_text_input_order, +) +from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_tf_available, is_torch_available +from ...utils.deprecation import deprecate_kwarg if is_torch_available(): @@ -34,6 +42,32 @@ IMAGE_TOKEN = "" +class IdeficsImagesKwargs(ImagesKwargs, total=False): + transform: Optional[Callable] + image_size: Optional[Dict[str, int]] + image_mean: Optional[Union[float, List[float]]] + image_std: Optional[Union[float, List[float]]] + + +class IdeficsTextKwargs(TextKwargs, total=False): + add_eos_token: Optional[bool] + add_end_of_utterance_token: Optional[bool] + + +class IdeficsProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: IdeficsTextKwargs + images_kwargs: IdeficsImagesKwargs + _defaults = { + "text_kwargs": { + "add_special_tokens": False, + "padding": "longest", + "add_eos_token": False, + }, + "images_kwargs": {}, + "common_kwargs": {"return_tensors": "pt"}, + } + + # copied from m4.training.packing def incremental_to_binary_attention_mask(incremental_mask, return_tensors, num_classes=-1): # Set elements >= num_classes to -1 @@ -199,52 +233,32 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u else False ) + @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True) def __call__( self, - prompts: Union[List[TextInput], List[List[TextInput]]], - padding: Union[bool, str, PaddingStrategy] = "longest", - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - transform: Callable = None, - add_eos_token=False, - add_end_of_utterance_token=None, - debug=False, - return_tensors="pt", - ) -> BatchEncoding: + images=None, + text: Union[ + TextInput, + PreTokenizedInput, + List[TextInput], + List[PreTokenizedInput], + List[List[TextInput]], + List[List[PreTokenizedInput]], + ] = None, + audio=None, + videos=None, + **kwargs: Unpack[IdeficsProcessorKwargs], + ) -> BatchFeature: """This method takes batched or non-batched prompts made of text and images and converts them into prompts that the model was trained on and prepares the image pixel values for the model to process. Args: - prompts (`Union[List[TextInput], [List[List[TextInput]]]]`): + images (`Union[PIL.Image, str, List[PIL.Image], List[str]]`): + either a single image or a batched list of images - can be passed in when text contains only text prompts, + in order to use the image-text-to-text behavior. + text (`Union[List[TextInput], [List[List[TextInput]]]]`): either a single prompt or a batched list of prompts - see the detailed description immediately after the end of the arguments doc section. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `"longest"`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). 
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'`: No padding. This will raise an error if the input sequences are of different - lengths. - Note: Unlike most processors, which set padding=`False` by default, `IdeficsProcessor` sets `padding="longest"` - by default. See https://github.com/huggingface/transformers/pull/29449#pullrequestreview-1925576061 for why. - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - transform (`Callable`, *optional*): - A custom transform function that accepts a single image can be passed for training. For example, - `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific - set of transforms will be applied to the images - add_eos_token (`bool`, *optional*, defaults to `False`): - Adds `eos_token` at the end of the final prompt if True` - add_end_of_utterance_token (`bool`, *optional*) - Whether to automatically add `` after each prompt's text input (unless followed by an - image). If `None` the tokenizer will be checked instead and if this token is found in - `additional_special_tokens` then the value will be `True`. - debug (`bool`, *optional*, defaults to `False`): - `True` value will help debug prompt generation by dumping useful information return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`): The type of tensors to return. Can be one of: - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. @@ -255,7 +269,7 @@ def __call__( Detailed explanation: - Each entry in `prompts` is either a text to be passed as is or an image that will be processed. + Each entry in `text` is either a text to be passed as is or an image that will be processed. An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved. @@ -279,7 +293,7 @@ def __call__( "Describe this image.\nAssistant:", ] - inputs = processor(prompts, return_tensors="pt") + inputs = processor(text=prompts, return_tensors="pt") generated_ids = model.generate(**inputs, max_length=100) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] ``` @@ -311,18 +325,55 @@ def __call__( transforms.Normalize(mean=self.image_mean, std=self.image_std), ] ) - inputs = processor(prompts, transform=image_transform, return_tensors="pt") + inputs = processor(text=prompts, transform=image_transform, return_tensors="pt") ``` In order to help debug prompt generation enable `debug=True` which will show you what's happening. 
""" + if images is None and text is None: + raise ValueError("You need to specify either `text` or `images` and `text`.") + # check if images and text inputs are reversed for BC + images, text = _validate_images_text_input_order(images, text) + + if images is None: + # assuming the user wants to use the old behavior with prompts as the only argument + prompts = text + elif text is not None: + # Assuming image-text-to-text behavior: + # Check if batched images are provided + if not isinstance(images, (list, tuple)): + images = [images] + if isinstance(text, str): + text = [text] + # Check if batched images and text are in the correct format + if isinstance(text, (list, tuple)) and len(text) != len(images): + raise ValueError( + "When providing both images and text arguments, the number of text prompts should be the same as the number of images." + "If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...]." + ) + # Check that only text is present in the prompts + if not all(isinstance(i, str) for i in text): + raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.") + if isinstance(images[0], (list, tuple)): + # if nested images, nest text as well + text = [[i] for i in text] + prompts = list(zip(images, text)) + + output_kwargs = self._merge_kwargs( + IdeficsProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False) + add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None) # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it if add_end_of_utterance_token is None: add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token # turn non-batched prompts into batched - if not any(isinstance(i, list) for i in prompts): + if not any(isinstance(i, (list, tuple)) for i in prompts): prompts = [prompts] fake_token = "" @@ -371,21 +422,14 @@ def image_tokens(last_was_image): if add_eos_token: full_text += self.tokenizer.eos_token - if debug is True: - print(f"{full_text=}") - - image_objects = self.image_processor(image_objects, transform=transform, return_tensors=return_tensors) + image_objects = self.image_processor(image_objects, **output_kwargs["images_kwargs"]) all_prompts.append(full_text) all_images.append(image_objects) - text_encoding = self.tokenizer( - text=all_prompts, - add_special_tokens=False, - padding=padding, - truncation=truncation, - max_length=max_length, - ) + # For BC + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", "pt") + text_encoding = self.tokenizer(all_prompts, **output_kwargs["text_kwargs"]) all_texts = text_encoding["input_ids"] all_attention_masks = text_encoding["attention_mask"] @@ -398,12 +442,12 @@ def image_tokens(last_was_image): output_images = [] output_attention_masks = [] - for text, attention_mask, images in zip(all_texts, all_attention_masks, all_images): - padded_input_ids = text + for text_single, attention_mask, extracted_images in zip(all_texts, all_attention_masks, all_images): + padded_input_ids = text_single image_count = padded_input_ids.count(self.image_token_id) local_max_num_images = min(image_count, max_num_images) - current_images = images[:local_max_num_images] + current_images = extracted_images[:local_max_num_images] if len(current_images) > 0: if return_tensors 
== "pt": diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 056811138155f3..b53d0722587d5a 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -34,7 +34,6 @@ add_start_docstrings_to_model_forward, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -1584,7 +1583,7 @@ def forward( ... "In which city is that bridge located?", ... ] >>> images = [[image1, image2], [image3]] - >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to("cuda") + >>> inputs = processor(images=images, text=prompts, padding=True, return_tensors="pt").to("cuda") >>> # Generate >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20) @@ -1617,13 +1616,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: @@ -1632,7 +1626,9 @@ def forward( labels = labels.to(logits.device) # Shift so that tokens < n predict n if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:].to(logits.device) + # we use the input attention mask to shift the logits and labels, because it is 2D. 
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous() shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous() else: @@ -1669,7 +1665,7 @@ def prepare_inputs_for_generation( if past_key_values is not None: # Past key values are always initialized with a `Cache` object -> no need for if-else anymore past_length = past_key_values.get_seq_length() - max_cache_length = past_key_values.get_max_length() + max_cache_length = past_key_values.get_max_cache_shape() # Keep only the unprocessed tokens: # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index 2e14118144baaa..68566d182678c2 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -20,9 +20,15 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import AddedToken, BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy -from ...utils import TensorType, logging +from ...processing_utils import ( + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + Unpack, + _validate_images_text_input_order, +) +from ...tokenization_utils_base import AddedToken, TextInput +from ...utils import logging if TYPE_CHECKING: @@ -40,6 +46,23 @@ def is_image_or_image_url(elem): return is_url(elem) or is_valid_image(elem) +class Idefics2ImagesKwargs(ImagesKwargs, total=False): + image_seq_len: Optional[int] + + +class Idefics2ProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Idefics2ImagesKwargs + + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "is_split_into_words": False, + }, + "images_kwargs": {}, + } + + class Idefics2Processor(ProcessorMixin): r""" Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor. @@ -97,16 +120,12 @@ def _extract_images_from_prompts(self, prompts): def __call__( self, - text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None, images: Union[ImageInput, List[ImageInput], List[List[ImageInput]]] = None, - image_seq_len: Optional[int] = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - is_split_into_words: bool = False, - add_special_tokens: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - ) -> BatchEncoding: + text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None, + audio=None, + videos=None, + **kwargs: Unpack[Idefics2ProcessorKwargs], + ) -> BatchFeature: """ Processes the input prompts and returns a BatchEncoding. @@ -130,7 +149,7 @@ def __call__( ... "In this image, we see", ... "bla bla bla", ... 
] - >>> outputs = processor(text=text, images=images, return_tensors="pt", padding=True) + >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True) >>> input_ids = outputs.input_ids >>> input_tokens = processor.tokenizer.batch_decode(input_ids) >>> print(input_tokens) @@ -138,6 +157,9 @@ def __call__( ``` Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1. text (`Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set @@ -145,27 +167,22 @@ def __call__( Wherever an image token, `` is encountered it is expanded to `` + `` * `image_seq_len` * `. - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1. - image_seq_len (`int`, *optional*): - The length of the image sequence. If not provided, the default value is used. - padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `False`): - Padding strategy applied to the input ids. See [`PreTrainedTokenizerFast.pad`] for more information. - truncation (`Union[bool, str, TruncationStrategy]`, *optional*): - Truncation strategy applied to the input ids. See [`PreTrainedTokenizerFast.truncate`] for more information. - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding/truncation length. See - [`PreTrainedTokenizerFast.__call__`] for more information. - is_split_into_words (`bool`, *optional*, defaults to `False`): - Whether the input text is split into words or not. If set to `True`, the tokenizer will skip the - tokenization process and assume the input is already tokenized. - add_special_tokens (`bool`, *optional*, defaults to `True`): - Whether to add special tokens or not. See [`PreTrainedTokenizerFast.__call__`] for more information. return_tensors (`Union[str, TensorType]`, *optional*): If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more information. 
+ """ + if text is None and images is None: + raise ValueError("You must provide either `text` or `images`.") + # check if images and text inputs are reversed for BC + images, text = _validate_images_text_input_order(images, text) + + output_kwargs = self._merge_kwargs( + Idefics2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None) image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len n_images_in_text = [] @@ -194,15 +211,7 @@ def __call__( sample = sample.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}") prompt_strings.append(sample) - text_inputs = self.tokenizer( - text=prompt_strings, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - is_split_into_words=is_split_into_words, - return_tensors=return_tensors, - ) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) inputs.update(text_inputs) if images is not None: @@ -227,7 +236,7 @@ def __call__( # Load images if they are URLs images = [[load_image(im) for im in sample] for sample in images] - image_inputs = self.image_processor(images, return_tensors=return_tensors) + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) inputs.update(image_inputs) return inputs diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index bd64e5db681b71..757391175ea671 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -1213,7 +1213,9 @@ def forward( labels = labels.to(logits.device) # Shift so that tokens < n predict n if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:].to(logits.device) + # we use the input attention mask to shift the logits and labels, because it is 2D. 
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous() shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous() else: @@ -1251,7 +1253,7 @@ def prepare_inputs_for_generation( if past_key_values is not None: # Past key values are always initialized with a `Cache` object -> no need for if-else anymore past_length = past_key_values.get_seq_length() - max_cache_length = past_key_values.get_max_length() + max_cache_length = past_key_values.get_max_cache_shape() # Keep only the unprocessed tokens: # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 10042ca4529204..ceafa26a8b1187 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -17,12 +17,11 @@ """ import re -import sys from typing import TYPE_CHECKING, Dict, List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput from ...utils import logging @@ -30,11 +29,6 @@ if TYPE_CHECKING: from ...tokenization_utils_base import PreTokenizedInput -if sys.version_info >= (3, 11): - from typing import Unpack -else: - from typing_extensions import Unpack - logger = logging.get_logger(__name__) @@ -241,9 +235,6 @@ def __call__( **kwargs, ) - # Temporary fix for "padding_side" in init_kwargs - output_kwargs["text_kwargs"].pop("padding_side", None) - image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len n_images_in_text = [] diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index 4dfbae1238c48e..8031950bc9b10a 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -900,43 +900,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - def prepare_inputs_for_generation(self, input_ids: torch.Tensor, past_key_values: Optional[bool] = None, **kwargs): - token_type_ids = kwargs.get("token_type_ids", None) - # Omit tokens covered by past_key_values - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -input_ids.shape[1] :] - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if 
past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - else: - position_ids = None - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - } - @add_start_docstrings_to_model_forward(IMAGEGPT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index 02672bdce83ad9..e7c8eeccef98b4 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/instructblipvideo/modular_instructblipvideo.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_instructblipvideo.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2024 HuggingFace Inc. team. All rights reserved. # diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index 131b8fe57bd665..b83df54785fa14 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -57,8 +57,11 @@ def make_batched_videos(videos) -> List[VideoInput]: elif len(videos[0].shape) == 4: return [list(video) for video in videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if isinstance(videos, PIL.Image.Image): + return [[videos]] + elif len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index 0808aa58b855fe..a300268ed71327 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. 
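The `make_batched_videos` fix above now accepts a bare `PIL.Image.Image` (wrapped as a one-frame, one-video batch) instead of assuming every valid image exposes `.shape`. A standalone, simplified sketch of the same normalization, with the library's `is_valid_image` check replaced by plain `isinstance`/`ndim` tests:

```python
from typing import List

import numpy as np
import PIL.Image


def make_batched_videos_sketch(videos) -> List[list]:
    """Normalize the input into a list of videos, each video being a list of frames."""
    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)):
        return list(videos)  # already batched: list of videos, each a list of frames
    elif isinstance(videos, (list, tuple)) and isinstance(videos[0], PIL.Image.Image):
        return [list(videos)]  # a single video given as a list of PIL frames
    elif isinstance(videos, (list, tuple)) and getattr(videos[0], "ndim", 0) == 4:
        return [list(video) for video in videos]  # list of (frames, h, w, c) arrays
    elif isinstance(videos, PIL.Image.Image):
        return [[videos]]  # the new branch: a single frame becomes one video with one frame
    elif getattr(videos, "ndim", 0) == 4:
        return [list(videos)]  # a single (frames, h, w, c) array
    raise ValueError(f"Could not make batched video from {videos}")


frame = PIL.Image.new("RGB", (8, 8))
clip = np.zeros((4, 8, 8, 3), dtype=np.uint8)
assert make_batched_videos_sketch(frame) == [[frame]]
assert len(make_batched_videos_sketch(clip)[0]) == 4
```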
One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/instructblipvideo/modular_instructblipvideo.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_instructblipvideo.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2024 HuggingFace Inc. team. All rights reserved. # @@ -58,7 +58,6 @@ @dataclass -# Copied from transformers.models.blip_2.modeling_blip_2.Blip2ForConditionalGenerationModelOutput with Blip2->InstructBlipVideo class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput): """ Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`]. @@ -91,7 +90,6 @@ def to_tuple(self) -> Tuple[Any]: ) -# Copied from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->InstructBlipVideo class InstructBlipVideoVisionEmbeddings(nn.Module): def __init__(self, config: InstructBlipVideoVisionConfig): super().__init__() @@ -166,7 +164,6 @@ def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: boo return embeddings -# Copied from transformers.models.blip_2.modeling_blip_2.Blip2Attention with Blip2->InstructBlipVideo class InstructBlipVideoAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -248,7 +245,6 @@ def forward( return outputs -# Copied from transformers.models.blip.modeling_blip.BlipMLP class InstructBlipVideoMLP(nn.Module): def __init__(self, config): super().__init__() @@ -264,7 +260,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.blip.modeling_blip.BlipEncoderLayer with Blip->InstructBlipVideo class InstructBlipVideoEncoderLayer(nn.Module): def __init__(self, config: InstructBlipVideoConfig): super().__init__() @@ -330,7 +325,6 @@ class InstructBlipVideoPreTrainedModel(PreTrainedModel): ] _keep_in_fp32_modules = [] - # Copied from transformers.models.blip_2.modeling_blip_2.Blip2PreTrainedModel._init_weights with Blip2->InstructBlipVideo def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_range @@ -450,7 +444,6 @@ def _init_weights(self, module): """ -# Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->InstructBlipVideo class InstructBlipVideoEncoder(nn.Module): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. 
Each layer is a @@ -537,7 +530,6 @@ def forward( ) -# Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->InstructBlipVideo, BLIP->INSTRUCTBLIPVIDEO class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel): main_input_name = "pixel_values" config_class = InstructBlipVideoVisionConfig @@ -738,7 +730,6 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->InstructBlipVideoQFormer class InstructBlipVideoQFormerSelfOutput(nn.Module): def __init__(self, config): super().__init__() @@ -753,7 +744,6 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerAttention with Blip2->InstructBlipVideo class InstructBlipVideoQFormerAttention(nn.Module): def __init__(self, config, is_cross_attention=False): super().__init__() @@ -803,7 +793,6 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->InstructBlipVideoQFormer class InstructBlipVideoQFormerIntermediate(nn.Module): def __init__(self, config): super().__init__() @@ -819,7 +808,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->InstructBlipVideoQFormer class InstructBlipVideoQFormerOutput(nn.Module): def __init__(self, config): super().__init__() @@ -937,7 +925,6 @@ def feed_forward_chunk_query(self, attention_output): return layer_output -# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerEncoder with Blip2->InstructBlipVideo class InstructBlipVideoQFormerEncoder(nn.Module): def __init__(self, config): super().__init__() diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 07f84b362eee7a..ddb3b384a8b892 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -51,7 +51,6 @@ is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, is_mamba_ssm_available, - is_torchdynamo_compiling, ) from .configuration_jamba import JambaConfig @@ -713,11 +712,14 @@ def cuda_kernels_forward( # This is a hack to apply dt_proj while still using the forward pass of `torch.nn.Linear`, which is needed # in order to make quantization work. Quantization code replaces `torch.nn.Linear` layers with quantized # linear layers, and requires to call the forward pass directly. 
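The Jamba change that follows keeps `dt_proj` as a regular `nn.Linear` call (so quantized replacements of `torch.nn.Linear` still work) but removes the bias contribution by temporarily pointing `bias.data` at zeros and restoring it afterwards, instead of assigning `bias = None`. A minimal sketch of that save/zero/restore pattern on a plain `nn.Linear`:

```python
import torch
from torch import nn

linear = nn.Linear(4, 4)
x = torch.randn(2, 4)

# Keep a handle on the original bias tensor, swap in zeros, run the layer, then restore.
saved_bias = linear.bias.data
with torch.no_grad():
    linear.bias.data = torch.zeros_like(saved_bias)
out_without_bias = linear(x)
with torch.no_grad():
    linear.bias.data = saved_bias

# The layer behaves as before afterwards, and the bias-free output differs from it exactly by the bias.
assert torch.allclose(out_without_bias + linear.bias, linear(x))
```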
- # The original code here was: ```discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)``` - time_proj_bias = self.dt_proj.bias - self.dt_proj.bias = None + # Quantized model can't work with the original code: + # ```discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)``` + time_proj_bias = self.dt_proj.bias.data + with torch.no_grad(): + self.dt_proj.bias.data = torch.zeros_like(self.dt_proj.bias.data) discrete_time_step = self.dt_proj(time_step).transpose(1, 2) - self.dt_proj.bias = time_proj_bias + with torch.no_grad(): + self.dt_proj.bias.data = time_proj_bias A = -torch.exp(self.A_log.float()) # 3.c perform the recurrence y ← SSM(A, B, C)(x) @@ -1280,9 +1282,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -1540,12 +1540,6 @@ def forward( logits = self.lm_head(hidden_states) else: logits = self.lm_head(hidden_states[..., -num_logits_to_keep:, :]) - if labels is None and not is_torchdynamo_compiling: - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) - # TODO: remove the float() operations in v4.46 - logits = logits.float() loss = None if labels is not None: @@ -1601,6 +1595,8 @@ def prepare_inputs_for_generation( use_cache=True, **kwargs, ): + # Overwitten -- has a unique cache type, `HybridMambaAttentionDynamicCache` + empty_past_kv = past_key_values is None # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 8b39183b8fc6a6..bbc70b26d1f8a9 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -38,7 +38,6 @@ add_start_docstrings_to_model_forward, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -973,9 +972,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -1120,7 +1117,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1302,13 +1299,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are 
not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: @@ -1352,57 +1344,6 @@ def forward( router_logits=outputs.router_logits, ) - # Copied from transformers.models.mixtral.modeling_mixtral.MixtralForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - output_router_logits=False, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - "output_router_logits": output_router_logits, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index 5adc48a3a2ef59..7674e29db6b915 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -417,15 +417,15 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 - self.position_embeddings = self.position_embedding.weight.unsqueeze(0) - num_positions = self.position_embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: - return self.position_embeddings + return self.position_embedding(self.position_ids) - class_pos_embed = self.position_embeddings[:, :1] - patch_pos_embed = self.position_embeddings[:, 1:] + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] dim = embeddings.shape[-1] diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py 
index 99edee6a92a838..dde017bbb92797 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -45,7 +45,6 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -879,9 +878,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -1014,7 +1011,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1206,13 +1203,8 @@ def forward( logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] logits = torch.cat(logits, dim=-1) else: - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index cc03c1470ee24f..8e99e4eef59d68 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -43,14 +43,12 @@ B_INST, E_INST = "[INST]", "[/INST]" B_SYS, E_SYS = "<>\n", "\n<>\n\n" -# fmt: off DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \ answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\ that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \ -correct. If you don't know the answer to a question, please don't share false information.""" -# fmt: on +correct. 
If you don't know the answer to a question, please don't share false information.""" # fmt: skip class LlamaTokenizer(PreTrainedTokenizer): diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 092008873d1e27..e793ca61c750d7 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -279,6 +279,21 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_m self.vocab_size = model_embeds.num_embeddings return model_embeds + def get_image_features( + self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str + ): + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + else: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels): num_images, num_image_patches, embed_dim = image_features.shape batch_size, sequence_length = input_ids.shape @@ -429,9 +444,7 @@ def forward( ) if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if pixel_values is not None and inputs_embeds is not None: raise ValueError( @@ -450,17 +463,11 @@ def forward( ) or (input_ids.shape[-1] == 1 and pixel_values is not None) if pixel_values is not None: - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. 
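The vision-tower plus projector path is now exposed as the `get_image_features` helper added above (and wired into `forward` in the next hunk), so callers can obtain projected image embeddings without running the language model. A hedged usage sketch; the checkpoint id is illustrative and the two feature arguments are taken from the model config:

```python
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-1.5-7b-hf"  # illustrative checkpoint
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

pixel_values = processor.image_processor(Image.new("RGB", (336, 336)), return_tensors="pt").pixel_values
pixel_values = pixel_values.to(model.device, model.dtype)

with torch.no_grad():
    image_features = model.get_image_features(
        pixel_values=pixel_values,
        vision_feature_layer=model.config.vision_feature_layer,
        vision_feature_select_strategy=model.config.vision_feature_select_strategy,
    )
# One row of projected features per image patch, already in the language model's hidden size.
print(image_features.shape)
```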
- selected_image_feature = image_outputs.hidden_states[vision_feature_layer] - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - else: - raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - - image_features = self.multi_modal_projector(selected_image_feature) + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) if legacy_processing: logger.warning_once( @@ -512,7 +519,10 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 else: special_image_mask = ( - (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) ) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) @@ -536,7 +546,9 @@ def forward( if labels is not None: # Shift so that tokens < n predict n if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:] + # we use the input attention mask to shift the logits and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() else: diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index bb07ddfdaccd96..705821c2b713e8 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -786,9 +786,7 @@ def forward( ) if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if pixel_values is not None and inputs_embeds is not None: raise ValueError( @@ -898,7 +896,10 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 else: special_image_mask = ( - (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) ) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) @@ -922,7 +923,9 @@ def forward( if labels is not None: # Shift so that tokens < n predict n if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:] + # we use the input attention mask to shift the logits and labels, because it is 2D. 
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() else: diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 1631c1018306d2..0e4e39b4b3ab53 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/llava_next_video/modular_llava_next_video.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_llava_next_video.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2024 HuggingFace Inc. team. All rights reserved. # diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 46b9b23bd66b04..7df4cf20372bb7 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/llava_next_video/modular_llava_next_video.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_llava_next_video.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2024 HuggingFace Inc. team. All rights reserved. 
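The `attention_mask[:, -(logits.shape[1] - 1):]` slice applied in the hunks above (and repeated for the remaining VLMs below) exists because PEFT prefix tuning prepends virtual-token columns to the 2D attention mask, making it longer than the logits along the sequence axis; slicing from the right re-aligns it with the shifted logits and labels. A small self-contained illustration of the shape arithmetic:

```python
import torch

batch, seq_len, vocab, num_virtual_tokens = 2, 5, 11, 3

logits = torch.randn(batch, seq_len, vocab)
labels = torch.randint(0, vocab, (batch, seq_len))
# Prefix tuning prepends `num_virtual_tokens` ones to the 2D mask, so it is longer than the logits.
attention_mask = torch.ones(batch, num_virtual_tokens + seq_len, dtype=torch.long)
attention_mask[0, -2:] = 0  # pretend the first sample ends with two padding positions

# The old attention_mask[..., 1:] slice would keep seq_len + num_virtual_tokens - 1 columns and misalign.
shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1):]
shift_logits = logits[..., :-1, :][shift_attention_mask != 0]
shift_labels = labels[..., 1:][shift_attention_mask != 0]

assert shift_attention_mask.shape == (batch, seq_len - 1)
assert shift_logits.shape[0] == shift_labels.shape[0]
```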
# @@ -235,7 +235,6 @@ def forward(self, image_features): return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous() -# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNextVideo class LlavaNextVideoMultiModalProjector(nn.Module): def __init__(self, config: LlavaNextVideoConfig): super().__init__() @@ -272,7 +271,6 @@ def forward(self, image_features): "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", LLAVA_NEXT_VIDEO_START_DOCSTRING, ) -# Copied from transformers.models.llava.modeling_llava.LlavaPreTrainedModel with Llava->LlavaNextVideo,llava->llava_next_video class LlavaNextVideoPreTrainedModel(PreTrainedModel): config_class = LlavaNextVideoConfig base_model_prefix = "model" @@ -426,35 +424,27 @@ def padding_side(self, padding_side: str): raise ValueError(f"{padding_side} is not `left` or `right`.") self._padding_side = padding_side - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_input_embeddings def get_input_embeddings(self): return self.language_model.get_input_embeddings() - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_input_embeddings def set_input_embeddings(self, value): self.language_model.set_input_embeddings(value) - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_output_embeddings def get_output_embeddings(self): return self.language_model.get_output_embeddings() - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.language_model.set_output_embeddings(new_embeddings) - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_decoder def set_decoder(self, decoder): self.language_model.set_decoder(decoder) - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_decoder def get_decoder(self): return self.language_model.get_decoder() - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights def tie_weights(self): return self.language_model.tie_weights() - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.resize_token_embeddings def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) # update vocab size @@ -875,9 +865,7 @@ def forward( ) if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( @@ -980,13 +968,19 @@ def forward( else: if image_features is not None: special_image_mask = ( - (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) ) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) if video_features is not None: special_image_mask = ( - (input_ids == 
self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds) + (input_ids == self.config.video_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) ) video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features) @@ -1010,7 +1004,9 @@ def forward( if labels is not None: # Shift so that tokens < n predict n if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:] + # we use the input attention mask to shift the logits and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() else: diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index f48056cfb97e22..4b6be407dcab81 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -380,9 +380,7 @@ def forward( ) if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( @@ -485,13 +483,19 @@ def forward( else: if image_features is not None: special_image_mask = ( - (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) ) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) if video_features is not None: special_image_mask = ( - (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds) + (input_ids == self.config.video_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) ) video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features) @@ -515,7 +519,9 @@ def forward( if labels is not None: # Shift so that tokens < n predict n if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:] + # we use the input attention mask to shift the logits and labels, because it is 2D. 
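The `.to(inputs_embeds.device)` added to `special_image_mask` in these hunks matters when `input_ids` and the embedding output live on different devices (for example with `device_map="auto"`), since `masked_scatter` expects the mask on the same device as the tensor being filled. A toy-scale, CPU-only sketch of the placeholder-replacement pattern:

```python
import torch

image_token_index, hidden = 7, 4

input_ids = torch.tensor([[1, 7, 7, 2]])   # two image placeholder tokens
inputs_embeds = torch.zeros(1, 4, hidden)  # pretend these came from the embedding layer
image_features = torch.arange(2 * hidden, dtype=torch.float).reshape(2, hidden)  # one row per placeholder

special_image_mask = (
    (input_ids == image_token_index)
    .unsqueeze(-1)
    .expand_as(inputs_embeds)
    .to(inputs_embeds.device)  # keep mask and embeddings on the same device
)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features.to(inputs_embeds.dtype))

assert torch.equal(inputs_embeds[0, 1], image_features[0])
assert torch.equal(inputs_embeds[0, 2], image_features[1])
```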
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() else: diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index c378ff09f1e4ad..f65c0fe7cfb3e5 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -572,9 +572,7 @@ def forward( ) if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( @@ -678,7 +676,9 @@ def forward( if labels is not None: # Shift so that tokens < n predict n if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:] + # we use the input attention mask to shift the logits and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() else: diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index f9d550e789d83a..039e05a7ec19a0 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -172,8 +172,6 @@ def __call__( num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1 # +1 for newline token text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text] - # Padding side can be in TextKwargs but is not accepted by the tokenizer - _ = output_kwargs["text_kwargs"].pop("padding_side", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}) diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index 77386f0ff1ad9d..1588aa28aa2dbf 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -24,6 +24,7 @@ from ...activations import ACT2FN from ...generation import GenerationMixin from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_attn_mask_utils import ( _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa, @@ -339,7 +340,14 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value +# Copied from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->M2M100 class M2M100FlashAttention2(M2M100Attention): + """ + M2M100 flash attention module. 
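The new `is_fsdp_managed_module` import above feeds the encoder/decoder loops further down, where `synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)` forces every rank to execute a layer even when LayerDrop would skip it, so sharded collectives stay in sync. A single-process sketch of the LayerDrop decision itself, with the distributed checks stubbed out as a boolean:

```python
import torch
from torch import nn


def run_layers_with_layerdrop(layers, hidden_states, layerdrop: float, training: bool, synced_gpus: bool):
    """Randomly skip whole layers during training, unless sharded training requires all ranks to stay in sync."""
    for layer in layers:
        dropout_probability = torch.rand([])
        skip_the_layer = training and dropout_probability < layerdrop
        if not skip_the_layer or synced_gpus:
            # Under FSDP or DeepSpeed ZeRO-3 every rank must run the layer so collectives line up.
            hidden_states = layer(hidden_states)
    return hidden_states


layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(4)])
out = run_layers_with_layerdrop(layers, torch.randn(2, 8), layerdrop=0.5, training=True, synced_gpus=False)
print(out.shape)  # torch.Size([2, 8])
```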
This module inherits from `M2M100Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -361,7 +369,9 @@ def forward( layer_head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" + # M2M100FlashAttention2 attention does not support output_attentions + if output_attentions: + raise ValueError("M2M100FlashAttention2 attention does not support output_attentions") # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder @@ -412,25 +422,50 @@ def forward( if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + attn_output = _flash_attention_forward( query_states, key_states, value_states, attention_mask, q_len, - dropout=self.dropout, - softmax_scale=None, - use_top_left_mask=self._flash_attn_uses_top_left_mask, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, + use_top_left_mask=self._flash_attn_uses_top_left_mask, ) - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned across GPUs when using tensor-parallelism. - attn_output = attn_output.reshape(bsz, q_len, self.embed_dim) - + attn_output = attn_output.reshape(bsz, q_len, -1) attn_output = self.out_proj(attn_output) - return attn_output, None, past_key_value + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value # Copied from transformers.models.bart.modeling_bart.BartSdpaAttention with Bart->M2M100 @@ -1021,7 +1056,7 @@ def forward( f"The head_mask should be specified for {len(self.layers)} layers, but it is for" f" {head_mask.size()[0]}." 
) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: @@ -1031,8 +1066,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( @@ -1278,7 +1313,7 @@ def forward( f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" f" {head_mask.size()[0]}." ) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: @@ -1288,8 +1323,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 6bed1caab23ab7..cf84ac795eba44 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -590,9 +590,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) @@ -709,6 +707,8 @@ def prepare_inputs_for_generation( attention_mask: Optional[torch.LongTensor] = None, **kwargs, ): + # Overwitten -- uses `cache_params` as opposed to `past_key_values` + if use_cache: # `cache_position` should have been initialized in `generate` if cache_position is None: diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py index 01074af38a510b..110ae09a388704 100644 --- a/src/transformers/models/mamba2/modeling_mamba2.py +++ b/src/transformers/models/mamba2/modeling_mamba2.py @@ -862,9 +862,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) @@ -965,6 +963,8 @@ def prepare_inputs_for_generation( attention_mask: Optional[torch.Tensor] = None, **kwargs, ): + # Overwitten -- uses `cache_params` as opposed to `past_key_values` + if inputs_embeds is not None: past_len = inputs_embeds.shape[1] + input_ids.shape[1] else: diff 
--git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index cb26bb11e094cd..bbb3381bd97325 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -1258,7 +1258,8 @@ def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optio self._resize_final_logits_bias(new_num_tokens) return new_embeddings - def _resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of=None) -> nn.Embedding: + # NOTE: `_resize_token_embeddings` was rewriten in the base class, *args exists to absorb the extra arg + def _resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of=None, *args) -> nn.Embedding: old_embeddings = self.get_input_embeddings() new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of) self.set_input_embeddings(new_embeddings) @@ -1683,32 +1684,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. 
input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index a3aa69621ce11e..3c1935e7b04214 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -936,35 +936,6 @@ def forward( cross_attentions=encoder_outputs.cross_attentions, ) - # Copied from transformers.models.bert.modeling_bert.BertModel.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=True, **model_kwargs - ): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - # Copied from transformers.models.bert.modeling_bert.BertModel._reorder_cache def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index c5788951fd5988..6b94caf355d994 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -17,7 +17,7 @@ import math import warnings from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -800,11 +800,14 @@ def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> tor # Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention def multi_scale_deformable_attention( - value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor + value: Tensor, + value_spatial_shapes: Union[Tensor, List[Tuple]], + sampling_locations: Tensor, + attention_weights: Tensor, ) -> Tensor: batch_size, _, num_heads, hidden_dim = value.shape _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape - value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1) + value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level_id, (height, width) in enumerate(value_spatial_shapes): diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 3f2d6cb8e2ba8d..a10d62d6dcc338 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -397,7 +397,7 @@ def forward( 
value_states, attention_mask, q_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -2146,32 +2146,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 20506f91bcbcb2..9be1e24aa2ab6c 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1236,27 +1236,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py index 2a9faa29f0d358..514f9de706ec63 100644 --- a/src/transformers/models/mimi/modeling_mimi.py +++ b/src/transformers/models/mimi/modeling_mimi.py @@ -23,7 +23,7 @@ from torch import nn from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, StaticCache +from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_utils import PreTrainedModel @@ -1027,7 +1027,7 @@ def forward( attentions=all_self_attns, ) - # Copied from transformers.models.gemma.modeling_gemma.GemmaModel._update_causal_mask + # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask 
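Several hunks in this patch (Llama, JetMoe, the Idefics models, and the Mimi code right below) switch from `past_key_values.get_max_length()` to `past_key_values.get_max_cache_shape()`. Downstream code that has to run against both method names can bridge the rename with a small helper; a hedged sketch, exercised here with a stand-in cache object rather than a real `Cache`:

```python
from typing import Optional


def max_cache_length(past_key_values) -> Optional[int]:
    """Return the maximum cache length via whichever of the two method names the cache exposes."""
    if hasattr(past_key_values, "get_max_cache_shape"):
        return past_key_values.get_max_cache_shape()
    if hasattr(past_key_values, "get_max_length"):
        return past_key_values.get_max_length()
    return None


class _FakeStaticCache:
    """Stand-in for a fixed-capacity cache, used only to exercise the helper."""

    def __init__(self, max_len: int):
        self._max_len = max_len

    def get_max_cache_shape(self) -> int:
        return self._max_len


assert max_cache_length(_FakeStaticCache(128)) == 128
```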
def _update_causal_mask( self, attention_mask: torch.Tensor, @@ -1046,21 +1046,30 @@ def _update_causal_mask( # to infer the attention mask. past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): if AttentionMaskConverter._ignore_causal_mask_sdpa( attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, is_training=self.training, ): return None dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_length() + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() + # DynamicCache or no cache else: target_length = ( attention_mask.shape[-1] @@ -1077,7 +1086,10 @@ def _update_causal_mask( device=device, cache_position=cache_position, batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, ) + if ( self.config._attn_implementation == "sdpa" and attention_mask is not None @@ -1087,13 +1099,12 @@ def _update_causal_mask( # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. # Details: https://github.com/pytorch/pytorch/issues/110213 - min_dtype = torch.finfo(dtype).min causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask @staticmethod - # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position + # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Mimi def _prepare_4d_causal_attention_mask_with_cache_position( attention_mask: torch.Tensor, sequence_length: int, @@ -1102,6 +1113,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + config: MimiConfig, + past_key_values: Cache, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1109,13 +1122,11 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Args: attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape - `(batch_size, 1, query_length, key_value_length)`. + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. sequence_length (`int`): The sequence length being processed. target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, - to account for the 0 padding, the part of the cache that is not filled yet. 
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. dtype (`torch.dtype`): The dtype to use for the 4D attention mask. device (`torch.device`): @@ -1124,6 +1135,10 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Indices depicting the position of the input sequence tokens in the sequence. batch_size (`torch.Tensor`): Batch size. + config (`MimiConfig`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate """ if attention_mask is not None and attention_mask.dim() == 4: # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. @@ -1133,19 +1148,27 @@ def _prepare_4d_causal_attention_mask_with_cache_position( causal_mask = torch.full( (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify is current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask |= sliding_attend_mask + causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] mask_length = attention_mask.shape[-1] padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] padding_mask = padding_mask == 0 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( padding_mask, min_dtype ) - return causal_mask diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index ffa1a18307e982..b0ffe3e56e5972 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -43,7 +43,6 @@ add_start_docstrings_to_model_forward, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -750,9 +749,7 @@ def forward( # retrieve input_ids and inputs_embeds if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -859,7 +856,7 @@ def _update_causal_mask( use_cache: bool, output_attentions: bool, ): - if self._attn_implementation == "flash_attention_2": + if self.config._attn_implementation == "flash_attention_2": if attention_mask is not None and use_cache: is_padding_right = 
attention_mask[:, -1].sum().item() != input_tensor.size()[0] if is_padding_right: @@ -875,12 +872,11 @@ def _update_causal_mask( # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail # to infer the attention mask. - - # cache_position must be valid here no matter which cache we use past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 using_static_cache = isinstance(past_key_values, StaticCache) using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward if ( self.config._attn_implementation == "sdpa" and not (using_static_cache or using_sliding_window_cache) @@ -898,12 +894,9 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] - # SlidingWindowCache - if using_sliding_window_cache: - target_length = max(sequence_length, self.config.sliding_window) - # StaticCache - elif using_static_cache: - target_length = past_key_values.get_max_length() + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() # DynamicCache or no cache else: target_length = ( @@ -912,30 +905,18 @@ def _update_causal_mask( else past_seen_tokens + sequence_length + 1 ) - if attention_mask is not None and attention_mask.dim() == 4: - causal_mask = attention_mask - else: - causal_mask = torch.full( - (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device - ) - exclude_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) - if self.config.sliding_window is not None: - if not using_sliding_window_cache or sequence_length > self.config.sliding_window: - exclude_mask.bitwise_or_( - torch.arange(target_length, device=device) - <= (cache_position.reshape(-1, 1) - self.config.sliding_window) - ) - causal_mask *= exclude_mask - causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit - if attention_mask.dim() == 2: - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - padding_mask, min_dtype - ) + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). 
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, + ) if ( self.config._attn_implementation == "sdpa" @@ -950,6 +931,73 @@ def _update_causal_mask( return causal_mask + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + config: MistralConfig, + past_key_values: Cache, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + config (`MistralConfig`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify is current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask |= sliding_attend_mask + causal_mask *= diagonal_attend_mask + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + return causal_mask + class MistralForCausalLM(MistralPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] @@ -1050,13 +1098,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: @@ -1118,9 +1161,30 @@ def prepare_inputs_for_generation( # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds} + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} else: - model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + # `contiguous()` needed for compilation use cases + model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} + + if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.get_max_cache_shape(), + dtype=self.lm_head.weight.dtype, + device=device, + cache_position=cache_position, + batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, + ) if num_logits_to_keep is not None: model_inputs["num_logits_to_keep"] = num_logits_to_keep diff 
--git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index e87054cd70f58b..9c7fadbb8f885c 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -29,7 +29,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, StaticCache +from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask from ...modeling_outputs import ( @@ -44,7 +44,6 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -960,9 +959,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -1071,7 +1068,7 @@ def forward( router_logits=all_router_logits, ) - # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask def _update_causal_mask( self, attention_mask: torch.Tensor, @@ -1090,21 +1087,30 @@ def _update_causal_mask( # to infer the attention mask. past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): if AttentionMaskConverter._ignore_causal_mask_sdpa( attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, is_training=self.training, ): return None dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_length() + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() + # DynamicCache or no cache else: target_length = ( attention_mask.shape[-1] @@ -1121,6 +1127,8 @@ def _update_causal_mask( device=device, cache_position=cache_position, batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, ) if ( @@ -1132,13 +1140,12 @@ def _update_causal_mask( # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
# Details: https://github.com/pytorch/pytorch/issues/110213 - min_dtype = torch.finfo(dtype).min causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask @staticmethod - # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position + # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Mixtral def _prepare_4d_causal_attention_mask_with_cache_position( attention_mask: torch.Tensor, sequence_length: int, @@ -1147,6 +1154,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + config: MixtralConfig, + past_key_values: Cache, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1154,13 +1163,11 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Args: attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape - `(batch_size, 1, query_length, key_value_length)`. + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. sequence_length (`int`): The sequence length being processed. target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, - to account for the 0 padding, the part of the cache that is not filled yet. + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. dtype (`torch.dtype`): The dtype to use for the 4D attention mask. device (`torch.device`): @@ -1169,6 +1176,10 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Indices depicting the position of the input sequence tokens in the sequence. batch_size (`torch.Tensor`): Batch size. + config (`MixtralConfig`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate """ if attention_mask is not None and attention_mask.dim() == 4: # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
@@ -1178,19 +1189,27 @@ def _prepare_4d_causal_attention_mask_with_cache_position( causal_mask = torch.full( (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify is current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask |= sliding_attend_mask + causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] mask_length = attention_mask.shape[-1] padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] padding_mask = padding_mask == 0 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( padding_mask, min_dtype ) - return causal_mask @@ -1302,13 +1321,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: @@ -1387,6 +1401,26 @@ def prepare_inputs_for_generation( else: model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.get_max_cache_shape(), + dtype=self.lm_head.weight.dtype, + device=device, + cache_position=cache_position, + batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, + ) + if num_logits_to_keep is not None: model_inputs["num_logits_to_keep"] = num_logits_to_keep diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index 9c31d9abe5ba01..0bc77eaeec3324 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -25,7 +25,7 @@ from ... 
import PreTrainedModel from ...activations import ACT2FN -from ...cache_utils import Cache +from ...cache_utils import Cache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast @@ -1042,7 +1042,7 @@ class MllamaPreTrainedModel(PreTrainedModel): "MllamaSelfAttentionDecoderLayer", ] _supports_cache_class = True - _supports_static_cache = False + _supports_static_cache = False # static cache cannot have different shapes for each layer _supports_sdpa = True _supports_quantized_cache = True @@ -1606,9 +1606,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -1711,6 +1709,7 @@ def forward( attentions=all_self_attns, ) + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask def _update_causal_mask( self, attention_mask: torch.Tensor, @@ -1728,11 +1727,10 @@ def _update_causal_mask( # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail # to infer the attention mask. past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - # TODO: we have only SDPA currently and there's a bug when attn-bias is passed. Need to add eager attn and return the line - # self.config._attn_implementation == "sdpa" and - if self.config._attn_implementation == "sdpa" and not output_attentions: + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: if AttentionMaskConverter._ignore_causal_mask_sdpa( attention_mask, inputs_embeds=input_tensor, @@ -1742,13 +1740,15 @@ def _update_causal_mask( return None dtype, device = input_tensor.dtype, input_tensor.device - sequence_length = input_tensor.shape[1] - target_length = ( - attention_mask.shape[-1] - if isinstance(attention_mask, torch.Tensor) - else past_seen_tokens + sequence_length + 1 - ) + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). 
causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( @@ -1974,64 +1974,14 @@ def forward( attentions=outputs.attentions, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. 
- model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings( """The Mllama model which consists of a vision encoder and a language model.""", MLLAMA_START_DOCSTRING, ) class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin): + _supports_quantized_cache = False # quant cache not supported in encoder-decoder setting + def __init__(self, config: MllamaConfig): super().__init__(config) self.vocab_size = config.text_config.vocab_size @@ -2145,9 +2095,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if pixel_values is not None and inputs_embeds is not None: raise ValueError( diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index 84c8eea466e75b..eb092f021f6368 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -19,21 +19,10 @@ import numpy as np - -try: - from typing import Unpack -except ImportError: - from typing_extensions import Unpack - from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ( - ImagesKwargs, - ProcessingKwargs, - ProcessorMixin, -) +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import ( - BatchEncoding, PreTokenizedInput, TextInput, ) @@ -236,8 +225,10 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + audio=None, + videos=None, **kwargs: Unpack[MllamaProcessorKwargs], - ) -> BatchEncoding: + ) -> BatchFeature: """ Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode @@ -260,7 +251,7 @@ def __call__( - `'np'`: Return NumPy `np.ndarray` objects. - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. 
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -333,9 +324,9 @@ def __call__( data["cross_attention_mask"] = cross_attention_mask return_tensors = common_kwargs.pop("return_tensors", None) - batch_encoding = BatchFeature(data=data, tensor_type=return_tensors) + batch_feature = BatchFeature(data=data, tensor_type=return_tensors) - return batch_encoding + return batch_feature def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/mpt/modeling_mpt.py b/src/transformers/models/mpt/modeling_mpt.py index 9c826c370b752a..2b7e7ae5895a1e 100644 --- a/src/transformers/models/mpt/modeling_mpt.py +++ b/src/transformers/models/mpt/modeling_mpt.py @@ -518,43 +518,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings: torch.Tensor): self.lm_head = new_embeddings - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - **kwargs, - ) -> dict: - # only last tokens for input_ids if past is not None - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, # NITS should it be layer_past? - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 3109c4fc243118..8d7f6ad3c7c682 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -430,7 +430,7 @@ def forward( value_states, attention_mask, q_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -1506,7 +1506,8 @@ def generate( generation config. If a stopping criteria is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. @@ -2513,7 +2514,8 @@ def generate( generation config. 
If a stopping criteria is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index c8345870b2537e..96b8d29db83da1 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -446,7 +446,7 @@ def forward( value_states, attention_mask, q_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -1428,7 +1428,8 @@ def generate( generation config. If a stopping criteria is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. @@ -2364,7 +2365,8 @@ def generate( generation config. If a stopping criteria is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. 
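The Mimi, Mistral, and Mixtral hunks above all converge on the same sliding-window-aware mask construction: `_prepare_4d_causal_attention_mask_with_cache_position` now receives `config` and `past_key_values`, and `get_max_cache_shape()` replaces `get_max_length()`. The snippet below is a minimal standalone sketch of that masking rule only, assuming a single un-padded sequence; the helper name is illustrative and is not the library API.

import torch

def sliding_window_causal_mask(sequence_length, target_length, cache_position, sliding_window=None, dtype=torch.float32):
    # Start fully masked (large negative values), then zero out the positions each query may attend to.
    min_dtype = torch.finfo(dtype).min
    causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
    # A query at absolute position cache_position[i] must not attend to future key positions.
    blocked = torch.arange(target_length) > cache_position.reshape(-1, 1)
    if sliding_window is not None:
        # With a sliding window, keys more than `sliding_window - 1` tokens behind the query are blocked too.
        blocked |= torch.arange(target_length) <= (cache_position.reshape(-1, 1) - sliding_window)
    causal_mask *= blocked  # min_dtype where blocked, 0.0 where attention is allowed
    return causal_mask[None, None, :, :]  # (1, 1, query_length, key_value_length)

# Prefill of 6 tokens with a window of 4: row i attends to key positions max(0, i - 3) .. i.
mask = sliding_window_causal_mask(6, 6, cache_position=torch.arange(6), sliding_window=4)
print((mask[0, 0] == 0).int())

In the actual model code this 4D mask is additionally merged with the 2D padding mask and clipped to `target_length`, which is what the extra `attention_mask[:, :target_length]` slicing in the hunks above handles.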
diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index c47c4b26b539f7..5a466c0cec012d 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -1972,32 +1972,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/myt5/__init__.py b/src/transformers/models/myt5/__init__.py new file mode 100644 index 00000000000000..9579f723a00ef3 --- /dev/null +++ b/src/transformers/models/myt5/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule + + +_import_structure = {"tokenization_myt5": ["MyT5Tokenizer"]} + + +if TYPE_CHECKING: + from .tokenization_myt5 import MyT5Tokenizer + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/myt5/convert_myt5_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/myt5/convert_myt5_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 00000000000000..39653e4b1c77d0 --- /dev/null +++ b/src/transformers/models/myt5/convert_myt5_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,60 @@ +# coding=utf-8 +# Copyright 2024 The MyT5 authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert MyT5 checkpoint.""" + +import argparse + +from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5 +from transformers.utils import logging + + +logging.set_verbosity_info() + + +# Copied from transformers.models.t5.convert_t5_original_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): + # Initialise PyTorch model + config = T5Config.from_json_file(config_file) + print(f"Building PyTorch model from configuration: {config}") + model = T5ForConditionalGeneration(config) + + # Load weights from tf checkpoint + load_tf_weights_in_t5(model, config, tf_checkpoint_path) + + # Save pytorch-model + print(f"Save PyTorch model to {pytorch_dump_path}") + model.save_pretrained(pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help=( + "The config json file corresponding to the pre-trained MyT5 model. \nThis specifies the model architecture." + ), + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/myt5/tokenization_myt5.py b/src/transformers/models/myt5/tokenization_myt5.py new file mode 100644 index 00000000000000..69cb14b0cc9d02 --- /dev/null +++ b/src/transformers/models/myt5/tokenization_myt5.py @@ -0,0 +1,377 @@ +# coding=utf-8 +# Copyright 2024 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for model MyT5.""" + +import json +import os +import warnings +from collections import defaultdict +from typing import Dict, List, Optional, Tuple, Union + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = {"vocab_file": "byte_maps.json"} + + +class ByteRewriter: + """ + Byte rewriter class for MyT5 tokenizer. + This class is used to rewrite bytes using a hash tree. The hash tree is constructed from a set of rewriting rules. + + Args: + rewriting_rules (`str` or `Dict[str, str]`): + A path to a json file containing the rewriting rules or a dictionary containing the rewriting rules. 
+ + """ + + LEAF = "[LEAF]" + + def __init__(self, rewriting_rules: Union[str, Dict[str, str]]): + if isinstance(rewriting_rules, str): + with open(rewriting_rules, "r") as f: + rewriting_rules = json.load(f) + elif not isinstance(rewriting_rules, dict): + raise ValueError( + f"rewriting_rules should be either a path to json file or a dict, got {type(rewriting_rules)}" + ) + + self.hash_tree = self.construct_hash_tree(rewriting_rules) + reverse_rewriting_rules = {v: k for k, v in rewriting_rules.items()} + self.reverse_hash_tree = self.construct_hash_tree(reverse_rewriting_rules)
+ + def add_leaf(self, hash_tree: Dict[str, Union[dict, List[str]]], byte_in_sequence: str, byte_out_sequence: str): + """ + Add a leaf with the output byte sequence to the hash tree. + """ + byte_in_list = byte_in_sequence.split(" ") + byte_out_list = byte_out_sequence.split(" ") + + tree_pointer = hash_tree + for b in byte_in_list: + if b not in tree_pointer: + tree_pointer[b] = {} + tree_pointer = tree_pointer[b] + + tree_pointer[self.LEAF] = byte_out_list
+ + def construct_hash_tree(self, rewriting_rules: Dict[str, str]) -> Dict[str, Union[dict, List[str]]]: + """ + Construct a hash tree for rewritten byte sequences. + """ + hash_tree = defaultdict(dict) + for b in (f"{x:02x}" for x in range(256)): + hash_tree[b][self.LEAF] = [b] + + for in_sequence, out_sequence in rewriting_rules.items(): + self.add_leaf(hash_tree, in_sequence, out_sequence) + + return hash_tree
+ + def search_hash_tree(self, byte_sequence: List[str]) -> Union[None, List[str]]: + """ + Search the hash tree and return the rewritten byte sequence if found. + """ + tree_pointer = self.hash_tree + for b in byte_sequence: + if b in tree_pointer: + tree_pointer = tree_pointer[b] + else: + return None + + return tree_pointer[self.LEAF]
+ + def rewrite_bytes(self, in_bytes: List[str], reverse=False) -> List[str]: + """ + Rewrite a sequence of bytes using the hash tree. + + Args: + in_bytes (`List[str]`): A list of bytes to be rewritten. + reverse (`bool`): If True, decoding is performed with the reverse hash tree. + Returns: + `List[str]`: The rewritten byte sequence. + """ + out_bytes = [] + b_start = 0 + b_end = 0 + + while b_start < len(in_bytes): + tree_pointer = self.hash_tree if not reverse else self.reverse_hash_tree + for j in range(b_start, len(in_bytes)): + b = in_bytes[j] + if b in tree_pointer: + tree_pointer = tree_pointer[b] + elif j == b_start: + cur_leaf = [b] + b_end = j + break + else: + break + if self.LEAF in tree_pointer: + cur_leaf = tree_pointer[self.LEAF] + b_end = j + out_bytes.extend(cur_leaf) + b_start = b_end + 1 + + return out_bytes
+ + +class MyT5Tokenizer(PreTrainedTokenizer): + """ + Construct a MyT5 tokenizer. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): The file containing the byte rewriting rules. + eos_token (`str`, *optional*, defaults to `"</s>"`): + The end of sequence token. + + unk_token (`str`, *optional*, defaults to `"<unk>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `"<pad>"`): + The token used for padding, for example when batching sequences of different lengths. + extra_ids (`int`, *optional*, defaults to 125): + Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are + accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are + indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary + like in ByT5 preprocessing see + [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)). + additional_special_tokens (`List[str]`, *optional*): + Additional special tokens used by the tokenizer. + """
+ + model_input_names = ["input_ids", "attention_mask"] + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + eos_token="</s>", + unk_token="<unk>", + pad_token="<pad>", + extra_ids=125, + additional_special_tokens=None, + **kwargs, + ) -> None: + # Add extra_ids to the special token list + if extra_ids > 0 and additional_special_tokens is None: + additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)] + elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0: + # Check that we have the right number of extra_id special tokens + extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))) + if extra_tokens != extra_ids: + raise ValueError( + f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are" + " provided to MyT5Tokenizer.
In this case the additional_special_tokens must include the" + " extra_ids tokens" + ) + + pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token + eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token + # unk token needs to be in the vocab with correct index + self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token} + self.offset = len(self._added_tokens_decoder) + self._utf_vocab_size = 2**8 # utf is 8 bits + + # Load byte maps + self.byte_maps = json.load(open(vocab_file, "r")) + + self.decompose_rewriter = ByteRewriter(self.byte_maps["decompose_map"]) + self.merge_rewriter = ByteRewriter(self.byte_maps["merge_map"]) + + super().__init__( + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + extra_ids=0, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + @property + def vocab_size(self): + return self._utf_vocab_size + + # Copied from transformers.models.byt5.tokenization_byt5.ByT5Tokenizer.get_vocab + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)} + vocab.update(self.added_tokens_encoder) + return vocab + + # Copied from transformers.models.byt5.tokenization_byt5.ByT5Tokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + # normal case: some special tokens + if token_ids_1 is None: + return ([0] * len(token_ids_0)) + [1] + return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]: + """Do not add eos again if user already added it.""" + if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: + warnings.warn( + f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated" + " eos tokens being added." + ) + return token_ids + else: + return token_ids + [self.eos_token_id] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. MyT5 does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+ """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] + + # Copied from transformers.models.byt5.tokenization_byt5.ByT5Tokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A sequence has the following format: + + - single sequence: `X ` + - pair of sequences: `A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + token_ids_0 = self._add_eos_if_not_present(token_ids_0) + if token_ids_1 is None: + return token_ids_0 + else: + token_ids_1 = self._add_eos_if_not_present(token_ids_1) + return token_ids_0 + token_ids_1 + + def _tokenize(self, text: str, **kwargs) -> List[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words. + Represents tokens in two character hex format""" + + tokens = [f"{i:02x}" for i in text.encode("utf-8")] + tokens = self.morphological_encode(tokens) + return tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + + if len(token) != 2: + token_id = None + else: + token_id = int(token, 16) + self.offset + + return token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = f"{index - self.offset:02x}" + return token + + def morphological_encode(self, indices: List[str]) -> List[str]: + # Decompose and merge morphological sequences + indices = self.decompose_rewriter.rewrite_bytes(indices, reverse=False) + indices = self.merge_rewriter.rewrite_bytes(indices, reverse=False) + return indices + + def morphological_decode(self, indices: List[str]) -> List[str]: + # Demerge and compose morphological sequences + indices = self.merge_rewriter.rewrite_bytes(indices, reverse=True) + indices = self.decompose_rewriter.rewrite_bytes(indices, reverse=True) + return indices + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + bstring = b"" + + out_tokens = [] + for token in tokens: + if token in self.added_tokens_decoder: + out_tokens.append(self.added_tokens_decoder[token]) + elif token in self.added_tokens_encoder: + out_tokens.append(token) + else: + out_tokens.append(token) + + out_tokens = self.morphological_decode(out_tokens) + _added_tokens = set(self.added_tokens_decoder.values()) | set(self.added_tokens_encoder) + for token in out_tokens: + if token in _added_tokens: + bstring += bytes(token, "utf-8") + else: + bstring += bytes.fromhex(token) + string = bstring.decode("utf-8", errors="ignore") + return string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + 
writer.write(json.dumps(self.byte_maps, indent=2, ensure_ascii=False)) + return (vocab_file,) diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index 9411f0bcae5a50..7d0390adc3c06f 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -43,7 +43,6 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -772,9 +771,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -891,7 +888,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1081,14 +1078,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) - # TODO: remove the float() operation in v4.46 - logits = logits.float() loss = None if labels is not None: diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index c33844da0f55b8..cedefc4f4642f7 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -24,6 +24,7 @@ from ...activations import ACT2FN from ...generation import GenerationMixin from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask from ...modeling_outputs import ( MoEModelOutput, @@ -1370,7 +1371,7 @@ def forward( f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" f" {head_mask.size()[0]}." 
) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: @@ -1380,13 +1381,13 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: + if not skip_the_layer or synced_gpus: layer_head_mask = head_mask[idx] if head_mask is not None else None cross_attn_layer_head_mask = cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None past_key_value = past_key_values[idx] if past_key_values is not None else None - # under deepspeed zero3 all gpus must run in sync + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 668722fc9e3f86..7ab54146c9740b 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -43,7 +43,6 @@ add_start_docstrings_to_model_forward, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -800,9 +799,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -933,7 +930,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1124,13 +1121,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index 875317732ff06b..8c29f89ff3e7ea 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -947,9 +947,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -1091,7 +1089,7 @@ def _update_causal_mask( dtype, device = 
input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] diff --git a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py index b40f09c4e8f2a6..bf9dbd951b5b06 100644 --- a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py @@ -21,7 +21,7 @@ from dataclasses import dataclass from functools import lru_cache from pathlib import Path -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -208,7 +208,10 @@ def load_cuda_kernels(): # Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention def multi_scale_deformable_attention( - value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor + value: Tensor, + value_spatial_shapes: Union[Tensor, List[Tuple]], + sampling_locations: Tensor, + attention_weights: Tensor, ) -> Tensor: batch_size, _, num_heads, hidden_dim = value.shape _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape @@ -418,29 +421,6 @@ def __init__(self, config: OmDetTurboConfig, num_heads: int, n_points: int): self.disable_custom_kernels = config.disable_custom_kernels - self._reset_parameters() - - def _reset_parameters(self): - nn.init.constant_(self.sampling_offsets.weight.data, 0.0) - default_dtype = torch.get_default_dtype() - thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * (2.0 * math.pi / self.n_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = ( - (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) - .view(self.n_heads, 1, 1, 2) - .repeat(1, self.n_levels, self.n_points, 1) - ) - for i in range(self.n_points): - grid_init[:, :, i, :] *= i + 1 - with torch.no_grad(): - self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) - nn.init.constant_(self.attention_weights.weight.data, 0.0) - nn.init.constant_(self.attention_weights.bias.data, 0.0) - nn.init.xavier_uniform_(self.value_proj.weight.data) - nn.init.constant_(self.value_proj.bias.data, 0.0) - nn.init.xavier_uniform_(self.output_proj.weight.data) - nn.init.constant_(self.output_proj.bias.data, 0.0) - def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): return tensor if position_embeddings is None else tensor + position_embeddings diff --git a/src/transformers/models/omdet_turbo/processing_omdet_turbo.py b/src/transformers/models/omdet_turbo/processing_omdet_turbo.py index 909281b0c6867a..e4e2e5197f2e2c 100644 --- a/src/transformers/models/omdet_turbo/processing_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/processing_omdet_turbo.py @@ -16,13 +16,12 @@ Processor class for OmDet-Turbo. 
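Both deformable-attention copies touched in this diff relax `value_spatial_shapes` from a tensor to, optionally, a plain list of `(height, width)` tuples, so the per-level split no longer needs `.item()` calls. A small sketch of that split, with made-up shapes and purely illustrative variable names:

```python
# Illustrative sketch: splitting a flattened multi-scale `value` tensor per level
# when value_spatial_shapes is given as a plain list of (height, width) tuples,
# as now allowed by multi_scale_deformable_attention. All shapes are made up.
import torch

batch_size, num_heads, hidden_dim = 2, 8, 32
value_spatial_shapes = [(16, 16), (8, 8), (4, 4)]    # per-level feature map sizes
total = sum(h * w for h, w in value_spatial_shapes)  # 256 + 64 + 16 = 336

value = torch.randn(batch_size, total, num_heads, hidden_dim)

# With python ints there is no need for tensor .item() calls:
value_list = value.split([h * w for h, w in value_spatial_shapes], dim=1)
for (h, w), v in zip(value_spatial_shapes, value_list):
    print(h, w, v.shape)   # e.g. 16 16 torch.Size([2, 256, 8, 32])
```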
""" -import sys from typing import List, Optional, Tuple, Union from ...feature_extraction_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs +from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import ( TensorType, @@ -31,12 +30,6 @@ ) -if sys.version_info >= (3, 11): - from typing import Unpack -else: - from typing_extensions import Unpack - - class OmDetTurboTextKwargs(TextKwargs, total=False): task: Optional[Union[str, List[str], TextInput, PreTokenizedInput]] diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index 9c2f66220715fa..aeeccb68a92fb7 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -18,7 +18,7 @@ import math import warnings from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -63,11 +63,14 @@ def _get_clones(module, N): # Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention def multi_scale_deformable_attention( - value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor + value: Tensor, + value_spatial_shapes: Union[Tensor, List[Tuple]], + sampling_locations: Tensor, + attention_weights: Tensor, ) -> Tensor: batch_size, _, num_heads, hidden_dim = value.shape _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape - value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1) + value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level_id, (height, width) in enumerate(value_spatial_shapes): diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 0aa02a6f5d8424..02df7f213e2e7d 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -604,6 +604,7 @@ def forward( ) def prepare_inputs_for_generation(self, input_ids: torch.LongTensor, **kwargs) -> Dict[str, Any]: + # Overwritten -- old model with reduced inputs return {"input_ids": input_ids} diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index f7782b8f6172b9..e4ef510f099d66 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -23,7 +23,10 @@ from ...activations import ACT2FN from ...generation import GenerationMixin -from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from ...modeling_attn_mask_utils import ( + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -72,17 +75,21 @@ def __init__(self, num_embeddings: int, embedding_dim: int): self.offset = 2 super().__init__(num_embeddings + self.offset, embedding_dim) - def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0): + def forward( + self, + attention_mask: 
torch.LongTensor, + past_key_values_length: int = 0, + position_ids: Optional[torch.LongTensor] = None, + ): """`input_ids_shape` is expected to be [bsz x seqlen].""" - attention_mask = attention_mask.long() - # create positions depending on attention_mask - positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1 + if position_ids is None: + position_ids = torch.cumsum(attention_mask, dim=1) + position_ids = (position_ids * attention_mask - 1).long() + # cut positions if `past_key_values_length` is > 0 + position_ids = position_ids[:, past_key_values_length:] - # cut positions if `past_key_values_length` is > 0 - positions = positions[:, past_key_values_length:] - - return super().forward(positions + self.offset) + return super().forward(position_ids + self.offset) class OPTAttention(nn.Module): @@ -117,7 +124,7 @@ def __init__( self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int) -> torch.Tensor: return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( @@ -128,6 +135,8 @@ def forward( attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + # isn't needed in normal attention, but needed in flash attention so to keep the signature same + position_ids: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" @@ -265,6 +274,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + position_ids: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" @@ -346,6 +356,7 @@ def forward( value_states, attention_mask, query_length, + position_ids=position_ids, dropout=attn_dropout, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, @@ -360,9 +371,108 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value +class OPTSdpaAttention(OPTAttention): + """ + OPT sdpa attention module. This module inherits from `OPTAttention` as the weights of the module stays untouched. + The only required change would be on the forward pass where it needs to correctly call the public API of sdpa + attention and deal with padding tokens in case the input contains any of them. + """ + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + position_ids: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions or layer_head_mask is not None: + logger.warning_once( + "OPTModel is using SDPA attention, which currently does not support output_attentions=True." + 'failing back to eager attention. remove warning using attn_implementation="eager".' 
+ ) + + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + key_value_states=key_value_states, + ) # TODO after merge add position_ids=position_ids + is_cross_attention = key_value_states is not None + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) * self.scaling + query_states = self._shape(query_states, -1, bsz) + + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + # shape now is (bsz, num_heads, seq_len, head_dim), all are continuous + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.dropout if self.training else 0.0, + is_causal=is_causal, + # this model uses the scaling factor in the query projection for some reason, but not in Q@K^T + # so we need to scale to remove scaling in SDPA to have similar results with eager. 
+ # Maybe needs a change in the model to remove scaling in query projection + scale=1.0, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + attn_output = self.out_proj(attn_output) + + return attn_output, None, past_key_value + + OPT_ATTENTION_CLASSES = { "eager": OPTAttention, "flash_attention_2": OptFlashAttention2, + "sdpa": OPTSdpaAttention, } @@ -392,6 +502,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, + position_ids: Optional[torch.LongTensor] = None, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -419,6 +530,7 @@ def forward( hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, past_key_value=past_key_value, + position_ids=position_ids, attention_mask=attention_mask, layer_head_mask=layer_head_mask, output_attentions=output_attentions, @@ -489,6 +601,7 @@ class OPTPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["OPTDecoderLayer"] _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): std = self.config.init_std @@ -561,6 +674,11 @@ def _init_weights(self, module): more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. for padding use -1. + + [What are position IDs?](../glossary#position-ids) """ @@ -605,6 +723,7 @@ def __init__(self, config: OPTConfig): self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)]) self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self._use_sdpa = config._attn_implementation == "sdpa" self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -616,6 +735,49 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embed_tokens = value + def _update_causal_mask( + self, + inputs_embeds: torch.Tensor, + input_shape: Tuple[int, int], + past_key_values_length: int, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + ): + """ + Updates the causal mask for the decoder. 
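The OPT decoder now derives `position_ids` from the attention mask with the cumulative-sum rule shown in `OPTLearnedPositionalEmbedding.forward` above, keeping the historical `offset = 2` when indexing the embedding table. A condensed sketch of that rule on a toy left-padded batch:

```python
# Sketch of the position-id rule used by the updated OPT code paths above:
# positions count only non-padding tokens, padding positions collapse to -1,
# and the learned embedding later adds its historical offset of 2.
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],     # left-padded sequence
                               [1, 1, 1, 1, 1]])
past_key_values_length = 0

position_ids = torch.cumsum(attention_mask, dim=1)
position_ids = (position_ids * attention_mask - 1).long()
position_ids = position_ids[:, past_key_values_length:]

print(position_ids)
# tensor([[-1, -1,  0,  1,  2],
#         [ 0,  1,  2,  3,  4]])
```

This reproduces what the previous `attention_mask`-only computation produced, so behaviour is unchanged when `position_ids` is not supplied explicitly.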
+ """ + batch_size, seq_length = input_shape + mask_seq_length = past_key_values_length + seq_length + if self._use_flash_attention_2: + # 2d mask is passed through the layers + causal_attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + attention_mask = ( + torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + if attention_mask is None + else attention_mask + ) + + return causal_attention_mask, attention_mask + + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + elif attention_mask.shape[1] != mask_seq_length: + raise ValueError( + f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be " + f"{mask_seq_length} (sum of the lengths of current and past inputs)" + ) + if self._use_sdpa and not output_attentions and head_mask is None: + causal_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + else: + causal_attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + return causal_attention_mask, attention_mask + def forward( self, input_ids: torch.LongTensor = None, @@ -627,6 +789,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + position_ids: Optional[torch.LongTensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: r""" Args: @@ -674,6 +837,11 @@ def forward( for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. for padding use -1. 
+ + [What are position IDs?](../glossary#position-ids) """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -697,34 +865,20 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - batch_size, seq_length = input_shape past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values_length + seq_length + causal_attention_mask, attention_mask = self._update_causal_mask( + inputs_embeds, input_shape, past_key_values_length, attention_mask, head_mask, output_attentions + ) # embed positions - if self._use_flash_attention_2: - # 2d mask is passed through the layers - causal_attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - attention_mask = ( - torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - if attention_mask is None - else attention_mask - ) - else: - # 4d mask is passed through the layers - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - elif attention_mask.shape[1] != mask_seq_length: - raise ValueError( - f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be " - f"{mask_seq_length} (sum of the lengths of current and past inputs)" - ) - causal_attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - pos_embeds = self.embed_positions(attention_mask, past_key_values_length) + if position_ids is None: + position_ids = torch.cumsum(attention_mask, dim=1) + position_ids = (position_ids * attention_mask - 1).long() + # cut positions if `past_key_values_length` is > 0 + position_ids = position_ids[:, past_key_values_length:] + + pos_embeds = self.embed_positions(attention_mask, past_key_values_length, position_ids=position_ids) if self.project_in is not None: inputs_embeds = self.project_in(inputs_embeds) @@ -773,11 +927,13 @@ def forward( None, output_attentions, use_cache, + position_ids, ) else: layer_outputs = decoder_layer( hidden_states, attention_mask=causal_attention_mask, + position_ids=position_ids, layer_head_mask=(head_mask[idx] if head_mask is not None else None), past_key_value=past_key_value, output_attentions=output_attentions, @@ -851,6 +1007,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + position_ids: Optional[torch.LongTensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -863,6 +1020,7 @@ def forward( decoder_outputs = self.decoder( input_ids=input_ids, attention_mask=attention_mask, + position_ids=position_ids, head_mask=head_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, @@ -927,6 +1085,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + position_ids: Optional[torch.LongTensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -982,6 +1141,11 @@ def forward( for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. for padding use -1. + + [What are position IDs?](../glossary#position-ids) Returns: @@ -1012,6 +1176,7 @@ def forward( outputs = self.model.decoder( input_ids=input_ids, attention_mask=attention_mask, + position_ids=position_ids, head_mask=head_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, @@ -1046,36 +1211,6 @@ def forward( attentions=outputs.attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () @@ -1131,6 +1266,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + position_ids: Optional[torch.LongTensor] = None, ) -> Union[Tuple, SequenceClassifierOutputWithPast]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1144,6 +1280,7 @@ def forward( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, + position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, @@ -1248,6 +1385,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + position_ids: Optional[torch.LongTensor] = None, ) -> Union[Tuple, QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1298,6 +1436,7 @@ def forward( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, + position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index b5fddce1d6a914..d75a05bda0e1ec 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -46,7 +46,7 @@ _CONFIG_FOR_DOC = "PaliGemmaConfig" -# Adapted from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position +# Adapted from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position # But Paligemma has no causal mask on prefix def _prepare_4d_causal_attention_mask_with_cache_position( attention_mask: torch.Tensor, @@ -57,8 +57,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( min_dtype: float, cache_position: 
torch.Tensor, batch_size: int, - is_training: bool, - token_type_ids: torch.Tensor, + is_training: bool = False, + token_type_ids: torch.Tensor = None, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -94,7 +94,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( if is_training: causal_mask = torch.triu(causal_mask, diagonal=1) else: - causal_mask = torch.zeros_like(causal_mask) + causal_mask[:, :sequence_length] = 0.0 causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) @@ -378,7 +378,7 @@ def _update_causal_mask( if is_training: causal_mask = torch.triu(causal_mask, diagonal=1) else: - causal_mask = torch.zeros_like(causal_mask) + causal_mask[:, :sequence_length] = 0.0 causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) causal_mask = causal_mask[None, None, :, :].expand(inputs_embeds.shape[0], 1, -1, -1) @@ -453,9 +453,7 @@ def forward( ```""" if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if pixel_values is not None and inputs_embeds is not None: raise ValueError( @@ -534,7 +532,8 @@ def forward( shift_labels = labels[..., 1:] if attention_mask is not None: # we use the input attention mask to shift the logits and labels, because it is 2D. - shift_attention_mask = attention_mask[..., 1:] + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device) shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous() shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() else: @@ -595,7 +594,6 @@ def prepare_inputs_for_generation( dtype = self.get_output_embeddings().weight.dtype min_dtype = torch.finfo(dtype).min - is_training = token_type_ids is not None and kwargs.get("labels", None) is not None model_inputs["attention_mask"] = _prepare_4d_causal_attention_mask_with_cache_position( attention_mask, @@ -606,8 +604,6 @@ def prepare_inputs_for_generation( min_dtype=min_dtype, cache_position=cache_position, batch_size=batch_size, - is_training=is_training, - token_type_ids=token_type_ids, ) model_inputs["token_type_ids"] = token_type_ids diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 03d1574e9be28e..35f91ca7356611 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -1658,39 +1658,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old 
behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - # first step, decoder_cached_states are empty - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids.contiguous()} - - model_inputs.update( - { - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - ) - return model_inputs - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 7d40c481ac0685..7ae3469a4c9399 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -621,9 +621,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -759,7 +757,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index cb59bd0df9a1b4..3f770c9ec00b9b 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -43,7 +43,6 @@ get_torch_version, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -912,9 +911,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -1051,7 +1048,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1247,13 +1244,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py 
index 1c1bb34171b613..0380c6cd49d6ea 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -25,7 +25,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, StaticCache +from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( @@ -41,7 +41,6 @@ add_start_docstrings_to_model_forward, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -939,9 +938,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -1039,7 +1036,6 @@ def forward( attentions=all_self_attns, ) - # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask def _update_causal_mask( self, attention_mask: torch.Tensor, @@ -1058,21 +1054,30 @@ def _update_causal_mask( # to infer the attention mask. past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): if AttentionMaskConverter._ignore_causal_mask_sdpa( attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, is_training=self.training, ): return None dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_length() + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() + # DynamicCache or no cache else: target_length = ( attention_mask.shape[-1] @@ -1089,6 +1094,8 @@ def _update_causal_mask( device=device, cache_position=cache_position, batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, ) if ( @@ -1100,13 +1107,12 @@ def _update_causal_mask( # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
# Details: https://github.com/pytorch/pytorch/issues/110213 - min_dtype = torch.finfo(dtype).min causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask @staticmethod - # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position + # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Phi3 def _prepare_4d_causal_attention_mask_with_cache_position( attention_mask: torch.Tensor, sequence_length: int, @@ -1115,6 +1121,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + config: Phi3Config, + past_key_values: Cache, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1122,13 +1130,11 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Args: attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape - `(batch_size, 1, query_length, key_value_length)`. + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. sequence_length (`int`): The sequence length being processed. target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, - to account for the 0 padding, the part of the cache that is not filled yet. + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. dtype (`torch.dtype`): The dtype to use for the 4D attention mask. device (`torch.device`): @@ -1137,6 +1143,10 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Indices depicting the position of the input sequence tokens in the sequence. batch_size (`torch.Tensor`): Batch size. + config (`Phi3Config`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate """ if attention_mask is not None and attention_mask.dim() == 4: # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
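The Phi3 mask helper now receives `config` and `past_key_values` so that the hunk below can combine the usual causal constraint with a sliding-window constraint. A standalone illustration of that masking rule, with an arbitrary window size and `True` meaning "may not attend":

```python
# Standalone sketch of the diagonal + sliding-window masking rule added in the
# next hunk: position j is blocked if it is in the future (j > i) or, when a
# sliding window is configured, if it is too far in the past (j <= i - window).
import torch

target_length = 8
sliding_window = 4
cache_position = torch.arange(target_length)          # query positions 0..7

cols = torch.arange(target_length)
future_mask = cols > cache_position.reshape(-1, 1)                      # strictly upper triangle
window_mask = cols <= (cache_position.reshape(-1, 1) - sliding_window)  # too far in the past

blocked = future_mask | window_mask
print(blocked.int())
# Row i has ones everywhere except columns i-3 .. i, i.e. each query attends
# only to itself and the 3 previous positions once the window is full.
```

In the model, the blocked positions multiply a tensor pre-filled with the dtype minimum, and the sliding-window term is skipped when a `SlidingWindowCache` already bounds `target_length`. Note also that these modeling files stop upcasting the `lm_head` logits to `float32`; callers that relied on fp32 logits can apply `.float()` to the returned logits themselves.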
@@ -1146,19 +1156,27 @@ def _prepare_4d_causal_attention_mask_with_cache_position( causal_mask = torch.full( (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify is current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask |= sliding_attend_mask + causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] mask_length = attention_mask.shape[-1] padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] padding_mask = padding_mask == 0 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( padding_mask, min_dtype ) - return causal_mask @@ -1277,13 +1295,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: @@ -1324,6 +1337,9 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- this model may need to switch between short and long rope, invalidating the cache in the + # process + # When the first time input length reached long and short factor switching point, enforce re-compute cache # It will cause downside of slower at this single token position, however, better than current failure. if ( @@ -1372,11 +1388,13 @@ def prepare_inputs_for_generation( attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( attention_mask, sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), + target_length=past_key_values.get_max_cache_shape(), dtype=self.lm_head.weight.dtype, device=device, cache_position=cache_position, batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, ) if num_logits_to_keep is not None: diff --git a/src/transformers/models/phimoe/__init__.py b/src/transformers/models/phimoe/__init__.py new file mode 100644 index 00000000000000..e0849f5c5006e5 --- /dev/null +++ b/src/transformers/models/phimoe/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 Microsoft and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_phimoe import * + from .modeling_phimoe import * + +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py new file mode 100644 index 00000000000000..7f304281ae73d8 --- /dev/null +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PyTorch Phi-MoE model.""" + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class PhimoeConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PhimoeModel`]. It is used to instantiate a Phi-moe + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the + [microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct). + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 32064): + Vocabulary size of the Phimoe model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`PhimoeModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 6400): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mixtral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 1000000.0): + The base period of the RoPE embeddings. + rope_scaling (`dict`, *optional*): + The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must + contain the following keys: `type`, `short_factor`, `long_factor`, `short_mscale`, `long_mscale` and + `original_max_position_embeddings`. The `type` must be `longrope`, the `short_mscale` and `long_scale` must + be numbers, the `short_factor` and `long_factor` must be lists of numbers with the same length as half of + the attention head size and the `original_max_position_embeddings` must be an integer. + sliding_window (`int`, *optional*): + Sliding window attention window size. If not specified, will default to `262144`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + num_experts_per_tok (`int`, *optional*, defaults to 2): + The number of experts to root per-token, can be also interpreted as the `top-p` routing + parameter + num_local_experts (`int`, *optional*, defaults to 16): + Number of experts per Sparse MLP layer. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabeling this will also + allow the model to output the auxiliary loss. See [here]() for more details + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + router_jitter_noise (`float`, *optional*, defaults to 0.01): + Amount of noise to add to the router. 
+ input_jitter_noise (`float`, *optional*, defaults to 0.0): Input jitter noise + attention_bias (`bool`, *optional*, defaults to `False`): Attention bias + lm_head_bias (`bool`, *optional*, defaults to `False`): LM head bias + + Example: + + ```python + >>> from transformers import PhimoeModel, PhimoeConfig + >>> # Initializing a Phi-3 style configuration + >>> configuration = PhimoeConfig.from_pretrained("microsoft/Phi-3.5-MoE-instruct") + >>> # Initializing a model from the configuration + >>> model = PhimoeModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "phimoe" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32064, + hidden_size=4096, + intermediate_size=6400, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=1e6, + rope_scaling=None, + sliding_window=None, + attention_dropout=0.0, + num_experts_per_tok=2, + num_local_experts=16, + output_router_logits=False, + router_aux_loss_coef=0.001, + router_jitter_noise=0.01, + input_jitter_noise=0.0, + attention_bias=False, + lm_head_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + self.attention_bias = attention_bias + self.lm_head_bias = lm_head_bias + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + self.num_experts_per_tok = num_experts_per_tok + self.num_local_experts = num_local_experts + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + self.router_jitter_noise = router_jitter_noise + self.input_jitter_noise = input_jitter_noise + + self.rope_scaling = rope_scaling + if isinstance(self.rope_scaling, dict): + if "rope_type" not in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling.get("type", None) + if "original_max_position_embeddings" in self.rope_scaling: + self.original_max_position_embeddings = self.rope_scaling["original_max_position_embeddings"] + rope_scaling_short_mscale = self.rope_scaling.get("short_mscale", None) + rope_scaling_long_mscale = self.rope_scaling.get("long_mscale", None) + if not isinstance(rope_scaling_short_mscale, (int, float)): + raise ValueError( + f"`rope_scaling`'s short_mscale field must be a number, got {rope_scaling_short_mscale}" + ) + if not isinstance(rope_scaling_long_mscale, (int, float)): + raise ValueError( + f"`rope_scaling`'s long_mscale field must be a number, got {rope_scaling_long_mscale}" + ) + + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["PhimoeConfig"] diff --git 
a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py new file mode 100644 index 00000000000000..d1705f04ddb7bb --- /dev/null +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -0,0 +1,1733 @@ +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PyTorch Phimoe model.""" + +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask +from ...modeling_outputs import ( + MoeCausalLMOutputWithPast, + MoeModelOutputWithPast, + SequenceClassifierOutputWithPast, +) +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import is_torch_greater_or_equal_than_1_13 +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_torchdynamo_compiling, + logging, + replace_return_docstrings, +) +from ...utils.import_utils import is_torch_fx_available +from .configuration_phimoe import PhimoeConfig + + +if is_flash_attn_2_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + +# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. +# It means that the function will not be traced through and simply appear as a node in the graph. +if is_torch_fx_available(): + if not is_torch_greater_or_equal_than_1_13: + import torch.fx + + _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "PhimoeConfig" + + +# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func +def load_balancing_loss_func( + gate_logits: Union[torch.Tensor, Tuple[torch.Tensor], None], + num_experts: Optional[int] = None, + top_k=2, + attention_mask: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, int]: + r""" + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch. + + See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + + Args: + gate_logits: + Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of + shape [batch_size X sequence_length, num_experts]. + num_experts: + Number of experts + top_k: + The number of experts to route per-token, can be also interpreted as the `top-k` routing + parameter. 
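For intuition about the quantity this function computes, the following self-contained sketch evaluates the same Switch-Transformer-style auxiliary loss for a single layer of made-up gate logits (no padding mask): the fraction of tokens routed to each expert times the mean router probability for that expert, summed and scaled by the number of experts.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
num_experts, top_k = 4, 2
gate_logits = torch.randn(8, num_experts)                # [tokens, num_experts], one layer
routing_weights = F.softmax(gate_logits, dim=-1)
_, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
expert_mask = F.one_hot(selected_experts, num_experts)   # [tokens, top_k, num_experts]

tokens_per_expert = expert_mask.float().mean(dim=0)      # fraction of tokens per expert
router_prob_per_expert = routing_weights.mean(dim=0)     # mean routing probability
aux_loss = (tokens_per_expert * router_prob_per_expert.unsqueeze(0)).sum() * num_experts
print(aux_loss)  # close to `top_k` when routing is well balanced
```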
+ attention_mask (`torch.Tensor`, *optional*): + The attention_mask used in forward function + shape [batch_size X sequence_length] if not None. + + Returns: + The auxiliary loss. + """ + if gate_logits is None or not isinstance(gate_logits, tuple): + return 0 + + if isinstance(gate_logits, tuple): + compute_device = gate_logits[0].device + concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) + + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + + _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) + + expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts) + + if attention_mask is None: + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.mean(routing_weights, dim=0) + else: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length) + + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = ( + attention_mask[None, :, :, None, None] + .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts)) + .reshape(-1, top_k, num_experts) + .to(compute_device) + ) + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( + expert_attention_mask, dim=0 + ) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, num_experts)) + .reshape(-1, num_experts) + .to(compute_device) + ) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum( + router_per_expert_attention_mask, dim=0 + ) + + overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0)) + return overall_loss * num_experts + + +class PhimoeRotaryEmbedding(nn.Module): + def __init__( + self, + config: Optional[PhimoeConfig] = None, + ): + super().__init__() + + self.config = config + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + self.short_mscale = config.rope_scaling.get("short_mscale") + self.long_mscale = config.rope_scaling.get("long_mscale") + else: + self.rope_type = "default" + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + def forward(self, x, seq_len=None): + mscale = None + if self.config.rope_scaling and seq_len: + mscale = ( + self.long_mscale + if seq_len > self.config.rope_scaling["original_max_position_embeddings"] + else self.short_mscale + ) + inv_freq, attention_scaling = self.rope_init_fn(self.config, x.device, seq_len) + mscale = attention_scaling if mscale is None else mscale + t = torch.arange(seq_len, device=x.device, dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + + emb = torch.cat((freqs, freqs), dim=-1) + return (emb.cos() * mscale).to(x.dtype), (emb.sin() * mscale).to(x.dtype) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., 
x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class PhimoeAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: PhimoeConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=self.config.attention_bias) + self.k_proj = nn.Linear( + self.hidden_size, self.num_key_value_heads * self.head_dim, bias=self.config.attention_bias + ) + self.v_proj = nn.Linear( + self.hidden_size, self.num_key_value_heads * self.head_dim, bias=self.config.attention_bias + ) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=self.config.attention_bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, 
but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class PhimoeFlashAttention2(PhimoeAttention): + """ + Phimoe flash attention module. This module inherits from `PhimoeAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ): + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. 
Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + sliding_window=getattr(self.config, "sliding_window", None), + is_causal=self.is_causal, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class PhimoeSdpaAttention(PhimoeAttention): + """ + Phimoe attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `PhimoeAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from PhimoeAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "PhimoeModel is using PhimoeSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + position_embeddings=position_embeddings, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
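A standalone sanity check (not part of the model code) of the dispatch discussed in the comment above: for the no-mask, `q_len > 1` case, SDPA with `is_causal=True` matches the eager softmax(QK^T / sqrt(d) + causal_mask) V math used in `PhimoeAttention.forward`. Shapes are arbitrary.

```python
import math
import torch
import torch.nn.functional as F

torch.manual_seed(0)
q = torch.randn(1, 8, 5, 64)   # (batch, num_heads, seq_len, head_dim)
k = torch.randn(1, 8, 5, 64)
v = torch.randn(1, 8, 5, 64)

# Eager path: additive causal mask with -inf above the diagonal.
attn_weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(q.shape[-1])
causal_mask = torch.full((5, 5), float("-inf")).triu(diagonal=1)
eager = torch.matmul(F.softmax(attn_weights + causal_mask, dim=-1), v)

# SDPA path dispatched through `is_causal=True`.
sdpa = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(torch.allclose(eager, sdpa, atol=1e-5))  # expected: True
```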
+ is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +PHIMOE_ATTENTION_CLASSES = { + "eager": PhimoeAttention, + "flash_attention_2": PhimoeFlashAttention2, + "sdpa": PhimoeSdpaAttention, +} + + +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralBlockSparseTop2MLP with Mixtral->Phimoe +class PhimoeBlockSparseTop2MLP(nn.Module): + def __init__(self, config: PhimoeConfig): + super().__init__() + self.ffn_dim = config.intermediate_size + self.hidden_dim = config.hidden_size + + self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) + self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False) + self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states): + current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states) + current_hidden_states = self.w2(current_hidden_states) + return current_hidden_states + + +class MultiplierProcessor(torch.autograd.Function): + @staticmethod + def forward( + ctx, + scores: torch.Tensor, + multiplier: torch.Tensor, + selected_experts: torch.Tensor, + masked_gates: torch.Tensor, + mask_for_one: torch.Tensor, + ): + """ + Forward pass for the custom autograd function. + + Args: + ctx: Context object to save information for backward computation. + scores (torch.Tensor): Input scores tensor. + multiplier (torch.Tensor): Multiplier tensor. + selected_experts (torch.Tensor): Tensor of selected experts. + masked_gates (torch.Tensor): Masked gates tensor. + mask_for_one (torch.Tensor): Mask for one tensor. + + Returns: + torch.Tensor: Result of the forward pass. + """ + ctx.save_for_backward(multiplier, selected_experts, masked_gates) + return multiplier * mask_for_one + + @staticmethod + def backward( + ctx, + grad_at_output: torch.Tensor, + ): + """ + Backward pass for the custom autograd function. + + Args: + ctx: Context object with saved tensors from the forward pass. + grad_at_output (torch.Tensor): Gradient at the output. + + Returns: + Tuple[torch.Tensor, None, None, None, None]: Gradients for the inputs. + """ + multiplier, selected_experts, masked_gates = ctx.saved_tensors + + grad_at_output = grad_at_output * multiplier + + grad_at_scores_expanded = masked_gates * grad_at_output.mul(-1) + grad_at_scores_expanded.scatter_add_( + dim=-1, + index=selected_experts, + src=grad_at_output, + ) + + return ( + grad_at_scores_expanded, + None, + None, + None, + None, + ) + + +def sparsemixer(scores, jitter_eps, training, top_k=2): + """ + Sparse mixer function to select top-k experts and compute multipliers. + Based on the paper: https://arxiv.org/pdf/2409.12136 + We first replace the TopK(·) function as random sampling of discrete variables + in model training. Then, following Liu et al. (2023a) and Liu et al. (2023b), we apply Heun's + third order method to approximate the expert routing gradient and construct a modified + back-propagation to give a mathematically sound gradient estimation for expert routing. + + Args: + scores (torch.Tensor): Input scores tensor. 
+ jitter_eps (float): Jitter epsilon for numerical stability. + training (bool): Flag indicating if the model is in training mode. + top_k (int): Number of top experts to select. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Multiplier and selected experts tensors. + """ + if top_k != 2: + raise ValueError("top_k must be equal to 2") + + # first expert + + with torch.no_grad(): + # Compute mask for sparsity + mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True) + factor = scores.abs().clamp(min=mask_logits_threshold) + mask_logits_threshold = ((mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + + # Apply mask + masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf")) + if training: + selected_experts = ( + ( + masked_gates + - torch.empty_like(masked_gates, memory_format=torch.legacy_contiguous_format).exponential_().log() + ) + .max(dim=-1)[1] + .unsqueeze(-1) + ) # Gumbel sampling, more robust than the multinomial method + else: + selected_experts = max_ind + + # Compute scores for gradients + masked_gates = torch.softmax(masked_gates, dim=-1) + multiplier_o = masked_gates.gather(dim=-1, index=selected_experts) + + if training: + # Compute midpoint mask + max_scores, max_ind = masked_gates.max(dim=-1, keepdim=True) + mask_for_one = torch.logical_or( + selected_experts == max_ind, + torch.rand_like(max_scores) > 0.75, # Heun's third-order method + ) + # 1 -> 1.0 & 0 -> 1./3: lambda x: (x + 0.5) / 1.5 + mask_for_one = torch.add(0.3333, mask_for_one, alpha=0.6667).type_as(masked_gates) + + multiplier = MultiplierProcessor.apply( + scores, + multiplier_o, + selected_experts, + masked_gates, + mask_for_one, + ) + else: + multiplier = multiplier_o + + # Masked out first expert + masked_scores = torch.scatter( + scores, + -1, + selected_experts, + float("-inf"), + ) + with torch.no_grad(): + # Compute mask for sparsity + mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True) + factor = scores.abs().clamp(min=mask_logits_threshold) + mask_logits_threshold = ((mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + + # Apply mask + masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, float("-inf")) + if training: + selected_experts_top2 = ( + ( + masked_gates_top2 + - torch.empty_like(masked_gates_top2, memory_format=torch.legacy_contiguous_format) + .exponential_() + .log() + ) + .max(dim=-1)[1] + .unsqueeze(-1) + ) # Gumbel sampling, more robust than the multinomial method + else: + selected_experts_top2 = max_ind + # Compute scores for gradients + masked_gates_top2 = torch.softmax(masked_gates_top2, dim=-1) + multiplier_top2_o = masked_gates_top2.gather(dim=-1, index=selected_experts_top2) + + if training: + # Compute midpoint mask + max_scores, max_ind = masked_gates_top2.max(dim=-1, keepdim=True) + mask_for_one_top2 = torch.logical_or( + selected_experts_top2 == max_ind, + torch.rand_like(max_scores).uniform_() > 0.75, # Heun's third-order method + ) + # 1 -> 1.0 & 0 -> 1./3: lambda x: (x + 0.5) / 1.5 + mask_for_one_top2 = torch.add(0.3333, mask_for_one_top2, alpha=0.6667).type_as(masked_gates_top2) + + multiplier_top2 = MultiplierProcessor.apply( + scores, + multiplier_top2_o, + selected_experts_top2, + masked_gates_top2, + mask_for_one_top2, + ) + else: + multiplier_top2 = multiplier_top2_o + + multiplier = torch.concat((multiplier, multiplier_top2), dim=-1) + selected_experts = torch.concat((selected_experts, selected_experts_top2), dim=-1) + + return ( + multiplier, + selected_experts, + ) + + +class 
PhimoeSparseMoeBlock(nn.Module): + """ + This implementation is + strictly equivalent to standard MoE with full capacity (no + dropped tokens). It's faster since it formulates MoE operations + in terms of block-sparse operations to accomodate imbalanced + assignments of tokens to experts, whereas standard MoE either + (1) drop tokens at the cost of reduced performance or (2) set + capacity factor to number of experts and thus waste computation + and memory on padding. + """ + + def __init__(self, config): + super().__init__() + self.hidden_dim = config.hidden_size + self.ffn_dim = config.intermediate_size + self.num_experts = config.num_local_experts + self.top_k = config.num_experts_per_tok + # gating + self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False) + + self.experts = nn.ModuleList([PhimoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)]) + + # Jitter parameters + self.router_jitter_noise = config.router_jitter_noise + self.input_jitter_noise = config.input_jitter_noise + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ """ + batch_size, sequence_length, hidden_dim = hidden_states.shape + if self.training and self.input_jitter_noise > 0: + hidden_states *= torch.empty_like(hidden_states).uniform_( + 1.0 - self.input_jitter_noise, 1.0 + self.input_jitter_noise + ) + hidden_states = hidden_states.view(-1, hidden_dim) + router_logits = self.gate(hidden_states) + + routing_weights, selected_experts = sparsemixer( + router_logits, + jitter_eps=self.router_jitter_noise, + training=self.training, + ) + + final_hidden_states = torch.zeros( + (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated + expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + + # Loop over all available experts in the model and perform the computation on each expert + for expert_idx in range(self.num_experts): + expert_layer = self.experts[expert_idx] + idx, top_x = torch.where(expert_mask[expert_idx]) + + if top_x.shape[0] == 0: + continue + + # Index the correct hidden states and compute the expert hidden state for + # the current expert. We need to make sure to multiply the output hidden + # states by `routing_weights` on the corresponding tokens (top-1 and top-2) + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] + + # However `index_add_` only support torch tensors for indexing so we'll use + # the `top_x` tensor here. 
+ final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype)) + final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) + return final_hidden_states, router_logits + + +class PhimoeDecoderLayer(nn.Module): + def __init__(self, config: PhimoeConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = PHIMOE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + + self.block_sparse_moe = PhimoeSparseMoeBlock(config) + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True) + self.post_attention_layernorm = nn.LayerNorm( + config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, and + should not be returned during inference. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. 
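To make the dispatch bookkeeping in `PhimoeSparseMoeBlock.forward` easier to follow, here is a toy version of the same one-hot / `torch.where` / `index_add_` pattern with made-up sizes and identity "experts"; it is illustrative only and not part of this file.

```python
import torch

torch.manual_seed(0)
num_tokens, hidden_dim, num_experts, top_k = 5, 4, 3, 2
hidden_states = torch.randn(num_tokens, hidden_dim)
routing_weights = torch.rand(num_tokens, top_k)                        # per-token weights
selected_experts = torch.randint(0, num_experts, (num_tokens, top_k))  # per-token choices

final = torch.zeros(num_tokens, hidden_dim)
# expert_mask[e, k, t] == 1 when token t picked expert e in routing slot k.
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=num_experts).permute(2, 1, 0)
for expert_idx in range(num_experts):
    idx, top_x = torch.where(expert_mask[expert_idx])  # slot indices, token indices
    if top_x.shape[0] == 0:
        continue
    expert_out = hidden_states[top_x]                  # identity "expert" for the sketch
    final.index_add_(0, top_x, expert_out * routing_weights[top_x, idx, None])
print(final.shape)  # torch.Size([5, 4])
```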
+ kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states, router_logits = self.block_sparse_moe(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if output_router_logits: + outputs += (router_logits,) + + return outputs + + +PHIMOE_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + Parameters: + config ([`PhimoeConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Phimoe Model outputting raw hidden-states without any specific head on top.", + PHIMOE_START_DOCSTRING, +) +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralPreTrainedModel with Mixtral->Phimoe +class PhimoePreTrainedModel(PreTrainedModel): + config_class = PhimoeConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["PhimoeDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +PHIMOE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, and + should not be returned during inference. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
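As a usage sketch for the inputs documented above (in particular `past_key_values` and `use_cache`), the snippet below runs a tiny, randomly initialized model with a `DynamicCache`. The sizes are arbitrary and chosen only so the example runs quickly, and it assumes a transformers install that already includes this model; it is not a real checkpoint.

```python
import torch
from transformers import DynamicCache, PhimoeConfig, PhimoeModel

# Tiny configuration with made-up sizes (not a real checkpoint).
config = PhimoeConfig(
    vocab_size=128,
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=2,
    num_local_experts=4,
    max_position_embeddings=256,
)
model = PhimoeModel(config).eval()

input_ids = torch.randint(0, config.vocab_size, (1, 5))
with torch.no_grad():
    outputs = model(input_ids=input_ids, past_key_values=DynamicCache(), use_cache=True)

print(outputs.last_hidden_state.shape)           # torch.Size([1, 5, 64])
print(outputs.past_key_values.get_seq_length())  # 5
```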
+""" + + +@add_start_docstrings( + "The bare Phimoe Model outputting raw hidden-states without any specific head on top.", + PHIMOE_START_DOCSTRING, +) +class PhimoeModel(PhimoePreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PhimoeDecoderLayer`] + Args: + config: PhimoeConfig + """ + + def __init__(self, config: PhimoeConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [PhimoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True) + self.rotary_emb = PhimoeRotaryEmbedding(config=config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHIMOE_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, MoeModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + hidden_states = inputs_embeds + + position_embeddings = self.rotary_emb(hidden_states, seq_len=cache_position[-1] + 1) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_router_logits = () if output_router_logits else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + output_router_logits, + use_cache, + cache_position, + position_embeddings, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + output_router_logits=output_router_logits, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if output_router_logits: + all_router_logits += (layer_outputs[-1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits] + if v is not None + ) + return MoeModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + router_logits=all_router_logits, + ) + + # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() + # DynamicCache or no cache + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Phimoe + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + config: PhimoeConfig, + past_key_values: Cache, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. 
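A toy illustration of the mask shape described in this docstring: a 2D padding mask expanded into the additive 4D causal mask `(batch_size, 1, query_length, key_value_length)`. The numbers are made up and the sliding-window handling is omitted.

```python
import torch

min_dtype = torch.finfo(torch.float32).min
attention_mask = torch.tensor([[1, 1, 1, 0]])                  # last position is padding
causal_mask = torch.full((4, 4), min_dtype).triu(diagonal=1)   # block future positions
causal_mask = causal_mask[None, None, :, :].expand(1, 1, 4, 4).clone()
causal_mask = causal_mask.masked_fill(attention_mask[:, None, None, :] == 0, min_dtype)

print(causal_mask.shape)                 # torch.Size([1, 1, 4, 4])
print((causal_mask == 0).long()[0, 0])   # 1 where a query position may attend to a key
```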
+ batch_size (`torch.Tensor`): + Batch size. + config (`PhimoeConfig`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify is current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask |= sliding_attend_mask + causal_mask *= diagonal_attend_mask + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + return causal_mask + + +class PhimoeForCausalLM(PhimoePreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = PhimoeModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=self.config.lm_head_bias) + self.router_aux_loss_coef = config.router_aux_loss_coef + self.num_experts = config.num_local_experts + self.num_experts_per_tok = config.num_experts_per_tok + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings + def get_input_embeddings(self): + return self.model.embed_tokens + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings + def get_output_embeddings(self): + return self.lm_head + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder + def set_decoder(self, decoder): + self.model = decoder + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(PHIMOE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + # Ignore copy + def forward( + self, + 
input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + ) -> Union[Tuple, MoeCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + Returns: + Example: + ```python + >>> from transformers import AutoTokenizer, PhimoeForCausalLM + >>> model = PhimoeForCausalLM.from_pretrained("microsoft/Phi-3.5-MoE-instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct") + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + if ( + use_cache + and self.config.rope_scaling + and cache_position is not None + and cache_position[0] == self.config.original_max_position_embeddings + ): + logger.warning( + f"If you are not using the generate method, you may encounter nonsensical outputs after the {self.config.original_max_position_embeddings}th token, as the KV cache needs to be recomputed." 
+ ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + if labels is None and not is_torchdynamo_compiling(): + logger.warning_once( + "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" + ) + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + # TODO: remove the float() operation in v4.46 + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + + loss = None + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits if return_dict else outputs[-1], + self.num_experts, + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device + + if not return_dict: + output = (logits,) + outputs[1:] + if output_router_logits: + output = (aux_loss,) + output + return (loss,) + output if loss is not None else output + + return MoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) + + # Copied from transformers.models.phi3.modeling_phi3.Phi3ForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + num_logits_to_keep=None, + **kwargs, + ): + # Overwritten -- this model may need to switch between short and long rope, invalidating the cache in the + # process + + # When the first time input length reached long and short factor switching point, enforce re-compute cache + # It will cause downside of slower at this single token position, however, better than current failure. 
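As a rough standalone illustration of the condition added just below (toy threshold; `should_drop_cache` is a hypothetical helper written for this note, not part of the patch), the KV cache is discarded exactly once, at the step where the prompt first crosses the long/short RoPE switching point:

```python
# Hypothetical, simplified sketch of the cache-invalidation rule in the hunk below.
# Toy threshold; in the real config this is `original_max_position_embeddings`.
ORIGINAL_MAX_POSITION_EMBEDDINGS = 4096


def should_drop_cache(has_cache: bool, uses_rope_scaling: bool, input_length: int, past_length: int) -> bool:
    """Return True when the cached keys/values were built with the 'short' RoPE factor
    but the prompt has just crossed the switching point, so the cache must be recomputed."""
    crossed_switch_point = input_length >= ORIGINAL_MAX_POSITION_EMBEDDINGS + 1
    cache_uses_short_factor = past_length <= ORIGINAL_MAX_POSITION_EMBEDDINGS
    return has_cache and uses_rope_scaling and crossed_switch_point and cache_uses_short_factor


print(should_drop_cache(True, True, input_length=4097, past_length=4096))  # True: drop and recompute once
print(should_drop_cache(True, True, input_length=5000, past_length=4999))  # False: cache already uses the long factor
```

After that single slower step, the cache is rebuilt with the long factor and subsequent tokens proceed normally.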
+ if ( + past_key_values + and self.config.rope_scaling + and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1 + ): + past_length = cache_position[0] + if past_length <= self.config.original_max_position_embeddings: + past_key_values = None + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + if past_key_values is not None: + if inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + position_ids = position_ids.clone(memory_format=torch.contiguous_format) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # The clone here is for the same reason as for `position_ids`. + model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} + + if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.get_max_cache_shape(), + dtype=self.lm_head.weight.dtype, + device=device, + cache_position=cache_position, + batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, + ) + + if num_logits_to_keep is not None: + model_inputs["num_logits_to_keep"] = num_logits_to_keep + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + +@add_start_docstrings( + """ + The Phimoe Model transformer with a sequence classification head on top (linear layer). + [`PhimoeForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. 
If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + PHIMOE_START_DOCSTRING, +) + +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phimoe, LLAMA->PHIMOE +class PhimoeForSequenceClassification(PhimoePreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = PhimoeModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHIMOE_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = [ + "PhimoePreTrainedModel", + "PhimoeModel", + "PhimoeForCausalLM", + "PhimoeForSequenceClassification", +] diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index c6d18420bec575..a75704fc3dbac8 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -86,10 +86,10 @@ def to(self, *args, **kwargs) -> "BatchMixFeature": new_data[k] = [ element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) ] - elif torch.is_floating_point(v): + elif isinstance(v, torch.Tensor) and torch.is_floating_point(v): # cast and send to device new_data[k] = v.to(*args, **kwargs) - elif device is not None: + elif isinstance(v, torch.Tensor) and device is not None: new_data[k] = v.to(device=device) else: new_data[k] = v diff --git a/src/transformers/models/pixtral/processing_pixtral.py 
b/src/transformers/models/pixtral/processing_pixtral.py index d336760c6d9c66..70d28fb7b79c93 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -90,10 +90,10 @@ def to(self, *args, **kwargs) -> "BatchMixFeature": new_data[k] = [ element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) ] - elif torch.is_floating_point(v): + elif isinstance(v, torch.Tensor) and torch.is_floating_point(v): # cast and send to device new_data[k] = v.to(*args, **kwargs) - elif device is not None: + elif isinstance(v, torch.Tensor) and device is not None: new_data[k] = v.to(device=device) else: new_data[k] = v diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index d15e079770a3ed..4f6984a7bef638 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -1747,32 +1747,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. 
input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 7d23088f6e575e..003e4f15d2d977 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -2290,6 +2290,8 @@ def prepare_inputs_for_generation( use_cache=None, **kwargs, ): + # Overwritten -- our tests complain if we use GenerationMixin.prepare_inputs_for_generation + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: attention_mask = input_ids.new_ones(input_ids.shape) diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 9a970a4a1b2fc6..50f273ba766ca9 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -28,7 +28,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, StaticCache +from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( @@ -44,7 +44,6 @@ add_start_docstrings_to_model_forward, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -852,9 +851,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -957,7 +954,7 @@ def forward( attentions=all_self_attns, ) - # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask def _update_causal_mask( self, attention_mask: torch.Tensor, @@ -976,22 +973,30 @@ def _update_causal_mask( # to infer the attention mask. 
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): if AttentionMaskConverter._ignore_causal_mask_sdpa( attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, is_training=self.training, ): return None dtype, device = input_tensor.dtype, input_tensor.device - + min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_length() + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() + # DynamicCache or no cache else: target_length = ( attention_mask.shape[-1] @@ -1008,6 +1013,8 @@ def _update_causal_mask( device=device, cache_position=cache_position, batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, ) if ( @@ -1019,13 +1026,12 @@ def _update_causal_mask( # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. # Details: https://github.com/pytorch/pytorch/issues/110213 - min_dtype = torch.finfo(dtype).min causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask @staticmethod - # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position + # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Qwen2 def _prepare_4d_causal_attention_mask_with_cache_position( attention_mask: torch.Tensor, sequence_length: int, @@ -1034,6 +1040,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + config: Qwen2Config, + past_key_values: Cache, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1041,13 +1049,11 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Args: attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape - `(batch_size, 1, query_length, key_value_length)`. + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. sequence_length (`int`): The sequence length being processed. target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, - to account for the 0 padding, the part of the cache that is not filled yet. + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. dtype (`torch.dtype`): The dtype to use for the 4D attention mask. 
device (`torch.device`): @@ -1056,6 +1062,10 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Indices depicting the position of the input sequence tokens in the sequence. batch_size (`torch.Tensor`): Batch size. + config (`Qwen2Config`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate """ if attention_mask is not None and attention_mask.dim() == 4: # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. @@ -1065,19 +1075,27 @@ def _prepare_4d_causal_attention_mask_with_cache_position( causal_mask = torch.full( (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify is current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask |= sliding_attend_mask + causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] mask_length = attention_mask.shape[-1] padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] padding_mask = padding_mask == 0 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( padding_mask, min_dtype ) - return causal_mask @@ -1180,13 +1198,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: @@ -1215,6 +1228,79 @@ def forward( attentions=outputs.attentions, ) + # Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + num_logits_to_keep=None, + **kwargs, + ): + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + if past_key_values is not None: + if inputs_embeds is not None: # Exception 1 + input_ids = 
input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + position_ids = position_ids.clone(memory_format=torch.contiguous_format) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # `contiguous()` needed for compilation use cases + model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} + + if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.get_max_cache_shape(), + dtype=self.lm_head.weight.dtype, + device=device, + cache_position=cache_position, + batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, + ) + + if num_logits_to_keep is not None: + model_inputs["num_logits_to_keep"] = num_logits_to_keep + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + @add_start_docstrings( """ diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index bf48e1c6a97ef9..6422baac5feb5e 100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -291,7 +291,7 @@ def forward( causal_mask = attention_mask if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + causal_mask = attention_mask[:, : key_states.shape[-2]] # In PEFT, usually we cast the layer norms in float32 for training stability reasons # therefore the input hidden states gets silently casted in float32. 
Hence, we need @@ -325,7 +325,7 @@ def forward( value_states, causal_mask, tgt_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 2274e96245d3c4..2ab13b7227ada6 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -29,7 +29,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, StaticCache +from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( @@ -45,7 +45,6 @@ add_start_docstrings_to_model_forward, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -1022,9 +1021,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -1138,7 +1135,7 @@ def forward( router_logits=all_router_logits, ) - # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask def _update_causal_mask( self, attention_mask: torch.Tensor, @@ -1157,21 +1154,30 @@ def _update_causal_mask( # to infer the attention mask. 
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): if AttentionMaskConverter._ignore_causal_mask_sdpa( attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, is_training=self.training, ): return None dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_length() + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() + # DynamicCache or no cache else: target_length = ( attention_mask.shape[-1] @@ -1188,6 +1194,8 @@ def _update_causal_mask( device=device, cache_position=cache_position, batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, ) if ( @@ -1199,13 +1207,12 @@ def _update_causal_mask( # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. # Details: https://github.com/pytorch/pytorch/issues/110213 - min_dtype = torch.finfo(dtype).min causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask @staticmethod - # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position + # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Qwen2Moe def _prepare_4d_causal_attention_mask_with_cache_position( attention_mask: torch.Tensor, sequence_length: int, @@ -1214,6 +1221,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + config: Qwen2MoeConfig, + past_key_values: Cache, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1221,13 +1230,11 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Args: attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape - `(batch_size, 1, query_length, key_value_length)`. + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. sequence_length (`int`): The sequence length being processed. target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, - to account for the 0 padding, the part of the cache that is not filled yet. + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. dtype (`torch.dtype`): The dtype to use for the 4D attention mask. 
device (`torch.device`): @@ -1236,6 +1243,10 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Indices depicting the position of the input sequence tokens in the sequence. batch_size (`torch.Tensor`): Batch size. + config (`Qwen2MoeConfig`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate """ if attention_mask is not None and attention_mask.dim() == 4: # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. @@ -1245,19 +1256,27 @@ def _prepare_4d_causal_attention_mask_with_cache_position( causal_mask = torch.full( (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify is current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask |= sliding_attend_mask + causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] mask_length = attention_mask.shape[-1] padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] padding_mask = padding_mask == 0 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( padding_mask, min_dtype ) - return causal_mask @@ -1368,13 +1387,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: @@ -1418,6 +1432,79 @@ def forward( router_logits=outputs.router_logits, ) + # Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + num_logits_to_keep=None, + **kwargs, + ): + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + if past_key_values is not None: + if inputs_embeds is not None: # Exception 1 + 
input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + position_ids = position_ids.clone(memory_format=torch.contiguous_format) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # `contiguous()` needed for compilation use cases + model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} + + if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.get_max_cache_shape(), + dtype=self.lm_head.weight.dtype, + device=device, + cache_position=cache_position, + batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, + ) + + if num_logits_to_keep is not None: + model_inputs["num_logits_to_keep"] = num_logits_to_keep + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + @add_start_docstrings( """ diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 27615eb789f0b0..1349006e768cd4 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -235,11 +235,13 @@ def __init__( # Validate the correctness of rotary position embeddings parameters # BC: if there is a 'type' field, move it to 'rope_type'. - # and change type from 'mrope' to 'default' + # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations + # one can set it to "linear"/"dynamic" etc.
to have scaled RoPE + # TODO: @raushan update config in the hub if self.rope_scaling is not None and "type" in self.rope_scaling: if self.rope_scaling["type"] == "mrope": self.rope_scaling["type"] = "default" self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) + rope_config_validation(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index e00aac0b7b67eb..283e38d3a7d508 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -30,7 +30,7 @@ from torch.nn import CrossEntropyLoss, LayerNorm from ...activations import ACT2FN -from ...cache_utils import Cache, StaticCache +from ...cache_utils import Cache, SlidingWindowCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import ( AttentionMaskConverter, @@ -1126,9 +1126,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -1219,7 +1217,7 @@ def forward( attentions=all_self_attns, ) - # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask def _update_causal_mask( self, attention_mask: torch.Tensor, @@ -1238,21 +1236,30 @@ def _update_causal_mask( # to infer the attention mask. past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): if AttentionMaskConverter._ignore_causal_mask_sdpa( attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, is_training=self.training, ): return None dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_length() + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() + # DynamicCache or no cache else: target_length = ( attention_mask.shape[-1] @@ -1269,6 +1276,8 @@ def _update_causal_mask( device=device, cache_position=cache_position, batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, ) if ( @@ -1280,13 +1289,12 @@ def _update_causal_mask( # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. 
This is required by F.scaled_dot_product_attention memory-efficient attention path. # Details: https://github.com/pytorch/pytorch/issues/110213 - min_dtype = torch.finfo(dtype).min causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask @staticmethod - # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position + # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Qwen2VL def _prepare_4d_causal_attention_mask_with_cache_position( attention_mask: torch.Tensor, sequence_length: int, @@ -1295,6 +1303,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + config: Qwen2VLConfig, + past_key_values: Cache, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1302,13 +1312,11 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Args: attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape - `(batch_size, 1, query_length, key_value_length)`. + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. sequence_length (`int`): The sequence length being processed. target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, - to account for the 0 padding, the part of the cache that is not filled yet. + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. dtype (`torch.dtype`): The dtype to use for the 4D attention mask. device (`torch.device`): @@ -1317,6 +1325,10 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Indices depicting the position of the input sequence tokens in the sequence. batch_size (`torch.Tensor`): Batch size. + config (`Qwen2VLConfig`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate """ if attention_mask is not None and attention_mask.dim() == 4: # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
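Before the next hunk, a short self-contained sketch of the sliding-window masking rule it introduces may help (toy sizes and illustrative variable names only; the real helper also folds in the 2D padding mask and fills blocked positions with the dtype minimum rather than keeping booleans):

```python
import torch

# Illustrative-only reconstruction of the sliding-window causal rule (not the library helper).
sliding_window = 3
target_length = 8                       # width of the (static) key/value cache
cache_position = torch.arange(5)        # absolute positions of the query tokens in this step

key_positions = torch.arange(target_length)
# True where a key must NOT be attended to:
future_keys = key_positions > cache_position.reshape(-1, 1)                        # causal part
out_of_window = key_positions <= (cache_position.reshape(-1, 1) - sliding_window)  # sliding-window part
blocked = future_keys | out_of_window

print(blocked.int())
# Each query row i is left attending only to keys j with i - sliding_window < j <= i;
# in the actual mask these True entries are multiplied into a tensor pre-filled with
# torch.finfo(dtype).min, so blocked positions become large negative attention biases.
```

When a `SlidingWindowCache` is in use and the query fits inside it, the hunk skips the extra sliding term, since that cache already holds no keys older than the window.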
@@ -1326,19 +1338,27 @@ def _prepare_4d_causal_attention_mask_with_cache_position( causal_mask = torch.full( (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify is current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask |= sliding_attend_mask + causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] mask_length = attention_mask.shape[-1] padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] padding_mask = padding_mask == 0 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( padding_mask, min_dtype ) - return causal_mask @@ -1690,14 +1710,24 @@ def forward( if pixel_values is not None: pixel_values = pixel_values.type(self.visual.get_dtype()) image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) - image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds) + image_mask = ( + (input_ids == self.config.image_token_id) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) if pixel_values_videos is not None: pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype()) video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw) - video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1).expand_as(inputs_embeds) + video_mask = ( + (input_ids == self.config.video_token_id) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds) @@ -1807,11 +1837,13 @@ def prepare_inputs_for_generation( attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( attention_mask, sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), + target_length=past_key_values.get_max_cache_shape(), dtype=self.lm_head.weight.dtype, device=device, cache_position=cache_position, batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, ) model_inputs.update( diff --git a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py index 48516e6aa31d7d..6c0e8d98014ede 100644 --- a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py @@ -150,7 +150,6 @@ def __call__( index += 1 text[i] 
= text[i].replace("<|placeholder|>", "<|video_pad|>") - _ = output_kwargs["text_kwargs"].pop("padding_side", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index e0492948998434..17744188d40178 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -695,9 +695,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -896,43 +894,6 @@ def forward( hidden_states=outputs.hidden_states, ) - # Ignore copy - def prepare_inputs_for_generation( - self, input_ids, attention_mask=None, inputs_embeds=None, cache_position=None, use_cache=None, **kwargs - ): - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - - attention_mask = attention_mask[:, -self.config.attention_window_size :] - - past_length = cache_position[0] - if past_length > 0: - position_ids = position_ids[:, past_length:] - - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. 
- model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - model_inputs.update( - { - "position_ids": position_ids, - "attention_mask": attention_mask, - "cache_position": cache_position, - "use_cache": use_cache, - } - ) - return model_inputs - # Ignore copy def _reorder_cache(self, past_key_values, beam_idx): for layer in self.layers: diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index 37b675539e66cf..2c635a7118b451 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -2282,6 +2282,8 @@ def forward( def prepare_inputs_for_generation( self, input_ids, past_key_values=None, use_cache=None, num_hashes=None, **kwargs ): + # Overwritten -- different expected inputs/outputs + # only last token for inputs_ids if past is defined in kwargs if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index 99016c1be429cd..b73b2efea5accc 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -1126,28 +1126,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 91500e1926d753..1cbce28bf999e6 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -1133,27 +1133,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask":
attention_mask, "past_key_values": past_key_values} - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index 9ed9b11d9431df..5f7a7f849413a6 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -987,27 +987,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 2969f7f1a3d0a8..2d9c8bbb13b72f 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -1552,6 +1552,8 @@ def prepare_inputs_for_generation( attention_mask=None, **model_kwargs, ): + # Overwritten -- `input_pronunciation_ids` + input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index c98b525abe081b..b493b1e6bcdacf 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -1157,28 +1157,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git 
a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py index 4e32434901cdc7..c4daba6d2747ea 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr.py @@ -733,13 +733,14 @@ def forward(self, hidden_state): # Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention def multi_scale_deformable_attention( - value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor + value: Tensor, + value_spatial_shapes: Union[Tensor, List[Tuple]], + sampling_locations: Tensor, + attention_weights: Tensor, ) -> Tensor: batch_size, _, num_heads, hidden_dim = value.shape _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape - # Ignore copy value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1) - sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level_id, (height, width) in enumerate(value_spatial_shapes): @@ -816,29 +817,6 @@ def __init__(self, config: RTDetrConfig, num_heads: int, n_points: int): self.disable_custom_kernels = config.disable_custom_kernels - self._reset_parameters() - - def _reset_parameters(self): - nn.init.constant_(self.sampling_offsets.weight.data, 0.0) - default_dtype = torch.get_default_dtype() - thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * (2.0 * math.pi / self.n_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = ( - (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) - .view(self.n_heads, 1, 1, 2) - .repeat(1, self.n_levels, self.n_points, 1) - ) - for i in range(self.n_points): - grid_init[:, :, i, :] *= i + 1 - with torch.no_grad(): - self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) - nn.init.constant_(self.attention_weights.weight.data, 0.0) - nn.init.constant_(self.attention_weights.bias.data, 0.0) - nn.init.xavier_uniform_(self.value_proj.weight.data) - nn.init.constant_(self.value_proj.bias.data, 0.0) - nn.init.xavier_uniform_(self.output_proj.weight.data) - nn.init.constant_(self.output_proj.bias.data, 0.0) - def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): return tensor if position_embeddings is None else tensor + position_embeddings @@ -861,9 +839,7 @@ def forward( batch_size, num_queries, _ = hidden_states.shape batch_size, sequence_length, _ = encoder_hidden_states.shape - - # Ignore copy - total_elements = sum(shape[0] * shape[1] for shape in spatial_shapes_list) + total_elements = sum(height * width for height, width in spatial_shapes_list) if total_elements != sequence_length: raise ValueError( "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" @@ -899,7 +875,6 @@ def forward( else: raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") - # Ignore copy if self.disable_custom_kernels or MultiScaleDeformableAttention is None: # PyTorch implementation output = multi_scale_deformable_attention( @@ -1176,6 +1151,29 @@ def _init_weights(self, module): nn.init.constant_(layer.layers[-1].weight, 0) nn.init.constant_(layer.layers[-1].bias, 0) + if isinstance(module, RTDetrMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * 
math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) + if isinstance(module, RTDetrModel): prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) bias = float(-math.log((1 - prior_prob) / prior_prob)) diff --git a/src/transformers/models/rwkv/modeling_rwkv.py b/src/transformers/models/rwkv/modeling_rwkv.py index 8361afbf727b64..a42843b5106510 100644 --- a/src/transformers/models/rwkv/modeling_rwkv.py +++ b/src/transformers/models/rwkv/modeling_rwkv.py @@ -770,6 +770,8 @@ def set_output_embeddings(self, new_embeddings): self.head = new_embeddings def prepare_inputs_for_generation(self, input_ids, state=None, inputs_embeds=None, use_cache=None, **kwargs): + # Overwritten -- this model uses `state`, but doesn't have a cache (`past_key_values`) + # only last token for inputs_ids if the state is passed along. if state is not None: input_ids = input_ids[:, -1].unsqueeze(-1) diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index 8e226d92a10580..adc01ec40f9599 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -25,8 +25,9 @@ from torch.nn import CrossEntropyLoss from ...activations import ACT2FN -from ...deepspeed import is_deepspeed_zero3_enabled from ...generation import GenerationMixin +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask from ...modeling_outputs import ( BaseModelOutput, @@ -823,7 +824,7 @@ def forward( else: relative_position_embeddings = None - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for i, layer in enumerate(self.layers): if output_hidden_states: @@ -835,8 +836,8 @@ def forward( skip_the_layer = ( True if self.training and (dropout_probability < self.config.speech_encoder_layerdrop) else False ) - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, @@ -2863,7 +2864,8 @@ def generate( for constrained generation conditioned on the prefix, as described in [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904). 
synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). kwargs (`Dict[str, Any]`, *optional*): Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be forwarded to the `forward` function of the model. @@ -3149,7 +3151,8 @@ def generate( for constrained generation conditioned on the prefix, as described in [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904). synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). kwargs (`Dict[str, Any]`, *optional*): Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be forwarded to the `forward` function of the model. diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index aa710ad95266ff..21265faa225127 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -25,8 +25,9 @@ from torch.nn import CrossEntropyLoss from ...activations import ACT2FN -from ...deepspeed import is_deepspeed_zero3_enabled from ...generation import GenerationMixin +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask from ...modeling_outputs import ( BaseModelOutput, @@ -770,7 +771,7 @@ def forward( hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for i, layer in enumerate(self.layers): if output_hidden_states: @@ -782,8 +783,8 @@ def forward( skip_the_layer = ( True if self.training and (dropout_probability < self.config.speech_encoder_layerdrop) else False ) - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, @@ -3121,7 +3122,8 @@ def generate( for constrained generation conditioned on the prefix, as described in [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904). synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). kwargs (`Dict[str, Any]`, *optional*): Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be forwarded to the `forward` function of the model. @@ -3417,7 +3419,8 @@ def generate( for constrained generation conditioned on the prefix, as described in [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904). 
synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). kwargs (`Dict[str, Any]`, *optional*): Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be forwarded to the `forward` function of the model. diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index c9a3494b88b486..8638d93385843d 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -26,6 +26,7 @@ from ...activations import ACT2FN from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel from ...utils import ( @@ -668,7 +669,7 @@ def forward( value_states, attention_mask, q_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -921,7 +922,7 @@ def forward( hidden_states = self.layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for layer in self.layers: if output_hidden_states: @@ -931,8 +932,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index 1d35d1d44cfd97..507e0768a226ef 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -279,13 +279,13 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - num_positions = self.position_embeddings.shape[1] + num_positions = self.position_embedding.weight.shape[0] # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: - return self.position_embeddings + return self.position_embedding(self.position_ids) - patch_pos_embed = self.position_embeddings + patch_pos_embed = self.position_embedding.weight.unsqueeze(0) dim = embeddings.shape[-1] diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index c2f5dd0259093b..ef84a4fa5fbd2a 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -578,13 +578,12 @@ def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, 
encoder_outputs=None, **kwargs ): decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) - decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None input_dict = { "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, + "decoder_attention_mask": decoder_inputs.get("attention_mask"), "decoder_input_ids": decoder_inputs["input_ids"], "encoder_outputs": encoder_outputs, - "past_key_values": decoder_inputs["past_key_values"], + "past_key_values": decoder_inputs.get("past_key_values"), "use_cache": use_cache, } return input_dict diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 790e6a74a47135..dbe57c01d9839e 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -25,6 +25,7 @@ from ...activations import ACT2FN from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask from ...modeling_outputs import ( BaseModelOutput, @@ -1318,7 +1319,7 @@ def forward( position_bias = self.embed_positions(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -1341,8 +1342,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = dropout_probability < self.layerdrop - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( encoder_layer.__call__, @@ -1603,7 +1604,7 @@ def forward( encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1] ) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) if self.gradient_checkpointing and self.training: if use_cache: @@ -1636,7 +1637,7 @@ def forward( if self.training: dropout_probability = torch.rand([]) skip_the_layer = dropout_probability < self.layerdrop - if skip_the_layer and not deepspeed_zero3_is_enabled: + if skip_the_layer and not synced_gpus: continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/splinter/tokenization_splinter.py b/src/transformers/models/splinter/tokenization_splinter.py index 2859497ba882c2..ffa135556aa47d 100644 --- a/src/transformers/models/splinter/tokenization_splinter.py +++ b/src/transformers/models/splinter/tokenization_splinter.py @@ -137,6 +137,7 @@ def __init__( pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, + question_token=question_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, **kwargs, diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 9c457d869ac5c9..fe3ad6498172a9 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -896,9 +896,7 @@ def forward( 
return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -1034,7 +1032,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 89a36fefe77ace..e0fdbef1a3baf5 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -28,7 +28,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, StaticCache +from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( @@ -44,7 +44,6 @@ add_start_docstrings_to_model_forward, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -825,9 +824,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.gradient_checkpointing and self.training: if use_cache: @@ -931,7 +928,7 @@ def forward( attentions=all_self_attns, ) - # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask def _update_causal_mask( self, attention_mask: torch.Tensor, @@ -950,21 +947,30 @@ def _update_causal_mask( # to infer the attention mask. 
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): if AttentionMaskConverter._ignore_causal_mask_sdpa( attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, is_training=self.training, ): return None dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_length() + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() + # DynamicCache or no cache else: target_length = ( attention_mask.shape[-1] @@ -981,6 +987,8 @@ def _update_causal_mask( device=device, cache_position=cache_position, batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, ) if ( @@ -992,13 +1000,12 @@ def _update_causal_mask( # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. # Details: https://github.com/pytorch/pytorch/issues/110213 - min_dtype = torch.finfo(dtype).min causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask @staticmethod - # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position + # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Starcoder2 def _prepare_4d_causal_attention_mask_with_cache_position( attention_mask: torch.Tensor, sequence_length: int, @@ -1007,6 +1014,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + config: Starcoder2Config, + past_key_values: Cache, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1014,13 +1023,11 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Args: attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape - `(batch_size, 1, query_length, key_value_length)`. + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. sequence_length (`int`): The sequence length being processed. target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, - to account for the 0 padding, the part of the cache that is not filled yet. + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. dtype (`torch.dtype`): The dtype to use for the 4D attention mask. 
device (`torch.device`): @@ -1029,6 +1036,10 @@ def _prepare_4d_causal_attention_mask_with_cache_position( Indices depicting the position of the input sequence tokens in the sequence. batch_size (`torch.Tensor`): Batch size. + config (`Starcoder2Config`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate """ if attention_mask is not None and attention_mask.dim() == 4: # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. @@ -1038,19 +1049,27 @@ def _prepare_4d_causal_attention_mask_with_cache_position( causal_mask = torch.full( (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify if the current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask |= sliding_attend_mask + causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] mask_length = attention_mask.shape[-1] padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] padding_mask = padding_mask == 0 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( padding_mask, min_dtype ) - return causal_mask @@ -1155,13 +1174,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: @@ -1190,6 +1204,79 @@ def forward( attentions=outputs.attentions, ) + # Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + num_logits_to_keep=None, + **kwargs, + ): + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + if past_key_values is not None: + if inputs_embeds is not None: # Exception 1 + 
input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead"`, as otherwise the input `position_ids` would have varying strides during decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + position_ids = position_ids.clone(memory_format=torch.contiguous_format) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # `contiguous()` needed for compilation use cases + model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} + + if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.get_max_cache_shape(), + dtype=self.lm_head.weight.dtype, + device=device, + cache_position=cache_position, + batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, + ) + + if num_logits_to_keep is not None: + model_inputs["num_logits_to_keep"] = num_logits_to_keep + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + @add_start_docstrings( """ diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index 96b6c7334b1535..f1495ddc8c0057 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -298,7 +298,7 @@ def forward(self, hidden_states): router_mask = router_mask.bool() batch_size, seq_len, num_experts = router_mask.shape - idx_mask = router_mask.transpose(1, 2).reshape(batch_size * seq_len, num_experts).sum(dim=0) + idx_mask = router_mask.reshape(batch_size * seq_len, num_experts).sum(dim=0) idx_mask = torch.nonzero(idx_mask, as_tuple=True)[ 0 ].tolist() # length: number of "activated" expert / value: index diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index 67b97cf9c85223..754515dde0fb4d 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -945,32 +945,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def 
prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 4202f680437c53..eab23032475c40 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -27,6 +27,7 @@ from ...activations import ACT2FN from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, Wav2Vec2BaseModelOutput from ...modeling_utils import PreTrainedModel from ...utils import ( @@ -700,7 +701,7 @@ def forward( value_states, attention_mask, q_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -1004,7 +1005,7 @@ def forward( hidden_states = self.layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for layer in self.layers: if output_hidden_states: @@ -1014,8 +1015,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, @@ -1091,7 +1092,7 @@ def forward( hidden_states = hidden_states + position_embeddings hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for layer in self.layers: if output_hidden_states: @@ -1101,8 +1102,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication if self.gradient_checkpointing and 
self.training: layer_outputs = self._gradient_checkpointing_func( diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index bfb2cbfa4f55da..31d5071dbe24bf 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -27,6 +27,7 @@ from ...activations import ACT2FN from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_outputs import ( BaseModelOutput, CausalLMOutput, @@ -717,7 +718,7 @@ def forward( value_states, attention_mask, q_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -1021,7 +1022,7 @@ def forward( hidden_states = self.layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for layer in self.layers: if output_hidden_states: @@ -1031,8 +1032,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, @@ -1108,7 +1109,7 @@ def forward( hidden_states = hidden_states + position_embeddings hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for layer in self.layers: if output_hidden_states: @@ -1118,8 +1119,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 7c7cfec20959ea..5711433c368d5e 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -516,9 +516,7 @@ def forward( ) if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if (pixel_values_images is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( @@ -621,14 +619,20 @@ def forward( else: if image_outputs is not None: special_image_mask = ( - (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + 
.expand_as(inputs_embeds) + .to(inputs_embeds.device) ) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) if video_outputs is not None: special_image_mask = ( - (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds) + (input_ids == self.config.video_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) ) video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features) @@ -652,7 +656,9 @@ def forward( if labels is not None: # Shift so that tokens < n predict n if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:] + # we use the input attention mask to shift the logits and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() else: diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 95129d46bbd8e5..26d92b9ac3dca4 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -282,6 +282,17 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_m self.vocab_size = model_embeds.num_embeddings return model_embeds + # Ignore copy + def get_image_features(self, pixel_values: torch.FloatTensor, vision_feature_layers: List[int]): + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + + # For VIP-llava, the image features are computed this way + # We select the features from index 1: for the layers -2, -5, -8, -11 and 6 + image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers] + image_features = torch.cat(image_features, dim=-1) + image_features = self.multi_modal_projector(image_features) + return image_features + def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels): num_images, num_image_patches, embed_dim = image_features.shape batch_size, sequence_length = input_ids.shape @@ -430,9 +441,7 @@ def forward( ) if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if pixel_values is not None and inputs_embeds is not None: raise ValueError( @@ -451,13 +460,9 @@ def forward( ) or (input_ids.shape[-1] == 1 and pixel_values is not None) if pixel_values is not None: - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - - # For VIP-llava, the image features are computed this way - # We select the features from index 1: for the layers -2, -5, -8, -11 and 6 - image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers] - image_features = torch.cat(image_features, dim=-1) - image_features = self.multi_modal_projector(image_features) + image_features = self.get_image_features( + pixel_values=pixel_values, vision_feature_layers=vision_feature_layers + 
) if legacy_processing: logger.warning_once( @@ -507,7 +512,10 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 else: special_image_mask = ( - (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) ) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) @@ -531,7 +539,7 @@ def forward( if labels is not None: # Shift so that tokens < n predict n if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:] + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() else: diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 979bd69de9be01..0c3cd95adbf878 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -658,13 +658,12 @@ def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs ): decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) - decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None input_dict = { "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, + "decoder_attention_mask": decoder_inputs.get("attention_mask"), "decoder_input_ids": decoder_inputs["input_ids"], "encoder_outputs": encoder_outputs, - "past_key_values": decoder_inputs["past_key_values"], + "past_key_values": decoder_inputs.get("past_key_values"), "use_cache": use_cache, } return input_dict diff --git a/src/transformers/models/vits/modeling_vits.py b/src/transformers/models/vits/modeling_vits.py index 23bc8a72f8ba37..66834167d15e06 100644 --- a/src/transformers/models/vits/modeling_vits.py +++ b/src/transformers/models/vits/modeling_vits.py @@ -25,6 +25,7 @@ from ...activations import ACT2FN from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_attn_mask_utils import _prepare_4d_attention_mask from ...modeling_outputs import ( BaseModelOutput, @@ -1139,7 +1140,7 @@ def forward( hidden_states = hidden_states * padding_mask - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for encoder_layer in self.layers: if output_hidden_states: @@ -1149,8 +1150,8 @@ def forward( dropout_probability = np.random.uniform(0, 1) skip_the_layer = self.training and (dropout_probability < self.layerdrop) - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( encoder_layer.__call__, diff --git a/src/transformers/models/vivit/modeling_vivit.py 
b/src/transformers/models/vivit/modeling_vivit.py index 972040264fec31..3d543503284489 100755 --- a/src/transformers/models/vivit/modeling_vivit.py +++ b/src/transformers/models/vivit/modeling_vivit.py @@ -104,9 +104,10 @@ def __init__(self, config): torch.zeros(1, self.patch_embeddings.num_patches + 1, config.hidden_size) ) self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.tubelet_size[1:] self.config = config - # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding + # Adapted from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution @@ -129,8 +130,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: dim = embeddings.shape[-1] - new_height = height // self.patch_size - new_width = width // self.patch_size + new_height = height // self.patch_size[0] + new_width = width // self.patch_size[1] sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 455ba4259cf67b..2648722111d52e 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -27,6 +27,7 @@ from ...activations import ACT2FN from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_outputs import ( BaseModelOutput, CausalLMOutput, @@ -764,7 +765,7 @@ def forward( value_states, attention_mask, q_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -1038,7 +1039,7 @@ def forward( hidden_states = self.layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for layer in self.layers: if output_hidden_states: @@ -1048,8 +1049,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, @@ -1124,7 +1125,7 @@ def forward( hidden_states = hidden_states + position_embeddings hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for layer in self.layers: if output_hidden_states: @@ -1134,8 +1135,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync 
# XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py index 6fe960c78eb10b..4bd4255315fdc5 100644 --- a/src/transformers/models/wav2vec2/processing_wav2vec2.py +++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py @@ -18,12 +18,18 @@ import warnings from contextlib import contextmanager +from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer +class Wav2Vec2ProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} + + class Wav2Vec2Processor(ProcessorMixin): r""" Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single @@ -66,35 +72,46 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__(self, *args, **kwargs): + def __call__( + self, + audio: AudioInput = None, + text: Optional[Union[str, List[str], TextInput, PreTokenizedInput]] = None, + images=None, + videos=None, + **kwargs: Unpack[Wav2Vec2ProcessorKwargs], + ): """ When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's [`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. """ - # For backward compatibility - if self._in_target_context_manager: - return self.current_processor(*args, **kwargs) if "raw_speech" in kwargs: warnings.warn("Using `raw_speech` as a keyword argument is deprecated. 
Use `audio` instead.") audio = kwargs.pop("raw_speech") - else: - audio = kwargs.pop("audio", None) - sampling_rate = kwargs.pop("sampling_rate", None) - text = kwargs.pop("text", None) - if len(args) > 0: - audio = args[0] - args = args[1:] if audio is None and text is None: raise ValueError("You need to specify either an `audio` or `text` input to process.") + output_kwargs = self._merge_kwargs( + Wav2Vec2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor( + audio, + **output_kwargs["audio_kwargs"], + **output_kwargs["text_kwargs"], + **output_kwargs["common_kwargs"], + ) + if audio is not None: - inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs) + inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"]) if text is not None: - encodings = self.tokenizer(text, **kwargs) + encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) if text is None: return inputs diff --git a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py index ebbf700bf9ef98..6f1d5576df7316 100644 --- a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -26,6 +26,7 @@ from ...activations import ACT2FN from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_attn_mask_utils import _prepare_4d_attention_mask from ...modeling_outputs import ( BaseModelOutput, @@ -730,7 +731,7 @@ def forward( else: relative_position_embeddings = None - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for i, layer in enumerate(self.layers): if output_hidden_states: @@ -740,8 +741,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, diff --git a/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py index d24c672007d734..8b09e92419ae97 100644 --- a/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py @@ -17,12 +17,18 @@ """ import warnings +from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput from ..seamless_m4t.feature_extraction_seamless_m4t import SeamlessM4TFeatureExtractor from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer +class Wav2Vec2BertProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} + + class Wav2Vec2BertProcessor(ProcessorMixin): r""" Constructs a Wav2Vec2-BERT processor which wraps a Wav2Vec2-BERT feature extractor and a Wav2Vec2 CTC tokenizer into a single @@ -63,7 +69,14 @@ def from_pretrained(cls, 
pretrained_model_name_or_path, **kwargs): return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__(self, audio=None, text=None, **kwargs): + def __call__( + self, + audio: AudioInput = None, + text: Optional[Union[str, List[str], TextInput, PreTokenizedInput]] = None, + images=None, + videos=None, + **kwargs: Unpack[Wav2Vec2BertProcessorKwargs], + ): """ Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `audio` and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audio` is not @@ -71,17 +84,15 @@ def __call__(self, audio=None, text=None, **kwargs): PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the doctsring of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels, and T the sample length of the audio. - kwargs (*optional*): - Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the - tokenizer. + + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: - **input_features** -- Audio input features to be fed to a model. Returned when `audio` is not `None`. @@ -91,15 +102,18 @@ def __call__(self, audio=None, text=None, **kwargs): - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None` and `audio` is `None`. 
""" - sampling_rate = kwargs.pop("sampling_rate", None) - if audio is None and text is None: raise ValueError("You need to specify either an `audio` or `text` input to process.") + output_kwargs = self._merge_kwargs( + Wav2Vec2BertProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) if audio is not None: - inputs = self.feature_extractor(audio, sampling_rate=sampling_rate, **kwargs) + inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"]) if text is not None: - encodings = self.tokenizer(text, **kwargs) + encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) if text is None: return inputs diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index c37dd980d4ed3b..933bf8f6dc0bcd 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -27,6 +27,7 @@ from ...activations import ACT2FN from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_outputs import ( BaseModelOutput, CausalLMOutput, @@ -893,7 +894,7 @@ def forward( else: relative_position_embeddings = None - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) for i, layer in enumerate(self.layers): if output_hidden_states: @@ -903,8 +904,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index fa5fd390f56452..4df192fda5efa3 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -27,6 +27,7 @@ from ...activations import ACT2FN from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module from ...modeling_outputs import ( BaseModelOutput, CausalLMOutput, @@ -697,7 +698,7 @@ def forward( hidden_states = self.layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) position_bias = None for i, layer in enumerate(self.layers): @@ -708,8 +709,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop) - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( layer.__call__, @@ -781,7 +782,7 @@ def forward( hidden_states = hidden_states + position_embeddings hidden_states = self.dropout(hidden_states) - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + synced_gpus = 
is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) position_bias = None for i, layer in enumerate(self.layers): @@ -792,8 +793,8 @@ def forward( dropout_probability = torch.rand([]) skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop) - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 7a4e9487288e93..0ecdcb4dbdeadc 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -173,7 +173,9 @@ def _pad_to_max_length( class WhisperGenerationMixin(GenerationMixin): - def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None): + def _extract_token_timestamps( + self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None, num_input_ids=None + ): """ Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder @@ -200,11 +202,18 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec # since the beam search strategy chooses the most probable sequences at the end of the search. # In that case, the cross_attentions weights are too long and we have to make sure that they have the right output_length weight_length = (generate_outputs.beam_indices != -1).sum(-1).max() + weight_length = weight_length if num_input_ids is None else weight_length + num_input_ids + + # beam search takes `decoder_input_ids` into account in the `beam_indices` length + # but forgot to shift the beam_indices by the number of `decoder_input_ids` + beam_indices = torch.zeros_like(generate_outputs.beam_indices[:, :weight_length]) + # we actually shift the beam indices here + beam_indices[:, num_input_ids:] = generate_outputs.beam_indices[:, : weight_length - num_input_ids] + weights = weights[:, :, :weight_length] # If beam index is still -1, it means that the associated token id is EOS # We need to replace the index with 0 since index_select gives an error if any of the indexes is -1.
- beam_indices = generate_outputs.beam_indices[:, :weight_length] beam_indices = beam_indices.masked_fill(beam_indices == -1, 0) # Select the cross attention from the right beam for each output sequences @@ -218,8 +227,10 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec # make sure timestamps are as long as weights input_length = weight_length or cross_attentions[0].shape[2] - timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)[:, : input_length + 1] - batch_size = timestamps.shape[0] + batch_size = generate_outputs.sequences.shape[0] + timestamps = torch.zeros( + (batch_size, input_length + 1), dtype=torch.float32, device=generate_outputs.sequences.device + ) if num_frames is not None: # two cases: @@ -239,6 +250,7 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec else: # num_frames is of shape (batch_size,) whereas batch_size is truely batch_size*num_return_sequences repeat_time = batch_size if isinstance(num_frames, int) else batch_size // len(num_frames) + num_frames = num_frames.cpu() if isinstance(num_frames, (torch.Tensor)) else num_frames num_frames = np.repeat(num_frames, repeat_time) if num_frames is None or isinstance(num_frames, int): @@ -345,7 +357,8 @@ def generate( for constrained generation conditioned on the prefix, as described in [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904). synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). return_timestamps (`bool`, *optional*): Whether to return the timestamps with the text. This enables the `WhisperTimestampsLogitsProcessor`. 
task (`str`, *optional*): @@ -948,7 +961,10 @@ def _postprocess_outputs( if return_token_timestamps and hasattr(generation_config, "alignment_heads"): num_frames = getattr(generation_config, "num_frames", None) seek_outputs["token_timestamps"] = self._extract_token_timestamps( - seek_outputs, generation_config.alignment_heads, num_frames=num_frames + seek_outputs, + generation_config.alignment_heads, + num_frames=num_frames, + num_input_ids=decoder_input_ids.shape[-1], ) seek_outputs["token_timestamps"] = seek_outputs["token_timestamps"][:, start_idx:] @@ -979,7 +995,10 @@ def split_by_batch_index(values, key, batch_idx, is_shortform, beam_indices=None for v in range(len(values)): layer_past_key_values = [] for w in values[v]: - layer_past_key_values.append(w[batch_idx][None].cpu()) + if len(w) != 0: + layer_past_key_values.append(w[batch_idx][None].cpu()) + else: + layer_past_key_values.append(w) all_past_key_values.append(tuple(layer_past_key_values)) return tuple(all_past_key_values) diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index b56118639b37a0..079965fc174a63 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -422,7 +422,7 @@ def forward( causal_mask = attention_mask if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + causal_mask = attention_mask[:, : key_states.shape[-2]] # In PEFT, usually we cast the layer norms in float32 for training stability reasons # therefore the input hidden states gets silently casted in float32. Hence, we need @@ -456,7 +456,7 @@ def forward( value_states, causal_mask, tgt_len, - dropout=self.dropout, + dropout=self.dropout if self.training else 0.0, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, ) @@ -1399,7 +1399,7 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1867,7 +1867,7 @@ def prepare_inputs_for_generation( decoder_attention_mask = self.get_decoder()._prepare_4d_causal_attention_mask_with_cache_position( decoder_attention_mask, sequence_length=sequence_length, - target_length=past_key_values.self_attention_cache.get_max_length(), + target_length=past_key_values.self_attention_cache.get_max_cache_shape(), dtype=self.proj_out.weight.dtype, device=decoder_input_ids.device, cache_position=cache_position, diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index d05db378443e0b..25208c43a85a6c 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -133,15 +133,15 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 - self.position_embeddings = self.position_embedding.weight.unsqueeze(0) - num_positions = self.position_embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: - 
return self.position_embeddings + return self.position_embedding(self.position_ids) - class_pos_embed = self.position_embeddings[:, :1] - patch_pos_embed = self.position_embeddings[:, 1:] + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] dim = embeddings.shape[-1] diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index 3090bc2973cd56..70aac350c166c8 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -799,42 +799,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - else: - position_ids = None - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. 
input_ids not needed - "attention_mask": attention_mask, - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index 3acec2353b69fc..781d7a138f261e 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -676,6 +676,8 @@ def set_output_embeddings(self, new_embeddings): self.pred_layer.proj = new_embeddings def prepare_inputs_for_generation(self, input_ids, **kwargs): + # Overwritten -- this model uses config options to prepare inputs + mask_token_id = self.config.mask_token_id lang_id = self.config.lang_id diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index a153f09468937b..7de91d6ce1ff98 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -1136,27 +1136,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index 0c384ad45c523b..cb88cbeabde2b0 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -1112,6 +1112,8 @@ def forward( ) def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + # Overwritten -- model logic breaks when `inputs_embeds` are passed from this function + input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index 7681fbafad6db5..975f08c654bdc7 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -1308,6 +1308,8 @@ def set_output_embeddings(self, new_embeddings): self.lm_loss = new_embeddings def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_mems=None, **kwargs): + # Overwritten -- this model has unique input preparation + # Add dummy token at the end (no attention on this one) effective_batch_size = input_ids.shape[0] diff --git 
a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index 71474cc9c45ba9..7208f80d26ca22 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -1089,28 +1089,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM._reorder_cache def _reorder_cache(self, past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/zamba/__init__.py b/src/transformers/models/zamba/__init__.py new file mode 100644 index 00000000000000..e92890d1a71363 --- /dev/null +++ b/src/transformers/models/zamba/__init__.py @@ -0,0 +1,57 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_zamba": ["ZambaConfig"], +} + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_zamba"] = [ + "ZambaForCausalLM", + "ZambaForSequenceClassification", + "ZambaModel", + "ZambaPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_zamba import ZambaConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_zamba import ( + ZambaForCausalLM, + ZambaForSequenceClassification, + ZambaModel, + ZambaPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/zamba/configuration_zamba.py b/src/transformers/models/zamba/configuration_zamba.py new file mode 100644 index 00000000000000..a6764a82608853 --- /dev/null +++ b/src/transformers/models/zamba/configuration_zamba.py @@ -0,0 +1,224 @@ +# coding=utf-8 +# Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Zamba model configuration""" + +import math + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ZambaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ZambaModel`]. It is used to instantiate a + Zamba model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Zamba-v0.1 model. + + [Zyphra/Zamba-7B-v1](https://huggingface.co/Zyphra/Zamba-7B-v1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Zamba model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ZambaModel`] + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the + model has a output word embedding layer. + hidden_size (`int`, *optional*, defaults to 3712): + Dimension of the hidden representations. + attention_hidden_size (`int`, *optional*): + Dimension of the hidden representations of the inputs to the Attention layer. + intermediate_size (`int`, *optional*, defaults to 14848): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 76): + Number of hidden layers in the model. 
+ num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + attention_head_dim (`int`, *optional*): + Dimension of the attention head in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 16): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://arxiv.org/pdf/2305.13245.pdf). + n_mamba_heads (`int`, *optional*, defaults to 2): + Number of mamba heads for each mamba layer. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the decoder. + hidden_mamba_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the mamba layer. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + num_logits_to_keep (`int` or `None`, *optional*, defaults to 1): + Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an + integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the + logits of the last prompt token are needed for generation. For long sequences, the logits for the entire + sequence may use a lot of memory, so setting `num_logits_to_keep=1` will reduce the memory footprint + significantly. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + max_position_embeddings (`int`, *optional*, defaults to 4096): + This value doesn't have any real effect. The maximum sequence length that this model is intended to be + used with. It can be used with longer sequences, but performance may degrade. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + attn_layer_period (`int`, *optional*, defaults to 6): + Once in this many layers, we will have a shared attention layer. + attn_layer_offset (`int`, *optional*, defaults to 4): + Offset of the shared attention layer. + use_mamba_kernels (`bool`, *optional*, defaults to `True`): + Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and + `causal-conv1d` are installed, and the mamba modules are running on a CUDA device.
Raises a ValueError if + `True` and kernels are not available. + mamba_d_state (`int`, *optional*, defaults to 16): + The dimension of the mamba state space latents. + mamba_d_conv (`int`, *optional*, defaults to 4): + The size of the mamba convolution kernel. + mamba_expand (`int`, *optional*, defaults to 2): + Expanding factor (relative to hidden_size) used to determine the mamba intermediate size. + mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`): + Rank of the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`. + time_step_min (`float`, *optional*, defaults to 0.001): + Minimum `time_step` used to bound `dt_proj_bias`. + time_step_max (`float`, *optional*, defaults to 0.1): + Maximum `time_step` used to bound `dt_proj_bias`. + time_step_floor (`float`, *optional*, defaults to 0.0001): + Minimum clamping value of the `dt_proj.bias` layer initialization. + mamba_conv_bias (`bool`, *optional*, defaults to `True`): + Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block. + mamba_proj_bias (`bool`, *optional*, defaults to `False`): + Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block. + + """ + + model_type = "zamba" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + tie_word_embeddings=True, + hidden_size=3712, + attention_hidden_size=None, + intermediate_size=14848, + num_hidden_layers=76, + num_attention_heads=16, + attention_head_dim=None, + num_key_value_heads=16, + n_mamba_heads=2, + hidden_act="gelu", + hidden_mamba_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + num_logits_to_keep=1, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + max_position_embeddings=4096, + attention_dropout=0.0, + attn_layer_period=6, + attn_layer_offset=4, + use_mamba_kernels=True, + mamba_d_state=16, + mamba_d_conv=4, + mamba_expand=2, + mamba_dt_rank="auto", + time_step_min=0.001, + time_step_max=0.1, + time_step_floor=1e-4, + mamba_conv_bias=True, + mamba_proj_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.tie_word_embeddings = tie_word_embeddings + self.hidden_size = hidden_size + if attention_hidden_size is None: + self.attention_hidden_size = 2 * hidden_size + else: + self.attention_hidden_size = attention_hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + if attention_head_dim is None: + self.attention_head_dim = 2 * self.hidden_size // self.num_attention_heads + else: + self.attention_head_dim = attention_head_dim + self.max_position_embeddings = max_position_embeddings + self.attention_dropout = attention_dropout + + self.num_key_value_heads = num_key_value_heads + self.n_mamba_heads = n_mamba_heads + self.hidden_act = hidden_act + self.hidden_mamba_act = hidden_mamba_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + + self.use_cache = use_cache + self.num_logits_to_keep = num_logits_to_keep + + self.attn_layer_period = attn_layer_period + self.attn_layer_offset = attn_layer_offset + + self.use_mamba_kernels = use_mamba_kernels + self.mamba_d_state = mamba_d_state + self.mamba_d_conv = mamba_d_conv + self.mamba_expand = mamba_expand + self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank + self.time_step_min = time_step_min
+ self.time_step_max = time_step_max + self.time_step_floor = time_step_floor + self.mamba_conv_bias = mamba_conv_bias + self.mamba_proj_bias = mamba_proj_bias + + self.layers_block_type = self._layers_block_type(num_hidden_layers, attn_layer_period, attn_layer_offset) + + assert ( + self.mamba_expand * self.hidden_size + ) % self.n_mamba_heads == 0, "`intermediate_size` should be divisible by `n_mamba_heads`." + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _layers_block_type(self, num_hidden_layers, attn_layer_period, attn_layer_offset): + layers = [ + "mamba", + "mamba", + "hybrid", + ] + ["hybrid" if i % attn_layer_period == attn_layer_offset else "mamba" for i in range(num_hidden_layers - 3)] + return layers diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py new file mode 100644 index 00000000000000..2363ed04959d00 --- /dev/null +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -0,0 +1,1685 @@ +# coding=utf-8 +# Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
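For reference, the following is a minimal standalone sketch (illustrative only, not part of the patch) of the layer schedule that `ZambaConfig._layers_block_type` above produces for the default values (`num_hidden_layers=76`, `attn_layer_period=6`, `attn_layer_offset=4`); the helper name `layers_block_type` is hypothetical and simply mirrors the method's logic.

def layers_block_type(num_hidden_layers, attn_layer_period, attn_layer_offset):
    # The first three layers are fixed ("mamba", "mamba", "hybrid"), then a shared-attention
    # ("hybrid") layer recurs once per attn_layer_period, at the given offset.
    return ["mamba", "mamba", "hybrid"] + [
        "hybrid" if i % attn_layer_period == attn_layer_offset else "mamba"
        for i in range(num_hidden_layers - 3)
    ]

layout = layers_block_type(76, 6, 4)
print(layout[:9])              # ['mamba', 'mamba', 'hybrid', 'mamba', 'mamba', 'mamba', 'mamba', 'hybrid', 'mamba']
print(layout.count("hybrid"))  # 13 hybrid (shared attention) layers out of 76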
+"""PyTorch Zamba model.""" + +import math +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import ( + AttentionMaskConverter, +) +from ...modeling_flash_attention_utils import _flash_attention_forward +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ALL_LAYERNORM_LAYERS +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from ...utils.import_utils import ( + is_causal_conv1d_available, + is_mamba_ssm_available, + is_torchdynamo_compiling, +) +from .configuration_zamba import ZambaConfig + + +if is_mamba_ssm_available(): + from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn + from mamba_ssm.ops.triton.selective_state_update import selective_state_update +else: + selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None + +if is_causal_conv1d_available(): + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +else: + causal_conv1d_update, causal_conv1d_fn = None, None + +is_fast_path_available = all( + (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn) +) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ZambaConfig" + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Zamba +class ZambaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + ZambaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +ALL_LAYERNORM_LAYERS.append(ZambaRMSNorm) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class HybridMambaAttentionDynamicCache(DynamicCache): + """ + A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache + (which has a constant shape regardless of seq_len). 
+ + This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states` + and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor + For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`, + while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors). + For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors), + while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`, + and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`. + """ + + def __init__(self, config, batch_size, dtype=torch.float16, device=None): + self.dtype = dtype + self.layers_block_type = config.layers_block_type + self.has_previous_state = False # only used by mamba + intermediate_size = config.mamba_expand * config.hidden_size + ssm_state_size = config.mamba_d_state + conv_kernel_size = config.mamba_d_conv + self.n_mamba_heads = config.n_mamba_heads + self.conv_states = [] + self.ssm_states = [] + self.transformer_layers = [] + self._modules = {} + self._parameters = {} + self._buffers = {} + for i in range(config.num_hidden_layers): + self.conv_states += [ + torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype) + ] + cache_shape = (batch_size, self.n_mamba_heads, intermediate_size // self.n_mamba_heads, ssm_state_size) + self.ssm_states += [torch.zeros(cache_shape, device=device, dtype=dtype)] + if self.layers_block_type[i] == "hybrid": + self.transformer_layers.append(i) + + self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + + # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.update + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Update the cache + if self.key_cache[layer_idx].shape[-1] == 0: + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.reorder_cache + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + for layer_idx in range(len(self.key_cache)): + device = self.key_cache[layer_idx].device + self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) + device = self.value_cache[layer_idx].device + self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) + + device = self.conv_states[layer_idx].device + self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device)) + device = self.ssm_states[layer_idx].device + self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device)) + + # Copied from 
transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.get_seq_length + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + # take any layer that contains cache and not empty tensor + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + if len(self.key_cache) <= layer_idx: + return 0 + return self.key_cache[layer_idx].shape[-2] + + # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.to_legacy_cache + def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: + raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.") + + @classmethod + # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.from_legacy_cache + def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache": + raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.") + + +class ZambaAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + + Adapted from transformers.models.mistral.modeling_mistral.MistralAttention: + The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads. + The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer + (see fig. 2 in https://arxiv.org/pdf/2405.16712). + Additionally, replaced + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2) + """ + + def __init__(self, config: ZambaConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + + self.hidden_size = config.hidden_size + self.attention_hidden_size = config.attention_hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.attention_head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.attention_hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.attention_hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.attention_hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.attention_hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + layer_idx: int, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if past_key_value is not None: + key_states, value_states = past_key_value.update(key_states, value_states, layer_idx) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim / 2) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.attention_hidden_size) + + attn_output = attn_output + attn_output = self.o_proj(attn_output) + attn_output = attn_output + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Adapted from transformers.models.mistral.modeling_mistral.MistralAttention: +# Added softmax_scale = 1 / (query_states.shape[-1]/2)**0.5 to the arguments of self._flash_attention_forward +# dropped use_sliding_windows from the arguments of self._flash_attention_forward +class ZambaFlashAttention2(ZambaAttention): + """ + Zamba flash attention module. This module inherits from `ZambaAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + layer_idx: int, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ): + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if past_key_value is not None: + key_states, value_states = past_key_value.update(key_states, value_states, layer_idx) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + softmax_scale = 1 / math.sqrt(self.head_dim / 2) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + softmax_scale=softmax_scale, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.attention_hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Adapted from transformers.models.mistral.modeling_mistral.MistralAttention: +# added scale = 1 / (query_states.shape[-1]/2)**0.5 to the arguments of torch.nn.functional.scaled_dot_product_attention +class ZambaSdpaAttention(ZambaAttention): + """ + Zamba attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `ZambaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + def forward( + self, + hidden_states: torch.Tensor, + layer_idx: int, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "ZambaModel is using ZambaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if past_key_value is not None: + key_states, value_states = past_key_value.update(key_states, value_states, layer_idx) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + softmax_scale = 1 / math.sqrt(self.head_dim / 2) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + scale=softmax_scale, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.attention_hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +ZAMBA_ATTENTION_CLASSES = { + "eager": ZambaAttention, + "flash_attention_2": ZambaFlashAttention2, + "sdpa": ZambaSdpaAttention, +} + + +class ZambaMambaMixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. + A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) + ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, + and is why Mamba is called **selective** state spaces) + + This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways: + - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head + undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of + `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`. 
+ """ + + def __init__(self, config: ZambaConfig, layer_idx): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.intermediate_size = config.mamba_expand * config.hidden_size + self.time_step_rank = config.mamba_dt_rank + self.n_mamba_heads = config.n_mamba_heads + self.mamba_head_dim = self.intermediate_size // self.n_mamba_heads + self.use_conv_bias = config.mamba_conv_bias + self.use_bias = config.mamba_proj_bias + self.conv1d = nn.Conv1d( + in_channels=self.intermediate_size, + out_channels=self.intermediate_size, + bias=self.use_conv_bias, + kernel_size=self.conv_kernel_size, + groups=self.intermediate_size, + padding=self.conv_kernel_size - 1, + ) + + self.activation = config.hidden_mamba_act + self.act = ACT2FN[config.hidden_mamba_act] + + self.use_fast_kernels = config.use_mamba_kernels + + # projection of the input hidden states + self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=self.use_bias) + # weight associated to the selective projection used to make dt, B and C input dependent + # each mamba head is processed independently + self.x_proj_weight = nn.Parameter( + ( + torch.zeros( + self.n_mamba_heads, + self.time_step_rank + self.ssm_state_size * 2, + self.mamba_head_dim, + ) + ) + ) + # time step projection (discretization) + self.dt_proj_weight = nn.Parameter( + (torch.zeros(self.n_mamba_heads, self.mamba_head_dim, self.time_step_rank) - 0.5) + * 2 + / self.time_step_rank**0.5 + ) + self.dt_proj_bias = nn.Parameter(torch.zeros(self.n_mamba_heads, self.mamba_head_dim)) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :] + A = A.expand(self.intermediate_size, -1).contiguous() + self.A_log = nn.Parameter(torch.log(A).reshape(self.n_mamba_heads, self.mamba_head_dim, -1)) + self.D = nn.Parameter(torch.ones(self.n_mamba_heads, self.mamba_head_dim)) + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias) + + if not is_fast_path_available: + logger.warning_once( + "The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" + " is None. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config" + ) + + def cuda_kernels_forward( + self, hidden_states: torch.Tensor, cache_params: HybridMambaAttentionDynamicCache = None, attention_mask=None + ): + batch_size, seq_len, _ = hidden_states.shape + use_precomputed_states = cache_params is not None and cache_params.has_previous_state and seq_len == 1 + + # 1. Gated linear projection + projected_states = self.in_proj(hidden_states).transpose(1, 2) + + hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2) + hidden_states = hidden_states.squeeze(2).contiguous() + gate = gate.squeeze(2) + gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1) + + # 2. 
Convolution sequence transformation + conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) + if use_precomputed_states: + hidden_states = causal_conv1d_update( + hidden_states.squeeze(-1), + cache_params.conv_states[self.layer_idx], + conv_weights, + self.conv1d.bias, + self.activation, + ) + hidden_states = hidden_states.unsqueeze(-1) + else: + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask.unsqueeze(1) + if cache_params is not None: + conv_states = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + hidden_states = causal_conv1d_fn(hidden_states, conv_weights, self.conv1d.bias, activation=self.activation) + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask.unsqueeze(1) + + # 3. SSM sequence transformation + # 3.a. input varying initialization of time_step, B and C + + hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1) + ssm_parameters = (self.x_proj_weight[:, None, :, :] @ hidden_states).transpose(-1, -2) + + time_step, B, C = torch.split( + ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1 + ) + + discrete_time_step = self.dt_proj_weight[:, None] @ time_step.transpose(-1, -2) + + A = -torch.exp(self.A_log.float()) + + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + time_proj_bias = self.dt_proj_bias.float() if self.dt_proj_bias is not None else None + scan_outputs = torch.empty((batch_size, 0, seq_len), device=hidden_states.device, dtype=hidden_states.dtype) + + if use_precomputed_states: + for n in range(self.n_mamba_heads): + scan_outputs_ = selective_state_update( + cache_params.ssm_states[self.layer_idx][:, n], + hidden_states[n, ..., 0], + discrete_time_step[n, ..., 0], + A[n], + B[n, :, 0], + C[n, :, 0], + self.D[n], + gate[n, ..., 0], + time_proj_bias[n], + dt_softplus=True, + ).unsqueeze(-1) + scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1) + + else: + ssm_state = torch.empty( + (batch_size, 0, self.mamba_head_dim, self.ssm_state_size), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + for n in range(self.n_mamba_heads): + scan_outputs_, ssm_state_ = selective_scan_fn( + hidden_states[n], + discrete_time_step[n], + A[n], + B[n].transpose(1, 2), + C[n].transpose(1, 2), + self.D[n].float(), + gate[n], + time_proj_bias[n], + delta_softplus=True, + return_last_state=True, + ) + scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1).contiguous() + ssm_state = torch.cat((ssm_state, ssm_state_.unsqueeze(1)), dim=1) + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_outputs.transpose(1, 2)) + return contextualized_states + + def slow_forward(self, input_states, cache_params: HybridMambaAttentionDynamicCache = None, attention_mask=None): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + # 1. 
Gated linear projection + projected_states = self.in_proj(input_states).transpose(1, 2) + + hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2) + hidden_states = hidden_states.squeeze(2).contiguous() + gate = gate.squeeze(2) + gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1) + + use_cache = isinstance(cache_params, HybridMambaAttentionDynamicCache) + # 2. Convolution sequence transformation + if use_cache and cache_params.ssm_states[self.layer_idx].shape[0] == batch_size: + if self.training: + # In training mode, we don't want to perform in-place operations on ssm_state so we can compute the backwards pass + ssm_state = cache_params.ssm_states[self.layer_idx].clone() + else: + ssm_state = cache_params.ssm_states[self.layer_idx] + + ssm_state = ssm_state.to(hidden_states.device) + + if ( + cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] == batch_size + ): + conv_state = cache_params.conv_states[self.layer_idx] + conv_state = torch.roll(conv_state, shifts=-1, dims=-1) + conv_state[:, :, -1] = hidden_states[:, :, 0] + cache_params.conv_states[self.layer_idx] = conv_state + hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1) + if self.use_conv_bias: + hidden_states += self.conv1d.bias + hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1) + else: + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1] :].unsqueeze(1) + conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)) + cache_params.conv_states[self.layer_idx] = conv_state + hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1] :].unsqueeze(1) + else: + ssm_state = torch.zeros( + (batch_size, self.n_mamba_heads, self.mamba_head_dim, self.ssm_state_size), + device=hidden_states.device, + dtype=dtype, + ) + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask.unsqueeze(1) + hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask.unsqueeze(1) + + # 3. State Space Model sequence transformation + # 3.a. Selection: [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2] + hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1) + ssm_parameters = (self.x_proj_weight[:, None, :, :] @ hidden_states).transpose(-1, -2) + + time_step, B, C = torch.split( + ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1 + ) + discrete_time_step = (self.dt_proj_weight[:, None] @ time_step.transpose(-1, -2)) + self.dt_proj_bias[ + :, None, :, None + ] + + discrete_time_step = nn.functional.softplus(discrete_time_step) + + # 3.b. 
Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM)
+        A = -torch.exp(self.A_log.float())
+        discrete_A = torch.exp(A[:, None, :, None, :] * discrete_time_step[:, :, :, :, None])
+        discrete_B = discrete_time_step[:, :, :, :, None] * B[:, :, None, :, :].float()
+        deltaB_u = discrete_B * hidden_states[:, :, :, :, None].float()
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        scan_outputs = []
+        for i in range(seq_len):
+            ssm_state = discrete_A[:, :, :, i, :].transpose(0, 1) * ssm_state + deltaB_u[:, :, :, i, :].transpose(0, 1)
+            scan_output = torch.matmul(ssm_state.transpose(0, 1).to(dtype), C[:, :, i, :].unsqueeze(-1))
+            scan_outputs.append(scan_output[:, :, :, 0])
+        scan_output = torch.stack(scan_outputs, dim=-1)
+        scan_output = scan_output + (hidden_states * self.D[:, None, :, None])
+        scan_output = scan_output * self.act(gate)
+
+        if use_cache:
+            cache_params.ssm_states[self.layer_idx] = ssm_state
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(
+            scan_output.transpose(0, 1).reshape(batch_size, -1, seq_len).transpose(1, 2)
+        )
+        return contextualized_states
+
+    def forward(self, hidden_states, cache_params: HybridMambaAttentionDynamicCache = None, attention_mask=None):
+        if self.use_fast_kernels:
+            if not is_fast_path_available or "cuda" not in self.x_proj_weight.device.type:
+                raise ValueError(
+                    "Fast Mamba kernels are not available. Make sure they are installed and that "
+                    "the mamba module is on a CUDA device. Please run 'pip install causal-conv1d>=1.2.0' "
+                    "and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config."
+                )
+            return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask=attention_mask)
+        return self.slow_forward(hidden_states, cache_params, attention_mask=attention_mask)
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Zamba
+class ZambaMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_state):
+        return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
+
+
+class ZambaAttentionDecoderLayer(nn.Module):
+    def __init__(self, config: ZambaConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.self_attn = ZAMBA_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+
+        self.feed_forward = ZambaMLP(config)
+        self.input_layernorm = ZambaRMSNorm(config.attention_hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        original_hidden_states: torch.Tensor,
+        layer_idx: int,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): output of
previous Mamba layer of shape `(batch, seq_len, embed_dim)` + original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`. + This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The + concatenated tensor is then used as input of the pre-attention RMSNorm + (see fig. 2 in https://arxiv.org/pdf/2405.16712). + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + """ + hidden_states = torch.concatenate([hidden_states, original_hidden_states], dim=-1) + hidden_states = self.input_layernorm(hidden_states) + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + layer_idx=layer_idx, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + # feed-forward (MLP) + hidden_states = self.pre_ff_layernorm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class ZambaMambaDecoderLayer(nn.Module): + def __init__(self, config: ZambaConfig, layer_idx: int): + super().__init__() + self.mamba = ZambaMambaMixer(config=config, layer_idx=layer_idx) + self.input_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.layer_idx = layer_idx + + def forward( + self, + hidden_states: torch.Tensor, + original_hidden_states: Optional[torch.Tensor] = None, + layer_idx: int = None, + attention_mask: Optional[torch.Tensor] = None, + causal_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + transformer_hidden_states: Optional[torch.Tensor] = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + """ + + residual = hidden_states + + # `transformer_hidden_states` is the output from shared transformer + linear layer (see fig. 2 in https://arxiv.org/pdf/2405.16712). + # `transformer_hidden_states` is then added to the input to the mamba layer below (as described in eq. (6) of https://arxiv.org/pdf/2405.16712). + hidden_states = ( + hidden_states + transformer_hidden_states if transformer_hidden_states is not None else hidden_states + ) + hidden_states = self.input_layernorm(hidden_states) + + hidden_states = self.mamba( + hidden_states=hidden_states, + cache_params=past_key_value, + attention_mask=attention_mask, + ) + + self_attn_weights = None + + # residual connection after mamba + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + return outputs + + +class HybridLayer(nn.Module): + def __init__(self, shared_transf: ZambaAttentionDecoderLayer, linear: nn.Linear, mamba: ZambaMambaDecoderLayer): + super().__init__() + self.shared_transf = shared_transf + self.linear = linear + self.mamba_decoder = mamba + + def forward( + self, + hidden_states: torch.Tensor, + original_hidden_states: Optional[torch.Tensor] = None, + layer_idx: int = None, + attention_mask: Optional[torch.Tensor] = None, + causal_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with + hidden activations to form the input of the shared transformer layer. + layer_idx (`int`): layer number. + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. 
+ """ + + layer_outputs = self.shared_transf( + hidden_states, + original_hidden_states=original_hidden_states, + layer_idx=layer_idx, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + transformer_hidden_states = layer_outputs[0] + + if output_attentions: + self_attn_weights = layer_outputs[1] + + transformer_hidden_states = self.linear(transformer_hidden_states) + + layer_outputs = self.mamba_decoder( + hidden_states, + transformer_hidden_states=transformer_hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + if output_attentions: + layer_outputs = (layer_outputs[0], self_attn_weights) + layer_outputs[2:] + + return layer_outputs + + +ZAMBA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`ZambaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Zamba Model outputting raw hidden-states without any specific head on top.", + ZAMBA_START_DOCSTRING, +) +class ZambaPreTrainedModel(PreTrainedModel): + config_class = ZambaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["ZambaAttentionDecoderLayer", "ZambaMambaDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = False + _supports_sdpa = False + _supports_cache_class = True # Note: only supports HybridMambaAttentionDynamicCache + _is_stateful = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, ZambaMambaMixer): + module.A_log._no_weight_decay = True + module.D._no_weight_decay = True + + module.x_proj_weight.data.normal_(mean=0.0, std=std) + dt_init_std = self.config.mamba_dt_rank**-0.5 + nn.init.uniform_(module.dt_proj_weight, -dt_init_std, dt_init_std) + + mamba_head_dim = self.config.mamba_expand * self.config.hidden_size // self.config.n_mamba_heads + dt = torch.exp( + torch.rand(self.config.n_mamba_heads, mamba_head_dim) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + ).clamp(min=self.config.time_step_floor) + # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + + with torch.no_grad(): + 
module.dt_proj_bias.copy_(inv_dt)
+            module.dt_proj_bias._no_reinit = True
+
+    @classmethod
+    def _check_and_enable_flash_attn_2(
+        cls,
+        config,
+        torch_dtype: Optional[torch.dtype] = None,
+        device_map: Optional[Union[str, Dict[str, int]]] = None,
+        hard_check_only: bool = False,
+        check_device_map: bool = False,
+    ):
+        """
+        Overloads `PreTrainedModel._check_and_enable_flash_attn_2` so as to DISABLE Flash Attention 2 by default on Zamba models.
+        Flash attention 2 is currently not supported in the HuggingFace implementation of Zamba v1.
+        """
+        config = super()._check_and_enable_flash_attn_2(
+            config, torch_dtype, device_map, hard_check_only=hard_check_only, check_device_map=check_device_map
+        )
+
+        # if using the default path -> swap sdpa by eager
+        if not hard_check_only and config._attn_implementation == "flash_attention_2":
+            config._attn_implementation = "eager"
+
+        return config
+
+
+ZAMBA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`HybridMambaAttentionDynamicCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            A HybridMambaAttentionDynamicCache object containing pre-computed hidden-states (keys and values in the
+            self-attention blocks and convolution and ssm states in the mamba blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+            Key and value cache tensors have shape `(batch_size, num_heads, seq_len, head_dim)`.
+            Convolution and ssm states tensors have shape `(batch_size, d_inner, d_conv)` and
+            `(batch_size, d_inner, d_state)` respectively.
+            See the `HybridMambaAttentionDynamicCache` class for more details.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + "The bare Zamba Model outputting raw hidden-states without any specific head on top.", + ZAMBA_START_DOCSTRING, +) +class ZambaModel(ZambaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`] + + Args: + config: ZambaConfig + """ + + def __init__(self, config: ZambaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + block = ZambaAttentionDecoderLayer(config) + mamba_layers = [] + linear_layers = [] + self.layers_block_type = config.layers_block_type + for i in range(config.num_hidden_layers): + if config.layers_block_type[i] == "mamba": + mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i)) + elif config.layers_block_type[i] == "hybrid": + linear_layers.append(nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False)) + mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i)) + mamba_layers = iter(mamba_layers) + linear_layers = iter(linear_layers) + layers = [] + self._tied_weights_keys = [] + for layer_id, layer_type in enumerate(self.layers_block_type): + if layer_type == "hybrid": + prefix_name = f"layers.{layer_id}." 
+ tied_keys = [ + "shared_transf.self_attn.q_proj.weight", + "shared_transf.self_attn.k_proj.weight", + "shared_transf.self_attn.v_proj.weight", + "shared_transf.self_attn.o_proj.weight", + "shared_transf.feed_forward.gate_proj.weight", + "shared_transf.feed_forward.up_proj.weight", + "shared_transf.feed_forward.down_proj.weight", + "shared_transf.input_layernorm.weight", + "shared_transf.pre_ff_layernorm.weight", + ] + self._tied_weights_keys = [*self._tied_weights_keys, *[prefix_name + key for key in tied_keys]] + layers.append(HybridLayer(block, next(linear_layers), next(mamba_layers))) + else: + layers.append(next(mamba_layers)) + self.layers = nn.ModuleList(layers) + + self._attn_implementation = config._attn_implementation + self.final_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(ZAMBA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + hidden_states = inputs_embeds + + original_hidden_states = torch.clone(inputs_embeds) + # original_hidden_states: word embedding output that will be concatenated with hidden activations to form the input of the shared transformer layer + + if use_cache and past_key_values is None: + logger.warning_once( + "Zamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was " + "provided, so no cache will be returned." 
+ ) + + if cache_position is None: + cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for layer_idx, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + original_hidden_states, + layer_idx, + attention_mask, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + ) + else: + layer_outputs = layer( + hidden_states, + original_hidden_states=original_hidden_states, + layer_idx=layer_idx, + attention_mask=attention_mask, + causal_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + if layer_outputs[1] is not None: + # append attentions only of attention layers. Mamba layers return `None` as the attention weights + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if past_key_values and not past_key_values.has_previous_state: + past_key_values.has_previous_state = True + + next_cache = None if not use_cache else past_key_values + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + # Copied from transformers.models.jamba.modeling_jamba.JambaModel._update_causal_mask + def _update_causal_mask(self, attention_mask, input_tensor, cache_position): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + target_length = cache_position[-1] + 1 + + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows 
when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +# Adapted from transformers.models.jamba.modeling_jamba.JambaForCausalLM with Jamba->Zamba, JAMBA->ZAMBA +class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin): + def __init__(self, config: ZambaConfig): + super().__init__(config) + self.model = ZambaModel(config) + self._tied_weights_keys = ["lm_head.weight", *self.model._tied_weights_keys] + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(ZAMBA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int` or `None`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `None`, calculate logits for all + `input_ids`. Only last token logits are needed for generation, and calculating them only for that token + can save memory, which becomes pretty significant for long sequences. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, ZambaForCausalLM + + >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1") + >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if labels is None and not is_torchdynamo_compiling(): + logger.warning_once( + "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" + ) + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + # TODO: remove the float() operation in v4.46 + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + + loss = None + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + empty_past_kv = past_key_values is None + + # Omit tokens covered by past_key_values + if not empty_past_kv: + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + if inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + else: + past_key_values = HybridMambaAttentionDynamicCache( + self.config, input_ids.shape[0], dtype=self.dtype, device=self.device + ) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and 
empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "num_logits_to_keep": self.config.num_logits_to_keep, + "cache_position": cache_position, + } + ) + return model_inputs + + +@add_start_docstrings( + """ + The Zamba Model with a sequence classification head on top (linear layer). + + [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + ZAMBA_START_DOCSTRING, +) +class ZambaForSequenceClassification(ZambaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = ZambaModel(config) + self._tied_weights_keys = self.model._tied_weights_keys + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(ZAMBA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index cdd24681cba80d..40b3dc1015c001 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -28,7 +28,9 @@ from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor from ..models.auto.modeling_auto import AutoModelForDepthEstimation, AutoModelForImageToImage +from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer +from ..processing_utils import ProcessorMixin from ..tokenization_utils import PreTrainedTokenizer from ..utils import ( CONFIG_NAME, @@ -556,6 +558,7 @@ def pipeline( tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, 
PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, + processor: Optional[Union[str, ProcessorMixin]] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, @@ -571,11 +574,19 @@ def pipeline( """ Utility factory method to build a [`Pipeline`]. - Pipelines are made of: + A pipeline consists of: - - A [tokenizer](tokenizer) in charge of mapping raw textual input to token. - - A [model](model) to make predictions from the inputs. - - Some (optional) post processing for enhancing model's output. + - One or more components for pre-processing model inputs, such as a [tokenizer](tokenizer), + [image_processor](image_processor), [feature_extractor](feature_extractor), or [processor](processors). + - A [model](model) that generates predictions from the inputs. + - Optional post-processing steps to refine the model's output, which can also be handled by processors. + + + While there are such optional arguments as `tokenizer`, `feature_extractor`, `image_processor`, and `processor`, + they shouldn't be specified all at once. If these components are not provided, `pipeline` will try to load + required ones automatically. In case you want to provide these components explicitly, please refer to a + specific pipeline in order to get more details regarding what components are required. + Args: task (`str`): @@ -644,6 +655,25 @@ def pipeline( `model` is not specified or not a string, then the default feature extractor for `config` is loaded (if it is a string). However, if `config` is also not given or not a string, then the default feature extractor for the given `task` will be loaded. + image_processor (`str` or [`BaseImageProcessor`], *optional*): + The image processor that will be used by the pipeline to preprocess images for the model. This can be a + model identifier or an actual image processor inheriting from [`BaseImageProcessor`]. + + Image processors are used for Vision models and multi-modal models that require image inputs. Multi-modal + models will also require a tokenizer to be passed. + + If not provided, the default image processor for the given `model` will be loaded (if it is a string). If + `model` is not specified or not a string, then the default image processor for `config` is loaded (if it is + a string). + processor (`str` or [`ProcessorMixin`], *optional*): + The processor that will be used by the pipeline to preprocess data for the model. This can be a model + identifier or an actual processor inheriting from [`ProcessorMixin`]. + + Processors are used for multi-modal models that require multi-modal inputs, for example, a model that + requires both text and image inputs. + + If not provided, the default processor for the given `model` will be loaded (if it is a string). If `model` + is not specified or not a string, then the default processor for `config` is loaded (if it is a string). framework (`str`, *optional*): The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be installed. 
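To make the new `processor` argument concrete, here is a minimal usage sketch. The checkpoint name and task are placeholders (not values taken from this diff), and which pipelines actually consume the processor is governed by the per-class `_load_processor` flag introduced further down.

```python
from transformers import AutoProcessor, pipeline

# Placeholder model id: any checkpoint that ships a processor (e.g. tokenizer + image processor).
checkpoint = "some-org/some-multimodal-checkpoint"

# Either let the factory resolve the processor from the model id on its own ...
pipe = pipeline(task="zero-shot-object-detection", model=checkpoint)

# ... or build it explicitly and hand it over through the new `processor` argument.
processor = AutoProcessor.from_pretrained(checkpoint)
pipe = pipeline(task="zero-shot-object-detection", model=checkpoint, processor=processor)
```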
@@ -905,13 +935,17 @@ def pipeline( model_config = model.config hub_kwargs["_commit_hash"] = model.config._commit_hash - load_tokenizer = ( - type(model_config) in TOKENIZER_MAPPING - or model_config.tokenizer_class is not None - or isinstance(tokenizer, str) - ) + + load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None + load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None + + # Check that pipeline class required loading + load_tokenizer = load_tokenizer and pipeline_class._load_tokenizer + load_feature_extractor = load_feature_extractor and pipeline_class._load_feature_extractor + load_image_processor = load_image_processor and pipeline_class._load_image_processor + load_processor = load_processor and pipeline_class._load_processor # If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while # `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some @@ -1074,6 +1108,31 @@ def pipeline( if not is_pyctcdecode_available(): logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode") + if load_processor: + # Try to infer processor from model or config name (if provided as str) + if processor is None: + if isinstance(model_name, str): + processor = model_name + elif isinstance(config, str): + processor = config + else: + # Impossible to guess what is the right processor here + raise Exception( + "Impossible to guess which processor to use. " + "Please provide a processor instance or a path/identifier " + "to a processor." + ) + + # Instantiate processor if needed + if isinstance(processor, (str, tuple)): + processor = AutoProcessor.from_pretrained(processor, _from_pipeline=task, **hub_kwargs, **model_kwargs) + if not isinstance(processor, ProcessorMixin): + raise TypeError( + "Processor was loaded, but it is not an instance of `ProcessorMixin`. " + f"Got type `{type(processor)}` instead. Please check that you specified " + "correct pipeline task for the model and model has processor implemented and saved." + ) + if task == "translation" and model.config.task_specific_params: for key in model.config.task_specific_params: if key.startswith("translation"): @@ -1099,4 +1158,7 @@ def pipeline( if device is not None: kwargs["device"] = device + if processor is not None: + kwargs["processor"] = processor + return pipeline_class(model=model, framework=framework, task=task, **kwargs) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 9b82b67820c51b..f4ffdf6445381c 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import warnings from collections import defaultdict from typing import TYPE_CHECKING, Dict, Optional, Union @@ -269,8 +270,6 @@ def __call__( The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. 
For a complete overview of generate, check the [following guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). - max_new_tokens (`int`, *optional*): - The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. Return: `Dict`: A dictionary with the following keys: @@ -310,6 +309,10 @@ def _sanitize_parameters( forward_params = defaultdict(dict) if max_new_tokens is not None: + warnings.warn( + "`max_new_tokens` is deprecated and will be removed in version 4.49 of Transformers. To remove this warning, pass `max_new_tokens` as a key inside `generate_kwargs` instead.", + FutureWarning, + ) forward_params["max_new_tokens"] = max_new_tokens if generate_kwargs is not None: if max_new_tokens is not None and "max_new_tokens" in generate_kwargs: diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 40a91a0d484b8e..042958cbb0c6b3 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -34,6 +34,7 @@ from ..image_processing_utils import BaseImageProcessor from ..modelcard import ModelCard from ..models.auto.configuration_auto import AutoConfig +from ..processing_utils import ProcessorMixin from ..tokenization_utils import PreTrainedTokenizer from ..utils import ( ModelOutput, @@ -716,6 +717,7 @@ def build_pipeline_init_args( has_tokenizer: bool = False, has_feature_extractor: bool = False, has_image_processor: bool = False, + has_processor: bool = False, supports_binary_output: bool = True, ) -> str: docstring = r""" @@ -738,6 +740,12 @@ def build_pipeline_init_args( image_processor ([`BaseImageProcessor`]): The image processor that will be used by the pipeline to encode data for the model. This object inherits from [`BaseImageProcessor`].""" + if has_processor: + docstring += r""" + processor ([`ProcessorMixin`]): + The processor that will be used by the pipeline to encode data for the model. This object inherits from + [`ProcessorMixin`]. Processor is a composite object that might contain `tokenizer`, `feature_extractor`, and + `image_processor`.""" docstring += r""" modelcard (`str` or [`ModelCard`], *optional*): Model card attributed to the model for this pipeline. @@ -774,7 +782,11 @@ def build_pipeline_init_args( PIPELINE_INIT_ARGS = build_pipeline_init_args( - has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, supports_binary_output=True + has_tokenizer=True, + has_feature_extractor=True, + has_image_processor=True, + has_processor=True, + supports_binary_output=True, ) @@ -787,7 +799,11 @@ def build_pipeline_init_args( ) -@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_feature_extractor=True, has_image_processor=True)) +@add_end_docstrings( + build_pipeline_init_args( + has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, has_processor=True + ) +) class Pipeline(_ScikitCompat, PushToHubMixin): """ The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across @@ -805,6 +821,22 @@ class Pipeline(_ScikitCompat, PushToHubMixin): constructor argument. If set to `True`, the output will be stored in the pickle format. """ + # Historically we have pipelines working with `tokenizer`, `feature_extractor`, and `image_processor` + # as separate processing components. While we have `processor` class that combines them, some pipelines + # might still operate with these components separately. 
+ # With the addition of `processor` to `pipeline`, we want to avoid: + # - loading `processor` for pipelines that still work with `image_processor` and `tokenizer` separately; + # - loading `image_processor`/`tokenizer` as a separate component while we operate only with `processor`, + # because `processor` will load required sub-components by itself. + # Below flags allow granular control over loading components and set to be backward compatible with current + # pipelines logic. You may override these flags when creating your pipeline. For example, for + # `zero-shot-object-detection` pipeline which operates with `processor` you should set `_load_processor=True` + # and all the rest flags to `False` to avoid unnecessary loading of the components. + _load_processor = False + _load_image_processor = True + _load_feature_extractor = True + _load_tokenizer = True + default_input_names = None def __init__( @@ -813,6 +845,7 @@ def __init__( tokenizer: Optional[PreTrainedTokenizer] = None, feature_extractor: Optional[PreTrainedFeatureExtractor] = None, image_processor: Optional[BaseImageProcessor] = None, + processor: Optional[ProcessorMixin] = None, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, task: str = "", @@ -830,6 +863,7 @@ def __init__( self.tokenizer = tokenizer self.feature_extractor = feature_extractor self.image_processor = image_processor + self.processor = processor self.modelcard = modelcard self.framework = framework diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py index 79a85008e7cf99..f70f8d85c15db8 100644 --- a/src/transformers/pipelines/depth_estimation.py +++ b/src/transformers/pipelines/depth_estimation.py @@ -1,3 +1,4 @@ +import warnings from typing import List, Union import numpy as np @@ -50,12 +51,12 @@ def __init__(self, *args, **kwargs): requires_backends(self, "vision") self.check_model_type(MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES) - def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs): + def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs): """ Predict the depth(s) of the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing a http link pointing to an image @@ -65,9 +66,10 @@ def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Imag The pipeline accepts either a single image or a batch of images, which must then be passed as a string. Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL images. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and - the call may block forever. + parameters (`Dict`, *optional*): + A dictionary of argument names to parameter values, to control pipeline behaviour. + The only parameter available right now is `timeout`, which is the length of time, in seconds, + that the pipeline should wait before giving up on trying to download an image. Return: A dictionary or a list of dictionaries containing result. 
If the input is a single image, will return a @@ -79,30 +81,42 @@ def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Imag - **predicted_depth** (`torch.Tensor`) -- The predicted depth by the model as a `torch.Tensor`. - **depth** (`PIL.Image`) -- The predicted depth by the model as a `PIL.Image`. """ - return super().__call__(images, **kwargs) - - def _sanitize_parameters(self, timeout=None, **kwargs): + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the depth-estimation pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) + + def _sanitize_parameters(self, timeout=None, parameters=None, **kwargs): preprocess_params = {} if timeout is not None: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_params["timeout"] = timeout + if isinstance(parameters, dict) and "timeout" in parameters: + preprocess_params["timeout"] = parameters["timeout"] return preprocess_params, {}, {} def preprocess(self, image, timeout=None): image = load_image(image, timeout) - self.image_size = image.size model_inputs = self.image_processor(images=image, return_tensors=self.framework) if self.framework == "pt": model_inputs = model_inputs.to(self.torch_dtype) + model_inputs["target_size"] = image.size[::-1] return model_inputs def _forward(self, model_inputs): + target_size = model_inputs.pop("target_size") model_outputs = self.model(**model_inputs) + model_outputs["target_size"] = target_size return model_outputs def postprocess(self, model_outputs): predicted_depth = model_outputs.predicted_depth prediction = torch.nn.functional.interpolate( - predicted_depth.unsqueeze(1), size=self.image_size[::-1], mode="bicubic", align_corners=False + predicted_depth.unsqueeze(1), size=model_outputs["target_size"], mode="bicubic", align_corners=False ) output = prediction.squeeze().cpu().numpy() formatted = (output * 255 / np.max(output)).astype("uint8") diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py index 8aaa66e6c4586d..20ad72e79055e2 100644 --- a/src/transformers/pipelines/image_classification.py +++ b/src/transformers/pipelines/image_classification.py @@ -1,3 +1,17 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
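The depth-estimation hunk above and the vision-pipeline hunks that follow apply the same deprecation pattern: the public argument is renamed from `images` to `inputs`, the top-level `timeout` is deprecated, and, for depth estimation specifically, `timeout` can instead be passed through the new `parameters` dict. A minimal sketch of the resulting calling convention, with an illustrative checkpoint and image URL:

```python
from transformers import pipeline

# Illustrative checkpoint; any depth-estimation model id would do.
depth = pipeline("depth-estimation", model="some-org/some-depth-checkpoint")

# New style: the image goes in positionally (or as `inputs=`), and the fetch timeout
# is tucked into `parameters` instead of the deprecated top-level `timeout` argument.
result = depth(
    "https://example.com/some-image.png",
    parameters={"timeout": 30.0},
)
print(result["predicted_depth"].shape)  # raw depth map as a torch.Tensor
result["depth"].save("depth.png")       # rescaled depth visualization as a PIL.Image
```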
+import warnings from typing import List, Union import numpy as np @@ -99,6 +113,9 @@ def __init__(self, *args, **kwargs): def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None): preprocess_params = {} if timeout is not None: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_params["timeout"] = timeout postprocess_params = {} if top_k is not None: @@ -109,12 +126,12 @@ def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None) postprocess_params["function_to_apply"] = function_to_apply return preprocess_params, {}, postprocess_params - def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs): + def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs): """ Assign labels to the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing a http link pointing to an image @@ -142,9 +159,6 @@ def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Imag top_k (`int`, *optional*, defaults to 5): The number of top labels that will be returned by the pipeline. If the provided number is higher than the number of labels available in the model configuration, it will default to the number of labels. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and - the call may block forever. Return: A dictionary or a list of dictionaries containing result. If the input is a single image, will return a @@ -156,7 +170,12 @@ def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Imag - **label** (`str`) -- The label identified by the model. - **score** (`int`) -- The score attributed by the model for that label. """ - return super().__call__(images, **kwargs) + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the image-classification pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index e0fd3b7d85ab19..0ac653fd1e8725 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Union import numpy as np @@ -90,16 +91,19 @@ def _sanitize_parameters(self, **kwargs): if "overlap_mask_area_threshold" in kwargs: postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"] if "timeout" in kwargs: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_kwargs["timeout"] = kwargs["timeout"] return preprocess_kwargs, {}, postprocess_kwargs - def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]: + def __call__(self, inputs=None, **kwargs) -> Union[Predictions, List[Prediction]]: """ Perform segmentation (detect masks & classes) in the image(s) passed as inputs. 
Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing an HTTP(S) link pointing to an image @@ -118,9 +122,6 @@ def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]: Threshold to use when turning the predicted masks into binary values. overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5): Mask overlap threshold to eliminate small, disconnected segments. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and - the call may block forever. Return: A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a @@ -136,7 +137,12 @@ def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]: - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of the "object" described by the label and the mask. """ - return super().__call__(images, **kwargs) + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the image-segmentation pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) def preprocess(self, image, subtask=None, timeout=None): image = load_image(image, timeout=timeout) diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py index 91d44c46d25c10..4beaa481920054 100644 --- a/src/transformers/pipelines/image_to_text.py +++ b/src/transformers/pipelines/image_to_text.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings from typing import List, Union from ..utils import ( @@ -80,6 +81,9 @@ def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt if prompt is not None: preprocess_params["prompt"] = prompt if timeout is not None: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_params["timeout"] = timeout if max_new_tokens is not None: @@ -94,12 +98,12 @@ def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt return preprocess_params, forward_params, {} - def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs): + def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs): """ Assign labels to the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing a HTTP(s) link pointing to an image @@ -113,16 +117,18 @@ def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Imag generate_kwargs (`Dict`, *optional*): Pass it to send all of these arguments directly to `generate` allowing full control of this function. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and - the call may block forever.
Return: A list or a list of list of `dict`: Each result comes as a dictionary with the following key: - **generated_text** (`str`) -- The generated text. """ - return super().__call__(images, **kwargs) + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the image-to-text pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) def preprocess(self, image, prompt=None, timeout=None): image = load_image(image, timeout=timeout) diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py index d3e2135790ff8c..c135b1e131acb9 100644 --- a/src/transformers/pipelines/object_detection.py +++ b/src/transformers/pipelines/object_detection.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Union from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends @@ -63,6 +64,9 @@ def __init__(self, *args, **kwargs): def _sanitize_parameters(self, **kwargs): preprocess_params = {} if "timeout" in kwargs: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_params["timeout"] = kwargs["timeout"] postprocess_kwargs = {} if "threshold" in kwargs: @@ -74,7 +78,7 @@ def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]: Detect objects (bounding boxes & classes) in the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing an HTTP(S) link pointing to an image @@ -85,9 +89,6 @@ def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]: same format: all as HTTP(S) links, all as local paths, or all as PIL images. threshold (`float`, *optional*, defaults to 0.5): The probability necessary to make a prediction. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and - the call may block forever. Return: A list of dictionaries or a list of list of dictionaries containing the result. If the input is a single @@ -100,7 +101,9 @@ def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]: - **score** (`float`) -- The score attributed by the model for that label. - **box** (`List[Dict[str, int]]`) -- The bounding box of detected object in image's original size. """ - + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs and "inputs" not in kwargs: + kwargs["inputs"] = kwargs.pop("images") return super().__call__(*args, **kwargs) def preprocess(self, image, timeout=None): diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py index 4ac5d252b1139e..6039e5ad1ee989 100644 --- a/src/transformers/pipelines/question_answering.py +++ b/src/transformers/pipelines/question_answering.py @@ -183,8 +183,16 @@ def __call__(self, *args, **kwargs): # Generic compatibility with sklearn and Keras # Batched data elif "X" in kwargs: + warnings.warn( + "Passing the `X` argument to the pipeline is deprecated and will be removed in v5. 
Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) inputs = kwargs["X"] elif "data" in kwargs: + warnings.warn( + "Passing the `data` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) inputs = kwargs["data"] elif "question" in kwargs and "context" in kwargs: if isinstance(kwargs["question"], list) and isinstance(kwargs["context"], str): @@ -345,22 +353,14 @@ def __call__(self, *args, **kwargs): Answer the question(s) given as inputs by using the context(s). Args: - args ([`SquadExample`] or a list of [`SquadExample`]): - One or several [`SquadExample`] containing the question and context. - X ([`SquadExample`] or a list of [`SquadExample`], *optional*): - One or several [`SquadExample`] containing the question and context (will be treated the same way as if - passed as the first positional argument). - data ([`SquadExample`] or a list of [`SquadExample`], *optional*): - One or several [`SquadExample`] containing the question and context (will be treated the same way as if - passed as the first positional argument). question (`str` or `List[str]`): One or several question(s) (must be used in conjunction with the `context` argument). context (`str` or `List[str]`): One or several context(s) associated with the question(s) (must be used in conjunction with the `question` argument). - topk (`int`, *optional*, defaults to 1): + top_k (`int`, *optional*, defaults to 1): The number of answers to return (will be chosen by order of likelihood). Note that we return less than - topk answers if there are not enough options available within the context. + top_k answers if there are not enough options available within the context. doc_stride (`int`, *optional*, defaults to 128): If the context is too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap. @@ -374,7 +374,7 @@ def __call__(self, *args, **kwargs): handle_impossible_answer (`bool`, *optional*, defaults to `False`): Whether or not we accept impossible as an answer. align_to_words (`bool`, *optional*, defaults to `True`): - Attempts to align the answer to real words. Improves quality on space separated langages. Might hurt on + Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on non-space-separated languages (like Japanese or Chinese) Return: @@ -387,6 +387,11 @@ def __call__(self, *args, **kwargs): """ # Convert inputs to features + if args: + warnings.warn( + "Passing a list of SQuAD examples to the pipeline is deprecated and will be removed in v5. 
Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) examples = self._args_parser(*args, **kwargs) if isinstance(examples, (list, tuple)) and len(examples) == 1: diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index b7e13e782e78cc..253c684fcbbdad 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -1,3 +1,4 @@ +import warnings from collections import UserDict from typing import List, Union @@ -73,12 +74,12 @@ def __init__(self, **kwargs): else MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES ) - def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwargs): + def __call__(self, image: Union[str, List[str], "Image", List["Image"]] = None, **kwargs): """ Assign labels to the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing a http link pointing to an image @@ -93,13 +94,6 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are already formatted. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and - the call may block forever. - - tokenizer_kwargs (`dict`, *optional*): - Additional dictionary of keyword arguments passed along to the tokenizer. - Return: A list of dictionaries containing one entry per proposed label. Each dictionary contains the following keys: @@ -107,17 +101,29 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar - **score** (`float`) -- The score attributed by the model to that label. It is a value between 0 and 1, computed as the `softmax` of `logits_per_image`. 
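A short sketch of the calling convention the question-answering changes point to; the checkpoint is a placeholder, and the `X=`/`data=` and positional `SquadExample` forms now emit FutureWarnings.

from transformers import pipeline

qa = pipeline("question-answering", model="<qa-checkpoint>")

# Preferred: explicit `question`/`context` keywords; `top_k` (not the old `topk` spelling)
# controls how many answers are returned.
answers = qa(
    question="Where does Sarah live?",
    context="My name is Sarah and I live in London.",
    top_k=2,
)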
""" - return super().__call__(images, **kwargs) + # After deprecation of this is completed, remove the default `None` value for `image` + if "images" in kwargs: + image = kwargs.pop("images") + if image is None: + raise ValueError("Cannot call the zero-shot-image-classification pipeline without an images argument!") + return super().__call__(image, **kwargs) def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs): preprocess_params = {} if "candidate_labels" in kwargs: preprocess_params["candidate_labels"] = kwargs["candidate_labels"] if "timeout" in kwargs: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_params["timeout"] = kwargs["timeout"] if "hypothesis_template" in kwargs: preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] if tokenizer_kwargs is not None: + warnings.warn( + "The `tokenizer_kwargs` argument is deprecated and will be removed in version 5 of Transformers", + FutureWarning, + ) preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs return preprocess_params, {}, {} diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 58eb3e6ed6f206..cb2327e5c46b0d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -820,6 +820,8 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg "common_kwargs": {}, } + used_keys = set() + # get defaults from set model processor kwargs if they exist for modality in default_kwargs: default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy() @@ -827,7 +829,12 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): # init with tokenizer init kwargs if necessary if modality_key in tokenizer_init_kwargs: - default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] + value = ( + getattr(self.tokenizer, modality_key) + if hasattr(self.tokenizer, modality_key) + else tokenizer_init_kwargs[modality_key] + ) + default_kwargs[modality][modality_key] = value # now defaults kwargs are updated with the tokenizers defaults. # pass defaults to output dictionary output_kwargs.update(default_kwargs) @@ -846,18 +853,29 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg f"in a dictionary for {modality} and as a **kwarg." 
) elif modality_key in kwargs: - kwarg_value = kwargs.pop(modality_key, "__empty__") + # we get a modality_key instead of popping it because modality-specific processors + # can have overlapping kwargs + kwarg_value = kwargs.get(modality_key, "__empty__") else: kwarg_value = "__empty__" if kwarg_value != "__empty__": output_kwargs[modality][modality_key] = kwarg_value - # if something remains in kwargs, it belongs to common after flattening - if set(kwargs) & set(default_kwargs): - # here kwargs is dictionary-based since it shares keys with default set - [output_kwargs["common_kwargs"].update(subdict) for _, subdict in kwargs.items()] + used_keys.add(modality_key) + + # Determine if kwargs is a flat dictionary or contains nested dictionaries + if any(key in default_kwargs for key in kwargs): + # kwargs is dictionary-based, and some keys match modality names + for modality, subdict in kwargs.items(): + if modality in default_kwargs: + for subkey, subvalue in subdict.items(): + if subkey not in used_keys: + output_kwargs[modality][subkey] = subvalue + used_keys.add(subkey) else: - # here it's a flat dict - output_kwargs["common_kwargs"].update(kwargs) + # kwargs is a flat dictionary + for key in kwargs: + if key not in used_keys: + output_kwargs["common_kwargs"][key] = kwargs[key] # all modality-specific kwargs are updated with common kwargs for modality in output_kwargs: diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py index 1dcd87c993a2e6..38bebd2d8410e4 100755 --- a/src/transformers/quantizers/auto.py +++ b/src/transformers/quantizers/auto.py @@ -18,6 +18,7 @@ from ..utils.quantization_config import ( AqlmConfig, AwqConfig, + BitNetConfig, BitsAndBytesConfig, CompressedTensorsConfig, EetqConfig, @@ -31,6 +32,7 @@ ) from .quantizer_aqlm import AqlmHfQuantizer from .quantizer_awq import AwqQuantizer +from .quantizer_bitnet import BitNetHfQuantizer from .quantizer_bnb_4bit import Bnb4BitHfQuantizer from .quantizer_bnb_8bit import Bnb8BitHfQuantizer from .quantizer_compressed_tensors import CompressedTensorsHfQuantizer @@ -54,6 +56,7 @@ "compressed-tensors": CompressedTensorsHfQuantizer, "fbgemm_fp8": FbgemmFp8HfQuantizer, "torchao": TorchAoHfQuantizer, + "bitnet": BitNetHfQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { @@ -68,6 +71,7 @@ "compressed-tensors": CompressedTensorsConfig, "fbgemm_fp8": FbgemmFp8Config, "torchao": TorchAoConfig, + "bitnet": BitNetConfig, } @@ -108,7 +112,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): quantization_config_dict = model_config.quantization_config quantization_config = cls.from_dict(quantization_config_dict) # Update with potential kwargs that are passed through from_pretrained. 
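A hedged sketch of the kwargs merging that `_merge_kwargs` now performs; whether a given processor forwards nested per-modality dictionaries depends on its own `__call__`, and the checkpoint is a placeholder.

import numpy as np
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("<multimodal-checkpoint>")
image = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))

# Flat kwargs: keys that no modality-specific annotation claims fall into `common_kwargs`
# and are applied to every modality.
batch = processor(text="a photo of a cat", images=image, padding=True, return_tensors="pt")

# Nested kwargs: per-modality dictionaries are merged without clobbering keys that were
# already consumed, so overlapping option names no longer collide.
batch = processor(
    text="a photo of a cat",
    images=image,
    text_kwargs={"padding": "max_length", "max_length": 32},
    images_kwargs={"do_rescale": False},
    common_kwargs={"return_tensors": "pt"},
)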
- quantization_config.update(kwargs) + quantization_config.update(**kwargs) return quantization_config diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index 9c66ba385a6b96..18b883429c5ec5 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -46,27 +46,44 @@ def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) def validate_environment(self, device_map, **kwargs): - if not torch.cuda.is_available(): - raise RuntimeError("GPU is required to run AWQ quantized model.") - if not is_auto_awq_available(): raise ImportError("Loading an AWQ quantized model requires auto-awq library (`pip install autoawq`)") if not is_accelerate_available(): raise ImportError("Loading an AWQ quantized model requires accelerate (`pip install accelerate`)") - if device_map is None: - logger.warning_once( - "You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set " - "your model on a GPU device in order to run your model." - ) - elif device_map is not None: - if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + if self.quantization_config.version == AWQLinearVersion.IPEX: + if version.parse(importlib.metadata.version("autoawq")) < version.parse("0.2.6"): + raise RuntimeError( + "To use the IPEX backend, you need autoawq>=0.2.6. Please install the latest version or from source." + ) + if ( + device_map is not None + and isinstance(device_map, dict) + and (torch.device("cpu") not in device_map.values() or len(device_map.values()) > 1) + ): raise ValueError( - "You are attempting to load an AWQ model with a device_map that contains a CPU or disk device." - " This is not supported. Please remove the CPU or disk device from the device_map." + "You are attempting to load an IPEX version AWQ model with a device_map that contains devices other than CPU." + " This is not supported. Please make sure the device_map only contains CPU devices." + ) + else: + if not torch.cuda.is_available(): + raise RuntimeError( + "GPU is required to run AWQ quantized model. You can use the IPEX version of AWQ if you have an Intel CPU" ) + if device_map is None: + logger.warning_once( + "You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set " + "your model on a GPU device in order to run your model." + ) + elif device_map is not None: + if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + raise ValueError( + "You are attempting to load an AWQ model with a device_map that contains a CPU or disk device." + " This is not supported. Please remove the CPU or disk device from the device_map." + ) + def update_torch_dtype(self, torch_dtype): if torch_dtype is None: torch_dtype = torch.float16 @@ -106,6 +123,11 @@ def _process_model_after_weight_loading(self, model): model = post_init_awq_exllama_modules(model, self.quantization_config.exllama_config) + if self.quantization_config.version == AWQLinearVersion.IPEX: + from ..integrations import post_init_awq_ipex_modules + + model = post_init_awq_ipex_modules(model) + def is_serializable(self, safe_serialization=None): # AWQ through auto-awq has been always serializable, except if the model is fused.
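A hedged sketch of loading an AWQ checkpoint with the new IPEX branch on CPU, assuming `AwqConfig` accepts `version="ipex"` as the `AWQLinearVersion.IPEX` check suggests; the checkpoint name is a placeholder and autoawq >= 0.2.6 is required.

from transformers import AutoModelForCausalLM, AwqConfig

quantization_config = AwqConfig(version="ipex")

model = AutoModelForCausalLM.from_pretrained(
    "<awq-quantized-checkpoint>",
    quantization_config=quantization_config,
    device_map="cpu",  # anything other than a CPU-only device_map is rejected for the IPEX version
)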
if self.quantization_config.do_fuse: diff --git a/src/transformers/quantizers/quantizer_bitnet.py b/src/transformers/quantizers/quantizer_bitnet.py new file mode 100644 index 00000000000000..3607caa00733cc --- /dev/null +++ b/src/transformers/quantizers/quantizer_bitnet.py @@ -0,0 +1,115 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING, Dict, List, Union + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_torch_available, logging + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +class BitNetHfQuantizer(HfQuantizer): + """ + 1.58-bit quantization from BitNet quantization method: + Before loading: it converts the linear layers into BitLinear layers during loading. + + Checkout the paper introducing this method : https://arxiv.org/pdf/2402.17764 + """ + + requires_parameters_quantization = False + requires_calibration = True + + required_packages = ["accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError("Loading a BitNet quantized model requires accelerate (`pip install accelerate`)") + + if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): + raise ValueError( + "Loading ternary weights from tf/flax is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + + if not torch.cuda.is_available(): + logger.warning_once( + "You don't have a GPU available to load the model, the inference will be slow because of weight unpacking" + ) + return + + device_map = kwargs.get("device_map", None) + if device_map is None: + logger.warning_once( + "You have loaded a BitNet model on CPU and have a CUDA device available, make sure to set " + "your model on a GPU device in order to run your model." + ) + elif device_map is not None: + if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + raise ValueError( + "You are attempting to load a BitNet model with a device_map that contains a CPU or disk device." + "This is not supported. Please remove the CPU or disk device from the device_map." 
+ ) + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + return model + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_bitnet_linear + + self.modules_to_not_convert = get_keys_to_not_convert(model) + + if self.quantization_config.modules_to_not_convert is not None: + self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert) + + model = replace_with_bitnet_linear( + model, + modules_to_not_convert=self.modules_to_not_convert, + quantization_config=self.quantization_config, + pre_quantized=self.pre_quantized, + ) + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + target_dtype = torch.int8 + return target_dtype + + def is_serializable(self, safe_serialization=None): + return True + + @property + def is_trainable(self) -> bool: + return False diff --git a/src/transformers/quantizers/quantizer_quanto.py b/src/transformers/quantizers/quantizer_quanto.py index ae113f714acb69..0aacc18d2a1f40 100644 --- a/src/transformers/quantizers/quantizer_quanto.py +++ b/src/transformers/quantizers/quantizer_quanto.py @@ -23,7 +23,13 @@ if TYPE_CHECKING: from ..modeling_utils import PreTrainedModel -from ..utils import is_accelerate_available, is_quanto_available, is_torch_available, logging +from ..utils import ( + is_accelerate_available, + is_optimum_quanto_available, + is_quanto_available, + is_torch_available, + logging, +) from ..utils.quantization_config import QuantoConfig @@ -57,11 +63,13 @@ def post_init(self): ) def validate_environment(self, *args, **kwargs): - if not is_quanto_available(): - raise ImportError("Loading a quanto quantized model requires quanto library (`pip install quanto`)") + if not (is_optimum_quanto_available() or is_quanto_available()): + raise ImportError( + "Loading an optimum-quanto quantized model requires optimum-quanto library (`pip install optimum-quanto`)" + ) if not is_accelerate_available(): raise ImportError( - "Loading a quanto quantized model requires accelerate library (`pip install accelerate`)" + "Loading an optimum-quanto quantized model requires accelerate library (`pip install accelerate`)" ) def update_device_map(self, device_map): @@ -81,11 +89,17 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": return torch_dtype def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: - import quanto + if is_optimum_quanto_available(): + from optimum.quanto import QModuleMixin + elif is_quanto_available(): + logger.warning_once( + "Importing from quanto will be deprecated in v4.47. Please install optimum-quanto instead `pip install optimum-quanto`" + ) + from quanto import QModuleMixin not_missing_keys = [] for name, module in model.named_modules(): - if isinstance(module, quanto.QModuleMixin): + if isinstance(module, QModuleMixin): for missing in missing_keys: if ( (name in missing or name in f"{prefix}.{missing}") @@ -106,7 +120,13 @@ def check_quantized_param( """ Check if a parameter needs to be quantized.
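A hedged loading sketch for the new BitNet quantizer; since `requires_calibration = True`, only checkpoints that were already quantized with BitNet can be loaded, and the checkpoint name is a placeholder.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "<bitnet-1.58bit-checkpoint>",
    device_map="cuda",  # CPU also works, but weight unpacking makes inference noticeably slower
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained("<bitnet-1.58bit-checkpoint>")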
""" - import quanto + if is_optimum_quanto_available(): + from optimum.quanto import QModuleMixin + elif is_quanto_available(): + logger.warning_once( + "Importing from quanto will be deprecated in v4.47. Please install optimum-quanto instrad `pip install optimum-quanto`" + ) + from quanto import QModuleMixin device_map = kwargs.get("device_map", None) param_device = kwargs.get("param_device", None) @@ -119,7 +139,7 @@ def check_quantized_param( module, tensor_name = get_module_from_name(model, param_name) # We only quantize the weights and the bias is not quantized. - if isinstance(module, quanto.QModuleMixin) and "weight" in tensor_name: + if isinstance(module, QModuleMixin) and "weight" in tensor_name: # if the weights are quantized, don't need to recreate it again with `create_quantized_param` return not module.frozen else: @@ -162,7 +182,7 @@ def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": return target_dtype else: raise ValueError( - "You are using `device_map='auto'` on a quanto quantized model. To automatically compute" + "You are using `device_map='auto'` on an optimum-quanto quantized model. To automatically compute" " the appropriate device map, you should upgrade your `accelerate` library," "`pip install --upgrade accelerate` or install it from source." ) @@ -193,7 +213,7 @@ def _process_model_after_weight_loading(self, model): @property def is_trainable(self, model: Optional["PreTrainedModel"] = None): - return False + return True def is_serializable(self, safe_serialization=None): return False diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index a5f257c6534ed1..8eda45bd40efb4 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -31,6 +31,7 @@ import unittest from collections import defaultdict from collections.abc import Mapping +from dataclasses import MISSING, fields from functools import wraps from io import StringIO from pathlib import Path @@ -93,6 +94,7 @@ is_nltk_available, is_onnx_available, is_optimum_available, + is_optimum_quanto_available, is_pandas_available, is_peft_available, is_phonemizer_available, @@ -101,7 +103,6 @@ is_pytesseract_available, is_pytest_available, is_pytorch_quantization_available, - is_quanto_available, is_rjieba_available, is_sacremoses_available, is_safetensors_available, @@ -1193,11 +1194,11 @@ def require_auto_awq(test_case): return unittest.skipUnless(is_auto_awq_available(), "test requires autoawq")(test_case) -def require_quanto(test_case): +def require_optimum_quanto(test_case): """ Decorator for quanto dependency """ - return unittest.skipUnless(is_quanto_available(), "test requires quanto")(test_case) + return unittest.skipUnless(is_optimum_quanto_available(), "test requires optimum-quanto")(test_case) def require_compressed_tensors(test_case): @@ -2610,3 +2611,30 @@ def update_mapping_from_spec(device_fn_dict: Dict[str, Callable], attribute_name update_mapping_from_spec(BACKEND_MANUAL_SEED, "MANUAL_SEED_FN") update_mapping_from_spec(BACKEND_EMPTY_CACHE, "EMPTY_CACHE_FN") update_mapping_from_spec(BACKEND_DEVICE_COUNT, "DEVICE_COUNT_FN") + + +def compare_pipeline_output_to_hub_spec(output, hub_spec): + missing_keys = [] + unexpected_keys = [] + all_field_names = {field.name for field in fields(hub_spec)} + matching_keys = sorted([key for key in output.keys() if key in all_field_names]) + + # Fields with a MISSING default are required and must be in the output + for field in fields(hub_spec): + if field.default is MISSING and 
field.name not in output: + missing_keys.append(field.name) + + # All output keys must match either a required or optional field in the Hub spec + for output_key in output: + if output_key not in all_field_names: + unexpected_keys.append(output_key) + + if missing_keys or unexpected_keys: + error = ["Pipeline output does not match Hub spec!"] + if matching_keys: + error.append(f"Matching keys: {matching_keys}") + if missing_keys: + error.append(f"Missing required keys in pipeline output: {missing_keys}") + if unexpected_keys: + error.append(f"Keys in pipeline output that are not in Hub spec: {unexpected_keys}") + raise KeyError("\n".join(error)) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index b5bd4fa1a3911a..438ef1c8a4a5e2 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -598,7 +598,8 @@ def char_to_token( Returns: - `int`: Index of the token. + `int`: Index of the token, or None if the char index refers to a whitespace only token and whitespace is + trimmed with `trim_offsets=True`. """ if not self._encodings: @@ -808,12 +809,13 @@ def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": [`BatchEncoding`]: The same instance after modification. """ requires_backends(self, ["torch"]) + import torch # This check catches things like APEX blindly calling "to" on all inputs to a module # Otherwise it passes the casts down and casts the LongTensor containing the token idxs # into a HalfTensor if isinstance(device, str) or is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items() if v is not None} + self.data = {k: v.to(device=device) for k, v in self.data.items() if isinstance(v, torch.Tensor)} else: logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.") return self diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 8367e2e413c13e..20b9f6dad231d1 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -60,7 +60,9 @@ from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator from .debug_utils import DebugOption, DebugUnderflowOverflow from .feature_extraction_sequence_utils import SequenceFeatureExtractor +from .feature_extraction_utils import FeatureExtractionMixin from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend +from .image_processing_utils import BaseImageProcessor from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available from .integrations.tpu import tpu_spmd_dataloader from .modelcard import TrainingSummary @@ -70,6 +72,7 @@ MODEL_MAPPING_NAMES, ) from .optimization import Adafactor, get_scheduler +from .processing_utils import ProcessorMixin from .pytorch_utils import ( ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13, @@ -174,6 +177,7 @@ logging, strtobool, ) +from .utils.deprecation import deprecate_kwarg from .utils.quantization_config import QuantizationMethod @@ -225,6 +229,7 @@ if is_accelerate_available(): from accelerate import Accelerator, skip_first_batches from accelerate import __version__ as accelerate_version + from accelerate.state import AcceleratorState from accelerate.utils import ( DistributedDataParallelKwargs, DistributedType, @@ -308,8 +313,8 @@ class Trainer: `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided. 
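A small sketch of how the new `compare_pipeline_output_to_hub_spec` helper behaves; `ClassificationSpec` is an assumed stand-in for a Hub task-spec dataclass, not something defined in this diff.

from dataclasses import dataclass
from typing import Optional

from transformers.testing_utils import compare_pipeline_output_to_hub_spec


@dataclass
class ClassificationSpec:
    label: str  # no default, so this key is required
    score: float  # no default, so this key is required
    extra: Optional[str] = None  # has a default, so this key is optional

# Passes: both required keys are present and nothing unexpected appears.
compare_pipeline_output_to_hub_spec({"label": "cat", "score": 0.98}, ClassificationSpec)

# Raises KeyError: "score" is missing and "confidence" is not part of the spec.
# compare_pipeline_output_to_hub_spec({"label": "cat", "confidence": 0.98}, ClassificationSpec)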
data_collator (`DataCollator`, *optional*): The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will - default to [`default_data_collator`] if no `tokenizer` is provided, an instance of - [`DataCollatorWithPadding`] otherwise. + default to [`default_data_collator`] if no `processing_class` is provided; otherwise, if the `processing_class` is a + feature extractor or tokenizer, an instance of [`DataCollatorWithPadding`] is used. train_dataset (Union[`torch.utils.data.Dataset`, `torch.utils.data.IterableDataset`, `datasets.Dataset`], *optional*): The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. @@ -323,10 +328,11 @@ class Trainer: The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each dataset prepending the dictionary key to the metric name. - tokenizer ([`PreTrainedTokenizerBase`], *optional*): - The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the - maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an - interrupted training or reuse the fine-tuned model. + processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + This supersedes the `tokenizer` argument, which is now deprecated. model_init (`Callable[[], PreTrainedModel]`, *optional*): A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start from a new instance of the model as given by this function. @@ -376,6 +382,7 @@ class Trainer: # Those are used as methods of the Trainer in examples. from .trainer_pt_utils import _get_learning_rate, log_metrics, metrics_format, save_metrics, save_state + @deprecate_kwarg("tokenizer", new_name="processing_class", version="5.0.0", raise_if_both_names=True) def __init__( self, model: Union[PreTrainedModel, nn.Module] = None, @@ -383,7 +390,9 @@ def __init__( data_collator: Optional[DataCollator] = None, train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, @@ -401,9 +410,14 @@ def __init__( " boolean argument which will be triggered after the last batch of the eval set to signal that the" " summary statistics should be returned by the function." ) + if args.eval_strategy is not None and args.eval_strategy != "no" and eval_dataset is None: + raise ValueError( + f"You have set `args.eval_strategy` to {args.eval_strategy} but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. 
" + ) self.args = args # Seed must be set before instantiating the model when using model enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) + self.hp_name = None self.deepspeed = None self.is_in_train = False @@ -537,14 +551,15 @@ def __init__( self.place_model_on_device = False default_collator = ( - DataCollatorWithPadding(tokenizer) - if tokenizer is not None and isinstance(tokenizer, (PreTrainedTokenizerBase, SequenceFeatureExtractor)) + DataCollatorWithPadding(processing_class) + if processing_class is not None + and isinstance(processing_class, (PreTrainedTokenizerBase, SequenceFeatureExtractor)) else default_data_collator ) self.data_collator = data_collator if data_collator is not None else default_collator self.train_dataset = train_dataset self.eval_dataset = eval_dataset - self.tokenizer = tokenizer + self.processing_class = processing_class # Bnb Quantized models doesn't support `.to` operation. if ( @@ -586,17 +601,17 @@ def __init__( " `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and" " `model.to(xm.xla_device())` is performed before the optimizer creation in your script." ) - if (self.is_deepspeed_enabled or self.is_fsdp_xla_enabled or self.is_fsdp_enabled) and ( + if (self.is_fsdp_xla_enabled or self.is_fsdp_enabled) and ( self.optimizer is not None or self.lr_scheduler is not None ): raise RuntimeError( - "Passing `optimizers` is not allowed if Deepspeed or PyTorch FSDP is enabled. " + "Passing `optimizers` is not allowed if PyTorch FSDP is enabled. " "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method." ) default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to) callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks self.callback_handler = CallbackHandler( - callbacks, self.model, self.tokenizer, self.optimizer, self.lr_scheduler + callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler ) self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK) @@ -724,6 +739,18 @@ def __init__( xs.set_global_mesh(xs.Mesh(np.array(range(num_devices)), (num_devices, 1), axis_names=("fsdp", "tensor"))) self.is_fsdp_xla_v1_enabled = self.is_fsdp_xla_enabled and not self.is_fsdp_xla_v2_enabled + @property + def tokenizer(self) -> Optional[PreTrainedTokenizerBase]: + logger.warning("Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.") + return self.processing_class + + @tokenizer.setter + def tokenizer(self, processing_class) -> None: + logger.warning( + "Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead." 
+ ) + self.processing_class = processing_class + def _activate_neftune(self, model): r""" Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper: @@ -883,7 +910,9 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: ) else: lengths = None - model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None + model_input_name = ( + self.processing_class.model_input_names[0] if self.processing_class is not None else None + ) return LengthGroupedSampler( self.args.train_batch_size * self.args.gradient_accumulation_steps, dataset=self.train_dataset, @@ -1649,6 +1678,10 @@ def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]): self.args.hf_deepspeed_config.trainer_config_process(self.args) self.args.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.args.hf_deepspeed_config) + # From 1.0 on, we need to fully wipe the DS plugin when doing sweeps. + # Simply calling `_reset_state` is enough and doesn't need a version pin. + AcceleratorState()._reset_state() + self.create_accelerator_and_postprocess() def _report_to_hp_search(self, trial: Union["optuna.Trial", Dict[str, Any]], step: int, metrics: Dict[str, float]): @@ -2881,8 +2914,11 @@ def _evaluate(self, trial, ignore_keys_for_eval, skip_scheduler=False): self.lr_scheduler.step(metrics[metric_to_check]) except KeyError as exc: raise KeyError( - f"The `metric_for_best_model` training argument is set to '{metric_to_check}', which is not found in the evaluation metrics. " - f"The available evaluation metrics are: {list(metrics.keys())}. Consider changing the `metric_for_best_model` via the TrainingArguments." + f"The `metric_for_best_model` training argument is set to '{metric_to_check}', " + f"which is not found in the evaluation metrics. " + f"The available evaluation metrics are: {list(metrics.keys())}. " + f"Please ensure that the `compute_metrics` function returns a dictionary that includes '{metric_to_check}' or " + f"consider changing the `metric_for_best_model` via the TrainingArguments." ) from exc return metrics @@ -3022,8 +3058,11 @@ def _save_checkpoint(self, model, trial, metrics=None): metric_value = metrics[metric_to_check] except KeyError as exc: raise KeyError( - f"The `metric_for_best_model` training argument is set to '{metric_to_check}', which is not found in the evaluation metrics. " - f"The available evaluation metrics are: {list(metrics.keys())}. Consider changing the `metric_for_best_model` via the TrainingArguments." + f"The `metric_for_best_model` training argument is set to '{metric_to_check}', " + f"which is not found in the evaluation metrics. " + f"The available evaluation metrics are: {list(metrics.keys())}. " + f"Please ensure that the `compute_metrics` function returns a dictionary that includes '{metric_to_check}' or " + f"consider changing the `metric_for_best_model` via the TrainingArguments." 
) from exc operator = np.greater if self.args.greater_is_better else np.less @@ -3695,8 +3734,8 @@ def _save_tpu(self, output_dir: Optional[str] = None): safe_serialization=self.args.save_safetensors, state_dict=xm._maybe_convert_to_cpu(model.state_dict()), ) - if self.tokenizer is not None and self.args.should_save: - self.tokenizer.save_pretrained(output_dir) + if self.processing_class is not None and self.args.should_save: + self.processing_class.save_pretrained(output_dir) def _save(self, output_dir: Optional[str] = None, state_dict=None): # If we are executing this function, we are the process zero, so we don't check for that. @@ -3728,8 +3767,8 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors ) - if self.tokenizer is not None: - self.tokenizer.save_pretrained(output_dir) + if self.processing_class is not None: + self.processing_class.save_pretrained(output_dir) # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) @@ -3992,7 +4031,7 @@ def evaluation_loop( start_time = time.time() model = ( self.accelerator.prepare(model) - if self.is_deepspeed_enabled + if self.is_deepspeed_enabled or self.is_fsdp_enabled else self.accelerator.prepare_model(model, evaluation_mode=True) ) self.model_preparation_time = round(time.time() - start_time, 4) @@ -4043,6 +4082,7 @@ def evaluation_loop( all_inputs = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) metrics = None + eval_set_kwargs = {} # Will be useful when we have an iterable dataset so don't know its length. observed_num_examples = 0 @@ -4060,7 +4100,9 @@ def evaluation_loop( # Prediction step losses, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) main_input_name = getattr(self.model, "main_input_name", "input_ids") - inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None + inputs_decode = ( + self._prepare_input(inputs[main_input_name]) if "inputs" in args.include_for_metrics else None + ) if is_torch_xla_available(): xm.mark_step() @@ -4094,16 +4136,13 @@ def evaluation_loop( if self.args.batch_eval_metrics: if self.compute_metrics is not None and logits is not None and labels is not None: is_last_step = self.accelerator.gradient_state.end_of_dataloader - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=logits, label_ids=labels, inputs=inputs), - compute_result=is_last_step, - ) - else: - metrics = self.compute_metrics( - EvalPrediction(predictions=logits, label_ids=labels), - compute_result=is_last_step, - ) + batch_kwargs = {} + batch_kwargs["losses"] = losses if "loss" in args.include_for_metrics else None + batch_kwargs["inputs"] = inputs if "inputs" in args.include_for_metrics else None + metrics = self.compute_metrics( + EvalPrediction(predictions=logits, label_ids=labels, **batch_kwargs), + compute_result=is_last_step, + ) del losses, logits, labels, inputs torch.cuda.empty_cache() @@ -4152,12 +4191,11 @@ def evaluation_loop( and all_labels is not None and not self.args.batch_eval_metrics ): - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) + 
eval_set_kwargs["losses"] = all_losses if "loss" in args.include_for_metrics else None + eval_set_kwargs["inputs"] = all_inputs if "inputs" in args.include_for_metrics else None + metrics = self.compute_metrics( + EvalPrediction(predictions=all_preds, label_ids=all_labels, **eval_set_kwargs) + ) elif metrics is None: metrics = {} @@ -4439,9 +4477,9 @@ def _push_from_checkpoint(self, checkpoint_folder): for modeling_file in modeling_files: if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)): shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file)) - # Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure. - if self.tokenizer is not None: - self.tokenizer.save_pretrained(output_dir) + # Saving the processing class is fast and we don't know how many files it may have spawned, so we resave it to be sure. + if self.processing_class is not None: + self.processing_class.save_pretrained(output_dir) # Same for the training arguments torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) @@ -4496,7 +4534,7 @@ def push_to_hub( **kwargs, ) -> str: """ - Upload `self.model` and `self.tokenizer` to the 🤗 model hub on the repo `self.args.hub_model_id`. + Upload `self.model` and `self.processing_class` to the 🤗 model hub on the repo `self.args.hub_model_id`. Parameters: commit_message (`str`, *optional*, defaults to `"End of training"`): @@ -4596,7 +4634,7 @@ def prediction_loop( if len(self.accelerator._models) == 0 and model is self.model: model = ( self.accelerator.prepare(model) - if self.is_deepspeed_enabled + if self.is_deepspeed_enabled or self.is_fsdp_enabled else self.accelerator.prepare_model(model, evaluation_mode=True) ) @@ -4630,6 +4668,7 @@ def prediction_loop( labels_host: Union[torch.Tensor, List[torch.Tensor]] = None inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None metrics: Optional[dict] = None + eval_set_kwargs: dict = {} world_size = max(1, args.world_size) @@ -4656,7 +4695,9 @@ def prediction_loop( for step, inputs in enumerate(dataloader): loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) main_input_name = getattr(self.model, "main_input_name", "input_ids") - inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None + inputs_decode = ( + self._prepare_input(inputs[main_input_name]) if "inputs" in args.include_for_metrics else None + ) if loss is not None: losses = loss.repeat(batch_size) @@ -4676,16 +4717,13 @@ def prediction_loop( if self.args.batch_eval_metrics: if self.compute_metrics is not None and preds_host is not None and labels_host is not None: is_last_step = self.accelerator.gradient_state.end_of_dataloader - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=preds_host, label_ids=labels_host, inputs=inputs_host), - compute_result=is_last_step, - ) - else: - metrics = self.compute_metrics( - EvalPrediction(predictions=preds_host, label_ids=labels_host), - compute_result=is_last_step, - ) + batch_kwargs = {} + batch_kwargs["losses"] = losses_host if "loss" in args.include_for_metrics else None + batch_kwargs["inputs"] = inputs_host if "inputs" in args.include_for_metrics else None + metrics = self.compute_metrics( + EvalPrediction(predictions=preds_host, label_ids=labels_host, **batch_kwargs), + compute_result=is_last_step, + ) if self.args.batch_eval_metrics or ( 
args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0 @@ -4724,12 +4762,9 @@ def prediction_loop( and label_ids is not None and not self.args.batch_eval_metrics ): - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) + eval_set_kwargs["losses"] = eval_loss if "loss" in args.include_for_metrics else None + eval_set_kwargs["inputs"] = inputs_ids if "inputs" in args.include_for_metrics else None + metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids, **eval_set_kwargs)) elif metrics is None: metrics = {} @@ -4830,6 +4865,9 @@ def create_accelerator_and_postprocess(self): even_batches=accelerator_config.pop("even_batches"), use_seedable_sampler=accelerator_config.pop("use_seedable_sampler"), ) + if is_accelerate_available("1.1.0"): + dataloader_config.data_seed = self.args.data_seed + non_blocking = accelerator_config.pop("non_blocking") if not is_accelerate_available("0.30.0"): if non_blocking: diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index d457a65993db42..405874acf8f4c4 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -272,7 +272,9 @@ class TrainerCallback: model ([`PreTrainedModel`] or `torch.nn.Module`): The model being trained. tokenizer ([`PreTrainedTokenizer`]): - The tokenizer used for encoding the data. + The tokenizer used for encoding the data. This is deprecated in favour of `processing_class`. + processing_class ([`PreTrainedTokenizer` or `BaseImageProcessor` or `ProcessorMixin` or `FeatureExtractionMixin`]): + The processing class used for encoding the data. Can be a tokenizer, a processor, an image processor or a feature extractor. optimizer (`torch.optim.Optimizer`): The optimizer used for the training steps. lr_scheduler (`torch.optim.lr_scheduler.LambdaLR`): @@ -403,12 +405,12 @@ def on_prediction_step(self, args: TrainingArguments, state: TrainerState, contr class CallbackHandler(TrainerCallback): """Internal class that just calls the list of callbacks in order.""" - def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler): + def __init__(self, callbacks, model, processing_class, optimizer, lr_scheduler): self.callbacks = [] for cb in callbacks: self.add_callback(cb) self.model = model - self.tokenizer = tokenizer + self.processing_class = processing_class self.optimizer = optimizer self.lr_scheduler = lr_scheduler self.train_dataloader = None @@ -518,7 +520,7 @@ def call_event(self, event, args, state, control, **kwargs): state, control, model=self.model, - tokenizer=self.tokenizer, + processing_class=self.processing_class, optimizer=self.optimizer, lr_scheduler=self.lr_scheduler, train_dataloader=self.train_dataloader, diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py index abc45cffe4aeea..07d0571e44c9b9 100644 --- a/src/transformers/trainer_seq2seq.py +++ b/src/transformers/trainer_seq2seq.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
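A hedged sketch of the renamed `Trainer` constructor argument; the `model`, `train_dataset`, and `tokenizer` objects are placeholders for whatever the surrounding training script already builds.

from transformers import Trainer, TrainingArguments

args = TrainingArguments(output_dir="out")

trainer = Trainer(
    model=model,  # placeholder: any PreTrainedModel
    args=args,
    train_dataset=train_dataset,  # placeholder: any dataset
    processing_class=tokenizer,  # previously `tokenizer=tokenizer`; the old name now warns via `deprecate_kwarg`
)

# Reading `trainer.tokenizer` still resolves to `trainer.processing_class`, but logs a deprecation warning.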
+import contextlib import warnings from copy import deepcopy from pathlib import Path @@ -19,17 +20,28 @@ import torch from torch import nn +from torch.distributed.fsdp import FullyShardedDataParallel from torch.utils.data import Dataset from .generation.configuration_utils import GenerationConfig from .integrations.deepspeed import is_deepspeed_zero3_enabled +from .integrations.fsdp import is_fsdp_managed_module from .trainer import Trainer -from .utils import logging +from .utils import is_datasets_available, logging +from .utils.deprecation import deprecate_kwarg +if is_datasets_available(): + import datasets + if TYPE_CHECKING: + from torch.utils.data import IterableDataset + from .data.data_collator import DataCollator + from .feature_extraction_utils import FeatureExtractionMixin + from .image_processing_utils import BaseImageProcessor from .modeling_utils import PreTrainedModel + from .processing_utils import ProcessorMixin from .tokenization_utils_base import PreTrainedTokenizerBase from .trainer_callback import TrainerCallback from .trainer_utils import EvalPrediction, PredictionOutput @@ -40,14 +52,17 @@ class Seq2SeqTrainer(Trainer): + @deprecate_kwarg("tokenizer", new_name="processing_class", version="5.0.0", raise_if_both_names=True) def __init__( self, model: Union["PreTrainedModel", nn.Module] = None, args: "TrainingArguments" = None, data_collator: Optional["DataCollator"] = None, - train_dataset: Optional[Dataset] = None, + train_dataset: Optional[Union[Dataset, "IterableDataset", "datasets.Dataset"]] = None, eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, - tokenizer: Optional["PreTrainedTokenizerBase"] = None, + processing_class: Optional[ + Union["PreTrainedTokenizerBase", "BaseImageProcessor", "FeatureExtractionMixin", "ProcessorMixin"] + ] = None, model_init: Optional[Callable[[], "PreTrainedModel"]] = None, compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None, callbacks: Optional[List["TrainerCallback"]] = None, @@ -60,7 +75,7 @@ def __init__( data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, - tokenizer=tokenizer, + processing_class=processing_class, model_init=model_init, compute_metrics=compute_metrics, callbacks=callbacks, @@ -291,10 +306,8 @@ def prediction_step( if "max_length" in gen_kwargs and gen_kwargs["max_length"] is None: gen_kwargs.pop("max_length") - default_synced_gpus = True if is_deepspeed_zero3_enabled() else False - gen_kwargs["synced_gpus"] = ( - gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus - ) + default_synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self.model) + gen_kwargs["synced_gpus"] = gen_kwargs.get("synced_gpus", default_synced_gpus) generation_inputs = inputs.copy() # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate @@ -307,7 +320,15 @@ def prediction_step( generation_inputs = { k: v for k, v in inputs.items() if k not in ("decoder_input_ids", "decoder_attention_mask") } - generated_tokens = self.model.generate(**generation_inputs, **gen_kwargs) + + summon_full_params_context = ( + FullyShardedDataParallel.summon_full_params(self.model) + if isinstance(self.model, FullyShardedDataParallel) + else contextlib.nullcontext() + ) + + with summon_full_params_context: + generated_tokens = self.model.generate(**generation_inputs, **gen_kwargs) # Temporary hack to ensure the generation config is not initialized for each iteration of the 
evaluation loop # TODO: remove this hack when the legacy code that initializes generation_config from a model config is diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 42644a31bef636..02c298cf7d2e65 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -156,7 +156,8 @@ class EvalPrediction: Parameters: predictions (`np.ndarray`): Predictions of the model. label_ids (`np.ndarray`): Targets to be matched. - inputs (`np.ndarray`, *optional*): + inputs (`np.ndarray`, *optional*): Input data passed to the model. + losses (`np.ndarray`, *optional*): Loss values computed during evaluation. """ def __init__( @@ -164,28 +165,25 @@ def __init__( predictions: Union[np.ndarray, Tuple[np.ndarray]], label_ids: Union[np.ndarray, Tuple[np.ndarray]], inputs: Optional[Union[np.ndarray, Tuple[np.ndarray]]] = None, + losses: Optional[Union[np.ndarray, Tuple[np.ndarray]]] = None, ): self.predictions = predictions self.label_ids = label_ids self.inputs = inputs + self.losses = losses + self.elements = (self.predictions, self.label_ids) + if self.inputs is not None: + self.elements += (self.inputs,) + if self.losses is not None: + self.elements += (self.losses,) def __iter__(self): - if self.inputs is not None: - return iter((self.predictions, self.label_ids, self.inputs)) - else: - return iter((self.predictions, self.label_ids)) + return iter(self.elements) def __getitem__(self, idx): - if idx < 0 or idx > 2: - raise IndexError("tuple index out of range") - if idx == 2 and self.inputs is None: + if idx < 0 or idx >= len(self.elements): raise IndexError("tuple index out of range") - if idx == 0: - return self.predictions - elif idx == 1: - return self.label_ids - elif idx == 2: - return self.inputs + return self.elements[idx] class EvalLoopOutput(NamedTuple): diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 596917928350bf..a2d83b2915eba4 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -683,9 +683,9 @@ class TrainingArguments: hub_strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`): Defines the scope of what is pushed to the Hub and when. Possible values are: - - `"end"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and a + - `"end"`: push the model, its configuration, the processing class e.g. tokenizer (if passed along to the [`Trainer`]) and a draft of a model card when the [`~Trainer.save_model`] method is called. - - `"every_save"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and + - `"every_save"`: push the model, its configuration, the processing class e.g. tokenizer (if passed along to the [`Trainer`]) and a draft of a model card each time there is a model save. The pushes are asynchronous to not block training, and in case the save are very frequent, a new push is only attempted if the previous one is finished. A last push is made with the final model at the end of training. @@ -707,8 +707,12 @@ class TrainingArguments: gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`): Key word arguments to be passed to the `gradient_checkpointing_enable` method. include_inputs_for_metrics (`bool`, *optional*, defaults to `False`): - Whether or not the inputs will be passed to the `compute_metrics` function. This is intended for metrics - that need inputs, predictions and references for scoring calculation in Metric class. 
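A quick illustration of the reworked `EvalPrediction` above (a sketch with made-up arrays): iteration and indexing now adapt to whichever optional fields were supplied.

import numpy as np
from transformers.trainer_utils import EvalPrediction

pred = EvalPrediction(
    predictions=np.zeros((2, 3)),
    label_ids=np.zeros((2,), dtype=np.int64),
    losses=np.array([0.25, 0.75]),  # new optional field
)
# With `inputs` omitted, the tuple view is (predictions, label_ids, losses).
assert len(tuple(pred)) == 3
assert pred[2] is pred.losses  # index-based access follows the same order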
+ This argument is deprecated. Use `include_for_metrics` instead, e.g., `include_for_metrics = ["inputs"]`. + include_for_metrics (`List[str]`, *optional*, defaults to `[]`): + Include additional data in the `compute_metrics` function if needed for metrics computation. + Possible options to add to `include_for_metrics` list: + - `"inputs"`: Input data passed to the model, intended for calculating input-dependent metrics. + - `"loss"`: Loss values computed during evaluation, intended for calculating loss-dependent metrics. eval_do_concat_batches (`bool`, *optional*, defaults to `True`): Whether to recursively concat inputs/losses/labels/predictions across batches. If `False`, will instead store them as lists, with each batch kept separate. @@ -1362,7 +1366,17 @@ class TrainingArguments: }, ) include_inputs_for_metrics: bool = field( - default=False, metadata={"help": "Whether or not the inputs will be passed to the `compute_metrics` function."} + default=False, + metadata={ + "help": "This argument is deprecated and will be removed in version 5 of 🤗 Transformers. Use `include_for_metrics` instead." + }, + ) + include_for_metrics: List[str] = field( + default_factory=list, + metadata={ + "help": "List of strings to specify additional data to include in the `compute_metrics` function." + " Options: 'inputs', 'loss'." + }, ) eval_do_concat_batches: bool = field( default=True, @@ -2064,6 +2078,19 @@ def __post_init__(self): "This is not supported and we recommend you to update your version." ) + if self.data_seed is not None: + if not is_accelerate_available("1.1.0"): + raise NotImplementedError( + "data_seed requires `accelerate` >= 1.1.0. " + "This is not supported and we recommend you to update your version." + ) + + if self.include_inputs_for_metrics: + logger.warning( + "Using `include_inputs_for_metrics` is deprecated and will be removed in version 5 of 🤗 Transformers. Please use the `include_for_metrics` list argument instead." + ) + self.include_for_metrics.append("inputs") + def __str__(self): self_as_dict = asdict(self) @@ -2834,9 +2861,9 @@ def set_push_to_hub( strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`): Defines the scope of what is pushed to the Hub and when. Possible values are: - - `"end"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and a + - `"end"`: push the model, its configuration, the processing_class e.g. tokenizer (if passed along to the [`Trainer`]) and a draft of a model card when the [`~Trainer.save_model`] method is called. - - `"every_save"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) + - `"every_save"`: push the model, its configuration, the processing_class e.g. tokenizer (if passed along to the [`Trainer`]) and a draft of a model card each time there is a model save.
The pushes are asynchronous to not block training, and in case the save are very frequent, a new push is only attempted if the previous one is diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 134da3474becc0..3b33127be4ba53 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -163,6 +163,7 @@ is_onnx_available, is_openai_available, is_optimum_available, + is_optimum_quanto_available, is_pandas_available, is_peft_available, is_phonemizer_available, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index f4e471ee7ab58b..048de1cc8ae77a 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -707,6 +707,9 @@ def __init__(self, *args, **kwargs): MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None + + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None @@ -874,6 +877,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class AutoModelForImageTextToText(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AutoModelForImageToImage(metaclass=DummyObject): _backends = ["torch"] @@ -7158,6 +7168,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class PhimoeForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PhimoeForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PhimoeModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PhimoePreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Pix2StructForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] @@ -9947,6 +9985,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class ZambaForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ZambaForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ZambaModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ZambaPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class ZoeDepthForDepthEstimation(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index a98b17e4bd5739..519755489a3373 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -143,7 +143,14 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ # `importlib.metadata.version` doesn't work with `awq` _auto_awq_available = importlib.util.find_spec("awq") is not None _quanto_available = _is_package_available("quanto") -_compressed_tensors_available = _is_package_available("compressed_tensors") +_is_optimum_quanto_available = False +try: + 
importlib.metadata.version("optimum_quanto") + _is_optimum_quanto_available = True +except importlib.metadata.PackageNotFoundError: + _is_optimum_quanto_available = False +# For compressed_tensors, only check spec to allow compressed_tensors-nightly package +_compressed_tensors_available = importlib.util.find_spec("compressed_tensors") is not None _pandas_available = _is_package_available("pandas") _peft_available = _is_package_available("peft") _phonemizer_available = _is_package_available("phonemizer") @@ -962,9 +969,17 @@ def is_auto_awq_available(): def is_quanto_available(): + logger.warning_once( + "Importing from quanto will be deprecated in v4.47. Please install optimum-quanto instead: `pip install optimum-quanto`" + ) return _quanto_available +def is_optimum_quanto_available(): + # `importlib.metadata.version` doesn't work with `optimum.quanto`, so we check for `optimum_quanto` instead + return _is_optimum_quanto_available + + def is_compressed_tensors_available(): return _compressed_tensors_available diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 8be0bb672e51b8..026a2066798574 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -45,12 +45,14 @@ class QuantizationMethod(str, Enum): COMPRESSED_TENSORS = "compressed-tensors" FBGEMM_FP8 = "fbgemm_fp8" TORCHAO = "torchao" + BITNET = "bitnet" class AWQLinearVersion(str, Enum): GEMM = "gemm" GEMV = "gemv" EXLLAMA = "exllama" + IPEX = "ipex" @staticmethod def from_str(version: str): @@ -61,6 +63,8 @@ def from_str(version: str): return AWQLinearVersion.GEMV elif version == "exllama": return AWQLinearVersion.EXLLAMA + elif version == "ipex": + return AWQLinearVersion.IPEX else: raise ValueError(f"Unknown AWQLinearVersion {version}") @@ -830,18 +834,20 @@ def post_init(self): r""" Safety checker that arguments are correct """ - if not torch.cuda.is_available(): - raise ValueError("AWQ is only available on GPU") - if self.backend not in [AwqBackendPackingMethod.AUTOAWQ, AwqBackendPackingMethod.LLMAWQ]: raise ValueError( f"Only supported quantization backends in {AwqBackendPackingMethod.AUTOAWQ} and {AwqBackendPackingMethod.LLMAWQ} - not recognized backend {self.backend}" ) self.version = AWQLinearVersion.from_str(self.version) - if self.version not in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA]: + if self.version not in [ + AWQLinearVersion.GEMM, + AWQLinearVersion.GEMV, + AWQLinearVersion.EXLLAMA, + AWQLinearVersion.IPEX, + ]: raise ValueError( - f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA] - not recognized version {self.version}" + f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA, AWQLinearVersion.IPEX] - not recognized version {self.version}" ) if self.backend == AwqBackendPackingMethod.LLMAWQ: @@ -1105,7 +1111,7 @@ def __init__( self.sparsity_config = None # parse from dict to load nested QuantizationScheme objects - if config_groups: + if config_groups or kv_cache_scheme: self.quantization_config = QuantizationConfig.parse_obj( { "config_groups": config_groups, @@ -1303,4 +1309,22 @@ def get_apply_tensor_subclass(self): return _STR_TO_METHOD[self.quant_type](**self.quant_type_kwargs) def __repr__(self): - return f"{self.quant_type}({', '.join(str(k) + '=' + str(v) for k, v in self.quant_type_kwargs.items())})" + return f"{self.quant_type}({', '.join(str(k) + '=' + str(v) for k, v 
in self.kwargs.items())})" + + +@dataclass +class BitNetConfig(QuantizationConfigMixin): + def __init__( + self, + modules_to_not_convert: Optional[List] = None, + **kwargs, + ): + self.quant_method = QuantizationMethod.BITNET + self.modules_to_not_convert = modules_to_not_convert + self.post_init() + + def post_init(self): + r""" + Safety checker that arguments are correct + """ + pass diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py index 0b27b49212937b..c2b753c103e184 100755 --- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py +++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py @@ -450,7 +450,7 @@ def tokenize_function(examples): args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, ) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index b635833706d136..8eaa00bc768428 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -1036,7 +1036,7 @@ def get_dataset(): trainer = Trainer( model=model, - tokenizer=tokenizer, + processing_class=tokenizer, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, diff --git a/tests/generation/test_fsdp.py b/tests/generation/test_fsdp.py new file mode 100644 index 00000000000000..904ccdea63baf9 --- /dev/null +++ b/tests/generation/test_fsdp.py @@ -0,0 +1,148 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
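The FSDP test added below, like the `summon_full_params` context introduced in `Seq2SeqTrainer.prediction_step` above, gathers the full parameters for the duration of `generate`. A minimal sketch of that pattern (the helper name is ours, not part of the patch):

import contextlib
from torch.distributed.fsdp import FullyShardedDataParallel

def fsdp_safe_generate(model, **generation_inputs):
    # Gather the sharded parameters only while generating.
    ctx = (
        FullyShardedDataParallel.summon_full_params(model)
        if isinstance(model, FullyShardedDataParallel)
        else contextlib.nullcontext()
    )
    with ctx:
        # Attribute lookups on the FSDP wrapper fall through to the wrapped module,
        # so `generate` resolves to the underlying PreTrainedModel method.
        return model.generate(**generation_inputs)

Combined with `synced_gpus=True`, which the trainer now defaults to for FSDP-managed modules, this keeps ranks that finish generating early in lockstep with the rest.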
+ +import argparse +from typing import Any, Callable + +from transformers import is_torch_available +from transformers.testing_utils import ( + TestCasePlus, + execute_subprocess_async, + get_torch_dist_unique_port, + require_torch_multi_gpu, +) + + +if is_torch_available(): + import functools + + import torch + import torch.distributed + from torch.distributed._composable.fsdp import fully_shard, register_fsdp_forward_method + from torch.distributed.device_mesh import init_device_mesh + from torch.distributed.fsdp import FullyShardedDataParallel + from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy + + from transformers import AutoModelForCausalLM, AutoTokenizer + from transformers.models.gpt2.modeling_gpt2 import GPT2Block + + data = 4 * [ + "Hello world!", + "The quick brown fox jumps over the lazy dog.", + ] + + def manage_process_group(func: Callable[..., Any]) -> Callable[..., Any]: + """Manage the creation and destruction of the distributed process group for the wrapped function.""" + + def wrapped(*args: Any, **kwargs: Any) -> Any: + torch.distributed.init_process_group(world_size=torch.cuda.device_count()) + try: + return func(*args, **kwargs) + finally: + torch.distributed.destroy_process_group() + + return wrapped + + @manage_process_group + def fsdp_generate(): + torch.cuda.set_device(device := torch.device(rank := torch.distributed.get_rank())) + + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(device) + + fsdp_model = FullyShardedDataParallel( + model, + auto_wrap_policy=functools.partial(transformer_auto_wrap_policy, transformer_layer_cls={GPT2Block}), + limit_all_gathers=True, + use_orig_params=True, + ) + + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + batch = tokenizer(data[rank], return_tensors="pt", return_attention_mask=True).to(device) + + with FullyShardedDataParallel.summon_full_params(fsdp_model): + _ = fsdp_model.module.generate( + input_ids=batch["input_ids"], + attention_mask=batch["attention_mask"], + max_length=30, + ) + + @manage_process_group + def fsdp2_generate(): + torch.cuda.set_device(device := torch.device(rank := torch.distributed.get_rank())) + + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(device) + + mesh = init_device_mesh("cuda", (torch.distributed.get_world_size(),)) + for submodule in model.modules(): + if isinstance(submodule, GPT2Block): + fully_shard(submodule, mesh=mesh) + fully_shard(model, mesh=mesh) + + register_fsdp_forward_method(model, "generate") + + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + batch = tokenizer(data[rank], return_tensors="pt", return_attention_mask=True).to(device) + + _ = model.generate( + input_ids=batch["input_ids"], + attention_mask=batch["attention_mask"], + max_length=30, + ) + + +class TestFSDPGeneration(TestCasePlus): + @require_torch_multi_gpu + def test_fsdp_generate(self): + distributed_args = f"""--nproc_per_node={torch.cuda.device_count()} + --master_port={get_torch_dist_unique_port()} + {self.test_file_dir}/test_fsdp.py + """.split() + args = "--fsdp".split() + cmd = ["torchrun"] + distributed_args + args + execute_subprocess_async(cmd, env=self.get_env()) + # successful return here == success - any errors would have caused an error in the sub-call + + @require_torch_multi_gpu + def test_fsdp2_generate(self): + distributed_args = f"""--nproc_per_node={torch.cuda.device_count()} + --master_port={get_torch_dist_unique_port()} + 
{self.test_file_dir}/test_fsdp.py + """.split() + args = "--fsdp2".split() + cmd = ["torchrun"] + distributed_args + args + execute_subprocess_async(cmd, env=self.get_env()) + # successful return here == success - any errors would have caused an error in the sub-call + + +if __name__ == "__main__": + # The script below is meant to be run under torch.distributed, on a machine with multiple GPUs: + # + # PYTHONPATH="src" python -m torch.distributed.run --nproc_per_node 2 --output_dir output_dir ./tests/generation/test_fsdp.py --fsdp + + class CLIArgs(argparse.Namespace): + fsdp: bool + fsdp2: bool + + parser = argparse.ArgumentParser() + group = parser.add_mutually_exclusive_group() + group.add_argument("--fsdp", action="store_true") + group.add_argument("--fsdp2", action="store_true") + args = parser.parse_args(namespace=CLIArgs()) + + if args.fsdp: + fsdp_generate() + elif args.fsdp2: + fsdp2_generate() + else: + raise ValueError("Missing test selection") diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index beb5fc7818f82c..1727aed1117bc6 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -24,12 +24,12 @@ import pytest from parameterized import parameterized -from transformers import is_torch_available, pipeline, set_seed +from transformers import AutoConfig, is_torch_available, pipeline, set_seed from transformers.testing_utils import ( is_flaky, require_accelerate, require_auto_gptq, - require_quanto, + require_optimum_quanto, require_torch, require_torch_gpu, require_torch_multi_accelerator, @@ -88,50 +88,51 @@ WatermarkDetector, WatermarkingConfig, ) + from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers from transformers.generation.utils import _speculative_sampling class GenerationTesterMixin: model_tester = None all_generative_model_classes = () - input_name = "input_ids" max_new_tokens = 3 - def _get_input_ids_and_config(self, batch_size=2): + def prepare_config_and_inputs_for_generate(self, batch_size=2): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # TODO: @raushan or @gante, use `model.main_input_name` as the main input instead of relyinn on `input_ids` - input_ids = inputs_dict.pop(self.input_name)[:batch_size, :] - inputs_dict.pop("attention_mask", None) - # we don't want encoder-decoder models to start from filled decoder ids - inputs_dict.pop("decoder_input_ids", None) - inputs_dict.pop("decoder_attention_mask", None) - - # we'll set cache use in each test differently - inputs_dict.pop("use_cache", None) - - inputs_dict = { - k: v[:batch_size, ...] + # We don't want a few model inputs in our model input dictionary for generation tests + input_keys_to_ignore = [ + # we don't want to mask attention heads + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", + # we don't want encoder-decoder models to start from filled decoder ids + "decoder_input_ids", + "decoder_attention_mask", + # we'll set cache use in each test differently + "use_cache", + # Ignore labels if it is in the input dict + "labels", + # model-specific exceptions should overload/overwrite this function + ] + filtered_inputs_dict = { + k: v[:batch_size, ...] 
if isinstance(v, torch.Tensor) else v for k, v in inputs_dict.items() - if "head_mask" not in k and isinstance(v, torch.Tensor) + if k not in input_keys_to_ignore } - if config.eos_token_id is not None and config.pad_token_id is None: - # hack to allow generate for models such as GPT2 as is done in `generate()` - if isinstance(config.eos_token_id, int): - config.eos_token_id = [config.eos_token_id] - config.pad_token_id = config.eos_token_id[0] - - if self.has_attentions: - attention_mask = torch.ones_like(input_ids, dtype=torch.long) - else: - attention_mask = None - # It is important set set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated - config.eos_token_id = None - config.forced_eos_token_id = None + # It is important set `eos_token_id` to `None` to avoid early stopping (would break for length-based checks) + text_gen_config = config.get_text_config(decoder=True) + if text_gen_config.eos_token_id is not None and text_gen_config.pad_token_id is None: + text_gen_config.pad_token_id = ( + text_gen_config.eos_token_id + if isinstance(text_gen_config.eos_token_id, int) + else text_gen_config.eos_token_id[0] + ) + text_gen_config.eos_token_id = None + text_gen_config.forced_eos_token_id = None - return config, input_ids, attention_mask, inputs_dict + return config, filtered_inputs_dict def _get_logits_processor_kwargs(self, do_sample=False, config=None): logits_processor_kwargs = { @@ -193,8 +194,6 @@ def _get_constrained_beam_kwargs(self, num_return_sequences=1): def _greedy_generate( self, model, - input_ids, - attention_mask, inputs_dict, output_scores=False, output_logits=False, @@ -204,9 +203,7 @@ def _greedy_generate( use_cache=True, ): logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( - input_ids, do_sample=False, num_beams=1, max_new_tokens=self.max_new_tokens, @@ -217,7 +214,6 @@ def _greedy_generate( return_dict_in_generate=return_dict_in_generate, use_cache=use_cache, **logits_processor_kwargs, - **model_kwargs, **inputs_dict, ) @@ -226,8 +222,6 @@ def _greedy_generate( def _sample_generate( self, model, - input_ids, - attention_mask, inputs_dict, num_return_sequences, output_scores=False, @@ -239,9 +233,7 @@ def _sample_generate( ): torch.manual_seed(0) logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=True, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( - input_ids, do_sample=True, num_beams=1, max_new_tokens=self.max_new_tokens, @@ -253,7 +245,6 @@ def _sample_generate( return_dict_in_generate=return_dict_in_generate, use_cache=use_cache, **logits_processor_kwargs, - **model_kwargs, **inputs_dict, ) @@ -262,8 +253,6 @@ def _sample_generate( def _beam_search_generate( self, model, - input_ids, - attention_mask, inputs_dict, beam_kwargs, output_scores=False, @@ -274,9 +263,7 @@ def _beam_search_generate( use_cache=True, ): logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( - input_ids, do_sample=False, max_new_tokens=self.max_new_tokens, output_scores=output_scores, @@ -287,7 +274,6 @@ def _beam_search_generate( use_cache=use_cache, **beam_kwargs, **logits_processor_kwargs, 
- **model_kwargs, **inputs_dict, ) @@ -296,8 +282,6 @@ def _beam_search_generate( def _beam_sample_generate( self, model, - input_ids, - attention_mask, inputs_dict, beam_kwargs, output_scores=False, @@ -309,9 +293,7 @@ def _beam_sample_generate( ): torch.manual_seed(0) logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=True, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( - input_ids, do_sample=True, max_new_tokens=self.max_new_tokens, output_scores=output_scores, @@ -322,7 +304,6 @@ def _beam_sample_generate( use_cache=use_cache, **beam_kwargs, **logits_processor_kwargs, - **model_kwargs, **inputs_dict, ) @@ -331,8 +312,6 @@ def _beam_sample_generate( def _group_beam_search_generate( self, model, - input_ids, - attention_mask, inputs_dict, beam_kwargs, output_scores=False, @@ -343,9 +322,7 @@ def _group_beam_search_generate( use_cache=True, ): logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( - input_ids, do_sample=False, max_new_tokens=self.max_new_tokens, output_scores=output_scores, @@ -356,7 +333,6 @@ def _group_beam_search_generate( use_cache=use_cache, **beam_kwargs, **logits_processor_kwargs, - **model_kwargs, **inputs_dict, ) @@ -365,8 +341,6 @@ def _group_beam_search_generate( def _constrained_beam_search_generate( self, model, - input_ids, - attention_mask, inputs_dict, constraints, beam_kwargs, @@ -378,9 +352,7 @@ def _constrained_beam_search_generate( use_cache=True, ): logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( - input_ids, do_sample=False, max_new_tokens=self.max_new_tokens, output_scores=output_scores, @@ -392,7 +364,6 @@ def _constrained_beam_search_generate( use_cache=use_cache, **beam_kwargs, **logits_processor_kwargs, - **model_kwargs, **inputs_dict, ) @@ -401,8 +372,6 @@ def _constrained_beam_search_generate( def _contrastive_generate( self, model, - input_ids, - attention_mask, inputs_dict, output_scores=False, output_logits=False, @@ -417,9 +386,7 @@ def _contrastive_generate( } logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( - input_ids, do_sample=False, num_beams=1, max_new_tokens=self.max_new_tokens, @@ -430,7 +397,6 @@ def _contrastive_generate( return_dict_in_generate=return_dict_in_generate, use_cache=use_cache, **logits_processor_kwargs, - **model_kwargs, **contrastive_search_kwargs, **inputs_dict, ) @@ -440,28 +406,26 @@ def _contrastive_generate( @pytest.mark.generate def test_greedy_generate(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() - output_generate = self._greedy_generate( - model=model, input_ids=input_ids, attention_mask=attention_mask, inputs_dict=inputs_dict - ) + output_generate = self._greedy_generate(model=model, inputs_dict=inputs_dict) if 
model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @pytest.mark.generate def test_greedy_generate_dict_outputs(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, output_scores=True, output_logits=True, @@ -477,17 +441,18 @@ def test_greedy_generate_dict_outputs(self): # Retrocompatibility check self.assertIsInstance(output_generate, GreedySearchEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) - self._check_outputs(output_generate, input_ids, model.config) + self._check_outputs(output_generate, main_input, model.config) @pytest.mark.generate def test_greedy_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -498,53 +463,45 @@ def test_greedy_generate_dict_outputs_use_cache(self): model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, output_scores=True, output_logits=True, output_hidden_states=True, output_attentions=self.has_attentions, return_dict_in_generate=True, - use_cache=True, + use_cache=True, # Enable cache ) if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) - self._check_outputs(output_generate, input_ids, model.config, use_cache=True) + self._check_outputs(output_generate, main_input, model.config, use_cache=True) @pytest.mark.generate def test_sample_generate(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() - output_generate = self._sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - num_return_sequences=1, - ) + output_generate = self._sample_generate(model=model, inputs_dict=inputs_dict, num_return_sequences=1) if 
model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @pytest.mark.generate def test_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, num_return_sequences=2, output_scores=True, @@ -561,45 +518,39 @@ def test_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, SampleEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) - self._check_outputs(output_generate, input_ids, model.config, num_return_sequences=2) + self._check_outputs(output_generate, main_input, model.config, num_return_sequences=2) @pytest.mark.generate def test_beam_search_generate(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() - output_generate = self._beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) + output_generate = self._beam_search_generate(model=model, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs) if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @pytest.mark.generate def test_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() output_generate = self._beam_search_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, output_scores=True, @@ -615,20 +566,20 @@ def test_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) 
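The length assertions now key off `model_class.main_input_name` instead of assuming `input_ids`, so the same check works for models whose main input is, for example, audio features. A small standalone illustration reusing the tiny checkpoint from the FSDP test (the prompt is arbitrary):

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

inputs = tok("Hello world!", return_tensors="pt")
main_input = inputs[model.main_input_name]  # "input_ids" for decoder-only text models

out = model.generate(**inputs, min_new_tokens=3, max_new_tokens=3, do_sample=False)
# Decoder-only models return prompt + newly generated tokens.
assert out.shape[-1] == main_input.shape[-1] + 3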
self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) self._check_outputs( - output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] ) @pytest.mark.generate def test_beam_search_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: - # enable cache - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -642,8 +593,6 @@ def test_beam_search_generate_dict_outputs_use_cache(self): model = model_class(config).to(torch_device).eval() output_generate = self._beam_search_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, output_scores=True, @@ -651,16 +600,20 @@ def test_beam_search_generate_dict_outputs_use_cache(self): output_hidden_states=True, output_attentions=self.has_attentions, return_dict_in_generate=True, - use_cache=True, + use_cache=True, # Enable cache ) if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) self._check_outputs( - output_generate, input_ids, model.config, use_cache=True, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + main_input, + model.config, + use_cache=True, + num_return_sequences=beam_kwargs["num_beams"], ) @require_accelerate @@ -674,7 +627,7 @@ def test_model_parallel_beam_search(self): if model_class._no_split_modules is None: continue - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() model = model_class(config).eval() with tempfile.TemporaryDirectory() as tmp_dir: @@ -682,8 +635,6 @@ def test_model_parallel_beam_search(self): new_model = model_class.from_pretrained(tmp_dir, device_map="auto") new_model.generate( - input_ids, - attention_mask=attention_mask, max_new_tokens=self.max_new_tokens, num_beams=2, **inputs_dict, @@ -692,14 +643,13 @@ def test_model_parallel_beam_search(self): @pytest.mark.generate def test_beam_sample_generate(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() output_generate = self._beam_sample_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) @@ -707,7 +657,7 @@ def test_beam_sample_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.shape[-1] == 
self.max_new_tokens + main_input.shape[-1]) # for VLMs inputs embeds won't match input ids unless images are encoded and merged with ids properly # no quick fix available, since obtaining image embeddings step is very model-specific @@ -721,12 +671,11 @@ def test_beam_sample_generate(self): "inputs_embeds" in prepare_inputs_for_generation_args and "cache_positions" in prepare_inputs_for_generation_args ): - input_embeds = model.get_input_embeddings()(input_ids) + input_embeds = model.get_input_embeddings()(inputs_dict["input_ids"]) beam_kwargs.update({"inputs_embeds": input_embeds}) output_generate2 = self._beam_sample_generate( model=model, input_ids=None, - attention_mask=attention_mask, inputs_dict={}, beam_kwargs=beam_kwargs, ) @@ -736,15 +685,14 @@ def test_beam_sample_generate(self): @pytest.mark.generate def test_beam_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() output_generate = self._beam_sample_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, output_scores=True, @@ -761,18 +709,18 @@ def test_beam_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSampleEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) self._check_outputs( - output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] ) @pytest.mark.generate def test_generate_without_input_ids(self): - config, _, _, _ = self._get_input_ids_and_config() + config, _ = self.prepare_config_and_inputs_for_generate() # if no bos token id => cannot generate from None if config.bos_token_id is None: @@ -794,49 +742,45 @@ def test_generate_without_input_ids(self): @pytest.mark.generate def test_group_beam_search_generate(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() # check `generate()` and `group_beam_search()` are equal beam_kwargs = self._get_diverse_beam_kwargs() output_generate = self._group_beam_search_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) # check `group_beam_search` for higher than 1 `num_return_sequences` num_return_sequences = 2 beam_kwargs = 
self._get_diverse_beam_kwargs(num_return_sequences=num_return_sequences) output_generate = self._group_beam_search_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @pytest.mark.generate def test_group_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_diverse_beam_kwargs() output_generate = self._group_beam_search_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, output_scores=True, @@ -852,21 +796,22 @@ def test_group_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) self._check_outputs( - output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] ) - # TODO: @gante + # TODO: @gante check why it is flaky @is_flaky() @pytest.mark.generate def test_constrained_beam_search_generate(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() @@ -882,8 +827,6 @@ def test_constrained_beam_search_generate(self): beam_kwargs = self._get_constrained_beam_kwargs() output_generate = self._constrained_beam_search_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, constraints=constraints, beam_kwargs=beam_kwargs, @@ -892,7 +835,7 @@ def test_constrained_beam_search_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) for generation_output in output_generate: self._check_sequence_inside_sequence(force_tokens, generation_output) @@ -908,8 +851,6 @@ def test_constrained_beam_search_generate(self): output_generate = self._constrained_beam_search_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, constraints=constraints, beam_kwargs=beam_kwargs, @@ -918,7 +859,7 @@ def test_constrained_beam_search_generate(self): if model.config.is_encoder_decoder: 
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) for generation_output in output_generate: self._check_sequence_inside_sequence(force_tokens, generation_output) @@ -926,7 +867,8 @@ def test_constrained_beam_search_generate(self): @pytest.mark.generate def test_constrained_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() @@ -941,8 +883,6 @@ def test_constrained_beam_search_generate_dict_output(self): beam_kwargs = self._get_constrained_beam_kwargs() output_generate = self._constrained_beam_search_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, constraints=constraints, beam_kwargs=beam_kwargs, @@ -960,13 +900,13 @@ def test_constrained_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) self._check_outputs( - output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] ) @pytest.mark.generate @@ -979,7 +919,8 @@ def test_contrastive_generate(self): if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): self.skipTest(reason="Won't fix: old model with different cache format") - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] # NOTE: contrastive search only works with cache on at the moment. 
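For orientation, the contrastive-search tests in this block boil down to the following call pattern (tiny checkpoint and prompt are placeholders); `low_memory=True` trades speed for memory and is expected to reproduce the `low_memory=False` output exactly:

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
inputs = tok("Hello world!", return_tensors="pt")

# Contrastive search needs the cache; top_k candidates are re-ranked by the degeneration penalty.
low = model.generate(**inputs, top_k=4, penalty_alpha=0.6, low_memory=True, max_new_tokens=8, use_cache=True)
high = model.generate(**inputs, top_k=4, penalty_alpha=0.6, low_memory=False, max_new_tokens=8, use_cache=True)
assert low.tolist() == high.tolist()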
if not hasattr(config, "use_cache"): @@ -990,15 +931,13 @@ def test_contrastive_generate(self): model = model_class(config).to(torch_device).eval() output_generate = self._contrastive_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, - use_cache=True, + use_cache=True, # Enable cache ) if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @pytest.mark.generate def test_contrastive_generate_dict_outputs_use_cache(self): @@ -1010,7 +949,8 @@ def test_contrastive_generate_dict_outputs_use_cache(self): if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): self.skipTest(reason="Won't fix: old model with different cache format") - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] # NOTE: contrastive search only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1020,23 +960,21 @@ def test_contrastive_generate_dict_outputs_use_cache(self): model = model_class(config).to(torch_device).eval() output_generate = self._contrastive_generate( model=model, - input_ids=input_ids, - attention_mask=attention_mask, inputs_dict=inputs_dict, output_scores=True, output_logits=True, output_hidden_states=True, output_attentions=self.has_attentions, return_dict_in_generate=True, - use_cache=True, + use_cache=True, # Enable cache ) if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) + self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) - self._check_outputs(output_generate, input_ids, model.config, use_cache=True) + self._check_outputs(output_generate, main_input, model.config, use_cache=True) @pytest.mark.generate def test_contrastive_generate_low_memory(self): @@ -1050,7 +988,7 @@ def test_contrastive_generate_low_memory(self): if any(model_name in model_class.__name__.lower() for model_name in ["gptbigcode"]): self.skipTest(reason="TODO: fix me") - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1) + config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) # NOTE: contrastive search only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1062,23 +1000,19 @@ def test_contrastive_generate_low_memory(self): model = model_class(config).to(torch_device).eval() low_output = model.generate( - input_ids, top_k=4, penalty_alpha=0.6, low_memory=True, max_new_tokens=self.max_new_tokens, - attention_mask=attention_mask, **inputs_dict, use_cache=True, ) high_output = model.generate( - input_ids, top_k=4, penalty_alpha=0.6, low_memory=False, max_new_tokens=self.max_new_tokens, - attention_mask=attention_mask, **inputs_dict, use_cache=True, ) @@ -1105,7 +1039,8 @@ def test_beam_search_low_memory(self): ] ): self.skipTest(reason="May fix in the future: need model-specific fixes") - config, input_ids, _, _ = self._get_input_ids_and_config(batch_size=2) + + config, inputs_dict = self.prepare_config_and_inputs_for_generate() # batch_size=1 is ok, but batch_size>1 will cause non-identical output config.use_cache = True @@ -1115,7 +1050,7 @@ def test_beam_search_low_memory(self): model = model_class(config).to(torch_device).eval() low_output = model.generate( - input_ids, + **inputs_dict, max_new_tokens=8, num_beams=5, early_stopping=True, @@ -1124,7 +1059,7 @@ def test_beam_search_low_memory(self): ) high_output = model.generate( - input_ids, + **inputs_dict, max_new_tokens=8, num_beams=5, early_stopping=True, @@ -1169,7 +1104,8 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): self.skipTest(reason="May fix in the future: need model-specific fixes") # enable cache - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1) + config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) + main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. 
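Outside the test harness, the assisted-decoding setup exercised here looks roughly like this (using the same checkpoint for target and assistant, as the test does for its "all candidates accepted" case):

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
assistant.generation_config.num_assistant_tokens = 2
assistant.generation_config.num_assistant_tokens_schedule = "constant"

inputs = tok("The quick brown fox", return_tensors="pt")
# With greedy decoding, assisted generation must reproduce plain greedy search.
out = model.generate(**inputs, assistant_model=assistant, do_sample=False, max_new_tokens=4, use_cache=True)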
if not hasattr(config, "use_cache"): @@ -1195,9 +1131,7 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): "return_dict_in_generate": True, "use_cache": True, } - output_greedy = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) + output_greedy = model.generate(**generation_kwargs, **inputs_dict) # test with the same assistant model or randomly init one # in the first case all candidate tokens are accepted, in the second none is accepted @@ -1209,15 +1143,13 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): assistant_model.generation_config.num_assistant_tokens = 2 # see b) assistant_model.generation_config.num_assistant_tokens_schedule = "constant" # see b) generation_kwargs.update({"assistant_model": assistant_model}) - output_assisted = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) + output_assisted = model.generate(**generation_kwargs, **inputs_dict) # The two outputs must match and their shape must be as expected self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist()) for output in (output_greedy, output_assisted): - self._check_outputs(output, input_ids, model.config, use_cache=True) + self._check_outputs(output, main_input, model.config, use_cache=True) @is_flaky() @pytest.mark.generate @@ -1246,7 +1178,8 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): self.skipTest(reason="May fix in the future: need model-specific fixes") # enable cache - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1) + config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) + main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. 
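Prompt lookup decoding, tested just below, needs no assistant model: candidate continuations are copied from n-gram matches inside the prompt itself. A minimal sketch:

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

inputs = tok("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
# `prompt_lookup_num_tokens` controls how many candidate tokens are proposed per step.
out = model.generate(**inputs, prompt_lookup_num_tokens=2, do_sample=False, max_new_tokens=4, use_cache=True)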
if not hasattr(config, "use_cache"): @@ -1273,20 +1206,16 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): "use_cache": True, } - output_greedy = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) + output_greedy = model.generate(**generation_kwargs, **inputs_dict) generation_kwargs.update({"prompt_lookup_num_tokens": 2}) # see b) - output_prompt_lookup = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) + output_prompt_lookup = model.generate(**generation_kwargs, **inputs_dict) # The two outputs must match and their shape must be as expected self.assertListEqual(output_greedy.sequences.tolist(), output_prompt_lookup.sequences.tolist()) for output in (output_greedy, output_prompt_lookup): - self._check_outputs(output, input_ids, model.config, use_cache=True) + self._check_outputs(output, main_input, model.config, use_cache=True) @pytest.mark.generate def test_dola_decoding_sample(self): @@ -1302,7 +1231,8 @@ def test_dola_decoding_sample(self): self.skipTest("DoLa is not supported for models that don't return layerwise hidden states") # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] # Encoder-decoder models are not supported if config.is_encoder_decoder: @@ -1326,12 +1256,11 @@ def test_dola_decoding_sample(self): "output_hidden_states": True, "output_attentions": self.has_attentions, "return_dict_in_generate": True, - "use_cache": hasattr(config, "use_cache"), # Some models don't support the cache + "use_cache": getattr(config, "use_cache", False), # Some models don't support the cache + "dola_layers": "low", } - generation_kwargs.update({"dola_layers": "low"}) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_dola = model.generate(input_ids, **model_kwargs, **generation_kwargs, **inputs_dict) - self._check_outputs(output_dola, input_ids, model.config, use_cache=hasattr(config, "use_cache")) + output_dola = model.generate(**generation_kwargs, **inputs_dict) + self._check_outputs(output_dola, main_input, model.config, use_cache=getattr(config, "use_cache", False)) @pytest.mark.generate def test_assisted_decoding_sample(self): @@ -1359,7 +1288,8 @@ def test_assisted_decoding_sample(self): self.skipTest(reason="May fix in the future: need model-specific fixes") # enable cache - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1) + config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) + main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. 
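DoLa decoding, exercised by the sampling test around here, contrasts the final-layer logits with those of earlier ("low") layers and assumes the model exposes layerwise hidden states. A rough usage sketch:

from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

tok = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

set_seed(0)
inputs = tok("Hello world!", return_tensors="pt")
out = model.generate(**inputs, dola_layers="low", do_sample=True, max_new_tokens=4)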
if not hasattr(config, "use_cache"): @@ -1389,11 +1319,9 @@ def test_assisted_decoding_sample(self): "return_dict_in_generate": True, "use_cache": True, } - output_assisted = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) + output_assisted = model.generate(**generation_kwargs, **inputs_dict) - self._check_outputs(output_assisted, input_ids, config, use_cache=True) + self._check_outputs(output_assisted, main_input, config, use_cache=True) @pytest.mark.generate def test_prompt_lookup_decoding_stops_at_eos(self): @@ -1429,7 +1357,8 @@ def test_generate_with_head_masking(self): """Test designed for encoder-decoder models to ensure the attention head masking is used.""" attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + # We want to test only encoder-decoder models if not config.is_encoder_decoder: continue @@ -1452,8 +1381,6 @@ def test_generate_with_head_masking(self): for attn_name, (name, mask) in zip(attention_names, head_masking.items()): out = model.generate( - input_ids, - attention_mask=attention_mask, num_beams=1, output_attentions=self.has_attentions, return_dict_in_generate=True, @@ -1482,7 +1409,7 @@ def test_left_padding_compatibility(self): # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) decoder_only_classes = [] for model_class in self.all_generative_model_classes: - config, _, _, _ = self._get_input_ids_and_config() + config, _ = self.prepare_config_and_inputs_for_generate() if config.is_encoder_decoder: continue else: @@ -1515,7 +1442,12 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): return model_kwargs for model_class in decoder_only_classes: - config, input_ids, attention_mask, _ = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + model = model_class(config).to(torch_device).eval() signature = inspect.signature(model.forward).parameters.keys() @@ -1618,7 +1550,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: - config, input_ids, _, _ = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() # Ignore: # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, @@ -1635,10 +1567,13 @@ def test_generate_from_inputs_embeds_decoder_only(self): continue # Skip models without explicit support + config.is_decoder = True model = model_class(config).to(torch_device).eval() if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys(): continue + input_ids = inputs_dict.pop("input_ids") + # Traditional way of generating text outputs_from_ids = model.generate( input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True @@ -1689,7 +1624,8 @@ def test_generate_from_inputs_embeds_with_static_cache(self): if not 
model_class._supports_static_cache: self.skipTest(reason="This model does not support the static cache format") - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache") @@ -1697,9 +1633,11 @@ def test_generate_from_inputs_embeds_with_static_cache(self): if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys(): self.skipTest(reason="This model does not support `inputs_embeds` in generation") + input_ids = inputs_dict.pop("input_ids") + model.config.use_cache = True model.config.is_decoder = True - batch_size, seq_length = input_ids.shape + batch_size = input_ids.shape[0] max_cache_len = 30 # here we force to not stop at eos and go until max-length @@ -1724,9 +1662,7 @@ def test_generate_from_inputs_embeds_with_static_cache(self): num_hidden_layers = text_config.num_hidden_layers inputs_embeds = model.get_input_embeddings()(input_ids) - outputs = model.generate( - inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) + outputs = model.generate(inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict) # we should get `max_length` in shape, not `max_length - embeds_length` cache_shape = (batch_size, num_key_value_heads, max_cache_len, head_dim) @@ -1827,7 +1763,7 @@ def test_new_cache_format(self, num_beams, do_sample): if not model_class._supports_cache_class: self.skipTest(reason="This model does not support the new cache format") - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() model = model_class(config).to(torch_device).eval() generation_kwargs = { @@ -1842,24 +1778,16 @@ def test_new_cache_format(self, num_beams, do_sample): # Sets seed before calling `generate` for the case with do_sample=True seed = torch.randint(0, 1000000, (1,)).item() set_seed(seed) - legacy_results = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) + legacy_results = model.generate(**generation_kwargs, **inputs_dict) set_seed(seed) - num_hidden_layers = config.get_text_config().num_hidden_layers if config.is_encoder_decoder: cache_cls = EncoderDecoderCache - past_key_values = cache_cls(DynamicCache(num_hidden_layers), DynamicCache(num_hidden_layers)) + past_key_values = cache_cls(DynamicCache(), DynamicCache()) else: cache_cls = DynamicCache - past_key_values = cache_cls(num_hidden_layers) - new_results = model.generate( - input_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - **generation_kwargs, - **inputs_dict, - ) + past_key_values = cache_cls() + + new_results = model.generate(past_key_values=past_key_values, **generation_kwargs, **inputs_dict) # The two sets of generated sequences must match, despite the cache format between forward passes being # different @@ -1906,12 +1834,15 @@ def test_generate_with_static_cache(self): if not model_class._supports_static_cache: self.skipTest(reason="This model does not support the static cache format") - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + main_input = inputs_dict[model_class.main_input_name] + if config.is_encoder_decoder: self.skipTest(reason="This model is encoder-decoder 
and has Encoder-Decoder Cache") config.is_decoder = True - batch_size, seq_length = input_ids.shape + batch_size = main_input.shape[0] + seq_length = main_input.shape[-1] max_new_tokens = 20 model = model_class(config).to(torch_device).eval() @@ -1934,21 +1865,21 @@ def test_generate_with_static_cache(self): else config.num_key_value_heads ) num_hidden_layers = config.num_hidden_layers - results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict) + results = model.generate(**generation_kwargs, **inputs_dict) cache_shape = (batch_size, num_key_value_heads, max_cache_len, head_dim) self.assertTrue(isinstance(results.past_key_values, StaticCache)) self.assertTrue(len(results.past_key_values.key_cache) == num_hidden_layers) self.assertTrue(results.past_key_values.key_cache[0].shape == cache_shape) - @require_quanto + @require_optimum_quanto @pytest.mark.generate def test_generate_with_quant_cache(self): for model_class in self.all_generative_model_classes: if not model_class._supports_quantized_cache: self.skipTest(reason="This model does not support the quantized cache format") - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() config.is_decoder = True model = model_class(config).to(torch_device).eval() @@ -1961,23 +1892,17 @@ def test_generate_with_quant_cache(self): "use_cache": True, } - results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict) + results = model.generate(**generation_kwargs, **inputs_dict) self.assertTrue(isinstance(results.past_key_values, QuantoQuantizedCache)) # passing past key values of different type should raise Error with self.assertRaises(ValueError): - num_hidden_layers = config.get_text_config().num_hidden_layers - model.generate( - input_ids, - attention_mask=attention_mask, - past_key_valyes=DynamicCache(num_hidden_layers), - **generation_kwargs, - ) + model.generate(past_key_values=DynamicCache(), **generation_kwargs, **inputs_dict) # setting incorrect cache_config args should raise an Error, i.e.
nbits=60 does not make sense generation_kwargs["cache_config"] = {"nbits": 60, "q_group_size": 8, "residual_length": 128} with self.assertRaises(ValueError): - model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs) + model.generate(**generation_kwargs, **inputs_dict) @pytest.mark.generate @require_torch_gpu @@ -2040,7 +1965,7 @@ def test_generate_methods_with_num_logits_to_keep(self): if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()): self.skipTest(reason="This model does not support `num_logits_to_keep` argument.") - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() + config, inputs_dict = self.prepare_config_and_inputs_for_generate() config.use_cache = True config.is_decoder = True @@ -2054,13 +1979,9 @@ def test_generate_methods_with_num_logits_to_keep(self): } # Setting num_logits_to_keep at 0 keeps all logits (old behavior) - with_all_logits = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict, num_logits_to_keep=0 - ) + with_all_logits = model.generate(**generation_kwargs, **inputs_dict, num_logits_to_keep=0) # By default, num_logits_to_keep is automatically set to 1 if not provided (new behavior) - without_all_logits = model.generate( - input_ids, attention_mask=attention_mask, **inputs_dict, **generation_kwargs - ) + without_all_logits = model.generate(**inputs_dict, **generation_kwargs) self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist()) @pytest.mark.generate @@ -2072,7 +1993,7 @@ def test_assisted_decoding_with_num_logits_to_keep(self): if model_class._is_stateful: self.skipTest(reason="Stateful models don't support assisted generation") - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1) + config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) config.use_cache = True config.is_decoder = True @@ -2089,13 +2010,9 @@ def test_assisted_decoding_with_num_logits_to_keep(self): assistant_model.generation_config.assistant_confidence_threshold = None # Setting num_logits_to_keep at 0 keeps all logits (old behavior) - with_all_logits = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict, num_logits_to_keep=0 - ) + with_all_logits = model.generate(**generation_kwargs, **inputs_dict, num_logits_to_keep=0) # By default, num_logits_to_keep is automatically set to 1 if not provided (new behavior) - without_all_logits = model.generate( - input_ids, attention_mask=attention_mask, **inputs_dict, **generation_kwargs - ) + without_all_logits = model.generate(**inputs_dict, **generation_kwargs) self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist()) @pytest.mark.generate @@ -2107,8 +2024,9 @@ def test_inherits_generation_mixin(self): for model_class in self.all_generative_model_classes: self.assertTrue("GenerationMixin" in str(model_class.__bases__)) - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): - batch_size, seq_length = input_ids.shape + def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + batch_size = main_input.shape[0] + seq_length = main_input.shape[-1] config = config.text_config if hasattr(config, "text_config") else config num_sequences_in_output = batch_size * num_return_sequences @@ -2116,6 +2034,10 @@ def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_ output.sequences.shape[-1] - 1 
if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length ) + # in some models we subsample the sequence length in inner layers + if hasattr(self.model_tester, "get_subsampled_output_lengths"): + seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + # scores self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) @@ -2182,7 +2104,17 @@ def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_ # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" # 2. We ignore models that have unique cache structures (e.g. mamba) or are in need of refatoring to match the # standard cache format (e.g.gptbigcode ) - models_without_standard_cache = ("ctrl", "fsmt", "gptbigcode", "mega", "reformer", "jamba", "mamba", "xlnet") + models_without_standard_cache = ( + "ctrl", + "fsmt", + "gptbigcode", + "mega", + "reformer", + "jamba", + "mamba", + "xlnet", + "zamba", + ) has_standard_cache = not any( model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache ) @@ -3579,6 +3511,34 @@ def test_special_tokens_fall_back_to_model_default(self): self.assertTrue(test_bos_id == gen_output[0, 0]) self.assertTrue(generation_config.bos_token_id is None) + def test_speculative_decoding_equals_regular_decoding(self): + draft_name = "double7/vicuna-68m" + target_name = "Qwen/Qwen2-0.5B-Instruct" + + draft_model = AutoModelForCausalLM.from_pretrained(draft_name) + target_model = AutoModelForCausalLM.from_pretrained(target_name) + + assistant_tokenizer = AutoTokenizer.from_pretrained(draft_name) + target_tokenizer = AutoTokenizer.from_pretrained(target_name) + + prompt_size = torch.randint(low=20, high=100, size=(1,)) + max_new_tokens = torch.randint(low=10, high=50, size=(1,)) + input_ids = (torch.rand(1, prompt_size[0]) * 100).to(int) + 50 + + max_new_tokens_item = max_new_tokens[0].item() + expected_out = target_model.generate(input_ids, do_sample=False, max_new_tokens=max_new_tokens_item) + predicted_out = target_model.generate( + input_ids, + do_sample=False, + max_new_tokens=max_new_tokens_item, + assistant_model=draft_model, + target_tokenizer=target_tokenizer, + assistant_tokenizer=assistant_tokenizer, + ) + + self.assertEqual(expected_out.shape, predicted_out.shape) + self.assertTrue((expected_out == predicted_out).all().item()) + @pytest.mark.generate @require_torch_multi_gpu def test_generate_with_static_cache_multi_gpu(self): @@ -3797,6 +3757,113 @@ def test_padding_input_contrastive_search_t5(self): self.assertEqual(generated_text_no_padding, generated_text_with_padding) self.assertEqual(generated_text_no_padding, "Ich muss diese Aufgabe vor Ende des Tages beenden.") + def test_prepare_inputs_for_generation_decoder_llm(self): + """Tests GenerationMixin.prepare_inputs_for_generation against expected usage with decoder-only llms.""" + + config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM") + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM") + model = model.to(torch_device) + + # 1. Sanity check: the model's `prepare_inputs_for_generation` comes from `GenerationMixin` + self.assertTrue("GenerationMixin" in str(model.prepare_inputs_for_generation)) + + # 2. 
If we pass input ids by themselves, we should get back the same input ids + input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]]).to(torch_device) + model_inputs = model.prepare_inputs_for_generation(input_ids) + self.assertTrue(torch.all(model_inputs["input_ids"] == input_ids)) + + # 3. If we pass the attention mask too, we will get back the attention mask and position ids built from it + attention_mask = torch.tensor([[1, 1, 1], [1, 1, 1]]).to(torch_device) + model_inputs = model.prepare_inputs_for_generation(input_ids, attention_mask=attention_mask) + self.assertTrue(torch.all(model_inputs["attention_mask"] == attention_mask)) + self.assertTrue(model_inputs["position_ids"].shape == input_ids.shape) + + # 4. `use_cache` (and other kwargs) are forwarded + self.assertFalse("use_cache" in model_inputs) # From the previous input, there is no `use_cache` + model_inputs = model.prepare_inputs_for_generation(input_ids, use_cache=True, foo="bar") + self.assertTrue(model_inputs["use_cache"] is True) + self.assertTrue(model_inputs["foo"] == "bar") + + # 5. When we pass a cache, we discard data related to already seen tokens in some tensors. We are now also + # forced to pass a correctly prepared `cache_positions` to slice the data accordingly. + init_input_ids = input_ids[:, :2] + dynamic_cache = DynamicCache() + dynamic_cache = model(init_input_ids, past_key_values=dynamic_cache).past_key_values + with self.assertRaises(AttributeError): # past_key_values + no cache_position -> exception + model_inputs = model.prepare_inputs_for_generation(input_ids, past_key_values=dynamic_cache) + + cache_position = torch.arange(input_ids.shape[-1], dtype=torch.long).to(torch_device) + cache_position = cache_position[dynamic_cache.get_seq_length() :] + model_inputs = model.prepare_inputs_for_generation( + input_ids, past_key_values=dynamic_cache, cache_position=cache_position, attention_mask=attention_mask + ) + self.assertTrue("past_key_values" in model_inputs) + self.assertTrue(torch.all(model_inputs["cache_position"] == cache_position)) + self.assertTrue(model_inputs["input_ids"].shape[-1] == 1) # 1 = 3 fed tokens - 2 tokens in the cache + self.assertTrue(model_inputs["position_ids"].shape[-1] == 1) + self.assertTrue(model_inputs["attention_mask"].shape[-1] == 3) # we still need the full attention mask! + + # 6. If we pass a `static_cache`, the attention mask will be prepared as a static shape 4D mask + max_cache_len = 10 + batch_size = 2 + query_length = input_ids.shape[-1] - init_input_ids.shape[-1] + static_cache = StaticCache( + config=config, batch_size=batch_size, max_cache_len=max_cache_len, device=torch_device, dtype=torch.float32 + ) + static_cache = model(init_input_ids, past_key_values=static_cache).past_key_values + model_inputs = model.prepare_inputs_for_generation( + input_ids, past_key_values=static_cache, cache_position=cache_position, attention_mask=attention_mask + ) + self.assertTrue("past_key_values" in model_inputs) + self.assertTrue(list(model_inputs["attention_mask"].shape) == [batch_size, 1, query_length, max_cache_len]) + + # 7. We can also pass `inputs_embeds` as the embedded prompt. 
Because `generate` will append its result to + # `input_ids` and the models will only accept one of the two inputs (`input_ids` or `inputs_embeds`), we + # a) must use the cache b) must expect `input_ids` after the prompt is processed + init_inputs_embeds = model.get_input_embeddings()(init_input_ids) + init_cache_positions = torch.arange(init_input_ids.shape[-1], dtype=torch.long).to(torch_device) + empty_cache = DynamicCache() + + # Prompt processing + model_inputs = model.prepare_inputs_for_generation( + init_input_ids, + past_key_values=empty_cache, + inputs_embeds=init_inputs_embeds, + cache_position=init_cache_positions, + ) + self.assertTrue(model_inputs["input_ids"] is None) + self.assertTrue(model_inputs["inputs_embeds"] is not None) + + # After prompt processing + model_inputs = model.prepare_inputs_for_generation( + input_ids, past_key_values=dynamic_cache, inputs_embeds=init_inputs_embeds, cache_position=cache_position + ) + self.assertTrue(model_inputs["input_ids"] is not None) + self.assertTrue(model_inputs["inputs_embeds"] is None) + + def test_generate_compile_fullgraph_tiny(self): + """ + Tests that we can call end-to-end generation with a tiny model (i.e. doesn't crash) + NOTE: this test is quite slow (~20s on a consumer desktop), but it is important that we keep it as part of the + non-slow tests to prevent regressions! + """ + model = AutoModelForCausalLM.from_pretrained( + "hf-internal-testing/tiny-random-LlamaForCausalLM", torch_dtype=torch.bfloat16, device_map="auto" + ) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM") + + # compile generate + compiled_generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead") + + # compiled generate does NOT accept parameterization except a) model inputs b) a generation config + generation_config = copy.deepcopy(model.generation_config) + generation_config.pad_token_id = model.config.eos_token_id + + model_inputs = tokenizer(["Write a poem about the market crashing in summer"], return_tensors="pt") + model_inputs = model_inputs.to(model.device) + gen_out = compiled_generate(**model_inputs, generation_config=generation_config) + self.assertTrue(gen_out.shape[1] > model_inputs["input_ids"].shape[1]) # some text was generated + @require_torch class TokenHealingTestCase(unittest.TestCase): @@ -3846,3 +3913,41 @@ def test_generate_from_inputs_embeds_with_bos_token_id_is_none(self): # bos_token_id is required when no input ids nor inputs_embeds is passed with self.assertRaises(ValueError): model.generate(max_length=20, bos_token_id=None) + + +class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): + def test_no_intersection(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[4, 5, 6]]) + result = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt, prompt_plus_new_tokens) + self.assertEqual(result, (None, None, None)) + + def test_complete_overlap(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[1, 2, 3, 4, 5]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) + + def test_partial_overlap(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[2, 3, 4, 5]]) + discrep_length, new_tokens_only, 
discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) + + def test_no_new_tokens(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[1, 2, 3]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) diff --git a/tests/models/albert/test_modeling_flax_albert.py b/tests/models/albert/test_modeling_flax_albert.py index 956de9ebdc9e57..90590e737f5f12 100644 --- a/tests/models/albert/test_modeling_flax_albert.py +++ b/tests/models/albert/test_modeling_flax_albert.py @@ -80,6 +80,7 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_choices = num_choices + super().__init__() def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index f4ac29479c5f1f..0175e562eda618 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -436,9 +436,16 @@ class AltCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) # TODO: Fix the failed tests when this model gets more usage def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "FeatureExtractionPipelineTests": + if pipeline_test_case_name == "FeatureExtractionPipelineTests": return True return False diff --git a/tests/models/altclip/test_processor_altclip.py b/tests/models/altclip/test_processor_altclip.py index 33bff9c77ad263..5b290efb115031 100644 --- a/tests/models/altclip/test_processor_altclip.py +++ b/tests/models/altclip/test_processor_altclip.py @@ -17,17 +17,12 @@ import tempfile import unittest -from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast +from transformers import AltCLIPProcessor, CLIPImageProcessor, XLMRobertaTokenizer, XLMRobertaTokenizerFast from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin -if is_vision_available(): - from transformers import AltCLIPProcessor, CLIPImageProcessor - - @require_vision class AltClipProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = AltCLIPProcessor diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index 0bbefda7ba50e8..fb90fec6afff09 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -167,9 +167,16 @@ class ASTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): # 
TODO: Fix the failed tests when this model gets more usage def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "AudioClassificationPipelineTests": + if pipeline_test_case_name == "AudioClassificationPipelineTests": return True return False diff --git a/tests/models/beit/test_modeling_flax_beit.py b/tests/models/beit/test_modeling_flax_beit.py index 78c24220c2d20b..24307532fd770a 100644 --- a/tests/models/beit/test_modeling_flax_beit.py +++ b/tests/models/beit/test_modeling_flax_beit.py @@ -79,6 +79,7 @@ def __init__( # in BeiT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) num_patches = (image_size // patch_size) ** 2 self.seq_length = num_patches + 1 + super().__init__() def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) diff --git a/tests/models/bert/test_modeling_flax_bert.py b/tests/models/bert/test_modeling_flax_bert.py index fca54dbed84c3e..4a9610d723d13e 100644 --- a/tests/models/bert/test_modeling_flax_bert.py +++ b/tests/models/bert/test_modeling_flax_bert.py @@ -79,6 +79,7 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_choices = num_choices + super().__init__() def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) diff --git a/tests/models/big_bird/test_modeling_flax_big_bird.py b/tests/models/big_bird/test_modeling_flax_big_bird.py index 63b2237fbddccc..f889952d2be925 100644 --- a/tests/models/big_bird/test_modeling_flax_big_bird.py +++ b/tests/models/big_bird/test_modeling_flax_big_bird.py @@ -90,6 +90,7 @@ def __init__( self.use_bias = use_bias self.block_size = block_size self.num_random_blocks = num_random_blocks + super().__init__() def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) diff --git a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py index 0f28fc2d67b582..745283825f0cf6 100644 --- a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py +++ b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py @@ -276,35 +276,20 @@ class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): + if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): return True return False - # overwrite from GenerationTesterMixin to solve problem - # with conflicting random seeds - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.attention_type = "original_full" - - input_ids = inputs_dict.pop(self.input_name) - _ = inputs_dict.pop("attention_mask", None) - _ = 
inputs_dict.pop("decoder_input_ids", None) - _ = inputs_dict.pop("decoder_attention_mask", None) - attention_mask = torch.ones_like(input_ids, dtype=torch.long) - - # cut to half length & take max batch_size 3 - sequence_length = input_ids.shape[-1] // 2 - input_ids = input_ids[:batch_size, :sequence_length] - attention_mask = attention_mask[:batch_size, :sequence_length] - - if config.eos_token_id is not None and config.pad_token_id is None: - # hack to allow generate for models such as GPT2 as is done in `generate()` - config.pad_token_id = config.eos_token_id - return config, input_ids, attention_mask, inputs_dict - def setUp(self): self.model_tester = BigBirdPegasusModelTester(self) self.config_tester = ConfigTester(self, config_class=BigBirdPegasusConfig) @@ -485,6 +470,13 @@ def test_for_change_to_full_attn(self): def test_load_save_without_tied_weights(self): pass + def test_generate_with_head_masking(self): + # overwritten to temporarily switch the attention type to `original_full` + original_self_attention_type = self.model_tester.attention_type + self.model_tester.attention_type = "original_full" + super().test_generate_with_head_masking() + self.model_tester.attention_type = original_self_attention_type + @require_torch @require_sentencepiece diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py index 59f68b54754796..933d8cccc8c146 100644 --- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py @@ -236,9 +236,16 @@ class BlenderbotSmallModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline # TODO: Fix the failed tests when this model gets more usage def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - return pipeline_test_casse_name == "TextGenerationPipelineTests" + return pipeline_test_case_name == "TextGenerationPipelineTests" def setUp(self): self.model_tester = BlenderbotSmallModelTester(self) diff --git a/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py index da6126e7455c47..2d993ad8b411a1 100644 --- a/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py @@ -321,9 +321,16 @@ class FlaxBlenderbotSmallModelTest(FlaxModelTesterMixin, unittest.TestCase, Flax all_generative_model_classes = (FlaxBlenderbotSmallForConditionalGeneration,) if is_flax_available() else () def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - return pipeline_test_casse_name == "TextGenerationPipelineTests" + return pipeline_test_case_name == "TextGenerationPipelineTests" def setUp(self): self.model_tester = FlaxBlenderbotSmallModelTester(self) diff --git a/tests/models/blenderbot_small/test_modeling_tf_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_tf_blenderbot_small.py index d886bfded4f494..f70637df2dc248 100644 --- 
a/tests/models/blenderbot_small/test_modeling_tf_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_tf_blenderbot_small.py @@ -198,9 +198,16 @@ class TFBlenderbotSmallModelTest(TFModelTesterMixin, PipelineTesterMixin, unitte test_onnx = False def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - return pipeline_test_casse_name == "TextGenerationPipelineTests" + return pipeline_test_case_name == "TextGenerationPipelineTests" def setUp(self): self.model_tester = TFBlenderbotSmallModelTester(self) diff --git a/tests/models/blip/test_processor_blip.py b/tests/models/blip/test_processor_blip.py index 7b851c618a773d..aa63855da43a24 100644 --- a/tests/models/blip/test_processor_blip.py +++ b/tests/models/blip/test_processor_blip.py @@ -17,7 +17,7 @@ import pytest -from transformers.testing_utils import require_vision +from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -139,3 +139,29 @@ def test_model_input_names(self): # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] self.assertListEqual(list(inputs.keys()), ["pixel_values", "input_ids", "attention_mask"]) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs(batch_size=2) + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 24) diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index cee5d710a85fb8..f2ccb2da8dba94 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -718,9 +718,16 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "VisualQuestionAnsweringPipelineTests": + if pipeline_test_case_name == "VisualQuestionAnsweringPipelineTests": # Get `RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'`. 
return True diff --git a/tests/models/blip_2/test_processor_blip_2.py b/tests/models/blip_2/test_processor_blip_2.py index 8c7ca2ab698f48..7eb5bedc2be7a7 100644 --- a/tests/models/blip_2/test_processor_blip_2.py +++ b/tests/models/blip_2/test_processor_blip_2.py @@ -94,7 +94,7 @@ def test_tokenizer(self): encoded_tok = tokenizer(input_str, return_token_type_ids=False) for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) + self.assertListEqual(encoded_tok[key], encoded_processor[key][0]) def test_processor(self): image_processor = self.get_image_processor() @@ -107,7 +107,7 @@ def test_processor(self): inputs = processor(text=input_str, images=image_input) - self.assertListEqual(list(inputs.keys()), ["pixel_values", "input_ids", "attention_mask"]) + self.assertCountEqual(list(inputs.keys()), ["input_ids", "pixel_values", "attention_mask"]) # test if it raises when no input is passed with pytest.raises(ValueError): @@ -138,4 +138,4 @@ def test_model_input_names(self): inputs = processor(text=input_str, images=image_input) # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] - self.assertListEqual(list(inputs.keys()), ["pixel_values", "input_ids", "attention_mask"]) + self.assertCountEqual(list(inputs.keys()), ["input_ids", "pixel_values", "attention_mask"]) diff --git a/tests/models/bridgetower/test_processor_bridgetower.py b/tests/models/bridgetower/test_processor_bridgetower.py new file mode 100644 index 00000000000000..2ccfde803edb20 --- /dev/null +++ b/tests/models/bridgetower/test_processor_bridgetower.py @@ -0,0 +1,203 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import shutil +import tempfile +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import ( + AutoProcessor, + BridgeTowerImageProcessor, + BridgeTowerProcessor, + RobertaTokenizerFast, + ) + + +@require_vision +class BridgeTowerProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = BridgeTowerProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + image_processor = BridgeTowerImageProcessor() + tokenizer = RobertaTokenizerFast.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc") + + processor = BridgeTowerProcessor(image_processor, tokenizer) + + processor.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + # Some kwargs tests are overriden from common tests to handle shortest_edge + # and size_divisor behaviour + + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component( + "image_processor", + crop_size={"shortest_edge": 234, "longest_edge": 234}, + ) + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": { + "crop_size": {"shortest_edge": 214}, + }, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", crop_size={"shortest_edge": 234}) + tokenizer = self.get_component("tokenizer", max_length=117) + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = 
self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor(text=input_str, images=image_input, crop_size={"shortest_edge": 224}) + self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs(batch_size=2) + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"shortest_edge": 214}, + padding="longest", + max_length=76, + ) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 6) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"shortest_edge": 214}, + padding="max_length", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"crop_size": {"shortest_edge": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) diff --git a/tests/models/bros/test_modeling_bros.py b/tests/models/bros/test_modeling_bros.py index 14f904c3b7e272..b16a37a226cd13 100644 --- a/tests/models/bros/test_modeling_bros.py +++ b/tests/models/bros/test_modeling_bros.py @@ -295,7 +295,14 @@ class BrosModelTest(ModelTesterMixin, 
PipelineTesterMixin, unittest.TestCase): # BROS requires `bbox` in the inputs which doesn't fit into the above 2 pipelines' input formats. # see https://github.com/huggingface/transformers/pull/26294 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index 00e3ad40a57652..aad26ef147e83e 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py +++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -116,7 +116,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) sequence_labels = None token_labels = None diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index d80bc5c24cf9f3..7d12dd3d873bfc 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -95,7 +95,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: diff --git a/tests/models/cpm/test_tokenization_cpm.py b/tests/models/cpm/test_tokenization_cpm.py index 152671eb6807d6..870a932cb303ab 100644 --- a/tests/models/cpm/test_tokenization_cpm.py +++ b/tests/models/cpm/test_tokenization_cpm.py @@ -22,7 +22,14 @@ class CpmTokenizationTest(unittest.TestCase): # There is no `CpmModel` def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/ctrl/test_modeling_ctrl.py b/tests/models/ctrl/test_modeling_ctrl.py index 6d44bdfb4ae672..a9bdddd7bfe25e 100644 --- a/tests/models/ctrl/test_modeling_ctrl.py +++ b/tests/models/ctrl/test_modeling_ctrl.py @@ -211,9 +211,16 @@ class CTRLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "ZeroShotClassificationPipelineTests": + if pipeline_test_case_name == "ZeroShotClassificationPipelineTests": # Get `tokenizer does not have a padding token` error for both fast/slow tokenizers. # `CTRLConfig` was never used in pipeline tests, either because of a missing checkpoint or because a tiny # config could not be created. 
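# Hedged sketch of the signature change this diff repeats across model test files: the
# `is_pipeline_test_to_skip` hook renames its misspelled `pipeline_test_casse_name` argument to
# `pipeline_test_case_name` and now also receives the image processor, feature extractor and
# processor names. A test class would override it along these lines (`MyModelTest` is a
# placeholder, not a class from the diff):
class MyModelTest:
    def is_pipeline_test_to_skip(
        self,
        pipeline_test_case_name,
        config_class,
        model_architecture,
        tokenizer_name,
        image_processor_name,
        feature_extractor_name,
        processor_name,
    ):
        # Mirror the common pattern in this diff: skip QA pipeline tests for slow tokenizers.
        if (
            pipeline_test_case_name == "QAPipelineTests"
            and tokenizer_name is not None
            and not tokenizer_name.endswith("Fast")
        ):
            return True
        return False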
diff --git a/tests/models/ctrl/test_modeling_tf_ctrl.py b/tests/models/ctrl/test_modeling_tf_ctrl.py index a3772474a13af1..8a08fe0798849d 100644 --- a/tests/models/ctrl/test_modeling_tf_ctrl.py +++ b/tests/models/ctrl/test_modeling_tf_ctrl.py @@ -189,9 +189,16 @@ class TFCTRLModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "ZeroShotClassificationPipelineTests": + if pipeline_test_case_name == "ZeroShotClassificationPipelineTests": # Get `tokenizer does not have a padding token` error for both fast/slow tokenizers. # `CTRLConfig` was never used in pipeline tests, either because of a missing checkpoint or because a tiny # config could not be created. diff --git a/tests/models/dac/test_modeling_dac.py b/tests/models/dac/test_modeling_dac.py index ffe7f31b79a506..e3b729d2f101f8 100644 --- a/tests/models/dac/test_modeling_dac.py +++ b/tests/models/dac/test_modeling_dac.py @@ -123,7 +123,6 @@ class DacModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_headmasking = False test_resize_embeddings = False pipeline_model_mapping = {"feature-extraction": DacModel} if is_torch_available() else {} - input_name = "input_values" def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): # model does not have attention and does not support returning hidden states diff --git a/tests/models/distilbert/test_modeling_flax_distilbert.py b/tests/models/distilbert/test_modeling_flax_distilbert.py index 1f5a402e86acb5..39a25a42fe8aac 100644 --- a/tests/models/distilbert/test_modeling_flax_distilbert.py +++ b/tests/models/distilbert/test_modeling_flax_distilbert.py @@ -79,6 +79,7 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_choices = num_choices + super().__init__() def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) diff --git a/tests/models/donut/test_processing_donut.py b/tests/models/donut/test_processor_donut.py similarity index 75% rename from tests/models/donut/test_processing_donut.py rename to tests/models/donut/test_processor_donut.py index ace3a109dfbb23..cf720e17b0d9d5 100644 --- a/tests/models/donut/test_processing_donut.py +++ b/tests/models/donut/test_processor_donut.py @@ -14,16 +14,28 @@ # limitations under the License. 
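# A hedged sketch of the save/reload contract the converted processor tests rely on: `setUp`
# builds a processor from freshly constructed components and saves it to a temp directory, and
# the ProcessorTesterMixin helpers later rebuild components from that directory, so the round
# trip must preserve them. Component classes mirror the Donut change below; the assertions are
# illustrative assumptions about that contract, not code from the patch.
import tempfile

from transformers import DonutImageProcessor, DonutProcessor, XLMRobertaTokenizerFast

tmpdir = tempfile.mkdtemp()
image_processor = DonutImageProcessor()
tokenizer = XLMRobertaTokenizerFast.from_pretrained("naver-clova-ix/donut-base")
DonutProcessor(image_processor, tokenizer).save_pretrained(tmpdir)

reloaded = DonutProcessor.from_pretrained(tmpdir)
assert isinstance(reloaded.image_processor, DonutImageProcessor)
assert isinstance(reloaded.tokenizer, XLMRobertaTokenizerFast)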
+import tempfile import unittest -from transformers import DonutProcessor +from transformers import DonutImageProcessor, DonutProcessor, XLMRobertaTokenizerFast +from ...test_processing_common import ProcessorTesterMixin -class DonutProcessorTest(unittest.TestCase): + +class DonutProcessorTest(ProcessorTesterMixin, unittest.TestCase): from_pretrained_id = "naver-clova-ix/donut-base" + processor_class = DonutProcessor def setUp(self): self.processor = DonutProcessor.from_pretrained(self.from_pretrained_id) + self.tmpdirname = tempfile.mkdtemp() + + image_processor = DonutImageProcessor() + tokenizer = XLMRobertaTokenizerFast.from_pretrained(self.from_pretrained_id) + + processor = DonutProcessor(image_processor, tokenizer) + + processor.save_pretrained(self.tmpdirname) def test_token2json(self): expected_json = { diff --git a/tests/models/electra/test_modeling_flax_electra.py b/tests/models/electra/test_modeling_flax_electra.py index 19b35d89409502..f531c7f8d073af 100644 --- a/tests/models/electra/test_modeling_flax_electra.py +++ b/tests/models/electra/test_modeling_flax_electra.py @@ -67,6 +67,7 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_choices = num_choices + super().__init__() def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index cff297be8e0002..2aac4dba82e897 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -141,7 +141,6 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) test_headmasking = False test_resize_embeddings = False pipeline_model_mapping = {"feature-extraction": EncodecModel} if is_torch_available() else {} - input_name = "input_values" def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): # model does not have attention and does not support returning hidden states diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index f6c28344754ee9..a1a2b0155cb738 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -312,7 +312,14 @@ class FalconModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/flaubert/test_modeling_flaubert.py b/tests/models/flaubert/test_modeling_flaubert.py index 7e0ef420917955..396d02da956be4 100644 --- a/tests/models/flaubert/test_modeling_flaubert.py +++ b/tests/models/flaubert/test_modeling_flaubert.py @@ -392,10 +392,17 @@ class FlaubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): diff --git a/tests/models/flaubert/test_modeling_tf_flaubert.py b/tests/models/flaubert/test_modeling_tf_flaubert.py index 534b529935a6b8..40b9c1c171f364 100644 --- a/tests/models/flaubert/test_modeling_tf_flaubert.py +++ b/tests/models/flaubert/test_modeling_tf_flaubert.py @@ -309,10 +309,17 @@ class TFFlaubertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index 826bf485711017..db323500bd1dfb 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -298,9 +298,16 @@ class FNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): + if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): return True return False diff --git a/tests/models/fuyu/test_processing_fuyu.py b/tests/models/fuyu/test_processor_fuyu.py similarity index 98% rename from tests/models/fuyu/test_processing_fuyu.py rename to tests/models/fuyu/test_processor_fuyu.py index 69a1d53e86f766..39a47293040bdd 100644 --- a/tests/models/fuyu/test_processing_fuyu.py +++ b/tests/models/fuyu/test_processor_fuyu.py @@ -190,7 +190,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, 
image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # Fuyu uses tokenizer kwargs only when image is None. image_input = None @@ -218,7 +218,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # Fuyu uses tokenizer kwargs only when image is None. image_input = None @@ -237,7 +237,7 @@ def test_structured_kwargs_nested(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # Fuyu uses tokenizer kwargs only when image is None. image_input = None @@ -264,7 +264,7 @@ def test_structured_kwargs_nested_from_dict(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # Fuyu uses tokenizer kwargs only when image is None. image_input = None @@ -290,7 +290,7 @@ def test_unstructured_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # Fuyu uses tokenizer kwargs only when image is None. image_input = None inputs = processor( @@ -315,7 +315,7 @@ def test_unstructured_kwargs_batched(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer", "upper older longer string"] + input_str = self.prepare_text_inputs(batch_size=2) # Fuyu uses tokenizer kwargs only when image is None. image_input = None inputs = processor( diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index a02541d585447c..a888bdcd3bc7be 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -21,6 +21,7 @@ from packaging import version from transformers import AutoModelForCausalLM, AutoTokenizer, GemmaConfig, is_torch_available +from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( is_flaky, require_bitsandbytes, @@ -119,7 +120,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: @@ -321,9 +322,19 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # used in `test_torch_compile` _torch_compile_test_ckpt = "google/gemma-2b" + # used in `test_torch_compile_for_training` + _torch_compile_train_cls = GemmaForCausalLM if is_torch_available() else None + # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True @@ -808,7 +819,7 @@ def test_compile_static_cache(self): prompts = ["Hello I am doing", "Hi today"] tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", pad_token="", padding_side="right") - model = GemmaForCausalLM.from_pretrained("google/gemma-2b", device_map="sequential", torch_dtype=torch.float16) + model = GemmaForCausalLM.from_pretrained("google/gemma-2b", device_map=torch_device, torch_dtype=torch.float16) inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) # Dynamic Cache @@ -831,6 +842,67 @@ def test_compile_static_cache(self): static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text) + @slow + @require_read_token + def test_export_static_cache(self): + if version.parse(torch.__version__) < version.parse("2.3.0"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", pad_token="", padding_side="right") + EXPECTED_TEXT_COMPLETION = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found", + ] + max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[ + "input_ids" + ].shape[-1] + + # Load model + device = "cpu" + dtype = torch.bfloat16 + cache_implementation = "static" + attn_implementation = "sdpa" + batch_size = 1 + model = GemmaForCausalLM.from_pretrained( + "google/gemma-2b", + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_generation_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_generation_length, + }, + ), + ) + + prompts = ["Hello I am doing"] + prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + prompt_token_ids = prompt_tokens["input_ids"] + max_new_tokens = max_generation_length - prompt_token_ids.shape[-1] + + # Static Cache + eager + eager_generated_ids = model.generate( + **prompt_tokens, max_new_tokens=max_new_tokens, do_sample=False, cache_implementation=cache_implementation + ) + eager_generated_text = tokenizer.batch_decode(eager_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, eager_generated_text) + + # Static Cache + export + exported_program = convert_and_export_with_cache(model) + ep_generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens + ) + ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text) + def test_model_2b_bf16_dola(self): model_id = "google/gemma-2b" # ground truth text generated with 
dola_layers="low", repetition_penalty=1.2 diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py index 4e7b3553460f89..8f9a918dca0082 100644 --- a/tests/models/gemma2/test_modeling_gemma2.py +++ b/tests/models/gemma2/test_modeling_gemma2.py @@ -16,10 +16,12 @@ import unittest +from packaging import version from parameterized import parameterized from pytest import mark from transformers import AutoModelForCausalLM, AutoTokenizer, Gemma2Config, HybridCache, is_torch_available, pipeline +from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( require_flash_attn, require_read_token, @@ -306,3 +308,57 @@ def test_model_9b_flash_attn(self): output_text = tokenizer.batch_decode(output, skip_special_tokens=False) self.assertEqual(output_text, EXPECTED_TEXTS) + + @slow + @require_read_token + def test_export_static_cache(self): + if version.parse(torch.__version__) < version.parse("2.5.0"): + self.skipTest(reason="This test requires torch >= 2.5 to run.") + + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b", pad_token="", padding_side="right") + EXPECTED_TEXT_COMPLETION = [ + "Hello I am doing a project for my school and I need to know how to make a program that will take a number", + ] + max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[ + "input_ids" + ].shape[-1] + + # Load model + device = "cpu" + dtype = torch.bfloat16 + cache_implementation = "static" + attn_implementation = "sdpa" + batch_size = 1 + model = AutoModelForCausalLM.from_pretrained( + "google/gemma-2-2b", + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_generation_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_generation_length, + }, + ), + ) + + prompts = ["Hello I am doing"] + prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + prompt_token_ids = prompt_tokens["input_ids"] + max_new_tokens = max_generation_length - prompt_token_ids.shape[-1] + + # Static Cache + export + exported_program = convert_and_export_with_cache(model) + ep_generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens + ) + ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text) diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py index 71c121dbaa5a9e..6f6fba50dc123a 100644 --- a/tests/models/gptj/test_modeling_gptj.py +++ b/tests/models/gptj/test_modeling_gptj.py @@ -382,10 +382,17 @@ def test_torch_fx_output_loss(self): # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): diff 
--git a/tests/models/gptj/test_modeling_tf_gptj.py b/tests/models/gptj/test_modeling_tf_gptj.py index 166bd3a45a9ea4..295c08aebb4d7a 100644 --- a/tests/models/gptj/test_modeling_tf_gptj.py +++ b/tests/models/gptj/test_modeling_tf_gptj.py @@ -322,10 +322,17 @@ class TFGPTJModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, PipelineTester # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 0f4d7640a1bb7d..9b25698f640106 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -106,7 +106,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index 158259ed5fb4c0..d5d0cee6daa1cd 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -105,7 +105,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 0197ebcaff5388..a49bce8d878fb4 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -662,7 +662,7 @@ def test_inference_natural_language_visual_reasoning(self): "HuggingFaceM4/idefics-9b", quantization_config=quantization_config, device_map="auto" ) processor = self.default_processor - inputs = processor(prompts, return_tensors="pt", padding="longest").to(torch_device) + inputs = processor(text=prompts, return_tensors="pt", padding="longest").to(torch_device) generated_ids = model.generate(**inputs, max_length=100) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index 26dcbb1c0f1566..062b578a684ede 100644 --- a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -12,11 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import shutil +import tempfile +import unittest + import numpy as np -from transformers.testing_utils import TestCasePlus, require_torch, require_vision +from transformers import ( + AutoProcessor, + IdeficsImageProcessor, + IdeficsProcessor, + LlamaTokenizerFast, + PreTrainedTokenizerFast, +) +from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_torch_available(): import torch @@ -24,37 +37,32 @@ if is_vision_available(): from PIL import Image - from transformers import ( - AutoProcessor, - IdeficsImageProcessor, - IdeficsProcessor, - LlamaTokenizerFast, - PreTrainedTokenizerFast, - ) - @require_torch @require_vision -class IdeficsProcessorTest(TestCasePlus): - def setUp(self): - super().setUp() +class IdeficsProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = IdeficsProcessor - self.checkpoint_path = self.get_auto_remove_tmp_dir() + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() image_processor = IdeficsImageProcessor(return_tensors="pt") tokenizer = LlamaTokenizerFast.from_pretrained("HuggingFaceM4/tiny-random-idefics") processor = IdeficsProcessor(image_processor, tokenizer) - processor.save_pretrained(self.checkpoint_path) + processor.save_pretrained(self.tmpdirname) self.input_keys = ["pixel_values", "input_ids", "attention_mask", "image_attention_mask"] def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.checkpoint_path, **kwargs).tokenizer + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.checkpoint_path, **kwargs).image_processor + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def tearDown(self): + shutil.rmtree(self.tmpdirname) def prepare_prompts(self): """This function prepares a list of PIL images""" @@ -100,13 +108,13 @@ def prepare_prompts(self): def test_save_load_pretrained_additional_features(self): processor = IdeficsProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.checkpoint_path) + processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) processor = IdeficsProcessor.from_pretrained( - self.checkpoint_path, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) @@ -124,7 +132,7 @@ def test_processor(self): prompts = self.prepare_prompts() # test that all prompts succeeded - input_processor = processor(prompts, return_tensors="pt", padding="longest") + input_processor = processor(text=prompts, return_tensors="pt", padding="longest") for key in self.input_keys: assert torch.is_tensor(input_processor[key]) @@ -157,8 +165,8 @@ def test_tokenizer_padding(self): ] prompts = [[prompt] for prompt in self.prepare_prompts()[2]] - max_length = processor(prompts, padding="max_length", truncation=True, max_length=20, return_tensors="pt") - longest = processor(prompts, padding="longest", truncation=True, max_length=30, return_tensors="pt") + max_length = processor(text=prompts, padding="max_length", 
truncation=True, max_length=20, return_tensors="pt") + longest = processor(text=prompts, padding="longest", truncation=True, max_length=30, return_tensors="pt") decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) @@ -185,8 +193,8 @@ def test_tokenizer_left_padding(self): ([0] * 10) + ([1] * 10), ] prompts = [[prompt] for prompt in self.prepare_prompts()[2]] - max_length = processor(prompts, padding="max_length", truncation=True, max_length=20) - longest = processor(prompts, padding="longest", truncation=True, max_length=30) + max_length = processor(text=prompts, padding="max_length", truncation=True, max_length=20) + longest = processor(text=prompts, padding="longest", truncation=True, max_length=30) decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) @@ -204,7 +212,143 @@ def test_model_input_names(self): processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) prompts = self.prepare_prompts() - inputs = processor(prompts, padding="longest", return_tensors="pt") + inputs = processor(text=prompts, padding="longest", return_tensors="pt") # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] self.assertSetEqual(set(inputs.keys()), set(self.input_keys)) + + # Override the following tests as Idefics image processor does not accept do_rescale and rescale_factor + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", image_size=234) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234) + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", image_size=234) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, image_size=224) + self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = 
self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + image_size=214, + padding="max_length", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[3], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs(batch_size=2) + image_input = self.prepare_image_inputs(batch_size=2) + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + image_size=214, + padding="longest", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[3], 214) + self.assertEqual(len(inputs["input_ids"][0]), 8) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"image_size": 214}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + self.assertEqual(inputs["pixel_values"].shape[3], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"image_size": 214}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[3], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) diff --git a/tests/models/idefics2/test_processing_idefics2.py b/tests/models/idefics2/test_processor_idefics2.py similarity index 69% rename from tests/models/idefics2/test_processing_idefics2.py rename to tests/models/idefics2/test_processor_idefics2.py index 2fd569f99141af..bf713c6fb8cfbb 100644 --- a/tests/models/idefics2/test_processing_idefics2.py +++ b/tests/models/idefics2/test_processor_idefics2.py @@ -13,8 +13,11 @@ # See the License 
for the specific language governing permissions and # limitations under the License. +import shutil +import tempfile import unittest from io import BytesIO +from typing import Optional import requests @@ -22,16 +25,30 @@ from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image + from transformers import ( + AutoProcessor, + Idefics2Processor, + ) + @require_torch @require_vision -class Idefics2ProcessorTest(unittest.TestCase): +class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Idefics2Processor + def setUp(self): - self.processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) + self.tmpdirname = tempfile.mkdtemp() + + processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) + + processor.save_pretrained(self.tmpdirname) + self.image1 = Image.open( BytesIO( requests.get( @@ -49,22 +66,35 @@ def setUp(self): ).content ) ) - self.bos_token = self.processor.tokenizer.bos_token - self.image_token = self.processor.image_token.content - self.fake_image_token = self.processor.fake_image_token.content + self.bos_token = processor.tokenizer.bos_token + self.image_token = processor.image_token.content + self.fake_image_token = processor.fake_image_token.content + + self.bos_token_id = processor.tokenizer.convert_tokens_to_ids(self.bos_token) + self.image_token_id = processor.tokenizer.convert_tokens_to_ids(self.image_token) + self.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(self.fake_image_token) + self.image_seq_len = processor.image_seq_len + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - self.bos_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.bos_token) - self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.image_token) - self.fake_image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.fake_image_token) - self.image_seq_len = self.processor.image_seq_len + def get_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) def test_process_interleaved_images_prompts_no_image_splitting(self): - old_image_splitting = self.processor.image_processor.do_image_splitting + tokenizer = self.get_tokenizer() + processor = self.get_processor() - self.processor.image_processor.do_image_splitting = False + processor.image_processor.do_image_splitting = False # Test that a single image is processed correctly - inputs = self.processor(images=self.image1) + inputs = processor(images=self.image1) self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980)) self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980)) # fmt: on @@ -73,10 +103,10 @@ def test_process_interleaved_images_prompts_no_image_splitting(self): image_str = "" text_str = "In this image, we see" text = image_str + text_str - inputs = self.processor(text=text, images=self.image1) + inputs = processor(text=text, images=self.image1) # fmt: off - tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False) + tokenized_sentence = tokenizer(text_str, add_special_tokens=False) 
expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]] self.assertEqual(inputs["input_ids"], expected_input_ids) self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) @@ -95,11 +125,11 @@ def test_process_interleaved_images_prompts_no_image_splitting(self): ] images = [[self.image1], [self.image2, self.image3]] - inputs = self.processor(text=text, images=images, padding=True) + inputs = processor(text=text, images=images, padding=True) # fmt: off - tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False) - tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False) + tokenized_sentence_1 = tokenizer(text_str_1, add_special_tokens=False) + tokenized_sentence_2 = tokenizer(text_str_2, add_special_tokens=False) expected_input_ids_1 = [self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"] expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] # Pad the first input to match the second input @@ -117,15 +147,13 @@ def test_process_interleaved_images_prompts_no_image_splitting(self): self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 767, 980)) # fmt: on - self.processor.image_processor.do_image_splitting = old_image_splitting - def test_process_interleaved_images_prompts_image_splitting(self): - old_image_splitting = self.processor.image_processor.do_image_splitting - - self.processor.image_processor.do_image_splitting = True + processor = self.get_processor() + tokenizer = self.get_tokenizer() + processor.image_processor.do_image_splitting = True # Test that a single image is processed correctly - inputs = self.processor(images=self.image1) + inputs = processor(images=self.image1) self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980)) self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980)) # fmt: on @@ -134,10 +162,10 @@ def test_process_interleaved_images_prompts_image_splitting(self): image_str = "<image>" text_str = "In this image, we see" text = image_str + text_str - inputs = self.processor(text=text, images=self.image1) + inputs = processor(text=text, images=self.image1) # fmt: off - tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False) + tokenized_sentence = tokenizer(text_str, add_special_tokens=False) expected_input_ids = [[self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence["input_ids"]] self.assertEqual(inputs["input_ids"], expected_input_ids) self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) @@ -156,11 +184,11 @@ def test_process_interleaved_images_prompts_image_splitting(self): ] images = [[self.image1], [self.image2, self.image3]] - inputs = self.processor(text=text, images=images, padding=True) + inputs = processor(text=text, images=images, padding=True) # fmt: off - tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False) - tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False) + tokenized_sentence_1 = tokenizer(text_str_1, 
add_special_tokens=False) + tokenized_sentence_2 = tokenizer(text_str_2, add_special_tokens=False) expected_input_ids_1 = [self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"] expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] # Pad the first input to match the second input @@ -178,22 +206,22 @@ def test_process_interleaved_images_prompts_image_splitting(self): self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 10, 767, 980)) # fmt: on - self.processor.image_processor.do_image_splitting = old_image_splitting - def test_add_special_tokens_processor(self): + processor = self.get_processor() + tokenizer = self.get_tokenizer() image_str = "<image>" text_str = "In this image, we see" text = text_str + image_str - n_image_repeat = 5 if self.processor.image_processor.do_image_splitting else 1 + n_image_repeat = 5 if processor.image_processor.do_image_splitting else 1 # fmt: off - inputs = self.processor(text=text, images=self.image1, add_special_tokens=False) - tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False) + inputs = processor(text=text, images=self.image1, add_special_tokens=False) + tokenized_sentence = tokenizer(text_str, add_special_tokens=False) expected_input_ids = [tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]] self.assertEqual(inputs["input_ids"], expected_input_ids) - inputs = self.processor(text=text, images=self.image1) + inputs = processor(text=text, images=self.image1) expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]] self.assertEqual(inputs["input_ids"], expected_input_ids) # fmt: on @@ -222,7 +250,7 @@ def test_apply_chat_template(self): {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]}, ] - processor = self.processor + processor = self.get_processor() # Make short sequence length to test that the fake tokens are added correctly rendered = processor.apply_chat_template(messages, add_generation_prompt=True) @@ -233,3 +261,27 @@ def test_apply_chat_template(self): "Assistant:" ) self.assertEqual(rendered, expected_rendered) + + # Override as Idefics2Processor needs image tokens in prompts + def prepare_text_inputs(self, batch_size: Optional[int] = None): + if batch_size is None: + return "lower newer <image>" + + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + + if batch_size == 1: + return ["lower newer <image>"] + return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * ( + batch_size - 2 + ) + + # Override as PixtralProcessor needs nested images to work properly with batched inputs + @require_vision + def prepare_image_inputs(self, batch_size: Optional[int] = None): + """This function prepares a list of PIL images for testing""" + if batch_size is None: + return super().prepare_image_inputs() + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + return [[super().prepare_image_inputs()]] * batch_size diff --git a/tests/models/idefics3/test_processing_idefics3.py 
b/tests/models/idefics3/test_processor_idefics3.py similarity index 89% rename from tests/models/idefics3/test_processing_idefics3.py rename to tests/models/idefics3/test_processor_idefics3.py index 92f0fbb1f8e268..a53109b02b6951 100644 --- a/tests/models/idefics3/test_processing_idefics3.py +++ b/tests/models/idefics3/test_processor_idefics3.py @@ -17,6 +17,7 @@ import tempfile import unittest from io import BytesIO +from typing import Optional import numpy as np import requests @@ -284,44 +285,29 @@ def test_apply_chat_template(self): ) self.assertEqual(rendered, expected_rendered) - @require_torch - @require_vision - def test_image_processor_defaults_preserved_by_image_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117) + # Override as Idefics3Processor needs image tokens in prompts + def prepare_text_inputs(self, batch_size: Optional[int] = None): + if batch_size is None: + return "lower newer <image>" - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer <image>" - image_input = self.prepare_image_inputs() + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 3) - self.assertEqual(len(inputs["pixel_values"][0][0][0]), 364) # crop size doesn't affect our image processor - - @require_torch - @require_vision - def test_kwargs_overrides_default_image_processor_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component( - "image_processor", max_image_size={"longest_edge": 32}, size={"longest_edge": 32} + if batch_size == 1: + return ["lower newer <image>"] + return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * ( + batch_size - 2 ) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor, image_seq_len=2) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer <image>" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 3) - self.assertEqual(len(inputs["pixel_values"][0][0][0]), 32) - self.assertEqual(len(inputs["input_ids"][0]), 117) + # Override as Idefics3Processor needs nested images to work properly with batched inputs + @require_vision + def prepare_image_inputs(self, batch_size: Optional[int] = None): + """This function prepares a list of PIL images for testing""" + if batch_size is None: + return super().prepare_image_inputs() + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + return [[super().prepare_image_inputs()]] * batch_size @require_vision @require_torch @@ -333,7 +319,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = 
processor(text=input_str, images=image_input, return_tensors="pt", max_length=30) @@ -350,7 +336,7 @@ def test_structured_kwargs_nested(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -378,7 +364,7 @@ def test_structured_kwargs_nested_from_dict(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -402,7 +388,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") @@ -419,11 +405,11 @@ def test_unstructured_kwargs_batched(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() + input_str = self.prepare_text_inputs(batch_size=2) + image_input = self.prepare_image_inputs(batch_size=2) inputs = processor( text=input_str, - images=[image_input, image_input], + images=image_input, return_tensors="pt", padding="longest", max_length=76, @@ -446,7 +432,7 @@ def test_unstructured_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor( text=input_str, diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index 6e1a2cf2cf9c44..251f293f722661 100644 --- a/tests/models/jamba/test_modeling_jamba.py +++ b/tests/models/jamba/test_modeling_jamba.py @@ -653,7 +653,7 @@ class JambaModelIntegrationTest(unittest.TestCase): @classmethod def setUpClass(cls): - model_id = "ai21labs/Jamba-tiny-random" + model_id = "ai21labs/Jamba-tiny-dev" cls.model = JambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) cls.tokenizer = AutoTokenizer.from_pretrained(model_id) if is_torch_available() and torch.cuda.is_available(): @@ -668,7 +668,7 @@ def test_simple_generate(self): # considering differences in hardware processing and potential deviations in generated text. EXPECTED_TEXTS = { 7: "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh<|reserved_797|>cw algunas", - 8: "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh Hebrew llam bb", + 8: "<|startoftext|>Hey how are you doing on this lovely evening? I'm so glad you're here.", 9: "<|startoftext|>Hey how are you doing on this lovely evening? 
Canyon rins hugaughter glamour Rutgers Singh Hebrew llam bb", } @@ -688,11 +688,11 @@ def test_simple_generate(self): EXPECTED_LOGITS_NO_GRAD = torch.tensor( [ - 0.0134, -0.2197, 0.0396, -0.1011, 0.0459, 0.2793, -0.1465, 0.1660, - -0.2930, -0.0278, 0.0269, -0.5586, -0.2109, -0.1426, -0.1553, 0.1279, - 0.0713, 0.2246, 0.1660, -0.2314, -0.1187, -0.1162, -0.1377, 0.0292, - 0.1245, 0.2275, 0.0374, 0.1089, -0.1348, -0.2305, 0.1484, -0.3906, - 0.1709, -0.4590, -0.0447, 0.2422, 0.1592, -0.1855, 0.2441, -0.0562 + -7.6875, -7.6562, 8.9375, -7.7812, -7.4062, -7.9688, -8.3125, -7.4062, + -7.8125, -8.1250, -7.8125, -7.3750, -7.8438, -7.5000, -8.0625, -8.0625, + -7.5938, -7.9688, -8.2500, -7.5625, -7.7500, -7.7500, -7.6562, -7.6250, + -8.1250, -8.0625, -8.1250, -7.8750, -8.1875, -8.2500, -7.5938, -8.0000, + -7.5000, -7.7500, -7.9375, -7.4688, -8.0625, -7.3438, -8.0000, -7.5000 ] , dtype=torch.float32) # fmt: skip @@ -710,8 +710,8 @@ def test_simple_batched_generate_with_padding(self): "<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|>Tell me a storyptus Nets Madison El chamadamodern updximVaparsed", ], 8: [ - "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh<|reserved_797|>cw algunas", - "<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|>Tell me a storyptus Nets Madison El chamadamodern updximVaparsed", + "<|startoftext|>Hey how are you doing on this lovely evening? I'm so glad you're here.", + "<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|>Tell me a story about a woman who was born in the United States", ], 9: [ "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh<|reserved_797|>cw algunas", @@ -737,21 +737,21 @@ def test_simple_batched_generate_with_padding(self): # TODO fix logits EXPECTED_LOGITS_NO_GRAD_0 = torch.tensor( [ - 0.0166, -0.2227, 0.0396, -0.1035, 0.0459, 0.2754, -0.1445, 0.1641, - -0.2910, -0.0273, 0.0227, -0.5547, -0.2139, -0.1396, -0.1582, 0.1289, - 0.0713, 0.2256, 0.1699, -0.2295, -0.1182, -0.1167, -0.1387, 0.0261, - 0.1270, 0.2285, 0.0403, 0.1108, -0.1318, -0.2334, 0.1455, -0.3945, - 0.1729, -0.4609, -0.0410, 0.2412, 0.1572, -0.1895, 0.2402, -0.0583 + -7.7188, -7.6875, 8.8750, -7.8125, -7.4062, -8.0000, -8.3125, -7.4375, + -7.8125, -8.1250, -7.8125, -7.4062, -7.8438, -7.5312, -8.0625, -8.0625, + -7.6250, -8.0000, -8.3125, -7.5938, -7.7500, -7.7500, -7.6562, -7.6562, + -8.1250, -8.0625, -8.1250, -7.8750, -8.1875, -8.2500, -7.5938, -8.0625, + -7.5000, -7.7812, -7.9375, -7.4688, -8.0625, -7.3750, -8.0000, -7.50003 ] , dtype=torch.float32) # fmt: skip EXPECTED_LOGITS_NO_GRAD_1 = torch.tensor( [ - -0.1318, 0.2354, -0.4160, -0.0325, -0.0461, 0.0342, 0.2578, 0.0874, - 0.1484, 0.2266, -0.1182, -0.1396, -0.1494, -0.1089, -0.0019, -0.2852, - 0.1973, -0.2676, 0.0586, -0.1992, -0.2520, -0.1147, -0.1973, 0.2129, - 0.0520, 0.1699, 0.1816, 0.1289, 0.1699, -0.1216, -0.2656, -0.2891, - 0.2363, 0.2656, 0.0488, -0.1875, 0.2148, -0.1250, 0.1816, 0.0077 + -3.5469, -4.0625, 8.5000, -3.8125, -3.6406, -3.7969, -3.8125, -3.3594, + -3.7188, -3.7500, -3.7656, -3.5469, -3.7969, -4.0000, -3.5625, -3.6406, + -3.7188, -3.6094, -4.0938, -3.6719, -3.8906, -3.9844, -3.8594, -3.4219, + -3.2031, -3.4375, -3.7500, -3.6562, -3.9688, -4.1250, -3.6406, -3.57811, + -3.0312, -3.4844, -3.6094, -3.5938, -3.7656, -3.8125, -3.7500, -3.8594 ] , dtype=torch.float32) # fmt: skip diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 
913111c0a088e1..396a4179388fb8 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -23,7 +23,7 @@ import numpy as np import requests -from transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config +from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -260,9 +260,16 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) # TODO: `image-to-text` pipeline for this model needs Processor. def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - return pipeline_test_casse_name == "ImageToTextPipelineTests" + return pipeline_test_case_name == "ImageToTextPipelineTests" def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) @@ -551,7 +558,7 @@ def test_snowman_image_captioning(self): image.save("new_image.jpg") image = Image.open("new_image.jpg") - model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device) + model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device) processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") prompt = "An image of" @@ -697,7 +704,7 @@ def test_snowman_image_captioning_batch(self): image.save("new_image.jpg") image = Image.open("new_image.jpg") - model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device) + model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device) prompt = ["Describe this image in detail:", "An image of"] diff --git a/tests/models/kosmos2/test_processor_kosmos2.py b/tests/models/kosmos2/test_processor_kosmos2.py index 8de398ade70c71..8874c7d1d30e03 100644 --- a/tests/models/kosmos2/test_processor_kosmos2.py +++ b/tests/models/kosmos2/test_processor_kosmos2.py @@ -499,7 +499,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # set image input to None image_input = None @@ -525,7 +525,7 @@ def test_structured_kwargs_nested(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -551,7 +551,7 @@ def test_structured_kwargs_nested_from_dict(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -574,7 +574,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): processor = 
self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # set image input to None image_input = None @@ -593,7 +593,7 @@ def test_unstructured_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # set image input to None image_input = None inputs = processor( @@ -618,7 +618,7 @@ def test_unstructured_kwargs_batched(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer", "upper older longer string"] + input_str = self.prepare_text_inputs(batch_size=2) # set image input to None image_input = None inputs = processor( diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py index 6cb93d8b427be4..d62a7273bd5898 100644 --- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py @@ -292,7 +292,14 @@ class LayoutLMv3ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): # `DocumentQuestionAnsweringPipeline` is expected to work with this model, but it combines the text and visual # embedding along the sequence dimension (dim 1), which causes an error during post-processing as `p_mask` has diff --git a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py index f473eb75802b95..ad0528f8da04d8 100644 --- a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py @@ -288,7 +288,14 @@ class TFLayoutLMv3ModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Te # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/led/test_modeling_led.py b/tests/models/led/test_modeling_led.py index a4d81ab2e1c6db..8e31758a4395b8 100644 --- a/tests/models/led/test_modeling_led.py +++ b/tests/models/led/test_modeling_led.py @@ -302,9 +302,16 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # TODO: Fix the failed tests when this model gets more usage def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): + if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): return True return False @@ -338,13 +345,11 @@ def test_global_attention(self): 
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.check_global_attention(*config_and_inputs) - def _get_input_ids_and_config(self, batch_size=2): - config, input_ids, attention_mask, inputs_dict = GenerationTesterMixin._get_input_ids_and_config( - self, batch_size=batch_size - ) + def prepare_config_and_inputs_for_generate(self, *args, **kwargs): + config, inputs_dict = super().prepare_config_and_inputs_for_generate(*args, **kwargs) # LED computes attention scores based on mask indices if `is_global` inputs_dict.pop("global_attention_mask") - return config, input_ids, attention_mask, inputs_dict + return config, inputs_dict # LEDForSequenceClassification does not support inputs_embeds def test_inputs_embeds(self): diff --git a/tests/models/lilt/test_modeling_lilt.py b/tests/models/lilt/test_modeling_lilt.py index 0d0ed720c53a2f..dc3aaaa4ee5e2d 100644 --- a/tests/models/lilt/test_modeling_lilt.py +++ b/tests/models/lilt/test_modeling_lilt.py @@ -245,7 +245,14 @@ class LiltModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index a21665c822f2f9..3a103f3efa9eb3 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -112,7 +112,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: @@ -319,6 +319,9 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # used in `test_torch_compile` _torch_compile_test_ckpt = "meta-llama/Llama-2-7b-hf" + # used in `test_torch_compile_for_training` + _torch_compile_train_cls = LlamaForCausalLM if is_torch_available() else None + def setUp(self): self.model_tester = LlamaModelTester(self) self.config_tester = ConfigTester(self, config_class=LlamaConfig, hidden_size=37) @@ -874,7 +877,7 @@ def test_compile_static_cache(self): ] tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token="", padding_side="right") model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", device_map="sequential", torch_dtype=torch.float16 + "meta-llama/Llama-2-7b-hf", device_map=torch_device, torch_dtype=torch.float16 ) inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) diff --git a/tests/models/llava_onevision/test_processing_llava_onevision.py b/tests/models/llava_onevision/test_processor_llava_onevision.py similarity index 100% rename from tests/models/llava_onevision/test_processing_llava_onevision.py rename to tests/models/llava_onevision/test_processor_llava_onevision.py diff --git a/tests/models/longformer/test_modeling_longformer.py b/tests/models/longformer/test_modeling_longformer.py index e7f2f67cc23236..23765fe8ce274f 100644 --- a/tests/models/longformer/test_modeling_longformer.py +++ b/tests/models/longformer/test_modeling_longformer.py @@ -331,10 +331,17 @@ class LongformerModelTest(ModelTesterMixin, 
PipelineTesterMixin, unittest.TestCa # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): diff --git a/tests/models/longformer/test_modeling_tf_longformer.py b/tests/models/longformer/test_modeling_tf_longformer.py index 131c077653458f..7e0c57c49a0a46 100644 --- a/tests/models/longformer/test_modeling_tf_longformer.py +++ b/tests/models/longformer/test_modeling_tf_longformer.py @@ -303,10 +303,17 @@ class TFLongformerModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Te # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py index a9a5d4f9387b6c..8fd68cd3240ea5 100644 --- a/tests/models/luke/test_modeling_luke.py +++ b/tests/models/luke/test_modeling_luke.py @@ -621,9 +621,16 @@ class LukeModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name in ["QAPipelineTests", "ZeroShotClassificationPipelineTests"]: + if pipeline_test_case_name in ["QAPipelineTests", "ZeroShotClassificationPipelineTests"]: return True return False diff --git a/tests/models/m2m_100/test_modeling_m2m_100.py b/tests/models/m2m_100/test_modeling_m2m_100.py index a29a9c8a9ec0dc..4fe0902c615b14 100644 --- a/tests/models/m2m_100/test_modeling_m2m_100.py +++ b/tests/models/m2m_100/test_modeling_m2m_100.py @@ -258,9 +258,16 @@ class M2M100ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "TranslationPipelineTests": + if pipeline_test_case_name == "TranslationPipelineTests": # Get `ValueError: Translation requires a `src_lang` and a `tgt_lang` for this model`. # `M2M100Config` was never used in pipeline tests: cannot create a simple tokenizer. 
return True diff --git a/tests/models/markuplm/test_modeling_markuplm.py b/tests/models/markuplm/test_modeling_markuplm.py index 71757385e87c91..8dfcee2484bd4b 100644 --- a/tests/models/markuplm/test_modeling_markuplm.py +++ b/tests/models/markuplm/test_modeling_markuplm.py @@ -301,7 +301,14 @@ class MarkupLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): # ValueError: Nodes must be of type `List[str]` (single pretokenized example), or `List[List[str]]` # (batch of pretokenized examples). diff --git a/tests/models/mbart/test_modeling_flax_mbart.py b/tests/models/mbart/test_modeling_flax_mbart.py index a642b2344c9e1e..ef48e7c4f3e95d 100644 --- a/tests/models/mbart/test_modeling_flax_mbart.py +++ b/tests/models/mbart/test_modeling_flax_mbart.py @@ -116,6 +116,7 @@ def __init__( self.bos_token_id = bos_token_id self.decoder_start_token_id = decoder_start_token_id self.initializer_range = initializer_range + super().__init__() def prepare_config_and_inputs(self): input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size) diff --git a/tests/models/mbart/test_modeling_mbart.py b/tests/models/mbart/test_modeling_mbart.py index 9401d892daa39b..10b59877f19a6c 100644 --- a/tests/models/mbart/test_modeling_mbart.py +++ b/tests/models/mbart/test_modeling_mbart.py @@ -252,9 +252,16 @@ class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): + if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): return True return False diff --git a/tests/models/mbart/test_modeling_tf_mbart.py b/tests/models/mbart/test_modeling_tf_mbart.py index 515ed132d7d7ea..d2a672c7913325 100644 --- a/tests/models/mbart/test_modeling_tf_mbart.py +++ b/tests/models/mbart/test_modeling_tf_mbart.py @@ -175,9 +175,16 @@ class TFMBartModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCas # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name != "FeatureExtractionPipelineTests": + if pipeline_test_case_name != "FeatureExtractionPipelineTests": # Exception encountered when calling layer '...' 
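# Note: the `input_mask` change in the Llama-style testers (Llama above, Mistral, Mixtral, OLMo and
# OLMoE below) replaces a hand-sized `torch.ones(self.batch_size, self.seq_length)` with
# `torch.ones_like(input_ids)`, so the mask inherits the shape, dtype and device of `input_ids`
# instead of repeating the sizes. A small self-contained sketch with illustrative sizes:

import torch

input_ids = torch.randint(0, 99, (2, 8))           # (batch_size, seq_length)
old_mask = torch.tril(torch.ones(2, 8))            # float mask, sizes spelled out by hand
new_mask = torch.tril(torch.ones_like(input_ids))  # int64 mask, follows input_ids automatically
assert old_mask.shape == new_mask.shape == input_ids.shape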
return True diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index dd0f77421be728..ab6184ce2bbed8 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -170,7 +170,6 @@ class MimiModelTest(ModelTesterMixin, unittest.TestCase): test_headmasking = False test_resize_embeddings = False test_torchscript = False - input_name = "input_values" def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): # model does support returning hidden states diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index 0730f8ba444140..885795a1291207 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -112,7 +112,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: @@ -313,7 +313,14 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True @@ -677,7 +684,7 @@ def test_compile_static_cache(self): tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False) tokenizer.pad_token = tokenizer.eos_token model = MistralForCausalLM.from_pretrained( - "mistralai/Mistral-7B-v0.1", device_map="sequential", torch_dtype=torch.float16 + "mistralai/Mistral-7B-v0.1", device_map=torch_device, torch_dtype=torch.float16 ) inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) diff --git a/tests/models/mistral/test_modeling_tf_mistral.py b/tests/models/mistral/test_modeling_tf_mistral.py index df07e96bb106d5..448b40fc44c858 100644 --- a/tests/models/mistral/test_modeling_tf_mistral.py +++ b/tests/models/mistral/test_modeling_tf_mistral.py @@ -266,7 +266,14 @@ class TFMistralModelTest(TFModelTesterMixin, TFGenerationIntegrationTests, Pipel # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index db9641e3dcb2a9..a2655bb773dcdd 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -108,7 +108,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: @@ -313,7 +313,14 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index f31957d78aa8a9..5c5ca3985ee08f 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -138,14 +138,6 @@ def setUp(self): def test_eager_matches_sdpa_generate(self): super().test_eager_matches_sdpa_generate() - @unittest.skip(reason="The outputs don't match, no idea why") - def test_beam_search_low_memory(self): - pass - - @unittest.skip(reason="Quanto test is borken") - def test_generate_with_quant_cache(self): - pass - class MllamaVisionText2TextModelTester: def __init__( @@ -208,6 +200,7 @@ def __init__( self.image_size = 224 self.max_num_images = 1 self.max_image_tiles = 4 + self.image_length = 904 def get_config(self): return MllamaConfig( @@ -329,6 +322,43 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + def _check_attentions_for_generate( + self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + # Mllama has cross attention layers and those have a different shape than normal attention layers + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) + ) + self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) + + cross_attention_layers = self.model_tester.text_config["cross_attention_layers"] + + for idx, iter_attentions in enumerate(attentions): + tgt_len = min_length + idx if not use_cache else 1 + src_len = min_length + idx + + expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + tgt_len, + src_len, + ) + + expected_shape_cross = ( + batch_size * num_beam_groups, + config.num_attention_heads, + tgt_len, + self.model_tester.image_length, + ) + + 
expected_shapes = [ + expected_shape if layer_idx not in cross_attention_layers else expected_shape_cross + for layer_idx in range(len(iter_attentions)) + ] + + self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes) + @require_torch_sdpa @slow @is_flaky() @@ -342,85 +372,33 @@ def test_eager_matches_sdpa_inference_1_bfloat16(self): # A workaround to override parametrized test with flaky decorator super().test_eager_matches_sdpa_inference_1_bfloat16() - @unittest.skip(reason="Static cache not supported") - def test_static_cache_matches_dynamic(self): - # TypeError: list indices must be integers or slices, not tuple - # TODO: @raushan, please look into this for new cache format - pass - - @unittest.skip(reason="Mllama has dynamic control flow which is not yet supported by compile") - def test_generate_compile_fullgraph(self): - pass - - @unittest.skip(reason="The outputs don't match, no idea why") - def test_beam_search_low_memory(self): - pass - - @unittest.skip(reason="Mllama is not yet supported by compile") + @unittest.skip("For some unknown reasons the tests fails in CrossAttention layer when doing torch.sdpa(). ") def test_sdpa_can_compile_dynamic(self): - # TODO: look into this, AttributeError("'tensor' object has no attribute '__pow__'") - # relevant issue: https://github.com/pytorch/pytorch/issues/133166 - pass - - @unittest.skip(reason="The test itself is broken") # TODO @zucchini-nlp - def test_generate_with_quant_cache(self): pass @unittest.skip(reason="AssertionError: Items in the second set but not the first: might be a setting issue") def test_model_parallelism(self): pass - @unittest.skip(reason="Failing test, need to fix") - def test_compile_cuda_graph_time(self): - pass - - @unittest.skip(reason="Failing test, need to fix") - def test_torch_compile_fullgraph(self): - pass - - @unittest.skip(reason="Device side assert triggered") - def test_assisted_decoding_with_num_logits_to_keep(self): - pass - - @unittest.skip(reason="Failing test, need to fix") - def test_beam_sample_generate_dict_output(): - pass - - @unittest.skip(reason="Failing test, need to fix") - def test_beam_search_generate_dict_output(): - pass - - @unittest.skip(reason="Failing test, need to fix") - def test_constrained_beam_search_generate_dict_output(): - pass - - @unittest.skip(reason="Failing test, need to fix") - def test_dola_decoding_sample(): - pass - - @unittest.skip(reason="Failing test, need to fix") - def test_generate_methods_with_num_logits_to_keep(): - pass - - @unittest.skip(reason="Failing test, need to fix") - def test_greedy_generate_dict_outputs(): - pass + def test_generate_text_only_with_cache(self): + """ + Tests that our cached generation with text-only inputs works. When mllama was introduced, this feature + required cache modifications (because layers are skipped in practice). This test should prevent regressions. 
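# Note: Mllama's `_check_attentions_for_generate` override above exists because its cross-attention
# layers attend over a fixed number of image tokens (`image_length = 904` in this tester) rather
# than over the growing text sequence. A small sketch of how the expected per-layer shapes are
# assembled; the sizes in the example call are illustrative:

def expected_attention_shapes(
    cross_attention_layers, num_layers, batch_size, num_heads, tgt_len, src_len, image_length
):
    # one expected (batch, heads, query_len, key_len) tuple per decoder layer
    return [
        (batch_size, num_heads, tgt_len, image_length if layer in cross_attention_layers else src_len)
        for layer in range(num_layers)
    ]

# e.g. cached generation (query length 1), with layers 3 and 8 being cross-attention layers:
shapes = expected_attention_shapes({3, 8}, 10, batch_size=2, num_heads=4, tgt_len=1, src_len=7, image_length=904)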
+ """ + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - @unittest.skip(reason="Failing test, need to fix") - def test_group_beam_search_generate_dict_output(): - pass + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() - @unittest.skip(reason="Failing test, need to fix") - def test_model_parallel_beam_search(): - pass + inputs = self._prepare_for_class(inputs_dict, model_class) - @unittest.skip(reason="Failing test, need to fix") - def test_new_cache_format_2(): - pass + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] - @unittest.skip(reason="Failing test, need to fix") - def test_sample_generate_dict_output(): - pass + model.generate(input_ids, use_cache=True) @require_torch diff --git a/tests/models/mllama/test_processor_mllama.py b/tests/models/mllama/test_processor_mllama.py index b6233d9e177cdb..a48a7a2e6da4d2 100644 --- a/tests/models/mllama/test_processor_mllama.py +++ b/tests/models/mllama/test_processor_mllama.py @@ -13,7 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import shutil +import tempfile import unittest +from typing import Optional import numpy as np @@ -21,6 +24,8 @@ from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image @@ -28,17 +33,24 @@ @require_torch @require_vision -class MllamaProcessorTest(unittest.TestCase): +class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = MllamaProcessor + def setUp(self): - self.checkpoint = "hf-internal-testing/mllama-11b" # TODO: change - self.processor = MllamaProcessor.from_pretrained(self.checkpoint) + self.checkpoint = "hf-internal-testing/mllama-11b" + processor = MllamaProcessor.from_pretrained(self.checkpoint) self.image1 = Image.new("RGB", (224, 220)) self.image2 = Image.new("RGB", (512, 128)) - self.image_token = self.processor.image_token - self.image_token_id = self.processor.image_token_id - self.pad_token_id = self.processor.tokenizer.pad_token_id - self.bos_token = self.processor.bos_token - self.bos_token_id = self.processor.tokenizer.bos_token_id + self.image_token = processor.image_token + self.image_token_id = processor.image_token_id + self.pad_token_id = processor.tokenizer.pad_token_id + self.bos_token = processor.bos_token + self.bos_token_id = processor.tokenizer.bos_token_id + self.tmpdirname = tempfile.mkdtemp() + processor.save_pretrained(self.tmpdirname) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) def test_apply_chat_template(self): # Message contains content which a mix of lists with images and image urls and string @@ -64,8 +76,8 @@ def test_apply_chat_template(self): ], }, ] - - rendered = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + processor = MllamaProcessor.from_pretrained(self.tmpdirname) + rendered = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) expected_rendered = ( "<|begin_of_text|>" @@ -96,7 +108,7 @@ def test_apply_chat_template(self): ], }, ] - input_ids = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) + input_ids = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) expected_ids = [ 128000, # <|begin_of_text|> 128006, # 
<|start_header_id|> @@ -142,7 +154,7 @@ def test_apply_chat_template(self): } ] - rendered = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + rendered = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) expected_rendered = ( "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" "Describe this image in two sentences<|image|> Test sentence <|image|>ok\n<|eot_id|>" @@ -150,7 +162,7 @@ def test_apply_chat_template(self): ) self.assertEqual(rendered, expected_rendered) - input_ids = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) + input_ids = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) # fmt: off expected_ids = [ 128000, 128006, 882, 128007, 271, 75885, 420, 2217, 304, 1403, 23719, 128256, @@ -176,18 +188,19 @@ def test_apply_chat_template(self): } ] - rendered_list = self.processor.apply_chat_template(messages_list, add_generation_prompt=True, tokenize=False) - rendered_str = self.processor.apply_chat_template(messages_str, add_generation_prompt=True, tokenize=False) + rendered_list = processor.apply_chat_template(messages_list, add_generation_prompt=True, tokenize=False) + rendered_str = processor.apply_chat_template(messages_str, add_generation_prompt=True, tokenize=False) self.assertEqual(rendered_list, rendered_str) def test_process_interleaved_images_prompts_image_splitting(self): + processor = MllamaProcessor.from_pretrained(self.tmpdirname) # Test that a single image is processed correctly - inputs = self.processor(images=self.image2, size={"width": 224, "height": 224}) + inputs = processor(images=self.image2, size={"width": 224, "height": 224}) self.assertEqual(inputs["pixel_values"].shape, (1, 1, 4, 3, 224, 224)) # Test that text is processed correctly text = "<|begin_of_text|>This is a test sentence.<|end_of_text|>" - inputs = self.processor(text=text) + inputs = processor(text=text) expected_ids = [128000, 2028, 374, 264, 1296, 11914, 13, 128001] self.assertEqual(inputs["input_ids"][0], expected_ids) self.assertEqual(inputs["attention_mask"][0], [1] * len(expected_ids)) @@ -197,7 +210,7 @@ def test_process_interleaved_images_prompts_image_splitting(self): image_str = "<|image|>" text_str = "This is a test sentence." 
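# Note: the MllamaProcessor tests in this hunk adopt the ProcessorTesterMixin fixture style: `setUp`
# saves the pretrained processor into a temporary directory, `tearDown` removes it, and each test
# reloads the processor from disk instead of holding a long-lived `self.processor`. A minimal
# sketch of that fixture, reusing the checkpoint name from the diff:

import shutil
import tempfile
import unittest

from transformers import MllamaProcessor


class _SketchMllamaProcessorFixture(unittest.TestCase):
    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
        MllamaProcessor.from_pretrained("hf-internal-testing/mllama-11b").save_pretrained(self.tmpdirname)

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def test_reload(self):
        # each test loads a fresh processor from the temp dir
        processor = MllamaProcessor.from_pretrained(self.tmpdirname)
        self.assertIsNotNone(processor)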
text = image_str + text_str - inputs = self.processor( + inputs = processor( text=text, images=self.image1, size={"width": 128, "height": 128}, @@ -225,7 +238,7 @@ def test_process_interleaved_images_prompts_image_splitting(self): ] # fmt: onn images = [[self.image1], [self.image1, self.image2]] - inputs = self.processor(text=text, images=images, padding=True, size={"width": 256, "height": 256}) + inputs = processor(text=text, images=images, padding=True, size={"width": 256, "height": 256}) self.assertEqual(inputs["pixel_values"].shape, (2, 2, 4, 3, 256, 256)) for input_ids_i, attention_mask_i, expected_ids_i in zip(inputs["input_ids"], inputs["attention_mask"], expected_ids): @@ -264,7 +277,8 @@ def test_process_interleaved_images_prompts_image_error(self): "This is a test sentence.", "In this other sentence we try some good things", ] - inputs = self.processor(text=text, images=None, padding=True) + processor = MllamaProcessor.from_pretrained(self.tmpdirname) + inputs = processor(text=text, images=None, padding=True) self.assertIsNotNone(inputs["input_ids"]) text = [ @@ -272,26 +286,40 @@ def test_process_interleaved_images_prompts_image_error(self): "In this other sentence we try some good things", ] with self.assertRaises(ValueError): - self.processor(text=text, images=None, padding=True) + processor(text=text, images=None, padding=True) images = [[self.image1], []] with self.assertRaises(ValueError): - self.processor(text=text, images=images, padding=True) + processor(text=text, images=images, padding=True) text = [ "This is a test sentence.<|image|>", "In this other sentence we try some good things<|image|>", ] with self.assertRaises(ValueError): - self.processor(text=text, images=None, padding=True) + processor(text=text, images=None, padding=True) text = [ "This is a test sentence.<|image|>", "In this other sentence we try some good things<|image|>", ] images = [[self.image1], [self.image2]] - inputs = self.processor(text=text, images=images, padding=True) + inputs = processor(text=text, images=images, padding=True) images = [[self.image1, self.image2], []] with self.assertRaises(ValueError): - self.processor(text=text, images=None, padding=True) + processor(text=text, images=None, padding=True) + + # Override as MllamaProcessor needs image tokens in prompts + def prepare_text_inputs(self, batch_size: Optional[int] = None): + if batch_size is None: + return "lower newer <|image|>" + + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + + if batch_size == 1: + return ["lower newer <|image|>"] + return ["lower newer <|image|>", "<|image|> upper older longer string"] + ["<|image|> lower newer"] * ( + batch_size - 2 + ) diff --git a/tests/models/mra/test_modeling_mra.py b/tests/models/mra/test_modeling_mra.py index 4c839f5da10a68..7e785b5f588448 100644 --- a/tests/models/mra/test_modeling_mra.py +++ b/tests/models/mra/test_modeling_mra.py @@ -42,7 +42,8 @@ def __init__( self, parent, batch_size=2, - seq_length=8, + # must be [== max_position_embeddings] AND [multiple of block_size (default = 32)] (?) 
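# Note: per the comment above, the MRA tester's sequence length apparently has to match
# `max_position_embeddings` and be a multiple of the block size (32 by default), which is why both
# values move to 64 below. A tiny sanity check of that constraint, with the block size taken from
# the comment:

def mra_seq_length_ok(seq_length, max_position_embeddings, block_size=32):
    return seq_length == max_position_embeddings and seq_length % block_size == 0

assert mra_seq_length_ok(64, 64)       # the new tester values
assert not mra_seq_length_ok(8, 512)   # the old ones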
+ seq_length=64, is_training=True, use_input_mask=True, use_token_type_ids=True, @@ -55,7 +56,7 @@ def __init__( hidden_act="gelu", hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0, - max_position_embeddings=512, + max_position_embeddings=64, type_vocab_size=16, type_sequence_label_size=2, initializer_range=0.02, diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index ec6ec6cd85c651..6e912ec3607d40 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -582,7 +582,14 @@ def setUp(self): # `QAPipelineTests` is not working well with slow tokenizers (for some models) and we don't want to touch the file # `src/transformers/data/processors/squad.py` (where this test fails for this model) def is_pipeline_test_to_skip( - self, pipeline_test_case_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if tokenizer_name is None: return True @@ -1055,6 +1062,26 @@ def test_with_token_classification_head(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_with_token_classification_head(*config_and_inputs) + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if tokenizer_name is None: + return True + + # `MT5EncoderOnlyModelTest` is not working well with slow tokenizers (for some models) and we don't want to touch the file + # `src/transformers/data/processors/squad.py` (where this test fails for this model) + if pipeline_test_case_name == "TokenClassificationPipelineTests" and not tokenizer_name.endswith("Fast"): + return True + + return False + @require_torch @require_sentencepiece diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index a385a18b91c5d5..cc30238c8df9f5 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -60,10 +60,6 @@ MusicgenModel, set_seed, ) - from transformers.generation import ( - GenerateDecoderOnlyOutput, - GenerateEncoderDecoderOutput, - ) def _config_zero_init(config): @@ -124,6 +120,7 @@ def __init__( pad_token_id=99, bos_token_id=99, num_codebooks=4, + audio_channels=1, ): self.parent = parent self.batch_size = batch_size @@ -141,6 +138,7 @@ def __init__( self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id self.num_codebooks = num_codebooks + self.audio_channels = audio_channels def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size * self.num_codebooks, self.seq_length], self.vocab_size) @@ -166,6 +164,7 @@ def get_config(self): bos_token_id=self.bos_token_id, num_codebooks=self.num_codebooks, tie_word_embeddings=False, + audio_channels=self.audio_channels, ) return config @@ -282,47 +281,15 @@ def test_tie_model_weights(self): def test_tied_weights_keys(self): pass - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - - _ = inputs_dict.pop("attention_mask", None) - inputs_dict = { - k: v[:batch_size, ...] 
- for k, v in inputs_dict.items() - if "head_mask" not in k and isinstance(v, torch.Tensor) - } - - # take max batch_size - sequence_length = input_ids.shape[-1] - input_ids = input_ids[: batch_size * config.num_codebooks, :] - - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long) - return config, input_ids, attention_mask, inputs_dict - def _get_logits_processor_kwargs(self, do_sample=False, config=None): logits_processor_kwargs = {} return logits_processor_kwargs def test_greedy_generate_stereo_outputs(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - config.audio_channels = 2 - model = model_class(config).to(torch_device).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - inputs_dict={}, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - - self.assertNotIn(config.pad_token_id, output_generate) + original_audio_channels = self.model_tester.audio_channels + self.model_tester.audio_channels = 2 + super().test_greedy_generate_dict_outputs() + self.model_tester.audio_channels = original_audio_channels @require_flash_attn @require_torch_gpu @@ -998,6 +965,7 @@ def __init__( num_codebooks=4, num_filters=4, codebook_size=128, + audio_channels=1, ): self.parent = parent self.batch_size = batch_size @@ -1017,6 +985,7 @@ def __init__( self.num_codebooks = num_codebooks self.num_filters = num_filters self.codebook_size = codebook_size + self.audio_channels = audio_channels def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -1052,6 +1021,7 @@ def get_config(self): bos_token_id=self.bos_token_id, num_codebooks=self.num_codebooks, tie_word_embeddings=False, + audio_channels=self.audio_channels, ) config = MusicgenConfig.from_sub_models_config(text_encoder_config, audio_encoder_config, decoder_config) return config @@ -1415,170 +1385,10 @@ def test_model_get_set_embeddings(self): lm_heads = model.get_output_embeddings() self.assertTrue(lm_heads is None or isinstance(lm_heads[0], torch.nn.Linear)) - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - - # take max batch_size - sequence_length = input_ids.shape[-1] - input_ids = input_ids[:batch_size, :] - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long) - - return config, input_ids, attention_mask - - # override since the `input_ids` cannot be used as the `decoder_input_ids` for musicgen (input / outputs are - # different modalities -> different shapes) - def _greedy_generate( - self, - model, - input_ids, - attention_mask, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=False, - num_beams=1, - max_new_tokens=self.max_new_tokens, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **model_kwargs, - ) - - return output_generate - 
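# Note: the rewritten `test_greedy_generate_stereo_outputs` above no longer re-implements greedy
# generation by hand; it flips `audio_channels` on the shared model tester (now a constructor
# argument that `get_config` forwards) and delegates to the generic
# `test_greedy_generate_dict_outputs`. A sketch of that toggle-and-delegate pattern with stand-in
# class names:

class _SketchGenerationTests:
    def test_greedy_generate_dict_outputs(self):
        ...  # the shared greedy-generation checks live in the real mixin


class _SketchMusicgenStyleTest(_SketchGenerationTests):
    def test_greedy_generate_stereo_outputs(self):
        original_audio_channels = self.model_tester.audio_channels
        self.model_tester.audio_channels = 2  # stereo
        super().test_greedy_generate_dict_outputs()
        # restore mono so later tests see the default configuration
        self.model_tester.audio_channels = original_audio_channels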
- # override since the `input_ids` cannot be used as the `decoder_input_ids` for musicgen (input / outputs are - # different modalities -> different shapes) - def _sample_generate( - self, - model, - input_ids, - attention_mask, - num_return_sequences, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - torch.manual_seed(0) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=True, - num_beams=1, - max_new_tokens=self.max_new_tokens, - num_return_sequences=num_return_sequences, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **model_kwargs, - ) - - return output_generate - def _get_logits_processor_kwargs(self, do_sample=False, config=None): logits_processor_kwargs = {} return logits_processor_kwargs - def test_greedy_generate_dict_outputs(self): - for model_class in self.greedy_sample_model_classes: - # disable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.use_cache = False - model = model_class(config).to(torch_device).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) - - self.assertNotIn(config.pad_token_id, output_generate) - - def test_greedy_generate_dict_outputs_use_cache(self): - for model_class in self.greedy_sample_model_classes: - # enable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - - config.use_cache = True - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) - - def test_sample_generate(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask = self._get_input_ids_and_config() - model = model_class(config).to(torch_device).eval() - - # check `generate()` and `sample()` are equal - output_generate = self._sample_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - num_return_sequences=1, - ) - self.assertIsInstance(output_generate, torch.Tensor) - - def test_sample_generate_dict_output(self): - for model_class in self.greedy_sample_model_classes: - # disable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.use_cache = False - model = model_class(config).to(torch_device).eval() - - output_generate = self._sample_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - num_return_sequences=3, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) - - def test_generate_without_input_ids(self): - config, _, _ = 
self._get_input_ids_and_config() - - # if no bos token id => cannot generate from None - if config.bos_token_id is None: - self.skipTest(reason="bos_token_id is None") - - for model_class in self.greedy_sample_model_classes: - model = model_class(config).to(torch_device) - model.eval() - - output_ids_generate = model.generate( - do_sample=False, max_new_tokens=self.max_new_tokens, remove_invalid_values=True - ) - self.assertIsNotNone(output_ids_generate) - @require_torch_fp16 @require_torch_accelerator # not all operations are supported in fp16 on CPU def test_generate_fp16(self): @@ -1595,24 +1405,10 @@ def test_generate_fp16(self): ) def test_greedy_generate_stereo_outputs(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.audio_channels = 2 - - model = model_class(config).to(torch_device).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) - - self.assertNotIn(config.pad_token_id, output_generate) + original_audio_channels = self.model_tester.audio_channels + self.model_tester.audio_channels = 2 + super().test_greedy_generate_dict_outputs() + self.model_tester.audio_channels = original_audio_channels @unittest.skip( reason="MusicgenModel is actually not the base of MusicgenForCausalLM as the latter is a composit model" diff --git a/tests/models/musicgen/test_processing_musicgen.py b/tests/models/musicgen/test_processor_musicgen.py similarity index 100% rename from tests/models/musicgen/test_processing_musicgen.py rename to tests/models/musicgen/test_processor_musicgen.py diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index e8584e238d3cd9..35af9fe0768da8 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -61,9 +61,6 @@ MusicgenMelodyModel, set_seed, ) - from transformers.generation import ( - GenerateDecoderOnlyOutput, - ) if is_torchaudio_available(): from transformers import MusicgenMelodyProcessor @@ -124,6 +121,7 @@ def __init__( bos_token_id=99, num_codebooks=4, conditional_seq_length=4, + audio_channels=1, ): self.parent = parent self.batch_size = batch_size @@ -143,6 +141,7 @@ def __init__( self.num_codebooks = num_codebooks self.conditional_seq_length = conditional_seq_length self.encoder_seq_length = conditional_seq_length + seq_length + self.audio_channels = audio_channels def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size * self.num_codebooks, self.seq_length], self.vocab_size) @@ -168,6 +167,7 @@ def get_config(self): bos_token_id=self.bos_token_id, num_codebooks=self.num_codebooks, tie_word_embeddings=False, + audio_channels=self.audio_channels, ) return config @@ -285,46 +285,15 @@ def test_tie_model_weights(self): def test_tied_weights_keys(self): pass - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - - _ = inputs_dict.pop("attention_mask", None) - inputs_dict = { - k: v[:batch_size, ...] 
- for k, v in inputs_dict.items() - if "head_mask" not in k and isinstance(v, torch.Tensor) - } - - # take max batch_size - sequence_length = input_ids.shape[-1] - input_ids = input_ids[: batch_size * config.num_codebooks, :] - - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long) - return config, input_ids, attention_mask, inputs_dict - def _get_logits_processor_kwargs(self, do_sample=False, config=None): logits_processor_kwargs = {} return logits_processor_kwargs def test_greedy_generate_stereo_outputs(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask, _ = self._get_input_ids_and_config() - config.audio_channels = 2 - model = model_class(config).to(torch_device).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - inputs_dict={}, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - self.assertNotIn(config.pad_token_id, output_generate) + original_audio_channels = self.model_tester.audio_channels + self.model_tester.audio_channels = 2 + super().test_greedy_generate_dict_outputs() + self.model_tester.audio_channels = original_audio_channels @require_flash_attn @require_torch_gpu @@ -996,6 +965,7 @@ def __init__( codebook_size=128, conditional_seq_length=3, chroma_length=24, + audio_channels=1, ): self.parent = parent self.batch_size = batch_size @@ -1018,6 +988,7 @@ def __init__( self.conditional_seq_length = conditional_seq_length self.chroma_length = chroma_length self.encoder_seq_length = conditional_seq_length + seq_length + self.audio_channels = audio_channels def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.conditional_seq_length], self.vocab_size) @@ -1053,6 +1024,7 @@ def get_config(self): bos_token_id=self.bos_token_id, num_codebooks=self.num_codebooks, tie_word_embeddings=False, + audio_channels=self.audio_channels, ) config = MusicgenMelodyConfig.from_sub_models_config( text_encoder_config, audio_encoder_config, decoder_config, chroma_length=self.chroma_length @@ -1399,170 +1371,10 @@ def test_model_get_set_embeddings(self): lm_heads = model.get_output_embeddings() self.assertTrue(lm_heads is None or isinstance(lm_heads[0], torch.nn.Linear)) - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - - # take max batch_size - sequence_length = input_ids.shape[-1] - input_ids = input_ids[:batch_size, :] - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long) - - return config, input_ids, attention_mask - - # override since the `input_ids` cannot be used as the `decoder_input_ids` for musicgen_melody (input / outputs are - # different modalities -> different shapes) - def _greedy_generate( - self, - model, - input_ids, - attention_mask, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=False, - num_beams=1, - max_new_tokens=self.max_new_tokens, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - 
return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **model_kwargs, - ) - - return output_generate - - # override since the `input_ids` cannot be used as the `decoder_input_ids` for musicgen_melody (input / outputs are - # different modalities -> different shapes) - def _sample_generate( - self, - model, - input_ids, - attention_mask, - num_return_sequences, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - torch.manual_seed(0) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=True, - num_beams=1, - max_new_tokens=self.max_new_tokens, - num_return_sequences=num_return_sequences, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **model_kwargs, - ) - - return output_generate - def _get_logits_processor_kwargs(self, do_sample=False, config=None): logits_processor_kwargs = {} return logits_processor_kwargs - def test_greedy_generate_dict_outputs(self): - for model_class in self.greedy_sample_model_classes: - # disable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.use_cache = False - model = model_class(config).to(torch_device).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - - self.assertNotIn(config.pad_token_id, output_generate) - - def test_greedy_generate_dict_outputs_use_cache(self): - for model_class in self.greedy_sample_model_classes: - # enable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - - config.use_cache = True - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - - def test_sample_generate(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask = self._get_input_ids_and_config() - model = model_class(config).to(torch_device).eval() - - # check `generate()` and `sample()` are equal - output_generate = self._sample_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - num_return_sequences=1, - ) - self.assertIsInstance(output_generate, torch.Tensor) - - def test_sample_generate_dict_output(self): - for model_class in self.greedy_sample_model_classes: - # disable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.use_cache = False - model = model_class(config).to(torch_device).eval() - - output_generate = self._sample_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - num_return_sequences=3, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - 
self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - - def test_generate_without_input_ids(self): - config, _, _ = self._get_input_ids_and_config() - - # if no bos token id => cannot generate from None - if config.bos_token_id is None: - self.skipTest(reason="bos_token_id is None") - - for model_class in self.greedy_sample_model_classes: - model = model_class(config).to(torch_device) - model.eval() - - output_ids_generate = model.generate( - do_sample=False, max_new_tokens=self.max_new_tokens, remove_invalid_values=True - ) - self.assertIsNotNone(output_ids_generate) - @require_torch_fp16 @require_torch_accelerator # not all operations are supported in fp16 on CPU def test_generate_fp16(self): @@ -1579,24 +1391,10 @@ def test_generate_fp16(self): ) def test_greedy_generate_stereo_outputs(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.audio_channels = 2 - - model = model_class(config).to(torch_device).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids.to(torch_device), - attention_mask=attention_mask.to(torch_device), - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - - self.assertNotIn(config.pad_token_id, output_generate) + original_audio_channels = self.model_tester.audio_channels + self.model_tester.audio_channels = 2 + super().test_greedy_generate_dict_outputs() + self.model_tester.audio_channels = original_audio_channels @unittest.skip( reason="MusicgenMelodyModel is actually not the base of MusicgenMelodyForCausalLM as the latter is a composit model" diff --git a/tests/models/musicgen_melody/test_processor_musicgen_melody.py b/tests/models/musicgen_melody/test_processor_musicgen_melody.py index e00f31c495990f..04fb94c64c3da8 100644 --- a/tests/models/musicgen_melody/test_processor_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_processor_musicgen_melody.py @@ -50,7 +50,7 @@ def floats_list(shape, scale=1.0, rng=None, name=None): @require_torch @require_sentencepiece @require_torchaudio -# Copied from tests.models.musicgen.test_processing_musicgen.MusicgenProcessorTest with Musicgen->MusicgenMelody, Encodec->MusicgenMelody, padding_mask->attention_mask, input_values->input_features +# Copied from tests.models.musicgen.test_processor_musicgen.MusicgenProcessorTest with Musicgen->MusicgenMelody, Encodec->MusicgenMelody, padding_mask->attention_mask, input_values->input_features class MusicgenMelodyProcessorTest(unittest.TestCase): def setUp(self): # Ignore copy diff --git a/tests/models/mvp/test_modeling_mvp.py b/tests/models/mvp/test_modeling_mvp.py index 33c6d778448d19..f07574bdb54e1a 100644 --- a/tests/models/mvp/test_modeling_mvp.py +++ b/tests/models/mvp/test_modeling_mvp.py @@ -441,10 +441,17 @@ class MvpModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): diff --git 
a/tests/models/myt5/__init__.py b/tests/models/myt5/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/myt5/test_tokenization_myt5.py b/tests/models/myt5/test_tokenization_myt5.py new file mode 100644 index 00000000000000..36e10ac36da6ac --- /dev/null +++ b/tests/models/myt5/test_tokenization_myt5.py @@ -0,0 +1,188 @@ +# coding=utf-8 +# Copyright 2024 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import binascii +import unittest + +from transformers import MyT5Tokenizer +from transformers.utils import is_tf_available, is_torch_available + +from ...test_tokenization_common import TokenizerTesterMixin + + +if is_torch_available(): + FRAMEWORK = "pt" +elif is_tf_available(): + FRAMEWORK = "tf" +else: + FRAMEWORK = "jax" + + +def bytes_to_hex(bline: bytes, sep: str = " ") -> str: + return str(binascii.hexlify(bline, sep), "utf-8") + + +def str_to_hex(line: str, sep: str = " ") -> str: + return bytes_to_hex(bytes(line, "utf-8"), sep) + + +class TestByteRewriter(unittest.TestCase): + def setUp(self) -> None: + self.tokenizer = MyT5Tokenizer.from_pretrained("Tomlim/myt5-base") + + def test_simple_decompose(self): + decompose_rewriter = self.tokenizer.decompose_rewriter + + # test rewriting + in_str = "Hello WorlD" + out_str = "hAello wAorldA" + + in_hex = str_to_hex(in_str).split(" ") + out_hex = str_to_hex(out_str).split(" ") + + self.assertEqual(decompose_rewriter.rewrite_bytes(in_hex), out_hex) + + def test_simple_decompose_reversible(self): + decompose_rewriter = self.tokenizer.decompose_rewriter + + in_str = "Hello WorlD" + out_str = "Hello WorlD" + + in_hex = str_to_hex(in_str).split(" ") + out_hex = str_to_hex(out_str).split(" ") + + self.assertEqual( + decompose_rewriter.rewrite_bytes(decompose_rewriter.rewrite_bytes(in_hex), reverse=True), out_hex + ) + + def test_simple_decompose_non_latin(self): + decompose_rewriter = self.tokenizer.decompose_rewriter + + in_str = "你好世界 Hello WorlD" + out_str = "你好世界 hAello wAorldA" + + in_hex = str_to_hex(in_str).split(" ") + out_hex = str_to_hex(out_str).split(" ") + + self.assertEqual(decompose_rewriter.rewrite_bytes(in_hex), out_hex) + + def test_unrecognized_byte(self): + decompose_rewriter = self.tokenizer.decompose_rewriter + + in_hex = ["00", "01", "xx", "03", "61"] + out_hex = ["00", "01", "xx", "03", "61"] + + self.assertEqual(decompose_rewriter.rewrite_bytes(in_hex), out_hex) + + +class MyT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = MyT5Tokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + def get_tokenizer(self, **kwargs) -> MyT5Tokenizer: + return self.tokenizer_class.from_pretrained("Tomlim/myt5-base", **kwargs) + + @unittest.skip(reason="inputs cannot be pretokenized as ids depend on whole input string") + def test_pretokenized_inputs(self): + pass + + def test_convert_tokens_to_string_format(self): + tokenizer = self.get_tokenizer() + with self.subTest(f"{tokenizer.__class__.__name__}"): + tokens = ["52", "85", "91", "9f", "6f", 
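# Note: the new MyT5 tokenizer tests represent every byte as a two-character lowercase hex token;
# the expected token lists in the assertions are exactly such sequences. The helper below mirrors
# the `str_to_hex` / `bytes_to_hex` utilities defined in the test file and shows what a plain
# UTF-8 hex encoding of "Hello World" looks like (the tokenizer's own output differs because MyT5
# additionally applies byte-rewriting rules):

import binascii


def str_to_hex_tokens(line: str) -> list:
    # UTF-8 encode, then render each byte as a two-character hex token
    return str(binascii.hexlify(line.encode("utf-8"), " "), "utf-8").split(" ")


print(str_to_hex_tokens("Hello World"))
# ['48', '65', '6c', '6c', '6f', '20', '57', '6f', '72', '6c', '64']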
"20", "52", "85", "9f", "90", ""] + string = tokenizer.convert_tokens_to_string(tokens) + + self.assertIsInstance(string, str) + + def test_simple_tokenize(self): + tokenizer = self.get_tokenizer() + + in_str = "Hello World" + out_tokens = ["52", "85", "91", "9f", "6f", "20", "52", "85", "9f", "90"] + + self.assertEqual(tokenizer.tokenize(in_str), out_tokens) + + in_pl_str = "Witaj świecie" + out_tokens = ["77", "41", "69", "74", "61", "6a", "20", "4b", "a5", "97", "63", "69", "65"] + + self.assertEqual(tokenizer.tokenize(in_pl_str), out_tokens) + + in_jp_str = "こんにちは世界" + out_tokens = ["58", "80", "91", "a1", "e4", "b8", "96", "e7", "95", "8c"] + + self.assertEqual(tokenizer.tokenize(in_jp_str), out_tokens) + + def test_batch_tokenize(self): + tokenizer = self.get_tokenizer() + + in_batch = ["Hello World", "Witaj świecie", "こんにちは世界"] + + out_tokens = [ + ["52", "85", "91", "9f", "6f", "20", "52", "85", "9f", "90", ""], + ["77", "41", "69", "74", "61", "6a", "20", "4b", "a5", "97", "63", "69", "65", ""], + ["58", "80", "91", "a1", "e4", "b8", "96", "e7", "95", "8c", ""], + ] + + self.assertListEqual( + [tokenizer.convert_ids_to_tokens(ids) for ids in tokenizer(in_batch)["input_ids"]], out_tokens + ) + + def test_special_bytes(self): + tokenizer = self.get_tokenizer() + + in_str_special = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" + out_tokens = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09"] + + self.assertEqual(tokenizer.tokenize(in_str_special), out_tokens) + + in_str_mixed = "\x00Hello\x01 World\x02" + out_tokens = ["00", "52", "85", "91", "9f", "6f", "01", "20", "52", "85", "9f", "90", "02"] + + self.assertEqual(tokenizer.tokenize(in_str_mixed), out_tokens) + + def test_special_tokens(self): + tokenizer = self.get_tokenizer() + + in_str_special = "" + out_tokens = ["", "", ""] + + self.assertEqual(tokenizer.tokenize(in_str_special), out_tokens) + + in_str_not_special = "" + out_tokens = ["3c", "73", "3e"] + + self.assertEqual(tokenizer.tokenize(in_str_not_special), out_tokens) + + in_str_mixed = "Hello World" + out_tokens = ["3c", "73", "3e", "52", "85", "91", "9f", "6f", "20", "52", "85", "9f", "90", ""] + + self.assertEqual(tokenizer.tokenize(in_str_mixed), out_tokens) + + def test_token_ids_conversion(self): + tokenizer = self.get_tokenizer() + + tokens_range = [f"{x:02x}" for x in range(256)] + indices_range = list(range(3, 256 + 3)) + + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens_range), indices_range) + self.assertListEqual(tokenizer.convert_ids_to_tokens(indices_range), tokens_range) + + special_tokens = ["", "", ""] + special_indices = [0, 1, 2] + + self.assertListEqual(tokenizer.convert_tokens_to_ids(special_tokens), special_indices) + self.assertListEqual(tokenizer.convert_ids_to_tokens(special_indices), special_tokens) diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py index 4f8f4cc77fe8d0..13adfe1e579489 100644 --- a/tests/models/nemotron/test_modeling_nemotron.py +++ b/tests/models/nemotron/test_modeling_nemotron.py @@ -94,6 +94,8 @@ class NemotronModelTest(GemmaModelTest): # used in `test_torch_compile` _torch_compile_test_ckpt = "nvidia/nemotron-3-8b-base-4k-hf" + # used in `test_torch_compile_for_training` + _torch_compile_train_cls = NemotronForCausalLM if is_torch_available() else None def setUp(self): self.model_tester = NemotronModelTester(self) diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py index 
d8dc3b6ef31130..f341fb078ef939 100644 --- a/tests/models/nllb_moe/test_modeling_nllb_moe.py +++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py @@ -266,7 +266,14 @@ class NllbMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # TODO: Fix the failed tests when this model gets more usage def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): # Saving the slow tokenizer after saving the fast tokenizer causes the loading of the later hanging forever. return True diff --git a/tests/models/olmo/test_modeling_olmo.py b/tests/models/olmo/test_modeling_olmo.py index b74d0fdf03b8f6..43e0b7afb49f8a 100644 --- a/tests/models/olmo/test_modeling_olmo.py +++ b/tests/models/olmo/test_modeling_olmo.py @@ -101,7 +101,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: diff --git a/tests/models/olmoe/test_modeling_olmoe.py b/tests/models/olmoe/test_modeling_olmoe.py index 1ce231e0373152..9c3af5723ee18b 100644 --- a/tests/models/olmoe/test_modeling_olmoe.py +++ b/tests/models/olmoe/test_modeling_olmoe.py @@ -111,7 +111,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: diff --git a/tests/models/omdet_turbo/test_processor_omdet_turbo.py b/tests/models/omdet_turbo/test_processor_omdet_turbo.py index e6e2a1f50c52cd..52e1926e50b22f 100644 --- a/tests/models/omdet_turbo/test_processor_omdet_turbo.py +++ b/tests/models/omdet_turbo/test_processor_omdet_turbo.py @@ -17,7 +17,6 @@ import tempfile import unittest -import numpy as np import pytest from transformers import AutoProcessor, CLIPTokenizerFast, OmDetTurboProcessor @@ -36,8 +35,6 @@ from transformers.models.omdet_turbo.modeling_omdet_turbo import OmDetTurboObjectDetectionOutput if is_vision_available(): - from PIL import Image - from transformers import DetrImageProcessor @@ -45,6 +42,7 @@ @require_vision class OmDetTurboProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = OmDetTurboProcessor + text_input_name = "classes_input_ids" def setUp(self): self.tmpdirname = tempfile.mkdtemp() @@ -77,17 +75,6 @@ def get_image_processor(self, **kwargs): def tearDown(self): shutil.rmtree(self.tmpdirname) - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. 
- """ - - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - return image_inputs - def get_fake_omdet_turbo_output(self): torch.manual_seed(42) return OmDetTurboObjectDetectionOutput( @@ -210,154 +197,3 @@ def test_model_input_names(self): inputs = processor(images=image_input, text=input_classes, task=input_tasks, return_tensors="pt") self.assertListEqual(list(inputs.keys()), self.input_keys) - - @require_vision - @require_torch - def test_tokenizer_defaults_preserved_by_kwargs(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117) - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor(images=image_input, text=[input_str], task=input_str, return_tensors="pt") - - self.assertEqual(len(inputs["tasks_input_ids"][0]), 117) - self.assertEqual(len(inputs["classes_input_ids"][0]), 117) - - @require_vision - @require_torch - def test_kwargs_overrides_default_tokenizer_kwargs(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117) - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor(images=image_input, text=[input_str], task=input_str, return_tensors="pt", max_length=112) - - self.assertEqual(len(inputs["tasks_input_ids"][0]), 112) - self.assertEqual(len(inputs["classes_input_ids"][0]), 112) - - @require_torch - @require_vision - def test_unstructured_kwargs(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor( - images=image_input, - text=[input_str], - task=input_str, - return_tensors="pt", - size={"height": 214, "width": 214}, - padding="max_length", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["tasks_input_ids"][0]), 76) - self.assertEqual(len(inputs["classes_input_ids"][0]), 76) - - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. 
- if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 - inputs = processor( - images=image_input, - text=[input_str], - task=input_str, - return_tensors="pt", - size={"height": 214, "width": 214}, - padding="longest", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["tasks_input_ids"][0]), 6) - self.assertEqual(len(inputs["classes_input_ids"][0]), 6) - - @require_torch - @require_vision - def test_structured_kwargs_nested(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76, "task": input_str}, - } - - inputs = processor(images=image_input, text=[input_str], **all_kwargs) - self.skip_processor_without_typed_kwargs(processor) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["tasks_input_ids"][0]), 76) - self.assertEqual(len(inputs["classes_input_ids"][0]), 76) - - @require_torch - @require_vision - def test_structured_kwargs_nested_from_dict(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. 
- if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76, "task": input_str}, - } - - inputs = processor(images=image_input, text=[input_str], **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["tasks_input_ids"][0]), 76) - self.assertEqual(len(inputs["classes_input_ids"][0]), 76) diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py index ac8f044c556828..f27302f8e19ad1 100644 --- a/tests/models/oneformer/test_modeling_oneformer.py +++ b/tests/models/oneformer/test_modeling_oneformer.py @@ -247,9 +247,16 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas # TODO: Fix the failed tests when this model gets more usage def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "FeatureExtractionPipelineTests": + if pipeline_test_case_name == "FeatureExtractionPipelineTests": return True return False diff --git a/tests/models/openai/test_modeling_openai.py b/tests/models/openai/test_modeling_openai.py index 49e6d50bc4287a..8a61b05e831d15 100644 --- a/tests/models/openai/test_modeling_openai.py +++ b/tests/models/openai/test_modeling_openai.py @@ -211,9 +211,16 @@ class OpenAIGPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "ZeroShotClassificationPipelineTests": + if pipeline_test_case_name == "ZeroShotClassificationPipelineTests": # Get `tokenizer does not have a padding token` error for both fast/slow tokenizers. # `OpenAIGPTConfig` was never used in pipeline tests, either because of a missing checkpoint or because a # tiny config could not be created. 
diff --git a/tests/models/openai/test_modeling_tf_openai.py b/tests/models/openai/test_modeling_tf_openai.py index f6bf7c521765a3..43ffe4592a2839 100644 --- a/tests/models/openai/test_modeling_tf_openai.py +++ b/tests/models/openai/test_modeling_tf_openai.py @@ -217,9 +217,16 @@ class TFOpenAIGPTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Tes # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "ZeroShotClassificationPipelineTests": + if pipeline_test_case_name == "ZeroShotClassificationPipelineTests": # Get `tokenizer does not have a padding token` error for both fast/slow tokenizers. # `OpenAIGPTConfig` was never used in pipeline tests, either because of a missing checkpoint or because a # tiny config could not be created. diff --git a/tests/models/opt/test_modeling_flax_opt.py b/tests/models/opt/test_modeling_flax_opt.py index ef94633f22a87e..5ebf23d86a3238 100644 --- a/tests/models/opt/test_modeling_flax_opt.py +++ b/tests/models/opt/test_modeling_flax_opt.py @@ -70,6 +70,7 @@ def __init__( embed_dim=16, word_embed_proj_dim=16, initializer_range=0.02, + attn_implemetation="eager", ): self.parent = parent self.batch_size = batch_size @@ -92,6 +93,7 @@ def __init__( self.word_embed_proj_dim = word_embed_proj_dim self.initializer_range = initializer_range self.is_encoder_decoder = False + self.attn_implementation = attn_implemetation def prepare_config_and_inputs(self): input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size) @@ -114,6 +116,7 @@ def prepare_config_and_inputs(self): word_embed_proj_dim=self.word_embed_proj_dim, initializer_range=self.initializer_range, use_cache=False, + attn_implementation=self.attn_implementation, ) inputs_dict = prepare_opt_inputs_dict(config, input_ids) return config, inputs_dict diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py index 83721f1281f4d8..2093dfe685b3ee 100644 --- a/tests/models/opt/test_modeling_opt.py +++ b/tests/models/opt/test_modeling_opt.py @@ -21,7 +21,14 @@ import timeout_decorator # noqa from transformers import OPTConfig, is_torch_available -from transformers.testing_utils import require_torch, require_torch_accelerator, require_torch_fp16, slow, torch_device +from transformers.testing_utils import ( + require_torch, + require_torch_accelerator, + require_torch_fp16, + require_torch_sdpa, + slow, + torch_device, +) from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -83,6 +90,7 @@ def __init__( num_labels=3, word_embed_proj_dim=16, type_sequence_label_size=2, + attn_implementation="eager", ): self.parent = parent self.batch_size = batch_size @@ -106,6 +114,7 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.word_embed_proj_dim = word_embed_proj_dim self.is_encoder_decoder = False + self.attn_implementation = attn_implementation def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( @@ -135,6 +144,7 @@ def get_config(self): embed_dim=self.embed_dim, is_encoder_decoder=False, word_embed_proj_dim=self.word_embed_proj_dim, + attn_implementation=self.attn_implementation, ) def 
get_pipeline_config(self): @@ -221,10 +231,17 @@ class OPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): @@ -322,6 +339,68 @@ def test_opt_sequence_classification_model_for_multi_label(self): result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + """ + Overwritting the common test as the test is flaky on tiny models + """ + max_new_tokens = 30 + + tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350M") + + texts = [ + "hi here's a longer context, getting longer and", + "Hello this is a very long sentence my friend, very long for real", + "Today I am in Paris and", + ] + + model_sdpa = OPTForCausalLM.from_pretrained( + "facebook/opt-350M", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="sdpa", + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = OPTForCausalLM.from_pretrained( + "facebook/opt-350M", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for _, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for _, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + for padding_side in ["left", "right"]: + tokenizer.padding_side = padding_side + tokenizer.pad_token = tokenizer.eos_token + + inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) + + res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) + res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) + + with self.subTest(f"{padding_side}"): + torch.testing.assert_close( + res_eager, + res_sdpa, + msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", + ) + @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_model_parallelism(self): super().test_model_parallelism() diff --git a/tests/models/opt/test_modeling_tf_opt.py b/tests/models/opt/test_modeling_tf_opt.py index 158baa4ce65e25..39c38170e3f633 100644 --- a/tests/models/opt/test_modeling_tf_opt.py +++ b/tests/models/opt/test_modeling_tf_opt.py @@ -66,6 +66,7 @@ def __init__( bos_token_id=0, embed_dim=16, word_embed_proj_dim=16, + attn_implementation="eager", ): self.parent = parent self.batch_size = batch_size @@ -87,6 +88,7 @@ def __init__( self.embed_dim = embed_dim self.word_embed_proj_dim = word_embed_proj_dim self.is_encoder_decoder = False + self.attn_implementation = 
attn_implementation def prepare_config_and_inputs_for_common(self): input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size) @@ -108,6 +110,7 @@ def prepare_config_and_inputs_for_common(self): embed_dim=self.embed_dim, word_embed_proj_dim=self.word_embed_proj_dim, is_encoder_decoder=False, + attn_implementation=self.attn_implementation, **self.config_updates, ) inputs_dict = prepare_opt_inputs_dict(config, input_ids) diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index d954fa2a0f502b..644ac2cc5bd1b4 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -159,7 +159,8 @@ def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values = config_and_inputs input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = input_ids.ne(1).to(torch_device) + attention_mask = input_ids.ne(self.pad_token_id).to(torch_device) + # set the 16 first tokens to be image, and ensure that no other tokens are image tokens # do not change this unless you modified image size or patch size input_ids[input_ids == config.image_token_index] = self.pad_token_id @@ -314,6 +315,10 @@ def test_save_load_low_cpu_mem_usage_no_safetensors(self): def test_generate_from_inputs_embeds_with_static_cache(self): pass + @unittest.skip(reason="TODO (@joao): fix me -- failing to produce similar results") + def test_static_cache_matches_dynamic(self): + pass + @slow @require_torch diff --git a/tests/models/paligemma/test_processing_paligemma.py b/tests/models/paligemma/test_processing_paligemma.py deleted file mode 100644 index 33b31507e17df2..00000000000000 --- a/tests/models/paligemma/test_processing_paligemma.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import shutil -import tempfile -import unittest - -from transformers import AutoProcessor, GemmaTokenizerFast, PaliGemmaProcessor -from transformers.testing_utils import require_read_token, require_vision -from transformers.utils import is_vision_available - -from ...test_processing_common import ProcessorTesterMixin - - -if is_vision_available(): - from transformers import SiglipImageProcessor - - -@require_vision -@require_read_token -class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase): - processor_class = PaliGemmaProcessor - - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - image_processor = SiglipImageProcessor(do_center_crop=False) - tokenizer = GemmaTokenizerFast.from_pretrained("google/gemma-7b") - image_processor.image_seq_length = 32 - - processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer) - processor.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_text_with_image_tokens(self): - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - text_multi_images = "Dummy text!" - text_single_image = "Dummy text!" - text_no_image = "Dummy text!" - - image = self.prepare_image_inputs()[0] - - out_noimage = processor(text=text_no_image, images=image, return_tensors="np") - out_singlimage = processor(text=text_single_image, images=image, return_tensors="np") - for k in out_noimage: - self.assertTrue(out_noimage[k].tolist() == out_singlimage[k].tolist()) - - out_multiimages = processor(text=text_multi_images, images=[image, image], return_tensors="np") - out_noimage = processor(text=text_no_image, images=[[image, image]], return_tensors="np") - - # We can't be sure what is users intention, whether user want "one text + two images" or user forgot to add the second text - with self.assertRaises(ValueError): - out_noimage = processor(text=text_no_image, images=[image, image], return_tensors="np") - - for k in out_noimage: - self.assertTrue(out_noimage[k].tolist() == out_multiimages[k].tolist()) - - text_batched = ["Dummy text!", "Dummy text!"] - text_batched_with_image = ["Dummy text!", "Dummy text!"] - out_images = processor(text=text_batched_with_image, images=[image, image], return_tensors="np") - out_noimage_nested = processor(text=text_batched, images=[[image], [image]], return_tensors="np") - out_noimage = processor(text=text_batched, images=[image, image], return_tensors="np") - for k in out_noimage: - self.assertTrue(out_noimage[k].tolist() == out_images[k].tolist() == out_noimage_nested[k].tolist()) diff --git a/tests/models/paligemma/test_processor_paligemma.py b/tests/models/paligemma/test_processor_paligemma.py index 60de913e53ae9b..245aff594125cf 100644 --- a/tests/models/paligemma/test_processor_paligemma.py +++ b/tests/models/paligemma/test_processor_paligemma.py @@ -16,7 +16,7 @@ import tempfile import unittest -from transformers import GemmaTokenizer +from transformers import GemmaTokenizer, PaliGemmaProcessor from transformers.testing_utils import get_tests_dir, require_torch, require_vision from transformers.utils import is_vision_available @@ -24,11 +24,7 @@ if is_vision_available(): - from 
transformers import ( - PaliGemmaProcessor, - SiglipImageProcessor, - is_vision_available, - ) + from transformers import SiglipImageProcessor SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @@ -61,3 +57,37 @@ def test_image_seq_length(self): text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) self.assertEqual(len(inputs["input_ids"][0]), 112 + 14) + + def test_text_with_image_tokens(self): + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + text_multi_images = "Dummy text!" + text_single_image = "Dummy text!" + text_no_image = "Dummy text!" + + image = self.prepare_image_inputs() + + out_noimage = processor(text=text_no_image, images=image, return_tensors="np") + out_singlimage = processor(text=text_single_image, images=image, return_tensors="np") + for k in out_noimage: + self.assertTrue(out_noimage[k].tolist() == out_singlimage[k].tolist()) + + out_multiimages = processor(text=text_multi_images, images=[image, image], return_tensors="np") + out_noimage = processor(text=text_no_image, images=[[image, image]], return_tensors="np") + + # We can't be sure what is users intention, whether user want "one text + two images" or user forgot to add the second text + with self.assertRaises(ValueError): + out_noimage = processor(text=text_no_image, images=[image, image], return_tensors="np") + + for k in out_noimage: + self.assertTrue(out_noimage[k].tolist() == out_multiimages[k].tolist()) + + text_batched = ["Dummy text!", "Dummy text!"] + text_batched_with_image = ["Dummy text!", "Dummy text!"] + out_images = processor(text=text_batched_with_image, images=[image, image], return_tensors="np") + out_noimage_nested = processor(text=text_batched, images=[[image], [image]], return_tensors="np") + out_noimage = processor(text=text_batched, images=[image, image], return_tensors="np") + for k in out_noimage: + self.assertTrue(out_noimage[k].tolist() == out_images[k].tolist() == out_noimage_nested[k].tolist()) diff --git a/tests/models/persimmon/test_modeling_persimmon.py b/tests/models/persimmon/test_modeling_persimmon.py index 0d267fb86910d6..600c5b8a2f7342 100644 --- a/tests/models/persimmon/test_modeling_persimmon.py +++ b/tests/models/persimmon/test_modeling_persimmon.py @@ -110,7 +110,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py index 95b0b01c0a23d9..c17f69a499866b 100644 --- a/tests/models/phi/test_modeling_phi.py +++ b/tests/models/phi/test_modeling_phi.py @@ -304,7 +304,14 @@ class PhiModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79292/workflows/fa2ba644-8953-44a6-8f67-ccd69ca6a476/jobs/1012905 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/phi3/test_modeling_phi3.py b/tests/models/phi3/test_modeling_phi3.py index ce0a71878877b5..2c5557dfd67aae 100644 --- a/tests/models/phi3/test_modeling_phi3.py +++ b/tests/models/phi3/test_modeling_phi3.py @@ -151,7 +151,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: @@ -351,7 +351,14 @@ class Phi3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79292/workflows/fa2ba644-8953-44a6-8f67-ccd69ca6a476/jobs/1012905 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True @@ -608,3 +615,93 @@ def test_phi3_mini_128k_instruct_with_static_cache(self): ] self.assertListEqual(output_text, EXPECTED_OUTPUT) + + def test_phi3_mini_4k_sliding_window(self): + """ + This tests that Phi3 doesn't deteriorate in quality for long context generations. Since Phi3 has + sliding window attention, the test is tailored so that (context + max_new_tokens > sliding_window). + See #33586 for more + """ + model = Phi3ForCausalLM.from_pretrained( + "microsoft/Phi-3-mini-4k-instruct", device_map=torch_device, torch_dtype=torch.bfloat16 + ) + tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + + input_text = """ + <|user|> + Tell me about Paris, France.<|end|> + <|assistant|> + Paris, the capital city of France, is renowned for its rich history, iconic landmarks, and vibrant culture. Known as "The City of Light," Paris is situated in the north-central part of the country along the Seine River. + + Here are some key aspects of Paris: + + 1. Landmarks: Paris is home to numerous famous landmarks, including the Eiffel Tower, the Louvre Museum, Notre-Dame Cathedral, and the Champs-Élysées. The Eiffel Tower, built in 1889, is an iconic symbol of Paris and attracts millions of tourists each year. The Louvre Museum, the world's largest art museum, houses thousands of works of art, including the Mona Lisa and the Venus de Milo. + + 2. History: Paris has a rich history dating back to the 3rd century BC, when it was founded by a Celtic tribe called the Parisii. Over the centuries, the city has been influenced by various cultures, including the Romans, the Franks, and the Normans. The French Revolution in the late 18th century marked a significant turning point in Paris's history, leading to the establishment of the modern French Republic. + + 3. Culture: Paris is a global center for art, fashion, gastronomy, and culture. The city is home to numerous museums, including the Centre Pompidou, Musée d'Orsay, and Musée Rodin. 
Paris is also known for its fashion industry, with many famous designers having their origins in the city. The city's cuisine is also highly regarded, with a focus on fresh ingredients, and a wide variety of dishes, including French classics like coq au vin, boeuf bourguignon, and crêpes. + + 4. Architecture: Parisian architecture is characterized by its diverse styles, ranging from Gothic and Romanesque to Art Nouveau and Art Deco. The city's famous Haussmannian buildings, designed by Baron Haussmann in the mid-19th century, are known for their uniform facades, wrought-iron balconies, and large windows. + + 5. Transportation: Paris has an extensive public transportation system, including the Paris Métro, RER (suburban trains), and buses. The city's iconic yellow taxis are also a popular mode of transportation. + + 6. Language: The official language of Paris is French, and the city's residents are known for their charm and politeness. + + 7. Festivals and Events: Paris hosts numerous festivals and events throughout the year, including the annual Bastille Day celebrations, the Paris Fashion Week, and the famous annual New Year's Eve fireworks on the Eiffel Tower. + + 8. Geography: Paris is located in the north-central part of France, with the Seine River running through the city. The city's geography is characterized by rolling hills and picturesque parks, such as the Bois de Boulogne and the Jardin des Tuileries. + + 9. Population: As of 2021, Paris has an estimated population of around 2.2 million residents, with the metropolitan area housing over 12 million people. + + In summary, Paris is a city steeped in history, culture, and art, with a unique blend of architectural styles and a vibrant atmosphere that continues to captivate millions of visitors each year.<|end|> + <|user|> + Please give me a list of 5 architectural landmarks in Paris, France.<|end|> + <|assistant|> + 1. Eiffel Tower: Designed by Gustave Eiffel and completed in 1889, the Eiffel Tower is an iconic symbol of Paris and France. Standing at 324 meters tall, it was the tallest man-made structure in the world until the completion of the Chrysler Building in New York in 1930. The Eiffel Tower is made of wrought iron and offers visitors stunning views of the city from its three levels. + + 2. Notre-Dame Cathedral: Located on the Île de la Cité, Notre-Dame Cathedral is a masterpiece of French Gothic architecture. Construction began in the 12th century and continued for over 200 years, with the cathedral's completion in the 14th century. The cathedral is famous for its intricate facade, stained-glass windows, and the iconic gargoyles and chimeras. + + 3. Louvre Museum: Originally built as a fortress in the 12th century, the Louvre Museum is now the world's largest art museum and a historic monument in Paris. The museum's most famous landmark is the iconic glass pyramid entrance, designed by architect I. M. Pei in the 1980s. The Louvre houses over 380,000 works of art, including the Mona Lisa and the Venus de Milo. + + 4. Sacré-Cœur Basilica: The Sacré-Cœur Basilica, also known as the Basilique du Sacré-Cœur, is a Roman Catholic church and minor basilica located at the summit of the butte Montmartre, the highest point in Paris. The basilica was designed by Paul Abadie and dedicated in 1914. Its white domes and lavender-colored travertine stone make it a distinctive landmark in the Paris skyline. + + 5. Arc de Triomphe: The Arc de Triomphe is a monumental structure located at the western end of the Champs-Élysées. 
Commissioned by Napoleon in 1806, the Arc was designed by Jean-François-Thérèse Chalgrin and completed in 1836. The monument honors those who fought and died for France during the French Revolutionary and Napoleonic Wars. The Arc features sculptural reliefs and inscriptions, and its façade is adorned with the names of 357 generals and 660 soldiers. + + These five architectural landmarks showcase the diverse styles and historical periods of Paris, from Gothic to Neoclassical, and from the 19th to the 20th centuries. Each landmark has its unique features and contributes to the city's rich architectural heritage.<|end|> + <|user|> + Please give me a list of 10 famous items displayed in the Louvre Museum. Thanks!<|end|> + <|assistant|> + 1. Mona Lisa: The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is arguably the most famous painting in the world. The portrait is known for its enigmatic smile and masterful use of sfumato, a technique that creates a soft, hazy effect. + + 2. Venus de Milo: This ancient Greek statue, believed to have been created around 130-100 BC, is a masterpiece of Hellenistic sculpture. The Venus de Milo is renowned for its graceful beauty and the mystery surrounding its missing arms. + + 3. Winged Victory of Samothrace: This Hellenistic sculpture, dating back to the 2nd century BC, depicts the Greek goddess Nike, the personification of victory. The sculpture is celebrated for its dynamic movement and intricate details. + + 4. Liberty Leading the People: This iconic painting by Eugène Delacroix, created in 1830, commemorates the July Revolution in France. The artwork depicts a woman personifying Liberty leading a group of revolutionaries over the bodies of the fallen. + + 5. The Wedding at Cana: A 1516 painting by Veronese, The Wedding at Cana is a large-scale work that depicts the biblical story of Jesus turning water into wine at a wedding feast. The painting is known for its vibrant colors and intricate details. + + 6. The Raft of the Medusa: This 1819 painting by Théodore Géricault is a powerful depiction of the aftermath of the shipwreck of the French frigate Méduse. The painting is famous for its dramatic composition and emotional intensity. + + 7. The Coronation of Napoleon: This 1805 painting by Jacques-Louis David portrays the coronation of Napoleon Bonaparte as Emperor of the French. The artwork is a masterpiece of neoclassical style and captures the pomp and ceremony of the event. + + 8. The Death of Socrates: A 1914 painting by Pablo Picasso, The Death of Socrates depicts the moment when the ancient Greek philosopher Socrates drinks the poison hemlock, as ordered by the Athenian government. The painting is a powerful expression of the existential themes associated with Socrates' death. + + 9. The Turkish Bath: A 1887 painting by Puvis de Chavannes, The Turkish Bath is a large-scale work that depicts a scene of relaxation and leisity in a traditional Turkish bath. The painting is known for its subdued color palette and serene atmosphere. + + 10. The Wedding at Cana: A 1901 painting by Giovanni Boldini, The Wedding at Cana is a lively depiction of a festive Italian wedding. The painting is characterized by Boldini's signature flamboyant style and his ability to capture the essence of the subjects with a sense of movement and energy. + + These 10 famous items in the Louvre Museum showcase a diverse range of artistic styles and periods, from ancient Greek and Roman sculptures to Renaissance and 19th-century paintings. 
The collection at the Louvre is one of the world's largest and most important museums, housing over 800,000 works of art, and these famous items are just a small selection of the museum's vast collection.<|end|> + <|user|> + Please give me a list of 10 famous French dishes.<|end|> + <|assistant|> + """ + + inputs = tokenizer(input_text, return_tensors="pt").to(device=torch_device) + outputs = model.generate(**inputs, max_new_tokens=100) + output_text = tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True) + EXPECTED_OUTPUT = [ + '1. Coq au Vin: Coq au Vin is a classic French dish that translates to "rooster in wine." The dish consists of chicken braised with wine, lardons, mushrooms, and garlic. It is a hearty and flavorful dish that is often served with potatoes or rice.\n\n 2. Boeuf Bourguignon: Boeuf Bourguignon is a traditional French beef stew that' + ] + + self.assertListEqual(output_text, EXPECTED_OUTPUT) diff --git a/tests/models/phimoe/__init__.py b/tests/models/phimoe/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/phimoe/test_modeling_phimoe.py b/tests/models/phimoe/test_modeling_phimoe.py new file mode 100644 index 00000000000000..881967076e7ed3 --- /dev/null +++ b/tests/models/phimoe/test_modeling_phimoe.py @@ -0,0 +1,566 @@ +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Testing suite for the PyTorch PhiMoE model.""" + +import unittest +from typing import List + +from parameterized import parameterized + +from transformers import PhimoeConfig, StaticCache, is_torch_available, set_seed +from transformers.testing_utils import ( + require_torch, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + AutoTokenizer, + PhimoeForCausalLM, + PhimoeForSequenceClassification, + PhimoeModel, + ) + + end_of_text_token = 32000 + + class PhimoeMiniWithStaticCache(torch.nn.Module): + def __init__(self, model: PhimoeForCausalLM, batch_size: int, max_seq_len: int): + super().__init__() + self.model = model + self.cache = StaticCache( + config=model.config, + batch_size=batch_size, + max_cache_len=max_seq_len, + device=self.model.device, + dtype=self.model.dtype, + ) + + def forward( + self, + input_ids: torch.LongTensor = None, + ) -> torch.FloatTensor: + return self.model.forward( + input_ids=input_ids, + use_cache=True, + return_dict=True, + past_key_values=self.cache, + ).logits + + @staticmethod + def generate(model: PhimoeForCausalLM, prompt_tokens: torch.LongTensor, max_seq_len: int) -> List[int]: + model = PhimoeMiniWithStaticCache(model, 1, max_seq_len + prompt_tokens.shape[-1]) + + response_tokens = [] + + for input_pos in range(prompt_tokens.shape[-1]): + result = model.forward( + input_ids=prompt_tokens[:, input_pos : input_pos + 1], + ) + response_tokens.append(prompt_tokens[0][input_pos].item()) + + current_token = torch.argmax(result[:, -1, :], dim=-1).item() + response_tokens.append(current_token) + + while current_token != end_of_text_token and len(response_tokens) < max_seq_len: + result = model.forward( + input_ids=torch.tensor([[current_token]], dtype=torch.long), + ) + current_token = torch.argmax(result[:, -1, :], dim=-1).item() + response_tokens.append(current_token) + + return response_tokens + + +class PhimoeModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=131072, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + original_max_position_embeddings=4096, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = 
type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + self.original_max_position_embeddings = original_max_position_embeddings + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return PhimoeConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + num_experts_per_tok=2, + num_local_experts=2, + original_max_position_embeddings=self.original_max_position_embeddings, + ) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Phimoe + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = PhimoeModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Phimoe + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = PhimoeModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Phimoe + def create_and_check_for_causal_lm( + self, + config, + 
input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = PhimoeForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Phimoe + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = PhimoeForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class PhimoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + (PhimoeModel, PhimoeForCausalLM, PhimoeForSequenceClassification) if is_torch_available() else () + ) + all_generative_model_classes = (PhimoeForCausalLM,) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": PhimoeModel, + "text-classification": PhimoeForSequenceClassification, + "text-generation": PhimoeForCausalLM, + "zero-shot": PhimoeForSequenceClassification, + } + if 
is_torch_available() + else {} + ) + + test_headmasking = False + test_pruning = False + + # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79292/workflows/fa2ba644-8953-44a6-8f67-ccd69ca6a476/jobs/1012905 + def is_pipeline_test_to_skip( + self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + ): + return True + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.setUp with Llama->Phimoe + def setUp(self): + self.model_tester = PhimoeModelTester(self) + self.config_tester = ConfigTester(self, config_class=PhimoeConfig, hidden_size=37) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_config + def test_config(self): + self.config_tester.run_common_tests() + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model with Llama->Phimoe,llama->phimoe + def test_phimoe_sequence_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = PhimoeForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_single_label with Llama->Phimoe,llama->phimoe + def test_phimoe_sequence_classification_model_for_single_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "single_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = PhimoeForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_multi_label with Llama->Phimoe,llama->phimoe + def test_phimoe_sequence_classification_model_for_multi_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "multi_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor( + [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size + ).to(torch.float) + model = PhimoeForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, 
(self.model_tester.batch_size, self.model_tester.num_labels)) + + @parameterized.expand([("longrope",)]) + def test_model_rope_scaling_from_config(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.original_max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = PhimoeModel(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + n_factors = config.hidden_size // config.num_attention_heads // 2 + config.rope_scaling = { + "type": scaling_type, + "short_factor": [3.0 for _ in range(n_factors)], + "long_factor": [5.0 for _ in range(n_factors)], + "short_mscale": 1.243163121016122, + "long_mscale": 1.243163121016122, + "original_max_position_embeddings": 4096, + } + scaled_model = PhimoeModel(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Scaling changes the RoPE embeddings, both for the short and long outputs + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + @parameterized.expand([("longrope",)]) + def test_model_rope_scaling_short_long_factor(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + n_factors = config.hidden_size // config.num_key_value_heads // 2 + config.rope_scaling = { + "type": scaling_type, + "short_factor": [3.0 for _ in range(n_factors)], + "long_factor": [5.0 for _ in range(n_factors)], + "short_mscale": 1.243163121016122, + "long_mscale": 1.243163121016122, + "original_max_position_embeddings": 4096, + } + input_tensor = ids_tensor([1, 4090], config.vocab_size) + model = PhimoeForCausalLM(config) + model.to(torch_device) + model.eval() + generation_args_short = { + "max_length": config.original_max_position_embeddings, + "temperature": 0.0, + "use_cache": True, + "do_sample": False, + "return_dict_in_generate": True, + } + output_with_short_factor = model.generate(input_tensor, **generation_args_short) + keys_with_short_factor = output_with_short_factor.past_key_values[0][0] + generation_args_long = { + "max_length": config.original_max_position_embeddings + 5, + "temperature": 0.0, + "use_cache": True, + "do_sample": False, + "return_dict_in_generate": True, + "output_logits": True, + } + output_with_long_factor = model.generate(input_tensor, **generation_args_long) + keys_with_long_factor = output_with_long_factor.past_key_values[0][0] + last_token_logits = output_with_long_factor.logits[-1][-1] + regenerated_last_token_logits = model(output_with_long_factor.sequences[:, :-1]).logits[0][-1] + keys_with_long_factor = keys_with_long_factor[:, :, : config.original_max_position_embeddings - 1, :] + + # KV cache is re-computed after reaching the (`config.original_max_position_embeddings`+1)th token position + self.assertFalse(torch.allclose(keys_with_short_factor, keys_with_long_factor, atol=1e-3, rtol=1e-3)) + # Last token generated using long factor + 
self.assertTrue(torch.allclose(last_token_logits, regenerated_last_token_logits, atol=1e-2, rtol=1e-2)) + + +@slow +@require_torch +class PhimoeIntegrationTest(unittest.TestCase): + def test_model_phimoe_instruct_logits(self): + input_ids = { + "input_ids": torch.tensor( + [[1212, 318, 281, 1672, 2643, 290, 428, 318, 257, 1332]], dtype=torch.long, device=torch_device + ) + } + + model = PhimoeForCausalLM.from_pretrained("microsoft/Phi-3.5-MoE-instruct").to(torch_device) + model.eval() + + output = model(**input_ids).logits + + EXPECTED_OUTPUT = torch.tensor([[-3.5312, -2.5000, -1.2734, 0.3555, -0.7578, -0.4727, 0.5977, -0.4316, + 0.2256, -1.2188, -1.6797, 0.9961, 3.7656, 11.3125, -1.3828, -4.8438, + -5.7500, -1.9375, 0.7227, -0.3438, -0.2100, -0.4277, -0.0444, -0.5352, + -0.6406, -0.1016, -0.4258, -1.0234, 0.4297, -0.6250], + [-0.9883, 0.1455, -0.4902, 2.3594, 0.7031, 3.1406, 0.4375, 0.2559, + 0.6172, -2.1094, -1.3359, 2.5938, 4.9062, 10.8125, -0.1094, 1.5781, + -4.9375, 0.7148, -0.0972, 1.7656, -0.0801, 0.2217, 0.1875, -0.4629, + 1.5781, 0.3535, 0.0874, 0.6836, -0.0518, -1.2969]]).to(torch_device) # fmt: skip + + self.assertTrue(torch.allclose(EXPECTED_OUTPUT, output[0, :2, :30], atol=1e-4, rtol=1e-4)) + + def test_phimoe_instruct_generation(self): + model = PhimoeForCausalLM.from_pretrained("microsoft/Phi-3.5-MoE-instruct") + tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct") + + messages = [ + { + "role": "system", + "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.", + }, + {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, + ] + inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") + + outputs = model.generate(inputs, max_new_tokens=32) + output_text = tokenizer.batch_decode(outputs) + + EXPECTED_OUTPUT = [ + "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits are both delicious and nutritious fruits that can be combined in various ways to create tast" + ] + + self.assertListEqual(output_text, EXPECTED_OUTPUT) + + def test_phimoe_instruct_with_static_cache(self): + model = PhimoeForCausalLM.from_pretrained("microsoft/Phi-3.5-MoE-instruct") + tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct") + + messages = [ + { + "role": "system", + "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.", + }, + {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, + ] + inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") + + response_tokens = PhimoeMiniWithStaticCache.generate(model, inputs, 64) + + output_text = tokenizer.batch_decode(torch.tensor([response_tokens], dtype=torch.long, device=torch_device)) + + EXPECTED_OUTPUT = [ + "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! 
Bananas and dragonfruits are both delicious and nutritious fruits that can" + ] + + self.assertListEqual(output_text, EXPECTED_OUTPUT) diff --git a/tests/models/pix2struct/test_processor_pix2struct.py b/tests/models/pix2struct/test_processor_pix2struct.py index ac8d4822f1c09f..f832ffd2d64f7c 100644 --- a/tests/models/pix2struct/test_processor_pix2struct.py +++ b/tests/models/pix2struct/test_processor_pix2struct.py @@ -96,7 +96,7 @@ def test_tokenizer(self): processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() encoded_processor = processor(text=input_str) @@ -111,7 +111,7 @@ def test_processor(self): processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) @@ -130,7 +130,7 @@ def test_processor_max_patches(self): processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) @@ -168,7 +168,7 @@ def test_model_input_names(self): processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) @@ -195,7 +195,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) @@ -213,7 +213,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, max_patches=1024) @@ -231,7 +231,7 @@ def test_unstructured_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor( text=input_str, @@ -257,8 +257,8 @@ def test_unstructured_kwargs_batched(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 + input_str = self.prepare_text_inputs(batch_size=2) + image_input = self.prepare_image_inputs(batch_size=2) inputs = processor( text=input_str, images=image_input, @@ -284,7 +284,7 @@ def test_structured_kwargs_nested(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -313,7 
+313,7 @@ def test_structured_kwargs_nested_from_dict(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index 59c19eabcaf53b..8cdbf93c6476b8 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -14,6 +14,7 @@ import shutil import tempfile import unittest +from typing import Optional import requests import torch @@ -246,27 +247,11 @@ def test_processor_with_multiple_images_multiple_lists(self): # fmt: on # Override as PixtralProcessor needs nested images to work properly with batched inputs - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = [self.prepare_image_inputs()] * 2 - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - do_rescale=True, - rescale_factor=-1, - padding="longest", - max_length=76, - ) - - self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) - self.assertTrue( - len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1]) - and len(inputs[self.text_input_name][1]) < 76 - ) + @require_vision + def prepare_image_inputs(self, batch_size: Optional[int] = None): + """This function prepares a list of PIL images for testing""" + if batch_size is None: + return super().prepare_image_inputs() + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + return [[super().prepare_image_inputs()]] * batch_size diff --git a/tests/models/plbart/test_modeling_plbart.py b/tests/models/plbart/test_modeling_plbart.py index 7a0eebd7bd0204..6f38539a634194 100644 --- a/tests/models/plbart/test_modeling_plbart.py +++ b/tests/models/plbart/test_modeling_plbart.py @@ -245,9 +245,16 @@ class PLBartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "TranslationPipelineTests": + if pipeline_test_case_name == "TranslationPipelineTests": # Get `ValueError: Translation requires a `src_lang` and a `tgt_lang` for this model`. # `PLBartConfig` was never used in pipeline tests: cannot create a simple tokenizer. 
return True diff --git a/tests/models/prophetnet/test_modeling_prophetnet.py b/tests/models/prophetnet/test_modeling_prophetnet.py index 99329437239bb5..91da243dbbbe2f 100644 --- a/tests/models/prophetnet/test_modeling_prophetnet.py +++ b/tests/models/prophetnet/test_modeling_prophetnet.py @@ -906,9 +906,16 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "TextGenerationPipelineTests": + if pipeline_test_case_name == "TextGenerationPipelineTests": # Get `ValueError: AttributeError: 'NoneType' object has no attribute 'new_ones'` or `AssertionError`. # `ProphetNetConfig` was never used in pipeline tests: cannot create a simple # tokenizer. diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 4d6c432f20424d..c7fe6577985f96 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -64,7 +64,7 @@ def __init__( num_hidden_layers=5, max_window_layers=3, use_sliding_window=True, - sliding_window=2, + sliding_window=50, num_attention_heads=4, num_key_value_heads=2, intermediate_size=37, @@ -116,7 +116,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: @@ -322,7 +322,14 @@ class Qwen2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index 0425172a6fba4d..11fc55f6baff40 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -64,7 +64,7 @@ def __init__( num_hidden_layers=5, max_window_layers=3, use_sliding_window=True, - sliding_window=2, + sliding_window=50, num_attention_heads=4, num_key_value_heads=2, intermediate_size=37, @@ -134,7 +134,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: @@ -349,7 +349,14 @@ class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/qwen2_vl/test_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_processor_qwen2_vl.py similarity index 100% rename from tests/models/qwen2_vl/test_processing_qwen2_vl.py rename to tests/models/qwen2_vl/test_processor_qwen2_vl.py diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index 23dace68cf21a9..542955f9fa4511 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -103,7 +103,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: @@ -305,7 +305,14 @@ class RecurrentGemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/reformer/test_modeling_reformer.py b/tests/models/reformer/test_modeling_reformer.py index 11c2e821975d02..774831791fe5aa 100644 --- a/tests/models/reformer/test_modeling_reformer.py +++ b/tests/models/reformer/test_modeling_reformer.py @@ -684,20 +684,15 @@ def _check_hidden_states_for_generate( def test_left_padding_compatibility(self): pass - def _get_input_ids_and_config(self, batch_size=2): + def prepare_config_and_inputs_for_generate(self, *args, **kwargs): # override because overwise we hit max possible seq length for model (4*8=32) # decreasing the seq_length in tester causes errors for "training_tests", those need exactly max seq length # NOTE: seq_length has to be multiple of 4, otherwise it fails for other tests - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.pop(self.input_name) - _ = inputs_dict.pop("attention_mask", None) - _ = inputs_dict.pop("decoder_input_ids", None) - _ = inputs_dict.pop("decoder_attention_mask", None) - input_ids = input_ids[:batch_size, :16] - attention_mask = torch.ones_like(input_ids, dtype=torch.long)[:batch_size, :16] - config.eos_token_id = None - config.forced_eos_token_id = None - return config, input_ids, attention_mask, inputs_dict + original_sequence_length = self.model_tester.seq_length + self.model_tester.seq_length = 16 + test_inputs = super().prepare_config_and_inputs_for_generate(*args, **kwargs) + self.model_tester.seq_length = original_sequence_length + return test_inputs @require_torch @@ -728,10 +723,17 @@ class ReformerLSHAttnModelTest( # TODO: Fix the failed tests def 
is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): diff --git a/tests/models/regnet/test_modeling_flax_regnet.py b/tests/models/regnet/test_modeling_flax_regnet.py index 911d595c56e6a0..314f0b367bb965 100644 --- a/tests/models/regnet/test_modeling_flax_regnet.py +++ b/tests/models/regnet/test_modeling_flax_regnet.py @@ -65,6 +65,7 @@ def __init__( self.num_labels = num_labels self.scope = scope self.num_stages = len(hidden_sizes) + super().__init__() def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) diff --git a/tests/models/resnet/test_modeling_flax_resnet.py b/tests/models/resnet/test_modeling_flax_resnet.py index e9566e2e2fd5fb..ce83d415dc0f72 100644 --- a/tests/models/resnet/test_modeling_flax_resnet.py +++ b/tests/models/resnet/test_modeling_flax_resnet.py @@ -64,6 +64,7 @@ def __init__( self.num_labels = num_labels self.scope = scope self.num_stages = len(hidden_sizes) + super().__init__() def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) diff --git a/tests/models/roberta/test_modeling_flax_roberta.py b/tests/models/roberta/test_modeling_flax_roberta.py index d205a0e75f8035..f2f7296df65570 100644 --- a/tests/models/roberta/test_modeling_flax_roberta.py +++ b/tests/models/roberta/test_modeling_flax_roberta.py @@ -78,6 +78,7 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_choices = num_choices + super().__init__() def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) diff --git a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py index 0074323460a9f3..409752e162f4c3 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py @@ -81,6 +81,7 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_choices = num_choices + super().__init__() def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index 55dc8453f49e15..b1c7fa6521340d 100644 --- a/tests/models/roc_bert/test_modeling_roc_bert.py +++ b/tests/models/roc_bert/test_modeling_roc_bert.py @@ -587,9 +587,16 @@ class RoCBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) # TODO: Fix the failed tests when this model gets more usage def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name in [ + if 
pipeline_test_case_name in [ "FillMaskPipelineTests", "FeatureExtractionPipelineTests", "TextClassificationPipelineTests", diff --git a/tests/models/roformer/test_modeling_flax_roformer.py b/tests/models/roformer/test_modeling_flax_roformer.py index 8364e121b42a30..971c1a18cde51f 100644 --- a/tests/models/roformer/test_modeling_flax_roformer.py +++ b/tests/models/roformer/test_modeling_flax_roformer.py @@ -79,6 +79,7 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_choices = num_choices + super().__init__() def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) diff --git a/tests/models/roformer/test_modeling_tf_roformer.py b/tests/models/roformer/test_modeling_tf_roformer.py index d0e795b6dd9c62..8e10a092af13ed 100644 --- a/tests/models/roformer/test_modeling_tf_roformer.py +++ b/tests/models/roformer/test_modeling_tf_roformer.py @@ -275,9 +275,16 @@ class TFRoFormerModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test # TODO: add `prepare_inputs_for_generation` for `TFRoFormerForCausalLM` def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "TextGenerationPipelineTests": + if pipeline_test_case_name == "TextGenerationPipelineTests": return True return False diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py index 65f653e287fd9a..8ec97d7b2e5d8b 100644 --- a/tests/models/sam/test_modeling_sam.py +++ b/tests/models/sam/test_modeling_sam.py @@ -299,7 +299,14 @@ class SamModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): # TODO: Fix me @Arthur: `run_batch_test` in `tests/test_pipeline_mixin.py` not working def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/sam/test_modeling_tf_sam.py b/tests/models/sam/test_modeling_tf_sam.py index 81fe6f1eaff3ea..3cb98e683a98ed 100644 --- a/tests/models/sam/test_modeling_tf_sam.py +++ b/tests/models/sam/test_modeling_tf_sam.py @@ -290,7 +290,14 @@ class TFSamModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase) # TODO: Fix me @Arthur: `run_batch_test` in `tests/test_pipeline_mixin.py` not working def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index 79f705785541b6..cb09d44421f482 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -360,8 +360,6 @@ class SeamlessM4TModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase): ) all_generative_model_classes = (SeamlessM4TForSpeechToText,) if is_torch_available() else () - input_name = 
"input_features" - def setUp(self): self.model_tester = SeamlessM4TModelTester(self, input_modality="speech") self.config_tester = ConfigTester(self, config_class=SeamlessM4TConfig) @@ -379,26 +377,6 @@ def test_model_from_pretrained(self): model = SeamlessM4TModel.from_pretrained(model_name) self.assertIsNotNone(model) - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict[self.input_name] - - # cut to half length & take max batch_size 3 - sequence_length = input_ids.shape[-1] // 2 - input_ids = input_ids[:batch_size, :sequence_length] - - # generate max 3 tokens - max_length = input_ids.shape[-1] + 3 - if config.eos_token_id is not None and config.pad_token_id is None: - # hack to allow generate for models such as GPT2 as is done in `generate()` - if isinstance(config.eos_token_id, int): - config.eos_token_id = [config.eos_token_id] - config.pad_token_id = config.eos_token_id[0] - - attention_mask = torch.ones(input_ids.shape[:2], dtype=torch.long)[:batch_size, :sequence_length] - - return config, input_ids.float(), attention_mask, max_length - def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py index 1d11cbb247caca..451fff0b35fb8c 100644 --- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py @@ -376,8 +376,6 @@ class SeamlessM4Tv2ModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase) ) all_generative_model_classes = (SeamlessM4Tv2ForSpeechToText,) if is_torch_available() else () - input_name = "input_features" - def setUp(self): self.model_tester = SeamlessM4Tv2ModelTester(self, input_modality="speech") self.config_tester = ConfigTester(self, config_class=SeamlessM4Tv2Config) @@ -395,26 +393,6 @@ def test_model_from_pretrained(self): model = SeamlessM4Tv2Model.from_pretrained(model_name) self.assertIsNotNone(model) - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict[self.input_name] - - # cut to half length & take max batch_size 3 - sequence_length = input_ids.shape[-1] // 2 - input_ids = input_ids[:batch_size, :sequence_length] - - # generate max 3 tokens - max_length = input_ids.shape[-1] + 3 - if config.eos_token_id is not None and config.pad_token_id is None: - # hack to allow generate for models such as GPT2 as is done in `generate()` - if isinstance(config.eos_token_id, int): - config.eos_token_id = [config.eos_token_id] - config.pad_token_id = config.eos_token_id[0] - - attention_mask = torch.ones(input_ids.shape[:2], dtype=torch.long)[:batch_size, :sequence_length] - - return config, input_ids.float(), attention_mask, max_length - def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index cef2a6781775a9..50446d4628af8c 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -282,20 +282,6 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest test_pruning = False test_missing_keys = False 
- input_name = "input_features" - - def _get_input_ids_and_config(self, batch_size=2): - config, input_ids, attention_mask, inputs_dict = GenerationTesterMixin._get_input_ids_and_config(self) - - # `input_ids` is actually `input_features` which is a 3D tensor. - # We must overwrite the mask to make it 2D since the original `_get_input_ids_and_config` creates an - # attention mask of the same shape as `input_ids`. - if len(attention_mask.shape) > 2: - sequence_length = input_ids.shape[1] - attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=attention_mask.device) - - return config, input_ids, attention_mask, inputs_dict - def setUp(self): self.model_tester = Speech2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Speech2TextConfig) @@ -632,46 +618,12 @@ def test_resize_embeddings_untied(self): def test_generate_without_input_ids(self): pass - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): - batch_size, seq_length = input_ids.shape[:2] - subsampled_seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) - num_sequences_in_output = batch_size * num_return_sequences - gen_len = ( - output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length - ) - - # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) - - # Attentions - # encoder - self._check_encoder_attention_for_generate( - output.encoder_attentions, batch_size, config, subsampled_seq_length - ) - # decoder - self._check_attentions_for_generate( - num_sequences_in_output, - output.decoder_attentions, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - - # Hidden States - # encoder - self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, subsampled_seq_length - ) - - # decoder - self._check_hidden_states_for_generate( - num_sequences_in_output, - output.decoder_hidden_states, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, + def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + # In this model, the index of `batch_size` and `sequence_length`` in `main_input` is different: they are the + # first two dimensions of the tensor. 
+ main_input = main_input[:, :, 0] + super()._check_outputs( + output, main_input, config, use_cache=use_cache, num_return_sequences=num_return_sequences ) def _create_and_check_torchscript(self, config, inputs_dict): diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index e13cf8dd56c3ef..97abf1a2cf2c2c 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -177,8 +177,6 @@ class SpeechT5ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_headmasking = False test_resize_embeddings = False - input_name = "input_values" - def setUp(self): self.model_tester = SpeechT5ModelTester(self) self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37) @@ -375,8 +373,6 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase): test_pruning = False test_headmasking = False - input_name = "input_values" - def setUp(self): self.model_tester = SpeechT5ForSpeechToTextTester(self) self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37) @@ -895,8 +891,6 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase): test_pruning = False test_headmasking = False - input_name = "input_ids" - def setUp(self): self.model_tester = SpeechT5ForTextToSpeechTester(self) self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37) @@ -1441,8 +1435,6 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase): test_headmasking = False test_resize_embeddings = False - input_name = "input_values" - def setUp(self): self.model_tester = SpeechT5ForSpeechToSpeechTester(self) self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37) @@ -1854,8 +1846,6 @@ class SpeechT5HifiGanTest(ModelTesterMixin, unittest.TestCase): is_encoder_decoder = False has_attentions = False - input_name = "spectrogram" - def setUp(self): self.model_tester = SpeechT5HifiGanTester(self) self.config_tester = ConfigTester(self, config_class=SpeechT5HifiGanConfig) diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py index b62571f189f25c..4b730017d9760d 100644 --- a/tests/models/splinter/test_modeling_splinter.py +++ b/tests/models/splinter/test_modeling_splinter.py @@ -225,11 +225,18 @@ class SplinterModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase # TODO: Fix the failed tests when this model gets more usage def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "QAPipelineTests": + if pipeline_test_case_name == "QAPipelineTests": return True - elif pipeline_test_casse_name == "FeatureExtractionPipelineTests" and tokenizer_name.endswith("Fast"): + elif pipeline_test_case_name == "FeatureExtractionPipelineTests" and tokenizer_name.endswith("Fast"): return True return False diff --git a/tests/models/splinter/test_tokenization_splinter.py b/tests/models/splinter/test_tokenization_splinter.py new file mode 100644 index 00000000000000..4c6d295e8a8281 --- /dev/null +++ b/tests/models/splinter/test_tokenization_splinter.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +from tests.test_tokenization_common import TokenizerTesterMixin +from transformers import SplinterTokenizerFast, is_tf_available, is_torch_available +from transformers.models.splinter import SplinterTokenizer +from transformers.testing_utils import get_tests_dir, slow + + +SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.txt") + + +if is_torch_available(): + FRAMEWORK = "pt" +elif is_tf_available(): + FRAMEWORK = "tf" +else: + FRAMEWORK = "jax" + + +class SplinterTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = SplinterTokenizer + rust_tokenizer_class = SplinterTokenizerFast + space_between_special_tokens = False + test_rust_tokenizer = False + test_sentencepiece_ignore_case = False + pre_trained_model_path = "tau/splinter-base" + + # Copied from transformers.models.siglip.SiglipTokenizationTest.setUp + def setUp(self): + super().setUp() + tokenizer = SplinterTokenizer(SAMPLE_VOCAB) + tokenizer.vocab["[UNK]"] = len(tokenizer.vocab) + tokenizer.vocab["[QUESTION]"] = len(tokenizer.vocab) + tokenizer.vocab["."] = len(tokenizer.vocab) + tokenizer.add_tokens("this is a test thou shall not determine rigor truly".split()) + tokenizer.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs) -> SplinterTokenizer: + return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs) -> SplinterTokenizerFast: + return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_get_vocab + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + self.assertEqual(vocab_keys[0], "[PAD]") + self.assertEqual(vocab_keys[1], "[SEP]") + self.assertEqual(vocab_keys[2], "[MASK]") + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_convert_token_and_id + def test_convert_token_and_id(self): + token = "[PAD]" + token_id = 0 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_question_token_id(self): + tokenizer = self.get_tokenizer() + self.assertEqual(tokenizer.question_token_id, tokenizer.convert_tokens_to_ids(tokenizer.question_token)) + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_full_tokenizer + def test_full_tokenizer(self): + tokenizer = self.get_tokenizer() + test_str = "This is a test" + + unk_token = tokenizer.unk_token + unk_token_id = tokenizer._convert_token_to_id_with_added_voc(unk_token) + + expected_tokens = test_str.lower().split() + tokenizer.add_tokens(expected_tokens) + tokens = tokenizer.tokenize(test_str) + self.assertListEqual(tokens, expected_tokens) + + # test with out of vocabulary string + tokens = tokenizer.tokenize(test_str + " oov") + self.assertListEqual(tokens, expected_tokens + [unk_token]) + + expected_token_ids = [13, 14, 15, 16, unk_token_id] + 
token_ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(token_ids, expected_token_ids) + + tokenizer = self.get_tokenizer(basic_tokenize=False) + expected_token_ids = [13, 14, 15, 16, unk_token_id] + token_ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(token_ids, expected_token_ids) + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_rust_and_python_full_tokenizers + def test_rust_and_python_full_tokenizers(self): + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I need to test this rigor" + tokens = tokenizer.tokenize(sequence, add_special_tokens=False) + rust_tokens = rust_tokenizer.tokenize(sequence, add_special_tokens=False) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_max_length + def test_max_length(self): + max_length = 20 + tokenizer = self.get_tokenizer() + texts = ["this is a test", "I have pizza for lunch"] + tokenized = tokenizer( + text_target=texts, + max_length=max_length, + padding="max_length", + truncation=True, + return_tensors=FRAMEWORK, + ) + self.assertEqual(len(tokenized["input_ids"]), len(texts)) + self.assertEqual(len(tokenized["input_ids"][0]), max_length) + self.assertEqual(len(tokenized["input_ids"][1]), max_length) + self.assertEqual(len(tokenized["attention_mask"][0]), max_length) + self.assertEqual(len(tokenized["attention_mask"][1]), max_length) + self.assertEqual(len(tokenized["token_type_ids"][0]), max_length) + self.assertEqual(len(tokenized["token_type_ids"][1]), max_length) + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_tokenizer_integration + # fmt:skip + @slow + def test_tokenizer_integration(self): + tokenizer = SplinterTokenizer.from_pretrained("tau/splinter-base", max_length=10) + texts = [ + "The cat sat on the windowsill, watching birds in the garden.", + "She baked a delicious cake for her sister's birthday party.", + "The sun set over the horizon, painting the sky with vibrant colors.", + ] + # fmt:off + expected_token_id_list = [ + [101, 1109, 5855, 2068, 1113, 1103, 3751, 7956, 117, 2903, 4939, 1107, 1103, 4605, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1153, 19983, 170, 13108, 10851, 1111, 1123, 2104, 112, 188, 5913, 1710, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1109, 3336, 1383, 1166, 1103, 11385, 117, 3504, 1103, 3901, 1114, 18652, 5769, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ] + # fmt:on + for text, expected_token_ids in zip(texts, expected_token_id_list): + input_ids = tokenizer(text, 
padding="max_length").input_ids + self.assertListEqual(input_ids, expected_token_ids) + + def test_special_tokens_mask_input_pairs(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequence_0 = "Encode this." + sequence_1 = "This one too please." + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, + sequence_1, + add_special_tokens=True, + return_special_tokens_mask=True, + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + # splinter tokenizer always add cls, question_suffix, and 2 separators + # while in special_token_mask it does not seems to do that + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special) - 2) diff --git a/tests/models/stablelm/test_modeling_stablelm.py b/tests/models/stablelm/test_modeling_stablelm.py index 36cad89bcfdf06..c88fda6fb84e06 100644 --- a/tests/models/stablelm/test_modeling_stablelm.py +++ b/tests/models/stablelm/test_modeling_stablelm.py @@ -113,7 +113,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: diff --git a/tests/models/starcoder2/test_modeling_starcoder2.py b/tests/models/starcoder2/test_modeling_starcoder2.py index c1c7d45d4f18d7..32d28143d72ffa 100644 --- a/tests/models/starcoder2/test_modeling_starcoder2.py +++ b/tests/models/starcoder2/test_modeling_starcoder2.py @@ -107,7 +107,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) token_type_ids = None if self.use_token_type_ids: @@ -311,7 +311,14 @@ class Starcoder2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 93634ef2a67099..fe9b40a54abef7 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -585,7 +585,14 @@ def setUp(self): # `QAPipelineTests` is not working well with slow tokenizers (for some models) and we don't want to touch the file # `src/transformers/data/processors/squad.py` (where this test fails for this model) def is_pipeline_test_to_skip( - self, pipeline_test_case_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if tokenizer_name is None: return True @@ -1056,6 +1063,26 @@ def test_with_token_classification_head(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_with_token_classification_head(*config_and_inputs) + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if tokenizer_name is None: + return True + + # `T5EncoderOnlyModelTest` is not working well with slow tokenizers (for some models) and we don't want to touch the file + # `src/transformers/data/processors/squad.py` (where this test fails for this model) + if pipeline_test_case_name == "TokenClassificationPipelineTests" and not tokenizer_name.endswith("Fast"): + return True + + return False + def use_task_specific_params(model, task): model.config.update(model.config.task_specific_params[task]) diff --git a/tests/models/t5/test_modeling_tf_t5.py b/tests/models/t5/test_modeling_tf_t5.py index d7b6fd84d5fdb6..037f1b1e2188a4 100644 --- a/tests/models/t5/test_modeling_tf_t5.py +++ b/tests/models/t5/test_modeling_tf_t5.py @@ -470,7 +470,7 @@ def test_greedy_xla_generate_simple(self): self.assertListEqual(expected_output_string, output_strings_xla) @slow - def test_greedy_generate(self): + def test_t5_greedy_generate(self): model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small") tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") @@ -520,7 +520,7 @@ def test_sample_xla_generate_simple(self): self.assertListEqual(expected_output_string_xla, output_strings_xla) @slow - def test_sample_generate(self): + def test_t5_sample_generate(self): model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small") tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") diff --git a/tests/models/tapas/test_modeling_tapas.py b/tests/models/tapas/test_modeling_tapas.py index d4ca5e82e4c2e3..4ee159d6bddd1d 100644 --- a/tests/models/tapas/test_modeling_tapas.py +++ b/tests/models/tapas/test_modeling_tapas.py @@ -492,7 +492,14 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + 
self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/tapas/test_modeling_tf_tapas.py b/tests/models/tapas/test_modeling_tf_tapas.py index 0da8b2879e8c07..b70ec13452b35c 100644 --- a/tests/models/tapas/test_modeling_tf_tapas.py +++ b/tests/models/tapas/test_modeling_tf_tapas.py @@ -447,7 +447,14 @@ class TFTapasModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCas # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): return True diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index 2bb841e65e6512..1bd01da8e6caec 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -322,9 +322,16 @@ def setUp(self): # `QAPipelineTests` is not working well with slow tokenizers (for some models) and we don't want to touch the file # `src/transformers/data/processors/squad.py` (where this test fails for this model) def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): + if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): return True return False @@ -723,6 +730,26 @@ def test_with_token_classification_head(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_with_token_classification_head(*config_and_inputs) + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if tokenizer_name is None: + return True + + # `UMT5EncoderOnlyModelTest` is not working well with slow tokenizers (for some models) and we don't want to touch the file + # `src/transformers/data/processors/squad.py` (where this test fails for this model) + if pipeline_test_case_name == "TokenClassificationPipelineTests" and not tokenizer_name.endswith("Fast"): + return True + + return False + @require_torch @require_sentencepiece diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py index f26a423a1a2f5b..84d28c645874d1 100644 --- a/tests/models/univnet/test_modeling_univnet.py +++ b/tests/models/univnet/test_modeling_univnet.py @@ -118,8 +118,6 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase): is_encoder_decoder = False has_attentions = False - input_name = "input_features" - def setUp(self): self.model_tester = UnivNetModelTester(self) self.config_tester = ConfigTester( diff --git a/tests/models/vit/test_modeling_flax_vit.py b/tests/models/vit/test_modeling_flax_vit.py index fb53caa3433ac2..97fc3082a98daa 100644 --- a/tests/models/vit/test_modeling_flax_vit.py +++ b/tests/models/vit/test_modeling_flax_vit.py @@ -72,6 +72,7 @@ def __init__( # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] 
token) num_patches = (image_size // patch_size) ** 2 self.seq_length = num_patches + 1 + super().__init__() def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) diff --git a/tests/models/vits/test_modeling_vits.py b/tests/models/vits/test_modeling_vits.py index 99ba51e35f6663..366194090953f5 100644 --- a/tests/models/vits/test_modeling_vits.py +++ b/tests/models/vits/test_modeling_vits.py @@ -167,8 +167,6 @@ class VitsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_torchscript = False has_attentions = False - input_name = "input_ids" - def setUp(self): self.model_tester = VitsModelTester(self) self.config_tester = ConfigTester(self, config_class=VitsConfig, hidden_size=37) diff --git a/tests/models/vivit/test_modeling_vivit.py b/tests/models/vivit/test_modeling_vivit.py index 19a179a6a3e030..7cce77e6fc0019 100644 --- a/tests/models/vivit/test_modeling_vivit.py +++ b/tests/models/vivit/test_modeling_vivit.py @@ -359,12 +359,12 @@ def test_inference_interpolate_pos_encoding(self): # allowing to interpolate the pre-trained position embeddings in order to use # the model on higher resolutions. The DINO model by Facebook AI leverages this # to visualize self-attention on higher resolution images. - model = VivitModel.from_pretrained("google/vivit-b-16x2").to(torch_device) + model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400").to(torch_device) - image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2") + image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400") video = prepare_video() inputs = image_processor( - video, size={"shortest_edge": 480}, crop_size={"height": 480, "width": 480}, return_tensors="pt" + video, size={"shortest_edge": 480}, crop_size={"height": 232, "width": 232}, return_tensors="pt" ) pixel_values = inputs.pixel_values.to(torch_device) diff --git a/tests/models/wav2vec2/test_processor_wav2vec2.py b/tests/models/wav2vec2/test_processor_wav2vec2.py index 67883618ca86e9..30c9243e8e4f55 100644 --- a/tests/models/wav2vec2/test_processor_wav2vec2.py +++ b/tests/models/wav2vec2/test_processor_wav2vec2.py @@ -18,14 +18,19 @@ import tempfile import unittest +import numpy as np + from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES from transformers.utils import FEATURE_EXTRACTOR_NAME +from ...test_processing_common import ProcessorTesterMixin from .test_feature_extraction_wav2vec2 import floats_list -class Wav2Vec2ProcessorTest(unittest.TestCase): +class Wav2Vec2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Wav2Vec2Processor + def setUp(self): vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") vocab_tokens = dict(zip(vocab, range(len(vocab)))) @@ -53,6 +58,9 @@ def setUp(self): with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(feature_extractor_map) + "\n") + tokenizer = self.get_tokenizer() + tokenizer.save_pretrained(self.tmpdirname) + def get_tokenizer(self, **kwargs_init): kwargs = self.add_kwargs_tokens_map.copy() kwargs.update(kwargs_init) @@ -117,7 +125,6 @@ def test_tokenizer(self): processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) input_str = "This is a test string" - encoded_processor = processor(text=input_str) encoded_tok = 
tokenizer(input_str) @@ -125,6 +132,22 @@ def test_tokenizer(self): for key in encoded_tok.keys(): self.assertListEqual(encoded_tok[key], encoded_processor[key]) + def test_padding_argument_not_ignored(self): + # padding, or any other argument shared by the audio feature extractor and the tokenizer, + # should be passed to both the text and the audio inputs and not ignored + + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + # padding=True should not raise an error, but it will if the audio processor has popped its value to None + _ = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + def test_tokenizer_decode(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() diff --git a/tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py index b6b1506f5e4d68..704d087a56a8e3 100644 --- a/tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py +++ b/tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py @@ -18,17 +18,21 @@ import tempfile import unittest +import numpy as np + from transformers.models.seamless_m4t import SeamlessM4TFeatureExtractor from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES from transformers.models.wav2vec2_bert import Wav2Vec2BertProcessor from transformers.utils import FEATURE_EXTRACTOR_NAME +from ...test_processing_common import ProcessorTesterMixin from ..wav2vec2.test_feature_extraction_wav2vec2 import floats_list + -# Copied from tests.models.wav2vec2.test_processor_wav2vec2.Wav2Vec2ProcessorTest with Wav2Vec2FeatureExtractor->SeamlessM4TFeatureExtractor, Wav2Vec2Processor->Wav2Vec2BertProcessor -class Wav2Vec2BertProcessorTest(unittest.TestCase): +class Wav2Vec2BertProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Wav2Vec2BertProcessor + def setUp(self): vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") vocab_tokens = dict(zip(vocab, range(len(vocab)))) @@ -40,7 +44,7 @@ def setUp(self): "eos_token": "", } feature_extractor_map = { - "feature_size": 1, + "feature_size": 80, "padding_value": 0.0, "sampling_rate": 16000, "return_attention_mask": False, @@ -56,6 +60,9 @@ def setUp(self): with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(feature_extractor_map) + "\n") + tokenizer = self.get_tokenizer() + tokenizer.save_pretrained(self.tmpdirname) + def get_tokenizer(self, **kwargs_init): kwargs = self.add_kwargs_tokens_map.copy() kwargs.update(kwargs_init) @@ -122,7 +129,6 @@ def test_tokenizer(self): processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) input_str = "This is a test string" - encoded_processor = processor(text=input_str) encoded_tok = tokenizer(input_str) @@ -130,6 +136,22 @@ def test_tokenizer(self): for key in encoded_tok.keys(): self.assertListEqual(encoded_tok[key], encoded_processor[key]) + def test_padding_argument_not_ignored(self): + # padding, or any other argument shared by the audio feature extractor and the tokenizer, + # should be passed to both the text and the audio inputs and not ignored + feature_extractor = self.get_feature_extractor() + tokenizer = 
self.get_tokenizer() + + processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + # padding = True should not raise an error and will if the audio processor popped its value to None + # processor(input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt") + _ = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + def test_tokenizer_decode(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index b4e71ca72e56ed..b24c577a16e575 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -395,13 +395,18 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # `0.5` is for `test_disk_offload` (which also works for `test_model_parallelism`) model_split_percents = [0.5, 0.8, 0.9] - input_name = "input_features" - # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name in [ + if pipeline_test_case_name in [ "AutomaticSpeechRecognitionPipelineTests", "AudioClassificationPipelineTests", ]: @@ -868,48 +873,6 @@ def test_resize_embeddings_untied(self): def test_generate_without_input_ids(self): pass - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): - batch_size, mel, seq_length = input_ids.shape - subsampled_seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) - num_sequences_in_output = batch_size * num_return_sequences - gen_len = ( - output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length - ) - - # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) - - # Attentions - # encoder - self._check_encoder_attention_for_generate( - output.encoder_attentions, batch_size, config, subsampled_seq_length - ) - # decoder - self._check_attentions_for_generate( - num_sequences_in_output, - output.decoder_attentions, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - - # Hidden States - # encoder - self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, subsampled_seq_length - ) - - # decoder - self._check_hidden_states_for_generate( - num_sequences_in_output, - output.decoder_hidden_states, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test @@ -1960,14 +1923,14 @@ def test_large_generation_multilingual(self): input_features, do_sample=False, max_length=20, language="<|de|>", task="transcribe" ) transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - EXPECTED_TRANSCRIPT = " Mein sechster Sohn scheint, wenigstens auf den ersten Blick," + EXPECTED_TRANSCRIPT = " Denken Sie, soeben walten meine Gedanken bei Ihnen in Adela" 
self.assertEqual(transcript, EXPECTED_TRANSCRIPT) generated_ids = model.generate( input_features, do_sample=False, max_length=20, language="<|de|>", task="translate" ) transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - EXPECTED_TRANSCRIPT = " My sixth son seems, at least at first glance, the most deeply-minded" + EXPECTED_TRANSCRIPT = " Think, my thoughts were just rolling with you in Adelaide, and I" self.assertEqual(transcript, EXPECTED_TRANSCRIPT) @slow @@ -2144,6 +2107,21 @@ def test_tiny_timestamp_generation(self): transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) self.assertEqual(transcript, EXPECTED_TRANSCRIPT) + @slow + def test_distil_token_timestamp_generation(self): + # we actually just want to check that returning segments with distil model works + processor = WhisperProcessor.from_pretrained("distil-whisper/distil-large-v3") + model = WhisperForConditionalGeneration.from_pretrained("distil-whisper/distil-large-v3") + model.to(torch_device) + + input_speech = np.concatenate(self._load_datasamples(4)) + input_features = processor(input_speech, return_tensors="pt", sampling_rate=16_000).input_features + input_features = input_features.to(torch_device) + + _ = model.generate( + input_features, max_length=448, return_timestamps=True, return_token_timestamps=True, return_segments=True + ) + @slow def test_tiny_longform_timestamps_generation(self): set_seed(0) @@ -2282,7 +2260,7 @@ def test_tiny_token_timestamp_generation(self): input_features, max_length=448, return_timestamps=True, return_token_timestamps=True ) - self.assertEqual(generate_outputs.sequences.shape, generate_outputs.token_timestamps.shape) + self.assertEqual(generate_outputs["sequences"].shape, generate_outputs["token_timestamps"].shape) # fmt: off EXPECTED_OUTPUT = torch.tensor([ @@ -2293,7 +2271,7 @@ def test_tiny_token_timestamp_generation(self): ]) # fmt: on - self.assertTrue(torch.allclose(generate_outputs.token_timestamps.to("cpu"), EXPECTED_OUTPUT)) + self.assertTrue(torch.allclose(generate_outputs["token_timestamps"].to("cpu"), EXPECTED_OUTPUT)) @slow def test_large_token_timestamp_generation(self): @@ -2312,7 +2290,7 @@ def test_large_token_timestamp_generation(self): **input_features, max_length=448, return_timestamps=True, return_token_timestamps=True ) - self.assertEqual(generate_outputs.sequences.shape, generate_outputs.token_timestamps.shape) + self.assertEqual(generate_outputs["sequences"].shape, generate_outputs["token_timestamps"].shape) # fmt: off EXPECTED_OUTPUT = torch.tensor([ @@ -2323,7 +2301,7 @@ def test_large_token_timestamp_generation(self): ]) # fmt: on - self.assertTrue(torch.allclose(generate_outputs.token_timestamps.to("cpu"), EXPECTED_OUTPUT)) + self.assertTrue(torch.allclose(generate_outputs["token_timestamps"].to("cpu"), EXPECTED_OUTPUT)) @slow def test_tiny_token_timestamp_batch_generation(self): @@ -2350,9 +2328,9 @@ def test_tiny_token_timestamp_batch_generation(self): ) # task id and lang id prompts should not have timestamp tokens - self.assertEqual(generate_outputs.sequences.shape[-1] - 2, generate_outputs.token_timestamps.shape[-1]) + self.assertEqual(generate_outputs["sequences"].shape[-1] - 2, generate_outputs["token_timestamps"].shape[-1]) - self.assertEqual(len(generate_outputs.sequences), num_return_sequences * num_samples) + self.assertEqual(len(generate_outputs["sequences"]), num_return_sequences * num_samples) @slow def test_tiny_token_timestamp_generation_longform(self): @@ -2843,7 +2821,7 
@@ def test_whisper_shortform_single_batch_prev_cond(self): torch.manual_seed(0) result = model.generate(input_features, **gen_kwargs) - decoded = processor.batch_decode(result.sequences, skip_special_tokens=True) + decoded = processor.batch_decode(result, skip_special_tokens=True) assert decoded == EXPECTED_TEXT @@ -2858,7 +2836,7 @@ def test_whisper_shortform_single_batch_prev_cond(self): torch.manual_seed(0) result = model.generate(input_features, **gen_kwargs) - decoded = processor.batch_decode(result.sequences, skip_special_tokens=True) + decoded = processor.batch_decode(result, skip_special_tokens=True) assert decoded == EXPECTED_TEXT1 @@ -3158,7 +3136,7 @@ def test_whisper_shortform_multi_batch_hard_prev_cond(self): } result = model.generate(**inputs, **gen_kwargs) - decoded_all = processor.batch_decode(result.sequences, skip_special_tokens=True) + decoded_all = processor.batch_decode(result, skip_special_tokens=True) for i in range(num_samples): if isinstance(EXPECTED_TEXT[i], str): @@ -3511,8 +3489,6 @@ class WhisperEncoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest. test_pruning = False test_missing_keys = False - input_name = "input_features" - def setUp(self): self.model_tester = WhisperEncoderModelTester(self) self.config_tester = ConfigTester(self, config_class=WhisperConfig) diff --git a/tests/models/xlm/test_modeling_tf_xlm.py b/tests/models/xlm/test_modeling_tf_xlm.py index 139f29db007b4a..f9172a1b4eb05d 100644 --- a/tests/models/xlm/test_modeling_tf_xlm.py +++ b/tests/models/xlm/test_modeling_tf_xlm.py @@ -312,10 +312,17 @@ class TFXLMModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase) # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): diff --git a/tests/models/xlm/test_modeling_xlm.py b/tests/models/xlm/test_modeling_xlm.py index 268ba79d5931ff..556f97c0b2c935 100644 --- a/tests/models/xlm/test_modeling_xlm.py +++ b/tests/models/xlm/test_modeling_xlm.py @@ -393,10 +393,17 @@ class XLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py index a73f5618ff7eda..5b426d27799fbb 100644 --- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py +++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py @@ -392,9 +392,16 @@ class XLMRobertaXLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTes # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + 
self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): + if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): return True return False diff --git a/tests/models/xlnet/test_modeling_tf_xlnet.py b/tests/models/xlnet/test_modeling_tf_xlnet.py index ea223914a35976..71a308bf52f5d4 100644 --- a/tests/models/xlnet/test_modeling_tf_xlnet.py +++ b/tests/models/xlnet/test_modeling_tf_xlnet.py @@ -372,7 +372,14 @@ def test_xla_generate_contrastive(self): # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): # Exception encountered when calling layer '...' return True diff --git a/tests/models/xlnet/test_modeling_xlnet.py b/tests/models/xlnet/test_modeling_xlnet.py index 0d785d4a1fc85d..630c8d2e634eaf 100644 --- a/tests/models/xlnet/test_modeling_xlnet.py +++ b/tests/models/xlnet/test_modeling_xlnet.py @@ -543,9 +543,16 @@ class XLNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): + if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): return True return False diff --git a/tests/models/xmod/test_modeling_xmod.py b/tests/models/xmod/test_modeling_xmod.py index 0140d007d06897..ae9a35c5d6371f 100644 --- a/tests/models/xmod/test_modeling_xmod.py +++ b/tests/models/xmod/test_modeling_xmod.py @@ -386,9 +386,16 @@ class XmodModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): + if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): return True return False diff --git a/tests/models/zamba/__init__.py b/tests/models/zamba/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py new file mode 100644 index 00000000000000..c0a8020bedd76a --- /dev/null +++ b/tests/models/zamba/test_modeling_zamba.py @@ -0,0 +1,736 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Zamba model.""" + +import math +import tempfile +import unittest + +import pytest +from parameterized import parameterized + +from transformers import AutoTokenizer, ZambaConfig, is_torch_available +from transformers.testing_utils import ( + require_bitsandbytes, + require_flash_attn, + require_torch, + require_torch_gpu, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + ZambaForCausalLM, + ZambaForSequenceClassification, + ZambaModel, + ) + from transformers.models.zamba.modeling_zamba import ( + HybridMambaAttentionDynamicCache, + ) + + +class ZambaModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=64, + mamba_dt_rank=32, + num_hidden_layers=5, + attn_layer_offset=1, + attn_layer_period=8, + num_attention_heads=4, + num_key_value_heads=4, + n_mamba_heads=2, + intermediate_size=37, + hidden_act="gelu", + hidden_mamba_act="silu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.mamba_dt_rank = mamba_dt_rank + self.num_hidden_layers = num_hidden_layers + self.attn_layer_offset = attn_layer_offset + self.attn_layer_period = attn_layer_period + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.n_mamba_heads = n_mamba_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_mamba_act = hidden_mamba_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + 
choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return ZambaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + mamba_dt_rank=self.mamba_dt_rank, + num_hidden_layers=self.num_hidden_layers, + attn_layer_offset=self.attn_layer_offset, + attn_layer_period=self.attn_layer_period, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + n_mamba_heads=self.n_mamba_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_mamba_act=self.hidden_mamba_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=True, + initializer_range=self.initializer_range, + use_mamba_kernels=False, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + + return ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = ZambaModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = ZambaForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids, labels=token_labels) + result = model(input_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + config.is_decoder = True + config.add_cross_attention = True + model = ZambaForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + # Attention: Zamba needs the cache to be initialized to return a cache! 
+        past_key_values = HybridMambaAttentionDynamicCache(
+            config, input_ids.shape[0], model.dtype, device=model.device
+        )
+        outputs = model(
+            input_ids,
+            attention_mask=input_mask,
+            past_key_values=past_key_values,
+            use_cache=True,
+        )
+        past_key_values = outputs.past_key_values
+
+        # create hypothetical multiple next tokens and extend to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
+
+        # append to next input_ids and attention_mask
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
+
+        output_from_no_past = model(
+            next_input_ids,
+            attention_mask=next_attention_mask,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+        output_from_past = model(
+            next_tokens,
+            attention_mask=next_attention_mask,
+            past_key_values=past_key_values,
+            output_hidden_states=True,
+            cache_position=torch.arange(
+                input_ids.shape[1], input_ids.shape[1] + next_tokens.shape[1], device=model.device
+            ),
+        )["hidden_states"][0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def create_and_check_for_sequence_classification(
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = ZambaForSequenceClassification(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_torch
+class ZambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (
+            ZambaModel,
+            ZambaForCausalLM,
+            ZambaForSequenceClassification,
+        )
+        if is_torch_available()
+        else ()
+    )
+    all_generative_model_classes = (ZambaForCausalLM,) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {
+            "feature-extraction": ZambaModel,
+            "text-classification": ZambaForSequenceClassification,
+            "text-generation": ZambaForCausalLM,
+            "zero-shot": ZambaForSequenceClassification,
+        }
+        if is_torch_available()
+        else {}
+    )
+    test_headmasking = False
+    test_pruning = False
+
+    def setUp(self):
+        self.model_tester = ZambaModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=ZambaConfig, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_for_causal_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
+
+    def 
test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_initialization(self): + r""" + Overriding the test_initialization test as the A_log and D params of the Mamba block are initialized differently + """ + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "A_log" in name: + A = torch.arange(1, config.mamba_d_state + 1, dtype=torch.float32)[None, :] + self.assertTrue(torch.allclose(param.data, torch.log(A), atol=1e-5, rtol=1e-5)) + elif "D" in name: + # check if it's a ones like + self.assertTrue(torch.allclose(param.data, torch.ones_like(param.data), atol=1e-5, rtol=1e-5)) + elif "x_proj" in name or "dt_proj_weight" in name: + self.assertIn( + ((param.data.mean() * 1e2).round() / 1e2).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized (raw value {param.data.mean()})", + ) + elif "dt_proj_bias" in name: + dt = torch.exp( + torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min)) + + math.log(config.time_step_min) + ).clamp(min=config.time_step_floor) + inv_dt = dt + torch.log(-torch.expm1(-dt)) + if param.requires_grad: + self.assertTrue(param.data.max().item() <= inv_dt[1]) + self.assertTrue(param.data.min().item() >= inv_dt[0]) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_mismatched_shapes_have_properly_initialized_weights(self): + r""" + Overriding the test_mismatched_shapes_have_properly_initialized_weights test because A_log and D params of the + Mamba block are initialized differently and we tested that in test_initialization + """ + self.skipTest("Cumbersome and redundant for Zamba") + + def test_attention_outputs(self): + r""" + Overriding the test_attention_outputs test as the Zamba model outputs attention only for its attention layers + """ + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + expected_num_attentions = ( + math.ceil( + (self.model_tester.num_hidden_layers - self.model_tester.attn_layer_offset) + / self.model_tester.attn_layer_period + ) + + 1 + ) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), expected_num_attentions) + + # check that output_attentions also work using config + del 
inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.attentions
+            self.assertEqual(len(attentions), expected_num_attentions)
+
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
+            out_len = len(outputs)
+
+            # Check attention is always last and order is fine
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            added_hidden_states = 1
+            self.assertEqual(out_len + added_hidden_states, len(outputs))
+
+            self_attentions = outputs.attentions
+
+            self.assertEqual(len(self_attentions), expected_num_attentions)
+            self.assertListEqual(
+                list(self_attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
+
+    def _get_input_ids_and_config(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        return config, input_ids, input_mask
+
+    def test_left_padding_compatibility(self):
+        r"""
+        Overriding the test_left_padding_compatibility test as the Mamba layers accentuate the numerical differences
+        caused by left padding (see the issue linked in the note below). A more permissive tolerance value is used.
+        """
+        import inspect
+        # NOTE: left-padding results in small numerical differences. This is expected.
+        # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535
+
+        # First, filter out models that don't support left padding - generative and decoder-only.
+ # Zamba is a decoder-only architecture + decoder_only_classes = self.all_generative_model_classes + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, input_ids, attention_mask = self._get_input_ids_and_config() + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] + + # With left-padding (length 32) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * config.pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) + next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=3e-3)) + + @require_flash_attn + @require_torch_gpu + @require_bitsandbytes + @pytest.mark.flash_attn_test + @slow + def test_flash_attn_2_fp32_ln(self): + r""" + Overriding the test_flash_attn_2_fp32_ln test as the Zamba model, like Mixtral, doesn't support + right padding + use cache with FA2 + """ + for model_class in self.all_generative_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_input = inputs_dict[model.main_input_name] + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + # NOTE: Zamba does not support right padding + use_cache with FA2. 
+ dummy_attention_mask[:, -1] = 1 + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + load_in_4bit=True, + ) + + for _, param in model.named_parameters(): + # upcast only layer norms + if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16): + param.data = param.data.to(torch.float32) + + _ = model(dummy_input) + # with attention mask + _ = model(dummy_input, attention_mask=dummy_attention_mask) + + @require_flash_attn + @require_torch_gpu + @pytest.mark.flash_attn_test + @slow + def test_flash_attn_2_generate_padding_right(self): + r""" + Overriding the test_flash_attn_2_generate_padding_right test as the Zamba model, like Mixtral, doesn't support + right padding + use cache with FA2 + """ + import torch + + for model_class in self.all_generative_model_classes: + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( + torch_device + ) + + dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) + dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) + + model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + with self.assertRaises(ValueError): + _ = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False + ) + + @require_flash_attn + @require_torch_gpu + @pytest.mark.flash_attn_test + @slow + def test_flash_attn_2_generate_use_cache(self): + r""" + Overriding the test_flash_attn_2_generate_use_cache test as the Zamba model, like Mixtral, doesn't support + right padding + use cache with FA2 + """ + import torch + + max_new_tokens = 30 + + for model_class in self.all_generative_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + # NOTE: Zamba does not support right padding + use_cache with FA2. 
+ dummy_attention_mask[:, -1] = 1 + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + # Just test that a large cache works as expected + _ = model.generate( + dummy_input, + attention_mask=dummy_attention_mask, + max_new_tokens=max_new_tokens, + do_sample=False, + use_cache=True, + ) + + @require_flash_attn + @require_torch_gpu + @pytest.mark.flash_attn_test + @slow + def test_flash_attn_2_inference_equivalence_right_padding(self): + r""" + Overriding the test_flash_attn_2_inference_padding_right test as the Zamba model, like Mixtral, doesn't support + right padding + use cache with FA2 + """ + self.skipTest(reason="Zamba flash attention does not support right padding") + + @unittest.skip(reason="Zamba has its own special cache type") + @parameterized.expand([(1, False), (1, True), (4, False)]) + def test_new_cache_format(self, num_beams, do_sample): + pass + + +@require_torch +class ZambaModelIntegrationTest(unittest.TestCase): + model = None + tokenizer = None + + @classmethod + @slow + def setUpClass(cls): + model_id = "Zyphra/Zamba-7B-v1" + cls.model = ZambaForCausalLM.from_pretrained( + model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, use_mamba_kernels=False + ) + cls.tokenizer = AutoTokenizer.from_pretrained(model_id) + + @slow + def test_simple_generate(self): + self.model.to(torch_device) + + input_ids = self.tokenizer("Hey how are you doing on this lovely evening?", return_tensors="pt")[ + "input_ids" + ].to(torch_device) + out = self.model.generate(input_ids, do_sample=False, max_new_tokens=10) + output_sentence = self.tokenizer.decode(out[0, :]) + self.assertEqual( + output_sentence, + " Hey how are you doing on this lovely evening? I hope you are all doing well. I am", + ) + + with torch.no_grad(): + logits = self.model(input_ids=input_ids).logits + + EXPECTED_LOGITS_NO_GRAD = torch.tensor( + [ + -7.9375, 8.1875, 1.3984, -6.0000, -7.9375, -7.9375, -7.9375, -7.9375, + -7.9375, -7.9375, -7.9375, -7.9375, 2.7500, 13.0625, -7.9375, -7.9375, + -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, + -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, + -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375 + ] + , dtype=torch.float32) # fmt: skip + + torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD, rtol=1e-3, atol=1e-3) + + @slow + def test_simple_batched_generate_with_padding(self): + self.model.to(torch_device) + self.tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + self.model.resize_token_embeddings(len(self.tokenizer)) + + inputs = self.tokenizer( + ["Hey how are you doing on this lovely evening?", "Tell me a story"], padding=True, return_tensors="pt" + ).to(torch_device) + out = self.model.generate(**inputs, do_sample=False, max_new_tokens=10) + output_sentences = self.tokenizer.batch_decode(out) + self.assertEqual( + output_sentences[0], + " Hey how are you doing on this lovely evening? I hope you are all doing well. 
I am", + ) + self.assertEqual( + output_sentences[1], + "[PAD][PAD][PAD][PAD][PAD][PAD] Tell me a story about a time when you were in a difficult situation", + ) + + with torch.no_grad(): + logits = self.model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]).logits + + EXPECTED_LOGITS_NO_GRAD_0 = torch.tensor( + [ + -7.9375, 8.1250, 1.3594, -6.0000, -7.9375, -7.9375, -7.9375, -7.9375, + -7.9375, -7.9375, -7.9375, -7.9375, 2.7344, 13.0625, -7.9375, -7.9375, + -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, + -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, + -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375, -7.9375 + ] + , dtype=torch.float32) # fmt: skip + + EXPECTED_LOGITS_NO_GRAD_1 = torch.tensor( + [ + -6.3750, 3.4219, 0.6719, -5.0312, -8.5000, -8.5000, -8.5000, -8.5000, + -8.5000, -8.5000, -8.5000, -8.5000, 2.0625, 10.3750, -8.5000, -8.5000, + -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, + -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, + -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, -8.5000, -8.5000 + ] + , dtype=torch.float32) # fmt: skip + + torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD_0, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(logits[1, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD_1, rtol=1e-3, atol=1e-3) diff --git a/tests/peft_integration/test_peft_integration.py b/tests/peft_integration/test_peft_integration.py index 602ed04d9c6271..a80919dc61cf3f 100644 --- a/tests/peft_integration/test_peft_integration.py +++ b/tests/peft_integration/test_peft_integration.py @@ -12,11 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import importlib import os import tempfile import unittest from huggingface_hub import hf_hub_download +from packaging import version from transformers import AutoModelForCausalLM, OPTForCausalLM from transformers.testing_utils import ( @@ -478,6 +480,48 @@ def test_peft_add_adapter_with_state_dict(self): # dummy generation _ = model.generate(input_ids=dummy_input) + def test_peft_add_adapter_with_state_dict_low_cpu_mem_usage(self): + """ + Check the usage of low_cpu_mem_usage, which is supported in PEFT >= 0.13.0 + """ + from peft import LoraConfig + + min_version_lcmu = "0.13.0" + is_lcmu_supported = version.parse(importlib.metadata.version("peft")) >= version.parse(min_version_lcmu) + + for model_id, peft_model_id in zip(self.transformers_test_model_ids, self.peft_test_model_ids): + for transformers_class in self.transformers_test_model_classes: + model = transformers_class.from_pretrained(model_id).to(torch_device) + + peft_config = LoraConfig() + state_dict_path = hf_hub_download(peft_model_id, "adapter_model.bin") + dummy_state_dict = torch.load(state_dict_path) + + # this should always work + model.load_adapter( + adapter_state_dict=dummy_state_dict, peft_config=peft_config, low_cpu_mem_usage=False + ) + + if is_lcmu_supported: + # if supported, this should not raise an error + model.load_adapter( + adapter_state_dict=dummy_state_dict, + adapter_name="other", + peft_config=peft_config, + low_cpu_mem_usage=True, + ) + # after loading, no meta device should be remaining + self.assertFalse(any((p.device.type == "meta") for p in model.parameters())) + else: + err_msg = r"The version of PEFT you are using does not support `low_cpu_mem_usage` yet" + with self.assertRaisesRegex(ValueError, err_msg): + model.load_adapter( + adapter_state_dict=dummy_state_dict, + adapter_name="other", + peft_config=peft_config, + low_cpu_mem_usage=True, + ) + def test_peft_from_pretrained_hub_kwargs(self): """ Tests different combinations of PEFT model + from_pretrained + hub kwargs diff --git a/tests/pipelines/test_pipelines_audio_classification.py b/tests/pipelines/test_pipelines_audio_classification.py index a8e36b4a07212e..10b8a859ff7fb3 100644 --- a/tests/pipelines/test_pipelines_audio_classification.py +++ b/tests/pipelines/test_pipelines_audio_classification.py @@ -13,7 +13,6 @@ # limitations under the License. 
import unittest -from dataclasses import fields import numpy as np from huggingface_hub import AudioClassificationOutputElement @@ -21,6 +20,7 @@ from transformers import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING from transformers.pipelines import AudioClassificationPipeline, pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_tf, @@ -37,9 +37,22 @@ class AudioClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): audio_classifier = AudioClassificationPipeline( - model=model, feature_extractor=processor, torch_dtype=torch_dtype + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, ) # test with a raw waveform @@ -68,10 +81,8 @@ def run_pipeline_test(self, audio_classifier, examples): self.run_torchaudio(audio_classifier) - spec_output_keys = {field.name for field in fields(AudioClassificationOutputElement)} for single_output in output: - output_keys = set(single_output.keys()) - self.assertEqual(spec_output_keys, output_keys, msg="Pipeline output keys do not match HF Hub spec!") + compare_pipeline_output_to_hub_spec(single_output, AudioClassificationOutputElement) @require_torchaudio def run_torchaudio(self, audio_classifier): diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index c12292fc3370d3..391005b0213709 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -18,7 +18,7 @@ import numpy as np import pytest from datasets import Audio, load_dataset -from huggingface_hub import hf_hub_download, snapshot_download +from huggingface_hub import AutomaticSpeechRecognitionOutput, hf_hub_download, snapshot_download from transformers import ( MODEL_FOR_CTC_MAPPING, @@ -36,6 +36,7 @@ from transformers.pipelines.audio_utils import chunk_bytes_iter from transformers.pipelines.automatic_speech_recognition import _find_timestamp_sequence, chunk_iter from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, is_torch_available, nested_simplify, @@ -66,14 +67,27 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): + (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): if tokenizer is None: # Side effect of no Fast Tokenizer class for these model, so skipping # But the slow tokenizer test should still run as they're quite small self.skipTest(reason="No tokenizer available") speech_recognizer = AutomaticSpeechRecognitionPipeline( - model=model, tokenizer=tokenizer, feature_extractor=processor, torch_dtype=torch_dtype + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + 
torch_dtype=torch_dtype, ) # test with a raw waveform @@ -86,6 +100,8 @@ def run_pipeline_test(self, speech_recognizer, examples): outputs = speech_recognizer(audio) self.assertEqual(outputs, {"text": ANY(str)}) + compare_pipeline_output_to_hub_spec(outputs, AutomaticSpeechRecognitionOutput) + # Striding audio = {"raw": audio, "stride": (0, 4000), "sampling_rate": speech_recognizer.feature_extractor.sampling_rate} if speech_recognizer.type == "ctc": @@ -1196,7 +1212,7 @@ def test_simple_whisper_translation(self): speech_recognizer_2 = AutomaticSpeechRecognitionPipeline( model=model, tokenizer=tokenizer, feature_extractor=feature_extractor ) - output_2 = speech_recognizer_2(ds[0]["audio"]) + output_2 = speech_recognizer_2(ds[40]["audio"]) self.assertEqual(output, output_2) # either use generate_kwargs or set the model's generation_config @@ -1208,7 +1224,7 @@ def test_simple_whisper_translation(self): feature_extractor=feature_extractor, generate_kwargs={"task": "transcribe", "language": "<|it|>"}, ) - output_3 = speech_translator(ds[0]["audio"]) + output_3 = speech_translator(ds[40]["audio"]) self.assertEqual(output_3, {"text": " Un uomo ha detto all'universo, Sir, esiste."}) @slow @@ -1880,15 +1896,15 @@ def test_slow_unfinished_sequence(self): model="vasista22/whisper-hindi-large-v2", device=torch_device, ) - # Original model wasn't trained with timestamps and has incorrect generation config - pipe.model.generation_config = GenerationConfig.from_pretrained("openai/whisper-large-v2") # the audio is 4 seconds long audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset") + # Original model wasn't trained with timestamps and has incorrect generation config out = pipe( audio, return_timestamps=True, + generate_kwargs={"generation_config": GenerationConfig.from_pretrained("openai/whisper-large-v2")}, ) self.assertEqual( out, diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py index 1f2700fa747c58..ce1f83f53e0285 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -14,11 +14,13 @@ import unittest +from huggingface_hub import DepthEstimationOutput from huggingface_hub.utils import insecure_hashlib from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available from transformers.pipelines import DepthEstimationPipeline, pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_tf, @@ -56,8 +58,23 @@ def hashimage(image: Image) -> str: class DepthEstimationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - depth_estimator = DepthEstimationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + depth_estimator = DepthEstimationPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return depth_estimator, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png", @@ -94,6 +111,9 @@ def run_pipeline_test(self, depth_estimator, examples): outputs, ) + for 
single_output in outputs: + compare_pipeline_output_to_hub_spec(single_output, DepthEstimationOutput) + @require_tf @unittest.skip(reason="Depth estimation is not implemented in TF") def test_small_model_tf(self): @@ -116,3 +136,23 @@ def test_large_model_pt(self): def test_small_model_pt(self): # This is highly irregular to have no small tests. self.skipTest(reason="There is not hf-internal-testing tiny model for either GLPN nor DPT") + + @require_torch + def test_multiprocess(self): + depth_estimator = pipeline( + model="hf-internal-testing/tiny-random-DepthAnythingForDepthEstimation", + num_workers=2, + ) + outputs = depth_estimator( + [ + "./tests/fixtures/tests_samples/COCO/000000039769.png", + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ] + ) + self.assertEqual( + [ + {"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)}, + {"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)}, + ], + outputs, + ) diff --git a/tests/pipelines/test_pipelines_document_question_answering.py b/tests/pipelines/test_pipelines_document_question_answering.py index 41a6a0c383f9b4..305fe0c558943c 100644 --- a/tests/pipelines/test_pipelines_document_question_answering.py +++ b/tests/pipelines/test_pipelines_document_question_answering.py @@ -15,7 +15,7 @@ import unittest from transformers import MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, AutoTokenizer, is_vision_available -from transformers.pipelines import pipeline +from transformers.pipelines import DocumentQuestionAnsweringPipeline, pipeline from transformers.pipelines.document_question_answering import apply_tesseract from transformers.testing_utils import ( is_pipeline_test, @@ -61,12 +61,21 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase): @require_pytesseract @require_vision - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - dqa_pipeline = pipeline( - "document-question-answering", + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + dqa_pipeline = DocumentQuestionAnsweringPipeline( model=model, tokenizer=tokenizer, - image_processor=processor, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, torch_dtype=torch_dtype, ) diff --git a/tests/pipelines/test_pipelines_feature_extraction.py b/tests/pipelines/test_pipelines_feature_extraction.py index c9169056ff97a7..bceb48d6ffa692 100644 --- a/tests/pipelines/test_pipelines_feature_extraction.py +++ b/tests/pipelines/test_pipelines_feature_extraction.py @@ -174,7 +174,15 @@ def get_shape(self, input_, shape=None): raise TypeError("We expect lists of floats, nothing else") return shape - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): if tokenizer is None: self.skipTest(reason="No tokenizer") elif ( @@ -193,10 +201,15 @@ def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): For now ignore those. 
""" ) - feature_extractor = FeatureExtractionPipeline( - model=model, tokenizer=tokenizer, feature_extractor=processor, torch_dtype=torch_dtype + feature_extractor_pipeline = FeatureExtractionPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, ) - return feature_extractor, ["This is a test", "This is another test"] + return feature_extractor_pipeline, ["This is a test", "This is another test"] def run_pipeline_test(self, feature_extractor, examples): outputs = feature_extractor("This is a test") diff --git a/tests/pipelines/test_pipelines_fill_mask.py b/tests/pipelines/test_pipelines_fill_mask.py index 81aa23563710f7..14061eaef7cf21 100644 --- a/tests/pipelines/test_pipelines_fill_mask.py +++ b/tests/pipelines/test_pipelines_fill_mask.py @@ -251,11 +251,26 @@ def test_model_no_pad_tf(self): unmasker.tokenizer.pad_token = None self.run_pipeline_test(unmasker, []) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): if tokenizer is None or tokenizer.mask_token_id is None: self.skipTest(reason="The provided tokenizer has no mask token, (probably reformer or wav2vec2)") - fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) + fill_masker = FillMaskPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) examples = [ f"This is another {tokenizer.mask_token} test", ] diff --git a/tests/pipelines/test_pipelines_image_classification.py b/tests/pipelines/test_pipelines_image_classification.py index 823c66c16f32c0..5f926dd2f07517 100644 --- a/tests/pipelines/test_pipelines_image_classification.py +++ b/tests/pipelines/test_pipelines_image_classification.py @@ -14,6 +14,8 @@ import unittest +from huggingface_hub import ImageClassificationOutputElement + from transformers import ( MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, @@ -23,6 +25,7 @@ ) from transformers.pipelines import ImageClassificationPipeline, pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_tf, @@ -55,9 +58,23 @@ class ImageClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): image_classifier = ImageClassificationPipeline( - model=model, image_processor=processor, top_k=2, torch_dtype=torch_dtype + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + top_k=2, ) examples = [ Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), @@ -121,6 +138,10 @@ def run_pipeline_test(self, image_classifier, examples): ], ) + for single_output in outputs: + for output_element in single_output: + compare_pipeline_output_to_hub_spec(output_element, ImageClassificationOutputElement) + @require_torch def 
test_small_model_pt(self): small_model = "hf-internal-testing/tiny-random-vit" diff --git a/tests/pipelines/test_pipelines_image_feature_extraction.py b/tests/pipelines/test_pipelines_image_feature_extraction.py index 07b27e7b6465bc..67140f91226d63 100644 --- a/tests/pipelines/test_pipelines_image_feature_extraction.py +++ b/tests/pipelines/test_pipelines_image_feature_extraction.py @@ -157,8 +157,16 @@ def test_return_tensors_tf(self): outputs = feature_extractor(img, return_tensors=True) self.assertTrue(tf.is_tensor(outputs)) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - if processor is None: + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + if image_processor is None: self.skipTest(reason="No image processor") elif type(model.config) in TOKENIZER_MAPPING: @@ -175,11 +183,16 @@ def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): """ ) - feature_extractor = ImageFeatureExtractionPipeline( - model=model, image_processor=processor, torch_dtype=torch_dtype + feature_extractor_pipeline = ImageFeatureExtractionPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, ) img = prepare_img() - return feature_extractor, [img, img] + return feature_extractor_pipeline, [img, img] def run_pipeline_test(self, feature_extractor, examples): imgs = examples diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index 523bd0b52b6823..44ed4e54534b26 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ b/tests/pipelines/test_pipelines_image_segmentation.py @@ -20,6 +20,7 @@ import numpy as np import requests from datasets import load_dataset +from huggingface_hub import ImageSegmentationOutputElement from huggingface_hub.utils import insecure_hashlib from transformers import ( @@ -36,6 +37,7 @@ pipeline, ) from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_tf, @@ -87,8 +89,23 @@ class ImageSegmentationPipelineTests(unittest.TestCase): + (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - image_segmenter = ImageSegmentationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + image_segmenter = ImageSegmentationPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return image_segmenter, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png", @@ -168,6 +185,10 @@ def run_pipeline_test(self, image_segmenter, examples): f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}", ) + for single_output in outputs: + for output_element in single_output: + compare_pipeline_output_to_hub_spec(output_element, ImageSegmentationOutputElement) + @require_tf @unittest.skip(reason="Image segmentation not implemented in TF") def test_small_model_tf(self): diff --git 
a/tests/pipelines/test_pipelines_image_to_text.py b/tests/pipelines/test_pipelines_image_to_text.py index 43a796da46df6c..0996b399b2882d 100644 --- a/tests/pipelines/test_pipelines_image_to_text.py +++ b/tests/pipelines/test_pipelines_image_to_text.py @@ -15,10 +15,12 @@ import unittest import requests +from huggingface_hub import ImageToTextOutput from transformers import MODEL_FOR_VISION_2_SEQ_MAPPING, TF_MODEL_FOR_VISION_2_SEQ_MAPPING, is_vision_available from transformers.pipelines import ImageToTextPipeline, pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, require_tf, require_torch, @@ -45,9 +47,22 @@ class ImageToTextPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING tf_model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): pipe = ImageToTextPipeline( - model=model, tokenizer=tokenizer, image_processor=processor, torch_dtype=torch_dtype + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, ) examples = [ Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), @@ -103,6 +118,9 @@ def test_small_model_tf(self): [{"generated_text": "growth"}], ) + for single_output in outputs: + compare_pipeline_output_to_hub_spec(single_output, ImageToTextOutput) + @require_torch def test_small_model_pt(self): pipe = pipeline("image-to-text", model="hf-internal-testing/tiny-random-vit-gpt2") diff --git a/tests/pipelines/test_pipelines_mask_generation.py b/tests/pipelines/test_pipelines_mask_generation.py index 50fcd676da50df..4adfff49e18be1 100644 --- a/tests/pipelines/test_pipelines_mask_generation.py +++ b/tests/pipelines/test_pipelines_mask_generation.py @@ -67,8 +67,23 @@ class MaskGenerationPipelineTests(unittest.TestCase): (list(TF_MODEL_FOR_MASK_GENERATION_MAPPING.items()) if TF_MODEL_FOR_MASK_GENERATION_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - image_segmenter = MaskGenerationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + image_segmenter = MaskGenerationPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return image_segmenter, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_object_detection.py b/tests/pipelines/test_pipelines_object_detection.py index f14e5e6b68d771..d58f3749ec3f25 100644 --- a/tests/pipelines/test_pipelines_object_detection.py +++ b/tests/pipelines/test_pipelines_object_detection.py @@ -14,6 +14,8 @@ import unittest +from huggingface_hub import ObjectDetectionOutputElement + from transformers import ( MODEL_FOR_OBJECT_DETECTION_MAPPING, AutoFeatureExtractor, @@ -22,7 +24,8 @@ is_vision_available, pipeline, ) -from transformers.testing_utils import ( +from transformers.testing_utils import ( # + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, 
require_pytesseract, @@ -53,8 +56,23 @@ def open(*args, **kwargs): class ObjectDetectionPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - object_detector = ObjectDetectionPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + object_detector = ObjectDetectionPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"] def run_pipeline_test(self, object_detector, examples): @@ -101,6 +119,7 @@ def run_pipeline_test(self, object_detector, examples): "box": {"xmin": ANY(int), "ymin": ANY(int), "xmax": ANY(int), "ymax": ANY(int)}, }, ) + compare_pipeline_output_to_hub_spec(detected_object, ObjectDetectionOutputElement) @require_tf @unittest.skip(reason="Object detection not implemented in TF") diff --git a/tests/pipelines/test_pipelines_question_answering.py b/tests/pipelines/test_pipelines_question_answering.py index 8b68989600ee1a..d06f88d1f08844 100644 --- a/tests/pipelines/test_pipelines_question_answering.py +++ b/tests/pipelines/test_pipelines_question_answering.py @@ -14,6 +14,8 @@ import unittest +from huggingface_hub import QuestionAnsweringOutputElement + from transformers import ( MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, @@ -23,6 +25,7 @@ from transformers.data.processors.squad import SquadExample from transformers.pipelines import QuestionAnsweringArgumentHandler, pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_tf, @@ -50,12 +53,27 @@ class QAPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): if isinstance(model.config, LxmertConfig): # This is an bimodal model, we need to find a more consistent way # to switch on those models. 
return None, None - question_answerer = QuestionAnsweringPipeline(model, tokenizer, torch_dtype=torch_dtype) + question_answerer = QuestionAnsweringPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) examples = [ {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, @@ -117,6 +135,8 @@ def run_pipeline_test(self, question_answerer, _): self.assertEqual( outputs, [{"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)} for i in range(20)] ) + for single_output in outputs: + compare_pipeline_output_to_hub_spec(single_output, QuestionAnsweringOutputElement) # Very long context require multiple features outputs = question_answerer( diff --git a/tests/pipelines/test_pipelines_summarization.py b/tests/pipelines/test_pipelines_summarization.py index fb1dce0ca3849e..465dba9743c648 100644 --- a/tests/pipelines/test_pipelines_summarization.py +++ b/tests/pipelines/test_pipelines_summarization.py @@ -32,8 +32,23 @@ class SummarizationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + summarizer = SummarizationPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return summarizer, ["(CNN)The Palestinian Authority officially became", "Some other text"] def run_pipeline_test(self, summarizer, _): diff --git a/tests/pipelines/test_pipelines_text2text_generation.py b/tests/pipelines/test_pipelines_text2text_generation.py index 52fb59edd36443..a41f41fadfe588 100644 --- a/tests/pipelines/test_pipelines_text2text_generation.py +++ b/tests/pipelines/test_pipelines_text2text_generation.py @@ -35,8 +35,23 @@ class Text2TextGenerationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + generator = Text2TextGenerationPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return generator, ["Something to write", "Something else"] def run_pipeline_test(self, generator, _): diff --git a/tests/pipelines/test_pipelines_text_classification.py b/tests/pipelines/test_pipelines_text_classification.py index 4956cb8aed132d..1f3b31b8583265 100644 --- a/tests/pipelines/test_pipelines_text_classification.py +++ b/tests/pipelines/test_pipelines_text_classification.py @@ -179,8 +179,23 @@ def test_tf_bert(self): outputs = text_classifier("Birds are a type of animal") self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 
0.988}]) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + text_classifier = TextClassificationPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return text_classifier, ["HuggingFace is in", "This is another test"] def run_pipeline_test(self, text_classifier, _): diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index d0091449e18cba..277c870b4d1074 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -377,8 +377,23 @@ def test_small_chat_model_tf(self): ], ) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + text_generator = TextGenerationPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return text_generator, ["This is a test", "Another test"] def test_stop_sequence_stopping_criteria(self): @@ -471,6 +486,7 @@ def run_pipeline_test(self, text_generator, _): "GPTNeoXForCausalLM", "GPTNeoXJapaneseForCausalLM", "FuyuForCausalLM", + "LlamaForCausalLM", ] if ( tokenizer.model_max_length < 10000 diff --git a/tests/pipelines/test_pipelines_text_to_audio.py b/tests/pipelines/test_pipelines_text_to_audio.py index 655fe5961b527d..e07e2ad392a3e6 100644 --- a/tests/pipelines/test_pipelines_text_to_audio.py +++ b/tests/pipelines/test_pipelines_text_to_audio.py @@ -250,8 +250,23 @@ def test_generative_model_kwargs(self): outputs = music_generator("This is a test", forward_params=forward_params, generate_kwargs=generate_kwargs) self.assertListEqual(outputs["audio"].tolist(), audio.tolist()) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): - speech_generator = TextToAudioPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + speech_generator = TextToAudioPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return speech_generator, ["This is a test", "Another test"] def run_pipeline_test(self, speech_generator, _): diff --git a/tests/pipelines/test_pipelines_token_classification.py b/tests/pipelines/test_pipelines_token_classification.py index 5e804bec199ab0..5e4b18f91699b5 100644 --- a/tests/pipelines/test_pipelines_token_classification.py +++ b/tests/pipelines/test_pipelines_token_classification.py @@ -61,8 +61,23 @@ class TokenClassificationPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, 
processor, torch_dtype="float32"): - token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): + token_classifier = TokenClassificationPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return token_classifier, ["A simple string", "A simple string that is quite a bit longer"] def run_pipeline_test(self, token_classifier, _): diff --git a/tests/pipelines/test_pipelines_translation.py b/tests/pipelines/test_pipelines_translation.py index c31ba49e7660a2..1d6e6a33b1a475 100644 --- a/tests/pipelines/test_pipelines_translation.py +++ b/tests/pipelines/test_pipelines_translation.py @@ -35,14 +35,36 @@ class TranslationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): if isinstance(model.config, MBartConfig): src_lang, tgt_lang = list(tokenizer.lang_code_to_id.keys())[:2] translator = TranslationPipeline( - model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang, torch_dtype=torch_dtype + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + src_lang=src_lang, + tgt_lang=tgt_lang, ) else: - translator = TranslationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) + translator = TranslationPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + ) return translator, ["Some string", "Some other text"] def run_pipeline_test(self, translator, _): diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py index 280d6990788ea1..8b910e94af3b47 100644 --- a/tests/pipelines/test_pipelines_video_classification.py +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -38,12 +38,26 @@ class VideoClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): example_video_filepath = hf_hub_download( repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset" ) video_classifier = VideoClassificationPipeline( - model=model, image_processor=processor, top_k=2, torch_dtype=torch_dtype + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + top_k=2, ) examples = [ example_video_filepath, diff --git a/tests/pipelines/test_pipelines_visual_question_answering.py b/tests/pipelines/test_pipelines_visual_question_answering.py index 45f935a62aaf38..3c79dc7faa1833 100644 --- a/tests/pipelines/test_pipelines_visual_question_answering.py +++ 
b/tests/pipelines/test_pipelines_visual_question_answering.py @@ -55,9 +55,19 @@ def open(*args, **kwargs): class VisualQuestionAnsweringPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): vqa_pipeline = pipeline( - "visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa", torch_dtype=torch_dtype + "visual-question-answering", + model="hf-internal-testing/tiny-vilt-random-vqa", + torch_dtype=torch_dtype, ) examples = [ { diff --git a/tests/pipelines/test_pipelines_zero_shot.py b/tests/pipelines/test_pipelines_zero_shot.py index 7c437b0a418da2..be5a59f55d0920 100644 --- a/tests/pipelines/test_pipelines_zero_shot.py +++ b/tests/pipelines/test_pipelines_zero_shot.py @@ -53,9 +53,23 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): classifier = ZeroShotClassificationPipeline( - model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"], torch_dtype=torch_dtype + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + torch_dtype=torch_dtype, + candidate_labels=["polics", "health"], ) return classifier, ["Who are you voting for in 2020?", "My stomach hurts."] diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.py b/tests/pipelines/test_pipelines_zero_shot_image_classification.py index b57adf609d1e09..bbeaeff3c17496 100644 --- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py @@ -14,9 +14,12 @@ import unittest +from huggingface_hub import ZeroShotImageClassificationOutputElement + from transformers import is_vision_available from transformers.pipelines import pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_tf, @@ -127,6 +130,9 @@ def test_small_model_pt(self, torch_dtype="float32"): ], ) + for single_output in output: + compare_pipeline_output_to_hub_spec(single_output, ZeroShotImageClassificationOutputElement) + @require_torch def test_small_model_pt_fp16(self): self.test_small_model_pt(torch_dtype="float16") diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.py b/tests/pipelines/test_pipelines_zero_shot_object_detection.py index 799c54dfbb87d7..48cdb9bd15ca53 100644 --- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py @@ -43,7 +43,15 @@ def open(*args, **kwargs): class ZeroShotObjectDetectionPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + torch_dtype="float32", + ): object_detector = pipeline( 
"zero-shot-object-detection", model="hf-internal-testing/tiny-random-owlvit-object-detection", diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index 58678ade57ffc4..780efe8aa4b465 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -21,6 +21,7 @@ from transformers.testing_utils import ( require_accelerate, require_auto_awq, + require_intel_extension_for_pytorch, require_torch_gpu, require_torch_multi_gpu, slow, @@ -490,3 +491,31 @@ def test_load_quantized_model(self): "TechxGenus/starcoder2-3b-AWQ", torch_dtype=torch.float16, device_map="cuda" ) self.assertTrue(isinstance(quantized_model.model.layers[0].mlp.act, ScaledActivation)) + + +@slow +@require_auto_awq +@require_accelerate +@require_intel_extension_for_pytorch +class AwqIPEXTest(unittest.TestCase): + def test_quantized_model_ipex(self): + """ + Simple test that checks if the quantized model is working properly with ipex backend + """ + quantization_config = AwqConfig(version="ipex") + + model = AutoModelForCausalLM.from_pretrained( + "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + quantization_config=quantization_config, + device_map="cpu", + ) + tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ") + input_ids = tokenizer.encode("How to make a cake", return_tensors="pt") + pad_token_id = tokenizer.eos_token_id + output = model.generate(input_ids, do_sample=False, max_length=20, pad_token_id=pad_token_id) + print(tokenizer.decode(output[0], skip_special_tokens=True)) + + expected_output = ( + "How to make a cake with a round tin?\nHow to make a cake with a round tin?\n1. Preheat the oven to 180°" + ) + self.assertIn(tokenizer.decode(output[0], skip_special_tokens=True), expected_output) diff --git a/tests/quantization/bitnet_integration/__init__.py b/tests/quantization/bitnet_integration/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/quantization/bitnet_integration/test_bitnet.py b/tests/quantization/bitnet_integration/test_bitnet.py new file mode 100644 index 00000000000000..ef71cc82dbf57f --- /dev/null +++ b/tests/quantization/bitnet_integration/test_bitnet.py @@ -0,0 +1,225 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import unittest + +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + BitNetConfig, + OPTForCausalLM, +) +from transformers.testing_utils import ( + require_accelerate, + require_torch_gpu, + slow, + torch_device, +) +from transformers.utils import is_accelerate_available, is_torch_available + + +if is_torch_available(): + import torch + +if is_accelerate_available(): + from accelerate import init_empty_weights + + +@require_torch_gpu +class BitNetConfigTest(unittest.TestCase): + def test_to_dict(self): + """ + Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object + """ + quantization_config = BitNetConfig() + config_to_dict = quantization_config.to_dict() + + for key in config_to_dict: + self.assertEqual(getattr(quantization_config, key), config_to_dict[key]) + + +@slow +@require_torch_gpu +@require_accelerate +class BitNetTest(unittest.TestCase): + model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens" + device = "cuda" + + # called only once for all test in this class + @classmethod + def setUpClass(cls): + """ + Load the model + """ + cls.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") + cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=cls.device) + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + gc.collect() + + def test_replace_with_bitlinear(self): + from transformers.integrations import BitLinear, replace_with_bitnet_linear + + model_id = "facebook/opt-350m" + config = AutoConfig.from_pretrained(model_id) + + with init_empty_weights(): + model = OPTForCausalLM(config) + + nb_linears = 0 + for module in model.modules(): + if isinstance(module, torch.nn.Linear): + nb_linears += 1 + + model = replace_with_bitnet_linear(model) + nb_bitnet_linear = 0 + for module in model.modules(): + if isinstance(module, BitLinear): + nb_bitnet_linear += 1 + + self.assertEqual(nb_linears - 1, nb_bitnet_linear) + + def test_quantized_model(self, quantized_model, tokenizer): + """ + Simple test that checks if the quantized model is working properly + """ + input_text = "What are we having for dinner?" + expected_output = "What are we having for dinner? What are we going to do for fun this weekend?" 
+ input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + + output = quantized_model.generate(**input_ids, max_new_tokens=11, do_sample=False) + self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), expected_output) + + def test_packing_unpacking(self): + """ + Simple test the packing and unpacking logic + """ + + from transformers.integrations import pack_weights, unpack_weights + + u = torch.randint(0, 255, (1024, 1024), dtype=torch.uint8) + unpacked_u = unpack_weights(u, dtype=torch.bfloat16) + self.assertEqual(pack_weights(unpacked_u), u) + + def test_activation_quant(self): + """ + test the activation function behaviour + """ + + from transformers.integrations import BitLinear + + layer = BitLinear(in_features=4, out_features=2, bias=False, dtype=torch.float32) + layer.to(self.device) + + input_tensor = torch.tensor([[1.0, -1.0, -1.0, 1.0], [1.0, -1.0, 1.0, 1.0]], dtype=torch.float32).to( + torch_device + ) + + # Quantize the input tensor + quantized_tensor, scale = layer.activation_quant(input_tensor) + + # Verify the output quantized tensor + self.assertEqual(quantized_tensor, input_tensor) + + # Verify the scale tensor + self.assertEqual(scale, 127) + + def test_weights_dtype(self): + """ + test the weights dtype after loading + """ + + self_attn_q = self.quantized_model.model.layers[0].self_attn.q_proj.weight + self_attn_k = self.quantized_model.model.layers[0].self_attn.k_proj.weight + self_attn_v = self.quantized_model.model.layers[0].self_attn.v_proj.weight + self_attn_o = self.quantized_model.model.layers[0].self_attn.o_proj.weight + mlp_gate = self.quantized_model.model.layers[0].mlp.gate_proj.weight + mlp_up = self.quantized_model.model.layers[0].mlp.up_proj.weight + mlp_down = self.quantized_model.model.layers[0].mlp.down_proj.weight + + self.assertEqual(self_attn_q.dtype, torch.uint8) + self.assertEqual(self_attn_k.dtype, torch.uint8) + self.assertEqual(self_attn_v.dtype, torch.uint8) + self.assertEqual(self_attn_o.dtype, torch.uint8) + self.assertEqual(mlp_up.dtype, torch.uint8) + self.assertEqual(mlp_gate.dtype, torch.uint8) + self.assertEqual(mlp_down.dtype, torch.uint8) + + def test_replace_with_bitlinear_shape(self): + """ + test that the BitNet layer weight shapes are correct, and the weight_scale is correctly initialized to 1 + """ + + from transformers.integrations import replace_with_bitnet_linear + + out_features = 1024 + in_features = 512 + + class SimpleLinearModule(torch.nn.Module): + """ + Simple class to test BitLinear + """ + + def __init__( + self, + in_features: int = in_features, + out_features: int = out_features, + bias: bool = False, + ): + super().__init__() + self.linear = torch.nn.Linear(in_features=in_features, out_features=out_features, bias=bias) + + def forward(self, x): + return self.linear(x) + + model = SimpleLinearModule() + replace_with_bitnet_linear(model) + + self.assertEqual(list(model.linear.weight.shape), [out_features // 4, in_features]) + self.assertEqual(model.linear.weight_scale, 1) + + +@slow +@require_torch_gpu +@require_accelerate +class BitNetSerializationTest(unittest.TestCase): + def test_model_serialization(self): + model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens" + device = "cuda" + quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device) + input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=device) + + with torch.no_grad(): + logits_ref = quantized_model.forward(input_tensor).logits + + # Save + saved_model_id = "quant_model" + 
quantized_model.save_pretrained(saved_model_id) + + # Remove old model + del quantized_model + torch.cuda.empty_cache() + + # Load and check if the logits match + model_loaded = AutoModelForCausalLM.from_pretrained("quant_model", device_map=device) + + with torch.no_grad(): + logits_loaded = model_loaded.forward(input_tensor).logits + + self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0) diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index 13e64677be5c42..3074a19828d25b 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -38,12 +38,22 @@ class GgufIntegrationTests(unittest.TestCase): imatrix_model_id = "duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF" mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF" qwen2_model_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF" - qwen2_moe_model_id = "RichardErkhov/Qwen_-_Qwen1.5-MoE-A2.7B-Chat-gguf" + qwen2moe_model_id = "gdax/Qwen1.5-MoE-A2.7B_gguf" + qwen2moe_original_model_id = "Qwen/Qwen1.5-MoE-A2.7B" llama3_model_id = "NousResearch/Meta-Llama-3-8B-GGUF" tinyllama_model_id = "PenutChen/TinyLlama-1.1B-Chat-v1.0-GGUF" phi3_model_id = "microsoft/Phi-3-mini-4k-instruct-gguf" bloom_model_id = "afrideva/bloom-560m-GGUF" original_bloom_model_id = "bigscience/bloom-560m" + falcon7b_model_id = "xaviviro/falcon-7b-quantized-gguf" + falcon40b_model_id = "maddes8cht/tiiuae-falcon-40b-gguf" + original_flacon7b_model_id = "tiiuae/falcon-7b" + stablelm_model_id = "afrideva/stablelm-3b-4e1t-GGUF" + stablelm2_model_id = "afrideva/stablelm-2-1_6b-GGUF" + original_stablelm2_model_id = "stabilityai/stablelm-2-1_6b" + gpt2_model_id = "mradermacher/gpt2-GGUF" + gpt2_original_model_id = "openai-community/gpt2" + gpt2_xl_model_id = "RichardErkhov/openai-community_-_gpt2-xl-gguf" # standard quants q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" @@ -55,6 +65,7 @@ class GgufIntegrationTests(unittest.TestCase): q4_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" q5_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf" q6_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf" + q4_k_m_stablelm_model_id = "stablelm-3b-4e1t.q4_k_m.gguf" # imatrix iq1_m_gguf_model_id = "TinyLlama-1.1B-Chat-v1.0-IQ1_M.gguf" iq1_s_gguf_model_id = "TinyLlama-1.1B-Chat-v1.0-IQ1_S.gguf" @@ -69,11 +80,19 @@ class GgufIntegrationTests(unittest.TestCase): q4_0_phi3_model_id = "Phi-3-mini-4k-instruct-q4.gguf" q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf" q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf" - q4_0_qwen2_moe_model_id = "Qwen1.5-MoE-A2.7B-Chat.Q4_0.gguf" + q8_qwen2moe_model_id = "Qwen1.5-MoE-A2.7B_Q8_0.gguf" q4_llama3_model_id = "Meta-Llama-3-8B-Q4_K_M.gguf" fp16_bloom_model_id = "bloom-560m.fp16.gguf" + fp16_stablelm2_model_id = "stablelm-2-1_6b.fp16.gguf" q8_bloom_model_id = "bloom-560m.q8_0.gguf" f16_tinyllama_model_id = "TinyLlama-1.1B-Chat-v1.0.FP16.gguf" + q2_k_falcon7b_model_id = "falcon-7b-q2_k.gguf" + fp16_falcon7b_model_id = "falcon-7b-fp16.gguf" + q2_k_falcon40b_model_id = "tiiuae-falcon-40b-Q2_K.gguf" + fp16_qwen2moe_model_id = "Qwen1.5-MoE-A2.7B.gguf" + fp16_gpt2_model_id = "gpt2.f16.gguf" + q8_gpt2_model_id = "gpt2.Q8_0.gguf" + q6_k_gpt2_xl_model_id = "gpt2-xl.Q6_K.gguf" example_text = "Hello" @@ -338,21 +357,39 @@ def test_qwen2_q4_0(self): EXPECTED_TEXT = "Hello.jsoup\n\nI am a beginner" self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) - def test_qwen2_moe_q4_0(self): - tokenizer = 
AutoTokenizer.from_pretrained(self.qwen2_moe_model_id, gguf_file=self.q4_0_qwen2_moe_model_id) + def test_qwen2moe_q8(self): + tokenizer = AutoTokenizer.from_pretrained(self.qwen2moe_model_id, gguf_file=self.q8_qwen2moe_model_id) model = AutoModelForCausalLM.from_pretrained( - self.qwen2_moe_model_id, - gguf_file=self.q4_0_qwen2_moe_model_id, - device_map="auto", + self.qwen2moe_model_id, + gguf_file=self.q8_qwen2moe_model_id, torch_dtype=torch.float16, ) - text = tokenizer(self.example_text, return_tensors="pt").to(torch_device) + text = tokenizer(self.example_text, return_tensors="pt") out = model.generate(**text, max_new_tokens=10) - EXPECTED_TEXT = "Hello everyone, I'm a newbie here and would like" + EXPECTED_TEXT = "Hello, I am a 20 year old male" self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_qwen2moe_weights_conversion_fp16(self): + quantized_model = AutoModelForCausalLM.from_pretrained( + self.qwen2moe_model_id, + gguf_file=self.fp16_qwen2moe_model_id, + torch_dtype=torch.float16, + ) + original_model = AutoModelForCausalLM.from_pretrained( + self.qwen2moe_original_model_id, + torch_dtype=torch.float16, + ) + + quantized_state_dict = quantized_model.state_dict() + original_state_dict = original_model.state_dict() + + for layer_name, original_params in original_state_dict.items(): + if layer_name in quantized_state_dict: + self.assertTrue(original_params.shape == quantized_state_dict[layer_name].shape) + torch.testing.assert_close(original_params, quantized_state_dict[layer_name]) + def test_phi3_q4_0(self): tokenizer = AutoTokenizer.from_pretrained(self.phi3_model_id, gguf_file=self.q4_0_phi3_model_id) model = AutoModelForCausalLM.from_pretrained( @@ -416,7 +453,7 @@ def test_bloom_q8_0(self): text = tokenizer(self.example_text, return_tensors="pt").to(torch_device) out = model.generate(**text, max_new_tokens=10) - EXPECTED_TEXT = "Hello, I just want to say that I am very" + EXPECTED_TEXT = "Hello, I just want to say that I am just" self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) def test_bloom_weights_conversion_fp16(self): @@ -445,6 +482,174 @@ def test_bloom_weights_conversion_fp16(self): self.assertTrue(quantized_param.shape == original_param.shape) torch.testing.assert_close(quantized_param, original_param) + def test_gpt2_q8(self): + tokenizer = AutoTokenizer.from_pretrained(self.gpt2_model_id, gguf_file=self.q8_gpt2_model_id) + model = AutoModelForCausalLM.from_pretrained( + self.gpt2_model_id, + gguf_file=self.q8_gpt2_model_id, + torch_dtype=torch.float16, + ) + + text = tokenizer(self.example_text, return_tensors="pt") + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello, I'm sorry. I'm sorry. 
I" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + def test_gpt2_weights_conversion_fp16(self): + quantized_model = AutoModelForCausalLM.from_pretrained( + self.gpt2_model_id, + gguf_file=self.fp16_gpt2_model_id, + torch_dtype=torch.float16, + ) + original_model = AutoModelForCausalLM.from_pretrained( + self.gpt2_original_model_id, + torch_dtype=torch.float16, + ) + + quantized_state_dict = quantized_model.state_dict() + original_state_dict = original_model.state_dict() + + for layer_name, original_params in original_state_dict.items(): + if layer_name in quantized_state_dict: + self.assertTrue(original_params.shape == quantized_state_dict[layer_name].shape) + torch.testing.assert_close(original_params, quantized_state_dict[layer_name]) + + def test_gpt2_xl_Q6_K(self): + tokenizer = AutoTokenizer.from_pretrained(self.gpt2_xl_model_id, gguf_file=self.q6_k_gpt2_xl_model_id) + model = AutoModelForCausalLM.from_pretrained( + self.gpt2_xl_model_id, + gguf_file=self.q6_k_gpt2_xl_model_id, + torch_dtype=torch.float16, + ) + + text = tokenizer(self.example_text, return_tensors="pt") + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello, I'm a newbie to the world of" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + @unittest.skip(reason="Heavy memory") + def test_falcon40b_q2_k(self): + tokenizer = AutoTokenizer.from_pretrained(self.falcon40b_model_id, gguf_file=self.q2_k_falcon40b_model_id) + model = AutoModelForCausalLM.from_pretrained( + self.falcon40b_model_id, + gguf_file=self.q2_k_falcon40b_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + + text = tokenizer(self.example_text, return_tensors="pt").to(torch_device) + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello All,\nI am new to this forum." + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + def test_falcon7b_q2_k(self): + tokenizer = AutoTokenizer.from_pretrained(self.falcon7b_model_id, gguf_file=self.q2_k_falcon7b_model_id) + model = AutoModelForCausalLM.from_pretrained( + self.falcon7b_model_id, + gguf_file=self.q2_k_falcon7b_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + + text = tokenizer(self.example_text, return_tensors="pt").to(torch_device) + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello All,\nI am new to this forum." 
+ self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + def test_falcon7b_weights_conversion_fp16(self): + quantized_model = AutoModelForCausalLM.from_pretrained( + self.falcon7b_model_id, + gguf_file=self.fp16_falcon7b_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + original_model = AutoModelForCausalLM.from_pretrained( + self.original_flacon7b_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + + quantized_state_dict = quantized_model.state_dict() + original_state_dict = original_model.state_dict() + + for layer_name, original_params in original_state_dict.items(): + if layer_name in quantized_state_dict: + self.assertTrue(original_params.shape == quantized_state_dict[layer_name].shape) + torch.testing.assert_close(original_params, quantized_state_dict[layer_name]) + + def test_stablelm_q4_k_m(self): + model = AutoModelForCausalLM.from_pretrained( + self.stablelm_model_id, + gguf_file=self.q4_k_m_stablelm_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + + tokenizer = AutoTokenizer.from_pretrained(self.stablelm_model_id, gguf_file=self.q4_k_m_stablelm_model_id) + text = tokenizer(self.example_text, return_tensors="pt").to(torch_device) + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello-\nI am trying to create a new user" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + def test_stablelm_fp16(self): + original_model = AutoModelForCausalLM.from_pretrained( + self.original_stablelm2_model_id, + torch_dtype=torch.float16, + ) + + converted_model = AutoModelForCausalLM.from_pretrained( + self.stablelm2_model_id, + gguf_file=self.fp16_stablelm2_model_id, + torch_dtype=torch.float16, + # for precise comparison it is required to use the original model config + # as quantized one is different in parameters: use_parallel_residual and use_qkv_bias + # and it highly influences on the output results + config=original_model.config, + ) + + tokenizer = AutoTokenizer.from_pretrained(self.stablelm2_model_id, gguf_file=self.fp16_stablelm2_model_id) + text = tokenizer(self.example_text, return_tensors="pt") + original_out = original_model.generate(**text, max_new_tokens=10) + converted_out = converted_model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello, I am a 20 year old male" + self.assertEqual(tokenizer.decode(converted_out[0], skip_special_tokens=True), EXPECTED_TEXT) + self.assertEqual( + tokenizer.decode(converted_out[0], skip_special_tokens=True), + tokenizer.decode(original_out[0], skip_special_tokens=True), + ) + + def test_stablelm_weights_conversion_fp16(self): + original_model = AutoModelForCausalLM.from_pretrained( + self.original_stablelm2_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + + converted_model = AutoModelForCausalLM.from_pretrained( + self.stablelm2_model_id, + gguf_file=self.fp16_stablelm2_model_id, + device_map="auto", + torch_dtype=torch.float16, + # for precise comparison it is required to use the original model config + # as quantized one is different in parameters: use_parallel_residual and use_qkv_bias + # and it highly influences on the output results + config=original_model.config, + ) + + converted_state_dict = converted_model.state_dict() + original_state_dict = original_model.state_dict() + + for layer_name, original_params in original_state_dict.items(): + if layer_name in converted_state_dict: + self.assertTrue(original_params.shape == converted_state_dict[layer_name].shape) + 
torch.testing.assert_close(original_params, converted_state_dict[layer_name]) + def test_tokenization_xnli(self): import tqdm from datasets import load_dataset diff --git a/tests/quantization/quanto_integration/test_quanto.py b/tests/quantization/quanto_integration/test_quanto.py index d8f4fffb8d2b56..08cc48d0cccd36 100644 --- a/tests/quantization/quanto_integration/test_quanto.py +++ b/tests/quantization/quanto_integration/test_quanto.py @@ -19,13 +19,13 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, QuantoConfig from transformers.testing_utils import ( require_accelerate, - require_quanto, + require_optimum_quanto, require_read_token, require_torch_gpu, slow, torch_device, ) -from transformers.utils import is_accelerate_available, is_quanto_available, is_torch_available +from transformers.utils import is_accelerate_available, is_optimum_quanto_available, is_torch_available if is_torch_available(): @@ -36,8 +36,8 @@ if is_accelerate_available(): from accelerate import init_empty_weights -if is_quanto_available(): - from quanto import QLayerNorm, QLinear +if is_optimum_quanto_available(): + from optimum.quanto import QLayerNorm, QLinear from transformers.integrations.quanto import replace_with_quanto_layers @@ -47,7 +47,7 @@ def test_attributes(self): pass -@require_quanto +@require_optimum_quanto @require_accelerate class QuantoTestIntegration(unittest.TestCase): model_id = "facebook/opt-350m" @@ -124,7 +124,7 @@ def test_conversion_with_modules_to_not_convert(self): @slow @require_torch_gpu -@require_quanto +@require_optimum_quanto @require_accelerate class QuantoQuantizationTest(unittest.TestCase): """ @@ -187,7 +187,7 @@ def test_generate_quality_cuda(self): self.check_inference_correctness(self.quantized_model, "cuda") def test_quantized_model_layers(self): - from quanto import QBitsTensor, QModuleMixin, QTensor + from optimum.quanto import QBitsTensor, QModuleMixin, QTensor """ Suite of simple test to check if the layers are quantized and are working properly @@ -256,7 +256,7 @@ def check_same_model(self, model1, model2): self.assertTrue(torch.equal(d0[k], d1[k].to(d0[k].device))) def test_compare_with_quanto(self): - from quanto import freeze, qint4, qint8, quantize + from optimum.quanto import freeze, qint4, qint8, quantize w_mapping = {"int8": qint8, "int4": qint4} model = AutoModelForCausalLM.from_pretrained( @@ -272,7 +272,7 @@ def test_compare_with_quanto(self): @unittest.skip def test_load_from_quanto_saved(self): - from quanto import freeze, qint4, qint8, quantize + from optimum.quanto import freeze, qint4, qint8, quantize from transformers import QuantoConfig @@ -356,7 +356,7 @@ def test_check_offload_quantized(self): """ We check that we have unquantized value in the cpu and in the disk """ - import quanto + from optimum.quanto import QBitsTensor, QTensor cpu_weights = self.quantized_model.transformer.h[22].self_attention.query_key_value._hf_hook.weights_map[ "weight" @@ -364,13 +364,11 @@ def test_check_offload_quantized(self): disk_weights = self.quantized_model.transformer.h[23].self_attention.query_key_value._hf_hook.weights_map[ "weight" ] - self.assertTrue(isinstance(cpu_weights, torch.Tensor) and not isinstance(cpu_weights, quanto.QTensor)) - self.assertTrue(isinstance(disk_weights, torch.Tensor) and not isinstance(disk_weights, quanto.QTensor)) + self.assertTrue(isinstance(cpu_weights, torch.Tensor) and not isinstance(cpu_weights, QTensor)) + self.assertTrue(isinstance(disk_weights, torch.Tensor) and not isinstance(disk_weights, 
QTensor)) if self.weights == "int4": - self.assertTrue(isinstance(cpu_weights, torch.Tensor) and not isinstance(disk_weights, quanto.QBitsTensor)) - self.assertTrue( - isinstance(disk_weights, torch.Tensor) and not isinstance(disk_weights, quanto.QBitsTensor) - ) + self.assertTrue(isinstance(cpu_weights, torch.Tensor) and not isinstance(disk_weights, QBitsTensor)) + self.assertTrue(isinstance(disk_weights, torch.Tensor) and not isinstance(disk_weights, QBitsTensor)) @unittest.skip(reason="Skipping test class because serialization is not supported yet") @@ -416,18 +414,18 @@ class QuantoQuantizationSerializationCudaTest(QuantoQuantizationTest): class QuantoQuantizationQBitsTensorTest(QuantoQuantizationTest): - EXPECTED_OUTPUTS = "Hello my name is Nils, I am a student of the University" + EXPECTED_OUTPUTS = "Hello my name is John, I am a professional photographer, I" weights = "int4" class QuantoQuantizationQBitsTensorOffloadTest(QuantoQuantizationOffloadTest): - EXPECTED_OUTPUTS = "Hello my name is Nils, I am a student of the University" + EXPECTED_OUTPUTS = "Hello my name is John, I am a professional photographer, I" weights = "int4" @unittest.skip(reason="Skipping test class because serialization is not supported yet") class QuantoQuantizationQBitsTensorSerializationTest(QuantoQuantizationSerializationTest): - EXPECTED_OUTPUTS = "Hello my name is Nils, I am a student of the University" + EXPECTED_OUTPUTS = "Hello my name is John, I am a professional photographer, I" weights = "int4" @@ -443,14 +441,14 @@ def test_quantize_activation(self): self.assertIn("We don't support quantizing the activations with transformers library", str(e.exception)) -@require_quanto +@require_optimum_quanto @require_torch_gpu class QuantoKVCacheQuantizationTest(unittest.TestCase): @slow @require_read_token def test_quantized_cache(self): EXPECTED_TEXT_COMPLETION = [ - "Simply put, the theory of relativity states that 1) the speed of light is the same for all observers, and 2) the laws of physics are the same for all observers.\nThe first part of the theory of relativity", + "Simply put, the theory of relativity states that 1) the speed of light is the same for all observers, and 2) the laws of physics are the same for all observers.\nThe first part of the theory is the most", "My favorite all time favorite condiment is ketchup. I love it on everything. 
I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", ] diff --git a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py index b050b1ca5a6c1b..2e3d9fdc7f2f33 100644 --- a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py +++ b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py @@ -455,7 +455,7 @@ def compute_metrics(p: EvalPrediction): train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, ) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 4d96b229284089..fa4a35391baf25 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -25,6 +25,7 @@ import time import warnings from collections import defaultdict +from contextlib import contextmanager from typing import Dict, List, Tuple import numpy as np @@ -45,6 +46,12 @@ logging, set_seed, ) +from transformers.integrations import HfDeepSpeedConfig +from transformers.integrations.deepspeed import ( + is_deepspeed_available, + is_deepspeed_zero3_enabled, + unset_hf_deepspeed_config, +) from transformers.models.auto import get_values from transformers.models.auto.modeling_auto import ( MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, @@ -75,6 +82,7 @@ is_pt_tf_cross_test, require_accelerate, require_bitsandbytes, + require_deepspeed, require_flash_attn, require_non_xpu, require_read_token, @@ -134,6 +142,9 @@ if is_torch_fx_available(): from transformers.utils.fx import _FX_SUPPORTED_MODELS_WITH_KV_CACHE, symbolic_trace +if is_deepspeed_available(): + import deepspeed + def _config_zero_init(config): configs_no_init = copy.deepcopy(config) @@ -171,6 +182,15 @@ def _mock_all_init_weights(self): self.tie_weights() +@contextmanager +def _deepspeed_zero3(ds_config): + dschf = HfDeepSpeedConfig(ds_config) + try: + yield dschf + finally: + unset_hf_deepspeed_config() + + @require_torch class ModelTesterMixin: model_tester = None @@ -1797,8 +1817,13 @@ def test_resize_tokens_embeddings(self): for model_class in self.all_model_classes: config = copy.deepcopy(original_config) - model = model_class(config) - model.to(torch_device) + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.Init(): + model = model_class(config) + else: + model = model_class(config) + model.to(torch_device) + model_embed_pre_resize = model.get_input_embeddings() type_model_embed_pre_resize = type(model_embed_pre_resize) @@ -1813,15 +1838,27 @@ def test_resize_tokens_embeddings(self): # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size model_embed = model.resize_token_embeddings(model_vocab_size + 10) new_model_vocab_size = model.config.get_text_config().vocab_size - self.assertEqual(new_model_vocab_size, model_vocab_size + 10) # Check that it actually resizes the embeddings matrix self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) # Check to make sure the type of embeddings returned post resizing is same as type of input type_model_embed_post_resize = type(model_embed) self.assertEqual(type_model_embed_pre_resize, type_model_embed_post_resize) + # Check that added embeddings mean is close to the old embeddings mean + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.GatheredParameters(model_embed.weight, 
modifier_rank=None): + old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0) + new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0) + else: + old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0) + new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0) + torch.testing.assert_close(old_embeddings_mean, new_embeddings_mean, atol=1e-3, rtol=1e-1) + # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) + if not is_deepspeed_zero3_enabled(): + # A distributed launcher is needed for the forward pass when deepspeed is enabled + model_inputs = self._prepare_for_class(inputs_dict, model_class) + model(**model_inputs) # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size model_embed = model.resize_token_embeddings(model_vocab_size - 15) @@ -1835,9 +1872,12 @@ def test_resize_tokens_embeddings(self): inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1) # make sure that decoder_input_ids are resized as well - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) - model(**self._prepare_for_class(inputs_dict, model_class)) + if not is_deepspeed_zero3_enabled(): + # A distributed launcher is needed for the forward pass when deepspeed is enabled + if "decoder_input_ids" in inputs_dict: + inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) + model_inputs = self._prepare_for_class(inputs_dict, model_class) + model(**model_inputs) # Check that adding and removing tokens has not modified the first part of the embedding matrix. models_equal = True @@ -1847,9 +1887,16 @@ def test_resize_tokens_embeddings(self): self.assertTrue(models_equal) + del model + del config + # Copy again. config changed with embedding resizing (`vocab_size` changed) config = copy.deepcopy(original_config) - model = model_class(config) - model.to(torch_device) + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.Init(): + model = model_class(config) + else: + model = model_class(config) + model.to(torch_device) model_vocab_size = config.get_text_config().vocab_size model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1) @@ -1877,6 +1924,67 @@ def test_resize_tokens_embeddings(self): ): model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3) + # Test when `vocab_size` is smaller than `hidden_size`. + del model + del config + # Copy again.
config changed with embedding resizing (`vocab_size` changed) + config = copy.deepcopy(original_config) + config.vocab_size = 4 + config.pad_token_id = 3 + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.Init(): + model = model_class(config) + else: + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.get_text_config().vocab_size + # Retrieve the embeddings and clone them + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + new_model_vocab_size = model.config.get_text_config().vocab_size + self.assertEqual(new_model_vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check to make sure the type of embeddings returned post resizing is same as type of input + type_model_embed_post_resize = type(model_embed) + self.assertEqual(type_model_embed_pre_resize, type_model_embed_post_resize) + # Check that added embeddings mean is close to the old embeddings mean + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.GatheredParameters(model_embed.weight, modifier_rank=None): + old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0) + new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0) + else: + old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0) + new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0) + torch.testing.assert_close(old_embeddings_mean, new_embeddings_mean, atol=1e-3, rtol=1e-1) + + @require_deepspeed + @require_torch_gpu + def test_resize_tokens_embeddings_with_deepspeed(self): + ds_config = { + "zero_optimization": { + "stage": 3, + "offload_param": {"device": "cpu", "pin_memory": True}, + }, + } + with _deepspeed_zero3(ds_config): + self.test_resize_tokens_embeddings() + + @require_deepspeed + @require_torch_multi_gpu + def test_resize_tokens_embeddings_with_deepspeed_multi_gpu(self): + ds_config = { + "zero_optimization": { + "stage": 3, + }, + } + with _deepspeed_zero3(ds_config): + self.test_resize_tokens_embeddings() + def test_resize_embeddings_untied(self): if not self.test_resize_embeddings: self.skipTest(reason="test_resize_embeddings is set to `False`") @@ -1890,7 +1998,11 @@ def test_resize_embeddings_untied(self): for model_class in self.all_model_classes: config = copy.deepcopy(original_config) - model = model_class(config).to(torch_device) + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.Init(): + model = model_class(config) + else: + model = model_class(config).to(torch_device) # if no output embeddings -> leave test if model.get_output_embeddings() is None: @@ -1907,7 +2019,33 @@ def test_resize_embeddings_untied(self): if output_embeds.bias is not None: self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) + if not is_deepspeed_zero3_enabled(): + # A distributed launcher is needed for the forward pass when deepspeed is enabled + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Test multivariate resizing.
+ model.resize_token_embeddings(model_vocab_size + 10) + output_embeds = model.get_output_embeddings() + # Check that added embeddings mean is close to the old embeddings mean + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.GatheredParameters(output_embeds.weight, modifier_rank=None): + old_embeddings_mean = torch.mean(output_embeds.weight.data[:-10, :], axis=0) + new_embeddings_mean = torch.mean(output_embeds.weight.data[-10:, :], axis=0) + else: + old_embeddings_mean = torch.mean(output_embeds.weight.data[:-10, :], axis=0) + new_embeddings_mean = torch.mean(output_embeds.weight.data[-10:, :], axis=0) + torch.testing.assert_close(old_embeddings_mean, new_embeddings_mean, atol=1e-3, rtol=1e-1) + # check if the old bias mean is close to the added bias mean. + if output_embeds.bias is not None: + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.GatheredParameters(output_embeds.bias, modifier_rank=None): + old_bias_mean = torch.mean(output_embeds.bias.data[:-10], axis=0) + new_bias_mean = torch.mean(output_embeds.bias.data[-10:], axis=0) + else: + old_bias_mean = torch.mean(output_embeds.bias.data[:-10], axis=0) + new_bias_mean = torch.mean(output_embeds.bias.data[-10:], axis=0) + + torch.testing.assert_close(old_bias_mean, new_bias_mean, atol=1e-5, rtol=1e-2) # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size model.resize_token_embeddings(model_vocab_size - 15) @@ -1925,7 +2063,32 @@ def test_resize_embeddings_untied(self): if "decoder_input_ids" in inputs_dict: inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) + if not is_deepspeed_zero3_enabled(): + # A distributed launcher is needed for the forward pass when deepspeed is enabled + model(**self._prepare_for_class(inputs_dict, model_class)) + + @require_deepspeed + @require_torch_gpu + def test_resize_embeddings_untied_with_deepspeed(self): + ds_config = { + "zero_optimization": { + "stage": 3, + "offload_param": {"device": "cpu", "pin_memory": True}, + }, + } + with _deepspeed_zero3(ds_config): + self.test_resize_embeddings_untied() + + @require_deepspeed + @require_torch_multi_gpu + def test_resize_embeddings_untied_with_deepspeed_multi_gpu(self): + ds_config = { + "zero_optimization": { + "stage": 3, + }, + } + with _deepspeed_zero3(ds_config): + self.test_resize_embeddings_untied() def test_model_get_set_embeddings(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -2837,7 +3000,7 @@ def test_inputs_embeds_matches_input_ids(self): def test_inputs_embeds_matches_input_ids_with_generate(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: + for model_class in self.all_generative_model_classes: if model_class.__name__ not in get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES): continue model = model_class(config) @@ -2884,7 +3047,10 @@ def test_inputs_embeds_matches_input_ids_with_generate(self): **inputs, max_new_tokens=2, ) - self.assertTrue(torch.allclose(out_embeds, out_ids)) + # NOTE: this test changes the order of FP ops, there may be tiny differences in the output + number_of_different_tokens = (out_ids != out_embeds).sum() + max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1) + self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch
@require_non_xpu @require_torch_multi_gpu @@ -4749,7 +4915,7 @@ def test_static_cache_matches_dynamic(self): output_logits=True, return_dict_in_generate=True, ) - self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-3)) + self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4)) # For now, Let's focus only on GPU for `torch.compile` @slow @@ -4782,6 +4948,49 @@ def test_torch_compile(self): for i in range(n_iter): _ = model.generate(**input_ids, do_sample=False) + @slow + @require_torch_gpu + def test_torch_compile_for_training(self): + if version.parse(torch.__version__) < version.parse("2.3"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + if not hasattr(self, "_torch_compile_train_cls"): + self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_train_cls`.") + + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + cls = self._torch_compile_train_cls + model = cls(config).to(torch_device) + + inputs = { + "input_ids": torch.randint(low=1, high=model.config.vocab_size, size=(2, 10), device=torch_device), + "attention_mask": torch.tensor( + [[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], + dtype=torch.int64, + device=torch_device, + ), + "position_ids": torch.arange(0, 10, device=torch_device).unsqueeze(0), + "labels": torch.randint(low=1, high=model.config.vocab_size, size=(2, 10), device=torch_device), + } + + # eager backward + set_seed(42) + loss = model(**inputs).loss + loss.backward() + + params = {name: param.grad.clone().detach().cpu() for name, param in model.named_parameters()} + model.zero_grad() + del loss + + model = torch.compile(model, fullgraph=True, mode="reduce-overhead") + # forward compilation + set_seed(42) + loss = model(**inputs).loss + # backward compilation + loss.backward() + # check grad matches + for name, param in model._orig_mod.named_parameters(): + torch.testing.assert_close(param.grad.detach().cpu(), params[name], rtol=1e-4, atol=1e-4) + @slow @require_torch_gpu # Testing cuda graphs. 
@require_read_token diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 2cf272f4aac10d..eb328d83e9e7a4 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -1715,10 +1715,9 @@ def test_dataset_conversion(self): model.train_on_batch(test_batch, test_batch_labels) def _test_xla_generate(self, **generate_kwargs): - def _generate_and_check_results(model, inputs_dict): - if "input_ids" in inputs_dict: - inputs = inputs_dict["input_ids"] - # make sure there are no pad tokens in prompt, which may trigger unwanted behavior + def _generate_and_check_results(model, inputs, is_input_ids): + # make sure there are no pad tokens in prompt, which may trigger unwanted behavior + if is_input_ids: if model.generation_config.pad_token_id is not None: if config.pad_token_id == 0: new_pad_token = model.generation_config.pad_token_id + 1 @@ -1727,10 +1726,6 @@ def _generate_and_check_results(model, inputs_dict): else: new_pad_token = None inputs = tf.where(inputs != model.generation_config.pad_token_id, inputs, new_pad_token) - elif "input_features" in inputs_dict: - inputs = inputs_dict["input_features"] - else: - raise ValueError("No valid generate input found in inputs_dict") generated = model.generate(inputs, **generate_kwargs).numpy() generate_xla = tf.function(model.generate, jit_compile=True) @@ -1753,12 +1748,20 @@ def _generate_and_check_results(model, inputs_dict): config.eos_token_id = None # Generate until max length config.do_sample = False + # extract the input to the model + is_input_ids = "input_ids" in inputs_dict + is_input_features = "input_features" in inputs_dict + if not (is_input_ids or is_input_features): + raise ValueError("No valid generate input found in inputs_dict") + inputs = inputs_dict["input_ids"] if is_input_ids else inputs_dict["input_features"] + # fix config for models with additional sequence-length limiting settings + seq_len = inputs.get_shape()[1] for var_name in ["max_position_embeddings", "max_target_positions"]: attr = getattr(config, var_name, None) - if attr is not None and attr < generate_kwargs["max_new_tokens"]: + if attr is not None and attr < seq_len + generate_kwargs["max_new_tokens"]: try: - setattr(config, var_name, generate_kwargs["max_new_tokens"]) + setattr(config, var_name, seq_len + generate_kwargs["max_new_tokens"]) except NotImplementedError: # xlnet will raise an exception when trying to set # max_position_embeddings. 
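For context, the TF change above only restructures how the generate input is extracted; the core of `_test_xla_generate` is still an XLA-vs-eager consistency check. A hedged, standalone sketch of that check follows; it assumes a tiny GPT-2 checkpoint with TF weights is reachable (otherwise `from_pt=True` would be needed).

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM

checkpoint = "hf-internal-testing/tiny-random-gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForCausalLM.from_pretrained(checkpoint)

input_ids = tokenizer("The quick brown fox", return_tensors="tf").input_ids

# eager generation as the reference
eager_tokens = model.generate(input_ids, max_new_tokens=8, do_sample=False).numpy()

# same call, compiled with XLA
xla_generate = tf.function(model.generate, jit_compile=True)
xla_tokens = xla_generate(input_ids, max_new_tokens=8, do_sample=False).numpy()

assert (eager_tokens == xla_tokens).all()
```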
@@ -1767,10 +1770,10 @@ def _generate_and_check_results(model, inputs_dict): model = model_class(config) if model.supports_xla_generation: - _generate_and_check_results(model, inputs_dict) + _generate_and_check_results(model, inputs, is_input_ids) else: with self.assertRaises(ValueError): - _generate_and_check_results(model, inputs_dict) + _generate_and_check_results(model, inputs, is_input_ids) def test_xla_generate_fast(self): """ diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index 74e685fb112c29..cae285f5f15727 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -25,9 +25,30 @@ from textwrap import dedent from typing import get_args -from huggingface_hub import AudioClassificationInput +from huggingface_hub import ( + AudioClassificationInput, + AutomaticSpeechRecognitionInput, + DepthEstimationInput, + ImageClassificationInput, + ImageSegmentationInput, + ImageToTextInput, + ObjectDetectionInput, + QuestionAnsweringInput, + ZeroShotImageClassificationInput, +) -from transformers.pipelines import AudioClassificationPipeline +from transformers.models.auto.processing_auto import PROCESSOR_MAPPING_NAMES +from transformers.pipelines import ( + AudioClassificationPipeline, + AutomaticSpeechRecognitionPipeline, + DepthEstimationPipeline, + ImageClassificationPipeline, + ImageSegmentationPipeline, + ImageToTextPipeline, + ObjectDetectionPipeline, + QuestionAnsweringPipeline, + ZeroShotImageClassificationPipeline, +) from transformers.testing_utils import ( is_pipeline_test, require_decord, @@ -104,6 +125,14 @@ # Adding a task to this list will cause its pipeline input signature to be checked against the corresponding # task spec in the HF Hub "audio-classification": (AudioClassificationPipeline, AudioClassificationInput), + "automatic-speech-recognition": (AutomaticSpeechRecognitionPipeline, AutomaticSpeechRecognitionInput), + "depth-estimation": (DepthEstimationPipeline, DepthEstimationInput), + "image-classification": (ImageClassificationPipeline, ImageClassificationInput), + "image-segmentation": (ImageSegmentationPipeline, ImageSegmentationInput), + "image-to-text": (ImageToTextPipeline, ImageToTextInput), + "object-detection": (ObjectDetectionPipeline, ObjectDetectionInput), + "question-answering": (QuestionAnsweringPipeline, QuestionAnsweringInput), + "zero-shot-image-classification": (ZeroShotImageClassificationPipeline, ZeroShotImageClassificationInput), } for task, task_info in pipeline_test_mapping.items(): @@ -158,11 +187,14 @@ def run_task_tests(self, task, torch_dtype="float32"): model_architectures = self.pipeline_model_mapping[task] if not isinstance(model_architectures, tuple): model_architectures = (model_architectures,) - if not isinstance(model_architectures, tuple): - raise TypeError(f"`model_architectures` must be a tuple. 
Got {type(model_architectures)} instead.") + + # We are going to run tests for multiple model architectures, some of them might be skipped + # with this flag we are control if at least one model were tested or all were skipped + at_least_one_model_is_tested = False for model_architecture in model_architectures: model_arch_name = model_architecture.__name__ + model_type = model_architecture.config_class.model_type # Get the canonical name for _prefix in ["Flax", "TF"]: @@ -170,31 +202,70 @@ def run_task_tests(self, task, torch_dtype="float32"): model_arch_name = model_arch_name[len(_prefix) :] break - tokenizer_names = [] - processor_names = [] + if model_arch_name not in tiny_model_summary: + continue + + tokenizer_names = tiny_model_summary[model_arch_name]["tokenizer_classes"] + + # Sort image processors and feature extractors from tiny-models json file + image_processor_names = [] + feature_extractor_names = [] + + processor_classes = tiny_model_summary[model_arch_name]["processor_classes"] + for cls_name in processor_classes: + if "ImageProcessor" in cls_name: + image_processor_names.append(cls_name) + elif "FeatureExtractor" in cls_name: + feature_extractor_names.append(cls_name) + + # Processor classes are not in tiny models JSON file, so extract them from the mapping + # processors are mapped to instance, e.g. "XxxProcessor" + processor_names = PROCESSOR_MAPPING_NAMES.get(model_type, None) + if not isinstance(processor_names, (list, tuple)): + processor_names = [processor_names] + commit = None - if model_arch_name in tiny_model_summary: - tokenizer_names = tiny_model_summary[model_arch_name]["tokenizer_classes"] - processor_names = tiny_model_summary[model_arch_name]["processor_classes"] - if "sha" in tiny_model_summary[model_arch_name]: - commit = tiny_model_summary[model_arch_name]["sha"] - # Adding `None` (if empty) so we can generate tests - tokenizer_names = [None] if len(tokenizer_names) == 0 else tokenizer_names - processor_names = [None] if len(processor_names) == 0 else processor_names + if model_arch_name in tiny_model_summary and "sha" in tiny_model_summary[model_arch_name]: + commit = tiny_model_summary[model_arch_name]["sha"] repo_name = f"tiny-random-{model_arch_name}" if TRANSFORMERS_TINY_MODEL_PATH != "hf-internal-testing": repo_name = model_arch_name self.run_model_pipeline_tests( - task, repo_name, model_architecture, tokenizer_names, processor_names, commit, torch_dtype + task, + repo_name, + model_architecture, + tokenizer_names=tokenizer_names, + image_processor_names=image_processor_names, + feature_extractor_names=feature_extractor_names, + processor_names=processor_names, + commit=commit, + torch_dtype=torch_dtype, ) + at_least_one_model_is_tested = True + if task in task_to_pipeline_and_spec_mapping: pipeline, hub_spec = task_to_pipeline_and_spec_mapping[task] compare_pipeline_args_to_hub_spec(pipeline, hub_spec) + if not at_least_one_model_is_tested: + self.skipTest( + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not find any " + f"model architecture in the tiny models JSON file for `{task}`." 
+ ) + def run_model_pipeline_tests( - self, task, repo_name, model_architecture, tokenizer_names, processor_names, commit, torch_dtype="float32" + self, + task, + repo_name, + model_architecture, + tokenizer_names, + image_processor_names, + feature_extractor_names, + processor_names, + commit, + torch_dtype="float32", ): """Run pipeline tests for a specific `task` with the give model class and tokenizer/processor class names @@ -207,8 +278,12 @@ def run_model_pipeline_tests( A subclass of `PretrainedModel` or `PretrainedModel`. tokenizer_names (`List[str]`): A list of names of a subclasses of `PreTrainedTokenizerFast` or `PreTrainedTokenizer`. + image_processor_names (`List[str]`): + A list of names of subclasses of `BaseImageProcessor`. + feature_extractor_names (`List[str]`): + A list of names of subclasses of `FeatureExtractionMixin`. processor_names (`List[str]`): - A list of names of subclasses of `BaseImageProcessor` or `FeatureExtractionMixin`. + A list of names of subclasses of `ProcessorMixin`. commit (`str`): The commit hash of the model repository on the Hub. torch_dtype (`str`, `optional`, defaults to `'float32'`): @@ -218,27 +293,73 @@ def run_model_pipeline_tests( # `run_pipeline_test`. pipeline_test_class_name = pipeline_test_mapping[task]["test"].__name__ - for tokenizer_name in tokenizer_names: - for processor_name in processor_names: - if self.is_pipeline_test_to_skip( - pipeline_test_class_name, - model_architecture.config_class, - model_architecture, - tokenizer_name, - processor_name, - ): - logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " - f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer " - f"`{tokenizer_name}` | processor `{processor_name}`." - ) - continue - self.run_pipeline_test( - task, repo_name, model_architecture, tokenizer_name, processor_name, commit, torch_dtype + # If no image processor or feature extractor is found, we still need to test the pipeline with None + # otherwise for any empty list we might skip all the tests + tokenizer_names = tokenizer_names or [None] + image_processor_names = image_processor_names or [None] + feature_extractor_names = feature_extractor_names or [None] + processor_names = processor_names or [None] + + test_cases = [ + { + "tokenizer_name": tokenizer_name, + "image_processor_name": image_processor_name, + "feature_extractor_name": feature_extractor_name, + "processor_name": processor_name, + } + for tokenizer_name in tokenizer_names + for image_processor_name in image_processor_names + for feature_extractor_name in feature_extractor_names + for processor_name in processor_names + ] + + for test_case in test_cases: + tokenizer_name = test_case["tokenizer_name"] + image_processor_name = test_case["image_processor_name"] + feature_extractor_name = test_case["feature_extractor_name"] + processor_name = test_case["processor_name"] + + do_skip_test_case = self.is_pipeline_test_to_skip( + pipeline_test_class_name, + model_architecture.config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ) + + if do_skip_test_case: + logger.warning( + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " + f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer " + f"`{tokenizer_name}` | image processor `{image_processor_name}` | feature extractor {feature_extractor_name}." 
) + continue + + self.run_pipeline_test( + task, + repo_name, + model_architecture, + tokenizer_name=tokenizer_name, + image_processor_name=image_processor_name, + feature_extractor_name=feature_extractor_name, + processor_name=processor_name, + commit=commit, + torch_dtype=torch_dtype, + ) def run_pipeline_test( - self, task, repo_name, model_architecture, tokenizer_name, processor_name, commit, torch_dtype="float32" + self, + task, + repo_name, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + commit, + torch_dtype="float32", ): """Run pipeline tests for a specific `task` with the give model class and tokenizer/processor class name @@ -253,67 +374,86 @@ def run_pipeline_test( A subclass of `PretrainedModel` or `PretrainedModel`. tokenizer_name (`str`): The name of a subclass of `PreTrainedTokenizerFast` or `PreTrainedTokenizer`. + image_processor_name (`str`): + The name of a subclass of `BaseImageProcessor`. + feature_extractor_name (`str`): + The name of a subclass of `FeatureExtractionMixin`. processor_name (`str`): - The name of a subclass of `BaseImageProcessor` or `FeatureExtractionMixin`. + The name of a subclass of `ProcessorMixin`. commit (`str`): The commit hash of the model repository on the Hub. torch_dtype (`str`, `optional`, defaults to `'float32'`): The torch dtype to use for the model. Can be used for FP16/other precision inference. """ repo_id = f"{TRANSFORMERS_TINY_MODEL_PATH}/{repo_name}" + model_type = model_architecture.config_class.model_type + if TRANSFORMERS_TINY_MODEL_PATH != "hf-internal-testing": - model_type = model_architecture.config_class.model_type repo_id = os.path.join(TRANSFORMERS_TINY_MODEL_PATH, model_type, repo_name) + # -------------------- Load model -------------------- + + # TODO: We should check if a model file is on the Hub repo. instead. + try: + model = model_architecture.from_pretrained(repo_id, revision=commit) + except Exception: + logger.warning( + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not find or load " + f"the model from `{repo_id}` with `{model_architecture}`." + ) + self.skipTest(f"Could not find or load the model from {repo_id} with {model_architecture}.") + + # -------------------- Load tokenizer -------------------- + tokenizer = None if tokenizer_name is not None: tokenizer_class = getattr(transformers_module, tokenizer_name) tokenizer = tokenizer_class.from_pretrained(repo_id, revision=commit) - processor = None - if processor_name is not None: - processor_class = getattr(transformers_module, processor_name) - # If the required packages (like `Pillow` or `torchaudio`) are not installed, this will fail. - try: - processor = processor_class.from_pretrained(repo_id, revision=commit) - except Exception: - logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not load the " - f"processor from `{repo_id}` with `{processor_name}`." 
- ) - self.skipTest(f"Could not load the processor from {repo_id} with {processor_name}.") + # -------------------- Load processors -------------------- + + processors = {} + for key, name in zip( + ["image_processor", "feature_extractor", "processor"], + [image_processor_name, feature_extractor_name, processor_name], + ): + if name is not None: + try: + # Can fail if some extra dependencies are not installed + processor_class = getattr(transformers_module, name) + processor = processor_class.from_pretrained(repo_id, revision=commit) + processors[key] = processor + except Exception: + logger.warning( + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: " + f"Could not load the {key} from `{repo_id}` with `{name}`." + ) + self.skipTest(f"Could not load the {key} from {repo_id} with {name}.") + + # --------------------------------------------------------- # TODO: Maybe not upload such problematic tiny models to Hub. - if tokenizer is None and processor is None: + if tokenizer is None and "image_processor" not in processors and "feature_extractor" not in processors: logger.warning( f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not find or load " - f"any tokenizer / processor from `{repo_id}`." + f"any tokenizer / image processor / feature extractor from `{repo_id}`." ) self.skipTest(f"Could not find or load any tokenizer / processor from {repo_id}.") - # TODO: We should check if a model file is on the Hub repo. instead. - try: - model = model_architecture.from_pretrained(repo_id, revision=commit) - except Exception: - logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not find or load " - f"the model from `{repo_id}` with `{model_architecture}`." - ) - self.skipTest(f"Could not find or load the model from {repo_id} with {model_architecture}.") - pipeline_test_class_name = pipeline_test_mapping[task]["test"].__name__ - if self.is_pipeline_test_to_skip_more(pipeline_test_class_name, model.config, model, tokenizer, processor): + if self.is_pipeline_test_to_skip_more(pipeline_test_class_name, model.config, model, tokenizer, **processors): logger.warning( f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer " - f"`{tokenizer_name}` | processor `{processor_name}`." + f"`{tokenizer_name}` | image processor `{image_processor_name}` | feature extractor `{feature_extractor_name}`." ) self.skipTest( - f"Test is known to fail for: model `{model_architecture.__name__}` | tokenizer `{tokenizer_name}` | processor `{processor_name}`." + f"Test is known to fail for: model `{model_architecture.__name__}` | tokenizer `{tokenizer_name}` " + f"| image processor `{image_processor_name}` | feature extractor `{feature_extractor_name}`." ) # validate - validate_test_components(self, task, model, tokenizer, processor) + validate_test_components(model, tokenizer) if hasattr(model, "eval"): model = model.eval() @@ -322,7 +462,7 @@ def run_pipeline_test( # `run_pipeline_test`. 
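The refactor above resolves each preprocessing component by class name and tolerates load failures (missing optional dependencies, missing files on the repo) by skipping that component rather than the whole run. A hedged sketch of that pattern, with a hypothetical helper name and example repo, looks like this:

```python
from typing import Dict, Optional

import transformers


def load_components(repo_id: str, names: Dict[str, Optional[str]], revision: Optional[str] = None) -> dict:
    """`names` maps a slot such as "image_processor" to a class name such as "ViTImageProcessor"."""
    loaded = {}
    for key, class_name in names.items():
        if class_name is None:
            continue
        try:
            component_class = getattr(transformers, class_name)
            loaded[key] = component_class.from_pretrained(repo_id, revision=revision)
        except Exception as exc:
            # e.g. Pillow/torchaudio not installed, or the file is absent from the repo
            print(f"Skipping {key} ({class_name}) for {repo_id}: {exc}")
    return loaded


# Hypothetical usage:
# processors = load_components(
#     "hf-internal-testing/tiny-random-ViTModel", {"image_processor": "ViTImageProcessor"}
# )
```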
task_test = pipeline_test_mapping[task]["test"]() - pipeline, examples = task_test.get_test_pipeline(model, tokenizer, processor, torch_dtype=torch_dtype) + pipeline, examples = task_test.get_test_pipeline(model, tokenizer, **processors, torch_dtype=torch_dtype) if pipeline is None: # The test can disable itself, but it should be very marginal # Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist) @@ -649,7 +789,14 @@ def test_pipeline_zero_shot_object_detection_fp16(self): # This contains the test cases to be skipped without model architecture being involved. def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): """Skip some tests based on the classes or their names without the instantiated objects. @@ -657,7 +804,7 @@ def is_pipeline_test_to_skip( """ # No fix is required for this case. if ( - pipeline_test_casse_name == "DocumentQuestionAnsweringPipelineTests" + pipeline_test_case_name == "DocumentQuestionAnsweringPipelineTests" and tokenizer_name is not None and not tokenizer_name.endswith("Fast") ): @@ -666,11 +813,20 @@ def is_pipeline_test_to_skip( return False - def is_pipeline_test_to_skip_more(self, pipeline_test_casse_name, config, model, tokenizer, processor): # noqa + def is_pipeline_test_to_skip_more( + self, + pipeline_test_case_name, + config, + model, + tokenizer, + image_processor=None, + feature_extractor=None, + processor=None, + ): # noqa """Skip some more tests based on the information from the instantiated objects.""" # No fix is required for this case. if ( - pipeline_test_casse_name == "QAPipelineTests" + pipeline_test_case_name == "QAPipelineTests" and tokenizer is not None and getattr(tokenizer, "pad_token", None) is None and not tokenizer.__class__.__name__.endswith("Fast") @@ -681,7 +837,7 @@ def is_pipeline_test_to_skip_more(self, pipeline_test_casse_name, config, model, return False -def validate_test_components(test_case, task, model, tokenizer, processor): +def validate_test_components(model, tokenizer): # TODO: Move this to tiny model creation script # head-specific (within a model type) necessary changes to the config # 1. 
for `BlenderbotForCausalLM` diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 8cc71147c22013..9f0d88089129b8 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -16,7 +16,9 @@ import inspect import json +import random import tempfile +from typing import Optional import numpy as np @@ -30,11 +32,7 @@ from transformers.utils import is_vision_available -try: - from typing import Unpack -except ImportError: - from typing_extensions import Unpack - +global_rng = random.Random() if is_vision_available(): from PIL import Image @@ -47,6 +45,21 @@ def prepare_image_inputs(): return image_inputs +# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + @require_torch @require_vision class ProcessorTesterMixin: @@ -86,10 +99,25 @@ def get_processor(self): processor = self.processor_class(**components, **self.prepare_processor_dict()) return processor + def prepare_text_inputs(self, batch_size: Optional[int] = None): + if batch_size is None: + return "lower newer" + + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + + if batch_size == 1: + return ["lower newer"] + return ["lower newer", "upper older longer string"] + ["lower newer"] * (batch_size - 2) + @require_vision - def prepare_image_inputs(self): + def prepare_image_inputs(self, batch_size: Optional[int] = None): """This function prepares a list of PIL images for testing""" - return prepare_image_inputs() + if batch_size is None: + return prepare_image_inputs()[0] + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + return prepare_image_inputs() * batch_size @require_vision def prepare_video_inputs(self): @@ -148,7 +176,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") self.assertEqual(inputs[self.text_input_name].shape[-1], 117) @@ -170,7 +198,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") @@ -184,7 +212,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" @@ -203,7 +231,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() 
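The helpers above (`floats_list` plus the batch-aware `prepare_text_inputs` / `prepare_image_inputs`) exist so the same test body can feed single or batched inputs to any processor. As a hedged usage sketch, here is how a batch of fake waveforms and a batch of texts would go through a familiar speech processor (Wav2Vec2 is used only as an example of a tokenizer + feature-extractor pair):

```python
import random

from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# equivalent to floats_list((3, 1000)): three fake 1000-sample waveforms
raw_speech = [[random.random() for _ in range(1000)] for _ in range(3)]

audio_inputs = processor(audio=raw_speech, sampling_rate=16000, padding=True, return_tensors="pt")
print(audio_inputs.input_values.shape)  # torch.Size([3, 1000])

text_inputs = processor(text=["lower newer"] * 3, padding=True, return_tensors="pt")
print(text_inputs.input_ids.shape)
```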
inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") @@ -216,7 +244,7 @@ def test_unstructured_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor( text=input_str, @@ -238,8 +266,8 @@ def test_unstructured_kwargs_batched(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 + input_str = self.prepare_text_inputs(batch_size=2) + image_input = self.prepare_image_inputs(batch_size=2) inputs = processor( text=input_str, images=image_input, @@ -263,7 +291,7 @@ def test_doubly_passed_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer"] + input_str = [self.prepare_text_inputs()] image_input = self.prepare_image_inputs() with self.assertRaises(ValueError): _ = processor( @@ -281,7 +309,7 @@ def test_structured_kwargs_nested(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -303,7 +331,7 @@ def test_structured_kwargs_nested_from_dict(self): processor_components = self.prepare_components() processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -317,6 +345,135 @@ def test_structured_kwargs_nested_from_dict(self): self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) self.assertEqual(inputs[self.text_input_name].shape[-1], 76) + # text + audio kwargs testing + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117, padding="max_length") + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + else: + self.assertTrue(False, "Processor doesn't have get_tokenizer or get_component defined") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt") + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 117) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 117) + + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = 
self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117) + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117) + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=112, padding="max_length") + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 112) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 112) + + @require_torch + def test_unstructured_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117) + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117) + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor( + text=input_str, + audio=raw_speech, + return_tensors="pt", + padding="max_length", + max_length=76, + ) + + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 76) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 76) + + @require_torch + def test_doubly_passed_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer() + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer"] + raw_speech = floats_list((3, 1000)) + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + audio=raw_speech, + audio_kwargs={"padding": "max_length"}, + padding="max_length", + ) + + @require_torch + @require_vision + def test_structured_kwargs_audio_nested(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer() + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer"] + raw_speech = floats_list((3, 1000)) + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "text_kwargs": 
{"padding": "max_length", "max_length": 76}, + "audio_kwargs": {"padding": "max_length", "max_length": 66}, + } + + inputs = processor(text=input_str, audio=raw_speech, **all_kwargs) + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 76) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 76) + # TODO: the same test, but for audio + text processors that have strong overlap in kwargs # TODO (molbap) use the same structure of attribute kwargs for other tests to avoid duplication def test_overlapping_text_kwargs_handling(self): @@ -326,7 +483,7 @@ def test_overlapping_text_kwargs_handling(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() with self.assertRaises(ValueError): diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 342254dfbdf066..8020b0e711cf1f 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -4293,7 +4293,7 @@ def test_saving_tokenizer_trainer(self): # Load tokenizer from a folder without legacy files tokenizer = self.rust_tokenizer_class.from_pretrained(tmp_dir) training_args = TrainingArguments(output_dir=tmp_dir, do_train=True, no_cuda=True) - trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer) + trainer = Trainer(model=model, args=training_args, processing_class=tokenizer) # Should not raise an error trainer.save_model(os.path.join(tmp_dir, "checkpoint")) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 0035ff7de8ba97..cbc93faf50e7a3 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -38,6 +38,9 @@ from requests.exceptions import HTTPError from transformers import ( + AutoFeatureExtractor, + AutoImageProcessor, + AutoProcessor, AutoTokenizer, IntervalStrategy, PretrainedConfig, @@ -89,6 +92,7 @@ require_torch_tf32, require_torch_up_to_2_accelerators, require_torchdynamo, + require_vision, require_wandb, slow, torch_device, @@ -1059,14 +1063,14 @@ def test_multiple_peft_adapters(self): max_steps=10, use_cpu=True, ) - trainer = Trainer(tiny_model, args, tokenizer=tokenizer, train_dataset=train_dataset) + trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset) trainer.train() parameters = dict(tiny_model.named_parameters()) state = dataclasses.asdict(trainer.state) # Reinitialize trainer - trainer = Trainer(tiny_model, args, tokenizer=tokenizer, train_dataset=train_dataset) + trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset) checkpoint = os.path.join(tmpdir, "checkpoint-5") @@ -3786,6 +3790,100 @@ def test_eval_use_gather_object(self): _ = trainer.evaluate() _ = trainer.predict(eval_dataset) + def test_trainer_saves_tokenizer(self): + MODEL_ID = "google-bert/bert-base-uncased" + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False) + + with tempfile.TemporaryDirectory() as tmp_dir: + config = RegressionModelConfig(a=1.5, b=2.5) + trainer = Trainer( + model=RegressionPreTrainedModel(config), + args=TrainingArguments(output_dir=tmp_dir), + processing_class=tokenizer, + ) + trainer.save_model() + + reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir) + + # For tokenizers, there isn't a direct to_dict method and the properties stored in the configs e.g. 
+ # saved tokens change overtime, so we check that two tokenizers are equal by comparing their encoded outputs + test_sentence = "This is a test sentence" + self.assertListEqual( + tokenizer(test_sentence, padding="max_length").input_ids, + reloaded_tokenizer(test_sentence, padding="max_length").input_ids, + ) + + @require_vision + def test_trainer_saves_image_processor(self): + MODEL_ID = "openai/clip-vit-base-patch32" + image_processor = AutoImageProcessor.from_pretrained(MODEL_ID) + + with tempfile.TemporaryDirectory() as tmp_dir: + config = RegressionModelConfig(a=1.5, b=2.5) + trainer = Trainer( + model=RegressionPreTrainedModel(config), + args=TrainingArguments(output_dir=tmp_dir), + processing_class=image_processor, + ) + trainer.save_model() + reloaded_image_processor = AutoImageProcessor.from_pretrained(tmp_dir) + + self.assertDictEqual(image_processor.to_dict(), reloaded_image_processor.to_dict()) + + def test_trainer_saves_feature_extractor(self): + MODEL_ID = "facebook/wav2vec2-base-960h" + feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID) + + with tempfile.TemporaryDirectory() as tmp_dir: + config = RegressionModelConfig(a=1.5, b=2.5) + trainer = Trainer( + model=RegressionPreTrainedModel(config), + args=TrainingArguments(output_dir=tmp_dir), + processing_class=feature_extractor, + ) + trainer.save_model() + + reloaded_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_dir) + + self.assertDictEqual(feature_extractor.to_dict(), reloaded_feature_extractor.to_dict()) + + @require_vision + def test_trainer_saves_processor(self): + MODEL_ID = "openai/clip-vit-base-patch32" + image_processor = AutoImageProcessor.from_pretrained(MODEL_ID) + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False) + processor = AutoProcessor.from_pretrained(MODEL_ID) + + with tempfile.TemporaryDirectory() as tmp_dir: + config = RegressionModelConfig(a=1.5, b=2.5) + trainer = Trainer( + model=RegressionPreTrainedModel(config), + args=TrainingArguments(output_dir=tmp_dir), + processing_class=processor, + ) + trainer.save_model() + + reloaded_processor = AutoProcessor.from_pretrained(tmp_dir) + reloaded_image_processor = AutoImageProcessor.from_pretrained(tmp_dir) + reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir) + + self.assertDictEqual(reloaded_processor.to_dict(), processor.to_dict()) + + image_processor_dict = image_processor.to_dict() + reloaded_image_processor_dict = reloaded_image_processor.to_dict() + # When the processor is saved in the trainer, the _processor_class gets set in the reload_image_processor dict + image_processor_dict.pop("_processor_class") + reloaded_image_processor_dict.pop("_processor_class") + self.assertDictEqual(image_processor_dict, reloaded_image_processor_dict) + + # For tokenizers, there isn't a direct to_dict method and the properties stored in the configs e.g. + # saved tokens change overtime, so we check that two tokenizers are equal by comparing their encoded outputs + test_sentence = "This is a test sentence" + self.assertListEqual( + tokenizer(test_sentence, padding="max_length").input_ids, + reloaded_tokenizer(test_sentence, padding="max_length").input_ids, + ) + @require_torch @is_staging_test diff --git a/tests/trainer/test_trainer_fsdp.py b/tests/trainer/test_trainer_fsdp.py new file mode 100644 index 00000000000000..994a82a8db0c44 --- /dev/null +++ b/tests/trainer/test_trainer_fsdp.py @@ -0,0 +1,114 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +from transformers import is_torch_available +from transformers.testing_utils import ( + TestCasePlus, + execute_subprocess_async, + get_torch_dist_unique_port, + require_accelerate, + require_torch_multi_gpu, +) + + +if is_torch_available(): + import torch + import torch.distributed + import torch.utils.data + + from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + EvalPrediction, + GenerationConfig, + HfArgumentParser, + PreTrainedTokenizerBase, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + ) + + class DummyTextDataset(torch.utils.data.Dataset[str]): + def __init__(self, tokenizer: PreTrainedTokenizerBase) -> None: + data = 4 * [ + "Hello world!", + "The quick brown fox jumps over the lazy dog.", + ] + self.data = [ + {k: v.squeeze(0) for k, v in tokenizer(item, return_tensors="pt", return_attention_mask=True).items()} + for item in data + ] + for item in self.data: + item["labels"] = item["input_ids"] + + def __len__(self) -> int: + return len(self.data) + + def __getitem__(self, i: int) -> str: + return self.data[i] + + +class TestFSDPTrainer(TestCasePlus): + @require_accelerate + @require_torch_multi_gpu + def test_trainer(self): + output_dir = self.get_auto_remove_tmp_dir() + cmd = [ + "accelerate", + "launch", + "--use_fsdp", + "--main_process_port", + f"{get_torch_dist_unique_port()}", + "--num_processes", + f"{torch.cuda.device_count()}", + "--fsdp_transformer_layer_cls_to_wrap", + "GPT2Block", + f"{self.test_file_dir}/test_trainer_fsdp.py", + "--output_dir", + f"{output_dir}", + "--report_to", + "none", + ] + execute_subprocess_async(cmd, env=self.get_env()) + # successful return here == success - any errors would have caused an error in the sub-call + + +if __name__ == "__main__": + parser = HfArgumentParser((Seq2SeqTrainingArguments,)) + training_args = parser.parse_args_into_dataclasses()[0] + training_args.per_device_eval_batch_size = 1 + training_args.use_legacy_prediction_loop = False + training_args.predict_with_generate = True + training_args.generation_config = GenerationConfig(max_length=30) + + pretrained_model_name = "hf-internal-testing/tiny-random-gpt2" + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name) + tokenizer.pad_token = tokenizer.eos_token + device = torch.device(torch.distributed.get_rank()) + model = AutoModelForCausalLM.from_pretrained(pretrained_model_name).to(device) + + def compute_metrics(p: EvalPrediction) -> Dict[str, bool]: + return {"accuracy": (p.predictions == p.label_ids).mean()} + + trainer = Seq2SeqTrainer( + model=model, + args=training_args, + data_collator=DataCollatorForSeq2Seq(tokenizer, model), + eval_dataset=DummyTextDataset(tokenizer), + compute_metrics=compute_metrics, + ) + + metrics = trainer.evaluate() diff --git a/tests/trainer/test_trainer_seq2seq.py b/tests/trainer/test_trainer_seq2seq.py index a4b38aecb2af3c..30dd2ed460c99d 100644 --- a/tests/trainer/test_trainer_seq2seq.py +++ 
b/tests/trainer/test_trainer_seq2seq.py @@ -129,7 +129,7 @@ def _compute_metrics(pred): compute_metrics=_compute_metrics, train_dataset=train_dataset, eval_dataset=val_dataset, - tokenizer=tokenizer, + processing_class=tokenizer, ) # start training @@ -158,7 +158,7 @@ def test_return_sequences(self): trainer = Seq2SeqTrainer( model=model, args=training_args, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=lambda x: {"samples": x[0].shape[0]}, ) @@ -199,7 +199,7 @@ def test_bad_generation_config_fail_early(self): _ = Seq2SeqTrainer( model=model, args=training_args, - tokenizer=tokenizer, + processing_class=tokenizer, data_collator=data_collator, compute_metrics=lambda x: {"samples": x[0].shape[0]}, ) diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py index 8392ed18b71665..4a6dae67cbc807 100644 --- a/tests/utils/test_cache_utils.py +++ b/tests/utils/test_cache_utils.py @@ -53,7 +53,7 @@ class CacheTest(unittest.TestCase): def test_dynamic_cache_retrocompatibility(self): """Tests that we can convert back and forth between the legacy cache format and DynamicCache""" legacy_cache = () - new_cache = DynamicCache(num_hidden_layers=10) + new_cache = DynamicCache() # Creates a new cache with 10 layers in both formats for layer_idx in range(10): @@ -83,7 +83,7 @@ def test_dynamic_cache_retrocompatibility(self): ) # Test 1: We can convert from legacy to new with no changes - from_legacy = DynamicCache.from_legacy_cache(legacy_cache, num_hidden_layers=10) + from_legacy = DynamicCache.from_legacy_cache(legacy_cache) for layer_idx in range(10): for key_value_idx in range(2): self.assertTrue( @@ -103,7 +103,7 @@ def test_reorder_cache_retrocompatibility(self): legacy_reorder_fn = GPT2LMHeadModel._reorder_cache # An example of a legacy `_reorder_cache` function legacy_cache = () - new_cache = DynamicCache(num_hidden_layers=10) + new_cache = DynamicCache() # Creates a new cache with 10 layers in both formats for layer_idx in range(10): @@ -240,9 +240,7 @@ def test_dynamic_cache_hard(self): set_seed(0) gen_out_legacy = model.generate(**inputs, do_sample=True, max_new_tokens=256) set_seed(0) - gen_out = model.generate( - **inputs, do_sample=True, max_new_tokens=256, past_key_values=DynamicCache(model.config.num_hidden_layers) - ) + gen_out = model.generate(**inputs, do_sample=True, max_new_tokens=256, past_key_values=DynamicCache()) self.assertListEqual(gen_out_legacy.tolist(), gen_out.tolist()) decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) @@ -270,9 +268,7 @@ def test_dynamic_cache_batched(self): model.device ) - gen_out = model.generate( - **inputs, do_sample=False, max_new_tokens=10, past_key_values=DynamicCache(model.config.num_hidden_layers) - ) + gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10, past_key_values=DynamicCache()) decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) expected_text = ["A sequence: 1, 2, 3, 4, 5, 6, 7, 8,", "A sequence: A, B, C, D, E, F, G, H"] self.assertListEqual(decoded, expected_text) diff --git a/tests/utils/test_configuration_utils.py b/tests/utils/test_configuration_utils.py index 76394daf9ced4a..d2701bf35e6603 100644 --- a/tests/utils/test_configuration_utils.py +++ b/tests/utils/test_configuration_utils.py @@ -313,11 +313,12 @@ def test_repo_versioning_before(self): old_configuration = old_transformers.models.auto.AutoConfig.from_pretrained(repo) self.assertEqual(old_configuration.hidden_size, 768) - def 
test_saving_config_with_custom_generation_kwargs_raises_exception(self): + def test_saving_config_with_custom_generation_kwargs_raises_warning(self): config = BertConfig(min_length=3) # `min_length = 3` is a non-default generation kwarg with tempfile.TemporaryDirectory() as tmp_dir: - with self.assertRaises(ValueError): + with self.assertWarns(UserWarning) as cm: config.save_pretrained(tmp_dir) + self.assertIn("min_length", str(cm.warning)) def test_get_non_default_generation_parameters(self): config = BertConfig() diff --git a/tests/utils/test_hf_argparser.py b/tests/utils/test_hf_argparser.py index e075cdae167d35..08c730f7348b68 100644 --- a/tests/utils/test_hf_argparser.py +++ b/tests/utils/test_hf_argparser.py @@ -189,7 +189,7 @@ def test_with_default_bool(self): expected.add_argument("--baz", type=string_to_bool, default=True, const=True, nargs="?") # A boolean no_* argument always has to come after its "default: True" regular counter-part # and its default must be set to False - expected.add_argument("--no_baz", action="store_false", default=False, dest="baz") + expected.add_argument("--no_baz", "--no-baz", action="store_false", default=False, dest="baz") expected.add_argument("--opt", type=string_to_bool, default=None) dataclass_types = [WithDefaultBoolExample] @@ -206,6 +206,9 @@ def test_with_default_bool(self): args = parser.parse_args(["--foo", "--no_baz"]) self.assertEqual(args, Namespace(foo=True, baz=False, opt=None)) + args = parser.parse_args(["--foo", "--no-baz"]) + self.assertEqual(args, Namespace(foo=True, baz=False, opt=None)) + args = parser.parse_args(["--foo", "--baz"]) self.assertEqual(args, Namespace(foo=True, baz=True, opt=None)) @@ -271,10 +274,10 @@ def test_with_list(self): parser = HfArgumentParser(ListExample) expected = argparse.ArgumentParser() - expected.add_argument("--foo_int", nargs="+", default=[], type=int) - expected.add_argument("--bar_int", nargs="+", default=[1, 2, 3], type=int) - expected.add_argument("--foo_str", nargs="+", default=["Hallo", "Bonjour", "Hello"], type=str) - expected.add_argument("--foo_float", nargs="+", default=[0.1, 0.2, 0.3], type=float) + expected.add_argument("--foo_int", "--foo-int", nargs="+", default=[], type=int) + expected.add_argument("--bar_int", "--bar-int", nargs="+", default=[1, 2, 3], type=int) + expected.add_argument("--foo_str", "--foo-str", nargs="+", default=["Hallo", "Bonjour", "Hello"], type=str) + expected.add_argument("--foo_float", "--foo-float", nargs="+", default=[0.1, 0.2, 0.3], type=float) self.argparsersEqual(parser, expected) @@ -287,6 +290,9 @@ def test_with_list(self): args = parser.parse_args("--foo_int 1 --bar_int 2 3 --foo_str a b c --foo_float 0.1 0.7".split()) self.assertEqual(args, Namespace(foo_int=[1], bar_int=[2, 3], foo_str=["a", "b", "c"], foo_float=[0.1, 0.7])) + args = parser.parse_args("--foo-int 1 --bar-int 2 3 --foo-str a b c --foo-float 0.1 0.7".split()) + self.assertEqual(args, Namespace(foo_int=[1], bar_int=[2, 3], foo_str=["a", "b", "c"], foo_float=[0.1, 0.7])) + def test_with_optional(self): expected = argparse.ArgumentParser() expected.add_argument("--foo", default=None, type=int) @@ -314,10 +320,11 @@ def test_with_required(self): parser = HfArgumentParser(RequiredExample) expected = argparse.ArgumentParser() - expected.add_argument("--required_list", nargs="+", type=int, required=True) - expected.add_argument("--required_str", type=str, required=True) + expected.add_argument("--required_list", "--required-list", nargs="+", type=int, required=True) + 
expected.add_argument("--required_str", "--required-str", type=str, required=True) expected.add_argument( "--required_enum", + "--required-enum", type=make_choice_type_function(["titi", "toto"]), choices=["titi", "toto"], required=True, @@ -331,13 +338,14 @@ def test_with_string_literal_annotation(self): expected.add_argument("--foo", type=int, required=True) expected.add_argument( "--required_enum", + "--required-enum", type=make_choice_type_function(["titi", "toto"]), choices=["titi", "toto"], required=True, ) expected.add_argument("--opt", type=string_to_bool, default=None) expected.add_argument("--baz", default="toto", type=str, help="help message") - expected.add_argument("--foo_str", nargs="+", default=["Hallo", "Bonjour", "Hello"], type=str) + expected.add_argument("--foo_str", "--foo-str", nargs="+", default=["Hallo", "Bonjour", "Hello"], type=str) self.argparsersEqual(parser, expected) def test_parse_dict(self): diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py index a1d1fd6b922ab3..d51f534055872a 100644 --- a/tests/utils/test_modeling_rope_utils.py +++ b/tests/utils/test_modeling_rope_utils.py @@ -65,6 +65,19 @@ def test_rope_validation(self): with self.assertRaises(KeyError): rope_config_validation(config) + # Any other parameters passed to RoPE will raise a warning that a particular key is not used + # But sometimes we can have model-specific RoPE kwargs and bypass warning with `ignore_keys` + model_specific_kwarg = "mrope_sections" # e,g in Qwen2-VL + + for rope_type in all_rope_types: + if rope_type == "default": + config.rope_scaling = {"rope_type": rope_type, model_specific_kwarg: True} + rope_config_validation(config, ignore_keys={model_specific_kwarg}) + with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: + rope_config_validation(config) + self.assertEqual(len(logs.output), 1) + self.assertIn(model_specific_kwarg, logs.output[0]) + def test_default_rope_function_bc(self): config = LlamaConfig() device = torch_device diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 7cd8523ccd287e..7bd937963651d5 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -132,6 +132,11 @@ "t2u_variance_predictor_hidden_dim", "t2u_variance_predictor_kernel_size", ], + "ZambaConfig": [ + "tie_word_embeddings", + "attn_layer_offset", + "attn_layer_period", + ], "MllamaTextConfig": [ "initializer_range", ], diff --git a/utils/check_repo.py b/utils/check_repo.py index 75bd2ed1c6bfda..3ecbd79eca48d9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -170,7 +170,6 @@ "ClapTextModelWithProjection", "ClapAudioModel", "ClapAudioModelWithProjection", - "Blip2ForConditionalGeneration", "Blip2TextModelWithProjection", "Blip2VisionModelWithProjection", "Blip2QFormerModel", @@ -181,7 +180,6 @@ "GitVisionModel", "GraphormerModel", "GraphormerForGraphClassification", - "BlipForConditionalGeneration", "BlipForImageTextRetrieval", "BlipForQuestionAnswering", "BlipVisionModel", @@ -245,7 +243,6 @@ "DetrForSegmentation", "Pix2StructVisionModel", "Pix2StructTextModel", - "Pix2StructForConditionalGeneration", "ConditionalDetrForSegmentation", "DPRReader", "FlaubertForQuestionAnswering", @@ -322,7 +319,6 @@ "SeamlessM4TCodeHifiGan", "SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech "TvpForVideoGrounding", - "UdopForConditionalGeneration", "SeamlessM4Tv2NARTextToUnitModel", "SeamlessM4Tv2NARTextToUnitForConditionalGeneration", 
"SeamlessM4Tv2CodeHifiGan", diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index 1bfc1230a91349..d2f0a99fc93885 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -15,8 +15,10 @@ import argparse import glob import importlib +import os import re -from typing import Dict +from collections import defaultdict, deque +from typing import Dict, List, Optional, Set import libcst as cst from check_copies import run_ruff @@ -32,12 +34,19 @@ logger = logging.get_logger(__name__) -AUTO_GENERATED_MESSAGE = """# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This is used to avoid overwriting these top-level assignments even if they are in the dependency graph. Otherwise, the +# value from the dependency is used, then mapped to current name convention, resulting in wrong value. +# The corresponding mapped value is used to define the file target for the assignment +ASSIGNMENTS_TO_KEEP = { + "_CHECKPOINT_FOR_DOC": "modeling", +} + +AUTO_GENERATED_MESSAGE = """# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from {relative_path}. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# {short_name} file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 """ @@ -113,7 +122,14 @@ def visit_SimpleStatementLine(self, node): if m.matches(node, m.SimpleStatementLine(body=[m.Assign()])) and m.matches( self.get_metadata(cst.metadata.ParentNodeProvider, node), m.Module() ): - self.assignments[node.body[0].targets[0].target.value] = node + left_hand_side = node.body[0].targets[0].target + if hasattr(left_hand_side, "value"): + if left_hand_side.value not in ASSIGNMENTS_TO_KEEP.keys(): + self.assignments[left_hand_side.value] = node + else: + for idx, target in enumerate(list(left_hand_side.elements)): + if target.value.value not in ASSIGNMENTS_TO_KEEP.keys(): + self.assignments[target.value.value] = node.body[0].value.elements[idx].value if m.matches(node, m.SimpleStatementLine(body=[m.Import() | m.ImportFrom()])): self.imports[node.body[0].names] = node @@ -217,11 +233,21 @@ def replace(match): return compiled_regex.sub(replace, text) + def convert_to_camelcase(self, text): + # Regex pattern to match consecutive uppercase letters and lowercase the first set + result = re.sub(r"^[A-Z]+(?=[A-Z][a-z])", lambda m: m.group(0).capitalize(), text, count=1) + return result + @m.leave(m.Name() | m.SimpleString() | m.Comment()) def replace_name(self, original_node, updated_node): + if re.findall(r"# Copied from", updated_node.value): + return cst.RemoveFromParent() update = self.preserve_case_replace(updated_node.value) return updated_node.with_changes(value=update) + def leave_ClassDef(self, original_node, updated_node): + return updated_node.with_changes(name=cst.Name(self.convert_to_camelcase(updated_node.name.value))) + def find_classes_in_file(module: cst.Module, old_id="llama", new_id="gemma", given_old_name=None, given_new_name=None): """Helper function to rename and then parse a source file using the 
ClassFinder""" @@ -251,6 +277,63 @@ def SUPER_CALL_NODE(func_name): return m.Call(func=m.Attribute(value=m.Call(func=m.Name("super")), attr=m.Name(func_name))) +def is_call_to_super(node, func_name): + return m.matches( + node, m.SimpleStatementLine(body=[m.Return(SUPER_CALL_NODE(func_name)) | m.Expr(SUPER_CALL_NODE(func_name))]) + ) + + +# Transformer class to replace ClassB.call_to_method and ClassB().call_to_method with super().call_to_method +class ReplaceMethodCallTransformer(cst.CSTTransformer): + def __init__(self, all_bases: Set[str]): + self.all_bases = all_bases + + def leave_Attribute(self, original_node: cst.Attribute, updated_node: cst.Attribute) -> cst.CSTNode: + # Handle ClassB.call_to_method + if ( + isinstance(original_node.value, cst.Name) + and original_node.value.value in self.all_bases + and isinstance(original_node.attr, cst.Name) + ): + # Replace with super().call_to_method + return updated_node.with_changes( + value=cst.Call(cst.Name("super")), + ) + # Handle ClassB().call_to_method + elif ( + isinstance(original_node.value, cst.Call) + and isinstance(original_node.value.func, cst.Name) + and original_node.value.func.value in self.all_bases + and isinstance(original_node.attr, cst.Name) + ): + # Replace with super().call_to_method + return updated_node.with_changes(func=cst.Attribute(value=cst.Call(func=cst.Name("super")))) + return updated_node + + def leave_Call(self, original_node: cst.Call, updated_node: cst.Call) -> cst.CSTNode: + # Check if the function being called is of the form ClassB().func_a or ClassB.func_a + if isinstance(original_node.func, cst.Attribute) and ( + # Match ClassB().func_a(...) + ( + isinstance(original_node.func.value, cst.Call) + and isinstance(original_node.func.value.func, cst.Name) + and original_node.func.value.func.value in self.all_bases + ) + or + # Match ClassB.func_a(...) 
+ (isinstance(original_node.func.value, cst.Name) and original_node.func.value.value in self.all_bases) + ): + # Check if the first argument is 'self', and remove it + if len(original_node.args) > 0 and m.matches(original_node.args[0].value, m.Name("self")): + # Create the new argument list without 'self' + new_args = updated_node.args[1:] + else: + new_args = updated_node.args + + return updated_node.with_changes(args=new_args) + return updated_node + + def get_docstring_indent(docstring): # Match the first line after the opening triple quotes match = re.search(r'(?:"""|\'\'\'|```)\n(\s+)', docstring) @@ -263,7 +346,7 @@ def get_docstring_indent(docstring): def merge_docstrings(original_docstring, updated_docstring): # indent_level = get_docstring_indent(updated_docstring) original_level = get_docstring_indent(original_docstring) - if " Args:\n " not in updated_docstring: + if not re.findall(r"\n\s*Args:\n", updated_docstring): # Split the docstring at the example section, assuming `"""` is used to define the docstring parts = original_docstring.split("```") if "```" in updated_docstring and len(parts) > 1: @@ -292,13 +375,15 @@ def merge_docstrings(original_docstring, updated_docstring): class SuperTransformer(cst.CSTTransformer): METADATA_DEPENDENCIES = (ParentNodeProvider,) - def __init__(self, python_module: cst.Module, original_methods, updated_methods, class_name=""): + def __init__(self, python_module: cst.Module, original_methods, updated_methods, class_name="", all_bases=None): self.python_module = python_module self.original_methods = original_methods self.updated_methods = updated_methods self.all_assign_target = {} self.deleted_targets = {} # child node can delete some arguments self.class_name = class_name + self.all_bases = all_bases or [] + self.transformer = ReplaceMethodCallTransformer(set(self.all_bases)) def update_body(self, existing_body, new_statements): """ @@ -356,18 +441,14 @@ def replace_super_calls(self, node: cst.IndentedBlock, func_name: str) -> cst.CS parent_has_docstring = m.matches(self.original_methods[func_name].body.body[0], DOCSTRING_NODE) new_body = [] has_super_call = False - for idx, expr in enumerate(node.body): - if m.matches( - expr, - m.SimpleStatementLine( - body=[m.Return(SUPER_CALL_NODE(func_name)) | m.Expr(SUPER_CALL_NODE(func_name))] - ), - ): - if idx != 0 and func_name == "__init__": - raise ValueError(f"The call to super() in {self.class_name} should be at the top of the init") - new_body.extend(self.update_body(self.original_methods[func_name].body.body, node.body)) + + for expr in node.body: + if is_call_to_super(expr, func_name): has_super_call = True - elif m.matches(expr, DOCSTRING_NODE): + new_body.extend(self.update_body(self.original_methods[func_name].body.body, node.body)) + else: + expr = expr.visit(self.transformer) + if m.matches(expr, DOCSTRING_NODE): self.has_docstring = True if parent_has_docstring: # actually here we ought to de-duplicate? original_docstring = self.original_methods[func_name].body.body[0].body[0].value.value @@ -406,15 +487,17 @@ def leave_Return(self, original_node: cst.Return, updated_node: cst.Return) -> c return updated_node -def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef, class_name: str): +def replace_call_to_super( + class_finder: ClassFinder, updated_node: cst.ClassDef, class_name: str, all_bases: List[str] +): """ Given the `class_name`, the `updated_node`'s call to super are unpacked. 
| ```python | | ```python | class GemmaModel(LlamaModel): | | class GemmaModel(nn.Module): | def __init__(self): | | def __init__(self): - Going from: | self.dropout = 0.2 | to: | self.dropout = 0.2 - | super().__init__() | | super().__init__(config) + Going from: | super().__init__() | to: | super().__init__(config) + | self.dropout = 0.2 | | self.dropout = 0.2 | ``` | | self.padding_idx = config.pad_token_id | self.vocab_size = config.vocab_size | self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) @@ -453,7 +536,14 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef, new_params = new_params.with_changes( params=list(parent_params.values()), star_kwarg=func.params.star_kwarg ) - func = func.with_changes(body=updated_methods[name].body, params=new_params) + if not re.match( + r"\ndef .*\(.*\):\n raise.*Error\(.*", + class_finder.python_module.code_for_node(updated_methods[name]), + ): + func = func.with_changes(body=updated_methods[name].body, params=new_params) + else: + continue + if m.matches(func, m.SimpleStatementLine(body=[m.Assign()])): target = class_finder.python_module.code_for_node(func.body[0].targets[0]) assign_targets[target] = func @@ -492,7 +582,7 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef, temp_module = cst.Module(body=[result_node]) new_module = MetadataWrapper(temp_module) new_replacement_class = new_module.visit( - SuperTransformer(temp_module, original_methods, updated_methods, class_name) + SuperTransformer(temp_module, original_methods, updated_methods, class_name, all_bases) ) new_replacement_body = new_replacement_class.body[0].body # get the indented block @@ -508,6 +598,124 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef, } +def get_new_part(class_name, base_class): + """ + When `MyClassNameAttention` inherits from `MistralAttention`, we need + to process the name to properly find dependencies. + + Here we take what is the same (Attention) and what is different + when finding the dependencies. + """ + common_suffix_len = 0 + for i in range(1, min(len(class_name), len(base_class)) + 1): + if class_name[-i] == base_class[-i]: + common_suffix_len += 1 + else: + break + + if common_suffix_len > 0: + new_part = class_name[:-common_suffix_len] + else: + new_part = class_name + + # Convert the remaining new part to snake_case + snake_case = re.sub(r"(?>> [('bar', 'foobar'), ('foo2', 'foobar'), ('foo1', 'bar')] + ``` + That is, all the functions needed (and their immediate parent) so that the function to be added in MyLayer (`foobar`) can + work correctly. + """ + all_dependencies = deque(dependency_mapping[function]) + all_dependencies_with_parent = [(dep, function) for dep in dependency_mapping[function]] + checked_dependencies = set(function) + while len(all_dependencies) > 0: + # Pick element to visit + parent = all_dependencies.popleft() + if parent not in checked_dependencies: + # Update dependencies + all_dependencies.extend(dependency_mapping[parent]) + all_dependencies_with_parent += [(dependency, parent) for dependency in dependency_mapping[parent]] + # add visited node to the list + checked_dependencies.add(parent) + + # no child can ever appear before its parent thanks to the queue (needed to add them at the correct location in the body later) + return all_dependencies_with_parent + + +class PostModularConverterCleaner(CSTTransformer): + """Allow simple cleaning after conversion. 
Remove top-level functions/classes without any calls (they may arise due + to dependency mapping, even if code parts with those functions/classes were overwritten)""" + + METADATA_DEPENDENCIES = (ParentNodeProvider,) + + def __init__(self, added_dependencies: set): + super().__init__() + self.top_level_functions_or_classes = {} + self.all_used_functions_or_classes = set() + self.added_dependencies = added_dependencies + + def visit_FunctionDef(self, node): + parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, node) + if m.matches(parent_node, m.Module()): + self.top_level_functions_or_classes[node.name.value] = node + + def visit_ClassDef(self, node): + parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, node) + if m.matches(parent_node, m.Module()): + self.top_level_functions_or_classes[node.name.value] = node + + def visit_Name(self, node: cst.Name): + """This is used to find any mention of a top-level function or class except its own definition. + It will contain other names as well, but those will not be used. This is the most general way to do it + since mentions may appear in a lot of different contexts (apart from simple Call to the function/class). + e.g. Attention classes are only mentioned by their name in a dict assignment. + """ + parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, node) + + if not ( + (m.matches(parent_node, m.ClassDef()) and parent_node.name.value == node.value) + or (m.matches(parent_node, m.FunctionDef()) and parent_node.name.value == node.value) + ): + self.all_used_functions_or_classes.add(node.value) + + def leave_Module(self, original_node: cst.Module, node): + # Find any class/function that was mistakenly added as part of the dependencies and remove it + unused = self.added_dependencies - self.all_used_functions_or_classes + nodes_to_remove = [ + self.top_level_functions_or_classes[name] for name in unused if name in self.top_level_functions_or_classes + ] + new_body = [node_ for node_ in original_node.body if node_ not in nodes_to_remove] + # Return a new module with the updated body + return node.with_changes(body=new_body) + + class ModularConverterTransformer(CSTTransformer): METADATA_DEPENDENCIES = (ParentNodeProvider, ScopeProvider, PositionProvider) @@ -538,6 +746,14 @@ def __init__(self, python_module, new_name, given_old_name=None, given_new_name= } self.match_patterns = "|".join(self.files.keys()) self.all_definitions = {} + self.class_to_file_type = {} + self.current_class = None # keep track of current top-level class during visit + self.current_top_level_function = None # keep track of current top-level function during visit + # Mapping from top-level functions to classes using them + self.function_call_class_mapping = defaultdict(lambda: set()) + # Mapping from top-level functions to other top-level functions dependencies + self.function_call_dependency_mapping = defaultdict(lambda: set()) + self.added_dependencies = set() def visit_ImportFrom(self, node: cst.ImportFrom) -> None: """When visiting imports from `transformers.models.xxx` we need to: @@ -587,9 +803,20 @@ def leave_SimpleStatementLine(self, original_node, updated_node): if updated_node not in self.all_imports: self.all_imports.append(updated_node) return updated_node + elif m.matches(original_node, m.SimpleStatementLine(body=[m.Assign()])): + if original_node.body[0].targets[0].target.value in ASSIGNMENTS_TO_KEEP.keys(): + file_ = ASSIGNMENTS_TO_KEEP[original_node.body[0].targets[0].target.value] +
self.files[file_][original_node.body[0].targets[0].target.value] = { + "node": original_node, + "insert_idx": self.global_scope_index, + } self.global_scope_index += 100 return updated_node + def visit_ClassDef(self, node: cst.ClassDef): + """Used to keep track of current class""" + self.current_class = node.name.value + def leave_ClassDef(self, original_node, updated_node): """ 1. Filter the `base` classes of this class @@ -630,13 +857,33 @@ def leave_ClassDef(self, original_node, updated_node): self.given_new_name, ) visited_module[super_file_name] = class_finder + list_dependencies = { + dep: class_finder.class_start_line.get(dep, 1000) + for dep in class_finder.class_dependency_mapping.get(class_name, []) + } else: # we are re-using the previously parsed data class_finder = visited_module[super_file_name] - list_dependencies = { - dep: class_finder.class_start_line.get(dep, 1000) - for dep in class_finder.class_dependency_mapping.get(class_name, []) - } + list_dependencies = { + dep: class_finder.class_start_line.get(dep, 1000) + for dep in class_finder.class_dependency_mapping.get(class_name, []) + } + if len(list_dependencies) == 0: + # so, maybe standard renaming did not work (the class name is different) + # we try with another renaming pattern + potential_given_name = get_new_part(class_name, super_class) + del visited_module[super_file_name] + class_finder = find_classes_in_file( + self.transformers_imports[super_file_name], + model_name, + potential_given_name, + self.model_name, + potential_given_name, + ) + list_dependencies = { + dep: class_finder.class_start_line.get(dep, 1000) + for dep in class_finder.class_dependency_mapping.get(class_name, []) + } list_dependencies = sorted(list_dependencies.items(), key=lambda x: x[1], reverse=True) start_insert_idx = self.global_scope_index @@ -647,9 +894,10 @@ def leave_ClassDef(self, original_node, updated_node): node = class_finder.global_nodes.get(dependency, None) if node is not None: if dependency not in file_to_update: - node = self.all_definitions.get(dependency, node) + node = self.all_definitions.pop(dependency, node) start_insert_idx -= 1 file_to_update[dependency] = {"insert_idx": start_insert_idx, "node": node} + self.added_dependencies.add(dependency) elif dependency not in self.inserted_deps: # make sure the node is written after its dependencies start_insert_idx = file_to_update[dependency]["insert_idx"] - 1 @@ -668,10 +916,12 @@ def leave_ClassDef(self, original_node, updated_node): self.inserted_deps.append(dependency) if len(list_dependencies) > 0: - updated_node = replace_call_to_super(class_finder, updated_node, class_name) + updated_node = replace_call_to_super(class_finder, updated_node, class_name, all_bases) else: raise ValueError( - f"Unable to find dependencies for {super_class} in {super_file_name}. Here are the dependencies found: {class_finder.class_dependency_mapping}. (The automatic renaming might have gone wrong!)" + f"We were unable to find dependencies for {class_name} (based on inheriting from {super_class})" + f" Here are all the global dependencies that we found in your modular file: {list(class_finder.class_dependency_mapping.keys())}."
+ f" This usually means that the name of `{class_name}` does not match the pattern of `{super_class}`" ) # Now, if a class was defined without parents, we look for the name @@ -679,28 +929,124 @@ def leave_ClassDef(self, original_node, updated_node): match = re.search(rf"({match_pattern})$", class_name) if match: key = TYPE_TO_FILE_TYPE[match.group(1)] + self.class_to_file_type[class_name] = key self.files[key][class_name] = {"insert_idx": self.global_scope_index, "node": updated_node} else: + self.class_to_file_type[class_name] = "modeling" self.files["modeling"][class_name] = {"insert_idx": self.global_scope_index, "node": updated_node} + + self.current_class = None return updated_node + def visit_FunctionDef(self, node): + parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, node) + if m.matches(parent_node, m.Module()): + self.current_top_level_function = node.name.value + def leave_FunctionDef(self, original_node, node): parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, original_node) if m.matches(parent_node, m.Module()): self.all_definitions[node.name.value] = node return node + def visit_Assign(self, node: cst.Assign) -> None: + # Check if the assignment target is '__all__' + if isinstance(node.targets[0].target, cst.Name) and node.targets[0].target.value == "__all__": + if isinstance(node.value, cst.List): + # Extract the elements from the list + all_all_to_add = defaultdict(list) + for elt in node.value.elements: + if isinstance(elt.value, cst.SimpleString): + # Remove quotes and add the string to the elements list + class_name = elt.value.value + file = self.class_to_file_type[ + elt.value.evaluated_value + ] # evaluated value gives the content of the string + all_all_to_add[file] += [class_name] + for f_type, new_alls in all_all_to_add.items(): + updated_node = node.with_changes( + value=cst.List(elements=[cst.Element(value=cst.SimpleString(value=k)) for k in new_alls]) + ) + self.files[f_type][class_name] = { + "insert_idx": self.global_scope_index + 100, + "node": updated_node, + } def leave_If(self, original_node, node): parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, original_node) if m.matches(parent_node, m.Module()): full_statement = self.python_module.code_for_node(original_node.test) if re.search(r"[\s\S]*is_.*available", full_statement): self.all_safe_imports.append(node) - elif full_statement not in self.new_body: - self.new_body[node] = {"insert_idx": self.global_scope_index, "node": node} + elif full_statement not in self.all_imports: + logger.warning(f"one import is protected with `if`. Hard guess where it's used {full_statement}") return node - def leave_Module(self, original_node: cst.Assign, node): + def visit_Call(self, node: cst.Call): + """This is used to create a mapping from functions to the classes calling them, and from top-level functions to functions called inside them. + Important note: we only rely on direct Call to the functions here, not indirect mentions (such as assigning a variable with the function, + and calling the variable later).
This should be enough as the `modular_xxx` and `modeling_xxx` structures should be as simple as possible.""" + # Only map function calls if we're inside a class (i.e., current_class is set) + if self.current_class is not None: + # Simple function calls such as foo() + if isinstance(node.func, cst.Name): + self.function_call_class_mapping[node.func.value].add(self.current_class) + elif self.current_top_level_function is not None: + # Simple function calls such as foo() + if isinstance(node.func, cst.Name): + self.function_call_dependency_mapping[self.current_top_level_function].add(node.func.value) + + def _maybe_add_function_to_body( + self, + top_level_function: str, + body: dict, + function_node: cst.FunctionDef, + matching_callers: Optional[set] = None, + parent: Optional[str] = None, + ) -> bool: + """Check if the `top_level_function` should be added to the body (i.e. it is not already present, and `matching_callers` + is not empty, or `parent` is provided). If it should be added, do it (in the correct location, just before its caller) and return + `True`. Return `False` otherwise. + """ + if matching_callers is None and parent is None: + raise ValueError("Cannot add function if both the parent and the matching callers are None.") + if matching_callers is None: + matching_callers = {parent} + if len(matching_callers) > 0 and top_level_function not in body.keys(): + # Add the function just before the first class using it + new_idx = min([body[element]["insert_idx"] for element in matching_callers]) + # Reorder the elements + for element in body.keys(): + if body[element]["insert_idx"] >= new_idx: + body[element]["insert_idx"] += 1 + # Assign new element to body (after changing the count to avoid messing it) + body[top_level_function] = {"insert_idx": new_idx, "node": function_node} + return True + return False + + def _recursively_add_all_new_needed_functions_in_files(self): + """For all top-level functions which were newly defined in the `modular_xxx.py`, check if they are used in a class in + the different files, and add them to the file if it is the case (also recursively adding all other functions that + may be needed in that function body).""" + # At this point, `self.all_definitions` only contains newly defined top-level functions in the `modular_xxx.py` + for top_level_function, function_node in self.all_definitions.items(): + calling_entities = self.function_call_class_mapping[top_level_function] + # The function may be needed in different files, we need to iterate on them + for file, body in self.files.items(): + file_elements = set(body.keys()) + # If the intersection is not empty, top_level_func must be added to file + matching_callers = calling_entities & file_elements + added = self._maybe_add_function_to_body(top_level_function, body, function_node, matching_callers) + # If the function was added, we need to recursively add all its dependencies + if added: + for dependency, parent in find_all_dependencies( + top_level_function, self.function_call_dependency_mapping + ): + self._maybe_add_function_to_body( + dependency, body, self.all_definitions[dependency], parent=parent + ) + + def leave_Module(self, original_node: cst.Module, node): imports = {self.python_module.code_for_node(k): k for k in self.all_imports} dependency_imports = {file_type: imports.copy() for file_type in self.files} for super_file_name, visiter in self.visited_module.items(): @@ -709,12 +1055,19 @@ def leave_Module(self, original_node: cst.Assign, node): {self.python_module.code_for_node(k): k for k in
visiter.imports.values()} ) + # Check if any new top-level function from the `modular_xxx.py` should be added to the different files + # (if it is called in a class in the file, then it will be copy pasted from `modular.py` to that file). + self._recursively_add_all_new_needed_functions_in_files() + for file, body in self.files.items(): new_body = [k[1]["node"] for k in sorted(body.items(), key=lambda x: x[1]["insert_idx"])] if len(new_body) > 0: if file in dependency_imports.keys(): new_body = list(dependency_imports[file].values()) + new_body - self.files[file] = cst.Module(body=[*new_body], header=node.header) + new_module = cst.Module(body=[*new_body], header=node.header) + # Final cleanup + new_module = MetadataWrapper(new_module).visit(PostModularConverterCleaner(self.added_dependencies)) + self.files[file] = new_module return node @@ -733,7 +1086,15 @@ def convert_modular_file(modular_file, old_model_name=None, new_model_name=None, wrapper.visit(cst_transformers) for file, node in cst_transformers.files.items(): if node != {}: - ruffed_code = run_ruff(AUTO_GENERATED_MESSAGE + node.code, True) + # Get relative path starting from src/transformers/ + relative_path = re.search( + rf"(src{os.sep}transformers{os.sep}.*|examples{os.sep}.*)", os.path.abspath(modular_file) + ).group(1) + + header = AUTO_GENERATED_MESSAGE.format( + relative_path=relative_path, short_name=os.path.basename(relative_path) + ) + ruffed_code = run_ruff(header + node.code, True) formatted_code = run_ruff(ruffed_code, False) output[file] = [formatted_code, ruffed_code] return output @@ -764,7 +1125,7 @@ def save_modeling_file(modular_file, converted_file): parser = argparse.ArgumentParser() parser.add_argument( "--files_to_parse", - default=["examples/modular-transformers/modular_dummy.py"], + default=["src/transformers/models/roberta/modular_roberta.py"], nargs="+", help="A list of `modular_xxxx` files that should be converted to single model file", ) diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt index cd87d09ec8ec6d..232eed95b9dd04 100644 --- a/utils/not_doctested.txt +++ b/utils/not_doctested.txt @@ -373,7 +373,6 @@ src/transformers/data/processors/squad.py src/transformers/data/processors/utils.py src/transformers/data/processors/xnli.py src/transformers/debug_utils.py -src/transformers/deepspeed.py src/transformers/dependency_versions_check.py src/transformers/dependency_versions_table.py src/transformers/dynamic_module_utils.py @@ -907,6 +906,8 @@ src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch src/transformers/models/yolos/convert_yolos_to_pytorch.py src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py src/transformers/models/yoso/modeling_yoso.py +src/transformers/models/zamba/configuration_zamba.py +src/transformers/models/zamba/modeling_zamba.py src/transformers/onnx/__main__.py src/transformers/onnx/config.py src/transformers/onnx/convert.py diff --git a/utils/patch_helper.py b/utils/patch_helper.py index 86b32c9304de59..f31e755efdc94e 100644 --- a/utils/patch_helper.py +++ b/utils/patch_helper.py @@ -44,7 +44,7 @@ def get_merge_commit(repo, pr_number, since_tag): try: # Use git log to find the merge commit for the PR within the given tag range - merge_commit = next(repo.iter_commits(f"v{since_tag}...HEAD", grep=f"#{pr_number}")) + merge_commit = next(repo.iter_commits(f"v{since_tag}...origin/main", grep=f"#{pr_number}")) return merge_commit except StopIteration: print(f"No merge commit found for PR #{pr_number} between tags {since_tag} and 
{main}") @@ -71,6 +71,7 @@ def main(pr_numbers): major_minor = f"{last_tag.major}.{last_tag.minor}.0" # Iterate through tag ranges to find the merge commits for pr in pr_numbers: + pr = pr.split("https://github.com/huggingface/transformers/pull/")[-1] commit = get_merge_commit(repo, pr, major_minor) if commit: merge_commits.append(commit) @@ -86,7 +87,9 @@ def main(pr_numbers): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Find and sort merge commits for specified PRs.") - parser.add_argument("--prs", nargs="+", required=True, type=int, help="PR numbers to find merge commits for") + parser.add_argument("--prs", nargs="+", required=False, type=str, help="PR numbers to find merge commits for") args = parser.parse_args() + if args.prs is None: + args.prs = "https://github.com/huggingface/transformers/pull/33753 https://github.com/huggingface/transformers/pull/33861 https://github.com/huggingface/transformers/pull/33906 https://github.com/huggingface/transformers/pull/33761 https://github.com/huggingface/transformers/pull/33586 https://github.com/huggingface/transformers/pull/33766 https://github.com/huggingface/transformers/pull/33958 https://github.com/huggingface/transformers/pull/33965".split() main(args.prs) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index b8408f9d4538d9..9e15f2e115ec61 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -1153,6 +1153,7 @@ def parse_commit_message(commit_message: str) -> Dict[str, bool]: def create_test_list_from_filter(full_test_list, out_path): + os.makedirs(out_path, exist_ok=True) all_test_files = "\n".join(full_test_list) for job_name, _filter in JOB_TO_TEST_FILE.items(): file_name = os.path.join(out_path, f"{job_name}_test_list.txt")
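
The `ReplaceMethodCallTransformer` added to utils/modular_model_converter.py above rewrites explicit parent-class calls of the form `ClassB.method(self, ...)` or `ClassB().method(...)` into `super().method(...)`, dropping the leading `self` argument. A minimal sketch of exercising it on a toy class, assuming the repository's `utils/` directory is on `sys.path` (the import path is an assumption; the class name and behavior come from the diff):

import libcst as cst

# Assumption: running from a checkout with `utils/` on sys.path.
from modular_model_converter import ReplaceMethodCallTransformer

source = (
    "class GemmaAttention(LlamaAttention):\n"
    "    def forward(self, x):\n"
    "        return LlamaAttention.forward(self, x)\n"
)

module = cst.parse_module(source)
# `all_bases` lists the parent classes whose explicit calls should be rewritten.
rewritten = module.visit(ReplaceMethodCallTransformer({"LlamaAttention"}))
print(rewritten.code)
# Expected: the body becomes `return super().forward(x)`, with `self` removed.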
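
The new `get_new_part` helper strips the suffix a class name shares with its base (e.g. `Attention` in `MyClassNameAttention` vs `MistralAttention`) and converts the remainder to snake_case, which `leave_ClassDef` then uses as an alternative renaming pattern when no dependencies are found. The snake_case regex is truncated in the hunk above, so the conversion below is only an assumed, typical one; the suffix-stripping loop mirrors the diff:

import re


def new_part_sketch(class_name: str, base_class: str) -> str:
    # Strip the suffix shared with the base class (same loop as in the diff).
    common_suffix_len = 0
    for i in range(1, min(len(class_name), len(base_class)) + 1):
        if class_name[-i] == base_class[-i]:
            common_suffix_len += 1
        else:
            break
    new_part = class_name[:-common_suffix_len] if common_suffix_len > 0 else class_name
    # Assumed CamelCase -> snake_case conversion (the actual regex is cut off above).
    return re.sub(r"(?<!^)(?=[A-Z])", "_", new_part).lower()


print(new_part_sketch("MyClassNameAttention", "MistralAttention"))  # my_class_name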
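
`find_all_dependencies` walks the "function -> functions it calls" mapping with a queue so that every dependency is returned together with its immediate parent, and no child is emitted before its parent. A self-contained toy illustration of the same traversal (the helper itself lives in utils/modular_model_converter.py; `walk_dependencies` below is a stand-in reimplemented here only so the snippet runs on its own):

from collections import defaultdict, deque


def walk_dependencies(function, dependency_mapping):
    # Breadth-first walk, mirroring the queue-based helper in the diff above.
    queue = deque(dependency_mapping[function])
    ordered = [(dep, function) for dep in dependency_mapping[function]]
    seen = {function}
    while queue:
        parent = queue.popleft()
        if parent not in seen:
            queue.extend(dependency_mapping[parent])
            ordered += [(dep, parent) for dep in dependency_mapping[parent]]
            seen.add(parent)
    return ordered


# `foobar` calls `bar` and `foo2`, and `bar` itself calls `foo1`.
mapping = defaultdict(set, {"foobar": {"bar", "foo2"}, "bar": {"foo1"}})
print(walk_dependencies("foobar", mapping))
# One possible output (set iteration order may vary):
# [('bar', 'foobar'), ('foo2', 'foobar'), ('foo1', 'bar')]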
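
With the utils/patch_helper.py change, each `--prs` entry is normalized with `split("https://github.com/huggingface/transformers/pull/")[-1]`, so the script accepts either bare PR numbers or full PR URLs. A quick check of that normalization (the PR number is taken from the default list in the diff):

# Both a bare number and a full PR URL reduce to the same PR number.
prefix = "https://github.com/huggingface/transformers/pull/"

for entry in ["33753", f"{prefix}33753"]:
    print(entry.split(prefix)[-1])  # prints "33753" twice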