diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index b3e13cbb1b7d52..aea6bf44d275cb 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -227,7 +227,7 @@ jobs: python3 -m pip uninstall -y deepspeed rm -rf DeepSpeed git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml index bcb6639a79810e..cc1b5b0176db61 100644 --- a/.github/workflows/self-past.yml +++ b/.github/workflows/self-past.yml @@ -253,7 +253,7 @@ jobs: python3 -m pip uninstall -y deepspeed rm -rf DeepSpeed git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index be2fcc9880efd0..603a148358d9b5 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -381,7 +381,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -467,7 +467,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 3ebf38062c4207..cdbde587a58bbc 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -369,7 +369,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | diff --git a/README.md 
b/README.md index 8d0792d7740fe0..d6a4f5ef004f4a 100644 --- a/README.md +++ b/README.md @@ -310,6 +310,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. @@ -318,7 +319,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -1. 
**[DePlot](https://huggingface.co/docs/transformers/main/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. @@ -350,7 +351,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. -1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/main/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. +1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. @@ -366,7 +367,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. 1. 
**[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. -1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. @@ -377,10 +378,10 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. -1. **[MatCha](https://huggingface.co/docs/transformers/main/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. +1. 
**[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. @@ -395,7 +396,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. 
**[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. -1. **[NLLB-MOE](https://huggingface.co/docs/transformers/main/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. @@ -404,7 +405,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. -1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. +1. 
**[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. diff --git a/README_es.md b/README_es.md index e85a8bf080fd2c..afc03c7f325701 100644 --- a/README_es.md +++ b/README_es.md @@ -298,6 +298,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/main/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. 
**[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. diff --git a/README_hd.md b/README_hd.md index d1db75014302f2..24fc985432b7ec 100644 --- a/README_hd.md +++ b/README_hd.md @@ -270,6 +270,7 @@ conda install -c huggingface transformers 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI से) साथ वाला पेपर [A ConvNet for the 2020s](https://arxiv.org/abs /2201.03545) ज़ुआंग लियू, हेंज़ी माओ, चाओ-युआन वू, क्रिस्टोफ़ फीचटेनहोफ़र, ट्रेवर डेरेल, सैनिंग ज़ी द्वारा। 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (सिंघुआ यूनिवर्सिटी से) साथ में पेपर [सीपीएम: ए लार्ज-स्केल जेनेरेटिव चाइनीज प्री-ट्रेंड लैंग्वेज मॉडल](https : //arxiv.org/abs/2012.00413) झेंग्यान झांग, जू हान, हाओ झोउ, पेई के, युक्सियन गु, डेमिंग ये, युजिया किन, युशेंग सु, हाओझे जी, जियान गुआन, फैंचाओ क्यूई, ज़ियाओझी वांग, यानान झेंग द्वारा , गुओयांग ज़ेंग, हुआनकी काओ, शेंगकी चेन, डाइक्सुआन ली, ज़ेनबो सन, ज़ियुआन लियू, मिनली हुआंग, वेंटाओ हान, जी तांग, जुआनज़ी ली, ज़ियाओयान झू, माओसोंग सन। +1. **[CPM-Ant](https://huggingface.co/docs/transformers/main/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (सेल्सफोर्स से) साथ में पेपर [CTRL: ए कंडिशनल ट्रांसफॉर्मर लैंग्वेज मॉडल फॉर कंट्रोलेबल जेनरेशन](https://arxiv.org/abs/1909.05858) नीतीश शिरीष केसकर*, ब्रायन मैककैन*, लव आर. वार्ष्णेय, कैमिंग जिओंग और रिचर्ड द्वारा सोचर द्वारा जारी किया गया। 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft से) साथ में दिया गया पेपर [CvT: इंट्रोड्यूसिंग कनवॉल्यूशन टू विजन ट्रांसफॉर्मर्स](https://arxiv.org/ एब्स/2103.15808) हैपिंग वू, बिन जिओ, नोएल कोडेला, मेंगचेन लियू, जियांग दाई, लू युआन, लेई झांग द्वारा। 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (फेसबुक से) साथ में कागज [Data2Vec: भाषण, दृष्टि और भाषा में स्व-पर्यवेक्षित सीखने के लिए एक सामान्य ढांचा] (https://arxiv.org/abs/2202.03555) एलेक्सी बाएव्स्की, वेई-निंग सू, कियानटोंग जू, अरुण बाबू, जियाताओ गु, माइकल औली द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index c762816b097753..f700e34b343b0b 100644 --- a/README_ja.md +++ b/README_ja.md @@ -332,6 +332,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI から) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie から公開された研究論文: [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University から) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun から公開された研究論文: [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) +1. **[CPM-Ant](https://huggingface.co/docs/transformers/main/model_doc/cpmant)** (OpenBMB から) [OpenBMB](https://www.openbmb.org/) から公開されました. 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce から) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher から公開された研究論文: [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft から) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang から公開された研究論文: [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook から) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli から公開された研究論文: [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) diff --git a/README_ko.md b/README_ko.md index cc8328f2594407..4339279254771e 100644 --- a/README_ko.md +++ b/README_ko.md @@ -247,6 +247,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI 에서) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 의 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 논문과 함께 발표했습니다. 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University 에서) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 의 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 논문과 함께 발표했습니다. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/main/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce 에서) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 의 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 논문과 함께 발표했습니다. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft 에서) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 의 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 논문과 함께 발표했습니다. 1. 
**[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook 에서) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 의 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 42d94b7839b54f..59ef12bbf2c11f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -271,6 +271,7 @@ conda install -c huggingface transformers 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 +1. **[CPM-Ant](https://huggingface.co/docs/transformers/main/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (来自 Microsoft) 伴随论文 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 由 Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 发布。 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 697950c8b6b7c8..252ab5061b81b4 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -283,6 +283,7 @@ conda install -c huggingface transformers 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/main/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. diff --git a/docker/transformers-past-gpu/Dockerfile b/docker/transformers-past-gpu/Dockerfile index 8ecc83c339d973..d39668c38885a8 100644 --- a/docker/transformers-past-gpu/Dockerfile +++ b/docker/transformers-past-gpu/Dockerfile @@ -47,7 +47,7 @@ RUN python3 -m pip uninstall -y deepspeed # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) # Issue: https://github.com/microsoft/DeepSpeed/issues/2010 # RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ -# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 +# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 RUN python3 -m pip install -U "itsdangerous<2.1.0" diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index 56c3c707240d5a..903dcb768d9b1d 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -36,7 +36,7 @@ RUN python3 -m pip uninstall -y deepspeed # This has to be run (again) inside the GPU VMs running the tests. # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests. # TODO: Find out why test fail. 
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 +RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. diff --git a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile index fcb599ddc232d6..911cab8f7e23c5 100644 --- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile @@ -29,7 +29,7 @@ RUN python3 -m pip uninstall -y deepspeed # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) # Issue: https://github.com/microsoft/DeepSpeed/issues/2010 # RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ -# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 +# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 ## For `torchdynamo` tests ## (see https://github.com/huggingface/transformers/pull/17765) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 676cf502e1fc6d..93f7764409bf89 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -263,6 +263,8 @@ title: ConvBERT - local: model_doc/cpm title: CPM + - local: model_doc/cpmant + title: CPMANT - local: model_doc/ctrl title: CTRL - local: model_doc/deberta diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 3026fbbd443038..ed5feb91fc0783 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -84,6 +84,7 @@ The documentation is organized into five sections: 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. 1. **[ConvNeXTV2](model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. 
**[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. @@ -287,6 +288,7 @@ Flax), PyTorch, and/or TensorFlow. | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | | ConvNeXTV2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| CPM-Ant | ✅ | ❌ | ✅ | ❌ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | | CvT | ❌ | ❌ | ✅ | ✅ | ❌ | | Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/main_classes/quantization.mdx b/docs/source/en/main_classes/quantization.mdx index 37877c9d028d3c..3dd6d36ee497d8 100644 --- a/docs/source/en/main_classes/quantization.mdx +++ b/docs/source/en/main_classes/quantization.mdx @@ -52,6 +52,37 @@ Note that once a model has been loaded in 8-bit it is currently not possible to +### Push quantized models on the 🤗 Hub + +You can push a quantized model to the Hub by simply using the `push_to_hub` method. This will first push the quantization configuration file, then push the quantized model weights. +Make sure to use `bitsandbytes>0.37.2` (at the time of writing, we tested it with `bitsandbytes==0.38.0.post1`) to be able to use this feature. + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto", load_in_8bit=True) +tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m") + +model.push_to_hub("bloom-560m-8bit") +``` + + + +Pushing 8-bit models to the Hub is strongly encouraged for large models. This allows the community to benefit from the reduced memory footprint, for example when loading large models on a Google Colab. + + + +### Load a quantized model from the 🤗 Hub + +You can load a quantized model from the Hub by using the `from_pretrained` method. Make sure that the pushed weights are quantized by checking that the attribute `quantization_config` is present in the model configuration object. + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained("{your_username}/bloom-560m-8bit") +``` +Note that in this case, you don't need to specify the arguments `load_in_8bit=True` and `device_map="auto"`, but you need to make sure that `bitsandbytes` and `accelerate` are installed. + ### Advanced usecases This section is intended to advanced users, that want to explore what it is possible to do beyond loading and running 8-bit models. diff --git a/docs/source/en/model_doc/cpmant.mdx b/docs/source/en/model_doc/cpmant.mdx new file mode 100644 index 00000000000000..8f855355b3a7e5 --- /dev/null +++ b/docs/source/en/model_doc/cpmant.mdx @@ -0,0 +1,44 @@ + + +# CPMAnt + +## Overview + +CPM-Ant is an open-source Chinese pre-trained language model (PLM) with 10B parameters. It is also the first milestone of the live training process of CPM-Live.
The training process is cost-effective and environment-friendly. CPM-Ant also achieves promising results with delta tuning on the CUGE benchmark. Besides the full model, we also provide various compressed versions to meet the requirements of different hardware configurations. [See more](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live) + +Tips: + +This model was contributed by [OpenBMB](https://huggingface.co/openbmb). The original code can be found [here](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live). + +⚙️ Training & Inference +- A tutorial on [CPM-Live](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live). + +## CpmAntConfig + +[[autodoc]] CpmAntConfig + - all + +## CpmAntTokenizer + +[[autodoc]] CpmAntTokenizer + - all + +## CpmAntModel + +[[autodoc]] CpmAntModel + - all + +## CpmAntForCausalLM + +[[autodoc]] CpmAntForCausalLM + - all \ No newline at end of file diff --git a/docs/source/en/model_doc/llama.mdx b/docs/source/en/model_doc/llama.mdx index a25a7ed96c6b13..edcb0482097bba 100644 --- a/docs/source/en/model_doc/llama.mdx +++ b/docs/source/en/model_doc/llama.mdx @@ -42,7 +42,7 @@ model = LlamaForCausalLM.from_pretrained("/output/path") Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). For the 65B model, it's thus 130GB of RAM needed. -- The LLaMA tokenizer is based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string. To have the tokenizer output the prefix space, set `decode_with_prefix_space=True` in the `LlamaTokenizer` object or in the tokenizer configuration. +- The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string. This model was contributed by [zphang](https://huggingface.co/zphang) with contributions from [BlackSamorez](https://huggingface.co/BlackSamorez). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). diff --git a/docs/source/en/model_doc/pix2struct.mdx b/docs/source/en/model_doc/pix2struct.mdx index 340b06c69b07cd..c6d31362856959 100644 --- a/docs/source/en/model_doc/pix2struct.mdx +++ b/docs/source/en/model_doc/pix2struct.mdx @@ -28,9 +28,8 @@ We therefore advise you to use these models for the tasks they have been fine tu This model was contributed by [ybelkada](https://huggingface.co/ybelkada). The original code can be found [here](https://github.com/google-research/pix2struct). 
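A minimal inference sketch (assuming the publicly released `google/pix2struct-textcaps-base` captioning checkpoint; any fine-tuned Pix2Struct checkpoint follows the same pattern) could look like this:

```python
import requests
from PIL import Image

from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

# Example checkpoint only: swap in the checkpoint fine-tuned for your task.
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

url = "https://www.ilankelman.org/stopsigns/australia.jpg"  # any RGB image works
image = Image.open(requests.get(url, stream=True).raw)

# The processor renders the image into flattened patches; for captioning no text prompt is needed.
inputs = processor(images=image, return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```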
-## Resources: +## Resources -- [Paper](https://arxiv.org/abs/2210.03347) - [Fine-tuning Notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb) - [All models](https://huggingface.co/models?search=pix2struct) diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx index 873d497d3ef984..ee85d522518c27 100644 --- a/docs/source/en/pipeline_tutorial.mdx +++ b/docs/source/en/pipeline_tutorial.mdx @@ -81,10 +81,10 @@ If you want to iterate over a whole dataset, or want to use it for inference in In general you can specify parameters anywhere you want: ```py -generator(model="openai/whisper-large", my_parameter=1) -out = generate(...) # This will use `my_parameter=1`. -out = generate(..., my_parameter=2) # This will override and use `my_parameter=2`. -out = generate(...) # This will go back to using `my_parameter=1`. +generator = pipeline(model="openai/whisper-large", my_parameter=1) +out = generator(...) # This will use `my_parameter=1`. +out = generator(..., my_parameter=2) # This will override and use `my_parameter=2`. +out = generator(...) # This will go back to using `my_parameter=1`. ``` Let's check out 3 important ones: @@ -95,14 +95,14 @@ If you use `device=n`, the pipeline automatically puts the model on the specifie This will work regardless of whether you are using PyTorch or Tensorflow. ```py -generator(model="openai/whisper-large", device=0) +generator = pipeline(model="openai/whisper-large", device=0) ``` If the model is too large for a single GPU, you can set `device_map="auto"` to allow 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to automatically determine how to load and store the model weights. ```py #!pip install accelerate -generator(model="openai/whisper-large", device_map="auto") +generator = pipeline(model="openai/whisper-large", device_map="auto") ``` Note that if `device_map="auto"` is passed, there is no need to add the argument `device=device` when instantiating your `pipeline` as you may encounter some unexpected behavior! @@ -114,7 +114,7 @@ By default, pipelines will not batch inference for reasons explained in detail [ But if it works in your use case, you can use: ```py -generator(model="openai/whisper-large", device=0, batch_size=2) +generator = pipeline(model="openai/whisper-large", device=0, batch_size=2) audio_filenames = [f"audio_{i}.flac" for i in range(10)] texts = generator(audio_filenames) ``` @@ -287,4 +287,4 @@ pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"loa output = pipe("This is a cool example!", do_sample=True, top_p=0.95) ``` -Note that you can replace the checkpoint with any of the Hugging Face model that supports large model loading such as BLOOM! \ No newline at end of file +Note that you can replace the checkpoint with any of the Hugging Face model that supports large model loading such as BLOOM! 
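Putting the snippets above together, a minimal sketch of streaming a whole dataset through a pipeline (the checkpoint, device, batch size, and dataset below are only placeholders, and `datasets` must be installed) might look like this:

```python
from datasets import load_dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

# Placeholder settings: pick a checkpoint, device, and batch size that fit your hardware.
generator = pipeline(model="openai/whisper-large", device=0, batch_size=2)

# Any dataset with an "audio" column works; this small dummy set is just for illustration.
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

# KeyDataset yields a single column lazily, so the pipeline can batch inputs as it iterates.
for out in generator(KeyDataset(dataset, "audio")):
    print(out["text"])
```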
diff --git a/docs/source/en/tasks/language_modeling.mdx b/docs/source/en/tasks/language_modeling.mdx index 2f79cb80085cbe..b79435b08f317c 100644 --- a/docs/source/en/tasks/language_modeling.mdx +++ b/docs/source/en/tasks/language_modeling.mdx @@ -34,7 +34,7 @@ Choose one of the following architectures: -[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) +[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), 
[RoFormer](../model_doc/roformer), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) diff --git a/docs/source/en/tasks/token_classification.mdx b/docs/source/en/tasks/token_classification.mdx index 72045b4bcd69cf..b3e1bdad62b6d4 100644 --- a/docs/source/en/tasks/token_classification.mdx +++ b/docs/source/en/tasks/token_classification.mdx @@ -121,7 +121,7 @@ As you saw in the example `tokens` field above, it looks like the input has alre However, this adds some special tokens `[CLS]` and `[SEP]` and the subword tokenization creates a mismatch between the input and labels. A single word corresponding to a single label may now be split into two subwords. You'll need to realign the tokens and labels by: -1. Mapping all tokens to their corresponding word with the [`word_ids`](https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#tokenizers.Encoding.word_ids) method. +1. Mapping all tokens to their corresponding word with the [`word_ids`](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.BatchEncoding.word_ids) method. 2. Assigning the label `-100` to the special tokens `[CLS]` and `[SEP]` so they're ignored by the PyTorch loss function. 3. Only labeling the first token of a given word. Assign `-100` to other subtokens from the same word. diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index a9c7de2151f593..57e7d896117444 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -13,10 +13,10 @@ title: (번역중) Pipelines for inference - local: autoclass_tutorial title: 자동 클래스로 사전 학습된 인스턴스 로드하기 - - local: in_translation - title: (번역중) Preprocess - - local: in_translation - title: (번역중) Fine-tune a pretrained model + - local: preprocessing + title: 전처리 + - local: training + title: 사전 학습된 모델 미세 조정하기 - local: in_translation title: (번역중) Distributed training with 🤗 Accelerate - local: in_translation @@ -49,8 +49,8 @@ - local: in_translation title: (번역중) Text generation strategies - sections: - - local: in_translation - title: (번역중) Text classification + - local: tasks/sequence_classification + title: 텍스트 분류 - local: in_translation title: (번역중) Token classification - local: in_translation @@ -146,7 +146,7 @@ - local: in_translation title: (번역중) Checks on a Pull Request title: (번역중) 기여하기 - - local: in_translation + - local: notebooks title: (번역중) 🤗 Transformers Notebooks - local: in_translation title: (번역중) Community resources diff --git a/docs/source/ko/autoclass_tutorial.mdx b/docs/source/ko/autoclass_tutorial.mdx index 36e36599da3490..124583fb5478f3 100644 --- a/docs/source/ko/autoclass_tutorial.mdx +++ b/docs/source/ko/autoclass_tutorial.mdx @@ -12,7 +12,8 @@ specific language governing permissions and limitations under the License. # AutoClass로 사전 학습된 인스턴스 로드[[Load pretrained instances with an AutoClass]] -트랜스포머 아키텍처가 매우 다양하기 때문에 체크포인트에 맞는 아키텍처를 생성하는 것이 어려울 수 있습니다. 라이브러리를 쉽고 간단하며 유연하게 사용하기 위한 Transformer 핵심 철학의 일환으로, `AutoClass`는 주어진 체크포인트에서 올바른 아키텍처를 자동으로 추론하여 로드합니다. `from_pretrained()` 메서드를 사용하면 모든 아키텍처에 대해 사전 학습된 모델을 빠르게 로드할 수 있으므로 모델을 처음부터 학습하는 데 시간과 리소스를 투입할 필요가 없습니다. 이러한 유형의 체크포인트에 구애받지 않는 코드를 생성한다는 것은 코드가 한 체크포인트에서 작동한다면 아키텍처가 다르더라도 유사한 작업에 대해 학습된 것이라면 다른 체크포인트에서도 작동한다는 것을 의미합니다. 
+트랜스포머 아키텍처가 매우 다양하기 때문에 체크포인트에 맞는 아키텍처를 생성하는 것이 어려울 수 있습니다. 라이브러리를 쉽고 간단하며 유연하게 사용하기 위한 Transformer 핵심 철학의 일환으로, `AutoClass`는 주어진 체크포인트에서 올바른 아키텍처를 자동으로 추론하여 로드합니다. `from_pretrained()` 메서드를 사용하면 모든 아키텍처에 대해 사전 학습된 모델을 빠르게 로드할 수 있으므로 모델을 처음부터 학습하는 데 시간과 리소스를 투입할 필요가 없습니다. +체크포인트에 구애받지 않는 코드를 생성한다는 것은 코드가 한 체크포인트에서 작동하면 아키텍처가 다르더라도 다른 체크포인트(유사한 작업에 대해 학습된 경우)에서도 작동한다는 것을 의미합니다. @@ -23,7 +24,7 @@ specific language governing permissions and limitations under the License. 이 튜토리얼에서는 다음을 학습합니다: * 사전 학습된 토크나이저 로드하기. -* 사전 학습된 이미지 프로세서 로드하기 +* 사전 학습된 이미지 프로세서 로드하기. * 사전 학습된 특징 추출기 로드하기. * 사전 훈련된 프로세서 로드하기. * 사전 학습된 모델 로드하기. @@ -39,7 +40,7 @@ specific language governing permissions and limitations under the License. >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ``` -그리고 다음 아래와 같이 입력을 토큰화합니다: +그리고 아래와 같이 입력을 토큰화합니다: ```py >>> sequence = "In a hole in the ground there lived a hobbit." diff --git a/docs/source/ko/notebooks.mdx b/docs/source/ko/notebooks.mdx new file mode 100644 index 00000000000000..ead906183348a4 --- /dev/null +++ b/docs/source/ko/notebooks.mdx @@ -0,0 +1 @@ +# 열심히 번역 중입니다. 조금 이따 만나요! \ No newline at end of file diff --git a/docs/source/ko/preprocessing.mdx b/docs/source/ko/preprocessing.mdx new file mode 100644 index 00000000000000..6b9aff451003c2 --- /dev/null +++ b/docs/source/ko/preprocessing.mdx @@ -0,0 +1,535 @@ + + +# 전처리[[preprocess]] + +[[open-in-colab]] + +모델을 학습하려면 데이터셋을 모델에 맞는 입력 형식으로 전처리 해야 합니다. 데이터가 텍스트, 이미지 또는 오디오인지 여부에 관계없이 데이터를 텐서 배치로 변환하고 조립할 필요가 있습니다. 🤗 Transformers는 모델에 대한 데이터를 준비하는 데 도움이 되는 일련의 전처리 클래스를 제공합니다. 이 튜토리얼에서는 다음 내용을 배울 수 있습니다. + +* 텍스트는 [Tokenizer](./main_classes/tokenizer)를 사용하여 텍스트를 토큰 시퀀스로 변환하고 토큰의 숫자 표현을 만든 후 텐서로 조립합니다. +* 음성 및 오디오는 [Feature extractor](./main_classes/feature_extractor)를 사용하여 오디오 파형에서 시퀀스 특성을 파악하여 텐서로 변환합니다. +* 이미지 입력은 [ImageProcessor](./main_classes/image)을 사용하여 이미지를 텐서로 변환합니다. +* 멀티모달 입력은 [Processor](./main_classes/processors)을 사용하여 토크나이저와 특성 추출기 또는 이미지 프로세서를 결합합니다. + + + +`AutoProcessor`는 **항상** 작동하며 토크나이저, 이미지 프로세서, 특성 추출기 또는 프로세서 등 사용 중인 모델에 맞는 클래스를 자동으로 선택합니다. + + + +시작하기 전에 🤗 Datasets를 설치하여 실험에 사용할 데이터를 불러올 수 있습니다: + +```bash +pip install datasets +``` + +## 자연어처리[[natural-language-processing]] + + + +텍스트 데이터를 전처리하기 위한 기본 도구는 [tokenizer](main_classes/tokenizer)입니다. 토크나이저는 일련의 규칙에 따라 텍스트를 *토큰*으로 나눕니다. 토큰은 숫자로 변환되고 텐서는 모델 입력이 됩니다. 모델에 필요한 추가 입력은 토크나이저에 의해 추가됩니다. + + + +사전 훈련된 모델을 사용할 계획이라면 모델과 함께 사전 훈련된 토크나이저를 사용하는 것이 중요합니다. 이렇게 하면 텍스트가 사전 훈련 말뭉치와 동일한 방식으로 분할되고 사전 훈련 중에 동일한 해당 토큰-인덱스 쌍(일반적으로 *vocab*이라고 함)을 사용합니다. + + + +시작하려면 [`AutoTokenizer.from_pretrained`] 메소드를 사용하여 사전 훈련된 토크나이저를 불러오세요. 모델과 함께 사전 훈련된 *vocab*을 다운로드합니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +``` + +그 다음으로 텍스트를 토크나이저에 넣어주세요: + +```py +>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") +>>> print(encoded_input) +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +토크나이저는 세 가지 중요한 항목을 포함한 사전을 반환합니다: + +* [input_ids](glossary#input-ids)는 문장의 각 토큰에 해당하는 인덱스입니다. +* [attention_mask](glossary#attention-mask)는 토큰을 처리해야 하는지 여부를 나타냅니다. +* [token_type_ids](glossary#token-type-ids)는 두 개 이상의 시퀀스가 있을 때 토큰이 속한 시퀀스를 식별합니다. 
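+
+예를 들어 두 문장을 쌍으로 전달하면 `token_type_ids`가 각 토큰이 어느 시퀀스에 속하는지 구분하는 것을 직접 확인할 수 있습니다. 아래는 위에서 불러온 토크나이저를 그대로 사용한다고 가정한 간단한 예시이며, 문장은 설명을 위해 임의로 고른 것입니다:
+
+```py
+>>> pair_input = tokenizer("How old are you?", "I'm 6 years old")
+>>> pair_input["token_type_ids"]  # 첫 번째 문장의 토큰은 0, 두 번째 문장의 토큰은 1로 표시됩니다
+```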
+ +`input_ids`를 디코딩하여 입력을 반환합니다: + +```py +>>> tokenizer.decode(encoded_input["input_ids"]) +'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' +``` + +토크나이저가 두 개의 특수한 토큰(분류 토큰 CLS와 분할 토큰 SEP)을 문장에 추가했습니다. +모든 모델에 특수한 토큰이 필요한 것은 아니지만, 필요한 경우 토크나이저가 자동으로 추가합니다. + +전처리할 문장이 여러 개 있는 경우 이를 리스트로 토크나이저에 전달합니다: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_inputs = tokenizer(batch_sentences) +>>> print(encoded_inputs) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1]]} +``` + +### 패딩[[pad]] + +모델 입력인 텐서는 균일한 모양을 가져야 하는데, 문장의 길이가 항상 같지 않아서 문제가 될 수 있습니다. 패딩은 짧은 문장에 특수한 *패딩 토큰*을 추가하여 텐서가 직사각형 모양이 되도록 하는 전략입니다. + +`padding` 매개변수를 `True`로 설정하여 배치의 짧은 시퀀스를 가장 긴 시퀀스와 일치하도록 패딩합니다. + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +길이가 짧은 첫 문장과 세 번째 문장은 이제 `0`으로 채워집니다. + +### 생략[[truncation]] + +한편, 때로는 시퀀스가 모델에서 처리하기에 너무 길 수도 있습니다. 이 경우, 시퀀스를 더 짧은 길이로 줄일 필요가 있습니다. + +모델에서 허용하는 최대 길이로 시퀀스를 자르려면 `truncation` 매개변수를 `True`로 설정하세요: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + + + +다양한 패딩 및 생략 인수에 대해 더 알아보려면 [Padding and truncation](./pad_truncation) 개념 가이드를 확인해보세요. + + + +### 텐서 만들기[[build-tensors]] + +마지막으로, 토크나이저가 모델에 공급되는 실제 텐서를 반환하도록 합니다. + +`return_tensors` 매개변수를 PyTorch의 경우 `pt`, TensorFlow의 경우 `tf`로 설정하세요: + + + + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... 
"What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") +>>> print(encoded_input) +{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} +``` + + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") +>>> print(encoded_input) +{'input_ids': , + 'token_type_ids': , + 'attention_mask': } +``` + + + +## 오디오[[audio]] + +오디오 작업에는 데이터셋을 모델에 준비하기 위해 [특성 추출기](main_classes/feature_extractor)가 필요합니다. 특성 추출기는 원시 오디오 데이터에서 특성를 추출하고 이를 텐서로 변환하는 것이 목적입니다. + +오디오 데이터셋에 특성 추출기를 사용하는 방법을 보려면 [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) 데이터셋을 가져오세요. (데이터셋을 가져오는 방법은 🤗 [데이터셋 튜토리얼](https://huggingface.co/docs/datasets/load_hub.html)에서 자세히 설명하고 있습니다.) + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +``` + +`audio` 열의 첫 번째 요소에 접근하여 입력을 살펴보세요. `audio` 열을 호출하면 오디오 파일을 자동으로 가져오고 리샘플링합니다. + +```py +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +이렇게 하면 세 가지 항목이 반환됩니다: + +* `array`는 1D 배열로 가져와서 (필요한 경우) 리샘플링된 음성 신호입니다. +* `path`는 오디오 파일의 위치를 가리킵니다. +* `sampling_rate`는 음성 신호에서 초당 측정되는 데이터 포인트 수를 나타냅니다. + +이 튜토리얼에서는 [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) 모델을 사용합니다. 모델 카드를 보면 Wav2Vec2가 16kHz 샘플링된 음성 오디오를 기반으로 사전 학습된 것을 알 수 있습니다. +모델을 사전 학습하는 데 사용된 데이터셋의 샘플링 레이트와 오디오 데이터의 샘플링 레이트가 일치해야 합니다. 데이터의 샘플링 레이트가 다르면 데이터를 리샘플링해야 합니다. + +1. 🤗 Datasets의 [`~datasets.Dataset.cast_column`] 메소드를 사용하여 샘플링 레이트를 16kHz로 업샘플링하세요: + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +2. 오디오 파일을 리샘플링하기 위해 `audio` 열을 다시 호출합니다: + +```py +>>> dataset[0]["audio"] +{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., + 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 16000} +``` + +다음으로, 입력을 정규화하고 패딩하는 특성 추출기를 가져오세요. 텍스트 데이터의 경우, 더 짧은 시퀀스에 대해 `0`이 추가됩니다. 오디오 데이터에도 같은 개념이 적용됩니다. +특성 추출기는 배열에 대해 `0`(묵음으로 해석)을 추가합니다. + +[`AutoFeatureExtractor.from_pretrained`]를 사용하여 특성 추출기를 가져오세요: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") +``` + +오디오 `array`를 특성 추출기에 전달하세요. 또한, 특성 추출기에 `sampling_rate` 인수를 추가하여 발생할 수 있는 조용한 오류(silent errors)를 더 잘 디버깅하는 것을 권장합니다. 
+ +```py +>>> audio_input = [dataset[0]["audio"]["array"]] +>>> feature_extractor(audio_input, sampling_rate=16000) +{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., + 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} +``` + +토크나이저와 마찬가지로 배치 내에서 가변적인 시퀀스를 처리하기 위해 패딩 또는 생략을 적용할 수 있습니다. 이 두 개의 오디오 샘플의 시퀀스 길이를 확인해보세요: + +```py +>>> dataset[0]["audio"]["array"].shape +(173398,) + +>>> dataset[1]["audio"]["array"].shape +(106496,) +``` + +오디오 샘플의 길이가 동일하도록 데이터셋을 전처리하는 함수를 만들어 보세요. 최대 샘플 길이를 지정하면, 특성 추출기가 해당 길이에 맞춰 시퀀스를 패딩하거나 생략합니다: + +```py +>>> def preprocess_function(examples): +... audio_arrays = [x["array"] for x in examples["audio"]] +... inputs = feature_extractor( +... audio_arrays, +... sampling_rate=16000, +... padding=True, +... max_length=100000, +... truncation=True, +... ) +... return inputs +``` + +`preprocess_function`을 데이터셋의 처음 몇 가지 예제에 적용해보세요: + +```py +>>> processed_dataset = preprocess_function(dataset[:5]) +``` + +이제 샘플 길이가 모두 같고 지정된 최대 길이에 맞게 되었습니다. 드디어 전처리된 데이터셋을 모델에 전달할 수 있습니다! + +```py +>>> processed_dataset["input_values"][0].shape +(100000,) + +>>> processed_dataset["input_values"][1].shape +(100000,) +``` + +## 컴퓨터 비전[[computer-vision]] + +컴퓨터 비전 작업의 경우, 모델에 대한 데이터셋을 준비하기 위해 [이미지 프로세서](main_classes/image_processor)가 필요합니다. +이미지 전처리는 이미지를 모델이 예상하는 입력으로 변환하는 여러 단계로 이루어집니다. +이러한 단계에는 크기 조정, 정규화, 색상 채널 보정, 이미지의 텐서 변환 등이 포함됩니다. + + + +이미지 전처리는 이미지 증강 기법을 몇 가지 적용한 뒤에 할 수도 있습니다. +이미지 전처리 및 이미지 증강은 모두 이미지 데이터를 변형하지만, 서로 다른 목적을 가지고 있습니다: + +* 이미지 증강은 과적합(over-fitting)을 방지하고 모델의 견고성(resiliency)을 높이는 데 도움이 되는 방식으로 이미지를 수정합니다. +밝기와 색상 조정, 자르기, 회전, 크기 조정, 확대/축소 등 다양한 방법으로 데이터를 증강할 수 있습니다. +그러나 증강으로 이미지의 의미가 바뀌지 않도록 주의해야 합니다. +* 이미지 전처리는 이미지가 모델이 예상하는 입력 형식과 일치하도록 보장합니다. +컴퓨터 비전 모델을 미세 조정할 때 이미지는 모델이 초기에 훈련될 때와 정확히 같은 방식으로 전처리되어야 합니다. + +이미지 증강에는 원하는 라이브러리를 사용할 수 있습니다. 이미지 전처리에는 모델과 연결된 `ImageProcessor`를 사용합니다. + + + +[food101](https://huggingface.co/datasets/food101) 데이터셋을 가져와서 컴퓨터 비전 데이터셋에서 이미지 프로세서를 어떻게 사용하는지 알아보세요. +데이터셋 불러오는 방법은 🤗 [데이터셋 튜토리얼](https://huggingface.co/docs/datasets/load_hub.html)를 참고하세요. + + + +데이터셋이 상당히 크기 때문에 🤗 Datasets의 `split` 매개변수를 사용하여 학습 분할에서 작은 샘플만 가져오세요! + + + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("food101", split="train[:100]") +``` + +다음으로, 🤗 Datasets의 [`image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) 기능으로 이미지를 확인해보세요: + +```py +>>> dataset[0]["image"] +``` + +
+ +
+ +[`AutoImageProcessor.from_pretrained`]로 이미지 프로세서를 가져오세요: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +``` + +먼저 이미지 증강 단계를 추가해 봅시다. 아무 라이브러리나 사용해도 괜찮지만, 이번 튜토리얼에서는 torchvision의 [`transforms`](https://pytorch.org/vision/stable/transforms.html) 모듈을 사용하겠습니다. +다른 데이터 증강 라이브러리를 사용하는 방법이 알고 싶다면, [Albumentations](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) 또는 [Kornia notebooks](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)에서 배울 수 있습니다. + +1. [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html)로 [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html)와 [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) 등의 변환을 몇 가지 연결하세요. +참고로 크기 조정에 필요한 이미지의 크기 요구사항은 `image_processor`에서 가져올 수 있습니다. +일부 모델은 정확한 높이와 너비를 요구하지만, 제일 짧은 변의 길이(`shortest_edge`)만 정의된 모델도 있습니다. + +```py +>>> from torchvision.transforms import RandomResizedCrop, ColorJitter, Compose + +>>> size = ( +... image_processor.size["shortest_edge"] +... if "shortest_edge" in image_processor.size +... else (image_processor.size["height"], image_processor.size["width"]) +... ) + +>>> _transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5)]) +``` + +2. 모델은 입력으로 [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values)를 받습니다. +`ImageProcessor`는 이미지 정규화 및 적절한 텐서 생성을 처리할 수 있습니다. +배치 이미지에 대한 이미지 증강 및 이미지 전처리를 결합하고 `pixel_values`를 생성하는 함수를 만듭니다: + +```py +>>> def transforms(examples): +... images = [_transforms(img.convert("RGB")) for img in examples["image"]] +... examples["pixel_values"] = image_processor(images, do_resize=False, return_tensors="pt")["pixel_values"] +... return examples +``` + + + +위의 예에서는 이미지 증강 중에 이미지 크기를 조정했기 때문에 `do_resize=False`로 설정하고, 해당 `image_processor`에서 `size` 속성을 활용했습니다. +이미지 증강 중에 이미지 크기를 조정하지 않은 경우 이 매개 변수를 생략하세요. +기본적으로 `ImageProcessor`가 크기 조정을 처리합니다. + +증강 변환 과정에서 이미지를 정규화하려면 `image_processor.image_mean` 및 `image_processor.image_std` 값을 사용하세요. + + + +3. 🤗 Datasets의 [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform)를 사용하여 실시간으로 변환을 적용합니다: + +```py +>>> dataset.set_transform(transforms) +``` + +4. 이제 이미지에 액세스하면 이미지 프로세서가 `pixel_values`를 추가한 것을 알 수 있습니다. +드디어 처리된 데이터셋을 모델에 전달할 수 있습니다! + +```py +>>> dataset[0].keys() +``` + +다음은 변형이 적용된 후의 이미지입니다. 이미지가 무작위로 잘려나갔고 색상 속성이 다릅니다. + +```py +>>> import numpy as np +>>> import matplotlib.pyplot as plt + +>>> img = dataset[0]["pixel_values"] +>>> plt.imshow(img.permute(1, 2, 0)) +``` + +
+ +
+ + + +`ImageProcessor`는 객체 감지, 시맨틱 세그멘테이션(semantic segmentation), 인스턴스 세그멘테이션(instance segmentation), 파놉틱 세그멘테이션(panoptic segmentation)과 같은 작업에 대한 후처리 방법을 제공합니다. +이러한 방법은 모델의 원시 출력을 경계 상자나 세그멘테이션 맵과 같은 의미 있는 예측으로 변환해줍니다. + + + +### 패드[[pad]] + +예를 들어, [DETR](./model_doc/detr)와 같은 경우에는 모델이 학습할 때 크기 조정 증강을 적용합니다. +이로 인해 배치 내 이미지 크기가 다를 수 있습니다. +[`DetrImageProcessor`]의 [`DetrImageProcessor.pad_and_create_pixel_mask`]를 사용하고 사용자 지정 `collate_fn`을 정의해서 배치 이미지를 처리할 수 있습니다. + +```py +>>> def collate_fn(batch): +... pixel_values = [item["pixel_values"] for item in batch] +... encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt") +... labels = [item["labels"] for item in batch] +... batch = {} +... batch["pixel_values"] = encoding["pixel_values"] +... batch["pixel_mask"] = encoding["pixel_mask"] +... batch["labels"] = labels +... return batch +``` + +## 멀티모달[[multimodal]] + +멀티모달 입력이 필요한 작업의 경우, 모델에 데이터셋을 준비하기 위한 [프로세서](main_classes/processors)가 필요합니다. +프로세서는 토크나이저와 특성 추출기와 같은 두 가지 처리 객체를 결합합니다. + +[LJ Speech](https://huggingface.co/datasets/lj_speech) 데이터셋을 로드하여 자동 음성 인식(ASR)을 위한 프로세서를 사용하는 방법을 확인하세요. +(데이터셋을 로드하는 방법에 대한 자세한 내용은 🤗 [데이터셋 튜토리얼](https://huggingface.co/docs/datasets/load_hub.html)에서 볼 수 있습니다.) + +```py +>>> from datasets import load_dataset + +>>> lj_speech = load_dataset("lj_speech", split="train") +``` + +ASR에서는 `audio`와 `text`에만 집중하면 되므로, 다른 열들을 제거할 수 있습니다: + +```py +>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) +``` + +이제 `audio`와 `text`열을 살펴보세요: + +```py +>>> lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} + +>>> lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' +``` + +기존에 사전 학습된 모델에서 사용된 데이터셋과 새로운 오디오 데이터셋의 샘플링 레이트를 일치시키기 위해 오디오 데이터셋의 샘플링 레이트를 [리샘플링](preprocessing#audio)해야 합니다! + +```py +>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +[`AutoProcessor.from_pretrained`]로 프로세서를 가져오세요: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") +``` + +1. `array`에 들어 있는 오디오 데이터를 `input_values`로 변환하고 `text`를 토큰화하여 `labels`로 변환하는 함수를 만듭니다. +모델의 입력은 다음과 같습니다: + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + +... return example +``` + +2. 샘플을 `prepare_dataset` 함수에 적용하세요: + +```py +>>> prepare_dataset(lj_speech[0]) +``` + +이제 프로세서가 `input_values`와 `labels`를 추가하고, 샘플링 레이트도 올바르게 16kHz로 다운 샘플링했습니다. +드디어 처리된 데이터셋을 모델에 전달할 수 있습니다! \ No newline at end of file diff --git a/docs/source/ko/tasks/sequence_classification.mdx b/docs/source/ko/tasks/sequence_classification.mdx new file mode 100644 index 00000000000000..32cf216d7b4cca --- /dev/null +++ b/docs/source/ko/tasks/sequence_classification.mdx @@ -0,0 +1,391 @@ + + +# 텍스트 분류[[text-classification]] + +[[open-in-colab]] + + + +텍스트 분류는 자연어 처리의 일종으로, 텍스트에 레이블 또는 클래스를 지정하는 작업입니다. 많은 대기업이 다양한 실용적인 응용 분야에서 텍스트 분류를 운영하고 있습니다. 가장 인기 있는 텍스트 분류 형태 중 하나는 감성 분석으로, 텍스트 시퀀스에 🙂 긍정, 🙁 부정 또는 😐 중립과 같은 레이블을 지정합니다. + +이 가이드에서 학습할 내용은: + +1. 
[IMDb](https://huggingface.co/datasets/imdb) 데이터셋에서 [DistilBERT](https://huggingface.co/distilbert-base-uncased)를 파인 튜닝하여 영화 리뷰가 긍정적인지 부정적인지 판단합니다. +2. 추론을 위해 파인 튜닝 모델을 사용합니다. + + +이 튜토리얼에서 설명하는 작업은 다음 모델 아키텍처에 의해 지원됩니다: + + + +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + + +시작하기 전에, 필요한 모든 라이브러리가 설치되어 있는지 확인하세요: + +```bash +pip install transformers datasets evaluate +``` + +Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에 공유하는 것을 권장합니다. 메시지가 표시되면, 토큰을 입력하여 로그인하세요: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## IMDb 데이터셋 가져오기[[load-imdb-dataset]] + +먼저 🤗 Datasets 라이브러리에서 IMDb 데이터셋을 가져옵니다: + +```py +>>> from datasets import load_dataset + +>>> imdb = load_dataset("imdb") +``` + +그런 다음 예시를 살펴봅시다: + +```py +>>> imdb["test"][0] +{ + "label": 0, + "text": "I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) 
While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.", +} +``` + +이 데이터셋에는 두 가지 필드가 있습니다: + +- `text`: 영화 리뷰 텍스트 +- `label`: `0`은 부정적인 리뷰, `1`은 긍정적인 리뷰를 나타냅니다. + +## 전처리[[preprocess]] + +다음 단계는 DistilBERT 토크나이저를 가져와서 `text` 필드를 전처리하는 것입니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +``` + +`text`를 토큰화하고 시퀀스가 DistilBERT의 최대 입력 길이보다 길지 않도록 자르기 위한 전처리 함수를 생성하세요: + +```py +>>> def preprocess_function(examples): +... return tokenizer(examples["text"], truncation=True) +``` + +전체 데이터셋에 전처리 함수를 적용하려면, 🤗 Datasets [`~datasets.Dataset.map`] 함수를 사용하세요. 데이터셋의 여러 요소를 한 번에 처리하기 위해 `batched=True`로 설정함으로써 데이터셋 `map`를 더 빠르게 처리할 수 있습니다: + +```py +tokenized_imdb = imdb.map(preprocess_function, batched=True) +``` + +이제 [`DataCollatorWithPadding`]를 사용하여 예제 배치를 만들어봅시다. 데이터셋 전체를 최대 길이로 패딩하는 대신, *동적 패딩*을 사용하여 배치에서 가장 긴 길이에 맞게 문장을 패딩하는 것이 효율적입니다. + + + +```py +>>> from transformers import DataCollatorWithPadding + +>>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) +``` + + +```py +>>> from transformers import DataCollatorWithPadding + +>>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") +``` + + + +## 평가하기[[evaluate]] + +훈련 중 모델의 성능을 평가하기 위해 메트릭을 포함하는 것이 유용합니다. 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리를 사용하여 빠르게 평가 방법을 로드할 수 있습니다. 이 작업에서는 [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) 메트릭을 가져옵니다. (메트릭을 가져오고 계산하는 방법에 대해서는 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour)를 참조하세요): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +그런 다음 `compute_metrics` 함수를 만들어서 예측과 레이블을 계산하여 정확도를 계산하도록 [`~evaluate.EvaluationModule.compute`]를 호출합니다: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +이제 `compute_metrics` 함수는 준비되었고, 훈련 과정을 설정할 때 다시 살펴볼 예정입니다. + +## 훈련[[train]] + +모델을 훈련하기 전에, `id2label`와 `label2id`를 사용하여 예상되는 id와 레이블의 맵을 생성하세요: + +```py +>>> id2label = {0: "NEGATIVE", 1: "POSITIVE"} +>>> label2id = {"NEGATIVE": 0, "POSITIVE": 1} +``` + + + + + +[`Trainer`]를 사용하여 모델을 파인 튜닝하는 방법에 익숙하지 않은 경우, [여기](../training#train-with-pytorch-trainer)의 기본 튜토리얼을 확인하세요! + + + +이제 모델을 훈련시킬 준비가 되었습니다! [`AutoModelForSequenceClassification`]로 DistilBERT를 가쳐오고 예상되는 레이블 수와 레이블 매핑을 지정하세요: + +```py +>>> from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer + +>>> model = AutoModelForSequenceClassification.from_pretrained( +... "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id +... 
) +``` + +이제 세 단계만 거치면 끝입니다: + +1. [`TrainingArguments`]에서 하이퍼파라미터를 정의하세요. `output_dir`는 모델을 저장할 위치를 지정하는 유일한 파라미터입니다. 이 모델을 Hub에 업로드하기 위해 `push_to_hub=True`를 설정합니다. (모델을 업로드하기 위해 Hugging Face에 로그인해야 합니다.) 각 에폭이 끝날 때마다, [`Trainer`]는 정확도를 평가하고 훈련 체크포인트를 저장합니다. +2. [`Trainer`]에 훈련 인수와 모델, 데이터셋, 토크나이저, 데이터 수집기 및 `compute_metrics` 함수를 전달하세요. +3. [`~Trainer.train`]을 호출하여 모델을 파인 튜닝하세요. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_model", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=2, +... weight_decay=0.01, +... evaluation_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_imdb["train"], +... eval_dataset=tokenized_imdb["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + + + +[`Trainer`]는 `tokenizer`를 전달하면 기본적으로 동적 패딩을 적용합니다. 이 경우, 명시적으로 데이터 수집기를 지정할 필요가 없습니다. + + + +훈련이 완료되면, [`~transformers.Trainer.push_to_hub`] 메소드를 사용하여 모델을 Hub에 공유할 수 있습니다. + +```py +>>> trainer.push_to_hub() +``` + + + + +Keras를 사용하여 모델을 파인 튜닝하는 방법에 익숙하지 않은 경우, [여기](../training#train-a-tensorflow-model-with-keras)의 기본 튜토리얼을 확인하세요! + +TensorFlow에서 모델을 파인 튜닝하려면, 먼저 옵티마이저 함수와 학습률 스케줄, 그리고 일부 훈련 하이퍼파라미터를 설정해야 합니다: + +```py +>>> from transformers import create_optimizer +>>> import tensorflow as tf + +>>> batch_size = 16 +>>> num_epochs = 5 +>>> batches_per_epoch = len(tokenized_imdb["train"]) // batch_size +>>> total_train_steps = int(batches_per_epoch * num_epochs) +>>> optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +그런 다음 [`TFAutoModelForSequenceClassification`]을 사용하여 예상되는 레이블 수와 레이블 매핑과 함께 DistilBERT를 로드합니다: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained( +... "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id +... ) +``` + +[`~transformers.TFPreTrainedModel.prepare_tf_dataset`]을 사용하여 데이터셋을 `tf.data.Dataset` 형식으로 변환합니다: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_imdb["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_imdb["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +[`compile`](https://keras.io/api/models/model_training_apis/#compile-method)을 사용하여 훈련할 모델을 구성합니다: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) +``` + +훈련을 시작하기 전에 설정해야 할 마지막 두 가지는 예측에서 정확도를 계산하고, 모델을 Hub에 업로드할 방법을 제공하는 것입니다. 모두 [Keras callbacks](../main_classes/keras_callbacks)를 사용하여 수행됩니다. + +[`~transformers.KerasMetricCallback`]에 `compute_metrics` 함수를 전달하여 정확도를 계산합니다: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +[`~transformers.PushToHubCallback`]에서 모델과 토크나이저를 업로드할 위치를 지정합니다: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +...
) +``` + +그런 다음 콜백을 함께 묶습니다: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +드디어, 모델 훈련을 시작할 준비가 되었습니다! [`fit`](https://keras.io/api/models/model_training_apis/#fit-method)에 훈련 데이터셋, 검증 데이터셋, 에폭의 수 및 콜백을 전달하여 파인 튜닝합니다: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) +``` + +훈련이 완료되면, 모델이 자동으로 Hub에 업로드되어 모든 사람이 사용할 수 있습니다! + + + + + +텍스트 분류를 위한 모델을 파인 튜닝하는 자세한 예제는 다음 [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb) 또는 [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)를 참조하세요. + + + +## 추론[[inference]] + +좋아요, 이제 모델을 파인 튜닝했으니 추론에 사용할 수 있습니다! + +추론을 수행하고자 하는 텍스트를 가져와봅시다: + +```py +>>> text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three." +``` + +파인 튜닝된 모델로 추론을 시도하는 가장 간단한 방법은 [`pipeline`]를 사용하는 것입니다. 모델로 감정 분석을 위한 `pipeline`을 인스턴스화하고, 텍스트를 전달해보세요: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model") +>>> classifier(text) +[{'label': 'POSITIVE', 'score': 0.9994940757751465}] +``` + +원한다면, `pipeline`의 결과를 수동으로 복제할 수도 있습니다. + + + +텍스트를 토큰화하고 PyTorch 텐서를 반환합니다. + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") +>>> inputs = tokenizer(text, return_tensors="pt") +``` + +입력을 모델에 전달하고 `logits`을 반환합니다: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +가장 높은 확률을 가진 클래스를 모델의 `id2label` 매핑을 사용하여 텍스트 레이블로 변환합니다: + +```py +>>> predicted_class_id = logits.argmax().item() +>>> model.config.id2label[predicted_class_id] +'POSITIVE' +``` + + +텍스트를 토큰화하고 TensorFlow 텐서를 반환합니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") +>>> inputs = tokenizer(text, return_tensors="tf") +``` + +입력값을 모델에 전달하고 `logits`을 반환합니다: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") +>>> logits = model(**inputs).logits +``` + +가장 높은 확률을 가진 클래스를 모델의 `id2label` 매핑을 사용하여 텍스트 레이블로 변환합니다: + +```py +>>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) +>>> model.config.id2label[predicted_class_id] +'POSITIVE' +``` + + diff --git a/docs/source/ko/training.mdx b/docs/source/ko/training.mdx new file mode 100644 index 00000000000000..d299eb8c7ba465 --- /dev/null +++ b/docs/source/ko/training.mdx @@ -0,0 +1,424 @@ + + +# 사전 학습된 모델 미세 튜닝하기[[Fine-tune a pretrained model]] + +[[open-in-colab]] + +사전 학습된 모델을 사용하면 상당한 이점이 있습니다. 계산 비용과 탄소발자국을 줄이고, 처음부터 모델을 학습시킬 필요 없이 최신 모델을 사용할 수 있습니다. 🤗 Transformers는 다양한 작업을 위해 사전 학습된 수천 개의 모델에 액세스할 수 있습니다. 사전 학습된 모델을 사용하는 경우, 자신의 작업과 관련된 데이터셋을 사용해 학습합니다. 이것은 미세 튜닝이라고 하는 매우 강력한 훈련 기법입니다. 이 튜토리얼에서는 당신이 선택한 딥러닝 프레임워크로 사전 학습된 모델을 미세 튜닝합니다: + +* 🤗 Transformers로 사전 학습된 모델 미세 튜닝하기 [`Trainer`]. +* Keras를 사용하여 TensorFlow에서 사전 학습된 모델을 미세 튜닝하기. +* 기본 PyTorch에서 사전 학습된 모델을 미세 튜닝하기. + + + +## 데이터셋 준비[[Prepare a dataset]] + + + +사전 학습된 모델을 미세 튜닝하기 위해서 데이터셋을 다운로드하고 훈련할 수 있도록 준비하세요. 이전 튜토리얼에서 훈련을 위해 데이터를 처리하는 방법을 보여드렸는데, 지금이 배울 걸 되짚을 기회입니다! 
+ +먼저 [Yelp 리뷰](https://huggingface.co/datasets/yelp_review_full) 데이터 세트를 로드합니다: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("yelp_review_full") +>>> dataset["train"][100] +{'label': 0, + 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} +``` + +텍스트를 처리하고 서로 다른 길이의 시퀀스 패딩 및 잘라내기 전략을 포함하려면 토크나이저가 필요합니다. 데이터셋을 한 번에 처리하려면 🤗 Dataset [`map`](https://huggingface.co/docs/datasets/process.html#map) 메서드를 사용하여 전체 데이터셋에 전처리 함수를 적용하세요: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + + +>>> def tokenize_function(examples): +... return tokenizer(examples["text"], padding="max_length", truncation=True) + + +>>> tokenized_datasets = dataset.map(tokenize_function, batched=True) +``` + +필요한 경우 미세 튜닝을 위해 데이터셋의 작은 부분 집합을 만들어 미세 튜닝 작업 시간을 줄일 수 있습니다: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + + + +## Train + +여기서부터는 사용하려는 프레임워크에 해당하는 섹션을 따라야 합니다. 오른쪽 사이드바의 링크를 사용하여 원하는 프레임워크로 이동할 수 있으며, 특정 프레임워크의 모든 콘텐츠를 숨기려면 해당 프레임워크 블록의 오른쪽 상단에 있는 버튼을 사용하면 됩니다! + + + + + +## 파이토치 Trainer로 훈련하기[[Train with PyTorch Trainer]] + +🤗 Transformers는 🤗 Transformers 모델 훈련에 최적화된 [`Trainer`] 클래스를 제공하여 훈련 루프를 직접 작성하지 않고도 쉽게 훈련을 시작할 수 있습니다. [`Trainer`] API는 로깅(logging), 경사 누적(gradient accumulation), 혼합 정밀도(mixed precision) 등 다양한 훈련 옵션과 기능을 지원합니다. + +먼저 모델을 가져오고 예상되는 레이블 수를 지정합니다. Yelp 리뷰 [데이터셋 카드](https://huggingface.co/datasets/yelp_review_full#data-fields)에서 5개의 레이블이 있음을 알 수 있습니다: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + + + +사전 훈련된 가중치 중 일부가 사용되지 않고 일부 가중치가 무작위로 표시된다는 경고가 표시됩니다. +걱정마세요. 이것은 올바른 동작입니다! 사전 학습된 BERT 모델의 헤드는 폐기되고 무작위로 초기화된 분류 헤드로 대체됩니다. 이제 사전 학습된 모델의 지식으로 시퀀스 분류 작업을 위한 새로운 모델 헤드를 미세 튜닝 합니다. + + + +### 하이퍼파라미터 훈련[[Training hyperparameters]] + +다음으로 정할 수 있는 모든 하이퍼파라미터와 다양한 훈련 옵션을 활성화하기 위한 플래그를 포함하는 [`TrainingArguments`] 클래스를 생성합니다. 
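+
+예를 들어 학습률, 배치 크기, 에폭 수 같은 값은 아래와 같이 지정할 수 있습니다. 아래 숫자는 설명을 위해 임의로 고른 예시 값일 뿐이므로, 실제로는 작업에 맞게 조정해야 합니다:
+
+```py
+>>> from transformers import TrainingArguments
+
+>>> # 예시 값입니다. 실제 작업에 맞게 조정하세요.
+>>> example_args = TrainingArguments(
+...     output_dir="test_trainer",
+...     learning_rate=2e-5,
+...     per_device_train_batch_size=8,
+...     num_train_epochs=3,
+...     weight_decay=0.01,
+... )
+```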
+ +이 튜토리얼에서는 기본 훈련 [하이퍼파라미터](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments)로 시작하지만, 자유롭게 실험하여 여러분들에게 맞는 최적의 설정을 찾을 수 있습니다. + +훈련에서 체크포인트(checkpoints)를 저장할 위치를 지정합니다: + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments(output_dir="test_trainer") +``` + +### Evaluate + +[`Trainer`]는 훈련 중에 모델 성능을 자동으로 평가하지 않습니다. 평가 지표를 계산하고 보고할 함수를 [`Trainer`]에 전달해야 합니다. +[🤗 Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리는 [`evaluate.load`](https://huggingface.co/spaces/evaluate-metric/accuracy) 함수로 로드할 수 있는 간단한 [`accuracy`]함수를 제공합니다 (자세한 내용은 [둘러보기](https://huggingface.co/docs/evaluate/a_quick_tour)를 참조하세요): + +```py +>>> import numpy as np +>>> import evaluate + +>>> metric = evaluate.load("accuracy") +``` + +`metric`에서 [`~evaluate.compute`]를 호출하여 예측의 정확도를 계산합니다. 예측을 `compute`에 전달하기 전에 예측을 로짓으로 변환해야 합니다(모든 🤗 Transformers 모델은 로짓으로 반환한다는 점을 기억하세요): + +```py +>>> def compute_metrics(eval_pred): +... logits, labels = eval_pred +... predictions = np.argmax(logits, axis=-1) +... return metric.compute(predictions=predictions, references=labels) +``` + +미세 튜닝 중에 평가 지표를 모니터링하려면 훈련 인수에 `evaluation_strategy` 파라미터를 지정하여 각 에폭이 끝날 때 평가 지표를 확인할 수 있습니다: + +```py +>>> from transformers import TrainingArguments, Trainer + +>>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +``` + +### Trainer + +모델, 훈련 인수, 훈련 및 테스트 데이터셋, 평가 함수가 포함된 [`Trainer`] 객체를 만듭니다: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +그리고 [`~transformers.Trainer.train`]을 호출하여 모델을 미세 튜닝합니다: + +```py +>>> trainer.train() +``` + + + + + + +## Keras로 텐서플로우 모델 훈련하기[[Train a TensorFlow model with Keras]] + +Keras API를 사용하여 텐서플로우에서 🤗 Transformers 모델을 훈련할 수도 있습니다! + +### Keras용 데이터 로드[[Loading data for Keras]] + +Keras API로 🤗 Transformers 모델을 학습시키려면 데이터셋을 Keras가 이해할 수 있는 형식으로 변환해야 합니다. +데이터 세트가 작은 경우, 전체를 NumPy 배열로 변환하여 Keras로 전달하면 됩니다. +더 복잡한 작업을 수행하기 전에 먼저 이 작업을 시도해 보겠습니다. + +먼저 데이터 세트를 로드합니다. [GLUE 벤치마크](https://huggingface.co/datasets/glue)의 CoLA 데이터 세트를 사용하겠습니다. +간단한 바이너리 텍스트 분류 작업이므로 지금은 훈련 데이터 분할만 사용합니다. + +```py +from datasets import load_dataset + +dataset = load_dataset("glue", "cola") +dataset = dataset["train"] # Just take the training split for now +``` + +다음으로 토크나이저를 로드하고 데이터를 NumPy 배열로 토큰화합니다. 레이블은 이미 0과 1로 된 리스트이기 때문에 토큰화하지 않고 바로 NumPy 배열로 변환할 수 있습니다! + +```py +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True) +# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras +tokenized_data = dict(tokenized_data) + +labels = np.array(dataset["label"]) # Label is already an array of 0 and 1 +``` + +마지막으로 모델을 로드, [`compile`](https://keras.io/api/models/model_training_apis/#compile-method), [`fit`](https://keras.io/api/models/model_training_apis/#fit-method)합니다: + +```py +from transformers import TFAutoModelForSequenceClassification +from tensorflow.keras.optimizers import Adam + +# Load and compile our model +model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased") +# Lower learning rates are often better for fine-tuning transformers +model.compile(optimizer=Adam(3e-5)) + +model.fit(tokenized_data, labels) +``` + + + +모델을 `compile()`할 때 손실 인수를 모델에 전달할 필요가 없습니다! 
+이 인수를 비워두면 허깅 페이스 모델은 작업과 모델 아키텍처에 적합한 손실을 자동으로 선택합니다. +원한다면 언제든지 직접 손실을 지정하여 이를 재정의할 수 있습니다! + + + +이 접근 방식은 소규모 데이터 집합에서는 잘 작동하지만, 대규모 데이터 집합에서는 문제가 될 수 있습니다. 왜 그럴까요? +토큰화된 배열과 레이블을 메모리에 완전히 로드하고 NumPy는 "들쭉날쭉한" 배열을 처리하지 않기 때문에, +모든 토큰화된 샘플을 전체 데이터셋에서 가장 긴 샘플의 길이만큼 패딩해야 합니다. 이렇게 하면 배열이 훨씬 더 커지고 이 패딩 토큰으로 인해 학습 속도도 느려집니다! + +### 데이터를 tf.data.Dataset으로 로드하기[[Loading data as a tf.data.Dataset]] + +학습 속도가 느려지는 것을 피하려면 데이터를 `tf.data.Dataset`으로 로드할 수 있습니다. 원한다면 +`tf.data` 파이프라인을 직접 작성할 수도 있지만, 이 작업을 간편하게 수행할 수 있는 두 가지 방법이 있습니다: + +- [`~TFPreTrainedModel.prepare_tf_dataset`]: 대부분의 경우 이 방법을 권장합니다. 모델의 메서드이기 때문에 모델을 검사하여 모델 입력으로 사용할 수 있는 열을 자동으로 파악하고 +나머지는 버려서 더 단순하고 성능이 좋은 데이터 집합을 만들 수 있습니다. +- [`~datasets.Dataset.to_tf_dataset`]: 이 방법은 좀 더 낮은 수준이며, 포함할 `columns`과 `label_cols`을 정확히 지정하여 +데이터셋을 생성하는 방식을 세밀하게 제어하고 싶을 때 유용합니다. + +[`~TFPreTrainedModel.prepare_tf_dataset`]을 사용하려면 먼저 다음 코드 샘플과 같이 토크나이저 출력을 데이터셋에 열로 추가해야 합니다: + +```py +def tokenize_dataset(data): + # Keys of the returned dictionary will be added to the dataset as columns + return tokenizer(data["text"]) + + +dataset = dataset.map(tokenize_dataset) +``` + +허깅 페이스 데이터셋은 기본적으로 디스크에 저장되므로 메모리 사용량을 늘리지 않는다는 점을 기억하세요! +열이 추가되면 데이터셋에서 배치를 스트리밍하고 각 배치에 패딩을 추가할 수 있으므로 전체 데이터셋에 패딩을 추가하는 것보다 패딩 토큰의 수를 크게 줄일 수 있습니다. + + +```py +>>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer) +``` + +위의 코드 샘플에서는 배치가 로드될 때 올바르게 패딩할 수 있도록 `prepare_tf_dataset`에 토크나이저를 전달해야 합니다. +데이터셋의 모든 샘플 길이가 같고 패딩이 필요하지 않은 경우 이 인수를 건너뛸 수 있습니다. +샘플을 패딩하는 것보다 더 복잡한 작업(예: 마스킹 언어 모델링을 위한 토큰 손상)을 수행해야 하는 경우, +`collate_fn` 인수를 사용하여 샘플 목록을 배치로 변환하고 원하는 전처리를 적용할 함수를 전달할 수 있습니다. +[예시](https://github.com/huggingface/transformers/tree/main/examples) 또는 +[노트북](https://huggingface.co/docs/transformers/notebooks)을 참조하여 이 접근 방식이 실제로 작동하는 모습을 확인하세요. + +`tf.data.Dataset`을 생성한 후에는 이전과 마찬가지로 모델을 컴파일하고 훈련(fit)할 수 있습니다: + +```py +model.compile(optimizer=Adam(3e-5)) + +model.fit(tf_dataset) +``` + + + + + + +## 기본 파이토치로 훈련하기[[Train in native PyTorch]] + + + + + +[`Trainer`]는 훈련 루프를 처리하며 한 줄의 코드로 모델을 미세 조정할 수 있습니다. 직접 훈련 루프를 작성하는 것을 선호하는 사용자의 경우, 기본 PyTorch에서 🤗 Transformers 모델을 미세 조정할 수도 있습니다. + +이 시점에서 노트북을 다시 시작하거나 다음 코드를 실행해 메모리를 확보해야 할 수 있습니다: + +```py +del model +del trainer +torch.cuda.empty_cache() +``` + +다음으로, 토큰화된 데이터셋(`tokenized_datasets`)을 수동으로 후처리하여 훈련에 사용할 수 있도록 준비합니다. + +1. 모델이 원시 텍스트를 입력으로 허용하지 않으므로 `text` 열을 제거합니다: + + ```py + >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) + ``` + +2. 모델에서 인수의 이름이 `labels`로 지정될 것으로 예상하므로 `label` 열의 이름을 `labels`로 변경합니다: + + ```py + >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + ``` + +3.
데이터셋의 형식을 List 대신 PyTorch 텐서를 반환하도록 설정합니다: + + ```py + >>> tokenized_datasets.set_format("torch") + ``` + +그리고 앞서 표시된 대로 데이터셋의 더 작은 하위 집합을 생성하여 미세 조정 속도를 높입니다: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + +### DataLoader[[DataLoader]] + +훈련 및 테스트 데이터셋에 대한 `DataLoader`를 생성하여 데이터 배치를 반복할 수 있습니다: + +```py +>>> from torch.utils.data import DataLoader + +>>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) +>>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) +``` + +예측을 위한 레이블 개수를 사용하여 모델을 로드합니다: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +### 옵티마이저 및 학습률 스케줄러[[Optimizer and learning rate scheduler]] + +옵티마이저와 학습률 스케줄러를 생성하여 모델을 미세 조정합니다. 파이토치에서 제공하는 [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) 옵티마이저를 사용해 보겠습니다: + +```py +>>> from torch.optim import AdamW + +>>> optimizer = AdamW(model.parameters(), lr=5e-5) +``` + +[`Trainer`]에서 기본 학습률 스케줄러를 생성합니다: + +```py +>>> from transformers import get_scheduler + +>>> num_epochs = 3 +>>> num_training_steps = num_epochs * len(train_dataloader) +>>> lr_scheduler = get_scheduler( +... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps +... ) +``` + +마지막으로, GPU에 액세스할 수 있는 경우 `device`를 지정하여 GPU를 사용하도록 합니다. 그렇지 않으면 CPU에서 훈련하며 몇 분이 아닌 몇 시간이 걸릴 수 있습니다. + +```py +>>> import torch + +>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +>>> model.to(device) +``` + + + +GPU가 없는 경우 [Colaboratory](https://colab.research.google.com/) 또는 [SageMaker StudioLab](https://studiolab.sagemaker.aws/)과 같은 호스팅 노트북으로 클라우드 GPU를 무료로 사용할 수 있습니다. + + + +이제 훈련할 준비가 되었습니다! 🥳 + +### 훈련 루프[[Training loop]] + +훈련 진행 상황을 추적하려면 [tqdm](https://tqdm.github.io/) 라이브러리를 사용하여 훈련 단계 수에 진행률 표시줄을 추가하세요: + +```py +>>> from tqdm.auto import tqdm + +>>> progress_bar = tqdm(range(num_training_steps)) + +>>> model.train() +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... outputs = model(**batch) +... loss = outputs.loss +... loss.backward() + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +### 평가하기[[Evaluate]] + +[`Trainer`]에 평가 함수를 추가한 방법과 마찬가지로, 훈련 루프를 직접 작성할 때도 동일한 작업을 수행해야 합니다. 하지만 이번에는 각 에폭이 끝날 때마다 평가 지표를 계산하여 보고하는 대신, [`~evaluate.add_batch`]를 사용하여 모든 배치를 누적하고 맨 마지막에 평가 지표를 계산합니다. + +```py +>>> import evaluate + +>>> metric = evaluate.load("accuracy") +>>> model.eval() +>>> for batch in eval_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... with torch.no_grad(): +... outputs = model(**batch) + +... logits = outputs.logits +... predictions = torch.argmax(logits, dim=-1) +... metric.add_batch(predictions=predictions, references=batch["labels"]) + +>>> metric.compute() +``` + + + + + +## 추가 자료[[Additional resources]] + +더 많은 미세 튜닝 예제는 다음을 참조하세요: + +- [🤗 Transformers 예제](https://github.com/huggingface/transformers/tree/main/examples)에는 PyTorch 및 텐서플로우에서 일반적인 NLP 작업을 훈련할 수 있는 스크립트가 포함되어 있습니다. + +- [🤗 Transformers 노트북](notebooks)에는 PyTorch 및 텐서플로우에서 특정 작업을 위해 모델을 미세 튜닝하는 방법에 대한 다양한 노트북이 포함되어 있습니다.
diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 86cd87f233cebb..5a9f5eb16e0faf 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 4f1019a5c4cdf3..ffd98152d77c8c 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index f167cad2290d8b..8e038ac136792c 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 2333b08b583051..e9beb8dcf9171d 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 749582b782e57b..2121602ec3e625 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index e419028e70e07d..4b4fee5b5175ab 100644 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 33e96a1656fa45..e3069943543193 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index c1f6ba26a42779..3f7ef47c6a6796 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 0997858901ebaf..874b7c65124880 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 619697f9b49f90..020a6f10dde9eb 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index a62a0267bf193f..4bb750a0b02481 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index b8336fedce0824..7b8d6798025118 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index c67535edb2672f..9334de8c033188 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 06ab1f29f86fe0..ab955d5b941ad7 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 934b77d503e1aa..a0660f0085f35c 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index f4a358719a5753..6d3987c8feb494 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index e6a1d421b9a7a0..d3377611bdfdba 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 5321d184af2889..fc4b0e0288be92 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index fa59a241422ce4..ba7e82cc6470d8 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 21db900c65bafb..67d9fb8b455dad 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index da56580472fe86..3c0ac4dfbc22af 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index b503755355783d..e1027f5d67b4ad 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 791291dd356edf..00d115646ac472 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index a702e460aaeb26..8c4f4352489a0f 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 79826f0d9db499..419be107b16480 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index c2e0a6828c8a0a..e083e68848ef4b 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index e0c4b313ad7625..37ea3bcfbb9e76 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index f05967f8baf813..1bb4c7bee7b886 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 71c3686f6764b9..c71581f7811cdb 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. 
Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 871fa3013cf920..88139986b28613 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index af71ade1162d81..9e5dd8d31bd274 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index d76ee33ebc01e1..8003128394403a 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index 9cac8736175e0d..d31a6a8ca0350f 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 8c29db4d6db278..29f3e49f0a0727 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 4f7177b8f0bbbe..35359a2fa7089c 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index 7e68a02986efe8..105d5c805676b0 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/language-modeling-tpu/README.md b/examples/tensorflow/language-modeling-tpu/README.md new file mode 100644 index 00000000000000..25381f86d093af --- /dev/null +++ b/examples/tensorflow/language-modeling-tpu/README.md @@ -0,0 +1,110 @@ +# Training a masked language model end-to-end from scratch on TPUs + +In this example, we're going to demonstrate how to train a TensorFlow model from 🤗 Transformers from scratch. If you're interested in some background theory on training Hugging Face models with TensorFlow on TPU, please check out our +[tutorial doc](https://huggingface.co/docs/transformers/main/perf_train_tpu_tf) on this topic! +If you're interested in smaller-scale TPU training from a pre-trained checkpoint, you can also check out the [TPU fine-tuning example](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb). + +This example will demonstrate pre-training language models at the 100M-1B parameter scale, similar to BERT or GPT-2. More concretely, we will show how to train a [RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta) (base model) from scratch on the [WikiText dataset (v1)](https://huggingface.co/datasets/wikitext). + +We've tried to ensure that all the practices we show you here are scalable, though - with relatively few changes, the code could be scaled up to much larger models. + +Google's gargantuan [PaLM model](https://arxiv.org/abs/2204.02311), with +over 500B parameters, is a good example of how far you can go with pure TPU training, though gathering the dataset and the budget to train at that scale is not an easy task! + +### Table of contents + +- [Setting up a TPU-VM](#setting-up-a-tpu-vm) +- [Training a tokenizer](#training-a-tokenizer) +- [Preparing the dataset](#preparing-the-dataset) +- [Training the model](#training-the-model) +- [Inference](#inference) + +## Setting up a TPU-VM + +Since this example focuses on using TPUs, the first step is to set up access to TPU hardware. For this example, we chose to use a TPU v3-8 VM. Follow [this guide](https://cloud.google.com/tpu/docs/run-calculation-tensorflow) to quickly create a TPU VM with TensorFlow pre-installed. + +> 💡 **Note**: You don't need a TPU-enabled hardware for tokenizer training and TFRecord shard preparation. + +## Training a tokenizer + +To train a language model from scratch, the first step is to tokenize text. In most Hugging Face examples, we begin from a pre-trained model and use its tokenizer. However, in this example, we're going to train a tokenizer from scratch as well. The script for this is `train_unigram.py`. 
An example command is: + +```bash +python train_unigram.py --batch_size 1000 --vocab_size 25000 --export_to_hub +``` + +The script will automatically load the `train` split of the WikiText dataset and train a [Unigram tokenizer](https://huggingface.co/course/chapter6/7?fw=pt) on it. + +> 💡 **Note**: In order for `export_to_hub` to work, you must authenticate yourself with the `huggingface-cli`. Run `huggingface-cli login` and follow the on-screen instructions. + +## Preparing the dataset + +The next step is to prepare the dataset. This consists of loading a text dataset from the Hugging Face Hub, tokenizing it, and grouping it into chunks of a fixed length ready for training. The script for this is `prepare_tfrecord_shards.py`. + +The reason we create TFRecord output files from this step is that these files work well with [`tf.data` pipelines](https://www.tensorflow.org/guide/data_performance). This makes them very suitable for scalable TPU training - the dataset can easily be sharded and read in parallel just by tweaking a few parameters in the pipeline. An example command is: + +```bash +python prepare_tfrecord_shards.py \ + --tokenizer_name_or_path tf-tpu/unigram-tokenizer-wikitext \ + --shard_size 5000 \ + --split test \ + --max_length 128 \ + --output_dir gs://tf-tpu-training-resources +``` + +**Notes**: + +* While running the above script, you need to specify the `split` accordingly. The example command above will only process the `test` split of the dataset. +* If you prefix your `output_dir` with `gs://`, the TFRecord shards will be serialized directly to a Google Cloud Storage (GCS) bucket. Ensure that you have already [created the GCS bucket](https://cloud.google.com/storage/docs). +* If you're using a TPU node, you must stream data from a GCS bucket. Otherwise, if you're using a TPU VM, you can store the data locally. You may need to [attach](https://cloud.google.com/tpu/docs/setup-persistent-disk) persistent storage to the VM. +* Additional CLI arguments are also supported. We encourage you to run `python prepare_tfrecord_shards.py -h` to learn more about them. + +## Training the model + +Once that's done, the model is ready for training. By default, training takes place on TPU, but you can use the `--no_tpu` flag to train on CPU for testing purposes. An example command is: + +```bash +python3 run_mlm.py \ + --train_dataset gs://tf-tpu-training-resources/train/ \ + --eval_dataset gs://tf-tpu-training-resources/validation/ \ + --tokenizer tf-tpu/unigram-tokenizer-wikitext \ + --output_dir trained_model +``` + +If you specify a `hub_model_id` when launching training, your model will be pushed to a model repository on the Hugging Face Hub. You can find such an example repository here: +[tf-tpu/roberta-base-epochs-500-no-wd](https://huggingface.co/tf-tpu/roberta-base-epochs-500-no-wd).
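For a longer run that also pushes the trained model to the Hub, a command along the following lines should work. The epoch count, batch size and repository name below are only placeholders, so adjust them to your own setup; all of these flags are defined in `run_mlm.py`, and you can run `python3 run_mlm.py -h` for the full list.

```bash
# The values for --num_epochs, --per_replica_batch_size and --hub_model_id below are
# placeholders - swap in whatever matches your budget and your Hub namespace.
python3 run_mlm.py \
    --train_dataset gs://tf-tpu-training-resources/train/ \
    --eval_dataset gs://tf-tpu-training-resources/validation/ \
    --tokenizer tf-tpu/unigram-tokenizer-wikitext \
    --pretrained_model_config roberta-base \
    --bfloat16 \
    --num_epochs 10 \
    --per_replica_batch_size 64 \
    --output_dir trained_model \
    --hub_model_id your-username/roberta-base-wikitext
```

Passing `--bfloat16` turns on mixed-precision training, which is the recommended lower-precision format on TPU.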
+ +## Inference + +Once the model is trained, you can use 🤗 Pipelines to perform inference: + +```python +from transformers import pipeline + +model_id = "tf-tpu/roberta-base-epochs-500-no-wd" +unmasker = pipeline("fill-mask", model=model_id, framework="tf") +unmasker("Goal of my life is to [MASK].") + +[{'score': 0.1003185287117958, + 'token': 52, + 'token_str': 'be', + 'sequence': 'Goal of my life is to be.'}, + {'score': 0.032648514956235886, + 'token': 5, + 'token_str': '', + 'sequence': 'Goal of my life is to .'}, + {'score': 0.02152673341333866, + 'token': 138, + 'token_str': 'work', + 'sequence': 'Goal of my life is to work.'}, + {'score': 0.019547373056411743, + 'token': 984, + 'token_str': 'act', + 'sequence': 'Goal of my life is to act.'}, + {'score': 0.01939118467271328, + 'token': 73, + 'token_str': 'have', + 'sequence': 'Goal of my life is to have.'}] +``` + +You can also try out inference using the [Inference Widget](https://huggingface.co/tf-tpu/roberta-base-epochs-500-no-wd?text=Goal+of+my+life+is+to+%5BMASK%5D.) from the model page. \ No newline at end of file diff --git a/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py b/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py new file mode 100644 index 00000000000000..93ab29b74201d5 --- /dev/null +++ b/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Script for preparing TFRecord shards for pre-tokenized examples.""" + +import argparse +import logging +import os + +import datasets +import tensorflow as tf + +from transformers import AutoTokenizer + + +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Prepare TFRecord shards from pre-tokenized samples of the wikitext dataset." + ) + parser.add_argument( + "--tokenizer_name_or_path", + type=str, + default="sayakpaul/unigram-tokenizer-wikitext", + help="Tokenizer identifier. Can be a local filepath or a Hub identifier.", + ) + parser.add_argument( + "--shard_size", + type=int, + default=1000, + help="Number of entries to go in a single shard.", + ) + parser.add_argument("--split", type=str, default="train", choices=["train", "test", "validation"]) + parser.add_argument( + "--limit", + default=None, + type=int, + help="Limit the number of shards (used for debugging).", + ) + parser.add_argument( + "--max_length", + type=int, + default=512, + help="Maximum sequence length. For training on TPUs, it helps to have a maximum" + " sequence length that is a multiple of 8.", + ) + parser.add_argument( + "--output_dir", + default="tf-tpu", + type=str, + help="Output directory where the TFRecord shards will be saved. 
If the" + " path is appended with `gs://` ('gs://tf-tpu', for example) then the TFRecord" + " shards will be directly saved to a Google Cloud Storage bucket.", + ) + + args = parser.parse_args() + return args + + +def tokenize_function(tokenizer): + def fn(examples): + return tokenizer(examples["text"]) + + return fn + + +def get_serialized_examples(tokenized_data): + records = [] + for i in range(len(tokenized_data["input_ids"])): + features = { + "input_ids": tf.train.Feature(int64_list=tf.train.Int64List(value=tokenized_data["input_ids"][i])), + "attention_mask": tf.train.Feature( + int64_list=tf.train.Int64List(value=tokenized_data["attention_mask"][i]) + ), + } + features = tf.train.Features(feature=features) + example = tf.train.Example(features=features) + record_bytes = example.SerializeToString() + records.append(record_bytes) + return records + + +def main(args): + wikitext = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split=args.split) + + if args.limit is not None: + max_samples = min(len(wikitext), args.limit) + wikitext = wikitext.select(range(max_samples)) + print(f"Limiting the dataset to {args.limit} entries.") + + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path) + + # Handle output directory creation. + # For serializing into a Google Cloud Storage Bucket, one needs to first + # create a bucket. + if "gs" not in args.output_dir: + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + split_dir = os.path.join(args.output_dir, args.split) + if not os.path.exists(split_dir): + os.makedirs(split_dir) + else: + split_dir = os.path.join(args.output_dir, args.split) + + # Tokenize the whole dataset at once. + tokenize_fn = tokenize_function(tokenizer) + wikitext_tokenized = wikitext.map(tokenize_fn, batched=True, num_proc=4, remove_columns=["text"]) + + # We need to concatenate all our texts together, and then split the result + # into chunks of a fixed size, which we will call block_size. To do this, we + # will use the map method again, with the option batched=True. When we use batched=True, + # the function we pass to map() will be passed multiple inputs at once, allowing us + # to group them into more or fewer examples than we had in the input. + # This allows us to create our new fixed-length samples. The advantage of this + # method is that we don't lose a whole lot of content from the dataset compared to the + # case where we simply tokenize with a pre-defined max_length. + + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, though you could add padding instead if the model supports it + # In this, as in all things, we advise you to follow your heart 🫀 + total_length = (total_length // args.max_length) * args.max_length + # Split by chunks of max_len. 
+ result = { + k: [t[i : i + args.max_length] for i in range(0, total_length, args.max_length)] + for k, t in concatenated_examples.items() + } + return result + + grouped_dataset = wikitext_tokenized.map(group_texts, batched=True, batch_size=1000, num_proc=4) + + shard_count = 0 + total_records = 0 + for shard in range(0, len(grouped_dataset), args.shard_size): + dataset_snapshot = grouped_dataset[shard : shard + args.shard_size] + records_containing = len(dataset_snapshot["input_ids"]) + filename = os.path.join(split_dir, f"wikitext-{shard_count}-{records_containing}.tfrecord") + serialized_examples = get_serialized_examples(dataset_snapshot) + + with tf.io.TFRecordWriter(filename) as out_file: + for i in range(len(serialized_examples)): + example = serialized_examples[i] + out_file.write(example) + print("Wrote file {} containing {} records".format(filename, records_containing)) + + shard_count += 1 + total_records += records_containing + + with open(f"split-{args.split}-records-count.txt", "w") as f: + print(f"Total {args.split} records: {total_records}", file=f) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/tensorflow/language-modeling-tpu/requirements.txt b/examples/tensorflow/language-modeling-tpu/requirements.txt new file mode 100644 index 00000000000000..60bbe767a21427 --- /dev/null +++ b/examples/tensorflow/language-modeling-tpu/requirements.txt @@ -0,0 +1,3 @@ +transformers==4.26.1 +datasets==2.9.0 +tokenizers==0.13.2 diff --git a/examples/tensorflow/language-modeling-tpu/run_mlm.py b/examples/tensorflow/language-modeling-tpu/run_mlm.py new file mode 100644 index 00000000000000..30923b982e1ed7 --- /dev/null +++ b/examples/tensorflow/language-modeling-tpu/run_mlm.py @@ -0,0 +1,307 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Script for training a masked language model on TPU.""" + +import argparse +import logging +import os +import re + +import tensorflow as tf + +from transformers import ( + AutoConfig, + AutoTokenizer, + DataCollatorForLanguageModeling, + PushToHubCallback, + TFAutoModelForMaskedLM, + create_optimizer, +) + + +logger = logging.getLogger(__name__) + +AUTO = tf.data.AUTOTUNE + + +def parse_args(): + parser = argparse.ArgumentParser(description="Train a masked language model on TPU.") + parser.add_argument( + "--pretrained_model_config", + type=str, + default="roberta-base", + help="The model config to use. Note that we don't copy the model's weights, only the config!", + ) + parser.add_argument( + "--tokenizer", + type=str, + default="unigram-tokenizer-wikitext", + help="The name of the tokenizer to load. 
We use the pretrained tokenizer to initialize the model's vocab size.", + ) + + parser.add_argument( + "--per_replica_batch_size", + type=int, + default=8, + help="Batch size per TPU core.", + ) + + parser.add_argument( + "--no_tpu", + action="store_true", + help="If set, run on CPU and don't try to initialize a TPU. Useful for debugging on non-TPU instances.", + ) + + parser.add_argument( + "--tpu_name", + type=str, + help="Name of TPU resource to initialize. Should be blank on Colab, and 'local' on TPU VMs.", + default="local", + ) + + parser.add_argument( + "--tpu_zone", + type=str, + help="Google cloud zone that TPU resource is located in. Only used for non-Colab TPU nodes.", + ) + + parser.add_argument( + "--gcp_project", type=str, help="Google cloud project name. Only used for non-Colab TPU nodes." + ) + + parser.add_argument( + "--bfloat16", + action="store_true", + help="Use mixed-precision bfloat16 for training. This is the recommended lower-precision format for TPU.", + ) + + parser.add_argument( + "--train_dataset", + type=str, + help="Path to training dataset to load. If the path begins with `gs://`" + " then the dataset will be loaded from a Google Cloud Storage bucket.", + ) + + parser.add_argument( + "--shuffle_buffer_size", + type=int, + default=2**18, # Default corresponds to a 1GB buffer for seq_len 512 + help="Size of the shuffle buffer (in samples)", + ) + + parser.add_argument( + "--eval_dataset", + type=str, + help="Path to evaluation dataset to load. If the path begins with `gs://`" + " then the dataset will be loaded from a Google Cloud Storage bucket.", + ) + + parser.add_argument( + "--num_epochs", + type=int, + default=1, + help="Number of epochs to train for.", + ) + + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Learning rate to use for training.", + ) + + parser.add_argument( + "--weight_decay_rate", + type=float, + default=1e-3, + help="Weight decay rate to use for training.", + ) + + parser.add_argument( + "--max_length", + type=int, + default=512, + help="Maximum length of tokenized sequences. Should match the setting used in prepare_tfrecord_shards.py", + ) + + parser.add_argument( + "--mlm_probability", + type=float, + default=0.15, + help="Fraction of tokens to mask during training.", + ) + + parser.add_argument("--output_dir", type=str, required=True, help="Path to save model checkpoints to.") + parser.add_argument("--hub_model_id", type=str, help="Model ID to upload to on the Hugging Face Hub.") + + args = parser.parse_args() + return args + + +def initialize_tpu(args): + try: + if args.tpu_name: + tpu = tf.distribute.cluster_resolver.TPUClusterResolver( + args.tpu_name, zone=args.tpu_zone, project=args.gcp_project + ) + else: + tpu = tf.distribute.cluster_resolver.TPUClusterResolver() + except ValueError: + raise RuntimeError( + "Couldn't connect to TPU! Most likely you need to specify --tpu_name, --tpu_zone, or " + "--gcp_project. When running on a TPU VM, use --tpu_name local." 
+ ) + + tf.config.experimental_connect_to_cluster(tpu) + tf.tpu.experimental.initialize_tpu_system(tpu) + + return tpu + + +def count_samples(file_list): + num_samples = 0 + for file in file_list: + filename = file.split("/")[-1] + sample_count = re.search(r"-\d+-(\d+)\.tfrecord", filename).group(1) + sample_count = int(sample_count) + num_samples += sample_count + + return num_samples + + +def prepare_dataset(records, decode_fn, mask_fn, batch_size, shuffle, shuffle_buffer_size=None): + num_samples = count_samples(records) + dataset = tf.data.Dataset.from_tensor_slices(records) + if shuffle: + dataset = dataset.shuffle(len(dataset)) + dataset = tf.data.TFRecordDataset(dataset, num_parallel_reads=AUTO) + # TF can't infer the total sample count because it doesn't read all the records yet, so we assert it here + dataset = dataset.apply(tf.data.experimental.assert_cardinality(num_samples)) + dataset = dataset.map(decode_fn, num_parallel_calls=AUTO) + if shuffle: + assert shuffle_buffer_size is not None + dataset = dataset.shuffle(args.shuffle_buffer_size) + dataset = dataset.batch(batch_size, drop_remainder=True) + dataset = dataset.map(mask_fn, num_parallel_calls=AUTO) + dataset = dataset.prefetch(AUTO) + return dataset + + +def main(args): + if not args.no_tpu: + tpu = initialize_tpu(args) + strategy = tf.distribute.TPUStrategy(tpu) + else: + strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0") + + if args.bfloat16: + tf.keras.mixed_precision.set_global_policy("mixed_bfloat16") + + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + config = AutoConfig.from_pretrained(args.pretrained_model_config) + config.vocab_size = tokenizer.vocab_size + + training_records = tf.io.gfile.glob(os.path.join(args.train_dataset, "*.tfrecord")) + if not training_records: + raise ValueError(f"No .tfrecord files found in {args.train_dataset}.") + eval_records = tf.io.gfile.glob(os.path.join(args.eval_dataset, "*.tfrecord")) + if not eval_records: + raise ValueError(f"No .tfrecord files found in {args.eval_dataset}.") + + num_train_samples = count_samples(training_records) + + steps_per_epoch = num_train_samples // (args.per_replica_batch_size * strategy.num_replicas_in_sync) + total_train_steps = steps_per_epoch * args.num_epochs + + with strategy.scope(): + model = TFAutoModelForMaskedLM.from_config(config) + model(model.dummy_inputs) # Pass some dummy inputs through the model to ensure all the weights are built + optimizer, schedule = create_optimizer( + num_train_steps=total_train_steps, + num_warmup_steps=total_train_steps // 20, + init_lr=args.learning_rate, + weight_decay_rate=args.weight_decay_rate, + # TODO Add the other Adam parameters? + ) + model.compile(optimizer=optimizer, metrics=["accuracy"]) + + def decode_fn(example): + features = { + "input_ids": tf.io.FixedLenFeature(dtype=tf.int64, shape=(args.max_length,)), + "attention_mask": tf.io.FixedLenFeature(dtype=tf.int64, shape=(args.max_length,)), + } + return tf.io.parse_single_example(example, features) + + # Many of the data collators in Transformers are TF-compilable when return_tensors == "tf", so we can + # use their methods in our data pipeline. 
+ data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm_probability=args.mlm_probability, mlm=True, return_tensors="tf" + ) + + def mask_with_collator(batch): + # TF really needs an isin() function + special_tokens_mask = ( + ~tf.cast(batch["attention_mask"], tf.bool) + | (batch["input_ids"] == tokenizer.cls_token_id) + | (batch["input_ids"] == tokenizer.sep_token_id) + ) + batch["input_ids"], batch["labels"] = data_collator.tf_mask_tokens( + batch["input_ids"], + vocab_size=len(tokenizer), + mask_token_id=tokenizer.mask_token_id, + special_tokens_mask=special_tokens_mask, + ) + return batch + + batch_size = args.per_replica_batch_size * strategy.num_replicas_in_sync + + train_dataset = prepare_dataset( + training_records, + decode_fn=decode_fn, + mask_fn=mask_with_collator, + batch_size=batch_size, + shuffle=True, + shuffle_buffer_size=args.shuffle_buffer_size, + ) + + eval_dataset = prepare_dataset( + eval_records, + decode_fn=decode_fn, + mask_fn=mask_with_collator, + batch_size=batch_size, + shuffle=False, + ) + + callbacks = [] + if args.hub_model_id: + callbacks.append( + PushToHubCallback(output_dir=args.output_dir, hub_model_id=args.hub_model_id, tokenizer=tokenizer) + ) + + model.fit( + train_dataset, + validation_data=eval_dataset, + epochs=args.num_epochs, + callbacks=callbacks, + ) + + model.save_pretrained(args.output_dir) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/tensorflow/language-modeling-tpu/train_unigram.py b/examples/tensorflow/language-modeling-tpu/train_unigram.py new file mode 100644 index 00000000000000..65cd2c75772817 --- /dev/null +++ b/examples/tensorflow/language-modeling-tpu/train_unigram.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Script for training a Unigram tokenizer.""" + +import argparse +import logging + +import datasets +from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors +from tokenizers.models import Unigram +from tokenizers.trainers import UnigramTrainer + +from transformers import AlbertTokenizerFast + + +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Train a unigram tokenizer on the wikitext dataset.") + parser.add_argument( + "--dataset_name", + type=str, + default="wikitext", + help="Name of the training. Explore datasets at: hf.co/datasets.", + ) + parser.add_argument( + "--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset." 
+ ) + parser.add_argument( + "--batch_size", + type=int, + default=1000, + help="Batch size during training.", + ) + parser.add_argument( + "--vocab_size", + type=int, + default=10048, + help="Size of the desired vocabulary.", + ) + parser.add_argument( + "--limit", + default=None, + type=int, + help="Limit the number of shards (used for debugging).", + ) + parser.add_argument( + "--export_to_hub", + action="store_true", + ) + + args = parser.parse_args() + return args + + +def main(args): + wikitext = datasets.load_dataset(args.dataset_name, args.dataset_config, split="train") + + if args.limit is not None: + max_train_samples = min(len(wikitext), args.limit) + wikitext = wikitext.select(range(max_train_samples)) + logger.info(f"Limiting the dataset to {args.limit} entries.") + + def batch_iterator(): + for i in range(0, len(wikitext), args.batch_size): + yield wikitext[i : i + args.batch_size]["text"] + + # Prepare the tokenizer. + tokenizer = Tokenizer(Unigram()) + tokenizer.normalizer = normalizers.Sequence([normalizers.Replace("``", '"'), normalizers.Replace("''", '"')]) + tokenizer.pre_tokenizer = pre_tokenizers.Metaspace() + + # Prepare the trainer. + trainer = UnigramTrainer( + unk_token="", + special_tokens=["[CLS]", "[SEP]", "", "", "[MASK]"], + vocab_size=args.vocab_size, + ) + + logger.info("Training the tokenizer.") + tokenizer.train_from_iterator(batch_iterator(), trainer=trainer) + logger.info("Tokenizer training complete!") + + cls_token_id = tokenizer.token_to_id("[CLS]") + sep_token_id = tokenizer.token_to_id("[SEP]") + tokenizer.post_processor = processors.TemplateProcessing( + single="[CLS]:0 $A:0 [SEP]:0", + pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1", + special_tokens=[ + ("[CLS]", cls_token_id), + ("[SEP]", sep_token_id), + ], + ) + tokenizer.decoder = decoders.Metaspace() + + if args.export_to_hub: + logger.info("Exporting the trained tokenzier to Hub.") + new_tokenizer = AlbertTokenizerFast(tokenizer_object=tokenizer) + new_tokenizer.push_to_hub("unigram-tokenizer-wikitext") + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index d4b5955b408c49..9c6a90f1dc1ba9 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index bcc7b24f233414..7059a9a032120d 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index e2b19746d807a0..b60a2129166d74 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index 09936a3190004b..8aa3d4c7fe8092 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 43f13bd0a561b6..5f45f752c50383 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.28.0.dev0") +check_min_version("4.29.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index d1cba0dfebf0de..278825a5c66048 100644 --- a/setup.py +++ b/setup.py @@ -425,7 +425,7 @@ def run(self): setup( name="transformers", - version="4.28.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.29.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index f2733cad08231e..894b11c1107056 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). 
-__version__ = "4.28.0.dev0" +__version__ = "4.29.0.dev0" from typing import TYPE_CHECKING @@ -243,6 +243,7 @@ "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"], "models.convnextv2": ["CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextV2Config"], "models.cpm": [], + "models.cpmant": ["CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CpmAntConfig", "CpmAntTokenizer"], "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"], "models.cvt": ["CVT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CvtConfig"], "models.data2vec": [ @@ -1325,6 +1326,14 @@ "ConvNextV2PreTrainedModel", ] ) + _import_structure["models.cpmant"].extend( + [ + "CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST", + "CpmAntForCausalLM", + "CpmAntModel", + "CpmAntPreTrainedModel", + ] + ) _import_structure["models.ctrl"].extend( [ "CTRL_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3941,6 +3950,7 @@ from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig from .models.convnextv2 import CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextV2Config + from .models.cpmant import CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP, CpmAntConfig, CpmAntTokenizer from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer from .models.cvt import CVT_PRETRAINED_CONFIG_ARCHIVE_MAP, CvtConfig from .models.data2vec import ( @@ -4889,6 +4899,12 @@ ConvNextV2Model, ConvNextV2PreTrainedModel, ) + from .models.cpmant import ( + CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST, + CpmAntForCausalLM, + CpmAntModel, + CpmAntPreTrainedModel, + ) from .models.ctrl import ( CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, CTRLForSequenceClassification, diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 37268ea34bbeb4..718d2d8d0f1de9 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -801,6 +801,13 @@ def to_dict(self) -> Dict[str, Any]: # Transformers version when serializing the model output["transformers_version"] = __version__ + if hasattr(self, "quantization_config"): + output["quantization_config"] = ( + self.quantization_config.to_dict() + if not isinstance(self.quantization_config, dict) + else self.quantization_config + ) + self.dict_torch_dtype_to_str(output) return output diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index cd363588757bf4..a68ee3b83d9537 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -664,6 +664,8 @@ def tf_mask_tokens( """ import tensorflow as tf + mask_token_id = tf.cast(mask_token_id, inputs.dtype) + input_shape = tf.shape(inputs) # 1 for a special token, 0 for a normal token in the special tokens mask # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) @@ -677,8 +679,9 @@ def tf_mask_tokens( inputs = tf.where(indices_replaced, mask_token_id, inputs) # 10% of the time, we replace masked input tokens with random word - indices_random = self.tf_bernoulli(input_shape, 0.5) & masked_indices & ~indices_replaced - random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=tf.int64) + indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced + random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=inputs.dtype) + inputs = tf.where(indices_random, random_words, inputs) # The rest of the time (10% 
of the time) we keep the masked input tokens unchanged diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index d29e1dbceda7bd..1df7b57c735af3 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -681,7 +681,7 @@ def from_model_config(cls, model_config: PretrainedConfig) -> "GenerationConfig" # Special case: some models have generation attributes set in the decoder. Use them if still unset in the # generation config. - for decoder_name in ("decoder", "generator"): + for decoder_name in ("decoder", "generator", "text_config"): if decoder_name in config_dict: default_generation_config = GenerationConfig() decoder_config = config_dict[decoder_name] diff --git a/src/transformers/generation/streamers.py b/src/transformers/generation/streamers.py index 979405bc136d92..4a6226c0b7b5c4 100644 --- a/src/transformers/generation/streamers.py +++ b/src/transformers/generation/streamers.py @@ -101,6 +101,10 @@ def put(self, value): printable_text = text[self.print_len :] self.token_cache = [] self.print_len = 0 + # If the last token is a CJK character, we print the characters. + elif len(text) > 0 and self._is_chinese_char(ord(text[-1])): + printable_text = text[self.print_len :] + self.print_len += len(printable_text) # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words, # which may change with the subsequent token -- there are probably smarter ways to do this!) else: @@ -127,6 +131,30 @@ def on_finalized_text(self, text: str, stream_end: bool = False): """Prints the new text to stdout. If the stream is ending, also prints a newline.""" print(text, flush=True, end="" if not stream_end else None) + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + class TextIteratorStreamer(TextStreamer): """ diff --git a/src/transformers/generation/tf_utils.py b/src/transformers/generation/tf_utils.py index 749c07d547c7df..cc95cb31a4aa33 100644 --- a/src/transformers/generation/tf_utils.py +++ b/src/transformers/generation/tf_utils.py @@ -837,12 +837,12 @@ def generate( # 6. 
Prepare model inputs which will be used for auto-regressive generation if self.config.is_encoder_decoder: - # if encoder-decoder then `input_ids` come from `decoder_start_token_id` - input_ids = self._prepare_decoder_input_ids_for_generation( - batch_size, + input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( + batch_size=batch_size, + model_input_name=model_input_name, + model_kwargs=model_kwargs, decoder_start_token_id=generation_config.decoder_start_token_id, bos_token_id=generation_config.bos_token_id, - model_kwargs=model_kwargs, ) else: input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") @@ -1095,16 +1095,41 @@ def _prepare_encoder_decoder_kwargs_for_generation( def _prepare_decoder_input_ids_for_generation( self, batch_size: int, + model_input_name: str, + model_kwargs: Dict[str, tf.Tensor], decoder_start_token_id: int = None, bos_token_id: int = None, - model_kwargs: Optional[Dict[str, tf.Tensor]] = None, - ) -> tf.Tensor: - # prepare `input_ids` for decoder if model is encoder-decoder + ) -> Tuple[tf.Tensor, Dict[str, tf.Tensor]]: + """Prepares `decoder_input_ids` for generation with encoder-decoder models""" + # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming, + # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input. if model_kwargs is not None and "decoder_input_ids" in model_kwargs: - return model_kwargs.pop("decoder_input_ids") + decoder_input_ids = model_kwargs.pop("decoder_input_ids") + elif "input_ids" in model_kwargs and model_input_name != "input_ids": + decoder_input_ids = model_kwargs.pop("input_ids") else: - decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id) - return tf.ones((batch_size, 1), dtype=tf.int32) * decoder_start_token_id + decoder_input_ids = None + + # 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that. 
+ decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id) + decoder_input_ids_start = tf.ones((batch_size, 1), dtype=tf.int32) * decoder_start_token_id + + # no user input -> use decoder_start_token_id as decoder_input_ids + if decoder_input_ids is None: + decoder_input_ids = decoder_input_ids_start + # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust + # decoder_attention_mask if provided) + elif tf.reduce_all(decoder_input_ids[:, 0] != decoder_start_token_id): + decoder_input_ids = tf.concat([decoder_input_ids_start, decoder_input_ids], axis=-1) + if "decoder_attention_mask" in model_kwargs: + decoder_attention_mask = model_kwargs["decoder_attention_mask"] + decoder_attention_mask = tf.concat( + (tf.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask), + axis=-1, + ) + model_kwargs["decoder_attention_mask"] = decoder_attention_mask + + return decoder_input_ids, model_kwargs def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int: # retrieve decoder_start_token_id for encoder-decoder models diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index ae12ae2930fcf2..1200fbe5d9b495 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -642,18 +642,44 @@ def _prepare_encoder_decoder_kwargs_for_generation( def _prepare_decoder_input_ids_for_generation( self, batch_size: int, + model_input_name: str, + model_kwargs: Dict[str, torch.Tensor], decoder_start_token_id: int = None, bos_token_id: int = None, - model_kwargs: Optional[Dict[str, torch.Tensor]] = None, device: torch.device = None, - ) -> torch.LongTensor: + ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]: + """Prepares `decoder_input_ids` for generation with encoder-decoder models""" + # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming, + # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input. if model_kwargs is not None and "decoder_input_ids" in model_kwargs: - return model_kwargs.pop("decoder_input_ids") + decoder_input_ids = model_kwargs.pop("decoder_input_ids") + elif "input_ids" in model_kwargs and model_input_name != "input_ids": + decoder_input_ids = model_kwargs.pop("input_ids") else: - decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id) - if device is None: - device = self.device - return torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id + decoder_input_ids = None + + # 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that. 
+ decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id) + if device is None: + device = self.device + decoder_input_ids_start = torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id + + # no user input -> use decoder_start_token_id as decoder_input_ids + if decoder_input_ids is None: + decoder_input_ids = decoder_input_ids_start + # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust + # decoder_attention_mask if provided) + elif (decoder_input_ids[:, 0] != decoder_start_token_id).all().item(): + decoder_input_ids = torch.cat([decoder_input_ids_start, decoder_input_ids], dim=-1) + if "decoder_attention_mask" in model_kwargs: + decoder_attention_mask = model_kwargs["decoder_attention_mask"] + decoder_attention_mask = torch.cat( + (torch.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask), + dim=-1, + ) + model_kwargs["decoder_attention_mask"] = decoder_attention_mask + + return decoder_input_ids, model_kwargs def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int: decoder_start_token_id = ( @@ -1289,17 +1315,14 @@ def generate( # 5. Prepare `input_ids` which will be used for auto-regressive generation if self.config.is_encoder_decoder: - input_ids = self._prepare_decoder_input_ids_for_generation( - batch_size, + input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( + batch_size=batch_size, + model_input_name=model_input_name, + model_kwargs=model_kwargs, decoder_start_token_id=generation_config.decoder_start_token_id, bos_token_id=generation_config.bos_token_id, - model_kwargs=model_kwargs, device=inputs_tensor.device, ) - - # conditional generation for multi-modal models. - if "input_ids" in model_kwargs and model_input_name == "pixel_values": - input_ids = torch.cat([input_ids, model_kwargs.pop("input_ids")], dim=-1) else: input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 756ab50467b410..aeeb9d3d5a8822 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -2313,6 +2313,10 @@ def save_pretrained( files_timestamps = self._get_files_timestamps(save_directory) if saved_model: + # If `torch_dtype` is in the config with a torch dtype class as the value, we need to change it to string. + # (Although TF doesn't care about this attribute, we can't just remove it or set it to `None`.) 
+ if getattr(self.config, "torch_dtype", None) is not None and not isinstance(self.config.torch_dtype, str): + self.config.torch_dtype = str(self.config.torch_dtype).split(".")[1] if signatures is None: if any(spec.dtype == tf.int32 for spec in self.serving.input_signature[0].values()): int64_spec = { diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index c911095e5a04fe..0df8d7e251f95a 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -697,7 +697,15 @@ def _load_state_dict_into_meta_model( # For backward compatibility with older versions of `accelerate` set_module_tensor_to_device(model, param_name, param_device, **set_module_kwargs) else: - set_module_8bit_tensor_to_device(model, param_name, param_device, value=param) + if param.dtype == torch.int8 and param_name.replace("weight", "SCB") in state_dict.keys(): + fp16_statistics = state_dict[param_name.replace("weight", "SCB")] + else: + fp16_statistics = None + + if "SCB" not in param_name: + set_module_8bit_tensor_to_device( + model, param_name, param_device, value=param, fp16_statistics=fp16_statistics + ) return error_msgs, offload_index, state_dict_index @@ -1700,10 +1708,10 @@ def save_pretrained( Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. """ # Checks if the model has been loaded in 8-bit - if getattr(self, "is_loaded_in_8bit", False): + if getattr(self, "is_loaded_in_8bit", False) and getattr(self, "is_8bit_serializable", False): warnings.warn( "You are calling `save_pretrained` to a 8-bit converted model you may likely encounter unexepected" - " behaviors. ", + " behaviors. If you want to save 8-bit models, make sure to have `bitsandbytes>0.37.2` installed.", UserWarning, ) @@ -2165,6 +2173,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P variant = kwargs.pop("variant", None) use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) + if is_bitsandbytes_available(): + is_8bit_serializable = version.parse(importlib_metadata.version("bitsandbytes")) > version.parse("0.37.2") + else: + is_8bit_serializable = False + if trust_remote_code is True: logger.warning( "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is" @@ -2207,6 +2220,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P "`quantization_config` argument at the same time." ) + # in the case a user loads an 8bit model from the Hub and assigns a new quantization_config + if device_map is None: + device_map = "auto" + if low_cpu_mem_usage is None: + low_cpu_mem_usage = True + if load_in_8bit: if not (is_accelerate_available() and is_bitsandbytes_available()): raise ImportError( @@ -2265,6 +2284,43 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P else: model_kwargs = kwargs + if is_8bit_serializable and quantization_config is not None and load_in_8bit: + if hasattr(config, "quantization_config"): + logger.warning( + "You passed `quantization_config` to `from_pretrained` but the model you're loading already has a" + " `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the" + " one you passed to `from_pretrained`." 
+ ) + config.quantization_config = quantization_config + elif is_8bit_serializable and not load_in_8bit and hasattr(config, "quantization_config"): + quantization_config = config.quantization_config + if isinstance(quantization_config, dict): + quantization_config = BitsAndBytesConfig.from_dict(quantization_config, return_unused_kwargs=False) + elif isinstance(quantization_config, BitsAndBytesConfig): + pass + else: + raise ValueError( + f"Invalid type for `quantization_config`: {type(quantization_config)}. Should be a `dict` or a" + " `BitsAndBytesConfig` instance." + ) + + load_in_8bit = quantization_config.load_in_8bit + + if load_in_8bit: + torch_dtype = torch.float16 + + if device_map is None: + device_map = "auto" + + if low_cpu_mem_usage is None: + low_cpu_mem_usage = True + elif not is_8bit_serializable and not load_in_8bit and hasattr(config, "quantization_config"): + logger.warning( + "Detected the presence of a `quantization_config` attribute in the model's configuration but you don't have the correct" + " `bitsandbytes` version to support int8 serialization. Please install the latest version of `bitsandbytes` with " + " `pip install --upgrade bitsandbytes`." + ) + if commit_hash is None: commit_hash = getattr(config, "_commit_hash", None) @@ -2621,6 +2677,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P importlib_metadata.version("bitsandbytes") ) >= version.parse("0.37.0") + model.config.quantization_config = quantization_config + model.is_8bit_serializable = is_8bit_serializable + if isinstance(device_map, str): special_dtypes = {} if load_in_8bit: @@ -3113,6 +3172,10 @@ def _find_mismatched_keys( ) raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") + if load_in_8bit: + unexpected_keys = [elem for elem in unexpected_keys if "SCB" not in elem] + missing_keys = [elem for elem in missing_keys if "SCB" not in elem] + if len(unexpected_keys) > 0: logger.warning( f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index adc9ca0f264c6c..667f3718fa4213 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -50,6 +50,7 @@ convnext, convnextv2, cpm, + cpmant, ctrl, cvt, data2vec, diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 54b77df7458d2c..0f8c045121c749 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -414,9 +414,12 @@ def _set_gradient_checkpointing(self, module: ASTEncoder, value: bool = False) - AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING = r""" Args: - input_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See - [`ASTFeatureExtractor.__call__`] for details. + input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`): + Float values mel features extracted from the raw audio waveform. 
Raw audio waveform can be obtained by + loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the + [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a + tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`] head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index e71257f564b7e1..225fc739eda58c 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -58,6 +58,7 @@ ("convbert", "ConvBertConfig"), ("convnext", "ConvNextConfig"), ("convnextv2", "ConvNextV2Config"), + ("cpmant", "CpmAntConfig"), ("ctrl", "CTRLConfig"), ("cvt", "CvtConfig"), ("data2vec-audio", "Data2VecAudioConfig"), @@ -243,6 +244,7 @@ ("convbert", "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convnext", "CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convnextv2", "CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("cpmant", "CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ctrl", "CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("cvt", "CVT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("data2vec-audio", "DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -419,6 +421,7 @@ ("convnext", "ConvNeXT"), ("convnextv2", "ConvNeXTV2"), ("cpm", "CPM"), + ("cpmant", "CPM-Ant"), ("ctrl", "CTRL"), ("cvt", "CvT"), ("data2vec-audio", "Data2VecAudio"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 26fc67a4d313ee..26c9a1becd9d41 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -57,6 +57,7 @@ ("convbert", "ConvBertModel"), ("convnext", "ConvNextModel"), ("convnextv2", "ConvNextV2Model"), + ("cpmant", "CpmAntModel"), ("ctrl", "CTRLModel"), ("cvt", "CvtModel"), ("data2vec-audio", "Data2VecAudioModel"), @@ -279,6 +280,7 @@ ("camembert", "CamembertForMaskedLM"), ("codegen", "CodeGenForCausalLM"), ("convbert", "ConvBertForMaskedLM"), + ("cpmant", "CpmAntForCausalLM"), ("ctrl", "CTRLLMHeadModel"), ("data2vec-text", "Data2VecTextForMaskedLM"), ("deberta", "DebertaForMaskedLM"), @@ -358,6 +360,7 @@ ("bloom", "BloomForCausalLM"), ("camembert", "CamembertForCausalLM"), ("codegen", "CodeGenForCausalLM"), + ("cpmant", "CpmAntForCausalLM"), ("ctrl", "CTRLLMHeadModel"), ("data2vec-text", "Data2VecTextForCausalLM"), ("electra", "ElectraForCausalLM"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index babcddd6f3e221..4fee20f50b371b 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -127,6 +127,7 @@ "CpmTokenizerFast" if is_tokenizers_available() else None, ), ), + ("cpmant", ("CpmAntTokenizer", None)), ("ctrl", ("CTRLTokenizer", None)), ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/blip/modeling_tf_blip.py b/src/transformers/models/blip/modeling_tf_blip.py index bce6b524a69c24..dcb5509ed43a79 100644 
--- a/src/transformers/models/blip/modeling_tf_blip.py +++ b/src/transformers/models/blip/modeling_tf_blip.py @@ -1020,7 +1020,7 @@ def get_text_features( ) pooled_output = text_outputs[1] - text_features = self.text_projection(pooled_output) + text_features = self.blip.text_projection(pooled_output) return text_features @@ -1057,7 +1057,7 @@ def get_image_features( vision_outputs = self.blip.vision_model(pixel_values=pixel_values, return_dict=return_dict) pooled_output = vision_outputs[1] # pooled_output - image_features = self.visual_projection(pooled_output) + image_features = self.blip.visual_projection(pooled_output) return image_features @@ -1238,7 +1238,7 @@ def generate( >>> outputs = model.generate(**inputs) >>> print(processor.decode(outputs[0], skip_special_tokens=True)) - two cats are laying on a couch + two cats sleeping on a couch ``` """ @@ -1410,7 +1410,6 @@ def call( >>> inputs["labels"] = labels >>> outputs = model(**inputs) >>> loss = outputs.loss - >>> loss.backward() >>> # inference >>> text = "How many cats are in the picture?" diff --git a/src/transformers/models/blip/modeling_tf_blip_text.py b/src/transformers/models/blip/modeling_tf_blip_text.py index 3ddf8539948c26..262b2cb2796621 100644 --- a/src/transformers/models/blip/modeling_tf_blip_text.py +++ b/src/transformers/models/blip/modeling_tf_blip_text.py @@ -462,6 +462,7 @@ def call( next_decoder_cache += (layer_outputs[-1],) if output_attentions: all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) diff --git a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py index 0033be274d5c13..2127da4f6cf902 100644 --- a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py @@ -127,9 +127,9 @@ def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_pa input_ids = torch.arange(0, 77).unsqueeze(0) pixel_values = torch.randn(1, 3, 224, 224) - hf_logits_per_image, hf_logits_per_text = hf_model( - input_ids=input_ids, pixel_values=pixel_values, return_dict=True - )[1:3] + hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True) + hf_logits_per_image = hf_outputs.logits_per_image + hf_logits_per_text = hf_outputs.logits_per_text pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 8a146ccea0d8ba..b5b9a576da8ad7 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -16,7 +16,6 @@ import io import pathlib -import warnings from collections import defaultdict from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -61,6 +60,7 @@ is_torch_available, is_torch_tensor, is_vision_available, + logging, ) @@ -78,6 +78,8 @@ import scipy.stats +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + AnnotationType = Dict[str, Union[int, str, List[Dict]]] @@ -795,10 +797,9 @@ def __init__( do_pad = 
kwargs.pop("pad_and_return_pixel_mask") if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` parameter is deprecated and will be removed in v4.26. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) max_size = kwargs.pop("max_size") else: @@ -822,10 +823,9 @@ def __init__( @property # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.max_size def max_size(self): - warnings.warn( + logger.warning( "The `max_size` parameter is deprecated and will be removed in v4.27. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) return self.size["longest_edge"] @@ -872,7 +872,7 @@ def prepare_annotation( # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): - warnings.warn( + logger.warning_once( "The `prepare` method is deprecated and will be removed in a future version. " "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " "does not return the image anymore.", @@ -882,17 +882,23 @@ def prepare(self, image, target, return_segmentation_masks=False, masks_path=Non # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, *args, **kwargs): - warnings.warn("The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. " + ) return convert_coco_poly_to_mask(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection with DETR->ConditionalDetr def prepare_coco_detection(self, *args, **kwargs): - warnings.warn("The `prepare_coco_detection` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `prepare_coco_detection` method is deprecated and will be removed in a future version. " + ) return prepare_coco_detection_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic def prepare_coco_panoptic(self, *args, **kwargs): - warnings.warn("The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. " + ) return prepare_coco_panoptic_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize @@ -909,10 +915,9 @@ def resize( int, smaller edge of the image will be matched to this number. """ if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` parameter is deprecated and will be removed in v4.26. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) max_size = kwargs.pop("max_size") else: @@ -998,9 +1003,7 @@ def pad_and_create_pixel_mask( data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. """ - warnings.warn( - "This method is deprecated and will be removed in v4.27.0. Please use pad instead.", FutureWarning - ) + logger.warning_once("This method is deprecated and will be removed in v4.27.0. 
Please use pad instead.") # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors images = [to_numpy_array(image) for image in pixel_values_list] return self.pad( @@ -1139,19 +1142,17 @@ def preprocess( The channel dimension format of the image. If not provided, it will be the same as the input image. """ if "pad_and_return_pixel_mask" in kwargs: - warnings.warn( + logger.warning_once( "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead.", - FutureWarning, + "use `do_pad` instead." ) do_pad = kwargs.pop("pad_and_return_pixel_mask") max_size = None if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead.", - FutureWarning, + " `size['longest_edge']` instead." ) size = kwargs.pop("max_size") @@ -1296,10 +1297,9 @@ def post_process(self, outputs, target_sizes): `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model. """ - warnings.warn( + logging.warning_once( "`post_process` is deprecated and will be removed in v5 of Transformers, please use" " `post_process_object_detection`", - FutureWarning, ) out_logits, out_bbox = outputs.logits, outputs.pred_boxes @@ -1560,7 +1560,7 @@ def post_process_panoptic_segmentation( """ if label_ids_to_fuse is None: - warnings.warn("`label_ids_to_fuse` unset. No instance will be fused.") + logger.warning_once("`label_ids_to_fuse` unset. No instance will be fused.") label_ids_to_fuse = set() class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1] diff --git a/src/transformers/models/cpmant/__init__.py b/src/transformers/models/cpmant/__init__.py new file mode 100644 index 00000000000000..8140009b60f156 --- /dev/null +++ b/src/transformers/models/cpmant/__init__.py @@ -0,0 +1,64 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team and The OpenBMB Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
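The `image_processing_conditional_detr.py` hunks earlier in this patch (and the matching Deformable DETR ones later) replace `warnings.warn(..., FutureWarning)` with `logger.warning_once(...)`, so each deprecation message is emitted once per process instead of once per call. A small sketch of the pattern follows; the function and keyword names are illustrative stand-ins, not the real image-processor API:

```python
from transformers.utils import logging

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def resize(image, size=None, **kwargs):
    # `max_size` stands in for any deprecated keyword argument.
    if "max_size" in kwargs:
        # warning_once deduplicates on the message, so repeated preprocessing
        # calls no longer flood the logs with the same deprecation notice.
        logger.warning_once(
            "The `max_size` parameter is deprecated, please use `size['longest_edge']` instead."
        )
        size = {"longest_edge": kwargs.pop("max_size")}
    return image, size
```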
+from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available + + +_import_structure = { + "configuration_cpmant": ["CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CpmAntConfig"], + "tokenization_cpmant": ["CpmAntTokenizer"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_cpmant"] = [ + "CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST", + "CpmAntForCausalLM", + "CpmAntModel", + "CpmAntPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_cpmant import CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP, CpmAntConfig + from .tokenization_cpmant import CpmAntTokenizer + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_cpmant import ( + CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST, + CpmAntForCausalLM, + CpmAntModel, + CpmAntPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/cpmant/configuration_cpmant.py b/src/transformers/models/cpmant/configuration_cpmant.py new file mode 100644 index 00000000000000..56ba9ab31cfdb7 --- /dev/null +++ b/src/transformers/models/cpmant/configuration_cpmant.py @@ -0,0 +1,123 @@ +# coding=utf-8 +# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" CPMAnt model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "openbmb/cpm-ant-10b": "https://huggingface.co/openbmb/cpm-ant-10b/blob/main/config.json" + # See all CPMAnt models at https://huggingface.co/models?filter=cpmant +} + + +class CpmAntConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CpmAntModel`]. It is used to instantiate an + CPMAnt model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the CPMAnt + [openbmb/cpm-ant-10b](https://huggingface.co/openbmb/cpm-ant-10b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30720): + Vocabulary size of the CPMAnt model. Defines the number of different tokens that can be represented by the + `input` passed when calling [`CpmAntModel`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the encoder layers. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads in the Transformer encoder. 
+ dim_head (`int`, *optional*, defaults to 128): + Dimension of attention heads for each attention layer in the Transformer encoder. + dim_ff (`int`, *optional*, defaults to 10240): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 48): + Number of layers of the Transformer encoder. + dropout_p (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + position_bias_num_buckets (`int`, *optional*, defaults to 512): + The number of position_bias buckets. + position_bias_max_distance (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + eps (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + prompt_types (`int`, *optional*, defaults to 32): + The number of prompt types. + prompt_length (`int`, *optional*, defaults to 32): + The length of the prompt. + segment_types (`int`, *optional*, defaults to 32): + The number of segment types. + use_cache (`bool`, *optional*, defaults to `True`): + Whether to use the cache. + init_std (`float`, *optional*, defaults to 1.0): + Initialize parameters with std = init_std. + + Example: + + ```python + >>> from transformers import CpmAntModel, CpmAntConfig + + >>> # Initializing a CPMAnt cpm-ant-10b style configuration + >>> configuration = CpmAntConfig() + + >>> # Initializing a model from the cpm-ant-10b style configuration + >>> model = CpmAntModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "cpmant" + + def __init__( + self, + vocab_size: int = 30720, + hidden_size: int = 4096, + num_attention_heads: int = 32, + dim_head: int = 128, + dim_ff: int = 10240, + num_hidden_layers: int = 48, + dropout_p: float = 0.0, + position_bias_num_buckets: int = 512, + position_bias_max_distance: int = 2048, + eps: float = 1e-6, + init_std: float = 1.0, + prompt_types: int = 32, + prompt_length: int = 32, + segment_types: int = 32, + use_cache: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.prompt_types = prompt_types + self.prompt_length = prompt_length + self.segment_types = segment_types + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.dim_head = dim_head + self.dim_ff = dim_ff + self.num_hidden_layers = num_hidden_layers + self.position_bias_num_buckets = position_bias_num_buckets + self.position_bias_max_distance = position_bias_max_distance + self.dropout_p = dropout_p + self.eps = eps + self.use_cache = use_cache + self.vocab_size = vocab_size + self.init_std = init_std diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py new file mode 100755 index 00000000000000..db0e0f468cefe8 --- /dev/null +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -0,0 +1,880 @@ +# coding=utf-8 +# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch CPMAnt""" + + +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_utils import PreTrainedModel +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_cpmant import CpmAntConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "openbmb/cpm-ant-10b" +_CONFIG_FOR_DOC = "CpmAntConfig" + +CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "openbmb/cpm-ant-10b", + # See all CPMAnt models at https://huggingface.co/models?filter=cpmant +] + + +class CpmAntLayerNorm(nn.Module): + """ + We use Root Mean Square (RMS) Layer Normalization, please see https://arxiv.org/abs/1910.07467 for details." + """ + + def __init__(self, config: CpmAntConfig): + super().__init__() + + self.eps = config.eps + self.dim_norm = config.hidden_size + self.weight = nn.Parameter(torch.empty(config.hidden_size)) + + def forward(self, hidden_states: torch.Tensor): + """ + Args: + hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`) + """ + if hidden_states.size(-1) != self.dim_norm: + raise AssertionError("hidden_states.size(-1) != self.dim_norm") + old_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(dim=-1, keepdim=True) + hidden_states = (hidden_states * torch.rsqrt(variance + self.eps)).to(old_dtype) * self.weight + return hidden_states + + +class CpmAntAttention(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.dim_model = config.hidden_size + self.num_heads = config.num_attention_heads + self.dim_head = config.dim_head + + self.project_q = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False) + self.project_k = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False) + self.project_v = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False) + + self.attention_out = nn.Linear(self.num_heads * self.dim_head, self.dim_model, bias=False) + + self.softmax = torch.nn.Softmax(dim=-1) + + if config.dropout_p is not None: + self.dropout = torch.nn.Dropout(p=config.dropout_p) + else: + self.dropout = None + + def forward( + self, + hidden_q: torch.Tensor, + hidden_kv: torch.Tensor, + attention_mask: torch.BoolTensor, + position_bias: torch.Tensor, + output_attentions: Optional[bool] = False, + past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + use_cache: Optional[bool] = None, + ): + """ + Args: + hidden_q (`torch.Tensor`): + Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences. 
+ hidden_kv (`torch.Tensor` of shape `(batch, len_k, dim_model)`)): + Tensor *key_value* and *query* of shape `(batch, len_k, dim_model)` + attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`): + Avoid invalid areas to participate in the calculation of self-attention. + position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`): + Provide positional information to self-attention block. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + past_key_values (`Tuple[torch.Tensor, torch.Tensor]`, *optional*): + Cached past key and value projection states. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + """ + batch_size = hidden_q.size(0) + len_q = hidden_q.size(1) + len_k = hidden_kv.size(1) + + query = self.project_q(hidden_q) + key = self.project_k(hidden_kv) + value = self.project_v(hidden_kv) + + query = query.view(batch_size, len_q, self.num_heads, self.dim_head).permute(0, 2, 1, 3) + key = key.view(batch_size, len_k, self.num_heads, self.dim_head).permute(0, 2, 1, 3) + value = value.view(batch_size, len_k, self.num_heads, self.dim_head).permute(0, 2, 1, 3) + + if past_key_values is not None: + key = torch.cat([past_key_values[0], key], dim=-2) + value = torch.cat([past_key_values[1], value], dim=-2) + len_k = key.size(-2) + + # (batch_size, num_heads, len_q, dim_head) @ (batch_size, num_heads, dim_head, len_k) -> (batch_size, num_heads, len_q, len_k) + score = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(self.dim_head) + score = score + position_bias + + score = torch.masked_fill( + score, + attention_mask.view(batch_size, 1, len_q, len_k) == torch.tensor(False), + torch.scalar_tensor(float("-inf"), device=score.device, dtype=score.dtype), + ) + score = self.softmax(score) + + score = torch.masked_fill( + score, + attention_mask.view(batch_size, 1, len_q, len_k) == torch.tensor(False), + torch.scalar_tensor(0, device=score.device, dtype=score.dtype), + ) + if output_attentions: + attn_weights = score + else: + attn_weights = None + + if self.dropout is not None: + score = self.dropout(score) + + # (batch_size, num_heads, len_q, len_k) @ (batch_size, num_heads, len_k, dim_head) -> (batch_size, num_heads, len_q, dim_head) + score = torch.matmul(score, value) + + score = score.view(batch_size, self.num_heads, len_q, self.dim_head).permute(0, 2, 1, 3) + score = score.contiguous().view(batch_size, len_q, self.num_heads * self.dim_head) + + score = self.attention_out(score) + + past_key_values = None + if use_cache: + past_key_values = (key, value) + + return score, attn_weights, past_key_values + + +class CpmAntSelfAttentionBlock(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.layernorm_before_attention = CpmAntLayerNorm(config) + self.self_attention = CpmAntAttention(config) + if config.dropout_p: + self.dropout = torch.nn.Dropout(config.dropout_p) + else: + self.dropout = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_bias: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + use_cache: Optional[bool] = None, + ): + """ + Args: + hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`): + Input of transformer block(self-attention block). 
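`CpmAntAttention.forward` above fills masked positions with `-inf` before the softmax and then zeroes the same positions afterwards, which keeps fully masked query rows at zero instead of NaN. A self-contained check of that pattern on toy tensors (shapes chosen arbitrarily, no model weights involved):

```python
import torch

torch.manual_seed(0)
scores = torch.randn(1, 1, 2, 3)  # (batch, num_heads, len_q, len_k)
# Query row 0 may attend to key 0 only; query row 1 is fully masked.
mask = torch.tensor([[[[True, False, False], [False, False, False]]]])

scores = scores.masked_fill(~mask, float("-inf"))
probs = torch.softmax(scores, dim=-1)
# A fully masked row softmaxes to NaN (all exp(-inf) terms sum to zero) ...
assert torch.isnan(probs[0, 0, 1]).all()
# ... so the second masked_fill, as in the attention block above, zeroes it out.
probs = probs.masked_fill(~mask, 0.0)
assert torch.equal(probs[0, 0, 1], torch.zeros(3))
# The single unmasked key of row 0 carries all the probability mass.
assert torch.allclose(probs[0, 0, 0].sum(), torch.tensor(1.0))
```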
It can be the raw embedding of a batch of sequences. + attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`): + Avoid invalid areas to participate in the calculation of self-attention. + position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`): + Provide positional information to self-attention block. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + past_key_values (`Tuple(torch.FloatTensor)`, *optional*): + Cached past key and value projection states. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + """ + outputs = self.layernorm_before_attention(hidden_states) + outputs = self.self_attention( + outputs, outputs, attention_mask, position_bias, output_attentions, past_key_values, use_cache + ) + + outputs, attn_weights, current_key_value = outputs + + if self.dropout is not None: + outputs = self.dropout(outputs) + hidden_states = hidden_states + outputs + + return hidden_states, attn_weights, current_key_value + + +class CpmAntDenseGatedACT(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.w_0 = nn.Linear(config.hidden_size, config.dim_ff, bias=False) + self.w_1 = nn.Linear(config.hidden_size, config.dim_ff, bias=False) + self.act = torch.nn.GELU() + + def forward(self, hidden_states: torch.Tensor): + """Transform an input tensor from one feature space to another via a nonlinear operation + + Args: + hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`) + """ + gate_score = self.act(self.w_0(hidden_states)) + hidden_states = self.w_1(hidden_states) + + hidden_states = gate_score * hidden_states + return hidden_states + + +class CpmAntFeedForward(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.w_in = CpmAntDenseGatedACT(config) + if config.dropout_p is not None: + self.dropout = torch.nn.Dropout(config.dropout_p) + else: + self.dropout = None + + self.w_out = nn.Linear(config.dim_ff, config.hidden_size, bias=False) + + def forward(self, hidden_states: torch.Tensor): + """ + Args: + hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`) + """ + hidden_states = self.w_in(hidden_states) + + if self.dropout is not None: + hidden_states = self.dropout(hidden_states) + + hidden_states = self.w_out(hidden_states) + + return hidden_states + + +class CpmAntFFNBlock(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.layernorm_before_ffn = CpmAntLayerNorm(config) + self.ffn = CpmAntFeedForward(config) + if config.dropout_p: + self.dropout = torch.nn.Dropout(config.dropout_p) + else: + self.dropout = None + + def forward( + self, + hidden_states: torch.Tensor, + ): + """ + Args: + hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`): + Hidden states before feed forward layer. 
+ """ + ln_outputs = self.layernorm_before_ffn(hidden_states) + outputs = self.ffn(ln_outputs) + if self.dropout is not None: + outputs = self.dropout(outputs) + hidden_states = hidden_states + outputs + return hidden_states + + +class CpmAntTransformerBlock(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.self_att = CpmAntSelfAttentionBlock(config) + self.ffn = CpmAntFFNBlock(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_bias: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + use_cache: Optional[bool] = None, + ): + """ + Args: + hidden_states (`torch.Tensor`): + Input to the layer of shape `(batch, seq_len, dim_model)` + attention_mask (`torch.Tensor`): + Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)` + position_bias (`torch.Tensor`): + Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)` + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + past_key_values (`Tuple[torch.Tensor, torch.Tensor])`, *optional*): + Cached past key and value projection states + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + """ + hidden_states = self.self_att( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + output_attentions=output_attentions, + past_key_values=past_key_values, + use_cache=use_cache, + ) + + hidden_states, attn_weights, current_key_value = hidden_states + + hidden_states = self.ffn(hidden_states) + + return hidden_states, attn_weights, current_key_value + + +class CpmAntEncoder(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.num_layers = config.num_hidden_layers + self.layers = nn.ModuleList([CpmAntTransformerBlock(config) for ith in range(self.num_layers)]) + + self.output_layernorm = CpmAntLayerNorm(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_bias: torch.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + use_cache: Optional[bool] = None, + ): + """ + Args: + hidden_states (`torch.Tensor`): + Input to the layer of shape `(batch, seq_len, dim_model)` + attention_mask (`torch.Tensor`): + Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)` + position_bias (`torch.Tensor`): + Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)` + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + past_key_values (`Tuple[torch.Tensor, torch.Tensor])`, *optional*): + Cached past key and value projection states + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ """ + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + current_key_values = () if use_cache else None + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + layer_outputs = layer( + hidden_states, + attention_mask, + position_bias, + output_attentions=output_attentions, + past_key_values=past_key_values[i] if past_key_values else None, + use_cache=use_cache, + ) + hidden_states, attn_weights, current_key_value = layer_outputs + if output_attentions: + all_self_attns += (attn_weights,) + if current_key_value is not None: + current_key_values = current_key_values + (current_key_value,) + + hidden_states = self.output_layernorm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + return hidden_states, current_key_values, all_hidden_states, all_self_attns + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->CPMAnt +class CpmAntIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class CpmAntSegmentPositionEmbedding(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + + self.num_heads = config.num_attention_heads + self.num_buckets = config.position_bias_num_buckets + self.max_distance = config.position_bias_max_distance + self.num_segments = config.segment_types + + self.relative_attention_bias = nn.Parameter( + torch.empty( + config.segment_types * config.segment_types + config.position_bias_num_buckets, + config.num_attention_heads, + ) + ) + + def forward( + self, + key_pos: torch.Tensor, + query_pos: torch.Tensor, + key_segment: torch.Tensor, + query_segment: torch.Tensor, + ): + with torch.no_grad(): + batch = key_pos.size(0) + keylen = key_pos.size(1) + querylen = query_pos.size(1) + + if key_pos.size(0) != query_pos.size(0): + raise AssertionError( + f"key_pos.size(0) should be equal to query_pos.size(0), but got {key_pos.size(0)} and {query_pos.size(0)}!" + ) + if keylen != key_segment.size(1) or querylen != query_segment.size(1): + raise AssertionError( + f"keylen should be equal to key_segment.size(1), but got {keylen} and {key_segment.size(1)}!" + ) + if querylen != query_segment.size(1): + raise AssertionError( + f"querylen should be equal to query_segment.size(1), but got {querylen} and {query_segment.szie(1)}!" 
+ ) + + key_pos = key_pos.view(batch, -1, keylen) + query_pos = query_pos.view(batch, querylen, -1) + key_segment = key_segment.view(batch, -1, keylen) + query_segment = query_segment.view(batch, querylen, -1) + + relative_position_bucket = self._segment_relative_position_bucket(query_segment, key_segment) + relative_position_bucket = relative_position_bucket + self.num_buckets + + # (batch, len_q, len_k) + absolute_position_bucket = self._position_bucket( + torch.arange(keylen, dtype=torch.int32, device=relative_position_bucket.device)[None, :] + - torch.arange(querylen, dtype=torch.int32, device=relative_position_bucket.device)[:, None], + num_buckets=self.num_buckets, + max_distance=self.max_distance, + ) + relative_position_bucket = torch.where( + (key_segment == query_segment), + absolute_position_bucket[None, :, :], + relative_position_bucket, + ) + + # (batch, len_q, len_k, num_heads) + embeds = F.embedding(relative_position_bucket, self.relative_attention_bias) + # (batch, num_heads, len_q, len_k) + embeds = embeds.permute(0, 3, 1, 2).contiguous() + return embeds + + def _segment_relative_position_bucket(self, query_segment, key_segment): + return query_segment * self.num_segments + key_segment + + def _position_bucket(self, relative_position, num_buckets=32, max_distance=128): + relative_buckets = 0 + # always bidirectional in CPMAnt + num_buckets //= 2 + relative_buckets = (relative_position > 0).to(torch.int32) * num_buckets + relative_position = torch.abs(relative_position) + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + relative_postion_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.int32) + relative_postion_if_large = torch.min( + relative_postion_if_large, + torch.full_like(relative_postion_if_large, num_buckets - 1), + ) + relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_postion_if_large) + return relative_buckets + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->CPMAnt +class CpmAntOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class CpmAntPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
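`_position_bucket` above is a bidirectional, T5-style bucketing: half the buckets encode the sign of the offset, small offsets map to exact buckets, and larger offsets are binned logarithmically up to `max_distance`. A standalone re-implementation of that mapping for a handful of offsets, using the same defaults as the method signature (`num_buckets=32`, `max_distance=128`); this is only an illustration, not code from the patch:

```python
import math

import torch


def position_bucket(relative_position: torch.Tensor, num_buckets: int = 32, max_distance: int = 128) -> torch.Tensor:
    # Bidirectional: half of the buckets are reserved for positive offsets.
    num_buckets //= 2
    buckets = (relative_position > 0).to(torch.int32) * num_buckets
    relative_position = torch.abs(relative_position)
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact
    # Offsets beyond max_exact are spread logarithmically over the remaining buckets;
    # this branch is computed for every entry but only used where is_small is False.
    if_large = max_exact + (
        torch.log(relative_position.float() / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).to(torch.int32)
    if_large = torch.min(if_large, torch.full_like(if_large, num_buckets - 1))
    return buckets + torch.where(is_small, relative_position.to(torch.int32), if_large)


offsets = torch.tensor([-64, -8, -1, 0, 1, 8, 64])
# Nearby offsets keep distinct buckets; distant ones share coarser log-spaced bins.
print(position_bucket(offsets).tolist())
```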
+ """ + + config_class = CpmAntConfig + base_model_prefix = "cpmant" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.init_std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.init_std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, CpmAntLayerNorm): + module.weight.data.fill_(1.0) + elif isinstance(module, CpmAntSegmentPositionEmbedding): + module.relative_attention_bias.data.normal_(mean=0.0, std=self.config.init_std) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CpmAntEncoder): + module.gradient_checkpointing = value + + +CPMANT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters + config ([`~CpmAntConfig`]): Model configuration class with all the parameters of the + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CPMANT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare CPMAnt Model outputting raw hidden-states without any specific head on top.", + CPMANT_START_DOCSTRING, +) +class CpmAntModel(CpmAntPreTrainedModel): + def __init__(self, config: CpmAntConfig): + super().__init__(config) + self.encoder = CpmAntEncoder(config) + self.segment_embedding = nn.Embedding(config.segment_types, config.hidden_size) + self.input_embedding = nn.Embedding( + config.vocab_size + config.prompt_types * config.prompt_length, config.hidden_size + ) + self.position_bias = CpmAntSegmentPositionEmbedding(config) + self.prompt_length = config.prompt_length + self.vocab_size = config.vocab_size + + self.post_init() + + def get_input_embeddings(self): + return self.input_embedding + + def set_input_embeddings(self, embeddings, **kwargs): + self.input_embedding = embeddings + + def _prepare_attention_mask(self, input_ids, span, context, length): + batch = input_ids.size(0) + seqlen = input_ids.size(1) + device = input_ids.device + directional_mask_2d = torch.arange(seqlen, device=device) <= torch.arange(seqlen, device=device).view(-1, 1) + attention_mask = context[:, None, :] | ( + context[:, :, None].logical_not() & directional_mask_2d.view(1, seqlen, seqlen) + ) + attention_mask = attention_mask & (span[:, None, :] == span[:, :, None]) + # mask for left padding + mask_1d = ( + torch.tensor(list(range(seqlen - self.prompt_length))[::-1], device=device)[None, :].repeat(batch, 1) + < length[:, None] + ) + mask_1d = torch.cat((torch.ones(batch, self.prompt_length, device=device).bool(), mask_1d), dim=1) + attention_mask = mask_1d.view(batch, seqlen, 1) & mask_1d.view(batch, 1, seqlen) & attention_mask + return attention_mask + + @add_start_docstrings_to_model_forward(CPMANT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + use_cache: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + use_cache = use_cache if use_cache is not None else self.config.use_cache + + # add prompts ahead + if input_ids.dtype != torch.int32: + input_ids = input_ids.to(torch.int32) + dtype, device = input_ids.dtype, input_ids.device + segment = torch.where(input_ids != 0, 2, 0).to(dtype=dtype, device=device) + length = (segment != 0).sum(-1).to(dtype=dtype, device=device) + input_ids = torch.cat( + ( + torch.arange( + self.prompt_length * 2 + self.vocab_size, + self.prompt_length * 3 + self.vocab_size, + dtype=dtype, + device=device, + ).repeat(input_ids.size(0), 1), + input_ids, + ), + dim=1, + ) + batch, seq_length = input_ids.size() + segment = torch.cat((torch.zeros(batch, self.prompt_length, dtype=dtype, device=device), segment), dim=1) + context = torch.full((batch, seq_length), 1, dtype=dtype, device=device) + position = torch.arange(seq_length, dtype=dtype, device=device).repeat(batch, 1) + span = torch.full((batch, seq_length), 0, dtype=dtype, device=device) + + if past_key_values is 
None: + past_length = 0 + past_key_values = tuple([None] * self.encoder.num_layers) + input_ids = input_ids.contiguous() + hidden_states = self.input_embedding(input_ids) + segment_states = self.segment_embedding(segment) + hidden_states = hidden_states + segment_states + else: + past_length = past_key_values[0][0].size(-2) + segment_states = self.segment_embedding(segment) + hidden_states = self.input_embedding(input_ids) + segment_states[:, -1:, :] + + attention_mask = self._prepare_attention_mask(input_ids, span, context, length) + position_bias = self.position_bias(position, position, segment, segment) + + attention_mask = attention_mask[:, past_length:, :] + position_bias = position_bias[:, :, past_length:, :] + hidden_states = hidden_states[:, past_length:, :] + + hidden_states, present_key_values, all_hidden_states, all_attentions = self.encoder( + hidden_states, + attention_mask, + position_bias, + output_attentions, + output_hidden_states, + past_key_values, + use_cache, + ) + + if past_length == 0: + hidden_states = hidden_states[:, self.prompt_length :, :] + # drop the prompt + if all_attentions is not None: + new_attentions = () + for attention in all_attentions: + new_attentions += (attention[:, :, self.prompt_length :, self.prompt_length :],) + all_attentions = new_attentions + if all_hidden_states is not None: + new_hidden_states = () + for hidden_state in all_hidden_states: + new_hidden_states += (hidden_state[:, self.prompt_length :, :],) + all_hidden_states = new_hidden_states + + if not return_dict: + return tuple( + v for v in [hidden_states, present_key_values, all_hidden_states, all_attentions] if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=present_key_values, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +@add_start_docstrings( + """ + The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings). + """, + CPMANT_START_DOCSTRING, +) +class CpmAntForCausalLM(CpmAntPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config: CpmAntConfig): + super().__init__(config) + self.cpmant = CpmAntModel(config) + + # lm_head.weight is tied to cpmant.input_embedding.weight + self.lm_head = nn.Linear( + config.hidden_size, config.vocab_size + config.prompt_types * config.prompt_length, bias=False + ) + self.post_init() + + @add_start_docstrings_to_model_forward(CPMANT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + attention_mask: Optional[torch.Tensor] = None, # dummy parameter for text-generation pipeline + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
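`CpmAntModel.forward` above prepends `prompt_length` learned prompt ids to every sequence and slices them off again before returning, so callers see hidden states aligned with their original tokens. A quick sanity check of that behaviour with a tiny randomly initialized configuration; the sizes are arbitrary and unrelated to the released 10B checkpoint, and the import assumes the top-level exports added elsewhere in this patch:

```python
import torch

from transformers import CpmAntConfig, CpmAntModel

# Tiny throwaway config so the example runs without downloading any weights.
config = CpmAntConfig(
    vocab_size=128, hidden_size=32, num_attention_heads=4, dim_head=8,
    dim_ff=64, num_hidden_layers=2, prompt_length=4,
)
model = CpmAntModel(config).eval()

input_ids = torch.randint(1, 128, (1, 10), dtype=torch.int32)
with torch.no_grad():
    out = model(input_ids)

# prompt_length prompt positions are added internally and dropped again,
# so the returned hidden states cover exactly the 10 original positions.
assert out.last_hidden_state.shape[:2] == (1, 10)
```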
+ + [What are input IDs?](../glossary#input-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + CPMAnt will process attention mask automatically, this parameter is a dummy parameter for + text-generation pipeline. + + Example: + + Text Generation with CpmAntForCausalLM. + ```python + >>> from transformers import CPMAntTokenizer, CpmAntForCausalLM + + >>> texts = "今天天气不错," + >>> model = CpmAntForCausalLM.from_pretrained("openbmb/cpm-ant-10b") + >>> tokenizer = CPMAntTokenizer.from_pretrained("openbmb/cpm-ant-10b") + >>> input_ids = tokenizer(texts, return_tensors="pt") + >>> outputs = model.generate(**input_ids) + >>> output_texts = tokenizer.batch_decode(outputs) + >>> print(output_texts) + ['今天天气不错,阳光明媚,我和妈妈一起去超市买东西。\n在超市里,我看到了一个很好玩的玩具,它的名字叫“机器人”。它有一个圆圆的脑袋,两只圆圆的眼睛,还有一个圆圆的'] + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + model_output = self.cpmant( + input_ids, output_attentions, output_hidden_states, past_key_values, use_cache, return_dict + ) + hidden_states = model_output.last_hidden_state if return_dict else model_output[0] + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + loss_func = CrossEntropyLoss() + loss = loss_func(logits.view(-1, logits.size(-1)), labels.view(-1)) + + if not return_dict: + output = (logits,) + model_output[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=model_output.past_key_values, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions, + ) + + def get_input_embeddings(self): + return self.cpmant.input_embedding + + def set_input_embeddings(self, embeddings): + self.cpmant.input_embedding = embeddings + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, **kwargs): + input_ids = input_ids.int() + # save the memory usage of dummy attention mask + if "attention_mask" in kwargs: + kwargs["attention_mask"] = torch.zeros(1, 1) + + return { + "input_ids": input_ids, + "use_cache": kwargs["use_cache"], + "past_key_values": kwargs.get("past_key_values", None), + } + + def _reorder_cache(self, past_key_values, beam_idx): + past_key_values = [list(each) if each is not None else each for each in past_key_values] + for key_value_layer in past_key_values: + key_value_layer[0] = key_value_layer[0][beam_idx] + 
key_value_layer[1] = key_value_layer[1][beam_idx] + return past_key_values diff --git a/src/transformers/models/cpmant/tokenization_cpmant.py b/src/transformers/models/cpmant/tokenization_cpmant.py new file mode 100644 index 00000000000000..346f28fde66756 --- /dev/null +++ b/src/transformers/models/cpmant/tokenization_cpmant.py @@ -0,0 +1,277 @@ +# coding=utf-8 +# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for CPMAnt.""" +import collections +import os +from typing import List, Optional, Tuple + +from transformers.utils import is_jieba_available, requires_backends + + +if is_jieba_available(): + import jieba + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "openbmb/cpm-ant-10b": "https://huggingface.co/openbmb/cpm-ant-10b/blob/main/vocab.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "openbmb/cpm-ant-10b": 1024, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +class WordpieceTokenizer(object): + def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, token): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + return [self.unk_token] + + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + sub_tokens.append(self.unk_token) + start += 1 + else: + sub_tokens.append(cur_substr) + start = end + + return sub_tokens + + +class CpmAntTokenizer(PreTrainedTokenizer): + """ + Construct a CPMAnt tokenizer. It segments text with jieba and then looks each segment up in a WordPiece vocabulary. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bod_token (`str`, *optional*, defaults to `"<d>"`): + The beginning of document token. + eod_token (`str`, *optional*, defaults to `"</d>"`): + The end of document token. + bos_token (`str`, *optional*, defaults to `"<s>"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"</s>"`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `"<pad>"`): + The token used for padding. + unk_token (`str`, *optional*, defaults to `"<unk>"`): + The unknown token. + line_token (`str`, *optional*, defaults to `"</n>"`): + The line token. + space_token (`str`, *optional*, defaults to `"</_>"`): + The space token.
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + add_prefix_space = False + + def __init__( + self, + vocab_file, + bod_token="<d>", + eod_token="</d>", + bos_token="<s>", + eos_token="</s>", + pad_token="<pad>", + unk_token="<unk>", + line_token="</n>", + space_token="</_>", + padding_side="left", + **kwargs, + ): + requires_backends(self, ["jieba"]) + super().__init__( + bod_token=bod_token, + eod_token=eod_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + unk_token=unk_token, + line_token=line_token, + space_token=space_token, + padding_side=padding_side, + **kwargs, + ) + self.bod_token = bod_token + self.eod_token = eod_token + self.encoder = load_vocab(vocab_file) + self.encoder[" "] = self.encoder[space_token] + self.encoder["\n"] = self.encoder[line_token] + + del self.encoder[space_token] + del self.encoder[line_token] + + self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1])) + self.decoder = {v: k for k, v in self.encoder.items()} + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=self.unk_token) + + @property + def bod_token_id(self): + return self.encoder[self.bod_token] + + @property + def eod_token_id(self): + return self.encoder[self.eod_token] + + @property + def newline_id(self): + return self.encoder["\n"] + + @property + def vocab_size(self) -> int: + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def _tokenize(self, text): + """Tokenize a string.""" + output_tokens = [] + for x in jieba.cut(text, cut_all=False): + output_tokens.extend(self.wordpiece_tokenizer.tokenize(x)) + return output_tokens + + def _decode(self, token_ids, **kwargs): + """Decode ids into a string.""" + token_ids = [i for i in token_ids if i >= 0] + token_ids = [ + x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id + ] + return super()._decode(token_ids, **kwargs) + + def check(self, token): + return token in self.encoder + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return "".join(tokens) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + index = 0 + if " " in self.encoder: + self.encoder["</_>"] = self.encoder[" "] + del self.encoder[" "] + if "\n" in self.encoder: + self.encoder["</n>"] = self.encoder["\n"] + del self.encoder["\n"] + self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1])) + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in self.encoder.items(): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+ " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A CPMAnt sequence has the following format: + + - single sequence: `[BOS] Sequence`. + + Args: + token_ids_0 (`List[int]`): The first tokenized sequence that special tokens will be added. + token_ids_1 (`List[int]`): The optional second tokenized sequence that special tokens will be added. + + Returns: + `List[int]`: The model input with special tokens. + """ + if token_ids_1 is None: + return [self.bos_token_id] + token_ids_0 + return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1 + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): List of IDs. + token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + return [1] + ([0] * len(token_ids_0)) diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 07cafe149e1561..35a18e23edb389 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -16,7 +16,6 @@ import io import pathlib -import warnings from collections import defaultdict from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -61,6 +60,7 @@ is_torch_available, is_torch_tensor, is_vision_available, + logging, ) @@ -77,6 +77,8 @@ import scipy.stats +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + AnnotationType = Dict[str, Union[int, str, List[Dict]]] @@ -793,10 +795,9 @@ def __init__( do_pad = kwargs.pop("pad_and_return_pixel_mask") if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` parameter is deprecated and will be removed in v4.26. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) max_size = kwargs.pop("max_size") else: @@ -820,10 +821,9 @@ def __init__( @property # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.max_size def max_size(self): - warnings.warn( + logger.warning( "The `max_size` parameter is deprecated and will be removed in v4.27. 
" "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) return self.size["longest_edge"] @@ -870,7 +870,7 @@ def prepare_annotation( # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare def prepare(self, image, target, return_segmentation_masks=None, masks_path=None): - warnings.warn( + logger.warning_once( "The `prepare` method is deprecated and will be removed in a future version. " "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " "does not return the image anymore.", @@ -880,17 +880,23 @@ def prepare(self, image, target, return_segmentation_masks=None, masks_path=None # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, *args, **kwargs): - warnings.warn("The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. " + ) return convert_coco_poly_to_mask(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection def prepare_coco_detection(self, *args, **kwargs): - warnings.warn("The `prepare_coco_detection` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `prepare_coco_detection` method is deprecated and will be removed in a future version. " + ) return prepare_coco_detection_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic def prepare_coco_panoptic(self, *args, **kwargs): - warnings.warn("The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. " + ) return prepare_coco_panoptic_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize @@ -907,10 +913,9 @@ def resize( int, smaller edge of the image will be matched to this number. """ if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` parameter is deprecated and will be removed in v4.26. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) max_size = kwargs.pop("max_size") else: @@ -996,9 +1001,7 @@ def pad_and_create_pixel_mask( data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. """ - warnings.warn( - "This method is deprecated and will be removed in v4.27.0. Please use pad instead.", FutureWarning - ) + logger.warning_once("This method is deprecated and will be removed in v4.27.0. Please use pad instead.") # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors images = [to_numpy_array(image) for image in pixel_values_list] return self.pad( @@ -1137,19 +1140,17 @@ def preprocess( The channel dimension format of the image. If not provided, it will be the same as the input image. """ if "pad_and_return_pixel_mask" in kwargs: - warnings.warn( + logger.warning_once( "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead.", - FutureWarning, + "use `do_pad` instead." 
) do_pad = kwargs.pop("pad_and_return_pixel_mask") max_size = None if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead.", - FutureWarning, + " `size['longest_edge']` instead." ) size = kwargs.pop("max_size") @@ -1294,10 +1295,9 @@ def post_process(self, outputs, target_sizes): `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model. """ - warnings.warn( + logger.warning_once( "`post_process` is deprecated and will be removed in v5 of Transformers, please use" " `post_process_object_detection`.", - FutureWarning, ) out_logits, out_bbox = outputs.logits, outputs.pred_boxes diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index eda4fdff167d11..d60f6f838c9ce8 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -15,7 +15,6 @@ """Image processor class for Deformable DETR.""" import pathlib -import warnings from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import numpy as np @@ -56,6 +55,7 @@ is_torch_tensor, is_torchvision_available, is_vision_available, + logging, ) from ...utils.generic import ExplicitEnum, TensorType @@ -71,6 +71,9 @@ import PIL +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + class AnnotionFormat(ExplicitEnum): COCO_DETECTION = "coco_detection" COCO_PANOPTIC = "coco_panoptic" @@ -540,7 +543,7 @@ def prepare_annotation( # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare def prepare(self, image, target, return_segmentation_masks=None, masks_path=None): - warnings.warn( + logger.warning_once( "The `prepare` method is deprecated and will be removed in a future version. " "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " "does not return the image anymore.", @@ -550,17 +553,23 @@ def prepare(self, image, target, return_segmentation_masks=None, masks_path=None # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, *args, **kwargs): - warnings.warn("The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. " + ) return convert_coco_poly_to_mask(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection def prepare_coco_detection(self, *args, **kwargs): - warnings.warn("The `prepare_coco_detection` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `prepare_coco_detection` method is deprecated and will be removed in a future version. " + ) return prepare_coco_detection_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic def prepare_coco_panoptic(self, *args, **kwargs): - warnings.warn("The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. 
" + ) return prepare_coco_panoptic_annotation(*args, **kwargs) def resize( @@ -656,9 +665,7 @@ def pad_and_create_pixel_mask( data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. """ - warnings.warn( - "This method is deprecated and will be removed in v4.27.0. Please use pad instead.", FutureWarning - ) + logger.warning_once("This method is deprecated and will be removed in v4.27.0. Please use pad instead.") # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors images = [to_numpy_array(image) for image in pixel_values_list] return self.pad( @@ -796,10 +803,9 @@ def preprocess( The channel dimension format of the image. If not provided, it will be the same as the input image. """ if "pad_and_return_pixel_mask" in kwargs: - warnings.warn( + logger.warning_once( "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " "use `do_pad` instead.", - FutureWarning, ) do_pad = kwargs.pop("pad_and_return_pixel_mask") diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index eabc6e5e690d34..6fd2e8fdd18412 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -244,7 +244,7 @@ class DetaObjectDetectionOutput(ModelOutput): def _get_clones(module, N): - return nn.ModuleList([module for i in range(N)]) + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) def inverse_sigmoid(x, eps=1e-5): diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index eaeae66c9654e5..f39db7b8de5a9e 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -16,7 +16,6 @@ import io import pathlib -import warnings from collections import defaultdict from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -60,6 +59,7 @@ is_torch_available, is_torch_tensor, is_vision_available, + logging, ) @@ -77,6 +77,8 @@ import scipy.stats +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + AnnotationType = Dict[str, Union[int, str, List[Dict]]] @@ -777,10 +779,9 @@ def __init__( do_pad = kwargs.pop("pad_and_return_pixel_mask") if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` parameter is deprecated and will be removed in v4.26. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) max_size = kwargs.pop("max_size") else: @@ -803,10 +804,9 @@ def __init__( @property def max_size(self): - warnings.warn( + logger.warning( "The `max_size` parameter is deprecated and will be removed in v4.27. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) return self.size["longest_edge"] @@ -850,7 +850,7 @@ def prepare_annotation( return target def prepare(self, image, target, return_segmentation_masks=None, masks_path=None): - warnings.warn( + logger.warning_once( "The `prepare` method is deprecated and will be removed in a future version. " "Please use `prepare_annotation` instead. 
Note: the `prepare_annotation` method " "does not return the image anymore.", @@ -859,15 +859,21 @@ def prepare(self, image, target, return_segmentation_masks=None, masks_path=None return image, target def convert_coco_poly_to_mask(self, *args, **kwargs): - warnings.warn("The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. " + ) return convert_coco_poly_to_mask(*args, **kwargs) def prepare_coco_detection(self, *args, **kwargs): - warnings.warn("The `prepare_coco_detection` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `prepare_coco_detection` method is deprecated and will be removed in a future version. " + ) return prepare_coco_detection_annotation(*args, **kwargs) def prepare_coco_panoptic(self, *args, **kwargs): - warnings.warn("The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. " + ) return prepare_coco_panoptic_annotation(*args, **kwargs) def resize( @@ -883,10 +889,9 @@ def resize( int, smaller edge of the image will be matched to this number. """ if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` parameter is deprecated and will be removed in v4.26. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) max_size = kwargs.pop("max_size") else: @@ -967,9 +972,7 @@ def pad_and_create_pixel_mask( data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. """ - warnings.warn( - "This method is deprecated and will be removed in v4.27.0. Please use pad instead.", FutureWarning - ) + logger.warning_once("This method is deprecated and will be removed in v4.27.0. Please use pad instead.") # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors images = [to_numpy_array(image) for image in pixel_values_list] return self.pad( @@ -1105,19 +1108,17 @@ def preprocess( The channel dimension format of the image. If not provided, it will be the same as the input image. """ if "pad_and_return_pixel_mask" in kwargs: - warnings.warn( + logger.warning_once( "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead.", - FutureWarning, + "use `do_pad` instead." ) do_pad = kwargs.pop("pad_and_return_pixel_mask") max_size = None if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead.", - FutureWarning, + " `size['longest_edge']` instead." ) size = kwargs.pop("max_size") @@ -1263,10 +1264,9 @@ def post_process(self, outputs, target_sizes): `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model. 
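The hunks above deprecate `post_process` on the DETR-family image processors in favor of `post_process_object_detection`. A minimal migration sketch, assuming a local image file and the public `facebook/detr-resnet-50` checkpoint (both are placeholders, and the 0.5 threshold is only illustrative):

```python
import torch
from PIL import Image
from transformers import DetrForObjectDetection, DetrImageProcessor

# illustrative checkpoint; any DETR object-detection checkpoint works the same way
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

image = Image.open("example.jpg")  # placeholder path, assumed to exist
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# preferred replacement for the deprecated `post_process`:
# target_sizes holds one (height, width) pair per image, threshold filters low-score boxes
results = processor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=[image.size[::-1]]
)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```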
""" - warnings.warn( + logger.warning_once( "`post_process` is deprecated and will be removed in v5 of Transformers, please use" " `post_process_object_detection`", - FutureWarning, ) out_logits, out_bbox = outputs.logits, outputs.pred_boxes @@ -1306,10 +1306,9 @@ def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_t `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image in the batch as predicted by the model. """ - warnings.warn( + logger.warning_once( "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use" " `post_process_semantic_segmentation`.", - FutureWarning, ) out_logits, raw_masks = outputs.logits, outputs.pred_masks empty_label = out_logits.shape[-1] - 1 @@ -1358,10 +1357,9 @@ def post_process_instance(self, results, outputs, orig_target_sizes, max_target_ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an image in the batch as predicted by the model. """ - warnings.warn( + logger.warning_once( "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use" " `post_process_instance_segmentation`.", - FutureWarning, ) if len(orig_target_sizes) != len(max_target_sizes): @@ -1405,10 +1403,9 @@ def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_ `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for an image in the batch as predicted by the model. """ - warnings.warn( + logger.warning_once( "`post_process_panoptic is deprecated and will be removed in v5 of Transformers, please use" " `post_process_panoptic_segmentation`.", - FutureWarning, ) if target_sizes is None: target_sizes = processed_sizes @@ -1751,7 +1748,7 @@ def post_process_panoptic_segmentation( """ if label_ids_to_fuse is None: - warnings.warn("`label_ids_to_fuse` unset. No instance will be fused.") + logger.warning_once("`label_ids_to_fuse` unset. 
No instance will be fused.") label_ids_to_fuse = set() class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1] diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py index be6d03a13188fd..1c90245b696c2c 100644 --- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py @@ -633,14 +633,18 @@ def call( ) def serving_output(self, output): - pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None - dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None - dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None - enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None - enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + pkv = tf.tuple(output.past_key_values)[1] if self.config.decoder.use_cache else None + dec_hs = ( + tf.convert_to_tensor(output.decoder_hidden_states) if self.config.decoder.output_hidden_states else None + ) + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.decoder.output_attentions else None + enc_hs = ( + tf.convert_to_tensor(output.encoder_hidden_states) if self.config.encoder.output_hidden_states else None + ) + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.encoder.output_attentions else None cross_attns = ( tf.convert_to_tensor(output.cross_attentions) - if self.config.output_attentions and output.cross_attentions is not None + if self.config.decoder.output_attentions and output.cross_attentions is not None else None ) diff --git a/src/transformers/models/esm/tokenization_esm.py b/src/transformers/models/esm/tokenization_esm.py index 83a1b415b00d07..2eeda3c1b5652c 100644 --- a/src/transformers/models/esm/tokenization_esm.py +++ b/src/transformers/models/esm/tokenization_esm.py @@ -91,11 +91,16 @@ def id_to_token(self, index: int) -> str: def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.eos_token_id] cls = [self.cls_token_id] sep = [self.eos_token_id] # No sep token in ESM vocabulary - return cls + token_ids_0 + sep + token_ids_1 + sep + if token_ids_1 is None: + if self.eos_token_id is None: + return cls + token_ids_0 + else: + return cls + token_ids_0 + sep + elif self.eos_token_id is None: + raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!") + return cls + token_ids_0 + sep + token_ids_1 + sep # Multiple inputs always have an EOS token def get_special_tokens_mask( self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 72858532bfe36c..f6ec24d7773c73 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -538,7 +538,7 @@ def set_input_embeddings(self, new_embeddings): def forward( self, input_ids: Optional[torch.Tensor] = None, - past_key_values: Optional[Union[List[torch.Tensor], int]] = None, + past_key_values: 
Optional[List[torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, @@ -584,7 +584,7 @@ def forward( past_length = 0 past_key_values = tuple([None] * len(self.h)) else: - past_length = past_key_values[0][0].size(-2) + past_length = past_key_values[0].size(-2) if attention_mask is not None and len(attention_mask.shape) == 2 and position_ids is None: # create position_ids on the fly for batch generation diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py old mode 100755 new mode 100644 index c5e6cc564f0922..1051a298d5133b --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -1012,6 +1012,7 @@ def forward( loss = None if labels is not None: + labels = labels.to(pooled_logits.device) if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py index 3dc6c7d6970041..45ae4f80722543 100644 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py @@ -17,12 +17,22 @@ import math import os import shutil +import warnings import torch from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer +try: + from transformers import LlamaTokenizerFast +except ImportError as e: + warnings.warn(e) + warnings.warn( + "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" + ) + LlamaTokenizerFast = None + """ Sample usage: @@ -94,7 +104,7 @@ def permute(w): print(f"Fetching all parameters from the checkpoint at {input_base_path}.") # Load weights if model_size == "7B": - # Not shared + # Not sharded # (The sharded implementation would also work, but this is simpler.) loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") else: @@ -232,9 +242,10 @@ def permute(w): def write_tokenizer(tokenizer_path, input_tokenizer_path): - print(f"Fetching the tokenizer from {input_tokenizer_path}.") # Initialize the tokenizer based on the `spm` model - tokenizer = LlamaTokenizer(input_tokenizer_path) + tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast + print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") + tokenizer = tokenizer_class(input_tokenizer_path) tokenizer.save_pretrained(tokenizer_path) @@ -259,10 +270,8 @@ def main(): input_base_path=os.path.join(args.input_dir, args.model_size), model_size=args.model_size, ) - write_tokenizer( - tokenizer_path=args.output_dir, - input_tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"), - ) + spm_path = os.path.join(args.input_dir, "tokenizer.model") + write_tokenizer(args.output_dir, spm_path) if __name__ == "__main__": diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index d6daa100643659..13f093ae94324f 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -246,9 +246,12 @@ def create_token_type_ids_from_sequences( Returns: `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
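The one-line `labels = labels.to(pooled_logits.device)` change to GPT-J above (repeated for several other models later in this diff) exists so the loss can be computed when a sharded model leaves the final logits on a different device than the labels. A minimal sketch of the pattern, with made-up shapes:

```python
import torch
from torch.nn import CrossEntropyLoss

# hypothetical shapes: 4 examples, 3 classes; with naive pipeline parallelism the
# logits may live on a later device while the labels arrive on CPU or the first GPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
logits = torch.randn(4, 3, device=device)
labels = torch.tensor([0, 2, 1, 0])  # typically created on a different device

loss_fct = CrossEntropyLoss()
labels = labels.to(logits.device)  # the one-line fix applied per model in this diff
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
print(loss.item())
```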
""" - sep = [self.sep_token_id] - cls = [self.cls_token_id] + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) + + if token_ids_1 is not None: + output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) + + return output diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py index 453f8d45f3dca5..07414c1b822f8d 100644 --- a/src/transformers/models/m2m_100/configuration_m2m_100.py +++ b/src/transformers/models/m2m_100/configuration_m2m_100.py @@ -88,12 +88,12 @@ class M2M100Config(PretrainedConfig): Example: ```python - >>> from transformers import M2M100Model, M2M100Config + >>> from transformers import M2M100Config, M2M100Model >>> # Initializing a M2M100 facebook/m2m100_418M style configuration >>> configuration = M2M100Config() - >>> # Initializing a model from the facebook/m2m100_418M style configuration + >>> # Initializing a model (with random weights) from the facebook/m2m100_418M style configuration >>> model = M2M100Model(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/mvp/configuration_mvp.py b/src/transformers/models/mvp/configuration_mvp.py index 546da24954c11f..0880985b7930fb 100644 --- a/src/transformers/models/mvp/configuration_mvp.py +++ b/src/transformers/models/mvp/configuration_mvp.py @@ -93,12 +93,12 @@ class MvpConfig(PretrainedConfig): Example: ```python - >>> from transformers import MvpModel, MvpConfig + >>> from transformers import MvpConfig, MvpModel >>> # Initializing a MVP RUCAIBox/mvp style configuration >>> configuration = MvpConfig() - >>> # Initializing a model from the RUCAIBox/mvp style configuration + >>> # Initializing a model (with random weights) from the RUCAIBox/mvp style configuration >>> model = MvpModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index dead3d8a042413..244cb27058674c 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -358,9 +358,10 @@ def __init__( initializer_range=0.02, is_vqa=False, tie_word_embeddings=False, + is_encoder_decoder=True, **kwargs, ): - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, is_encoder_decoder=is_encoder_decoder, **kwargs) if text_config is None: text_config = {} @@ -373,9 +374,9 @@ def __init__( self.text_config = Pix2StructTextConfig(**text_config) self.vision_config = Pix2StructVisionConfig(**vision_config) - self.text_config.encoder_hidden_size = self.vision_config.hidden_size self.decoder_start_token_id = self.text_config.decoder_start_token_id self.pad_token_id = self.text_config.pad_token_id + self.eos_token_id = self.text_config.eos_token_id self.initializer_factor = initializer_factor self.initializer_range = initializer_range diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index ead913e1df6ea0..dd965fc35e3716 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ 
b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -14,7 +14,6 @@ # limitations under the License. """ Pix2Struct modeling file""" -import copy import math from typing import Dict, List, Optional, Tuple, Union @@ -1580,25 +1579,6 @@ def custom_forward(*inputs): cross_attentions=all_cross_attentions, ) - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - input_ids = input_ids[:, -1:] - - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), - "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), - "is_decoder": True, - } - @add_start_docstrings( "A conditional generation model with a language modeling head. Can be used for sequence generation tasks.", @@ -1618,13 +1598,9 @@ class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel): def __init__(self, config: Pix2StructConfig): super().__init__(config) - encoder_config = copy.deepcopy(config.vision_config) - self.encoder = Pix2StructVisionModel(encoder_config) - decoder_config = copy.deepcopy(config.text_config) - self.decoder_start_token_id = decoder_config.pad_token_id - self.decoder_eos_token_ids = decoder_config.eos_token_id - self.decoder = Pix2StructTextModel(decoder_config) + self.encoder = Pix2StructVisionModel(config.vision_config) + self.decoder = Pix2StructTextModel(config.text_config) self.is_vqa = config.is_vqa @@ -1682,6 +1658,8 @@ def forward( Example: + Inference: + ```python >>> from PIL import Image >>> import requests @@ -1690,15 +1668,40 @@ def forward( >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base") >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base") - >>> labels = "A stop sign is on the street corner." >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(images=image, text=labels, return_tensors="pt", add_special_tokens=True) + >>> inputs = processor(images=image, return_tensors="pt") + + >>> # autoregressive generation + >>> generated_ids = model.generate(**inputs, max_new_tokens=50) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + >>> print(generated_text) + A stop sign is on a street corner. + ``` + + Training: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration + + >>> processor = AutoProcessor.from_pretrained("google/pix2struct-base") + >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base") + + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "A stop sign is on the street corner." 
+ + >>> inputs = processor(images=image, return_tensors="pt") + >>> labels = processor(text=text, return_tensors="pt").input_ids >>> # forward pass - >>> outputs = model(**inputs) - >>> last_hidden_states = outputs.loss + >>> outputs = model(**inputs, labels=labels) + >>> loss = outputs.loss + >>> print(f"{loss.item():.5f}") + 5.23973 ```""" use_cache = use_cache if use_cache is not None else self.config.text_config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1759,84 +1762,36 @@ def forward( encoder_attentions=encoder_outputs.attentions, ) - @torch.no_grad() - def generate( + def prepare_inputs_for_generation( self, - flattened_patches: torch.FloatTensor, - decoder_input_ids: Optional[torch.LongTensor] = None, + input_ids, + flattened_patches: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, - decoder_attention_mask: Optional[torch.LongTensor] = None, - **generate_kwargs, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, ): - r""" - Returns: - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration - - >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base") - >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base") - - >>> conditional_text = "A stop sign" - >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, text=conditional_text, return_tensors="pt", add_special_tokens=True) - - >>> # forward pass - >>> outputs = model.generate(**inputs) - >>> print(processor.batch_decode(outputs, skip_special_tokens=True)) - ['A stop sign the street with a sign that says yes'] - ```""" - batch_size, _, _ = flattened_patches.shape - - vision_outputs = self.encoder(flattened_patches=flattened_patches, attention_mask=attention_mask) - - image_embeds = vision_outputs[0] - - if isinstance(decoder_input_ids, torch.Tensor): - # check if the first element of `input_ids` is equal to `decoder_input_ids`: - if (decoder_input_ids[:, 0] != self.decoder_start_token_id).all().item(): - # add `decoder_input_ids` as first token to `input_ids` - decoder_input_ids = torch.cat( - [ - torch.ones((decoder_input_ids.shape[0], 1), dtype=torch.long, device=decoder_input_ids.device) - * self.decoder_start_token_id, - decoder_input_ids, - ], - dim=-1, - ) - - if decoder_attention_mask is not None: - decoder_attention_mask = torch.cat( - [ - torch.ones( - (decoder_attention_mask.shape[0], 1), - dtype=torch.long, - device=decoder_attention_mask.device, - ), - decoder_attention_mask, - ], - dim=-1, - ) - elif decoder_input_ids is None: - decoder_input_ids = ( - torch.LongTensor([[self.decoder_start_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - ) - if decoder_attention_mask is None: - decoder_attention_mask = torch.ones_like(decoder_input_ids).to(image_embeds.device) + decoder_attention_mask = torch.ones_like(input_ids).to(input_ids.device) - outputs = self.decoder.generate( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - encoder_hidden_states=image_embeds, - encoder_attention_mask=attention_mask, - **generate_kwargs, - ) + # cut decoder_input_ids if past is used + if 
past_key_values is not None: + input_ids = input_ids[:, -1:] - return outputs + return { + "flattened_patches": flattened_patches, + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + } diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index bcf1c4b7bc831f..cba7ab96d84e87 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -1700,6 +1700,8 @@ def forward( decoder_router_probs = nn.Softmax(dim=-1)(decoder_router_logits) decoder_aux_loss = load_balancing_loss_func(decoder_router_probs, decoder_expert_indexes) + # move labels to correct device to enable PP + labels = labels.to(lm_logits.device) loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) if output_router_logits and labels is not None: diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 6704fe42b197aa..36c5d38710ef10 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -1009,6 +1009,8 @@ def forward( masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token + # move labels to correct device to enable PP + labels = labels.to(mlm_logits.device) masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: @@ -1155,6 +1157,8 @@ def forward( loss = None if labels is not None: + # move labels to correct device to enable PP + labels = labels.to(logits.device) loss = nn.functional.binary_cross_entropy_with_logits(logits, labels) * labels.shape[1] # see https://github.com/jnhwkim/ban-vqa/blob/master/train.py#L19 @@ -1395,6 +1399,8 @@ def forward( loss = None if labels is not None: loss_fct = CrossEntropyLoss() + # move labels to correct device to enable PP + labels = labels.to(logits.device) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: @@ -1481,6 +1487,8 @@ def forward( loss = None if labels is not None: loss_fct = CrossEntropyLoss() + # move labels to correct device to enable PP + labels = labels.to(logits.device) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py index 5af7c195ff0e4c..439c5d668a93f2 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py @@ -662,14 +662,18 @@ def call( ) def serving_output(self, output): - pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None - dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None - dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None - enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states 
else None - enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + pkv = tf.tuple(output.past_key_values)[1] if self.config.decoder.use_cache else None + dec_hs = ( + tf.convert_to_tensor(output.decoder_hidden_states) if self.config.decoder.output_hidden_states else None + ) + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.decoder.output_attentions else None + enc_hs = ( + tf.convert_to_tensor(output.encoder_hidden_states) if self.config.encoder.output_hidden_states else None + ) + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.encoder.output_attentions else None cross_attns = ( tf.convert_to_tensor(output.cross_attentions) - if self.config.output_attentions and output.cross_attentions is not None + if self.config.decoder.output_attentions and output.cross_attentions is not None else None ) diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 68d19d14970ae8..ed845febac82d1 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1432,6 +1432,8 @@ def forward( loss = None if labels is not None: loss_fct = CrossEntropyLoss() + # move labels to correct device to enable PP + labels = labels.to(lm_logits.device) loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.reshape(-1)) if not return_dict: @@ -1760,6 +1762,8 @@ def forward( if labels is not None: loss_fct = CrossEntropyLoss() + # move labels to correct device to enable PP + labels = labels.to(logits.device) loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) if not return_dict: diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 150051fba66199..1aa37fec4219d9 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -15,7 +15,6 @@ """Image processor class for YOLOS.""" import pathlib -import warnings from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union import numpy as np @@ -707,10 +706,9 @@ def __init__( do_pad = kwargs.pop("pad_and_return_pixel_mask") if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` parameter is deprecated and will be removed in v4.26. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) max_size = kwargs.pop("max_size") else: @@ -734,10 +732,9 @@ def __init__( @property # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.max_size def max_size(self): - warnings.warn( + logger.warning( "The `max_size` parameter is deprecated and will be removed in v4.27. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) return self.size["longest_edge"] @@ -784,7 +781,7 @@ def prepare_annotation( # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): - warnings.warn( + logger.warning_once( "The `prepare` method is deprecated and will be removed in a future version. " "Please use `prepare_annotation` instead. 
Note: the `prepare_annotation` method " "does not return the image anymore.", @@ -794,17 +791,23 @@ def prepare(self, image, target, return_segmentation_masks=False, masks_path=Non # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, *args, **kwargs): - warnings.warn("The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. " + ) return convert_coco_poly_to_mask(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection with DETR->Yolos def prepare_coco_detection(self, *args, **kwargs): - warnings.warn("The `prepare_coco_detection` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `prepare_coco_detection` method is deprecated and will be removed in a future version. " + ) return prepare_coco_detection_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic def prepare_coco_panoptic(self, *args, **kwargs): - warnings.warn("The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. ") + logger.warning_once( + "The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. " + ) return prepare_coco_panoptic_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize @@ -821,10 +824,9 @@ def resize( int, smaller edge of the image will be matched to this number. """ if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` parameter is deprecated and will be removed in v4.26. " "Please specify in `size['longest_edge'] instead`.", - FutureWarning, ) max_size = kwargs.pop("max_size") else: @@ -1007,19 +1009,17 @@ def preprocess( The channel dimension format of the image. If not provided, it will be the same as the input image. """ if "pad_and_return_pixel_mask" in kwargs: - warnings.warn( + logger.warning_once( "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " "use `do_pad` instead.", - FutureWarning, ) do_pad = kwargs.pop("pad_and_return_pixel_mask") max_size = None if "max_size" in kwargs: - warnings.warn( + logger.warning_once( "The `max_size` argument is deprecated and will be removed in a future version, use" " `size['longest_edge']` instead.", - FutureWarning, ) size = kwargs.pop("max_size") @@ -1164,10 +1164,9 @@ def post_process(self, outputs, target_sizes): `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model. 
""" - warnings.warn( + logger.warning_once( "`post_process` is deprecated and will be removed in v5 of Transformers, please use" " `post_process_object_detection`", - FutureWarning, ) out_logits, out_bbox = outputs.logits, outputs.pred_boxes diff --git a/src/transformers/pipelines/document_question_answering.py b/src/transformers/pipelines/document_question_answering.py index 78f49a5e2dadb3..936d728b598393 100644 --- a/src/transformers/pipelines/document_question_answering.py +++ b/src/transformers/pipelines/document_question_answering.py @@ -131,6 +131,11 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + if self.tokenizer is not None and not self.tokenizer.__class__.__name__.endswith("Fast"): + raise ValueError( + "`DocumentQuestionAnsweringPipeline` requires a fast tokenizer, but a slow tokenizer " + f"(`{self.tokenizer.__class__.__name__}`) is provided." + ) if self.model.config.__class__.__name__ == "VisionEncoderDecoderConfig": self.model_type = ModelType.VisionEncoderDecoder diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 8d958a82509a61..3188160dddb624 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -58,6 +58,7 @@ is_flax_available, is_ftfy_available, is_ipex_available, + is_jieba_available, is_jumanpp_available, is_keras_nlp_available, is_librosa_available, @@ -277,6 +278,13 @@ def require_rjieba(test_case): return unittest.skipUnless(is_rjieba_available(), "test requires rjieba")(test_case) +def require_jieba(test_case): + """ + Decorator marking a test that requires jieba. These tests are skipped when jieba isn't installed. + """ + return unittest.skipUnless(is_jieba_available(), "test requires jieba")(test_case) + + def require_tf2onnx(test_case): return unittest.skipUnless(is_tf2onnx_available(), "test requires tf2onnx")(test_case) diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py index 17ad27bbb5299a..2d7b04c0e199e5 100644 --- a/src/transformers/trainer_seq2seq.py +++ b/src/transformers/trainer_seq2seq.py @@ -265,9 +265,14 @@ def prediction_step( gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus ) - # TODO (Joao): the following line is needed to keep a consistent result on SQUAD. Ideally, we should not block - # users from preparing a dataset with `decoder_input_ids`. 
- inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} + # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate + # (otherwise, it would continue generating from the padded `decoder_input_ids`) + if ( + "labels" in inputs + and "decoder_input_ids" in inputs + and inputs["labels"].shape == inputs["decoder_input_ids"].shape + ): + inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} generated_tokens = self.model.generate(**inputs, **gen_kwargs) # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 28387885de166d..30c2461ffa2111 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -54,8 +54,13 @@ logging, requires_backends, ) +from .utils.import_utils import is_optimum_neuron_available +logger = logging.get_logger(__name__) +log_levels = logging.get_log_levels_dict().copy() +trainer_log_levels = dict(**log_levels, passive=-1) + if is_torch_available(): import torch import torch.distributed as dist @@ -67,12 +72,23 @@ # torchrun support # https://github.com/pytorch/xla/pull/3609 if os.environ.get("TORCHELASTIC_RUN_ID"): - import torch_xla.distributed.xla_backend as xbn + if is_optimum_neuron_available(): + logger.info( + "Make sure that you are performing the training with the TrainiumTrainer from optimum[neuron], this " + "will fail otherwise." + ) + else: + logger.warning( + "Please use the TrainiumTrainer from optimum[neuron] instead of the Transformers library to perform " + "training on AWS Trainium instances. More information here: " + "https://github.com/huggingface/optimum-neuron" + ) + import torch_xla.distributed.xla_backend as xbn - if not isinstance(torch.distributed.group.WORLD, xbn.ProcessGroupXla): - torch.distributed.init_process_group(backend="xla") if not isinstance(torch.distributed.group.WORLD, xbn.ProcessGroupXla): - raise AssertionError("Failed to initialize torch.distributed process group using XLA backend.") + torch.distributed.init_process_group(backend="xla") + if not isinstance(torch.distributed.group.WORLD, xbn.ProcessGroupXla): + raise AssertionError("Failed to initialize torch.distributed process group using XLA backend.") if is_sagemaker_mp_enabled(): @@ -81,11 +97,6 @@ smp.init() -logger = logging.get_logger(__name__) -log_levels = logging.get_log_levels_dict().copy() -trainer_log_levels = dict(**log_levels, passive=-1) - - def default_logdir() -> str: """ Same default as PyTorch @@ -564,9 +575,9 @@ class TrainingArguments: [`torch.compile`](https://pytorch.org/get-started/pytorch-2.0/). This will use the best defaults for the [`torch.compile` - API](https://pytorch.org/docs/2.0/generated/torch.compile.html?highlight=torch+compile#torch.compile). You - can customize the defaults with the argument `torch_compile_backend` and `torch_compile_mode` but we don't - guarantee any of them will work as the support is progressively rolled in in PyTorch. + API](https://pytorch.org/docs/stable/generated/torch.compile.html?highlight=torch+compile#torch.compile). + You can customize the defaults with the argument `torch_compile_backend` and `torch_compile_mode` but we + don't guarantee any of them will work as the support is progressively rolled in in PyTorch. This flag and the whole compile API is experimental and subject to change in future releases. 
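Since the passage above describes the `torch_compile` flag, a short sketch of switching it on through `TrainingArguments` may help; the output directory is a placeholder and the commented-out backend/mode overrides are optional:

```python
from transformers import TrainingArguments

# minimal sketch: enable torch.compile through the Trainer arguments (PyTorch 2.0+)
args = TrainingArguments(
    output_dir="out",   # placeholder path
    torch_compile=True, # uses the default torch.compile settings
    # torch_compile_backend="inductor",  # optional override, support may vary
    # torch_compile_mode="default",      # optional override, support may vary
)
print(args.torch_compile)
```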
torch_compile_backend (`str`, *optional*): diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 132c707f421b2c..1f04ca73bfc13a 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -110,6 +110,7 @@ is_ftfy_available, is_in_notebook, is_ipex_available, + is_jieba_available, is_jumanpp_available, is_kenlm_available, is_keras_nlp_available, diff --git a/src/transformers/utils/bitsandbytes.py b/src/transformers/utils/bitsandbytes.py index 8dd7dd62d3260a..3768506f41138e 100644 --- a/src/transformers/utils/bitsandbytes.py +++ b/src/transformers/utils/bitsandbytes.py @@ -1,6 +1,8 @@ from copy import deepcopy -from .import_utils import is_accelerate_available, is_bitsandbytes_available +from packaging import version + +from .import_utils import importlib_metadata, is_accelerate_available, is_bitsandbytes_available if is_bitsandbytes_available(): @@ -13,7 +15,7 @@ from accelerate.utils import find_tied_parameters -def set_module_8bit_tensor_to_device(module, tensor_name, device, value=None): +def set_module_8bit_tensor_to_device(module, tensor_name, device, value=None, fp16_statistics=None): """ A helper function to set a given tensor (parameter of buffer) of a module on a specific device (note that doing `param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function). The @@ -29,6 +31,8 @@ class `Int8Params` from `bitsandbytes`. The device on which to set the tensor. value (`torch.Tensor`, *optional*): The value of the tensor (useful when going from the meta device to any other device). + fp16_statistics (`torch.HalfTensor`, *optional*): + The list of fp16 statistics to set on the module, used for serialization. """ # Recurse if needed if "." in tensor_name: @@ -61,14 +65,21 @@ class `Int8Params` from `bitsandbytes`. elif isinstance(value, torch.Tensor): new_value = value.to("cpu") if value.dtype == torch.int8: - raise ValueError( - "You cannot load weights that are saved in int8 using `load_in_8bit=True`, make sure you are", - " using `load_in_8bit=True` on float32/float16/bfloat16 weights.", + is_8bit_serializable = version.parse(importlib_metadata.version("bitsandbytes")) > version.parse( + "0.37.2" ) + if not is_8bit_serializable: + raise ValueError( + "Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. " + "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." 
+ ) else: new_value = torch.tensor(value, device="cpu") new_value = bnb.nn.Int8Params(new_value, requires_grad=False, has_fp16_weights=has_fp16_weights).to(device) module._parameters[tensor_name] = new_value + + if fp16_statistics is not None: + setattr(module.weight, "SCB", fp16_statistics.to(device)) else: if value is None: new_value = old_value.to(device) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 9524cf58621a1b..1cc23298a2cc12 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1841,6 +1841,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class CpmAntForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CpmAntModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class CpmAntPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 465dd56d371197..f250a670a9f4b8 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -274,6 +274,13 @@ except importlib_metadata.PackageNotFoundError: _decord_availale = False +_jieba_available = importlib.util.find_spec("jieba") is not None +try: + _jieba_version = importlib_metadata.version("jieba") + logger.debug(f"Successfully imported jieba version {_jieba_version}") +except importlib_metadata.PackageNotFoundError: + _jieba_available = False + # This is the version of torch required to run torch.fx features and torch.onnx with dictionary inputs. TORCH_FX_REQUIRED_VERSION = version.parse("1.10") @@ -576,6 +583,10 @@ def is_optimum_available(): return importlib.util.find_spec("optimum") is not None +def is_optimum_neuron_available(): + return importlib.util.find_spec("optimum.neuron") is not None + + def is_safetensors_available(): if is_torch_available(): if version.parse(_torch_version) >= version.parse("1.10"): @@ -740,6 +751,10 @@ def is_cython_available(): return importlib.util.find_spec("pyximport") is not None +def is_jieba_available(): + return _jieba_available + + # docstyle-ignore DATASETS_IMPORT_ERROR = """ {0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with: @@ -997,6 +1012,11 @@ def is_cython_available(): Cython`. Please note that you may need to restart your runtime after installation. """ +JIEBA_IMPORT_ERROR = """ +{0} requires the jieba library but it was not found in your environment. You can install it with pip: `pip install +jieba`. Please note that you may need to restart your runtime after installation. 
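The new `is_jieba_available` helper and `JIEBA_IMPORT_ERROR` string follow the library's usual soft-dependency pattern. A small sketch of a consumer, where the `NeedsJieba` class is hypothetical but the helpers are the ones added here:

```python
from transformers.utils import is_jieba_available, requires_backends

if is_jieba_available():
    import jieba  # optional dependency, only imported when present


class NeedsJieba:
    # hypothetical consumer of the new backend check
    def __init__(self):
        # raises with the JIEBA_IMPORT_ERROR message when jieba is missing
        requires_backends(self, ["jieba"])

    def segment(self, text: str):
        return list(jieba.cut(text, cut_all=False))


print(NeedsJieba().segment("今天天气真好"))  # requires `pip install jieba`
```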
+""" + BACKENDS_MAPPING = OrderedDict( [ ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), @@ -1029,6 +1049,7 @@ def is_cython_available(): ("oneccl_bind_pt", (is_ccl_available, CCL_IMPORT_ERROR)), ("decord", (is_decord_available, DECORD_IMPORT_ERROR)), ("cython", (is_cython_available, CYTHON_IMPORT_ERROR)), + ("jieba", (is_jieba_available, JIEBA_IMPORT_ERROR)), ] ) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 1c49eadb6d21ab..f123faaab32f59 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -14,7 +14,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy +import json +import os from dataclasses import dataclass +from typing import Any, Dict, Union + +from ..utils import logging + + +logger = logging.get_logger(__name__) @dataclass @@ -49,6 +58,8 @@ class BitsAndBytesConfig: your model in different parts and run some parts in int8 on GPU and some parts in fp32 on CPU, you can use this flag. This is useful for offloading large models such as `google/flan-t5-xxl`. Note that the int8 operations will not be run on CPU. + kwargs (`Dict[str, Any]`, *optional*): + Additional parameters from which to initialize the configuration object. """ def __init__( @@ -57,6 +68,7 @@ def __init__( llm_int8_threshold=6.0, llm_int8_skip_modules=None, llm_int8_enable_fp32_cpu_offload=False, + **kwargs, ): self.load_in_8bit = load_in_8bit self.llm_int8_threshold = llm_int8_threshold @@ -81,17 +93,19 @@ def post_init(self): @classmethod def from_dict(cls, config_dict, return_unused_kwargs, **kwargs): """ - Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters. + Instantiates a [`BitsAndBytesConfig`] from a Python dictionary of parameters. Args: config_dict (`Dict[str, Any]`): - Dictionary that will be used to instantiate the configuration object. Such a dictionary can be - retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method. + Dictionary that will be used to instantiate the configuration object. + return_unused_kwargs (`bool`): + Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in + `PreTrainedModel`. kwargs (`Dict[str, Any]`): Additional parameters from which to initialize the configuration object. Returns: - [`PretrainedConfig`]: The configuration object instantiated from those parameters. + [`BitsAndBytesConfig`]: The configuration object instantiated from those parameters. """ config = cls(**config_dict) @@ -107,3 +121,28 @@ def from_dict(cls, config_dict, return_unused_kwargs, **kwargs): return config, kwargs else: return config + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + use_diff (`bool`, *optional*, defaults to `True`): + If set to `True`, only the difference between the config instance and the default + `BitsAndBytesConfig()` is serialized to JSON file. 
+ """ + with open(json_file_path, "w", encoding="utf-8") as writer: + config_dict = self.to_dict() + json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + writer.write(json_string) + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + output = copy.deepcopy(self.__dict__) + return output diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index 8953adaa247f03..7fd2fc9389ab25 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -366,7 +366,7 @@ def run_trainer( n_gpus_to_use = get_gpu_count() master_port = get_torch_dist_unique_port() distributed_args = f""" - -m torch.distributed.launch + -m torch.distributed.run --nproc_per_node={n_gpus_to_use} --master_port={master_port} {self.examples_dir_str}/pytorch/translation/run_translation.py diff --git a/tests/generation/test_framework_agnostic.py b/tests/generation/test_framework_agnostic.py index 72f0b5dc141450..61845aa9bc1ae1 100644 --- a/tests/generation/test_framework_agnostic.py +++ b/tests/generation/test_framework_agnostic.py @@ -94,8 +94,8 @@ def test_max_new_tokens_encoder_decoder(self): # Decoder only call outputs = bart_model.generate(decoder_input_ids=input_ids, max_new_tokens=max_new_tokens) - # 29 + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 32]) + # 1 BOS + 29 (input length) + 3 new tokens + self.assertEqual(list(outputs.shape), [1, 33]) # Encoder decoder call > 20 outputs = bart_model.generate(max_new_tokens=max_new_tokens + 20) @@ -658,3 +658,31 @@ def test_eos_token_id_int_and_list_beam_search(self): [token == model.config.pad_token_id for token in generated_tokens[0][expectation:]] ) self.assertTrue(unpadded_correct_condition or padded_correct_condition) + + def test_generate_vision2text_conditioning(self): + model_cls = self.framework_dependent_parameters["AutoModelForVision2Seq"] + floats_tensor = self.framework_dependent_parameters["floats_tensor"] + create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] + is_pt = not model_cls.__name__.startswith("TF") + + pixel_values = floats_tensor((2, 3, 30, 30)) + conditioning_input = create_tensor_fn([[10], [10]]) # this should be the 2nd output token, after the BOS token + model = model_cls.from_pretrained("hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2") + if is_pt: + pixel_values = pixel_values.to(torch_device) + model = model.to(torch_device) + conditioning_input = conditioning_input.to(torch_device) + + # we can condition on decoder_input_ids (expected decoder input) and input_ids (which we pipe internally as + # decoder_input_ids, if the encoder is not a model with text input) + output_sequences_decoder_input_ids = model.generate( + pixel_values, max_length=5, decoder_input_ids=conditioning_input + ) + output_sequences_input_ids = model.generate(pixel_values, max_length=5, input_ids=conditioning_input) + if is_pt: + output_sequences_decoder_input_ids = output_sequences_decoder_input_ids.cpu().numpy() + output_sequences_input_ids = output_sequences_input_ids.cpu().numpy() + conditioning_input = conditioning_input.cpu().numpy() + + self.assertTrue(np.array_equal(output_sequences_decoder_input_ids, output_sequences_input_ids)) + self.assertTrue(np.array_equal(output_sequences_decoder_input_ids[:, 1:2], conditioning_input)) diff --git a/tests/generation/test_utils.py 
b/tests/generation/test_utils.py index c0278f6ae46761..dffaba4fb68f43 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1892,8 +1892,10 @@ def test_max_length_backward_compat_greedy(self): max_length = 20 input_ids = input_ids.expand(2, -1) model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) - input_ids = bart_model._prepare_decoder_input_ids_for_generation( - input_ids.shape[0], + input_ids, model_kwargs = bart_model._prepare_decoder_input_ids_for_generation( + batch_size=input_ids.shape[0], + model_input_name=bart_model.main_input_name, + model_kwargs=model_kwargs, decoder_start_token_id=bart_model.config.decoder_start_token_id, bos_token_id=bart_model.config.bos_token_id, ) @@ -1919,8 +1921,10 @@ def test_max_length_backward_compat_sample(self): max_length = 20 input_ids = input_ids.expand(2, -1) model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) - input_ids = bart_model._prepare_decoder_input_ids_for_generation( - input_ids.shape[0], + input_ids, model_kwargs = bart_model._prepare_decoder_input_ids_for_generation( + batch_size=input_ids.shape[0], + model_input_name=bart_model.main_input_name, + model_kwargs=model_kwargs, decoder_start_token_id=bart_model.config.decoder_start_token_id, bos_token_id=bart_model.config.bos_token_id, ) @@ -1949,8 +1953,10 @@ def test_max_length_backward_compat_beam_search(self): input_ids = input_ids.expand(2, -1) model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) - input_ids = bart_model._prepare_decoder_input_ids_for_generation( - input_ids.shape[0], + input_ids, model_kwargs = bart_model._prepare_decoder_input_ids_for_generation( + batch_size=input_ids.shape[0], + model_input_name=bart_model.main_input_name, + model_kwargs=model_kwargs, decoder_start_token_id=bart_model.config.decoder_start_token_id, bos_token_id=bart_model.config.bos_token_id, ) @@ -1982,8 +1988,10 @@ def test_max_length_backward_compat_group_beam_search(self): input_ids = input_ids.expand(6, -1) model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) - input_ids = bart_model._prepare_decoder_input_ids_for_generation( - input_ids.shape[0], + input_ids, model_kwargs = bart_model._prepare_decoder_input_ids_for_generation( + batch_size=input_ids.shape[0], + model_input_name=bart_model.main_input_name, + model_kwargs=model_kwargs, decoder_start_token_id=bart_model.config.decoder_start_token_id, bos_token_id=bart_model.config.bos_token_id, ) @@ -2021,8 +2029,10 @@ def test_max_length_warning_if_different(self): # Greedy input_ids = input_ids.expand(6, -1) model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) - input_ids = bart_model._prepare_decoder_input_ids_for_generation( - input_ids.shape[0], + input_ids, model_kwargs = bart_model._prepare_decoder_input_ids_for_generation( + batch_size=input_ids.shape[0], + model_input_name=bart_model.main_input_name, + model_kwargs=model_kwargs, decoder_start_token_id=bart_model.config.decoder_start_token_id, bos_token_id=bart_model.config.bos_token_id, ) diff --git a/tests/mixed_int8/test_mixed_int8.py b/tests/mixed_int8/test_mixed_int8.py index a73f689a8a94fb..1628e08155d67e 100644 --- a/tests/mixed_int8/test_mixed_int8.py +++ b/tests/mixed_int8/test_mixed_int8.py @@ -19,6 +19,7 @@ from packaging import version from transformers import ( + AutoConfig, AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, @@ -150,6 +151,13 @@ def 
test_generate_quality_config(self): self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + def test_warns_save_pretrained(self): + r""" + Test whether trying to save a model after converting it in 8-bit will throw a warning. + """ + with self.assertWarns(UserWarning), tempfile.TemporaryDirectory() as tmpdirname: + self.model_8bit.save_pretrained(tmpdirname) + def test_raise_if_config_and_load_in_8bit(self): r""" Test that loading the model with the config and `load_in_8bit` raises an error @@ -165,13 +173,6 @@ def test_raise_if_config_and_load_in_8bit(self): llm_int8_enable_fp32_cpu_offload=True, ) - def test_warns_save_pretrained(self): - r""" - Test whether trying to save a model after converting it in 8-bit will throw a warning. - """ - with self.assertWarns(UserWarning), tempfile.TemporaryDirectory() as tmpdirname: - self.model_8bit.save_pretrained(tmpdirname) - def test_device_and_dtype_assignment(self): r""" Test whether trying to cast (or assigning a device to) a model after converting it in 8-bit will throw an error. @@ -219,6 +220,77 @@ def test_fp32_int8_conversion(self): model = AutoModelForSeq2SeqLM.from_pretrained("t5-small", load_in_8bit=True, device_map="auto") self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32) + def test_int8_serialization(self): + r""" + Test whether it is possible to serialize a model in 8-bit. + """ + from bitsandbytes.nn import Int8Params + + with tempfile.TemporaryDirectory() as tmpdirname: + self.model_8bit.save_pretrained(tmpdirname) + + # check that the file `quantization_config` is present + config = AutoConfig.from_pretrained(tmpdirname) + self.assertTrue(hasattr(config, "quantization_config")) + + model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, load_in_8bit=True, device_map="auto") + + self.assertTrue(model_from_saved.transformer.h[0].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(hasattr(model_from_saved.transformer.h[0].mlp.dense_4h_to_h.weight, "SCB")) + + # generate + encoded_input = self.tokenizer(self.input_text, return_tensors="pt") + output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + + self.assertEqual( + self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT + ) + + def test_int8_serialization_sharded(self): + r""" + Test whether it is possible to serialize a model in 8-bit - sharded version. 
+ """ + from bitsandbytes.nn import Int8Params + + with tempfile.TemporaryDirectory() as tmpdirname: + self.model_8bit.save_pretrained(tmpdirname, max_shard_size="200MB") + + # check that the file `quantization_config` is present + config = AutoConfig.from_pretrained(tmpdirname) + self.assertTrue(hasattr(config, "quantization_config")) + + model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname) + + self.assertTrue(model_from_saved.transformer.h[0].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(hasattr(model_from_saved.transformer.h[0].mlp.dense_4h_to_h.weight, "SCB")) + + # generate + encoded_input = self.tokenizer(self.input_text, return_tensors="pt") + output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + + self.assertEqual( + self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT + ) + + def test_int8_from_pretrained(self): + r""" + Test whether loading a 8bit model from the Hub works as expected + """ + from bitsandbytes.nn import Int8Params + + model_id = "ybelkada/bloom-1b7-8bit" + + model = AutoModelForCausalLM.from_pretrained(model_id) + + self.assertTrue(model.transformer.h[0].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(hasattr(model.transformer.h[0].mlp.dense_4h_to_h.weight, "SCB")) + + # generate + encoded_input = self.tokenizer(self.input_text, return_tensors="pt") + output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + + self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + @require_bitsandbytes @require_accelerate @@ -289,6 +361,38 @@ def test_inference_with_keep_in_fp32(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) _ = model.generate(**encoded_input) + def test_inference_with_keep_in_fp32_serialized(self): + r""" + Test whether it is possible to mix both `int8` and `fp32` weights when using `keep_in_fp32_modules` correctly on + a serialized model. + `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseReluDense`. We need to test + both cases. 
+ """ + import bitsandbytes as bnb + + from transformers import T5ForConditionalGeneration + + # test with `t5-small` + model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto") + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + + model = T5ForConditionalGeneration.from_pretrained(tmp_dir) + + # there was a bug with decoders - this test checks that it is fixed + self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear8bitLt)) + + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + _ = model.generate(**encoded_input) + + # test with `flan-t5-small` + model = T5ForConditionalGeneration.from_pretrained( + self.dense_act_model_name, load_in_8bit=True, device_map="auto" + ) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + _ = model.generate(**encoded_input) + class MixedInt8ModelClassesTest(BaseMixedInt8Test): def setUp(self): diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index 36837c9556db6a..a4b77e8431bf76 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1230,7 +1230,7 @@ def test_contrastive_search_bart(self): article, add_special_tokens=False, truncation=True, max_length=512, return_tensors="pt" ).input_ids.to(torch_device) - outputs = bart_model.generate(input_ids, penalty_alpha=0.5, top_k=5, max_length=64) + outputs = bart_model.generate(input_ids, penalty_alpha=0.5, top_k=5, max_length=64, num_beams=1) generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True) self.assertListEqual( diff --git a/tests/models/blip/test_modeling_tf_blip.py b/tests/models/blip/test_modeling_tf_blip.py index 31630b17f943c1..b8fd916ec13ed4 100644 --- a/tests/models/blip/test_modeling_tf_blip.py +++ b/tests/models/blip/test_modeling_tf_blip.py @@ -783,7 +783,7 @@ def test_inference_image_captioning(self): # Test output self.assertEqual( predictions[0].numpy().tolist(), - [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102], + [30522, 1037, 3861, 1997, 1037, 2450, 1998, 2014, 3899, 2006, 1996, 3509, 102], ) def test_inference_vqa(self): @@ -810,6 +810,6 @@ def test_inference_itm(self): out_itm = model(**inputs) out = model(**inputs, use_itm_head=False, training=False) - expected_scores = tf.convert_to_tensor([[0.9798, 0.0202]]) + expected_scores = tf.convert_to_tensor([[0.0029, 0.9971]]) self.assertTrue(np.allclose(tf.nn.softmax(out_itm[0]).numpy(), expected_scores, rtol=1e-3, atol=1e-3)) - self.assertTrue(np.allclose(out[0], tf.convert_to_tensor([[0.5053]]), rtol=1e-3, atol=1e-3)) + self.assertTrue(np.allclose(out[0], tf.convert_to_tensor([[0.5162]]), rtol=1e-3, atol=1e-3)) diff --git a/tests/models/cpmant/__init__.py b/tests/models/cpmant/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/cpmant/test_modeling_cpmant.py b/tests/models/cpmant/test_modeling_cpmant.py new file mode 100644 index 00000000000000..61cd0ec17e2d83 --- /dev/null +++ b/tests/models/cpmant/test_modeling_cpmant.py @@ -0,0 +1,233 @@ +# coding=utf-8 +# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch CPMAnt model. """ + +import unittest + +from transformers.testing_utils import is_torch_available, require_torch, tooslow + +from ...generation.test_utils import torch_device +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + CpmAntConfig, + CpmAntForCausalLM, + CpmAntModel, + CpmAntTokenizer, + ) + + +@require_torch +class CpmAntModelTester: + def __init__( + self, + parent, + batch_size=2, + seq_length=8, + is_training=True, + use_token_type_ids=False, + use_input_mask=False, + use_labels=False, + use_mc_token_ids=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=3, + num_attention_heads=4, + intermediate_size=37, + num_buckets=32, + max_distance=128, + prompt_length=8, + prompt_types=8, + segment_types=8, + init_std=1.0, + return_dict=True, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.num_buckets = num_buckets + self.max_distance = max_distance + self.prompt_length = prompt_length + self.prompt_types = prompt_types + self.segment_types = segment_types + self.init_std = init_std + self.return_dict = return_dict + + def prepare_config_and_inputs(self): + input_ids = {} + input_ids["input_ids"] = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).type(torch.int32) + input_ids["use_cache"] = False + + config = self.get_config() + + return (config, input_ids) + + def get_config(self): + return CpmAntConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + dim_ff=self.intermediate_size, + position_bias_num_buckets=self.num_buckets, + position_bias_max_distance=self.max_distance, + prompt_types=self.prompt_types, + prompt_length=self.prompt_length, + segment_types=self.segment_types, + use_cache=True, + init_std=self.init_std, + return_dict=self.return_dict, + ) + + def create_and_check_cpmant_model(self, config, input_ids, *args): + model = CpmAntModel(config=config) + model.to(torch_device) + model.eval() + + hidden_states = model(**input_ids).last_hidden_state + + self.parent.assertEqual(hidden_states.shape, (self.batch_size, self.seq_length, config.hidden_size)) + + def create_and_check_lm_head_model(self, config, input_ids, *args): + model = CpmAntForCausalLM(config) + model.to(torch_device) + input_ids["input_ids"] = input_ids["input_ids"].to(torch_device) + model.eval() + + model_output = model(**input_ids) + self.parent.assertEqual( + model_output.logits.shape, + (self.batch_size, self.seq_length, config.vocab_size + 
config.prompt_types * config.prompt_length), + ) + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + +@require_torch +class CpmAntModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (CpmAntModel, CpmAntForCausalLM) if is_torch_available() else () + + test_pruning = False + test_missing_keys = False + test_mismatched_shapes = False + test_head_masking = False + test_resize_embeddings = False + + def setUp(self): + self.model_tester = CpmAntModelTester(self) + self.config_tester = ConfigTester(self, config_class=CpmAntConfig) + + def test_config(self): + self.config_tester.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + def test_inputs_embeds(self): + unittest.skip("CPMAnt doesn't support inputs_embeds.")(self.test_inputs_embeds) + + def test_retain_grad_hidden_states_attentions(self): + unittest.skip( + "CPMAnt doesn't support retaining gradients on hidden_states or attentions, because prompt management peels output.hidden_states off the graph.\ + The same holds for attentions. We strongly recommend using the loss to tune the model." + )(self.test_retain_grad_hidden_states_attentions) + + def test_cpmant_model(self): + config, inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_cpmant_model(config, inputs) + + def test_cpmant_lm_head_model(self): + config, inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(config, inputs) + + +@require_torch +class CpmAntModelIntegrationTest(unittest.TestCase): + @tooslow + def test_inference_masked_lm(self): + texts = "今天天气真好!" + model_path = "openbmb/cpm-ant-10b" + model = CpmAntModel.from_pretrained(model_path) + tokenizer = CpmAntTokenizer.from_pretrained(model_path) + inputs = tokenizer(texts, return_tensors="pt") + hidden_states = model(**inputs).last_hidden_state + + expected_slice = torch.tensor( + [[[6.1708, 5.9244, 1.0835], [6.5207, 6.2893, -11.3324], [-1.0107, -0.0576, -5.9577]]], + ) + self.assertTrue(torch.allclose(hidden_states[:, :3, :3], expected_slice, atol=1e-2)) + + +@require_torch +class CpmAntForCausalLMIntegrationTest(unittest.TestCase): + @tooslow + def test_inference_causal(self): + texts = "今天天气真好!"
+ model_path = "openbmb/cpm-ant-10b" + model = CpmAntForCausalLM.from_pretrained(model_path) + tokenizer = CpmAntTokenizer.from_pretrained(model_path) + inputs = tokenizer(texts, return_tensors="pt") + hidden_states = model(**inputs).logits + + expected_slice = torch.tensor( + [[[-6.4267, -6.4083, -6.3958], [-5.8802, -5.9447, -5.7811], [-5.3896, -5.4820, -5.4295]]], + ) + self.assertTrue(torch.allclose(hidden_states[:, :3, :3], expected_slice, atol=1e-2)) + + @tooslow + def test_simple_generation(self): + model_path = "openbmb/cpm-ant-10b" + model = CpmAntForCausalLM.from_pretrained(model_path) + tokenizer = CpmAntTokenizer.from_pretrained(model_path) + texts = "今天天气不错," + expected_output = "今天天气不错,阳光明媚,我和妈妈一起去超市买东西。\n在超市里,我看到了一个很好玩的玩具,它的名字叫“机器人”。它有一个圆圆的脑袋,两只圆圆的眼睛,还有一个圆圆的" + model_inputs = tokenizer(texts, return_tensors="pt") + token_ids = model.generate(**model_inputs) + output_texts = tokenizer.batch_decode(token_ids) + self.assertEqual(expected_output, output_texts) + + @tooslow + def test_batch_generation(self): + model_path = "openbmb/cpm-ant-10b" + model = CpmAntForCausalLM.from_pretrained(model_path) + tokenizer = CpmAntTokenizer.from_pretrained(model_path) + texts = ["今天天气不错,", "新年快乐,万事如意!"] + expected_output = [ + "今天天气不错,阳光明媚,我和妈妈一起去超市买东西。\n在超市里,我看到了一个很好玩的玩具,它的名字叫“机器人”。它有一个圆圆的脑袋,两只圆圆的眼睛,还有一个圆圆的", + "新年快乐,万事如意!在这辞旧迎新的美好时刻,我谨代表《农村新技术》杂志社全体同仁,向一直以来关心、支持《农村新技术》杂志发展的各级领导、各界朋友和广大读者致以最诚挚的", + ] + model_inputs = tokenizer(texts, return_tensors="pt", padding=True) + token_ids = model.generate(**model_inputs) + output_texts = tokenizer.batch_decode(token_ids) + self.assertEqual(expected_output, output_texts) diff --git a/tests/models/cpmant/test_tokenization_cpmant.py b/tests/models/cpmant/test_tokenization_cpmant.py new file mode 100644 index 00000000000000..f5d0ef32450bcc --- /dev/null +++ b/tests/models/cpmant/test_tokenization_cpmant.py @@ -0,0 +1,69 @@ +# coding=utf-8 +# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from transformers.models.cpmant.tokenization_cpmant import VOCAB_FILES_NAMES, CpmAntTokenizer +from transformers.testing_utils import require_jieba, tooslow + +from ...test_tokenization_common import TokenizerTesterMixin + + +@require_jieba +class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = CpmAntTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "", + "", + "", + "", + "", + "", + "", + "", + "我", + "是", + "C", + "P", + "M", + "A", + "n", + "t", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + @tooslow + def test_pre_tokenization(self): + tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b") + texts = "今天天气真好!" 
+ jieba_tokens = ["今天", "天气", "真", "好", "!"] + tokens = tokenizer.tokenize(texts) + self.assertListEqual(tokens, jieba_tokens) + normalized_text = "今天天气真好!" + input_tokens = [tokenizer.bos_token] + tokens + + input_jieba_tokens = [6, 9802, 14962, 2082, 831, 244] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_jieba_tokens) + + reconstructed_text = tokenizer.decode(input_jieba_tokens) + self.assertEqual(reconstructed_text, normalized_text) diff --git a/tests/models/deta/test_modeling_deta.py b/tests/models/deta/test_modeling_deta.py index 87cbd950c8534a..b1d54a87def479 100644 --- a/tests/models/deta/test_modeling_deta.py +++ b/tests/models/deta/test_modeling_deta.py @@ -192,6 +192,10 @@ def is_pipeline_test_to_skip( return False + @unittest.skip("Skip for now. PR #22437 causes some loading issue. See (not merged) #22656 for some discussions.") + def test_can_use_safetensors(self): + super().test_can_use_safetensors() + # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index d2aad061c38743..0535fbf4e1f4c8 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -246,20 +246,6 @@ class LayoutLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase ) fx_compatible = True - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if ( - pipeline_test_casse_name == "DocumentQuestionAnsweringPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # This pipeline uses `sequence_ids()` which is only available for fast tokenizers. - return True - - return False - def setUp(self): self.model_tester = LayoutLMModelTester(self) self.config_tester = ConfigTester(self, config_class=LayoutLMConfig, hidden_size=37) diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 6c82a34a626bff..2b17eadff57c13 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -295,15 +295,10 @@ def is_pipeline_test_to_skip( # `LayoutLMv2Config` was never used in pipeline tests (`test_pt_LayoutLMv2Config_XXX`) due to lack of tiny # config. With new tiny model creation, it is available, but we need to fix the failed tests. return True - elif ( - pipeline_test_casse_name == "DocumentQuestionAnsweringPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # This pipeline uses `sequence_ids()` which is only available for fast tokenizers. 
- return True - return False + return super().is_pipeline_test_to_skip( + pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + ) def setUp(self): self.model_tester = LayoutLMv2ModelTester(self) diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index f56f8f6d3ecf51..42ee3c2b4c6a39 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -443,24 +443,22 @@ def test_forward_signature(self): # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] - if model.config.is_encoder_decoder: - expected_arg_names = [ - "input_ids", - "attention_mask", - "decoder_input_ids", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - else: - expected_arg_names = ( - ["input_ids"] if model_class != Pix2StructForConditionalGeneration else ["flattened_patches"] - ) - self.assertListEqual(arg_names[:1], expected_arg_names) + expected_arg_names = [ + "flattened_patches", + "attention_mask", + "decoder_input_ids", + "decoder_attention_mask", + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", + "encoder_outputs", + "past_key_values", + "labels", + "decoder_inputs_embeds", + "use_cache", + ] + + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) def test_training(self): if not self.model_tester.is_training: @@ -765,7 +763,7 @@ def test_batched_inference_image_captioning_conditioned(self): ) def test_vqa_model(self): - model_id = "ybelkada/pix2struct-ai2d-base" + model_id = "google/pix2struct-ai2d-base" image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg" image = Image.open(requests.get(image_url, stream=True).raw) @@ -784,7 +782,7 @@ def test_vqa_model(self): self.assertEqual(processor.decode(predictions[0], skip_special_tokens=True), "ash cloud") def test_vqa_model_batched(self): - model_id = "ybelkada/pix2struct-ai2d-base" + model_id = "google/pix2struct-ai2d-base" image_urls = [ "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index 82a23a94b40545..a73121966c3746 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -428,9 +428,19 @@ def test_pipeline_zero_shot_image_classification(self): def test_pipeline_zero_shot_object_detection(self): self.run_task_tests(task="zero-shot-object-detection") + # This contains the test cases to be skipped without model architecture being involved. def is_pipeline_test_to_skip( self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name ): + # No fix is required for this case. + if ( + pipeline_test_casse_name == "DocumentQuestionAnsweringPipelineTests" + and tokenizer_name is not None + and not tokenizer_name.endswith("Fast") + ): + # `DocumentQuestionAnsweringPipelineTests` requires a fast tokenizer. 
+ return True + return False diff --git a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py index 5fa6edb1c88f1b..97bca4f9d367fc 100644 --- a/tests/trainer/test_trainer_distributed.py +++ b/tests/trainer/test_trainer_distributed.py @@ -67,7 +67,7 @@ class TestTrainerDistributedNeuronCore(TestCasePlus): @require_torch_neuroncore def test_trainer(self): distributed_args = f""" - -m torch.distributed.launch + -m torch.distributed.run --nproc_per_node=2 --master_port={get_torch_dist_unique_port()} {self.test_file_dir}/test_trainer_distributed.py @@ -83,7 +83,7 @@ class TestTrainerDistributed(TestCasePlus): @require_torch_multi_gpu def test_trainer(self): distributed_args = f""" - -m torch.distributed.launch + -m torch.distributed.run --nproc_per_node={torch.cuda.device_count()} --master_port={get_torch_dist_unique_port()} {self.test_file_dir}/test_trainer_distributed.py @@ -98,7 +98,7 @@ def test_trainer(self): if __name__ == "__main__": # The script below is meant to be run under torch.distributed, on a machine with multiple GPUs: # - # PYTHONPATH="src" python -m torch.distributed.launch --nproc_per_node 2 --output_dir output_dir ./tests/test_trainer_distributed.py + # PYTHONPATH="src" python -m torch.distributed.run --nproc_per_node 2 --output_dir output_dir ./tests/test_trainer_distributed.py parser = HfArgumentParser((TrainingArguments,)) training_args = parser.parse_args_into_dataclasses()[0] diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 56a3c1ef5d957e..d3143b2167bf9f 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -131,6 +131,7 @@ src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py src/transformers/models/mobilevit/modeling_mobilevit.py src/transformers/models/mobilevit/modeling_tf_mobilevit.py +src/transformers/models/mvp/configuration_mvp.py src/transformers/models/nat/configuration_nat.py src/transformers/models/nat/modeling_nat.py src/transformers/models/nezha/configuration_nezha.py @@ -284,6 +285,7 @@ src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py src/transformers/models/llama/tokenization_llama.py src/transformers/models/lxmert/tokenization_lxmert.py src/transformers/models/lxmert/tokenization_lxmert_fast.py +src/transformers/models/m2m_100/configuration_m2m_100.py src/transformers/models/markuplm/tokenization_markuplm.py src/transformers/models/markuplm/tokenization_markuplm_fast.py src/transformers/models/mbart/tokenization_mbart.py @@ -304,6 +306,7 @@ src/transformers/models/pegasus/tokenization_pegasus.py src/transformers/models/pegasus/tokenization_pegasus_fast.py src/transformers/models/perceiver/tokenization_perceiver.py src/transformers/models/phobert/tokenization_phobert.py +src/transformers/models/pix2struct/modeling_pix2struct.py src/transformers/models/plbart/tokenization_plbart.py src/transformers/models/prophetnet/tokenization_prophetnet.py src/transformers/models/rag/tokenization_rag.py