From d84bbb2438fad19c38a3adf5beb5968f5bd6c789 Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Mon, 19 Feb 2024 15:50:43 +0900 Subject: [PATCH 1/9] Add training ir script --- examples/training/swallow-tart/args.py | 34 +++ examples/training/swallow-tart/data.py | 258 ++++++++++++++++++++ examples/training/swallow-tart/run_train.py | 123 ++++++++++ 3 files changed, 415 insertions(+) create mode 100644 examples/training/swallow-tart/args.py create mode 100644 examples/training/swallow-tart/data.py create mode 100644 examples/training/swallow-tart/run_train.py diff --git a/examples/training/swallow-tart/args.py b/examples/training/swallow-tart/args.py new file mode 100644 index 000000000..00755363d --- /dev/null +++ b/examples/training/swallow-tart/args.py @@ -0,0 +1,34 @@ +import json +from dataclasses import dataclass, field +from typing import Optional + +from peft import get_peft_config +from transformers import TrainingArguments as STTrainingArguments + +__all__ = ["STModelArguments", "STDataArgumnets", "STTrainingArguments"] + + +@dataclass +class STModelArguments: + model_name: str = "bert-base-uncased" + peft_config_path: Optional[str] = None + use_flash_attention: bool = False + + def __post_init__(self): + if self.peft_config_path is not None: + with open(self.peft_config_path, "r") as f: + peft_config_data = json.load(f) + self.peft_config = get_peft_config(peft_config_data) + + +@dataclass +class STDataArgumnets: + data_dir: str + task_names: list[str] = field(default_factory=list) + max_length: int = 512 + n_dev_sample: int = 100 + query_file_name: str = "queries.jsonl" + corpus_file_name: str = "corpus.jsonl" + qrel_file_name: str = "qrels/train.tsv" + hard_negatives_file_name: str = "hard_negative/hard_negative.jsonl" + num_proc: int = 1 diff --git a/examples/training/swallow-tart/data.py b/examples/training/swallow-tart/data.py new file mode 100644 index 000000000..457e2924a --- /dev/null +++ b/examples/training/swallow-tart/data.py @@ -0,0 +1,258 @@ +import os +import json +import random +from collections import defaultdict +from typing import Callable, Optional, Tuple + +import datasets +import torch +from sentence_transformers.huggingface import SENTENCE_KEYS +from torch.utils.data import Dataset +from tqdm import tqdm +from transformers import PreTrainedTokenizer, BatchEncoding +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class MNRLDataset(Dataset): + # https://github.com/texttron/tevatron/blob/main/examples/repllama/data.py#L162 + def __init__( + self, + dataset: datasets.Dataset, + tokenizer: PreTrainedTokenizer, + max_length: int, + ): + self.train_data = dataset + self.tok = tokenizer + + self.max_length = max_length + self.total_len = len(self.train_data) + + def create_one_example(self, text_encoding: list[int]) -> BatchEncoding: + item = self.tok.prepare_for_model( + text_encoding + [self.tok.eos_token_id], + truncation='only_first', + max_length=self.max_length, + padding=True, + return_tensors='pt', + ) + return item + + def __len__(self): + # Return query size + return self.total_len + + def __getitem__(self, item) -> dict[str, BatchEncoding]: + # https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_bi-encoder_mnrl.py#L215 + group = self.train_data[item] + query_encoding = self.create_one_example(group['query']) + + target_pos_ids = group['positives'].pop(0) + target_pos_encoding = self.create_one_example(target_pos_ids) + group['positives'].append(target_pos_ids) + + 
negative_pos_ids = group['negatives'].pop(0) + negative_pos_encoding = self.create_one_example(negative_pos_ids) + group['negatives'].append(negative_pos_ids) + + label = 0 # 学習には使用しないが、引数に指定されている + + anchor_name, pos_name, neg_name = SENTENCE_KEYS + return { + anchor_name: query_encoding, + pos_name: target_pos_encoding, + neg_name: negative_pos_encoding, + "label": label, + } + + +class TokenizeProcessor: + def __init__( + self, + tokenizer: PreTrainedTokenizer, + max_length: int, + ) -> None: + self.tokenizer = tokenizer + self.max_length = max_length + + def __call__(self, example): + query_tokenized = self.tokenizer.encode( + example["query"], + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos, eos and margin + ) + positive_tokenizeds = [] + for positive in example["positives"]: + positive_tokenizeds.append( + self.tokenizer.encode( + positive, + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos and eos + ) + ) + negative_tokenizeds = [] + for negative in example["negatives"]: + negative_tokenizeds.append( + self.tokenizer.encode( + negative, + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos and eos + ) + ) + return {"query": query_tokenized, "positives": positive_tokenizeds, "negatives": negative_tokenizeds} + + +def ir_collator(batch: list[dict[str, BatchEncoding]]) -> dict[str, torch.Tensor]: + # this function is based on sentence_transformers.SentenceTransformer.smart_batching_collate + texts = [] + for example in batch: + temp_texts = [] + for key in SENTENCE_KEYS: + temp_texts.append(example[key]) + texts.append(temp_texts) + + transposed_texts = list(zip(*texts)) + labels = torch.tensor([example["label"] for example in batch]) + + return transposed_texts, labels + + +def load_queries(queries_path: str) -> dict[str, str]: + queries = {} + with open(queries_path, 'r') as f: + for line in f: + data = json.loads(line) + queries[data['_id']] = data['text'] + return queries + + +def load_corpus(corpus_path: str) -> dict[str, str]: + corpus = {} + with open(corpus_path, 'r') as f: + for line in f: + data = json.loads(line) + corpus[data['_id']] = data['text'] + return corpus + + +def load_qrels(qrels_path: str) -> dict[str, list[int]]: + """Load qrel. + + qrel format: + query_id\tdocument_id\tlabel + """ + qrels = defaultdict(list) + with open(qrels_path, 'r') as f: + for idx, line in enumerate(f): + if idx == 0: + continue + data = line.strip().split('\t') + qid = data[0] + did = data[1] + qrels[qid].append(did) + return qrels + + +def load_hard_negatives(hard_negatives_path: str) -> dict[str, list[int]]: + """Load hard negative. 
+ + hard negative format: + {"query_id": str, "hard_negative": [str, str, ...]} + """ + hard_negative = defaultdict(list) + with open(hard_negatives_path, 'r') as f: + for line in f: + data = json.loads(line) + qid = data['query_id'] + hard_negative[qid].extend(data['hard_negative']) + return hard_negative + + +def load_ir_dataset( + task_names: list[str], + input_data_dir: str, + query_file_name: str, + corpus_file_name: str, + qrel_file_name: str, + hard_negative_file_name: str, +) -> datasets.Dataset: + # load dataset + # {"query": str, "positives": list[str], "negatives": list[str]} + target_datasets: list[datasets.Dataset] = [] + for task_idx, task_name in enumerate(task_names): + target_path = { + "queries": os.path.join(input_data_dir, task_name, query_file_name), + "corpus": os.path.join(input_data_dir, task_name, corpus_file_name), + "qrels": os.path.join(input_data_dir, task_name, qrel_file_name), + "hard_negatives": os.path.join(input_data_dir, task_name, hard_negative_file_name), + } + + queries = load_queries(target_path["queries"]) + corpus = load_corpus(target_path["corpus"]) + qrels = load_qrels(target_path["qrels"]) + hard_negatives = load_hard_negatives(target_path["hard_negatives"]) + + logger.info(f"...Task: {task_name}") + current_dataset = [] + for qid, query in tqdm(queries.items()): + positive_ids = qrels[qid] + positives = [corpus[pos_id] for pos_id in positive_ids] + random.shuffle(positives) + negative_ids = hard_negatives[qid] + negatives = [corpus[neg_id] for neg_id in negative_ids] + random.shuffle(negatives) + current_dataset.append({"query": query, "positives": positives, "negatives": negatives}) + + target_datasets.append(datasets.Dataset.from_list(current_dataset)) + + target_concat_dataset = datasets.concatenate_datasets(target_datasets) + return target_concat_dataset + + +def get_dataset( + task_names: list[str], + input_data_dir: str, + query_file_name: str, + corpus_file_name: str, + qrel_file_name: str, + hard_negatives_file_name: str, + tokenizer: PreTrainedTokenizer, + max_length: int, + n_each_dev_sample: int = 0, + process_func: Optional[Callable] = None, + num_proc: int = 1, +) -> Tuple[Dataset, Dataset]: + # build HF Dataset + logger.info("Build huggingface datasets.") + hf_dataset = load_ir_dataset( + task_names, input_data_dir, query_file_name, corpus_file_name, qrel_file_name, hard_negatives_file_name + ) + + # apply preprocess (mainly tokenization (make word ids)) + logger.info("Apply preprocessing.") + remove_column_names = hf_dataset.column_names.remove("label") + hf_dataset = hf_dataset.map( + process_func, + batched=True, + num_proc=num_proc, + remove_columns=remove_column_names, + desc="Running Tokenizer on dataset" + ) + + # split train/dev dataset + logger.info("Split train/dev dataset.") + n_dev_sample = n_each_dev_sample * len(task_names) + train_dev_dataset = hf_dataset.train_test_split(test_size=n_dev_sample, shuffle=True, stratify_by_column="label") + train_dataset = train_dev_dataset["train"] + dev_dataset = train_dev_dataset["test"] + logger.info(f"Train dataset size: {len(train_dataset)}") + logger.info(f"Dev dataset size: {len(dev_dataset)}") + + # build Torch Dataset and Return ones. 
+ train_torch_dataset = MNRLDataset(train_dataset, tokenizer, max_length) + dev_torch_dataset = MNRLDataset(dev_dataset, tokenizer, max_length) + return train_torch_dataset, dev_torch_dataset diff --git a/examples/training/swallow-tart/run_train.py b/examples/training/swallow-tart/run_train.py new file mode 100644 index 000000000..30f302604 --- /dev/null +++ b/examples/training/swallow-tart/run_train.py @@ -0,0 +1,123 @@ +"""Train embeddings with Sentence-Transformers-HF + +lr: + llm-jp: 2e-5 https://llm-jp.nii.ac.jp/blog/2024/02/09/v1.1-tuning.html#%E3%83%8F%E3%82%A4%E3%83%91%E3%83%BC%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF + repLLaMA: 1e-4 https://llm-jp.nii.ac.jp/blog/2024/02/09/v1.1-tuning.html#%E3%83%8F%E3%82%A4%E3%83%91%E3%83%BC%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF +""" +import os +import sys + +from sentence_transformers import losses +from sentence_transformers.huggingface import ( + MNRLSentenceTransformersTrainer, + MNRLSentenceTransformer, +) +from sentence_transformers.models import Transformer, Pooling, Normalize +from transformers import HfArgumentParser, set_seed +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import logging + +from .args import STDataArgumnets, STModelArguments, STTrainingArguments +from .data import get_dataset, TokenizeProcessor, ir_collator + +logger = logging.get_logger(__name__) + + +def main(): + parser = HfArgumentParser((STDataArgumnets, STModelArguments, STTrainingArguments)) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + data_args, model_args, training_args = parser.parse_json_file(os.path.abspath(sys.argv[1])) + else: + data_args, model_args, training_args = parser.parse_args_into_dataclasses() + + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, + ) + logger.info("Training/evaluation parameters %s", training_args) + logger.info("MODEL parameters %s", model_args) + + set_seed(training_args.seed) + + # define model + logger.info("Build SentenceTransformer") + if model_args.use_flash_attention: + # validate fp16 or bf16 + assert training_args.fp16 or training_args.bf16, "use_flash_attention requires fp16 or bf16" + model_args = {"attn_implementation": "flash_attention_2"} + tf_model = Transformer(model_args.model_name, model_args=model_args, peft_config=model_args.peft_config) + pooler = Pooling(tf_model.get_word_embedding_dimension(), pooing_mode="lasttoken") + normalize = Normalize() + model = MNRLSentenceTransformer(modules=[tf_model, pooler, normalize]) + tokenizer = model.tokenizer + max_length = min(data_args.max_length, tokenizer.model_max_length) + tokenizer.model_max_length = max_length + loss = losses.MultipleNegativesRankingLoss(model=model) + + # define train/eval dataset + logger.info("Load dataset") + logger.info(f"Target task names: {data_args.task_names}") + preprocessor = TokenizeProcessor(tokenizer, data_args.max_length) + train_dataset, eval_dataset = get_dataset( + data_args.task_names, + data_args.data_dir, + data_args.query_file_name, + data_args.corpus_file_name, + data_args.qrel_file_name, + data_args.hard_negatives_file_name, + tokenizer, + data_args.max_length, + data_args.n_dev_sample, + preprocessor, + data_args.num_proc, + ) + + trainer = 
MNRLSentenceTransformersTrainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=ir_collator, + tokenizer=tokenizer, + loss=loss, + text_columns=[] + ) + + # detecting last checkpoint + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + if last_checkpoint is not None: + checkpoint = last_checkpoint + elif training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + + logger.info("Start training") + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + trainer.save_model() + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + +if __name__ == "__main__": + main() From b169bc40cfa8fe212ab64cec8e3929fcea74a034 Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Mon, 19 Feb 2024 16:57:43 +0900 Subject: [PATCH 2/9] Apply format --- examples/training/swallow-tart/data.py | 36 +++++++++---------- .../training/swallow-tart/peft_config.json | 29 +++++++++++++++ examples/training/swallow-tart/run_train.py | 2 +- 3 files changed, 48 insertions(+), 19 deletions(-) create mode 100644 examples/training/swallow-tart/peft_config.json diff --git a/examples/training/swallow-tart/data.py b/examples/training/swallow-tart/data.py index 457e2924a..9b0e631a8 100644 --- a/examples/training/swallow-tart/data.py +++ b/examples/training/swallow-tart/data.py @@ -32,10 +32,10 @@ def __init__( def create_one_example(self, text_encoding: list[int]) -> BatchEncoding: item = self.tok.prepare_for_model( text_encoding + [self.tok.eos_token_id], - truncation='only_first', + truncation="only_first", max_length=self.max_length, padding=True, - return_tensors='pt', + return_tensors="pt", ) return item @@ -46,15 +46,15 @@ def __len__(self): def __getitem__(self, item) -> dict[str, BatchEncoding]: # https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_bi-encoder_mnrl.py#L215 group = self.train_data[item] - query_encoding = self.create_one_example(group['query']) + query_encoding = self.create_one_example(group["query"]) - target_pos_ids = group['positives'].pop(0) + target_pos_ids = group["positives"].pop(0) target_pos_encoding = self.create_one_example(target_pos_ids) - group['positives'].append(target_pos_ids) + group["positives"].append(target_pos_ids) - negative_pos_ids = group['negatives'].pop(0) + negative_pos_ids = group["negatives"].pop(0) negative_pos_encoding = self.create_one_example(negative_pos_ids) - group['negatives'].append(negative_pos_ids) + group["negatives"].append(negative_pos_ids) label = 0 # 学習には使用しないが、引数に指定されている @@ -123,19 +123,19 @@ def ir_collator(batch: list[dict[str, BatchEncoding]]) -> dict[str, torch.Tensor def load_queries(queries_path: str) -> dict[str, str]: queries = {} - with open(queries_path, 'r') as f: + with 
open(queries_path, "r") as f: for line in f: data = json.loads(line) - queries[data['_id']] = data['text'] + queries[data["_id"]] = data["text"] return queries def load_corpus(corpus_path: str) -> dict[str, str]: corpus = {} - with open(corpus_path, 'r') as f: + with open(corpus_path, "r") as f: for line in f: data = json.loads(line) - corpus[data['_id']] = data['text'] + corpus[data["_id"]] = data["text"] return corpus @@ -146,11 +146,11 @@ def load_qrels(qrels_path: str) -> dict[str, list[int]]: query_id\tdocument_id\tlabel """ qrels = defaultdict(list) - with open(qrels_path, 'r') as f: + with open(qrels_path, "r") as f: for idx, line in enumerate(f): if idx == 0: continue - data = line.strip().split('\t') + data = line.strip().split("\t") qid = data[0] did = data[1] qrels[qid].append(did) @@ -164,11 +164,11 @@ def load_hard_negatives(hard_negatives_path: str) -> dict[str, list[int]]: {"query_id": str, "hard_negative": [str, str, ...]} """ hard_negative = defaultdict(list) - with open(hard_negatives_path, 'r') as f: + with open(hard_negatives_path, "r") as f: for line in f: data = json.loads(line) - qid = data['query_id'] - hard_negative[qid].extend(data['hard_negative']) + qid = data["query_id"] + hard_negative[qid].extend(data["hard_negative"]) return hard_negative @@ -182,7 +182,7 @@ def load_ir_dataset( ) -> datasets.Dataset: # load dataset # {"query": str, "positives": list[str], "negatives": list[str]} - target_datasets: list[datasets.Dataset] = [] + target_datasets: list[datasets.Dataset] = [] for task_idx, task_name in enumerate(task_names): target_path = { "queries": os.path.join(input_data_dir, task_name, query_file_name), @@ -240,7 +240,7 @@ def get_dataset( batched=True, num_proc=num_proc, remove_columns=remove_column_names, - desc="Running Tokenizer on dataset" + desc="Running Tokenizer on dataset", ) # split train/dev dataset diff --git a/examples/training/swallow-tart/peft_config.json b/examples/training/swallow-tart/peft_config.json new file mode 100644 index 000000000..016ff8df9 --- /dev/null +++ b/examples/training/swallow-tart/peft_config.json @@ -0,0 +1,29 @@ +# https://huggingface.co/intfloat/e5-mistral-7b-instruct/blob/main/lora/adapter_config.json +# Lora rank and alpha: https://llm-jp.nii.ac.jp/blog/2024/02/09/v1.1-tuning.html#%E3%83%8F%E3%82%A4%E3%83%91%E3%83%BC%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF +{ + "auto_mapping": null, + "base_model_name_or_path": "mistralai/Mistral-7B-v0.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "down_proj", + "up_proj", + "gate_proj" + ], + "task_type": "FEATURE_EXTRACTION", + "use_rslora": true +} \ No newline at end of file diff --git a/examples/training/swallow-tart/run_train.py b/examples/training/swallow-tart/run_train.py index 30f302604..991daed54 100644 --- a/examples/training/swallow-tart/run_train.py +++ b/examples/training/swallow-tart/run_train.py @@ -87,7 +87,7 @@ def main(): data_collator=ir_collator, tokenizer=tokenizer, loss=loss, - text_columns=[] + text_columns=[], ) # detecting last checkpoint From 7a3d146f6da41eeceb9b2730fd928a03802182a0 Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Mon, 19 Feb 2024 21:43:10 +0900 Subject: [PATCH 3/9] Add datasets lib --- setup.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/setup.py b/setup.py index fbf48ae97..9c342f02b 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ "huggingface-hub>=0.15.1", "Pillow", "peft", + "datasets", ], classifiers=[ "Development Status :: 5 - Production/Stable", From ed0e97ffe482692ec9285a5818d991b11ea64db1 Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Tue, 20 Feb 2024 22:36:03 +0900 Subject: [PATCH 4/9] Fix bugs --- examples/training/swallow-tart/args.py | 10 +-- examples/training/swallow-tart/data.py | 75 +++++++++++++++----- examples/training/swallow-tart/run_train.py | 23 ++++-- sentence_transformers/SentenceTransformer.py | 7 ++ sentence_transformers/models/Transformer.py | 16 +++++ 5 files changed, 105 insertions(+), 26 deletions(-) diff --git a/examples/training/swallow-tart/args.py b/examples/training/swallow-tart/args.py index 00755363d..e2cda57c6 100644 --- a/examples/training/swallow-tart/args.py +++ b/examples/training/swallow-tart/args.py @@ -19,6 +19,8 @@ def __post_init__(self): with open(self.peft_config_path, "r") as f: peft_config_data = json.load(f) self.peft_config = get_peft_config(peft_config_data) + else: + self.peft_config = None @dataclass @@ -27,8 +29,8 @@ class STDataArgumnets: task_names: list[str] = field(default_factory=list) max_length: int = 512 n_dev_sample: int = 100 - query_file_name: str = "queries.jsonl" - corpus_file_name: str = "corpus.jsonl" - qrel_file_name: str = "qrels/train.tsv" - hard_negatives_file_name: str = "hard_negative/hard_negative.jsonl" + query_file_name: str = "tuple_beir/queries.jsonl" + corpus_file_name: str = "tuple_beir/corpus.jsonl" + qrel_file_name: str = "tuple_beir/qrels/train.tsv" + hard_negatives_file_name: str = "negatives/hard_negative.jsonl" num_proc: int = 1 diff --git a/examples/training/swallow-tart/data.py b/examples/training/swallow-tart/data.py index 9b0e631a8..5aa20deac 100644 --- a/examples/training/swallow-tart/data.py +++ b/examples/training/swallow-tart/data.py @@ -30,12 +30,12 @@ def __init__( self.total_len = len(self.train_data) def create_one_example(self, text_encoding: list[int]) -> BatchEncoding: + """Add eos token""" item = self.tok.prepare_for_model( text_encoding + [self.tok.eos_token_id], truncation="only_first", - max_length=self.max_length, - padding=True, - return_tensors="pt", + max_length=self.max_length - 2, # for bos and margin + padding=False, ) return item @@ -59,12 +59,13 @@ def __getitem__(self, item) -> dict[str, BatchEncoding]: label = 0 # 学習には使用しないが、引数に指定されている anchor_name, pos_name, neg_name = SENTENCE_KEYS - return { + data = { anchor_name: query_encoding, pos_name: target_pos_encoding, neg_name: negative_pos_encoding, "label": label, } + return data class TokenizeProcessor: @@ -106,19 +107,58 @@ def __call__(self, example): return {"query": query_tokenized, "positives": positive_tokenizeds, "negatives": negative_tokenizeds} -def ir_collator(batch: list[dict[str, BatchEncoding]]) -> dict[str, torch.Tensor]: - # this function is based on sentence_transformers.SentenceTransformer.smart_batching_collate - texts = [] - for example in batch: - temp_texts = [] - for key in SENTENCE_KEYS: - temp_texts.append(example[key]) - texts.append(temp_texts) +class TokenizeBatchProcessor(TokenizeProcessor): + def __call__(self, examples): + query_tokenized = self.tokenizer( + examples["query"], + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos, eos and margin + )["input_ids"] + positive_tokenizeds = [] + for one_batch in examples["positives"]: + 
positive_tokenizeds.append( + self.tokenizer( + one_batch, + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos and eos + )["input_ids"] + ) + negative_tokenizeds = [] + for one_batch in examples["negatives"]: + negative_tokenizeds.append( + self.tokenizer( + one_batch, + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos and eos + )["input_ids"] + ) + return {"query": query_tokenized, "positives": positive_tokenizeds, "negatives": negative_tokenizeds} - transposed_texts = list(zip(*texts)) - labels = torch.tensor([example["label"] for example in batch]) - return transposed_texts, labels +class IRCollator: + def __init__(self, tokenizer: PreTrainedTokenizer, max_length: int): + self.tokenizer = tokenizer + self.max_length = max_length + + def __call__(self, batch: list[dict[str, BatchEncoding]]) -> tuple[list[BatchEncoding], torch.Tensor]: + # this function is based on sentence_transformers.SentenceTransformer.smart_batching_collate + texts = [] + for example in batch: + temp_texts = [] + for key in SENTENCE_KEYS: + temp_texts.append(example[key]) + texts.append(temp_texts) + + transposed_texts = [ + self.tokenizer.pad(sentences, padding="max_length", max_length=self.max_length, return_tensors="pt") + for sentences in zip(*texts) + ] + labels = torch.tensor([example["label"] for example in batch]) + + return transposed_texts, labels def load_queries(queries_path: str) -> dict[str, str]: @@ -202,10 +242,12 @@ def load_ir_dataset( positive_ids = qrels[qid] positives = [corpus[pos_id] for pos_id in positive_ids] random.shuffle(positives) + if qid not in hard_negatives: + continue negative_ids = hard_negatives[qid] negatives = [corpus[neg_id] for neg_id in negative_ids] random.shuffle(negatives) - current_dataset.append({"query": query, "positives": positives, "negatives": negatives}) + current_dataset.append({"query": query, "positives": positives, "negatives": negatives, "label": task_idx}) target_datasets.append(datasets.Dataset.from_list(current_dataset)) @@ -245,6 +287,7 @@ def get_dataset( # split train/dev dataset logger.info("Split train/dev dataset.") + hf_dataset = hf_dataset.class_encode_column("label") n_dev_sample = n_each_dev_sample * len(task_names) train_dev_dataset = hf_dataset.train_test_split(test_size=n_dev_sample, shuffle=True, stratify_by_column="label") train_dataset = train_dev_dataset["train"] diff --git a/examples/training/swallow-tart/run_train.py b/examples/training/swallow-tart/run_train.py index 991daed54..e2cc3d4ea 100644 --- a/examples/training/swallow-tart/run_train.py +++ b/examples/training/swallow-tart/run_train.py @@ -17,8 +17,8 @@ from transformers.trainer_utils import get_last_checkpoint from transformers.utils import logging -from .args import STDataArgumnets, STModelArguments, STTrainingArguments -from .data import get_dataset, TokenizeProcessor, ir_collator +from args import STDataArgumnets, STModelArguments, STTrainingArguments +from data import get_dataset, TokenizeProcessor, TokenizeBatchProcessor, IRCollator logger = logging.get_logger(__name__) @@ -51,20 +51,30 @@ def main(): if model_args.use_flash_attention: # validate fp16 or bf16 assert training_args.fp16 or training_args.bf16, "use_flash_attention requires fp16 or bf16" - model_args = {"attn_implementation": "flash_attention_2"} - tf_model = Transformer(model_args.model_name, model_args=model_args, peft_config=model_args.peft_config) - pooler = Pooling(tf_model.get_word_embedding_dimension(), pooing_mode="lasttoken") 
+ model_kwargs = {"attn_implementation": "flash_attention_2"} + tf_model = Transformer( + model_args.model_name, + model_args=model_kwargs, + peft_config=model_args.peft_config, + is_gradient_checkpointing=training_args.gradient_checkpointing, + ) + pooler = Pooling(tf_model.get_word_embedding_dimension(), pooling_mode="lasttoken") normalize = Normalize() model = MNRLSentenceTransformer(modules=[tf_model, pooler, normalize]) tokenizer = model.tokenizer + # https://github.com/texttron/tevatron/blob/2e5d00ee21d5a7db0bd2ea1463c9150a572106d4/examples/repllama/train.py#L68-L69 + tokenizer.pad_token_id = tokenizer.unk_token_id + tokenizer.pad_token = tokenizer.unk_token max_length = min(data_args.max_length, tokenizer.model_max_length) tokenizer.model_max_length = max_length loss = losses.MultipleNegativesRankingLoss(model=model) + ir_collator = IRCollator(tokenizer, max_length) # define train/eval dataset logger.info("Load dataset") logger.info(f"Target task names: {data_args.task_names}") - preprocessor = TokenizeProcessor(tokenizer, data_args.max_length) + # preprocessor = TokenizeProcessor(tokenizer, data_args.max_length) + preprocessor = TokenizeBatchProcessor(tokenizer, data_args.max_length) train_dataset, eval_dataset = get_dataset( data_args.task_names, data_args.data_dir, @@ -105,6 +115,7 @@ def main(): "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) + checkpoint = None if last_checkpoint is not None: checkpoint = last_checkpoint elif training_args.resume_from_checkpoint is not None: diff --git a/sentence_transformers/SentenceTransformer.py b/sentence_transformers/SentenceTransformer.py index feb5975ac..845039ac4 100644 --- a/sentence_transformers/SentenceTransformer.py +++ b/sentence_transformers/SentenceTransformer.py @@ -1163,3 +1163,10 @@ def _target_device(self) -> torch.device: @_target_device.setter def _target_device(self, device: Optional[Union[int, str, torch.device]] = None) -> None: self.to(device) + + @property + def config(self): + return self._first_module().config + + def gradient_checkpointing_enable(self, *args, **kwargs): + return self._first_module().gradient_checkpointing_enable(*args, **kwargs) diff --git a/sentence_transformers/models/Transformer.py b/sentence_transformers/models/Transformer.py index e61b268d6..b979119cd 100644 --- a/sentence_transformers/models/Transformer.py +++ b/sentence_transformers/models/Transformer.py @@ -1,3 +1,4 @@ +import torch from torch import nn from transformers import AutoModel, AutoTokenizer, AutoConfig, T5Config, MT5Config from peft import PeftConfig, get_peft_model @@ -29,6 +30,7 @@ def __init__( do_lower_case: bool = False, tokenizer_name_or_path: str = None, peft_config: Optional[PeftConfig] = None, + is_gradient_checkpointing: bool = False, ): super(Transformer, self).__init__() self.config_keys = ["max_seq_length", "do_lower_case"] @@ -38,6 +40,13 @@ def __init__( self._load_model(model_name_or_path, config, cache_dir, **model_args) if peft_config is not None: + if is_gradient_checkpointing: + for param in self.auto_model.parameters(): + param.requires_grad = True + if param.ndim == 1: + param.data = param.data.to(torch.float32) + self.auto_model.gradient_checkpointing_enable() + self.auto_model.enable_input_require_grads() self.auto_model = get_peft_model(self.auto_model, peft_config) self.tokenizer = AutoTokenizer.from_pretrained( @@ -190,3 +199,10 @@ def load(input_path: str): if "model_args" in config: config["model_args"].pop("trust_remote_code") return 
Transformer(model_name_or_path=input_path, **config) + + @property + def config(self): + return self.auto_model.config + + def gradient_checkpointing_enable(self, *args, **kwargs): + return self.auto_model.gradient_checkpointing_enable(*args, **kwargs) From 8e82ed8d92f309a3d28c23ea28e245d93ceaf08c Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Wed, 21 Feb 2024 11:12:00 +0900 Subject: [PATCH 5/9] Save hf.dataset to use cache --- examples/training/swallow-tart/args.py | 1 + examples/training/swallow-tart/data.py | 93 +++- examples/training/swallow-tart/h | 581 ++++++++++++++++++++ examples/training/swallow-tart/run_train.py | 3 + 4 files changed, 661 insertions(+), 17 deletions(-) create mode 100644 examples/training/swallow-tart/h diff --git a/examples/training/swallow-tart/args.py b/examples/training/swallow-tart/args.py index e2cda57c6..d7d2767dc 100644 --- a/examples/training/swallow-tart/args.py +++ b/examples/training/swallow-tart/args.py @@ -26,6 +26,7 @@ def __post_init__(self): @dataclass class STDataArgumnets: data_dir: str + hf_dataset_dir: str task_names: list[str] = field(default_factory=list) max_length: int = 512 n_dev_sample: int = 100 diff --git a/examples/training/swallow-tart/data.py b/examples/training/swallow-tart/data.py index 5aa20deac..d7617f114 100644 --- a/examples/training/swallow-tart/data.py +++ b/examples/training/swallow-tart/data.py @@ -2,10 +2,12 @@ import json import random from collections import defaultdict +from pathlib import Path from typing import Callable, Optional, Tuple import datasets import torch +from datasets import load_from_disk from sentence_transformers.huggingface import SENTENCE_KEYS from torch.utils.data import Dataset from tqdm import tqdm @@ -194,7 +196,7 @@ def load_qrels(qrels_path: str) -> dict[str, list[int]]: qid = data[0] did = data[1] qrels[qid].append(did) - return qrels + return dict(qrels) def load_hard_negatives(hard_negatives_path: str) -> dict[str, list[int]]: @@ -209,16 +211,16 @@ def load_hard_negatives(hard_negatives_path: str) -> dict[str, list[int]]: data = json.loads(line) qid = data["query_id"] hard_negative[qid].extend(data["hard_negative"]) - return hard_negative + return dict(hard_negative) -def load_ir_dataset( +def prepare_ir_dataset( task_names: list[str], input_data_dir: str, query_file_name: str, corpus_file_name: str, qrel_file_name: str, - hard_negative_file_name: str, + hard_negatives_file_name: str, ) -> datasets.Dataset: # load dataset # {"query": str, "positives": list[str], "negatives": list[str]} @@ -228,7 +230,7 @@ def load_ir_dataset( "queries": os.path.join(input_data_dir, task_name, query_file_name), "corpus": os.path.join(input_data_dir, task_name, corpus_file_name), "qrels": os.path.join(input_data_dir, task_name, qrel_file_name), - "hard_negatives": os.path.join(input_data_dir, task_name, hard_negative_file_name), + "hard_negatives": os.path.join(input_data_dir, task_name, hard_negatives_file_name), } queries = load_queries(target_path["queries"]) @@ -239,14 +241,39 @@ def load_ir_dataset( logger.info(f"...Task: {task_name}") current_dataset = [] for qid, query in tqdm(queries.items()): + if qid not in qrels: + logger.info(f"......qid: {qid} is not included at the qrel. 
skip this query.") + continue positive_ids = qrels[qid] - positives = [corpus[pos_id] for pos_id in positive_ids] + + positives = [] + for pos_id in positive_ids: + if pos_id not in corpus: + continue + positive_text = corpus[pos_id] + if positive_text is not None: + positives.append(corpus[pos_id]) + if len(positives) == 0: + logger.info(f"......qid: {qid} doesn't have positive passage. skip this query.") + continue random.shuffle(positives) + if qid not in hard_negatives: continue negative_ids = hard_negatives[qid] - negatives = [corpus[neg_id] for neg_id in negative_ids] + + negatives = [] + for neg_id in negative_ids: + if neg_id not in corpus: + continue + negative_text = corpus[neg_id] + if negative_text is not None: + negatives.append(corpus[neg_id]) + if len(negatives) == 0: + logger.info(f"......qid: {qid} doesn't have negative passage. skip this query.") + continue random.shuffle(negatives) + current_dataset.append({"query": query, "positives": positives, "negatives": negatives, "label": task_idx}) target_datasets.append(datasets.Dataset.from_list(current_dataset)) @@ -255,7 +282,35 @@ def load_ir_dataset( return target_concat_dataset +def load_ir_dataset( + dataset_path: Path, + task_names: list[str], + input_data_dir: str, + query_file_name: str, + corpus_file_name: str, + qrel_file_name: str, + hard_negatives_file_name: str, + n_each_dev_sample: int, +) -> datasets.Dataset: + if not dataset_path.exists(): + logger.info("Build huggingface datasets.") + hf_dataset = prepare_ir_dataset( + task_names, input_data_dir, query_file_name, corpus_file_name, qrel_file_name, hard_negatives_file_name + ) + logger.info("Split train/dev dataset.") + hf_dataset = hf_dataset.class_encode_column("label") + n_dev_sample = n_each_dev_sample * len(task_names) + hf_dataset = hf_dataset.train_test_split(test_size=n_dev_sample, shuffle=True, stratify_by_column="label") + + logger.info(f"Save DatasetDict to {str(dataset_path)}.") + hf_dataset.save_to_disk(str(dataset_path), max_shard_size="1GB") + + hf_dataset = load_from_disk(dataset_path) + return hf_dataset + + def get_dataset( + hf_dataset_dir: str, task_names: list[str], input_data_dir: str, query_file_name: str, @@ -269,32 +324,36 @@ def get_dataset( num_proc: int = 1, ) -> Tuple[Dataset, Dataset]: # build HF Dataset - logger.info("Build huggingface datasets.") + logger.info("Load huggingface datasets.") hf_dataset = load_ir_dataset( - task_names, input_data_dir, query_file_name, corpus_file_name, qrel_file_name, hard_negatives_file_name + Path(hf_dataset_dir), + task_names, + input_data_dir, + query_file_name, + corpus_file_name, + qrel_file_name, + hard_negatives_file_name, + n_each_dev_sample, ) # apply preprocess (mainly tokenization (make word ids)) logger.info("Apply preprocessing.") - remove_column_names = hf_dataset.column_names.remove("label") + remove_column_names = hf_dataset.column_names["train"].remove("label") hf_dataset = hf_dataset.map( process_func, batched=True, - num_proc=num_proc, remove_columns=remove_column_names, + num_proc=num_proc, desc="Running Tokenizer on dataset", ) # split train/dev dataset - logger.info("Split train/dev dataset.") - hf_dataset = hf_dataset.class_encode_column("label") - n_dev_sample = n_each_dev_sample * len(task_names) - train_dev_dataset = hf_dataset.train_test_split(test_size=n_dev_sample, shuffle=True, stratify_by_column="label") - train_dataset = train_dev_dataset["train"] - dev_dataset = train_dev_dataset["test"] + train_dataset = hf_dataset["train"] + dev_dataset = hf_dataset["test"] 
logger.info(f"Train dataset size: {len(train_dataset)}") logger.info(f"Dev dataset size: {len(dev_dataset)}") + # build Torch Dataset and Return ones. train_torch_dataset = MNRLDataset(train_dataset, tokenizer, max_length) dev_torch_dataset = MNRLDataset(dev_dataset, tokenizer, max_length) diff --git a/examples/training/swallow-tart/h b/examples/training/swallow-tart/h new file mode 100644 index 000000000..95c104586 --- /dev/null +++ b/examples/training/swallow-tart/h @@ -0,0 +1,581 @@ +usage: run_train.py [-h] --data_dir DATA_DIR + [--task_names TASK_NAMES [TASK_NAMES ...]] + [--max_length MAX_LENGTH] [--n_dev_sample N_DEV_SAMPLE] + [--query_file_name QUERY_FILE_NAME] + [--corpus_file_name CORPUS_FILE_NAME] + [--qrel_file_name QREL_FILE_NAME] + [--hard_negatives_file_name HARD_NEGATIVES_FILE_NAME] + [--num_proc NUM_PROC] [--model_name MODEL_NAME] + [--peft_config_path PEFT_CONFIG_PATH] + [--use_flash_attention [USE_FLASH_ATTENTION]] --output_dir + OUTPUT_DIR [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] + [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] + [--do_predict [DO_PREDICT]] + [--evaluation_strategy {no,steps,epoch}] + [--prediction_loss_only [PREDICTION_LOSS_ONLY]] + [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] + [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] + [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] + [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] + [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] + [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] + [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] + [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] + [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] + [--max_grad_norm MAX_GRAD_NORM] + [--num_train_epochs NUM_TRAIN_EPOCHS] + [--max_steps MAX_STEPS] + [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}] + [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] + [--warmup_ratio WARMUP_RATIO] + [--warmup_steps WARMUP_STEPS] + [--log_level {detail,debug,info,warning,error,critical,passive}] + [--log_level_replica {detail,debug,info,warning,error,critical,passive}] + [--log_on_each_node [LOG_ON_EACH_NODE]] + [--no_log_on_each_node] [--logging_dir LOGGING_DIR] + [--logging_strategy {no,steps,epoch}] + [--logging_first_step [LOGGING_FIRST_STEP]] + [--logging_steps LOGGING_STEPS] + [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] + [--no_logging_nan_inf_filter] + [--save_strategy {no,steps,epoch}] + [--save_steps SAVE_STEPS] + [--save_total_limit SAVE_TOTAL_LIMIT] + [--save_safetensors [SAVE_SAFETENSORS]] + [--no_save_safetensors] + [--save_on_each_node [SAVE_ON_EACH_NODE]] + [--save_only_model [SAVE_ONLY_MODEL]] + [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] + [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] + [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] + [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] + [--fp16_opt_level FP16_OPT_LEVEL] + [--half_precision_backend {auto,apex,cpu_amp}] + [--bf16_full_eval [BF16_FULL_EVAL]] + [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] + [--local_rank LOCAL_RANK] + [--ddp_backend {nccl,gloo,mpi,ccl,hccl}] + [--tpu_num_cores TPU_NUM_CORES] + [--tpu_metrics_debug [TPU_METRICS_DEBUG]] + [--debug DEBUG [DEBUG ...]] + [--dataloader_drop_last [DATALOADER_DROP_LAST]] + [--eval_steps EVAL_STEPS] + [--dataloader_num_workers DATALOADER_NUM_WORKERS] + [--past_index PAST_INDEX] [--run_name RUN_NAME] + [--disable_tqdm DISABLE_TQDM] + 
[--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] + [--no_remove_unused_columns] + [--label_names LABEL_NAMES [LABEL_NAMES ...]] + [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] + [--metric_for_best_model METRIC_FOR_BEST_MODEL] + [--greater_is_better GREATER_IS_BETTER] + [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] + [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] + [--fsdp_config FSDP_CONFIG] + [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] + [--deepspeed DEEPSPEED] + [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] + [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop}] + [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] + [--group_by_length [GROUP_BY_LENGTH]] + [--length_column_name LENGTH_COLUMN_NAME] + [--report_to REPORT_TO [REPORT_TO ...]] + [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] + [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] + [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] + [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] + [--no_dataloader_pin_memory] + [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] + [--skip_memory_metrics [SKIP_MEMORY_METRICS]] + [--no_skip_memory_metrics] + [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] + [--push_to_hub [PUSH_TO_HUB]] + [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] + [--hub_model_id HUB_MODEL_ID] + [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] + [--hub_token HUB_TOKEN] + [--hub_private_repo [HUB_PRIVATE_REPO]] + [--hub_always_push [HUB_ALWAYS_PUSH]] + [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] + [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] + [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] + [--fp16_backend {auto,apex,cpu_amp}] + [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] + [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] + [--push_to_hub_token PUSH_TO_HUB_TOKEN] + [--mp_parameters MP_PARAMETERS] + [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] + [--full_determinism [FULL_DETERMINISM]] + [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] + [--ddp_timeout DDP_TIMEOUT] + [--torch_compile [TORCH_COMPILE]] + [--torch_compile_backend TORCH_COMPILE_BACKEND] + [--torch_compile_mode TORCH_COMPILE_MODE] + [--dispatch_batches DISPATCH_BATCHES] + [--split_batches [SPLIT_BATCHES]] + [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] + [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] + [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] + +options: + -h, --help show this help message and exit + --data_dir DATA_DIR + --task_names TASK_NAMES [TASK_NAMES ...] + --max_length MAX_LENGTH + --n_dev_sample N_DEV_SAMPLE + --query_file_name QUERY_FILE_NAME + --corpus_file_name CORPUS_FILE_NAME + --qrel_file_name QREL_FILE_NAME + --hard_negatives_file_name HARD_NEGATIVES_FILE_NAME + --num_proc NUM_PROC + --model_name MODEL_NAME + --peft_config_path PEFT_CONFIG_PATH + --use_flash_attention [USE_FLASH_ATTENTION] + --output_dir OUTPUT_DIR + The output directory where the model predictions and + checkpoints will be written. (default: None) + --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] + Overwrite the content of the output directory. Use + this to continue training if output_dir points to a + checkpoint directory. (default: False) + --do_train [DO_TRAIN] + Whether to run training. 
(default: False) + --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) + --do_predict [DO_PREDICT] + Whether to run predictions on the test set. (default: + False) + --evaluation_strategy {no,steps,epoch} + The evaluation strategy to use. (default: no) + --prediction_loss_only [PREDICTION_LOSS_ONLY] + When performing evaluation and predictions, only + returns the loss. (default: False) + --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for training. + (default: 8) + --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for + evaluation. (default: 8) + --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE + Deprecated, the use of `--per_device_train_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + training. (default: None) + --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE + Deprecated, the use of `--per_device_eval_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + evaluation. (default: None) + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + Number of updates steps to accumulate before + performing a backward/update pass. (default: 1) + --eval_accumulation_steps EVAL_ACCUMULATION_STEPS + Number of predictions steps to accumulate before + moving the tensors to the CPU. (default: None) + --eval_delay EVAL_DELAY + Number of epochs or steps to wait for before the first + evaluation can be performed, depending on the + evaluation_strategy. (default: 0) + --learning_rate LEARNING_RATE + The initial learning rate for AdamW. (default: 5e-05) + --weight_decay WEIGHT_DECAY + Weight decay for AdamW if we apply some. (default: + 0.0) + --adam_beta1 ADAM_BETA1 + Beta1 for AdamW optimizer (default: 0.9) + --adam_beta2 ADAM_BETA2 + Beta2 for AdamW optimizer (default: 0.999) + --adam_epsilon ADAM_EPSILON + Epsilon for AdamW optimizer. (default: 1e-08) + --max_grad_norm MAX_GRAD_NORM + Max gradient norm. (default: 1.0) + --num_train_epochs NUM_TRAIN_EPOCHS + Total number of training epochs to perform. (default: + 3.0) + --max_steps MAX_STEPS + If > 0: set total number of training steps to perform. + Override num_train_epochs. (default: -1) + --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau} + The scheduler type to use. (default: linear) + --lr_scheduler_kwargs LR_SCHEDULER_KWARGS + Extra parameters for the lr_scheduler such as + {'num_cycles': 1} for the cosine with hard restarts + (default: {}) + --warmup_ratio WARMUP_RATIO + Linear warmup over warmup_ratio fraction of total + steps. (default: 0.0) + --warmup_steps WARMUP_STEPS + Linear warmup over warmup_steps. (default: 0) + --log_level {detail,debug,info,warning,error,critical,passive} + Logger log level to use on the main node. Possible + choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a + 'passive' level which doesn't set anything and lets + the application set the level. Defaults to 'passive'. + (default: passive) + --log_level_replica {detail,debug,info,warning,error,critical,passive} + Logger log level to use on replica nodes. Same choices + and defaults as ``log_level`` (default: warning) + --log_on_each_node [LOG_ON_EACH_NODE] + When doing a multinode distributed training, whether + to log once per node or just once on the main node. 
+ (default: True) + --no_log_on_each_node + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: False) + --logging_dir LOGGING_DIR + Tensorboard log dir. (default: None) + --logging_strategy {no,steps,epoch} + The logging strategy to use. (default: steps) + --logging_first_step [LOGGING_FIRST_STEP] + Log the first global_step (default: False) + --logging_steps LOGGING_STEPS + Log every X updates steps. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be + interpreted as ratio of total training steps. + (default: 500) + --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] + Filter nan and inf losses for logging. (default: True) + --no_logging_nan_inf_filter + Filter nan and inf losses for logging. (default: + False) + --save_strategy {no,steps,epoch} + The checkpoint save strategy to use. (default: steps) + --save_steps SAVE_STEPS + Save checkpoint every X updates steps. Should be an + integer or a float in range `[0,1)`. If smaller than + 1, will be interpreted as ratio of total training + steps. (default: 500) + --save_total_limit SAVE_TOTAL_LIMIT + If a value is passed, will limit the total amount of + checkpoints. Deletes the older checkpoints in + `output_dir`. When `load_best_model_at_end` is + enabled, the 'best' checkpoint according to + `metric_for_best_model` will always be retained in + addition to the most recent ones. For example, for + `save_total_limit=5` and + `load_best_model_at_end=True`, the four last + checkpoints will always be retained alongside the best + model. When `save_total_limit=1` and + `load_best_model_at_end=True`, it is possible that two + checkpoints are saved: the last one and the best one + (if they are different). Default is unlimited + checkpoints (default: None) + --save_safetensors [SAVE_SAFETENSORS] + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: True) + --no_save_safetensors + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: False) + --save_on_each_node [SAVE_ON_EACH_NODE] + When doing multi-node distributed training, whether to + save models and checkpoints on each node, or only on + the main one (default: False) + --save_only_model [SAVE_ONLY_MODEL] + When checkpointing, whether to only save the model, or + also the optimizer, scheduler & rng state.Note that + when this is true, you won't be able to resume + training from checkpoint.This enables you to save + storage by not storing the optimizer, scheduler & rng + state.You can only load the model using + from_pretrained with this option set to True. + (default: False) + --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in + version 5.0 of 🤗 Transformers. (default: False) + --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will + use cuda/tpu/mps/npu device if available. (default: + False) + --use_mps_device [USE_MPS_DEVICE] + This argument is deprecated. `mps` device will be used + if available similar to `cuda` device. It will be + removed in version 5.0 of 🤗 Transformers (default: + False) + --seed SEED Random seed that will be set at the beginning of + training. (default: 42) + --data_seed DATA_SEED + Random seed to be used with data samplers. 
(default: + None) + --jit_mode_eval [JIT_MODE_EVAL] + Whether or not to use PyTorch jit trace for inference + (default: False) + --use_ipex [USE_IPEX] + Use Intel extension for PyTorch when it is available, + installation: 'https://github.com/intel/intel- + extension-for-pytorch' (default: False) + --bf16 [BF16] Whether to use bf16 (mixed) precision instead of + 32-bit. Requires Ampere or higher NVIDIA architecture + or using CPU (use_cpu) or Ascend NPU. This is an + experimental API and it may change. (default: False) + --fp16 [FP16] Whether to use fp16 (mixed) precision instead of + 32-bit (default: False) + --fp16_opt_level FP16_OPT_LEVEL + For fp16: Apex AMP optimization level selected in + ['O0', 'O1', 'O2', and 'O3']. See details at + https://nvidia.github.io/apex/amp.html (default: O1) + --half_precision_backend {auto,apex,cpu_amp} + The backend to be used for half precision. (default: + auto) + --bf16_full_eval [BF16_FULL_EVAL] + Whether to use full bfloat16 evaluation instead of + 32-bit. This is an experimental API and it may change. + (default: False) + --fp16_full_eval [FP16_FULL_EVAL] + Whether to use full float16 evaluation instead of + 32-bit (default: False) + --tf32 TF32 Whether to enable tf32 mode, available in Ampere and + newer GPU architectures. This is an experimental API + and it may change. (default: None) + --local_rank LOCAL_RANK + For distributed training: local_rank (default: -1) + --ddp_backend {nccl,gloo,mpi,ccl,hccl} + The backend to be used for distributed training + (default: None) + --tpu_num_cores TPU_NUM_CORES + TPU: Number of TPU cores (automatically passed by + launcher script) (default: None) + --tpu_metrics_debug [TPU_METRICS_DEBUG] + Deprecated, the use of `--debug tpu_metrics_debug` is + preferred. TPU: Whether to print debug metrics + (default: False) + --debug DEBUG [DEBUG ...] + Whether or not to enable debug mode. Current options: + `underflow_overflow` (Detect underflow and overflow in + activations and weights), `tpu_metrics_debug` (print + debug metrics on TPU). (default: None) + --dataloader_drop_last [DATALOADER_DROP_LAST] + Drop the last incomplete batch if it is not divisible + by the batch size. (default: False) + --eval_steps EVAL_STEPS + Run an evaluation every X steps. Should be an integer + or a float in range `[0,1)`. If smaller than 1, will + be interpreted as ratio of total training steps. + (default: None) + --dataloader_num_workers DATALOADER_NUM_WORKERS + Number of subprocesses to use for data loading + (PyTorch only). 0 means that the data will be loaded + in the main process. (default: 0) + --past_index PAST_INDEX + If >=0, uses the corresponding part of the output as + the past state for next step. (default: -1) + --run_name RUN_NAME An optional descriptor for the run. Notably used for + wandb logging. (default: None) + --disable_tqdm DISABLE_TQDM + Whether or not to disable the tqdm progress bars. + (default: None) + --remove_unused_columns [REMOVE_UNUSED_COLUMNS] + Remove columns not required by the model when using an + nlp.Dataset. (default: True) + --no_remove_unused_columns + Remove columns not required by the model when using an + nlp.Dataset. (default: False) + --label_names LABEL_NAMES [LABEL_NAMES ...] + The list of keys in your dictionary of inputs that + correspond to the labels. (default: None) + --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] + Whether or not to load the best model found during + training at the end of training. When this option is + enabled, the best checkpoint will always be saved. 
See + `save_total_limit` for more. (default: False) + --metric_for_best_model METRIC_FOR_BEST_MODEL + The metric to use to compare two different models. + (default: None) + --greater_is_better GREATER_IS_BETTER + Whether the `metric_for_best_model` should be + maximized or not. (default: None) + --ignore_data_skip [IGNORE_DATA_SKIP] + When resuming training, whether or not to skip the + first epochs and batches to get to the same training + data. (default: False) + --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data + Parallel (FSDP) training (in distributed training + only). The base option should be `full_shard`, + `shard_grad_op` or `no_shard` and you can add CPU- + offload to `full_shard` or `shard_grad_op` like this: + full_shard offload` or `shard_grad_op offload`. You + can add auto-wrap to `full_shard` or `shard_grad_op` + with the same syntax: full_shard auto_wrap` or + `shard_grad_op auto_wrap`. (default: ) + --fsdp_min_num_params FSDP_MIN_NUM_PARAMS + This parameter is deprecated. FSDP's minimum number of + parameters for Default Auto Wrapping. (useful only + when `fsdp` field is passed). (default: 0) + --fsdp_config FSDP_CONFIG + Config to be used with FSDP (Pytorch Fully Sharded + Data Parallel). The value is either a fsdp json config + file (e.g., `fsdp_config.json`) or an already loaded + json file as `dict`. (default: None) + --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP + This parameter is deprecated. Transformer layer class + name (case-sensitive) to wrap, e.g, `BertLayer`, + `GPTJBlock`, `T5Block` .... (useful only when `fsdp` + flag is passed). (default: None) + --deepspeed DEEPSPEED + Enable deepspeed and pass the path to deepspeed json + config file (e.g. `ds_config.json`) or an already + loaded json file as a dict (default: None) + --label_smoothing_factor LABEL_SMOOTHING_FACTOR + The label smoothing epsilon to apply (zero means no + label smoothing). (default: 0.0) + --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop} + The optimizer to use. (default: adamw_torch) + --optim_args OPTIM_ARGS + Optional arguments to supply to optimizer. (default: + None) + --adafactor [ADAFACTOR] + Whether or not to replace AdamW by Adafactor. + (default: False) + --group_by_length [GROUP_BY_LENGTH] + Whether or not to group samples of roughly the same + length together when batching. (default: False) + --length_column_name LENGTH_COLUMN_NAME + Column name with precomputed lengths to use when + grouping by length. (default: length) + --report_to REPORT_TO [REPORT_TO ...] + The list of integrations to report the results and + logs to. (default: None) + --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS + When using distributed training, the value of the flag + `find_unused_parameters` passed to + `DistributedDataParallel`. (default: None) + --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB + When using distributed training, the value of the flag + `bucket_cap_mb` passed to `DistributedDataParallel`. + (default: None) + --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS + When using distributed training, the value of the flag + `broadcast_buffers` passed to + `DistributedDataParallel`. (default: None) + --dataloader_pin_memory [DATALOADER_PIN_MEMORY] + Whether or not to pin memory for DataLoader. 
(default: + True) + --no_dataloader_pin_memory + Whether or not to pin memory for DataLoader. (default: + False) + --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] + If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This + allows to maintain the workers Dataset instances + alive. Can potentially speed up training, but will + increase RAM usage. (default: False) + --skip_memory_metrics [SKIP_MEMORY_METRICS] + Whether or not to skip adding of memory profiler + reports to metrics. (default: True) + --no_skip_memory_metrics + Whether or not to skip adding of memory profiler + reports to metrics. (default: False) + --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] + Whether or not to use the legacy prediction_loop in + the Trainer. (default: False) + --push_to_hub [PUSH_TO_HUB] + Whether or not to upload the trained model to the + model hub after training. (default: False) + --resume_from_checkpoint RESUME_FROM_CHECKPOINT + The path to a folder with a valid checkpoint for your + model. (default: None) + --hub_model_id HUB_MODEL_ID + The name of the repository to keep in sync with the + local `output_dir`. (default: None) + --hub_strategy {end,every_save,checkpoint,all_checkpoints} + The hub strategy to use when `--push_to_hub` is + activated. (default: every_save) + --hub_token HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --hub_private_repo [HUB_PRIVATE_REPO] + Whether the model repository is private or not. + (default: False) + --hub_always_push [HUB_ALWAYS_PUSH] + Unless `True`, the Trainer will skip pushes if the + previous one wasn't finished yet. (default: False) + --gradient_checkpointing [GRADIENT_CHECKPOINTING] + If True, use gradient checkpointing to save memory at + the expense of slower backward pass. (default: False) + --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS + Gradient checkpointing key word arguments such as + `use_reentrant`. Will be passed to + `torch.utils.checkpoint.checkpoint` through + `model.gradient_checkpointing_enable`. (default: None) + --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] + Whether or not the inputs will be passed to the + `compute_metrics` function. (default: False) + --fp16_backend {auto,apex,cpu_amp} + Deprecated. Use half_precision_backend instead + (default: auto) + --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID + The name of the repository to which push the + `Trainer`. (default: None) + --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION + The name of the organization in with to which push the + `Trainer`. (default: None) + --push_to_hub_token PUSH_TO_HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --mp_parameters MP_PARAMETERS + Used by the SageMaker launcher to send mp-specific + args. Ignored in Trainer (default: ) + --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] + Whether to automatically decrease the batch size in + half and rerun the training loop again each time a + CUDA Out-of-Memory was reached (default: False) + --full_determinism [FULL_DETERMINISM] + Whether to call enable_full_determinism instead of + set_seed for reproducibility in distributed training. + Important: this will negatively impact the + performance, so only use it for debugging. (default: + False) + --torchdynamo TORCHDYNAMO + This argument is deprecated, use + `--torch_compile_backend` instead. (default: None) + --ray_scope RAY_SCOPE + The scope to use when doing hyperparameter search with + Ray. 
By default, `"last"` will be used. Ray will then + use the last checkpoint of all trials, compare those, + and select the best one. However, other options are + also available. See the Ray documentation (https://doc + s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun + e.ExperimentAnalysis.get_best_trial) for more options. + (default: last) + --ddp_timeout DDP_TIMEOUT + Overrides the default timeout for distributed training + (value should be given in seconds). (default: 1800) + --torch_compile [TORCH_COMPILE] + If set to `True`, the model will be wrapped in + `torch.compile`. (default: False) + --torch_compile_backend TORCH_COMPILE_BACKEND + Which backend to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --torch_compile_mode TORCH_COMPILE_MODE + Which mode to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --dispatch_batches DISPATCH_BATCHES + Whether to dispatch batches across devices in + distributed training. If set to `True`, the dataloader + prepared by the Accelerator is only iterated through + on the main process and then the batches are split and + broadcast to each process. Will default to `True` for + `DataLoader` whoseunderlying dataset is an + `IterableDataset`, `False` otherwise. (default: None) + --split_batches [SPLIT_BATCHES] + Whether or not the accelerator should split the + batches yielded by the dataloaders across the devices + during distributed training. Ifset to `True`, the + actual batch size used will be the same on any kind of + distributed processes, but it must be around multiple + of the number of processes you are using (such as + GPUs). (default: False) + --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] + If set to `True`, the speed metrics will include `tgs` + (tokens per second per device). (default: False) + --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] + If set to `True`, will track the number of input + tokens seen throughout training. (May be slower in + distributed training) (default: False) + --neftune_noise_alpha NEFTUNE_NOISE_ALPHA + Activates neftune noise embeddings into the model. + NEFTune has been proven to drastically improve model + performances for instrcution fine-tuning. Check out + the original paper here: + https://arxiv.org/abs/2310.05914 and the original code + here: https://github.com/neelsjain/NEFTune. Only + supported for `PreTrainedModel` and `PeftModel` + classes. 
(default: None) diff --git a/examples/training/swallow-tart/run_train.py b/examples/training/swallow-tart/run_train.py index e2cc3d4ea..f1bdee901 100644 --- a/examples/training/swallow-tart/run_train.py +++ b/examples/training/swallow-tart/run_train.py @@ -52,6 +52,8 @@ def main(): # validate fp16 or bf16 assert training_args.fp16 or training_args.bf16, "use_flash_attention requires fp16 or bf16" model_kwargs = {"attn_implementation": "flash_attention_2"} + else: + model_kwargs = {} tf_model = Transformer( model_args.model_name, model_args=model_kwargs, @@ -76,6 +78,7 @@ def main(): # preprocessor = TokenizeProcessor(tokenizer, data_args.max_length) preprocessor = TokenizeBatchProcessor(tokenizer, data_args.max_length) train_dataset, eval_dataset = get_dataset( + data_args.hf_dataset_dir, data_args.task_names, data_args.data_dir, data_args.query_file_name, From 1e39898c4d78ef3502c9c3417457bd5f0f9d259f Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Wed, 21 Feb 2024 14:57:25 +0900 Subject: [PATCH 6/9] Apply formatter --- examples/training/swallow-tart/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/training/swallow-tart/data.py b/examples/training/swallow-tart/data.py index d7617f114..39d36722f 100644 --- a/examples/training/swallow-tart/data.py +++ b/examples/training/swallow-tart/data.py @@ -353,7 +353,6 @@ def get_dataset( logger.info(f"Train dataset size: {len(train_dataset)}") logger.info(f"Dev dataset size: {len(dev_dataset)}") - # build Torch Dataset and Return ones. train_torch_dataset = MNRLDataset(train_dataset, tokenizer, max_length) dev_torch_dataset = MNRLDataset(dev_dataset, tokenizer, max_length) From 95ccb0d49fbd7af5aeced526d01663c916ebfc2e Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Wed, 21 Feb 2024 16:04:11 +0900 Subject: [PATCH 7/9] Apply ruff --- examples/training/swallow-tart/run_train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/training/swallow-tart/run_train.py b/examples/training/swallow-tart/run_train.py index f1bdee901..ddc9253ba 100644 --- a/examples/training/swallow-tart/run_train.py +++ b/examples/training/swallow-tart/run_train.py @@ -18,7 +18,7 @@ from transformers.utils import logging from args import STDataArgumnets, STModelArguments, STTrainingArguments -from data import get_dataset, TokenizeProcessor, TokenizeBatchProcessor, IRCollator +from data import get_dataset, TokenizeBatchProcessor, IRCollator logger = logging.get_logger(__name__) @@ -75,7 +75,6 @@ def main(): # define train/eval dataset logger.info("Load dataset") logger.info(f"Target task names: {data_args.task_names}") - # preprocessor = TokenizeProcessor(tokenizer, data_args.max_length) preprocessor = TokenizeBatchProcessor(tokenizer, data_args.max_length) train_dataset, eval_dataset = get_dataset( data_args.hf_dataset_dir, From 78b324affc89935c523b989227a9dd83c8b91acd Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Thu, 22 Feb 2024 11:28:03 +0900 Subject: [PATCH 8/9] Add ds and lora config --- .../swallow-tart/configs/ds_config_zero3.json | 60 +++++++++++++++++++ .../swallow-tart/configs/lora_config.json | 27 +++++++++ install-deepspeed.sh | 2 + 3 files changed, 89 insertions(+) create mode 100644 examples/training/swallow-tart/configs/ds_config_zero3.json create mode 100644 examples/training/swallow-tart/configs/lora_config.json create mode 100755 install-deepspeed.sh diff --git a/examples/training/swallow-tart/configs/ds_config_zero3.json b/examples/training/swallow-tart/configs/ds_config_zero3.json new file mode 100644 index 
000000000..ac7eeb18b --- /dev/null +++ b/examples/training/swallow-tart/configs/ds_config_zero3.json @@ -0,0 +1,60 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 10, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 10, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": "auto" + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/examples/training/swallow-tart/configs/lora_config.json b/examples/training/swallow-tart/configs/lora_config.json new file mode 100644 index 000000000..26705720e --- /dev/null +++ b/examples/training/swallow-tart/configs/lora_config.json @@ -0,0 +1,27 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "tokyotech-llm/Swallow-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "down_proj", + "up_proj", + "gate_proj" + ], + "task_type": "FEATURE_EXTRACTION", + "use_rslora": true +} diff --git a/install-deepspeed.sh b/install-deepspeed.sh new file mode 100755 index 000000000..35414b901 --- /dev/null +++ b/install-deepspeed.sh @@ -0,0 +1,2 @@ +#!/bin/sh +DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_UTILS=1 pip install deepspeed --no-cache-dir From 3e7363ce19ec845cfa206882fbc342864fe8b9bf Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Thu, 22 Feb 2024 11:32:53 +0900 Subject: [PATCH 9/9] Delete files --- examples/training/swallow-tart/h | 581 ------------------ .../training/swallow-tart/peft_config.json | 29 - 2 files changed, 610 deletions(-) delete mode 100644 examples/training/swallow-tart/h delete mode 100644 examples/training/swallow-tart/peft_config.json diff --git a/examples/training/swallow-tart/h b/examples/training/swallow-tart/h deleted file mode 100644 index 95c104586..000000000 --- a/examples/training/swallow-tart/h +++ /dev/null @@ -1,581 +0,0 @@ -usage: run_train.py [-h] --data_dir DATA_DIR - [--task_names TASK_NAMES [TASK_NAMES ...]] - [--max_length MAX_LENGTH] [--n_dev_sample N_DEV_SAMPLE] - [--query_file_name QUERY_FILE_NAME] - [--corpus_file_name CORPUS_FILE_NAME] - [--qrel_file_name QREL_FILE_NAME] - [--hard_negatives_file_name HARD_NEGATIVES_FILE_NAME] - [--num_proc NUM_PROC] [--model_name MODEL_NAME] - 
[--peft_config_path PEFT_CONFIG_PATH] - [--use_flash_attention [USE_FLASH_ATTENTION]] --output_dir - OUTPUT_DIR [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] - [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] - [--do_predict [DO_PREDICT]] - [--evaluation_strategy {no,steps,epoch}] - [--prediction_loss_only [PREDICTION_LOSS_ONLY]] - [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] - [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] - [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] - [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] - [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] - [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] - [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] - [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] - [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] - [--max_grad_norm MAX_GRAD_NORM] - [--num_train_epochs NUM_TRAIN_EPOCHS] - [--max_steps MAX_STEPS] - [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}] - [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] - [--warmup_ratio WARMUP_RATIO] - [--warmup_steps WARMUP_STEPS] - [--log_level {detail,debug,info,warning,error,critical,passive}] - [--log_level_replica {detail,debug,info,warning,error,critical,passive}] - [--log_on_each_node [LOG_ON_EACH_NODE]] - [--no_log_on_each_node] [--logging_dir LOGGING_DIR] - [--logging_strategy {no,steps,epoch}] - [--logging_first_step [LOGGING_FIRST_STEP]] - [--logging_steps LOGGING_STEPS] - [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] - [--no_logging_nan_inf_filter] - [--save_strategy {no,steps,epoch}] - [--save_steps SAVE_STEPS] - [--save_total_limit SAVE_TOTAL_LIMIT] - [--save_safetensors [SAVE_SAFETENSORS]] - [--no_save_safetensors] - [--save_on_each_node [SAVE_ON_EACH_NODE]] - [--save_only_model [SAVE_ONLY_MODEL]] - [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] - [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] - [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] - [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] - [--fp16_opt_level FP16_OPT_LEVEL] - [--half_precision_backend {auto,apex,cpu_amp}] - [--bf16_full_eval [BF16_FULL_EVAL]] - [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] - [--local_rank LOCAL_RANK] - [--ddp_backend {nccl,gloo,mpi,ccl,hccl}] - [--tpu_num_cores TPU_NUM_CORES] - [--tpu_metrics_debug [TPU_METRICS_DEBUG]] - [--debug DEBUG [DEBUG ...]] - [--dataloader_drop_last [DATALOADER_DROP_LAST]] - [--eval_steps EVAL_STEPS] - [--dataloader_num_workers DATALOADER_NUM_WORKERS] - [--past_index PAST_INDEX] [--run_name RUN_NAME] - [--disable_tqdm DISABLE_TQDM] - [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] - [--no_remove_unused_columns] - [--label_names LABEL_NAMES [LABEL_NAMES ...]] - [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] - [--metric_for_best_model METRIC_FOR_BEST_MODEL] - [--greater_is_better GREATER_IS_BETTER] - [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] - [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] - [--fsdp_config FSDP_CONFIG] - [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] - [--deepspeed DEEPSPEED] - [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] - [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop}] - [--optim_args OPTIM_ARGS] 
[--adafactor [ADAFACTOR]] - [--group_by_length [GROUP_BY_LENGTH]] - [--length_column_name LENGTH_COLUMN_NAME] - [--report_to REPORT_TO [REPORT_TO ...]] - [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] - [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] - [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] - [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] - [--no_dataloader_pin_memory] - [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] - [--skip_memory_metrics [SKIP_MEMORY_METRICS]] - [--no_skip_memory_metrics] - [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] - [--push_to_hub [PUSH_TO_HUB]] - [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] - [--hub_model_id HUB_MODEL_ID] - [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] - [--hub_token HUB_TOKEN] - [--hub_private_repo [HUB_PRIVATE_REPO]] - [--hub_always_push [HUB_ALWAYS_PUSH]] - [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] - [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] - [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] - [--fp16_backend {auto,apex,cpu_amp}] - [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] - [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] - [--push_to_hub_token PUSH_TO_HUB_TOKEN] - [--mp_parameters MP_PARAMETERS] - [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] - [--full_determinism [FULL_DETERMINISM]] - [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] - [--ddp_timeout DDP_TIMEOUT] - [--torch_compile [TORCH_COMPILE]] - [--torch_compile_backend TORCH_COMPILE_BACKEND] - [--torch_compile_mode TORCH_COMPILE_MODE] - [--dispatch_batches DISPATCH_BATCHES] - [--split_batches [SPLIT_BATCHES]] - [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] - [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] - [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] - -options: - -h, --help show this help message and exit - --data_dir DATA_DIR - --task_names TASK_NAMES [TASK_NAMES ...] - --max_length MAX_LENGTH - --n_dev_sample N_DEV_SAMPLE - --query_file_name QUERY_FILE_NAME - --corpus_file_name CORPUS_FILE_NAME - --qrel_file_name QREL_FILE_NAME - --hard_negatives_file_name HARD_NEGATIVES_FILE_NAME - --num_proc NUM_PROC - --model_name MODEL_NAME - --peft_config_path PEFT_CONFIG_PATH - --use_flash_attention [USE_FLASH_ATTENTION] - --output_dir OUTPUT_DIR - The output directory where the model predictions and - checkpoints will be written. (default: None) - --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] - Overwrite the content of the output directory. Use - this to continue training if output_dir points to a - checkpoint directory. (default: False) - --do_train [DO_TRAIN] - Whether to run training. (default: False) - --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) - --do_predict [DO_PREDICT] - Whether to run predictions on the test set. (default: - False) - --evaluation_strategy {no,steps,epoch} - The evaluation strategy to use. (default: no) - --prediction_loss_only [PREDICTION_LOSS_ONLY] - When performing evaluation and predictions, only - returns the loss. (default: False) - --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE - Batch size per GPU/TPU/MPS/NPU core/CPU for training. - (default: 8) - --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE - Batch size per GPU/TPU/MPS/NPU core/CPU for - evaluation. (default: 8) - --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE - Deprecated, the use of `--per_device_train_batch_size` - is preferred. Batch size per GPU/TPU core/CPU for - training. 
(default: None) - --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE - Deprecated, the use of `--per_device_eval_batch_size` - is preferred. Batch size per GPU/TPU core/CPU for - evaluation. (default: None) - --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS - Number of updates steps to accumulate before - performing a backward/update pass. (default: 1) - --eval_accumulation_steps EVAL_ACCUMULATION_STEPS - Number of predictions steps to accumulate before - moving the tensors to the CPU. (default: None) - --eval_delay EVAL_DELAY - Number of epochs or steps to wait for before the first - evaluation can be performed, depending on the - evaluation_strategy. (default: 0) - --learning_rate LEARNING_RATE - The initial learning rate for AdamW. (default: 5e-05) - --weight_decay WEIGHT_DECAY - Weight decay for AdamW if we apply some. (default: - 0.0) - --adam_beta1 ADAM_BETA1 - Beta1 for AdamW optimizer (default: 0.9) - --adam_beta2 ADAM_BETA2 - Beta2 for AdamW optimizer (default: 0.999) - --adam_epsilon ADAM_EPSILON - Epsilon for AdamW optimizer. (default: 1e-08) - --max_grad_norm MAX_GRAD_NORM - Max gradient norm. (default: 1.0) - --num_train_epochs NUM_TRAIN_EPOCHS - Total number of training epochs to perform. (default: - 3.0) - --max_steps MAX_STEPS - If > 0: set total number of training steps to perform. - Override num_train_epochs. (default: -1) - --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau} - The scheduler type to use. (default: linear) - --lr_scheduler_kwargs LR_SCHEDULER_KWARGS - Extra parameters for the lr_scheduler such as - {'num_cycles': 1} for the cosine with hard restarts - (default: {}) - --warmup_ratio WARMUP_RATIO - Linear warmup over warmup_ratio fraction of total - steps. (default: 0.0) - --warmup_steps WARMUP_STEPS - Linear warmup over warmup_steps. (default: 0) - --log_level {detail,debug,info,warning,error,critical,passive} - Logger log level to use on the main node. Possible - choices are the log levels as strings: 'debug', - 'info', 'warning', 'error' and 'critical', plus a - 'passive' level which doesn't set anything and lets - the application set the level. Defaults to 'passive'. - (default: passive) - --log_level_replica {detail,debug,info,warning,error,critical,passive} - Logger log level to use on replica nodes. Same choices - and defaults as ``log_level`` (default: warning) - --log_on_each_node [LOG_ON_EACH_NODE] - When doing a multinode distributed training, whether - to log once per node or just once on the main node. - (default: True) - --no_log_on_each_node - When doing a multinode distributed training, whether - to log once per node or just once on the main node. - (default: False) - --logging_dir LOGGING_DIR - Tensorboard log dir. (default: None) - --logging_strategy {no,steps,epoch} - The logging strategy to use. (default: steps) - --logging_first_step [LOGGING_FIRST_STEP] - Log the first global_step (default: False) - --logging_steps LOGGING_STEPS - Log every X updates steps. Should be an integer or a - float in range `[0,1)`. If smaller than 1, will be - interpreted as ratio of total training steps. - (default: 500) - --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] - Filter nan and inf losses for logging. (default: True) - --no_logging_nan_inf_filter - Filter nan and inf losses for logging. (default: - False) - --save_strategy {no,steps,epoch} - The checkpoint save strategy to use. (default: steps) - --save_steps SAVE_STEPS - Save checkpoint every X updates steps. 
Should be an - integer or a float in range `[0,1)`. If smaller than - 1, will be interpreted as ratio of total training - steps. (default: 500) - --save_total_limit SAVE_TOTAL_LIMIT - If a value is passed, will limit the total amount of - checkpoints. Deletes the older checkpoints in - `output_dir`. When `load_best_model_at_end` is - enabled, the 'best' checkpoint according to - `metric_for_best_model` will always be retained in - addition to the most recent ones. For example, for - `save_total_limit=5` and - `load_best_model_at_end=True`, the four last - checkpoints will always be retained alongside the best - model. When `save_total_limit=1` and - `load_best_model_at_end=True`, it is possible that two - checkpoints are saved: the last one and the best one - (if they are different). Default is unlimited - checkpoints (default: None) - --save_safetensors [SAVE_SAFETENSORS] - Use safetensors saving and loading for state dicts - instead of default torch.load and torch.save. - (default: True) - --no_save_safetensors - Use safetensors saving and loading for state dicts - instead of default torch.load and torch.save. - (default: False) - --save_on_each_node [SAVE_ON_EACH_NODE] - When doing multi-node distributed training, whether to - save models and checkpoints on each node, or only on - the main one (default: False) - --save_only_model [SAVE_ONLY_MODEL] - When checkpointing, whether to only save the model, or - also the optimizer, scheduler & rng state.Note that - when this is true, you won't be able to resume - training from checkpoint.This enables you to save - storage by not storing the optimizer, scheduler & rng - state.You can only load the model using - from_pretrained with this option set to True. - (default: False) - --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in - version 5.0 of 🤗 Transformers. (default: False) - --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will - use cuda/tpu/mps/npu device if available. (default: - False) - --use_mps_device [USE_MPS_DEVICE] - This argument is deprecated. `mps` device will be used - if available similar to `cuda` device. It will be - removed in version 5.0 of 🤗 Transformers (default: - False) - --seed SEED Random seed that will be set at the beginning of - training. (default: 42) - --data_seed DATA_SEED - Random seed to be used with data samplers. (default: - None) - --jit_mode_eval [JIT_MODE_EVAL] - Whether or not to use PyTorch jit trace for inference - (default: False) - --use_ipex [USE_IPEX] - Use Intel extension for PyTorch when it is available, - installation: 'https://github.com/intel/intel- - extension-for-pytorch' (default: False) - --bf16 [BF16] Whether to use bf16 (mixed) precision instead of - 32-bit. Requires Ampere or higher NVIDIA architecture - or using CPU (use_cpu) or Ascend NPU. This is an - experimental API and it may change. (default: False) - --fp16 [FP16] Whether to use fp16 (mixed) precision instead of - 32-bit (default: False) - --fp16_opt_level FP16_OPT_LEVEL - For fp16: Apex AMP optimization level selected in - ['O0', 'O1', 'O2', and 'O3']. See details at - https://nvidia.github.io/apex/amp.html (default: O1) - --half_precision_backend {auto,apex,cpu_amp} - The backend to be used for half precision. (default: - auto) - --bf16_full_eval [BF16_FULL_EVAL] - Whether to use full bfloat16 evaluation instead of - 32-bit. This is an experimental API and it may change. 
- (default: False) - --fp16_full_eval [FP16_FULL_EVAL] - Whether to use full float16 evaluation instead of - 32-bit (default: False) - --tf32 TF32 Whether to enable tf32 mode, available in Ampere and - newer GPU architectures. This is an experimental API - and it may change. (default: None) - --local_rank LOCAL_RANK - For distributed training: local_rank (default: -1) - --ddp_backend {nccl,gloo,mpi,ccl,hccl} - The backend to be used for distributed training - (default: None) - --tpu_num_cores TPU_NUM_CORES - TPU: Number of TPU cores (automatically passed by - launcher script) (default: None) - --tpu_metrics_debug [TPU_METRICS_DEBUG] - Deprecated, the use of `--debug tpu_metrics_debug` is - preferred. TPU: Whether to print debug metrics - (default: False) - --debug DEBUG [DEBUG ...] - Whether or not to enable debug mode. Current options: - `underflow_overflow` (Detect underflow and overflow in - activations and weights), `tpu_metrics_debug` (print - debug metrics on TPU). (default: None) - --dataloader_drop_last [DATALOADER_DROP_LAST] - Drop the last incomplete batch if it is not divisible - by the batch size. (default: False) - --eval_steps EVAL_STEPS - Run an evaluation every X steps. Should be an integer - or a float in range `[0,1)`. If smaller than 1, will - be interpreted as ratio of total training steps. - (default: None) - --dataloader_num_workers DATALOADER_NUM_WORKERS - Number of subprocesses to use for data loading - (PyTorch only). 0 means that the data will be loaded - in the main process. (default: 0) - --past_index PAST_INDEX - If >=0, uses the corresponding part of the output as - the past state for next step. (default: -1) - --run_name RUN_NAME An optional descriptor for the run. Notably used for - wandb logging. (default: None) - --disable_tqdm DISABLE_TQDM - Whether or not to disable the tqdm progress bars. - (default: None) - --remove_unused_columns [REMOVE_UNUSED_COLUMNS] - Remove columns not required by the model when using an - nlp.Dataset. (default: True) - --no_remove_unused_columns - Remove columns not required by the model when using an - nlp.Dataset. (default: False) - --label_names LABEL_NAMES [LABEL_NAMES ...] - The list of keys in your dictionary of inputs that - correspond to the labels. (default: None) - --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] - Whether or not to load the best model found during - training at the end of training. When this option is - enabled, the best checkpoint will always be saved. See - `save_total_limit` for more. (default: False) - --metric_for_best_model METRIC_FOR_BEST_MODEL - The metric to use to compare two different models. - (default: None) - --greater_is_better GREATER_IS_BETTER - Whether the `metric_for_best_model` should be - maximized or not. (default: None) - --ignore_data_skip [IGNORE_DATA_SKIP] - When resuming training, whether or not to skip the - first epochs and batches to get to the same training - data. (default: False) - --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data - Parallel (FSDP) training (in distributed training - only). The base option should be `full_shard`, - `shard_grad_op` or `no_shard` and you can add CPU- - offload to `full_shard` or `shard_grad_op` like this: - full_shard offload` or `shard_grad_op offload`. You - can add auto-wrap to `full_shard` or `shard_grad_op` - with the same syntax: full_shard auto_wrap` or - `shard_grad_op auto_wrap`. (default: ) - --fsdp_min_num_params FSDP_MIN_NUM_PARAMS - This parameter is deprecated. 
FSDP's minimum number of - parameters for Default Auto Wrapping. (useful only - when `fsdp` field is passed). (default: 0) - --fsdp_config FSDP_CONFIG - Config to be used with FSDP (Pytorch Fully Sharded - Data Parallel). The value is either a fsdp json config - file (e.g., `fsdp_config.json`) or an already loaded - json file as `dict`. (default: None) - --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP - This parameter is deprecated. Transformer layer class - name (case-sensitive) to wrap, e.g, `BertLayer`, - `GPTJBlock`, `T5Block` .... (useful only when `fsdp` - flag is passed). (default: None) - --deepspeed DEEPSPEED - Enable deepspeed and pass the path to deepspeed json - config file (e.g. `ds_config.json`) or an already - loaded json file as a dict (default: None) - --label_smoothing_factor LABEL_SMOOTHING_FACTOR - The label smoothing epsilon to apply (zero means no - label smoothing). (default: 0.0) - --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop} - The optimizer to use. (default: adamw_torch) - --optim_args OPTIM_ARGS - Optional arguments to supply to optimizer. (default: - None) - --adafactor [ADAFACTOR] - Whether or not to replace AdamW by Adafactor. - (default: False) - --group_by_length [GROUP_BY_LENGTH] - Whether or not to group samples of roughly the same - length together when batching. (default: False) - --length_column_name LENGTH_COLUMN_NAME - Column name with precomputed lengths to use when - grouping by length. (default: length) - --report_to REPORT_TO [REPORT_TO ...] - The list of integrations to report the results and - logs to. (default: None) - --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS - When using distributed training, the value of the flag - `find_unused_parameters` passed to - `DistributedDataParallel`. (default: None) - --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB - When using distributed training, the value of the flag - `bucket_cap_mb` passed to `DistributedDataParallel`. - (default: None) - --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS - When using distributed training, the value of the flag - `broadcast_buffers` passed to - `DistributedDataParallel`. (default: None) - --dataloader_pin_memory [DATALOADER_PIN_MEMORY] - Whether or not to pin memory for DataLoader. (default: - True) - --no_dataloader_pin_memory - Whether or not to pin memory for DataLoader. (default: - False) - --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] - If True, the data loader will not shut down the worker - processes after a dataset has been consumed once. This - allows to maintain the workers Dataset instances - alive. Can potentially speed up training, but will - increase RAM usage. (default: False) - --skip_memory_metrics [SKIP_MEMORY_METRICS] - Whether or not to skip adding of memory profiler - reports to metrics. (default: True) - --no_skip_memory_metrics - Whether or not to skip adding of memory profiler - reports to metrics. (default: False) - --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] - Whether or not to use the legacy prediction_loop in - the Trainer. (default: False) - --push_to_hub [PUSH_TO_HUB] - Whether or not to upload the trained model to the - model hub after training. 
(default: False) - --resume_from_checkpoint RESUME_FROM_CHECKPOINT - The path to a folder with a valid checkpoint for your - model. (default: None) - --hub_model_id HUB_MODEL_ID - The name of the repository to keep in sync with the - local `output_dir`. (default: None) - --hub_strategy {end,every_save,checkpoint,all_checkpoints} - The hub strategy to use when `--push_to_hub` is - activated. (default: every_save) - --hub_token HUB_TOKEN - The token to use to push to the Model Hub. (default: - None) - --hub_private_repo [HUB_PRIVATE_REPO] - Whether the model repository is private or not. - (default: False) - --hub_always_push [HUB_ALWAYS_PUSH] - Unless `True`, the Trainer will skip pushes if the - previous one wasn't finished yet. (default: False) - --gradient_checkpointing [GRADIENT_CHECKPOINTING] - If True, use gradient checkpointing to save memory at - the expense of slower backward pass. (default: False) - --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS - Gradient checkpointing key word arguments such as - `use_reentrant`. Will be passed to - `torch.utils.checkpoint.checkpoint` through - `model.gradient_checkpointing_enable`. (default: None) - --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] - Whether or not the inputs will be passed to the - `compute_metrics` function. (default: False) - --fp16_backend {auto,apex,cpu_amp} - Deprecated. Use half_precision_backend instead - (default: auto) - --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID - The name of the repository to which push the - `Trainer`. (default: None) - --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION - The name of the organization in with to which push the - `Trainer`. (default: None) - --push_to_hub_token PUSH_TO_HUB_TOKEN - The token to use to push to the Model Hub. (default: - None) - --mp_parameters MP_PARAMETERS - Used by the SageMaker launcher to send mp-specific - args. Ignored in Trainer (default: ) - --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] - Whether to automatically decrease the batch size in - half and rerun the training loop again each time a - CUDA Out-of-Memory was reached (default: False) - --full_determinism [FULL_DETERMINISM] - Whether to call enable_full_determinism instead of - set_seed for reproducibility in distributed training. - Important: this will negatively impact the - performance, so only use it for debugging. (default: - False) - --torchdynamo TORCHDYNAMO - This argument is deprecated, use - `--torch_compile_backend` instead. (default: None) - --ray_scope RAY_SCOPE - The scope to use when doing hyperparameter search with - Ray. By default, `"last"` will be used. Ray will then - use the last checkpoint of all trials, compare those, - and select the best one. However, other options are - also available. See the Ray documentation (https://doc - s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun - e.ExperimentAnalysis.get_best_trial) for more options. - (default: last) - --ddp_timeout DDP_TIMEOUT - Overrides the default timeout for distributed training - (value should be given in seconds). (default: 1800) - --torch_compile [TORCH_COMPILE] - If set to `True`, the model will be wrapped in - `torch.compile`. (default: False) - --torch_compile_backend TORCH_COMPILE_BACKEND - Which backend to use with `torch.compile`, passing one - will trigger a model compilation. (default: None) - --torch_compile_mode TORCH_COMPILE_MODE - Which mode to use with `torch.compile`, passing one - will trigger a model compilation. 
(default: None) - --dispatch_batches DISPATCH_BATCHES - Whether to dispatch batches across devices in - distributed training. If set to `True`, the dataloader - prepared by the Accelerator is only iterated through - on the main process and then the batches are split and - broadcast to each process. Will default to `True` for - `DataLoader` whoseunderlying dataset is an - `IterableDataset`, `False` otherwise. (default: None) - --split_batches [SPLIT_BATCHES] - Whether or not the accelerator should split the - batches yielded by the dataloaders across the devices - during distributed training. Ifset to `True`, the - actual batch size used will be the same on any kind of - distributed processes, but it must be around multiple - of the number of processes you are using (such as - GPUs). (default: False) - --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] - If set to `True`, the speed metrics will include `tgs` - (tokens per second per device). (default: False) - --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] - If set to `True`, will track the number of input - tokens seen throughout training. (May be slower in - distributed training) (default: False) - --neftune_noise_alpha NEFTUNE_NOISE_ALPHA - Activates neftune noise embeddings into the model. - NEFTune has been proven to drastically improve model - performances for instrcution fine-tuning. Check out - the original paper here: - https://arxiv.org/abs/2310.05914 and the original code - here: https://github.com/neelsjain/NEFTune. Only - supported for `PreTrainedModel` and `PeftModel` - classes. (default: None) diff --git a/examples/training/swallow-tart/peft_config.json b/examples/training/swallow-tart/peft_config.json deleted file mode 100644 index 016ff8df9..000000000 --- a/examples/training/swallow-tart/peft_config.json +++ /dev/null @@ -1,29 +0,0 @@ -# https://huggingface.co/intfloat/e5-mistral-7b-instruct/blob/main/lora/adapter_config.json -# Lora rank and alpha: https://llm-jp.nii.ac.jp/blog/2024/02/09/v1.1-tuning.html#%E3%83%8F%E3%82%A4%E3%83%91%E3%83%BC%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF -{ - "auto_mapping": null, - "base_model_name_or_path": "mistralai/Mistral-7B-v0.1", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": false, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 256, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 128, - "revision": null, - "target_modules": [ - "q_proj", - "k_proj", - "v_proj", - "o_proj", - "down_proj", - "up_proj", - "gate_proj" - ], - "task_type": "FEATURE_EXTRACTION", - "use_rslora": true -} \ No newline at end of file
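
Usage sketch: the patches above add a ZeRO-3 DeepSpeed config, a LoRA config for tokyotech-llm/Swallow-7b-hf, and a DeepSpeed install script, but no launch command. The commands below show one plausible way to combine them, assuming they are run from the repository root, that the retrieval data referenced by --data_dir has already been prepared, and that the DeepSpeed launcher is used as the entry point; the data path and task name are placeholders and the hyperparameter values are illustrative only, not values taken from the patches.

    # build DeepSpeed with the fused/CPU Adam kernels enabled (script added in PATCH 8/9)
    ./install-deepspeed.sh

    # run the training script through the DeepSpeed launcher, pointing the
    # HF Trainer at the ZeRO-3 and LoRA configs added in PATCH 8/9;
    # --use_flash_attention requires --bf16 or --fp16 (see the assertion in run_train.py)
    cd examples/training/swallow-tart
    deepspeed run_train.py \
        --deepspeed configs/ds_config_zero3.json \
        --peft_config_path configs/lora_config.json \
        --model_name tokyotech-llm/Swallow-7b-hf \
        --use_flash_attention \
        --bf16 \
        --gradient_checkpointing \
        --do_train \
        --data_dir /path/to/prepared/data \
        --task_names mytask \
        --output_dir ./output \
        --per_device_train_batch_size 8 \
        --learning_rate 1e-4 \
        --num_train_epochs 1

The --deepspeed, --peft_config_path, --bf16, and related flags correspond to the run_train.py --help output that was committed (and later deleted in PATCH 9/9) in this series.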