From d84bbb2438fad19c38a3adf5beb5968f5bd6c789 Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Mon, 19 Feb 2024 15:50:43 +0900 Subject: [PATCH 1/9] Add training ir script --- examples/training/swallow-tart/args.py | 34 +++ examples/training/swallow-tart/data.py | 258 ++++++++++++++++++++ examples/training/swallow-tart/run_train.py | 123 ++++++++++ 3 files changed, 415 insertions(+) create mode 100644 examples/training/swallow-tart/args.py create mode 100644 examples/training/swallow-tart/data.py create mode 100644 examples/training/swallow-tart/run_train.py diff --git a/examples/training/swallow-tart/args.py b/examples/training/swallow-tart/args.py new file mode 100644 index 000000000..00755363d --- /dev/null +++ b/examples/training/swallow-tart/args.py @@ -0,0 +1,34 @@ +import json +from dataclasses import dataclass, field +from typing import Optional + +from peft import get_peft_config +from transformers import TrainingArguments as STTrainingArguments + +__all__ = ["STModelArguments", "STDataArgumnets", "STTrainingArguments"] + + +@dataclass +class STModelArguments: + model_name: str = "bert-base-uncased" + peft_config_path: Optional[str] = None + use_flash_attention: bool = False + + def __post_init__(self): + if self.peft_config_path is not None: + with open(self.peft_config_path, "r") as f: + peft_config_data = json.load(f) + self.peft_config = get_peft_config(peft_config_data) + + +@dataclass +class STDataArgumnets: + data_dir: str + task_names: list[str] = field(default_factory=list) + max_length: int = 512 + n_dev_sample: int = 100 + query_file_name: str = "queries.jsonl" + corpus_file_name: str = "corpus.jsonl" + qrel_file_name: str = "qrels/train.tsv" + hard_negatives_file_name: str = "hard_negative/hard_negative.jsonl" + num_proc: int = 1 diff --git a/examples/training/swallow-tart/data.py b/examples/training/swallow-tart/data.py new file mode 100644 index 000000000..457e2924a --- /dev/null +++ b/examples/training/swallow-tart/data.py @@ -0,0 +1,258 @@ +import os +import json +import random +from collections import defaultdict +from typing import Callable, Optional, Tuple + +import datasets +import torch +from sentence_transformers.huggingface import SENTENCE_KEYS +from torch.utils.data import Dataset +from tqdm import tqdm +from transformers import PreTrainedTokenizer, BatchEncoding +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class MNRLDataset(Dataset): + # https://github.com/texttron/tevatron/blob/main/examples/repllama/data.py#L162 + def __init__( + self, + dataset: datasets.Dataset, + tokenizer: PreTrainedTokenizer, + max_length: int, + ): + self.train_data = dataset + self.tok = tokenizer + + self.max_length = max_length + self.total_len = len(self.train_data) + + def create_one_example(self, text_encoding: list[int]) -> BatchEncoding: + item = self.tok.prepare_for_model( + text_encoding + [self.tok.eos_token_id], + truncation='only_first', + max_length=self.max_length, + padding=True, + return_tensors='pt', + ) + return item + + def __len__(self): + # Return query size + return self.total_len + + def __getitem__(self, item) -> dict[str, BatchEncoding]: + # https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_bi-encoder_mnrl.py#L215 + group = self.train_data[item] + query_encoding = self.create_one_example(group['query']) + + target_pos_ids = group['positives'].pop(0) + target_pos_encoding = self.create_one_example(target_pos_ids) + group['positives'].append(target_pos_ids) + + 
negative_pos_ids = group['negatives'].pop(0) + negative_pos_encoding = self.create_one_example(negative_pos_ids) + group['negatives'].append(negative_pos_ids) + + label = 0 # 学習には使用しないが、引数に指定されている + + anchor_name, pos_name, neg_name = SENTENCE_KEYS + return { + anchor_name: query_encoding, + pos_name: target_pos_encoding, + neg_name: negative_pos_encoding, + "label": label, + } + + +class TokenizeProcessor: + def __init__( + self, + tokenizer: PreTrainedTokenizer, + max_length: int, + ) -> None: + self.tokenizer = tokenizer + self.max_length = max_length + + def __call__(self, example): + query_tokenized = self.tokenizer.encode( + example["query"], + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos, eos and margin + ) + positive_tokenizeds = [] + for positive in example["positives"]: + positive_tokenizeds.append( + self.tokenizer.encode( + positive, + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos and eos + ) + ) + negative_tokenizeds = [] + for negative in example["negatives"]: + negative_tokenizeds.append( + self.tokenizer.encode( + negative, + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos and eos + ) + ) + return {"query": query_tokenized, "positives": positive_tokenizeds, "negatives": negative_tokenizeds} + + +def ir_collator(batch: list[dict[str, BatchEncoding]]) -> dict[str, torch.Tensor]: + # this function is based on sentence_transformers.SentenceTransformer.smart_batching_collate + texts = [] + for example in batch: + temp_texts = [] + for key in SENTENCE_KEYS: + temp_texts.append(example[key]) + texts.append(temp_texts) + + transposed_texts = list(zip(*texts)) + labels = torch.tensor([example["label"] for example in batch]) + + return transposed_texts, labels + + +def load_queries(queries_path: str) -> dict[str, str]: + queries = {} + with open(queries_path, 'r') as f: + for line in f: + data = json.loads(line) + queries[data['_id']] = data['text'] + return queries + + +def load_corpus(corpus_path: str) -> dict[str, str]: + corpus = {} + with open(corpus_path, 'r') as f: + for line in f: + data = json.loads(line) + corpus[data['_id']] = data['text'] + return corpus + + +def load_qrels(qrels_path: str) -> dict[str, list[int]]: + """Load qrel. + + qrel format: + query_id\tdocument_id\tlabel + """ + qrels = defaultdict(list) + with open(qrels_path, 'r') as f: + for idx, line in enumerate(f): + if idx == 0: + continue + data = line.strip().split('\t') + qid = data[0] + did = data[1] + qrels[qid].append(did) + return qrels + + +def load_hard_negatives(hard_negatives_path: str) -> dict[str, list[int]]: + """Load hard negative. 
+ + hard negative format: + {"query_id": str, "hard_negative": [str, str, ...]} + """ + hard_negative = defaultdict(list) + with open(hard_negatives_path, 'r') as f: + for line in f: + data = json.loads(line) + qid = data['query_id'] + hard_negative[qid].extend(data['hard_negative']) + return hard_negative + + +def load_ir_dataset( + task_names: list[str], + input_data_dir: str, + query_file_name: str, + corpus_file_name: str, + qrel_file_name: str, + hard_negative_file_name: str, +) -> datasets.Dataset: + # load dataset + # {"query": str, "positives": list[str], "negatives": list[str]} + target_datasets: list[datasets.Dataset] = [] + for task_idx, task_name in enumerate(task_names): + target_path = { + "queries": os.path.join(input_data_dir, task_name, query_file_name), + "corpus": os.path.join(input_data_dir, task_name, corpus_file_name), + "qrels": os.path.join(input_data_dir, task_name, qrel_file_name), + "hard_negatives": os.path.join(input_data_dir, task_name, hard_negative_file_name), + } + + queries = load_queries(target_path["queries"]) + corpus = load_corpus(target_path["corpus"]) + qrels = load_qrels(target_path["qrels"]) + hard_negatives = load_hard_negatives(target_path["hard_negatives"]) + + logger.info(f"...Task: {task_name}") + current_dataset = [] + for qid, query in tqdm(queries.items()): + positive_ids = qrels[qid] + positives = [corpus[pos_id] for pos_id in positive_ids] + random.shuffle(positives) + negative_ids = hard_negatives[qid] + negatives = [corpus[neg_id] for neg_id in negative_ids] + random.shuffle(negatives) + current_dataset.append({"query": query, "positives": positives, "negatives": negatives}) + + target_datasets.append(datasets.Dataset.from_list(current_dataset)) + + target_concat_dataset = datasets.concatenate_datasets(target_datasets) + return target_concat_dataset + + +def get_dataset( + task_names: list[str], + input_data_dir: str, + query_file_name: str, + corpus_file_name: str, + qrel_file_name: str, + hard_negatives_file_name: str, + tokenizer: PreTrainedTokenizer, + max_length: int, + n_each_dev_sample: int = 0, + process_func: Optional[Callable] = None, + num_proc: int = 1, +) -> Tuple[Dataset, Dataset]: + # build HF Dataset + logger.info("Build huggingface datasets.") + hf_dataset = load_ir_dataset( + task_names, input_data_dir, query_file_name, corpus_file_name, qrel_file_name, hard_negatives_file_name + ) + + # apply preprocess (mainly tokenization (make word ids)) + logger.info("Apply preprocessing.") + remove_column_names = hf_dataset.column_names.remove("label") + hf_dataset = hf_dataset.map( + process_func, + batched=True, + num_proc=num_proc, + remove_columns=remove_column_names, + desc="Running Tokenizer on dataset" + ) + + # split train/dev dataset + logger.info("Split train/dev dataset.") + n_dev_sample = n_each_dev_sample * len(task_names) + train_dev_dataset = hf_dataset.train_test_split(test_size=n_dev_sample, shuffle=True, stratify_by_column="label") + train_dataset = train_dev_dataset["train"] + dev_dataset = train_dev_dataset["test"] + logger.info(f"Train dataset size: {len(train_dataset)}") + logger.info(f"Dev dataset size: {len(dev_dataset)}") + + # build Torch Dataset and Return ones. 
+ train_torch_dataset = MNRLDataset(train_dataset, tokenizer, max_length) + dev_torch_dataset = MNRLDataset(dev_dataset, tokenizer, max_length) + return train_torch_dataset, dev_torch_dataset diff --git a/examples/training/swallow-tart/run_train.py b/examples/training/swallow-tart/run_train.py new file mode 100644 index 000000000..30f302604 --- /dev/null +++ b/examples/training/swallow-tart/run_train.py @@ -0,0 +1,123 @@ +"""Train embeddings with Sentence-Transformers-HF + +lr: + llm-jp: 2e-5 https://llm-jp.nii.ac.jp/blog/2024/02/09/v1.1-tuning.html#%E3%83%8F%E3%82%A4%E3%83%91%E3%83%BC%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF + repLLaMA: 1e-4 https://llm-jp.nii.ac.jp/blog/2024/02/09/v1.1-tuning.html#%E3%83%8F%E3%82%A4%E3%83%91%E3%83%BC%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF +""" +import os +import sys + +from sentence_transformers import losses +from sentence_transformers.huggingface import ( + MNRLSentenceTransformersTrainer, + MNRLSentenceTransformer, +) +from sentence_transformers.models import Transformer, Pooling, Normalize +from transformers import HfArgumentParser, set_seed +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import logging + +from .args import STDataArgumnets, STModelArguments, STTrainingArguments +from .data import get_dataset, TokenizeProcessor, ir_collator + +logger = logging.get_logger(__name__) + + +def main(): + parser = HfArgumentParser((STDataArgumnets, STModelArguments, STTrainingArguments)) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + data_args, model_args, training_args = parser.parse_json_file(os.path.abspath(sys.argv[1])) + else: + data_args, model_args, training_args = parser.parse_args_into_dataclasses() + + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, + ) + logger.info("Training/evaluation parameters %s", training_args) + logger.info("MODEL parameters %s", model_args) + + set_seed(training_args.seed) + + # define model + logger.info("Build SentenceTransformer") + if model_args.use_flash_attention: + # validate fp16 or bf16 + assert training_args.fp16 or training_args.bf16, "use_flash_attention requires fp16 or bf16" + model_args = {"attn_implementation": "flash_attention_2"} + tf_model = Transformer(model_args.model_name, model_args=model_args, peft_config=model_args.peft_config) + pooler = Pooling(tf_model.get_word_embedding_dimension(), pooing_mode="lasttoken") + normalize = Normalize() + model = MNRLSentenceTransformer(modules=[tf_model, pooler, normalize]) + tokenizer = model.tokenizer + max_length = min(data_args.max_length, tokenizer.model_max_length) + tokenizer.model_max_length = max_length + loss = losses.MultipleNegativesRankingLoss(model=model) + + # define train/eval dataset + logger.info("Load dataset") + logger.info(f"Target task names: {data_args.task_names}") + preprocessor = TokenizeProcessor(tokenizer, data_args.max_length) + train_dataset, eval_dataset = get_dataset( + data_args.task_names, + data_args.data_dir, + data_args.query_file_name, + data_args.corpus_file_name, + data_args.qrel_file_name, + data_args.hard_negatives_file_name, + tokenizer, + data_args.max_length, + data_args.n_dev_sample, + preprocessor, + data_args.num_proc, + ) + + trainer = 
MNRLSentenceTransformersTrainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=ir_collator, + tokenizer=tokenizer, + loss=loss, + text_columns=[] + ) + + # detecting last checkpoint + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + if last_checkpoint is not None: + checkpoint = last_checkpoint + elif training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + + logger.info("Start training") + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + trainer.save_model() + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + +if __name__ == "__main__": + main() From b169bc40cfa8fe212ab64cec8e3929fcea74a034 Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Mon, 19 Feb 2024 16:57:43 +0900 Subject: [PATCH 2/9] Apply format --- examples/training/swallow-tart/data.py | 36 +++++++++---------- .../training/swallow-tart/peft_config.json | 29 +++++++++++++++ examples/training/swallow-tart/run_train.py | 2 +- 3 files changed, 48 insertions(+), 19 deletions(-) create mode 100644 examples/training/swallow-tart/peft_config.json diff --git a/examples/training/swallow-tart/data.py b/examples/training/swallow-tart/data.py index 457e2924a..9b0e631a8 100644 --- a/examples/training/swallow-tart/data.py +++ b/examples/training/swallow-tart/data.py @@ -32,10 +32,10 @@ def __init__( def create_one_example(self, text_encoding: list[int]) -> BatchEncoding: item = self.tok.prepare_for_model( text_encoding + [self.tok.eos_token_id], - truncation='only_first', + truncation="only_first", max_length=self.max_length, padding=True, - return_tensors='pt', + return_tensors="pt", ) return item @@ -46,15 +46,15 @@ def __len__(self): def __getitem__(self, item) -> dict[str, BatchEncoding]: # https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_bi-encoder_mnrl.py#L215 group = self.train_data[item] - query_encoding = self.create_one_example(group['query']) + query_encoding = self.create_one_example(group["query"]) - target_pos_ids = group['positives'].pop(0) + target_pos_ids = group["positives"].pop(0) target_pos_encoding = self.create_one_example(target_pos_ids) - group['positives'].append(target_pos_ids) + group["positives"].append(target_pos_ids) - negative_pos_ids = group['negatives'].pop(0) + negative_pos_ids = group["negatives"].pop(0) negative_pos_encoding = self.create_one_example(negative_pos_ids) - group['negatives'].append(negative_pos_ids) + group["negatives"].append(negative_pos_ids) label = 0 # 学習には使用しないが、引数に指定されている @@ -123,19 +123,19 @@ def ir_collator(batch: list[dict[str, BatchEncoding]]) -> dict[str, torch.Tensor def load_queries(queries_path: str) -> dict[str, str]: queries = {} - with open(queries_path, 'r') as f: + with 
open(queries_path, "r") as f: for line in f: data = json.loads(line) - queries[data['_id']] = data['text'] + queries[data["_id"]] = data["text"] return queries def load_corpus(corpus_path: str) -> dict[str, str]: corpus = {} - with open(corpus_path, 'r') as f: + with open(corpus_path, "r") as f: for line in f: data = json.loads(line) - corpus[data['_id']] = data['text'] + corpus[data["_id"]] = data["text"] return corpus @@ -146,11 +146,11 @@ def load_qrels(qrels_path: str) -> dict[str, list[int]]: query_id\tdocument_id\tlabel """ qrels = defaultdict(list) - with open(qrels_path, 'r') as f: + with open(qrels_path, "r") as f: for idx, line in enumerate(f): if idx == 0: continue - data = line.strip().split('\t') + data = line.strip().split("\t") qid = data[0] did = data[1] qrels[qid].append(did) @@ -164,11 +164,11 @@ def load_hard_negatives(hard_negatives_path: str) -> dict[str, list[int]]: {"query_id": str, "hard_negative": [str, str, ...]} """ hard_negative = defaultdict(list) - with open(hard_negatives_path, 'r') as f: + with open(hard_negatives_path, "r") as f: for line in f: data = json.loads(line) - qid = data['query_id'] - hard_negative[qid].extend(data['hard_negative']) + qid = data["query_id"] + hard_negative[qid].extend(data["hard_negative"]) return hard_negative @@ -182,7 +182,7 @@ def load_ir_dataset( ) -> datasets.Dataset: # load dataset # {"query": str, "positives": list[str], "negatives": list[str]} - target_datasets: list[datasets.Dataset] = [] + target_datasets: list[datasets.Dataset] = [] for task_idx, task_name in enumerate(task_names): target_path = { "queries": os.path.join(input_data_dir, task_name, query_file_name), @@ -240,7 +240,7 @@ def get_dataset( batched=True, num_proc=num_proc, remove_columns=remove_column_names, - desc="Running Tokenizer on dataset" + desc="Running Tokenizer on dataset", ) # split train/dev dataset diff --git a/examples/training/swallow-tart/peft_config.json b/examples/training/swallow-tart/peft_config.json new file mode 100644 index 000000000..016ff8df9 --- /dev/null +++ b/examples/training/swallow-tart/peft_config.json @@ -0,0 +1,29 @@ +# https://huggingface.co/intfloat/e5-mistral-7b-instruct/blob/main/lora/adapter_config.json +# Lora rank and alpha: https://llm-jp.nii.ac.jp/blog/2024/02/09/v1.1-tuning.html#%E3%83%8F%E3%82%A4%E3%83%91%E3%83%BC%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF +{ + "auto_mapping": null, + "base_model_name_or_path": "mistralai/Mistral-7B-v0.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "down_proj", + "up_proj", + "gate_proj" + ], + "task_type": "FEATURE_EXTRACTION", + "use_rslora": true +} \ No newline at end of file diff --git a/examples/training/swallow-tart/run_train.py b/examples/training/swallow-tart/run_train.py index 30f302604..991daed54 100644 --- a/examples/training/swallow-tart/run_train.py +++ b/examples/training/swallow-tart/run_train.py @@ -87,7 +87,7 @@ def main(): data_collator=ir_collator, tokenizer=tokenizer, loss=loss, - text_columns=[] + text_columns=[], ) # detecting last checkpoint From 7a3d146f6da41eeceb9b2730fd928a03802182a0 Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Mon, 19 Feb 2024 21:43:10 +0900 Subject: [PATCH 3/9] Add datasets lib --- setup.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/setup.py b/setup.py index fbf48ae97..9c342f02b 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ "huggingface-hub>=0.15.1", "Pillow", "peft", + "datasets", ], classifiers=[ "Development Status :: 5 - Production/Stable", From ed0e97ffe482692ec9285a5818d991b11ea64db1 Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Tue, 20 Feb 2024 22:36:03 +0900 Subject: [PATCH 4/9] Fix bugs --- examples/training/swallow-tart/args.py | 10 +-- examples/training/swallow-tart/data.py | 75 +++++++++++++++----- examples/training/swallow-tart/run_train.py | 23 ++++-- sentence_transformers/SentenceTransformer.py | 7 ++ sentence_transformers/models/Transformer.py | 16 +++++ 5 files changed, 105 insertions(+), 26 deletions(-) diff --git a/examples/training/swallow-tart/args.py b/examples/training/swallow-tart/args.py index 00755363d..e2cda57c6 100644 --- a/examples/training/swallow-tart/args.py +++ b/examples/training/swallow-tart/args.py @@ -19,6 +19,8 @@ def __post_init__(self): with open(self.peft_config_path, "r") as f: peft_config_data = json.load(f) self.peft_config = get_peft_config(peft_config_data) + else: + self.peft_config = None @dataclass @@ -27,8 +29,8 @@ class STDataArgumnets: task_names: list[str] = field(default_factory=list) max_length: int = 512 n_dev_sample: int = 100 - query_file_name: str = "queries.jsonl" - corpus_file_name: str = "corpus.jsonl" - qrel_file_name: str = "qrels/train.tsv" - hard_negatives_file_name: str = "hard_negative/hard_negative.jsonl" + query_file_name: str = "tuple_beir/queries.jsonl" + corpus_file_name: str = "tuple_beir/corpus.jsonl" + qrel_file_name: str = "tuple_beir/qrels/train.tsv" + hard_negatives_file_name: str = "negatives/hard_negative.jsonl" num_proc: int = 1 diff --git a/examples/training/swallow-tart/data.py b/examples/training/swallow-tart/data.py index 9b0e631a8..5aa20deac 100644 --- a/examples/training/swallow-tart/data.py +++ b/examples/training/swallow-tart/data.py @@ -30,12 +30,12 @@ def __init__( self.total_len = len(self.train_data) def create_one_example(self, text_encoding: list[int]) -> BatchEncoding: + """Add eos token""" item = self.tok.prepare_for_model( text_encoding + [self.tok.eos_token_id], truncation="only_first", - max_length=self.max_length, - padding=True, - return_tensors="pt", + max_length=self.max_length - 2, # for bos and margin + padding=False, ) return item @@ -59,12 +59,13 @@ def __getitem__(self, item) -> dict[str, BatchEncoding]: label = 0 # 学習には使用しないが、引数に指定されている anchor_name, pos_name, neg_name = SENTENCE_KEYS - return { + data = { anchor_name: query_encoding, pos_name: target_pos_encoding, neg_name: negative_pos_encoding, "label": label, } + return data class TokenizeProcessor: @@ -106,19 +107,58 @@ def __call__(self, example): return {"query": query_tokenized, "positives": positive_tokenizeds, "negatives": negative_tokenizeds} -def ir_collator(batch: list[dict[str, BatchEncoding]]) -> dict[str, torch.Tensor]: - # this function is based on sentence_transformers.SentenceTransformer.smart_batching_collate - texts = [] - for example in batch: - temp_texts = [] - for key in SENTENCE_KEYS: - temp_texts.append(example[key]) - texts.append(temp_texts) +class TokenizeBatchProcessor(TokenizeProcessor): + def __call__(self, examples): + query_tokenized = self.tokenizer( + examples["query"], + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos, eos and margin + )["input_ids"] + positive_tokenizeds = [] + for one_batch in examples["positives"]: + 
positive_tokenizeds.append( + self.tokenizer( + one_batch, + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos and eos + )["input_ids"] + ) + negative_tokenizeds = [] + for one_batch in examples["negatives"]: + negative_tokenizeds.append( + self.tokenizer( + one_batch, + add_special_tokens=False, + truncation=True, + max_length=self.max_length - 3, # For bos and eos + )["input_ids"] + ) + return {"query": query_tokenized, "positives": positive_tokenizeds, "negatives": negative_tokenizeds} - transposed_texts = list(zip(*texts)) - labels = torch.tensor([example["label"] for example in batch]) - return transposed_texts, labels +class IRCollator: + def __init__(self, tokenizer: PreTrainedTokenizer, max_length: int): + self.tokenizer = tokenizer + self.max_length = max_length + + def __call__(self, batch: list[dict[str, BatchEncoding]]) -> tuple[list[BatchEncoding], torch.Tensor]: + # this function is based on sentence_transformers.SentenceTransformer.smart_batching_collate + texts = [] + for example in batch: + temp_texts = [] + for key in SENTENCE_KEYS: + temp_texts.append(example[key]) + texts.append(temp_texts) + + transposed_texts = [ + self.tokenizer.pad(sentences, padding="max_length", max_length=self.max_length, return_tensors="pt") + for sentences in zip(*texts) + ] + labels = torch.tensor([example["label"] for example in batch]) + + return transposed_texts, labels def load_queries(queries_path: str) -> dict[str, str]: @@ -202,10 +242,12 @@ def load_ir_dataset( positive_ids = qrels[qid] positives = [corpus[pos_id] for pos_id in positive_ids] random.shuffle(positives) + if qid not in hard_negatives: + continue negative_ids = hard_negatives[qid] negatives = [corpus[neg_id] for neg_id in negative_ids] random.shuffle(negatives) - current_dataset.append({"query": query, "positives": positives, "negatives": negatives}) + current_dataset.append({"query": query, "positives": positives, "negatives": negatives, "label": task_idx}) target_datasets.append(datasets.Dataset.from_list(current_dataset)) @@ -245,6 +287,7 @@ def get_dataset( # split train/dev dataset logger.info("Split train/dev dataset.") + hf_dataset = hf_dataset.class_encode_column("label") n_dev_sample = n_each_dev_sample * len(task_names) train_dev_dataset = hf_dataset.train_test_split(test_size=n_dev_sample, shuffle=True, stratify_by_column="label") train_dataset = train_dev_dataset["train"] diff --git a/examples/training/swallow-tart/run_train.py b/examples/training/swallow-tart/run_train.py index 991daed54..e2cc3d4ea 100644 --- a/examples/training/swallow-tart/run_train.py +++ b/examples/training/swallow-tart/run_train.py @@ -17,8 +17,8 @@ from transformers.trainer_utils import get_last_checkpoint from transformers.utils import logging -from .args import STDataArgumnets, STModelArguments, STTrainingArguments -from .data import get_dataset, TokenizeProcessor, ir_collator +from args import STDataArgumnets, STModelArguments, STTrainingArguments +from data import get_dataset, TokenizeProcessor, TokenizeBatchProcessor, IRCollator logger = logging.get_logger(__name__) @@ -51,20 +51,30 @@ def main(): if model_args.use_flash_attention: # validate fp16 or bf16 assert training_args.fp16 or training_args.bf16, "use_flash_attention requires fp16 or bf16" - model_args = {"attn_implementation": "flash_attention_2"} - tf_model = Transformer(model_args.model_name, model_args=model_args, peft_config=model_args.peft_config) - pooler = Pooling(tf_model.get_word_embedding_dimension(), pooing_mode="lasttoken") 
+ model_kwargs = {"attn_implementation": "flash_attention_2"} + tf_model = Transformer( + model_args.model_name, + model_args=model_kwargs, + peft_config=model_args.peft_config, + is_gradient_checkpointing=training_args.gradient_checkpointing, + ) + pooler = Pooling(tf_model.get_word_embedding_dimension(), pooling_mode="lasttoken") normalize = Normalize() model = MNRLSentenceTransformer(modules=[tf_model, pooler, normalize]) tokenizer = model.tokenizer + # https://github.com/texttron/tevatron/blob/2e5d00ee21d5a7db0bd2ea1463c9150a572106d4/examples/repllama/train.py#L68-L69 + tokenizer.pad_token_id = tokenizer.unk_token_id + tokenizer.pad_token = tokenizer.unk_token max_length = min(data_args.max_length, tokenizer.model_max_length) tokenizer.model_max_length = max_length loss = losses.MultipleNegativesRankingLoss(model=model) + ir_collator = IRCollator(tokenizer, max_length) # define train/eval dataset logger.info("Load dataset") logger.info(f"Target task names: {data_args.task_names}") - preprocessor = TokenizeProcessor(tokenizer, data_args.max_length) + # preprocessor = TokenizeProcessor(tokenizer, data_args.max_length) + preprocessor = TokenizeBatchProcessor(tokenizer, data_args.max_length) train_dataset, eval_dataset = get_dataset( data_args.task_names, data_args.data_dir, @@ -105,6 +115,7 @@ def main(): "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) + checkpoint = None if last_checkpoint is not None: checkpoint = last_checkpoint elif training_args.resume_from_checkpoint is not None: diff --git a/sentence_transformers/SentenceTransformer.py b/sentence_transformers/SentenceTransformer.py index feb5975ac..845039ac4 100644 --- a/sentence_transformers/SentenceTransformer.py +++ b/sentence_transformers/SentenceTransformer.py @@ -1163,3 +1163,10 @@ def _target_device(self) -> torch.device: @_target_device.setter def _target_device(self, device: Optional[Union[int, str, torch.device]] = None) -> None: self.to(device) + + @property + def config(self): + return self._first_module().config + + def gradient_checkpointing_enable(self, *args, **kwargs): + return self._first_module().gradient_checkpointing_enable(*args, **kwargs) diff --git a/sentence_transformers/models/Transformer.py b/sentence_transformers/models/Transformer.py index e61b268d6..b979119cd 100644 --- a/sentence_transformers/models/Transformer.py +++ b/sentence_transformers/models/Transformer.py @@ -1,3 +1,4 @@ +import torch from torch import nn from transformers import AutoModel, AutoTokenizer, AutoConfig, T5Config, MT5Config from peft import PeftConfig, get_peft_model @@ -29,6 +30,7 @@ def __init__( do_lower_case: bool = False, tokenizer_name_or_path: str = None, peft_config: Optional[PeftConfig] = None, + is_gradient_checkpointing: bool = False, ): super(Transformer, self).__init__() self.config_keys = ["max_seq_length", "do_lower_case"] @@ -38,6 +40,13 @@ def __init__( self._load_model(model_name_or_path, config, cache_dir, **model_args) if peft_config is not None: + if is_gradient_checkpointing: + for param in self.auto_model.parameters(): + param.requires_grad = True + if param.ndim == 1: + param.data = param.data.to(torch.float32) + self.auto_model.gradient_checkpointing_enable() + self.auto_model.enable_input_require_grads() self.auto_model = get_peft_model(self.auto_model, peft_config) self.tokenizer = AutoTokenizer.from_pretrained( @@ -190,3 +199,10 @@ def load(input_path: str): if "model_args" in config: config["model_args"].pop("trust_remote_code") return 
Transformer(model_name_or_path=input_path, **config) + + @property + def config(self): + return self.auto_model.config + + def gradient_checkpointing_enable(self, *args, **kwargs): + return self.auto_model.gradient_checkpointing_enable(*args, **kwargs) From 8e82ed8d92f309a3d28c23ea28e245d93ceaf08c Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Wed, 21 Feb 2024 11:12:00 +0900 Subject: [PATCH 5/9] Save hf.dataset to use cache --- examples/training/swallow-tart/args.py | 1 + examples/training/swallow-tart/data.py | 93 +++- examples/training/swallow-tart/h | 581 ++++++++++++++++++++ examples/training/swallow-tart/run_train.py | 3 + 4 files changed, 661 insertions(+), 17 deletions(-) create mode 100644 examples/training/swallow-tart/h diff --git a/examples/training/swallow-tart/args.py b/examples/training/swallow-tart/args.py index e2cda57c6..d7d2767dc 100644 --- a/examples/training/swallow-tart/args.py +++ b/examples/training/swallow-tart/args.py @@ -26,6 +26,7 @@ def __post_init__(self): @dataclass class STDataArgumnets: data_dir: str + hf_dataset_dir: str task_names: list[str] = field(default_factory=list) max_length: int = 512 n_dev_sample: int = 100 diff --git a/examples/training/swallow-tart/data.py b/examples/training/swallow-tart/data.py index 5aa20deac..d7617f114 100644 --- a/examples/training/swallow-tart/data.py +++ b/examples/training/swallow-tart/data.py @@ -2,10 +2,12 @@ import json import random from collections import defaultdict +from pathlib import Path from typing import Callable, Optional, Tuple import datasets import torch +from datasets import load_from_disk from sentence_transformers.huggingface import SENTENCE_KEYS from torch.utils.data import Dataset from tqdm import tqdm @@ -194,7 +196,7 @@ def load_qrels(qrels_path: str) -> dict[str, list[int]]: qid = data[0] did = data[1] qrels[qid].append(did) - return qrels + return dict(qrels) def load_hard_negatives(hard_negatives_path: str) -> dict[str, list[int]]: @@ -209,16 +211,16 @@ def load_hard_negatives(hard_negatives_path: str) -> dict[str, list[int]]: data = json.loads(line) qid = data["query_id"] hard_negative[qid].extend(data["hard_negative"]) - return hard_negative + return dict(hard_negative) -def load_ir_dataset( +def prepare_ir_dataset( task_names: list[str], input_data_dir: str, query_file_name: str, corpus_file_name: str, qrel_file_name: str, - hard_negative_file_name: str, + hard_negatives_file_name: str, ) -> datasets.Dataset: # load dataset # {"query": str, "positives": list[str], "negatives": list[str]} @@ -228,7 +230,7 @@ def load_ir_dataset( "queries": os.path.join(input_data_dir, task_name, query_file_name), "corpus": os.path.join(input_data_dir, task_name, corpus_file_name), "qrels": os.path.join(input_data_dir, task_name, qrel_file_name), - "hard_negatives": os.path.join(input_data_dir, task_name, hard_negative_file_name), + "hard_negatives": os.path.join(input_data_dir, task_name, hard_negatives_file_name), } queries = load_queries(target_path["queries"]) @@ -239,14 +241,39 @@ def load_ir_dataset( logger.info(f"...Task: {task_name}") current_dataset = [] for qid, query in tqdm(queries.items()): + if qid not in qrels: + logger.info(f"......qid: {qid} is not included at the qrel. 
skip this query.") + continue positive_ids = qrels[qid] - positives = [corpus[pos_id] for pos_id in positive_ids] + + positives = [] + for pos_id in positive_ids: + if pos_id not in corpus: + continue + positive_text = corpus[pos_id] + if positive_text is not None: + positives.append(corpus[pos_id]) + if len(positives) == 0: + logger.info(f"......qid: {qid} doesn't have positive passage. skip this query.") + continue random.shuffle(positives) + if qid not in hard_negatives: continue negative_ids = hard_negatives[qid] - negatives = [corpus[neg_id] for neg_id in negative_ids] + + negatives = [] + for neg_id in negative_ids: + if neg_id not in corpus: + continue + negative_text = corpus[neg_id] + if negative_text is not None: + negatives.append(corpus[neg_id]) + if len(negatives) == 0: + logger.info(f"......qid: {qid} doesn't have negative passage. skip this query.") + continue random.shuffle(negatives) + current_dataset.append({"query": query, "positives": positives, "negatives": negatives, "label": task_idx}) target_datasets.append(datasets.Dataset.from_list(current_dataset)) @@ -255,7 +282,35 @@ def load_ir_dataset( return target_concat_dataset +def load_ir_dataset( + dataset_path: Path, + task_names: list[str], + input_data_dir: str, + query_file_name: str, + corpus_file_name: str, + qrel_file_name: str, + hard_negatives_file_name: str, + n_each_dev_sample: int, +) -> datasets.Dataset: + if not dataset_path.exists(): + logger.info("Build huggingface datasets.") + hf_dataset = prepare_ir_dataset( + task_names, input_data_dir, query_file_name, corpus_file_name, qrel_file_name, hard_negatives_file_name + ) + logger.info("Split train/dev dataset.") + hf_dataset = hf_dataset.class_encode_column("label") + n_dev_sample = n_each_dev_sample * len(task_names) + hf_dataset = hf_dataset.train_test_split(test_size=n_dev_sample, shuffle=True, stratify_by_column="label") + + logger.info(f"Save DatasetDict to {str(dataset_path)}.") + hf_dataset.save_to_disk(str(dataset_path), max_shard_size="1GB") + + hf_dataset = load_from_disk(dataset_path) + return hf_dataset + + def get_dataset( + hf_dataset_dir: str, task_names: list[str], input_data_dir: str, query_file_name: str, @@ -269,32 +324,36 @@ def get_dataset( num_proc: int = 1, ) -> Tuple[Dataset, Dataset]: # build HF Dataset - logger.info("Build huggingface datasets.") + logger.info("Load huggingface datasets.") hf_dataset = load_ir_dataset( - task_names, input_data_dir, query_file_name, corpus_file_name, qrel_file_name, hard_negatives_file_name + Path(hf_dataset_dir), + task_names, + input_data_dir, + query_file_name, + corpus_file_name, + qrel_file_name, + hard_negatives_file_name, + n_each_dev_sample, ) # apply preprocess (mainly tokenization (make word ids)) logger.info("Apply preprocessing.") - remove_column_names = hf_dataset.column_names.remove("label") + remove_column_names = hf_dataset.column_names["train"].remove("label") hf_dataset = hf_dataset.map( process_func, batched=True, - num_proc=num_proc, remove_columns=remove_column_names, + num_proc=num_proc, desc="Running Tokenizer on dataset", ) # split train/dev dataset - logger.info("Split train/dev dataset.") - hf_dataset = hf_dataset.class_encode_column("label") - n_dev_sample = n_each_dev_sample * len(task_names) - train_dev_dataset = hf_dataset.train_test_split(test_size=n_dev_sample, shuffle=True, stratify_by_column="label") - train_dataset = train_dev_dataset["train"] - dev_dataset = train_dev_dataset["test"] + train_dataset = hf_dataset["train"] + dev_dataset = hf_dataset["test"] 
logger.info(f"Train dataset size: {len(train_dataset)}") logger.info(f"Dev dataset size: {len(dev_dataset)}") + # build Torch Dataset and Return ones. train_torch_dataset = MNRLDataset(train_dataset, tokenizer, max_length) dev_torch_dataset = MNRLDataset(dev_dataset, tokenizer, max_length) diff --git a/examples/training/swallow-tart/h b/examples/training/swallow-tart/h new file mode 100644 index 000000000..95c104586 --- /dev/null +++ b/examples/training/swallow-tart/h @@ -0,0 +1,581 @@ +usage: run_train.py [-h] --data_dir DATA_DIR + [--task_names TASK_NAMES [TASK_NAMES ...]] + [--max_length MAX_LENGTH] [--n_dev_sample N_DEV_SAMPLE] + [--query_file_name QUERY_FILE_NAME] + [--corpus_file_name CORPUS_FILE_NAME] + [--qrel_file_name QREL_FILE_NAME] + [--hard_negatives_file_name HARD_NEGATIVES_FILE_NAME] + [--num_proc NUM_PROC] [--model_name MODEL_NAME] + [--peft_config_path PEFT_CONFIG_PATH] + [--use_flash_attention [USE_FLASH_ATTENTION]] --output_dir + OUTPUT_DIR [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] + [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] + [--do_predict [DO_PREDICT]] + [--evaluation_strategy {no,steps,epoch}] + [--prediction_loss_only [PREDICTION_LOSS_ONLY]] + [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] + [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] + [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] + [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] + [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] + [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] + [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] + [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] + [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] + [--max_grad_norm MAX_GRAD_NORM] + [--num_train_epochs NUM_TRAIN_EPOCHS] + [--max_steps MAX_STEPS] + [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}] + [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] + [--warmup_ratio WARMUP_RATIO] + [--warmup_steps WARMUP_STEPS] + [--log_level {detail,debug,info,warning,error,critical,passive}] + [--log_level_replica {detail,debug,info,warning,error,critical,passive}] + [--log_on_each_node [LOG_ON_EACH_NODE]] + [--no_log_on_each_node] [--logging_dir LOGGING_DIR] + [--logging_strategy {no,steps,epoch}] + [--logging_first_step [LOGGING_FIRST_STEP]] + [--logging_steps LOGGING_STEPS] + [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] + [--no_logging_nan_inf_filter] + [--save_strategy {no,steps,epoch}] + [--save_steps SAVE_STEPS] + [--save_total_limit SAVE_TOTAL_LIMIT] + [--save_safetensors [SAVE_SAFETENSORS]] + [--no_save_safetensors] + [--save_on_each_node [SAVE_ON_EACH_NODE]] + [--save_only_model [SAVE_ONLY_MODEL]] + [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] + [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] + [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] + [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] + [--fp16_opt_level FP16_OPT_LEVEL] + [--half_precision_backend {auto,apex,cpu_amp}] + [--bf16_full_eval [BF16_FULL_EVAL]] + [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] + [--local_rank LOCAL_RANK] + [--ddp_backend {nccl,gloo,mpi,ccl,hccl}] + [--tpu_num_cores TPU_NUM_CORES] + [--tpu_metrics_debug [TPU_METRICS_DEBUG]] + [--debug DEBUG [DEBUG ...]] + [--dataloader_drop_last [DATALOADER_DROP_LAST]] + [--eval_steps EVAL_STEPS] + [--dataloader_num_workers DATALOADER_NUM_WORKERS] + [--past_index PAST_INDEX] [--run_name RUN_NAME] + [--disable_tqdm DISABLE_TQDM] + 
[--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] + [--no_remove_unused_columns] + [--label_names LABEL_NAMES [LABEL_NAMES ...]] + [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] + [--metric_for_best_model METRIC_FOR_BEST_MODEL] + [--greater_is_better GREATER_IS_BETTER] + [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] + [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] + [--fsdp_config FSDP_CONFIG] + [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] + [--deepspeed DEEPSPEED] + [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] + [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop}] + [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] + [--group_by_length [GROUP_BY_LENGTH]] + [--length_column_name LENGTH_COLUMN_NAME] + [--report_to REPORT_TO [REPORT_TO ...]] + [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] + [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] + [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] + [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] + [--no_dataloader_pin_memory] + [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] + [--skip_memory_metrics [SKIP_MEMORY_METRICS]] + [--no_skip_memory_metrics] + [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] + [--push_to_hub [PUSH_TO_HUB]] + [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] + [--hub_model_id HUB_MODEL_ID] + [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] + [--hub_token HUB_TOKEN] + [--hub_private_repo [HUB_PRIVATE_REPO]] + [--hub_always_push [HUB_ALWAYS_PUSH]] + [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] + [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] + [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] + [--fp16_backend {auto,apex,cpu_amp}] + [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] + [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] + [--push_to_hub_token PUSH_TO_HUB_TOKEN] + [--mp_parameters MP_PARAMETERS] + [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] + [--full_determinism [FULL_DETERMINISM]] + [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] + [--ddp_timeout DDP_TIMEOUT] + [--torch_compile [TORCH_COMPILE]] + [--torch_compile_backend TORCH_COMPILE_BACKEND] + [--torch_compile_mode TORCH_COMPILE_MODE] + [--dispatch_batches DISPATCH_BATCHES] + [--split_batches [SPLIT_BATCHES]] + [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] + [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] + [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] + +options: + -h, --help show this help message and exit + --data_dir DATA_DIR + --task_names TASK_NAMES [TASK_NAMES ...] + --max_length MAX_LENGTH + --n_dev_sample N_DEV_SAMPLE + --query_file_name QUERY_FILE_NAME + --corpus_file_name CORPUS_FILE_NAME + --qrel_file_name QREL_FILE_NAME + --hard_negatives_file_name HARD_NEGATIVES_FILE_NAME + --num_proc NUM_PROC + --model_name MODEL_NAME + --peft_config_path PEFT_CONFIG_PATH + --use_flash_attention [USE_FLASH_ATTENTION] + --output_dir OUTPUT_DIR + The output directory where the model predictions and + checkpoints will be written. (default: None) + --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] + Overwrite the content of the output directory. Use + this to continue training if output_dir points to a + checkpoint directory. (default: False) + --do_train [DO_TRAIN] + Whether to run training. 
(default: False) + --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) + --do_predict [DO_PREDICT] + Whether to run predictions on the test set. (default: + False) + --evaluation_strategy {no,steps,epoch} + The evaluation strategy to use. (default: no) + --prediction_loss_only [PREDICTION_LOSS_ONLY] + When performing evaluation and predictions, only + returns the loss. (default: False) + --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for training. + (default: 8) + --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for + evaluation. (default: 8) + --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE + Deprecated, the use of `--per_device_train_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + training. (default: None) + --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE + Deprecated, the use of `--per_device_eval_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + evaluation. (default: None) + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + Number of updates steps to accumulate before + performing a backward/update pass. (default: 1) + --eval_accumulation_steps EVAL_ACCUMULATION_STEPS + Number of predictions steps to accumulate before + moving the tensors to the CPU. (default: None) + --eval_delay EVAL_DELAY + Number of epochs or steps to wait for before the first + evaluation can be performed, depending on the + evaluation_strategy. (default: 0) + --learning_rate LEARNING_RATE + The initial learning rate for AdamW. (default: 5e-05) + --weight_decay WEIGHT_DECAY + Weight decay for AdamW if we apply some. (default: + 0.0) + --adam_beta1 ADAM_BETA1 + Beta1 for AdamW optimizer (default: 0.9) + --adam_beta2 ADAM_BETA2 + Beta2 for AdamW optimizer (default: 0.999) + --adam_epsilon ADAM_EPSILON + Epsilon for AdamW optimizer. (default: 1e-08) + --max_grad_norm MAX_GRAD_NORM + Max gradient norm. (default: 1.0) + --num_train_epochs NUM_TRAIN_EPOCHS + Total number of training epochs to perform. (default: + 3.0) + --max_steps MAX_STEPS + If > 0: set total number of training steps to perform. + Override num_train_epochs. (default: -1) + --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau} + The scheduler type to use. (default: linear) + --lr_scheduler_kwargs LR_SCHEDULER_KWARGS + Extra parameters for the lr_scheduler such as + {'num_cycles': 1} for the cosine with hard restarts + (default: {}) + --warmup_ratio WARMUP_RATIO + Linear warmup over warmup_ratio fraction of total + steps. (default: 0.0) + --warmup_steps WARMUP_STEPS + Linear warmup over warmup_steps. (default: 0) + --log_level {detail,debug,info,warning,error,critical,passive} + Logger log level to use on the main node. Possible + choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a + 'passive' level which doesn't set anything and lets + the application set the level. Defaults to 'passive'. + (default: passive) + --log_level_replica {detail,debug,info,warning,error,critical,passive} + Logger log level to use on replica nodes. Same choices + and defaults as ``log_level`` (default: warning) + --log_on_each_node [LOG_ON_EACH_NODE] + When doing a multinode distributed training, whether + to log once per node or just once on the main node. 
+ (default: True) + --no_log_on_each_node + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: False) + --logging_dir LOGGING_DIR + Tensorboard log dir. (default: None) + --logging_strategy {no,steps,epoch} + The logging strategy to use. (default: steps) + --logging_first_step [LOGGING_FIRST_STEP] + Log the first global_step (default: False) + --logging_steps LOGGING_STEPS + Log every X updates steps. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be + interpreted as ratio of total training steps. + (default: 500) + --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] + Filter nan and inf losses for logging. (default: True) + --no_logging_nan_inf_filter + Filter nan and inf losses for logging. (default: + False) + --save_strategy {no,steps,epoch} + The checkpoint save strategy to use. (default: steps) + --save_steps SAVE_STEPS + Save checkpoint every X updates steps. Should be an + integer or a float in range `[0,1)`. If smaller than + 1, will be interpreted as ratio of total training + steps. (default: 500) + --save_total_limit SAVE_TOTAL_LIMIT + If a value is passed, will limit the total amount of + checkpoints. Deletes the older checkpoints in + `output_dir`. When `load_best_model_at_end` is + enabled, the 'best' checkpoint according to + `metric_for_best_model` will always be retained in + addition to the most recent ones. For example, for + `save_total_limit=5` and + `load_best_model_at_end=True`, the four last + checkpoints will always be retained alongside the best + model. When `save_total_limit=1` and + `load_best_model_at_end=True`, it is possible that two + checkpoints are saved: the last one and the best one + (if they are different). Default is unlimited + checkpoints (default: None) + --save_safetensors [SAVE_SAFETENSORS] + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: True) + --no_save_safetensors + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: False) + --save_on_each_node [SAVE_ON_EACH_NODE] + When doing multi-node distributed training, whether to + save models and checkpoints on each node, or only on + the main one (default: False) + --save_only_model [SAVE_ONLY_MODEL] + When checkpointing, whether to only save the model, or + also the optimizer, scheduler & rng state.Note that + when this is true, you won't be able to resume + training from checkpoint.This enables you to save + storage by not storing the optimizer, scheduler & rng + state.You can only load the model using + from_pretrained with this option set to True. + (default: False) + --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in + version 5.0 of 🤗 Transformers. (default: False) + --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will + use cuda/tpu/mps/npu device if available. (default: + False) + --use_mps_device [USE_MPS_DEVICE] + This argument is deprecated. `mps` device will be used + if available similar to `cuda` device. It will be + removed in version 5.0 of 🤗 Transformers (default: + False) + --seed SEED Random seed that will be set at the beginning of + training. (default: 42) + --data_seed DATA_SEED + Random seed to be used with data samplers. 
(default: + None) + --jit_mode_eval [JIT_MODE_EVAL] + Whether or not to use PyTorch jit trace for inference + (default: False) + --use_ipex [USE_IPEX] + Use Intel extension for PyTorch when it is available, + installation: 'https://github.com/intel/intel- + extension-for-pytorch' (default: False) + --bf16 [BF16] Whether to use bf16 (mixed) precision instead of + 32-bit. Requires Ampere or higher NVIDIA architecture + or using CPU (use_cpu) or Ascend NPU. This is an + experimental API and it may change. (default: False) + --fp16 [FP16] Whether to use fp16 (mixed) precision instead of + 32-bit (default: False) + --fp16_opt_level FP16_OPT_LEVEL + For fp16: Apex AMP optimization level selected in + ['O0', 'O1', 'O2', and 'O3']. See details at + https://nvidia.github.io/apex/amp.html (default: O1) + --half_precision_backend {auto,apex,cpu_amp} + The backend to be used for half precision. (default: + auto) + --bf16_full_eval [BF16_FULL_EVAL] + Whether to use full bfloat16 evaluation instead of + 32-bit. This is an experimental API and it may change. + (default: False) + --fp16_full_eval [FP16_FULL_EVAL] + Whether to use full float16 evaluation instead of + 32-bit (default: False) + --tf32 TF32 Whether to enable tf32 mode, available in Ampere and + newer GPU architectures. This is an experimental API + and it may change. (default: None) + --local_rank LOCAL_RANK + For distributed training: local_rank (default: -1) + --ddp_backend {nccl,gloo,mpi,ccl,hccl} + The backend to be used for distributed training + (default: None) + --tpu_num_cores TPU_NUM_CORES + TPU: Number of TPU cores (automatically passed by + launcher script) (default: None) + --tpu_metrics_debug [TPU_METRICS_DEBUG] + Deprecated, the use of `--debug tpu_metrics_debug` is + preferred. TPU: Whether to print debug metrics + (default: False) + --debug DEBUG [DEBUG ...] + Whether or not to enable debug mode. Current options: + `underflow_overflow` (Detect underflow and overflow in + activations and weights), `tpu_metrics_debug` (print + debug metrics on TPU). (default: None) + --dataloader_drop_last [DATALOADER_DROP_LAST] + Drop the last incomplete batch if it is not divisible + by the batch size. (default: False) + --eval_steps EVAL_STEPS + Run an evaluation every X steps. Should be an integer + or a float in range `[0,1)`. If smaller than 1, will + be interpreted as ratio of total training steps. + (default: None) + --dataloader_num_workers DATALOADER_NUM_WORKERS + Number of subprocesses to use for data loading + (PyTorch only). 0 means that the data will be loaded + in the main process. (default: 0) + --past_index PAST_INDEX + If >=0, uses the corresponding part of the output as + the past state for next step. (default: -1) + --run_name RUN_NAME An optional descriptor for the run. Notably used for + wandb logging. (default: None) + --disable_tqdm DISABLE_TQDM + Whether or not to disable the tqdm progress bars. + (default: None) + --remove_unused_columns [REMOVE_UNUSED_COLUMNS] + Remove columns not required by the model when using an + nlp.Dataset. (default: True) + --no_remove_unused_columns + Remove columns not required by the model when using an + nlp.Dataset. (default: False) + --label_names LABEL_NAMES [LABEL_NAMES ...] + The list of keys in your dictionary of inputs that + correspond to the labels. (default: None) + --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] + Whether or not to load the best model found during + training at the end of training. When this option is + enabled, the best checkpoint will always be saved. 
See + `save_total_limit` for more. (default: False) + --metric_for_best_model METRIC_FOR_BEST_MODEL + The metric to use to compare two different models. + (default: None) + --greater_is_better GREATER_IS_BETTER + Whether the `metric_for_best_model` should be + maximized or not. (default: None) + --ignore_data_skip [IGNORE_DATA_SKIP] + When resuming training, whether or not to skip the + first epochs and batches to get to the same training + data. (default: False) + --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data + Parallel (FSDP) training (in distributed training + only). The base option should be `full_shard`, + `shard_grad_op` or `no_shard` and you can add CPU- + offload to `full_shard` or `shard_grad_op` like this: + full_shard offload` or `shard_grad_op offload`. You + can add auto-wrap to `full_shard` or `shard_grad_op` + with the same syntax: full_shard auto_wrap` or + `shard_grad_op auto_wrap`. (default: ) + --fsdp_min_num_params FSDP_MIN_NUM_PARAMS + This parameter is deprecated. FSDP's minimum number of + parameters for Default Auto Wrapping. (useful only + when `fsdp` field is passed). (default: 0) + --fsdp_config FSDP_CONFIG + Config to be used with FSDP (Pytorch Fully Sharded + Data Parallel). The value is either a fsdp json config + file (e.g., `fsdp_config.json`) or an already loaded + json file as `dict`. (default: None) + --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP + This parameter is deprecated. Transformer layer class + name (case-sensitive) to wrap, e.g, `BertLayer`, + `GPTJBlock`, `T5Block` .... (useful only when `fsdp` + flag is passed). (default: None) + --deepspeed DEEPSPEED + Enable deepspeed and pass the path to deepspeed json + config file (e.g. `ds_config.json`) or an already + loaded json file as a dict (default: None) + --label_smoothing_factor LABEL_SMOOTHING_FACTOR + The label smoothing epsilon to apply (zero means no + label smoothing). (default: 0.0) + --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop} + The optimizer to use. (default: adamw_torch) + --optim_args OPTIM_ARGS + Optional arguments to supply to optimizer. (default: + None) + --adafactor [ADAFACTOR] + Whether or not to replace AdamW by Adafactor. + (default: False) + --group_by_length [GROUP_BY_LENGTH] + Whether or not to group samples of roughly the same + length together when batching. (default: False) + --length_column_name LENGTH_COLUMN_NAME + Column name with precomputed lengths to use when + grouping by length. (default: length) + --report_to REPORT_TO [REPORT_TO ...] + The list of integrations to report the results and + logs to. (default: None) + --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS + When using distributed training, the value of the flag + `find_unused_parameters` passed to + `DistributedDataParallel`. (default: None) + --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB + When using distributed training, the value of the flag + `bucket_cap_mb` passed to `DistributedDataParallel`. + (default: None) + --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS + When using distributed training, the value of the flag + `broadcast_buffers` passed to + `DistributedDataParallel`. (default: None) + --dataloader_pin_memory [DATALOADER_PIN_MEMORY] + Whether or not to pin memory for DataLoader. 
(default: + True) + --no_dataloader_pin_memory + Whether or not to pin memory for DataLoader. (default: + False) + --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] + If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This + allows to maintain the workers Dataset instances + alive. Can potentially speed up training, but will + increase RAM usage. (default: False) + --skip_memory_metrics [SKIP_MEMORY_METRICS] + Whether or not to skip adding of memory profiler + reports to metrics. (default: True) + --no_skip_memory_metrics + Whether or not to skip adding of memory profiler + reports to metrics. (default: False) + --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] + Whether or not to use the legacy prediction_loop in + the Trainer. (default: False) + --push_to_hub [PUSH_TO_HUB] + Whether or not to upload the trained model to the + model hub after training. (default: False) + --resume_from_checkpoint RESUME_FROM_CHECKPOINT + The path to a folder with a valid checkpoint for your + model. (default: None) + --hub_model_id HUB_MODEL_ID + The name of the repository to keep in sync with the + local `output_dir`. (default: None) + --hub_strategy {end,every_save,checkpoint,all_checkpoints} + The hub strategy to use when `--push_to_hub` is + activated. (default: every_save) + --hub_token HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --hub_private_repo [HUB_PRIVATE_REPO] + Whether the model repository is private or not. + (default: False) + --hub_always_push [HUB_ALWAYS_PUSH] + Unless `True`, the Trainer will skip pushes if the + previous one wasn't finished yet. (default: False) + --gradient_checkpointing [GRADIENT_CHECKPOINTING] + If True, use gradient checkpointing to save memory at + the expense of slower backward pass. (default: False) + --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS + Gradient checkpointing key word arguments such as + `use_reentrant`. Will be passed to + `torch.utils.checkpoint.checkpoint` through + `model.gradient_checkpointing_enable`. (default: None) + --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] + Whether or not the inputs will be passed to the + `compute_metrics` function. (default: False) + --fp16_backend {auto,apex,cpu_amp} + Deprecated. Use half_precision_backend instead + (default: auto) + --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID + The name of the repository to which push the + `Trainer`. (default: None) + --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION + The name of the organization in with to which push the + `Trainer`. (default: None) + --push_to_hub_token PUSH_TO_HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --mp_parameters MP_PARAMETERS + Used by the SageMaker launcher to send mp-specific + args. Ignored in Trainer (default: ) + --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] + Whether to automatically decrease the batch size in + half and rerun the training loop again each time a + CUDA Out-of-Memory was reached (default: False) + --full_determinism [FULL_DETERMINISM] + Whether to call enable_full_determinism instead of + set_seed for reproducibility in distributed training. + Important: this will negatively impact the + performance, so only use it for debugging. (default: + False) + --torchdynamo TORCHDYNAMO + This argument is deprecated, use + `--torch_compile_backend` instead. (default: None) + --ray_scope RAY_SCOPE + The scope to use when doing hyperparameter search with + Ray. 
By default, `"last"` will be used. Ray will then + use the last checkpoint of all trials, compare those, + and select the best one. However, other options are + also available. See the Ray documentation (https://doc + s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun + e.ExperimentAnalysis.get_best_trial) for more options. + (default: last) + --ddp_timeout DDP_TIMEOUT + Overrides the default timeout for distributed training + (value should be given in seconds). (default: 1800) + --torch_compile [TORCH_COMPILE] + If set to `True`, the model will be wrapped in + `torch.compile`. (default: False) + --torch_compile_backend TORCH_COMPILE_BACKEND + Which backend to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --torch_compile_mode TORCH_COMPILE_MODE + Which mode to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --dispatch_batches DISPATCH_BATCHES + Whether to dispatch batches across devices in + distributed training. If set to `True`, the dataloader + prepared by the Accelerator is only iterated through + on the main process and then the batches are split and + broadcast to each process. Will default to `True` for + `DataLoader` whoseunderlying dataset is an + `IterableDataset`, `False` otherwise. (default: None) + --split_batches [SPLIT_BATCHES] + Whether or not the accelerator should split the + batches yielded by the dataloaders across the devices + during distributed training. Ifset to `True`, the + actual batch size used will be the same on any kind of + distributed processes, but it must be around multiple + of the number of processes you are using (such as + GPUs). (default: False) + --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] + If set to `True`, the speed metrics will include `tgs` + (tokens per second per device). (default: False) + --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] + If set to `True`, will track the number of input + tokens seen throughout training. (May be slower in + distributed training) (default: False) + --neftune_noise_alpha NEFTUNE_NOISE_ALPHA + Activates neftune noise embeddings into the model. + NEFTune has been proven to drastically improve model + performances for instrcution fine-tuning. Check out + the original paper here: + https://arxiv.org/abs/2310.05914 and the original code + here: https://github.com/neelsjain/NEFTune. Only + supported for `PreTrainedModel` and `PeftModel` + classes. 
(default: None) diff --git a/examples/training/swallow-tart/run_train.py b/examples/training/swallow-tart/run_train.py index e2cc3d4ea..f1bdee901 100644 --- a/examples/training/swallow-tart/run_train.py +++ b/examples/training/swallow-tart/run_train.py @@ -52,6 +52,8 @@ def main(): # validate fp16 or bf16 assert training_args.fp16 or training_args.bf16, "use_flash_attention requires fp16 or bf16" model_kwargs = {"attn_implementation": "flash_attention_2"} + else: + model_kwargs = {} tf_model = Transformer( model_args.model_name, model_args=model_kwargs, @@ -76,6 +78,7 @@ def main(): # preprocessor = TokenizeProcessor(tokenizer, data_args.max_length) preprocessor = TokenizeBatchProcessor(tokenizer, data_args.max_length) train_dataset, eval_dataset = get_dataset( + data_args.hf_dataset_dir, data_args.task_names, data_args.data_dir, data_args.query_file_name, From 1e39898c4d78ef3502c9c3417457bd5f0f9d259f Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Wed, 21 Feb 2024 14:57:25 +0900 Subject: [PATCH 6/9] Apply formatter --- examples/training/swallow-tart/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/training/swallow-tart/data.py b/examples/training/swallow-tart/data.py index d7617f114..39d36722f 100644 --- a/examples/training/swallow-tart/data.py +++ b/examples/training/swallow-tart/data.py @@ -353,7 +353,6 @@ def get_dataset( logger.info(f"Train dataset size: {len(train_dataset)}") logger.info(f"Dev dataset size: {len(dev_dataset)}") - # build Torch Dataset and Return ones. train_torch_dataset = MNRLDataset(train_dataset, tokenizer, max_length) dev_torch_dataset = MNRLDataset(dev_dataset, tokenizer, max_length) From 95ccb0d49fbd7af5aeced526d01663c916ebfc2e Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Wed, 21 Feb 2024 16:04:11 +0900 Subject: [PATCH 7/9] Apply ruff --- examples/training/swallow-tart/run_train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/training/swallow-tart/run_train.py b/examples/training/swallow-tart/run_train.py index f1bdee901..ddc9253ba 100644 --- a/examples/training/swallow-tart/run_train.py +++ b/examples/training/swallow-tart/run_train.py @@ -18,7 +18,7 @@ from transformers.utils import logging from args import STDataArgumnets, STModelArguments, STTrainingArguments -from data import get_dataset, TokenizeProcessor, TokenizeBatchProcessor, IRCollator +from data import get_dataset, TokenizeBatchProcessor, IRCollator logger = logging.get_logger(__name__) @@ -75,7 +75,6 @@ def main(): # define train/eval dataset logger.info("Load dataset") logger.info(f"Target task names: {data_args.task_names}") - # preprocessor = TokenizeProcessor(tokenizer, data_args.max_length) preprocessor = TokenizeBatchProcessor(tokenizer, data_args.max_length) train_dataset, eval_dataset = get_dataset( data_args.hf_dataset_dir, From 78b324affc89935c523b989227a9dd83c8b91acd Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Thu, 22 Feb 2024 11:28:03 +0900 Subject: [PATCH 8/9] Add ds and lora config --- .../swallow-tart/configs/ds_config_zero3.json | 60 +++++++++++++++++++ .../swallow-tart/configs/lora_config.json | 27 +++++++++ install-deepspeed.sh | 2 + 3 files changed, 89 insertions(+) create mode 100644 examples/training/swallow-tart/configs/ds_config_zero3.json create mode 100644 examples/training/swallow-tart/configs/lora_config.json create mode 100755 install-deepspeed.sh diff --git a/examples/training/swallow-tart/configs/ds_config_zero3.json b/examples/training/swallow-tart/configs/ds_config_zero3.json new file mode 100644 index 
000000000..ac7eeb18b --- /dev/null +++ b/examples/training/swallow-tart/configs/ds_config_zero3.json @@ -0,0 +1,60 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 10, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 10, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": "auto" + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/examples/training/swallow-tart/configs/lora_config.json b/examples/training/swallow-tart/configs/lora_config.json new file mode 100644 index 000000000..26705720e --- /dev/null +++ b/examples/training/swallow-tart/configs/lora_config.json @@ -0,0 +1,27 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "tokyotech-llm/Swallow-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "down_proj", + "up_proj", + "gate_proj" + ], + "task_type": "FEATURE_EXTRACTION", + "use_rslora": true +} diff --git a/install-deepspeed.sh b/install-deepspeed.sh new file mode 100755 index 000000000..35414b901 --- /dev/null +++ b/install-deepspeed.sh @@ -0,0 +1,2 @@ +#!/bin/sh +DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_UTILS=1 pip install deepspeed --no-cache-dir From 3e7363ce19ec845cfa206882fbc342864fe8b9bf Mon Sep 17 00:00:00 2001 From: Katsumata420 Date: Thu, 22 Feb 2024 11:32:53 +0900 Subject: [PATCH 9/9] Delete files --- examples/training/swallow-tart/h | 581 ------------------ .../training/swallow-tart/peft_config.json | 29 - 2 files changed, 610 deletions(-) delete mode 100644 examples/training/swallow-tart/h delete mode 100644 examples/training/swallow-tart/peft_config.json diff --git a/examples/training/swallow-tart/h b/examples/training/swallow-tart/h deleted file mode 100644 index 95c104586..000000000 --- a/examples/training/swallow-tart/h +++ /dev/null @@ -1,581 +0,0 @@ -usage: run_train.py [-h] --data_dir DATA_DIR - [--task_names TASK_NAMES [TASK_NAMES ...]] - [--max_length MAX_LENGTH] [--n_dev_sample N_DEV_SAMPLE] - [--query_file_name QUERY_FILE_NAME] - [--corpus_file_name CORPUS_FILE_NAME] - [--qrel_file_name QREL_FILE_NAME] - [--hard_negatives_file_name HARD_NEGATIVES_FILE_NAME] - [--num_proc NUM_PROC] [--model_name MODEL_NAME] - 
[--peft_config_path PEFT_CONFIG_PATH] - [--use_flash_attention [USE_FLASH_ATTENTION]] --output_dir - OUTPUT_DIR [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] - [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] - [--do_predict [DO_PREDICT]] - [--evaluation_strategy {no,steps,epoch}] - [--prediction_loss_only [PREDICTION_LOSS_ONLY]] - [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] - [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] - [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] - [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] - [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] - [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] - [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] - [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] - [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] - [--max_grad_norm MAX_GRAD_NORM] - [--num_train_epochs NUM_TRAIN_EPOCHS] - [--max_steps MAX_STEPS] - [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}] - [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] - [--warmup_ratio WARMUP_RATIO] - [--warmup_steps WARMUP_STEPS] - [--log_level {detail,debug,info,warning,error,critical,passive}] - [--log_level_replica {detail,debug,info,warning,error,critical,passive}] - [--log_on_each_node [LOG_ON_EACH_NODE]] - [--no_log_on_each_node] [--logging_dir LOGGING_DIR] - [--logging_strategy {no,steps,epoch}] - [--logging_first_step [LOGGING_FIRST_STEP]] - [--logging_steps LOGGING_STEPS] - [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] - [--no_logging_nan_inf_filter] - [--save_strategy {no,steps,epoch}] - [--save_steps SAVE_STEPS] - [--save_total_limit SAVE_TOTAL_LIMIT] - [--save_safetensors [SAVE_SAFETENSORS]] - [--no_save_safetensors] - [--save_on_each_node [SAVE_ON_EACH_NODE]] - [--save_only_model [SAVE_ONLY_MODEL]] - [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] - [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] - [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] - [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] - [--fp16_opt_level FP16_OPT_LEVEL] - [--half_precision_backend {auto,apex,cpu_amp}] - [--bf16_full_eval [BF16_FULL_EVAL]] - [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] - [--local_rank LOCAL_RANK] - [--ddp_backend {nccl,gloo,mpi,ccl,hccl}] - [--tpu_num_cores TPU_NUM_CORES] - [--tpu_metrics_debug [TPU_METRICS_DEBUG]] - [--debug DEBUG [DEBUG ...]] - [--dataloader_drop_last [DATALOADER_DROP_LAST]] - [--eval_steps EVAL_STEPS] - [--dataloader_num_workers DATALOADER_NUM_WORKERS] - [--past_index PAST_INDEX] [--run_name RUN_NAME] - [--disable_tqdm DISABLE_TQDM] - [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] - [--no_remove_unused_columns] - [--label_names LABEL_NAMES [LABEL_NAMES ...]] - [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] - [--metric_for_best_model METRIC_FOR_BEST_MODEL] - [--greater_is_better GREATER_IS_BETTER] - [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] - [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] - [--fsdp_config FSDP_CONFIG] - [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] - [--deepspeed DEEPSPEED] - [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] - [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop}] - [--optim_args OPTIM_ARGS] 
[--adafactor [ADAFACTOR]] - [--group_by_length [GROUP_BY_LENGTH]] - [--length_column_name LENGTH_COLUMN_NAME] - [--report_to REPORT_TO [REPORT_TO ...]] - [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] - [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] - [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] - [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] - [--no_dataloader_pin_memory] - [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] - [--skip_memory_metrics [SKIP_MEMORY_METRICS]] - [--no_skip_memory_metrics] - [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] - [--push_to_hub [PUSH_TO_HUB]] - [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] - [--hub_model_id HUB_MODEL_ID] - [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] - [--hub_token HUB_TOKEN] - [--hub_private_repo [HUB_PRIVATE_REPO]] - [--hub_always_push [HUB_ALWAYS_PUSH]] - [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] - [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] - [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] - [--fp16_backend {auto,apex,cpu_amp}] - [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] - [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] - [--push_to_hub_token PUSH_TO_HUB_TOKEN] - [--mp_parameters MP_PARAMETERS] - [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] - [--full_determinism [FULL_DETERMINISM]] - [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] - [--ddp_timeout DDP_TIMEOUT] - [--torch_compile [TORCH_COMPILE]] - [--torch_compile_backend TORCH_COMPILE_BACKEND] - [--torch_compile_mode TORCH_COMPILE_MODE] - [--dispatch_batches DISPATCH_BATCHES] - [--split_batches [SPLIT_BATCHES]] - [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] - [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] - [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] - -options: - -h, --help show this help message and exit - --data_dir DATA_DIR - --task_names TASK_NAMES [TASK_NAMES ...] - --max_length MAX_LENGTH - --n_dev_sample N_DEV_SAMPLE - --query_file_name QUERY_FILE_NAME - --corpus_file_name CORPUS_FILE_NAME - --qrel_file_name QREL_FILE_NAME - --hard_negatives_file_name HARD_NEGATIVES_FILE_NAME - --num_proc NUM_PROC - --model_name MODEL_NAME - --peft_config_path PEFT_CONFIG_PATH - --use_flash_attention [USE_FLASH_ATTENTION] - --output_dir OUTPUT_DIR - The output directory where the model predictions and - checkpoints will be written. (default: None) - --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] - Overwrite the content of the output directory. Use - this to continue training if output_dir points to a - checkpoint directory. (default: False) - --do_train [DO_TRAIN] - Whether to run training. (default: False) - --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) - --do_predict [DO_PREDICT] - Whether to run predictions on the test set. (default: - False) - --evaluation_strategy {no,steps,epoch} - The evaluation strategy to use. (default: no) - --prediction_loss_only [PREDICTION_LOSS_ONLY] - When performing evaluation and predictions, only - returns the loss. (default: False) - --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE - Batch size per GPU/TPU/MPS/NPU core/CPU for training. - (default: 8) - --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE - Batch size per GPU/TPU/MPS/NPU core/CPU for - evaluation. (default: 8) - --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE - Deprecated, the use of `--per_device_train_batch_size` - is preferred. Batch size per GPU/TPU core/CPU for - training. 
(default: None) - --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE - Deprecated, the use of `--per_device_eval_batch_size` - is preferred. Batch size per GPU/TPU core/CPU for - evaluation. (default: None) - --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS - Number of updates steps to accumulate before - performing a backward/update pass. (default: 1) - --eval_accumulation_steps EVAL_ACCUMULATION_STEPS - Number of predictions steps to accumulate before - moving the tensors to the CPU. (default: None) - --eval_delay EVAL_DELAY - Number of epochs or steps to wait for before the first - evaluation can be performed, depending on the - evaluation_strategy. (default: 0) - --learning_rate LEARNING_RATE - The initial learning rate for AdamW. (default: 5e-05) - --weight_decay WEIGHT_DECAY - Weight decay for AdamW if we apply some. (default: - 0.0) - --adam_beta1 ADAM_BETA1 - Beta1 for AdamW optimizer (default: 0.9) - --adam_beta2 ADAM_BETA2 - Beta2 for AdamW optimizer (default: 0.999) - --adam_epsilon ADAM_EPSILON - Epsilon for AdamW optimizer. (default: 1e-08) - --max_grad_norm MAX_GRAD_NORM - Max gradient norm. (default: 1.0) - --num_train_epochs NUM_TRAIN_EPOCHS - Total number of training epochs to perform. (default: - 3.0) - --max_steps MAX_STEPS - If > 0: set total number of training steps to perform. - Override num_train_epochs. (default: -1) - --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau} - The scheduler type to use. (default: linear) - --lr_scheduler_kwargs LR_SCHEDULER_KWARGS - Extra parameters for the lr_scheduler such as - {'num_cycles': 1} for the cosine with hard restarts - (default: {}) - --warmup_ratio WARMUP_RATIO - Linear warmup over warmup_ratio fraction of total - steps. (default: 0.0) - --warmup_steps WARMUP_STEPS - Linear warmup over warmup_steps. (default: 0) - --log_level {detail,debug,info,warning,error,critical,passive} - Logger log level to use on the main node. Possible - choices are the log levels as strings: 'debug', - 'info', 'warning', 'error' and 'critical', plus a - 'passive' level which doesn't set anything and lets - the application set the level. Defaults to 'passive'. - (default: passive) - --log_level_replica {detail,debug,info,warning,error,critical,passive} - Logger log level to use on replica nodes. Same choices - and defaults as ``log_level`` (default: warning) - --log_on_each_node [LOG_ON_EACH_NODE] - When doing a multinode distributed training, whether - to log once per node or just once on the main node. - (default: True) - --no_log_on_each_node - When doing a multinode distributed training, whether - to log once per node or just once on the main node. - (default: False) - --logging_dir LOGGING_DIR - Tensorboard log dir. (default: None) - --logging_strategy {no,steps,epoch} - The logging strategy to use. (default: steps) - --logging_first_step [LOGGING_FIRST_STEP] - Log the first global_step (default: False) - --logging_steps LOGGING_STEPS - Log every X updates steps. Should be an integer or a - float in range `[0,1)`. If smaller than 1, will be - interpreted as ratio of total training steps. - (default: 500) - --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] - Filter nan and inf losses for logging. (default: True) - --no_logging_nan_inf_filter - Filter nan and inf losses for logging. (default: - False) - --save_strategy {no,steps,epoch} - The checkpoint save strategy to use. (default: steps) - --save_steps SAVE_STEPS - Save checkpoint every X updates steps. 
Should be an - integer or a float in range `[0,1)`. If smaller than - 1, will be interpreted as ratio of total training - steps. (default: 500) - --save_total_limit SAVE_TOTAL_LIMIT - If a value is passed, will limit the total amount of - checkpoints. Deletes the older checkpoints in - `output_dir`. When `load_best_model_at_end` is - enabled, the 'best' checkpoint according to - `metric_for_best_model` will always be retained in - addition to the most recent ones. For example, for - `save_total_limit=5` and - `load_best_model_at_end=True`, the four last - checkpoints will always be retained alongside the best - model. When `save_total_limit=1` and - `load_best_model_at_end=True`, it is possible that two - checkpoints are saved: the last one and the best one - (if they are different). Default is unlimited - checkpoints (default: None) - --save_safetensors [SAVE_SAFETENSORS] - Use safetensors saving and loading for state dicts - instead of default torch.load and torch.save. - (default: True) - --no_save_safetensors - Use safetensors saving and loading for state dicts - instead of default torch.load and torch.save. - (default: False) - --save_on_each_node [SAVE_ON_EACH_NODE] - When doing multi-node distributed training, whether to - save models and checkpoints on each node, or only on - the main one (default: False) - --save_only_model [SAVE_ONLY_MODEL] - When checkpointing, whether to only save the model, or - also the optimizer, scheduler & rng state.Note that - when this is true, you won't be able to resume - training from checkpoint.This enables you to save - storage by not storing the optimizer, scheduler & rng - state.You can only load the model using - from_pretrained with this option set to True. - (default: False) - --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in - version 5.0 of 🤗 Transformers. (default: False) - --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will - use cuda/tpu/mps/npu device if available. (default: - False) - --use_mps_device [USE_MPS_DEVICE] - This argument is deprecated. `mps` device will be used - if available similar to `cuda` device. It will be - removed in version 5.0 of 🤗 Transformers (default: - False) - --seed SEED Random seed that will be set at the beginning of - training. (default: 42) - --data_seed DATA_SEED - Random seed to be used with data samplers. (default: - None) - --jit_mode_eval [JIT_MODE_EVAL] - Whether or not to use PyTorch jit trace for inference - (default: False) - --use_ipex [USE_IPEX] - Use Intel extension for PyTorch when it is available, - installation: 'https://github.com/intel/intel- - extension-for-pytorch' (default: False) - --bf16 [BF16] Whether to use bf16 (mixed) precision instead of - 32-bit. Requires Ampere or higher NVIDIA architecture - or using CPU (use_cpu) or Ascend NPU. This is an - experimental API and it may change. (default: False) - --fp16 [FP16] Whether to use fp16 (mixed) precision instead of - 32-bit (default: False) - --fp16_opt_level FP16_OPT_LEVEL - For fp16: Apex AMP optimization level selected in - ['O0', 'O1', 'O2', and 'O3']. See details at - https://nvidia.github.io/apex/amp.html (default: O1) - --half_precision_backend {auto,apex,cpu_amp} - The backend to be used for half precision. (default: - auto) - --bf16_full_eval [BF16_FULL_EVAL] - Whether to use full bfloat16 evaluation instead of - 32-bit. This is an experimental API and it may change. 
- (default: False) - --fp16_full_eval [FP16_FULL_EVAL] - Whether to use full float16 evaluation instead of - 32-bit (default: False) - --tf32 TF32 Whether to enable tf32 mode, available in Ampere and - newer GPU architectures. This is an experimental API - and it may change. (default: None) - --local_rank LOCAL_RANK - For distributed training: local_rank (default: -1) - --ddp_backend {nccl,gloo,mpi,ccl,hccl} - The backend to be used for distributed training - (default: None) - --tpu_num_cores TPU_NUM_CORES - TPU: Number of TPU cores (automatically passed by - launcher script) (default: None) - --tpu_metrics_debug [TPU_METRICS_DEBUG] - Deprecated, the use of `--debug tpu_metrics_debug` is - preferred. TPU: Whether to print debug metrics - (default: False) - --debug DEBUG [DEBUG ...] - Whether or not to enable debug mode. Current options: - `underflow_overflow` (Detect underflow and overflow in - activations and weights), `tpu_metrics_debug` (print - debug metrics on TPU). (default: None) - --dataloader_drop_last [DATALOADER_DROP_LAST] - Drop the last incomplete batch if it is not divisible - by the batch size. (default: False) - --eval_steps EVAL_STEPS - Run an evaluation every X steps. Should be an integer - or a float in range `[0,1)`. If smaller than 1, will - be interpreted as ratio of total training steps. - (default: None) - --dataloader_num_workers DATALOADER_NUM_WORKERS - Number of subprocesses to use for data loading - (PyTorch only). 0 means that the data will be loaded - in the main process. (default: 0) - --past_index PAST_INDEX - If >=0, uses the corresponding part of the output as - the past state for next step. (default: -1) - --run_name RUN_NAME An optional descriptor for the run. Notably used for - wandb logging. (default: None) - --disable_tqdm DISABLE_TQDM - Whether or not to disable the tqdm progress bars. - (default: None) - --remove_unused_columns [REMOVE_UNUSED_COLUMNS] - Remove columns not required by the model when using an - nlp.Dataset. (default: True) - --no_remove_unused_columns - Remove columns not required by the model when using an - nlp.Dataset. (default: False) - --label_names LABEL_NAMES [LABEL_NAMES ...] - The list of keys in your dictionary of inputs that - correspond to the labels. (default: None) - --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] - Whether or not to load the best model found during - training at the end of training. When this option is - enabled, the best checkpoint will always be saved. See - `save_total_limit` for more. (default: False) - --metric_for_best_model METRIC_FOR_BEST_MODEL - The metric to use to compare two different models. - (default: None) - --greater_is_better GREATER_IS_BETTER - Whether the `metric_for_best_model` should be - maximized or not. (default: None) - --ignore_data_skip [IGNORE_DATA_SKIP] - When resuming training, whether or not to skip the - first epochs and batches to get to the same training - data. (default: False) - --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data - Parallel (FSDP) training (in distributed training - only). The base option should be `full_shard`, - `shard_grad_op` or `no_shard` and you can add CPU- - offload to `full_shard` or `shard_grad_op` like this: - full_shard offload` or `shard_grad_op offload`. You - can add auto-wrap to `full_shard` or `shard_grad_op` - with the same syntax: full_shard auto_wrap` or - `shard_grad_op auto_wrap`. (default: ) - --fsdp_min_num_params FSDP_MIN_NUM_PARAMS - This parameter is deprecated. 
FSDP's minimum number of - parameters for Default Auto Wrapping. (useful only - when `fsdp` field is passed). (default: 0) - --fsdp_config FSDP_CONFIG - Config to be used with FSDP (Pytorch Fully Sharded - Data Parallel). The value is either a fsdp json config - file (e.g., `fsdp_config.json`) or an already loaded - json file as `dict`. (default: None) - --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP - This parameter is deprecated. Transformer layer class - name (case-sensitive) to wrap, e.g, `BertLayer`, - `GPTJBlock`, `T5Block` .... (useful only when `fsdp` - flag is passed). (default: None) - --deepspeed DEEPSPEED - Enable deepspeed and pass the path to deepspeed json - config file (e.g. `ds_config.json`) or an already - loaded json file as a dict (default: None) - --label_smoothing_factor LABEL_SMOOTHING_FACTOR - The label smoothing epsilon to apply (zero means no - label smoothing). (default: 0.0) - --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop} - The optimizer to use. (default: adamw_torch) - --optim_args OPTIM_ARGS - Optional arguments to supply to optimizer. (default: - None) - --adafactor [ADAFACTOR] - Whether or not to replace AdamW by Adafactor. - (default: False) - --group_by_length [GROUP_BY_LENGTH] - Whether or not to group samples of roughly the same - length together when batching. (default: False) - --length_column_name LENGTH_COLUMN_NAME - Column name with precomputed lengths to use when - grouping by length. (default: length) - --report_to REPORT_TO [REPORT_TO ...] - The list of integrations to report the results and - logs to. (default: None) - --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS - When using distributed training, the value of the flag - `find_unused_parameters` passed to - `DistributedDataParallel`. (default: None) - --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB - When using distributed training, the value of the flag - `bucket_cap_mb` passed to `DistributedDataParallel`. - (default: None) - --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS - When using distributed training, the value of the flag - `broadcast_buffers` passed to - `DistributedDataParallel`. (default: None) - --dataloader_pin_memory [DATALOADER_PIN_MEMORY] - Whether or not to pin memory for DataLoader. (default: - True) - --no_dataloader_pin_memory - Whether or not to pin memory for DataLoader. (default: - False) - --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] - If True, the data loader will not shut down the worker - processes after a dataset has been consumed once. This - allows to maintain the workers Dataset instances - alive. Can potentially speed up training, but will - increase RAM usage. (default: False) - --skip_memory_metrics [SKIP_MEMORY_METRICS] - Whether or not to skip adding of memory profiler - reports to metrics. (default: True) - --no_skip_memory_metrics - Whether or not to skip adding of memory profiler - reports to metrics. (default: False) - --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] - Whether or not to use the legacy prediction_loop in - the Trainer. (default: False) - --push_to_hub [PUSH_TO_HUB] - Whether or not to upload the trained model to the - model hub after training. 
(default: False) - --resume_from_checkpoint RESUME_FROM_CHECKPOINT - The path to a folder with a valid checkpoint for your - model. (default: None) - --hub_model_id HUB_MODEL_ID - The name of the repository to keep in sync with the - local `output_dir`. (default: None) - --hub_strategy {end,every_save,checkpoint,all_checkpoints} - The hub strategy to use when `--push_to_hub` is - activated. (default: every_save) - --hub_token HUB_TOKEN - The token to use to push to the Model Hub. (default: - None) - --hub_private_repo [HUB_PRIVATE_REPO] - Whether the model repository is private or not. - (default: False) - --hub_always_push [HUB_ALWAYS_PUSH] - Unless `True`, the Trainer will skip pushes if the - previous one wasn't finished yet. (default: False) - --gradient_checkpointing [GRADIENT_CHECKPOINTING] - If True, use gradient checkpointing to save memory at - the expense of slower backward pass. (default: False) - --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS - Gradient checkpointing key word arguments such as - `use_reentrant`. Will be passed to - `torch.utils.checkpoint.checkpoint` through - `model.gradient_checkpointing_enable`. (default: None) - --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] - Whether or not the inputs will be passed to the - `compute_metrics` function. (default: False) - --fp16_backend {auto,apex,cpu_amp} - Deprecated. Use half_precision_backend instead - (default: auto) - --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID - The name of the repository to which push the - `Trainer`. (default: None) - --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION - The name of the organization in with to which push the - `Trainer`. (default: None) - --push_to_hub_token PUSH_TO_HUB_TOKEN - The token to use to push to the Model Hub. (default: - None) - --mp_parameters MP_PARAMETERS - Used by the SageMaker launcher to send mp-specific - args. Ignored in Trainer (default: ) - --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] - Whether to automatically decrease the batch size in - half and rerun the training loop again each time a - CUDA Out-of-Memory was reached (default: False) - --full_determinism [FULL_DETERMINISM] - Whether to call enable_full_determinism instead of - set_seed for reproducibility in distributed training. - Important: this will negatively impact the - performance, so only use it for debugging. (default: - False) - --torchdynamo TORCHDYNAMO - This argument is deprecated, use - `--torch_compile_backend` instead. (default: None) - --ray_scope RAY_SCOPE - The scope to use when doing hyperparameter search with - Ray. By default, `"last"` will be used. Ray will then - use the last checkpoint of all trials, compare those, - and select the best one. However, other options are - also available. See the Ray documentation (https://doc - s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun - e.ExperimentAnalysis.get_best_trial) for more options. - (default: last) - --ddp_timeout DDP_TIMEOUT - Overrides the default timeout for distributed training - (value should be given in seconds). (default: 1800) - --torch_compile [TORCH_COMPILE] - If set to `True`, the model will be wrapped in - `torch.compile`. (default: False) - --torch_compile_backend TORCH_COMPILE_BACKEND - Which backend to use with `torch.compile`, passing one - will trigger a model compilation. (default: None) - --torch_compile_mode TORCH_COMPILE_MODE - Which mode to use with `torch.compile`, passing one - will trigger a model compilation. 
(default: None) - --dispatch_batches DISPATCH_BATCHES - Whether to dispatch batches across devices in - distributed training. If set to `True`, the dataloader - prepared by the Accelerator is only iterated through - on the main process and then the batches are split and - broadcast to each process. Will default to `True` for - `DataLoader` whoseunderlying dataset is an - `IterableDataset`, `False` otherwise. (default: None) - --split_batches [SPLIT_BATCHES] - Whether or not the accelerator should split the - batches yielded by the dataloaders across the devices - during distributed training. Ifset to `True`, the - actual batch size used will be the same on any kind of - distributed processes, but it must be around multiple - of the number of processes you are using (such as - GPUs). (default: False) - --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] - If set to `True`, the speed metrics will include `tgs` - (tokens per second per device). (default: False) - --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] - If set to `True`, will track the number of input - tokens seen throughout training. (May be slower in - distributed training) (default: False) - --neftune_noise_alpha NEFTUNE_NOISE_ALPHA - Activates neftune noise embeddings into the model. - NEFTune has been proven to drastically improve model - performances for instrcution fine-tuning. Check out - the original paper here: - https://arxiv.org/abs/2310.05914 and the original code - here: https://github.com/neelsjain/NEFTune. Only - supported for `PreTrainedModel` and `PeftModel` - classes. (default: None) diff --git a/examples/training/swallow-tart/peft_config.json b/examples/training/swallow-tart/peft_config.json deleted file mode 100644 index 016ff8df9..000000000 --- a/examples/training/swallow-tart/peft_config.json +++ /dev/null @@ -1,29 +0,0 @@ -# https://huggingface.co/intfloat/e5-mistral-7b-instruct/blob/main/lora/adapter_config.json -# Lora rank and alpha: https://llm-jp.nii.ac.jp/blog/2024/02/09/v1.1-tuning.html#%E3%83%8F%E3%82%A4%E3%83%91%E3%83%BC%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF -{ - "auto_mapping": null, - "base_model_name_or_path": "mistralai/Mistral-7B-v0.1", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": false, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 256, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 128, - "revision": null, - "target_modules": [ - "q_proj", - "k_proj", - "v_proj", - "o_proj", - "down_proj", - "up_proj", - "gate_proj" - ], - "task_type": "FEATURE_EXTRACTION", - "use_rslora": true -} \ No newline at end of file
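
Usage sketch: the patches above add a ZeRO-3 DeepSpeed config, a LoRA config for tokyotech-llm/Swallow-7b-hf, and a DeepSpeed install script, but no launch command. The commands below show one plausible way to combine them, assuming they are run from the repository root, that the retrieval data referenced by --data_dir has already been prepared, and that the DeepSpeed launcher is used as the entry point; the data path and task name are placeholders and the hyperparameter values are illustrative only, not values taken from the patches.

    # build DeepSpeed with the fused/CPU Adam kernels enabled (script added in PATCH 8/9)
    ./install-deepspeed.sh

    # run the training script through the DeepSpeed launcher, pointing the
    # HF Trainer at the ZeRO-3 and LoRA configs added in PATCH 8/9;
    # --use_flash_attention requires --bf16 or --fp16 (see the assertion in run_train.py)
    cd examples/training/swallow-tart
    deepspeed run_train.py \
        --deepspeed configs/ds_config_zero3.json \
        --peft_config_path configs/lora_config.json \
        --model_name tokyotech-llm/Swallow-7b-hf \
        --use_flash_attention \
        --bf16 \
        --gradient_checkpointing \
        --do_train \
        --data_dir /path/to/prepared/data \
        --task_names mytask \
        --output_dir ./output \
        --per_device_train_batch_size 8 \
        --learning_rate 1e-4 \
        --num_train_epochs 1

The --deepspeed, --peft_config_path, --bf16, and related flags correspond to the run_train.py --help output that was committed (and later deleted in PATCH 9/9) in this series.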