diff --git a/.gitignore b/.gitignore
index 5d1af70..c35ebbe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ data.tar.gz
 .ipynb_checkpoints
 __pycache__
 outputs
+*.out
 *.json
 elastic_setting.py
 elastic_test.py
diff --git a/arguments.py b/arguments.py
index f75ceec..1fa4f12 100644
--- a/arguments.py
+++ b/arguments.py
@@ -9,7 +9,7 @@ class ModelArguments:
     """
 
     model_name_or_path: str = field(
-        default="klue/bert-base",
+        default="monologg/koelectra-base-v3-discriminator",
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        },
diff --git a/es_retrieval.py b/es_retrieval.py
index 7a91d0d..423b2ae 100644
--- a/es_retrieval.py
+++ b/es_retrieval.py
@@ -60,11 +60,11 @@ def create_index(self, index_name: str, setting_path: str = "./settings.json"):
             self.client.indices.delete(index=index_name)
 
         else:
-            return
+            return False
 
         self.client.indices.create(index=index_name, body=settings)
 
-        print(f"Create an Index ({index_name})")
+        return True
 
     def get_indices(self):
         indices = list(self.client.indices.get_alias().keys())
@@ -86,7 +86,7 @@ def delete_index(self, index_name: str):
     def insert_data(
         self,
         index_name: str,
-        data_path: str = "../data/deduplication_wikipedia_documents.json",
+        data_path: str = "../data/wikipedia_documents.json",
     ):
         """_summary_
 
@@ -100,14 +100,12 @@ def insert_data(
         docs = []
         print("Data Loding...")
 
-        for k, v in data.items():
+        for i, v in enumerate(data.values()):
             doc = {
                 "_index": index_name,
                 "_type": "_doc",
-                "_id": k,
-                "document_id": v["document_id"],
+                "_id": i,
                 "text": v["text"],
-                "corpus_source": v["corpus_source"],
                 "title": v["title"],
             }
@@ -129,6 +127,13 @@ def delete_data(self, index_name: str, doc_id):
         self.client.delete(index=index_name, id=doc_id)
         print(f"Deleted {doc_id} document.")
+
+    def init_index(self, index_name: str):
+        if self.client.indices.exists(index=index_name):
+            self.delete_index(index_name=index_name)
+
+        self.create_index(index_name=index_name)
+        print(f"Initialization...({index_name})")
 
     def document_count(self, index_name: str):
@@ -139,30 +144,21 @@ def search(self, index_name: str, question: str, topk: int = 10):
 
         body = {"query": {"bool": {"must": [{"match": {"text": question}}]}}}
 
-        responses = self.client.search(index=index_name, body=body, size=topk)["hits"][
-            "hits"
-        ]
-        outputs = [
-            {"text": res["_source"]["text"], "score": res["_score"]}
-            for res in responses
-        ]
+        responses = self.client.search(index=index_name, body=body, size=topk)["hits"]["hits"]
 
-        return outputs
+        return responses
 
 
 if __name__ == "__main__":
     es = ElasticObject("localhost:9200")
-    es.create_index("wiki_docs")
-    es.create_index("wiki_docs")
-    es.delete_index("wiki_docs")
-    es.create_index("wiki_docs")
-    es.insert_data("wiki_docs")
-    es.document_count("wiki_docs")
+    # es.create_index("wiki_docs")
+    # es.insert_data("wiki_docs")
+    # print(es.document_count("wiki_docs"))
 
     outputs = es.search("wiki_docs", "소백산맥의 동남부에 위치한 지역은?")
 
     for output in outputs:
-        print("doc:", output["text"])
-        print("score:", output["score"])
+        print("doc:", output['_source']["text"])
+        print("score:", output["_score"])
         print()
diff --git a/inference.py b/inference.py
index d04987b..c203f50 100644
--- a/inference.py
+++ b/inference.py
@@ -21,7 +21,7 @@
     load_metric,
 )
 from transformers import AutoTokenizer, AutoModelForMaskedLM
-from retrieval import TfidfRetrieval, BM25
+from retrieval import TfidfRetrieval, BM25, ElasticRetrieval
 from trainer_qa import QuestionAnsweringTrainer
 from colbert.inference import run_colbert_retrieval
 from transformers import (
@@ -124,6 +124,9 @@ def run_sparse_retrieval(
         retriever = TfidfRetrieval(
             tokenize_fn=tokenize_fn, data_path=data_path, context_path=context_path
         )
+
+    elif data_args.retrieval_choice == "elastic":
+        retriever = ElasticRetrieval(host='localhost', port='9200')
 
     retriever.get_sparse_embedding()
     if data_args.use_faiss:
@@ -143,6 +146,7 @@ def run_sparse_retrieval(
 
     # train data 에 대해선 정답이 존재하므로 id question context answer 로 데이터셋이 구성됩니다.
     elif training_args.do_eval:
+        df = df.drop(columns=["original_context"])
         f = Features(
             {
                 "answers": Sequence(
@@ -172,6 +176,7 @@ def run_mrc(
     model,
 ) -> NoReturn:
     print(datasets["validation"])
+    # eval 혹은 prediction에서만 사용함
 
     column_names = datasets["validation"].column_names
diff --git a/retrieval.py b/retrieval.py
index 8058180..4e1662f 100644
--- a/retrieval.py
+++ b/retrieval.py
@@ -12,7 +12,7 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 from tqdm.auto import tqdm
 from rank_bm25 import BM25Okapi
-
+from es_retrieval import ElasticObject
 
 @contextmanager
 def timer(name):
@@ -501,25 +501,123 @@ def get_relevant_doc_bulk(self, queries: List, k: Optional[int] = 1) -> Tuple[Li
 
         return doc_scores, doc_indices
 
+
+class ElasticRetrieval:
+    def __init__(self, host='localhost', port='9200') -> NoReturn:
+        self.host = host
+        self.port = port
+        self.elastic_client = ElasticObject(host=self.host, port=self.port)
+
+    def retrieve(
+        self, query_or_dataset: Union[str, Dataset], topk: Optional[int] = 1
+    ) -> Union[Tuple[List, List], pd.DataFrame]:
+
+        if isinstance(query_or_dataset, str):
+            doc_scores, doc_indices, responses = self.get_relevant_doc(query_or_dataset, k=topk)
+            print("[Search query]\n", query_or_dataset, "\n")
+
+            for i in range(min(topk, len(responses))):
+                print(f"Top-{i+1} passage with score {doc_scores[i]:4f}")
+                print(doc_indices[i])
+                print(responses[i]['_source']['text'])
+
+            return (doc_scores, [doc_indices[i] for i in range(min(topk, len(doc_indices)))])
+
+        elif isinstance(query_or_dataset, Dataset):
+            # Retrieve한 Passage를 pd.DataFrame으로 반환합니다.
+            total = []
+            with timer("query exhaustive search"):
+                doc_scores, doc_indices, doc_responses = self.get_relevant_doc_bulk(
+                    query_or_dataset["question"], k=topk
+                )
+
+            for idx, example in enumerate(tqdm(query_or_dataset, desc="Elasticsearch")):
+                # retrieved_context 구하는 부분 수정
+                retrieved_context = []
+                for i in range(min(topk, len(doc_responses[idx]))):
+                    retrieved_context.append(doc_responses[idx][i]['_source']['text'])
+
+                tmp = {
+                    # Query와 해당 id를 반환합니다.
+                    "question": example["question"],
+                    "id": example["id"],
+                    # Retrieve한 Passage의 id, context를 반환합니다.
+                    # "context_id": doc_indices[idx],
+                    "context": " ".join(retrieved_context), # 수정
+                }
+                if "context" in example.keys() and "answers" in example.keys():
+                    # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
+                    tmp["original_context"] = example["context"]
+                    tmp["answers"] = example["answers"]
+                total.append(tmp)
+
+            cqas = pd.DataFrame(total)
+
+        return cqas
+
+    def get_sparse_embedding(self):
+        with timer("elastic building..."):
+            indices = self.elastic_client.get_indices()
+            print('Elastic indices :', indices)
+
+    def get_relevant_doc(self, query: str, k: Optional[int] = 1) -> Tuple[List, List, List]:
+        with timer("query ex search"):
+            responses = self.elastic_client.search('wiki_docs', question=query, topk=k)
+            doc_score = []
+            doc_indices = []
+            for res in responses:
+                doc_score.append(res['_score'])
+                doc_indices.append(res['_id'])
+
+        return doc_score, doc_indices, responses
+
+    def get_relevant_doc_bulk(self, queries: List, k: Optional[int] = 10) -> Tuple[List, List, List]:
+        with timer("query ex search"):
+            doc_scores = []
+            doc_indices = []
+            doc_responses = []
+
+            for query in queries:
+                doc_score = []
+                doc_index = []
+                responses = self.elastic_client.search('wiki_docs', question=query, topk=k)
+
+
+                for res in responses:
+                    doc_score.append(res['_score'])
+                    doc_index.append(res['_id'])
+
+                doc_scores.append(doc_score)
+                doc_indices.append(doc_index)
+                doc_responses.append(responses)
+
+        return doc_scores, doc_indices, doc_responses
+
+
 if __name__ == "__main__":
     import argparse
 
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("--dataset_name", metavar="./data/train_dataset", type=str, help="")
+    parser.add_argument(
+        "--dataset_name", default="../data/train_dataset", type=str, help=""
+    )
     parser.add_argument(
         "--model_name_or_path",
-        metavar="bert-base-multilingual-cased",
+        default="bert-base-multilingual-cased",
         type=str,
         help="",
     )
-    parser.add_argument("--data_path", metavar="./data", type=str, help="")
-    parser.add_argument("--context_path", metavar="wikipedia_documents", type=str, help="")
-    parser.add_argument("--use_faiss", metavar=False, type=bool, help="")
+    parser.add_argument("--data_path", default="../data", type=str, help="")
+    parser.add_argument(
+        "--context_path", default="../data/wikipedia_documents", type=str, help=""
+    )
+    parser.add_argument("--use_faiss", default=False, type=bool, help="")
 
     args = parser.parse_args()
 
     # Test sparse
+    print(args.dataset_name)
     org_dataset = load_from_disk(args.dataset_name)
     full_ds = concatenate_datasets(
         [
@@ -530,18 +628,7 @@ def get_relevant_doc_bulk(self, queries: List, k: Optional[int] = 1) -> Tuple[Li
     print("*" * 40, "query dataset", "*" * 40)
     print(full_ds)
 
-    from transformers import AutoTokenizer
-
-    tokenizer = AutoTokenizer.from_pretrained(
-        args.model_name_or_path,
-        use_fast=False,
-    )
-
-    retriever = SparseRetrieval(
-        tokenize_fn=tokenizer.tokenize,
-        data_path=args.data_path,
-        context_path=args.context_path,
-    )
+    retriever = ElasticRetrieval(host='localhost', port='9200')
 
     query = "대통령을 포함한 미국의 행정부 견제권을 갖는 국가 기관은?"
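For reference, a minimal sketch (not part of the patch) of how the pieces above are meant to fit together: ElasticObject builds and fills the "wiki_docs" index, and ElasticRetrieval queries it the way inference.py does when retrieval_choice is "elastic". It assumes an Elasticsearch node is reachable at localhost:9200, that settings.json sits next to es_retrieval.py, and that the default data paths from the argparse block exist.

from datasets import load_from_disk

from es_retrieval import ElasticObject
from retrieval import ElasticRetrieval

# One-time index build with the helpers defined in es_retrieval.py:
# recreate the "wiki_docs" index and insert the wikipedia dump.
es = ElasticObject("localhost:9200")
es.init_index("wiki_docs")  # delete the index if it already exists, then create it from ./settings.json
es.insert_data("wiki_docs", data_path="../data/wikipedia_documents.json")
print(es.document_count("wiki_docs"))

# Query side, as wired up in inference.py when data_args.retrieval_choice == "elastic".
retriever = ElasticRetrieval(host="localhost", port="9200")
retriever.get_sparse_embedding()  # only lists the available indices

# A single question returns (scores, doc ids); a Dataset returns a pd.DataFrame
# whose "context" column joins the top-k retrieved passages.
retriever.retrieve("소백산맥의 동남부에 위치한 지역은?", topk=5)
valid = load_from_disk("../data/train_dataset")["validation"]
df = retriever.retrieve(valid, topk=10)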
diff --git a/train.py b/train.py index e460a8a..dddc750 100644 --- a/train.py +++ b/train.py @@ -1,366 +1,366 @@ -import logging -import os -import sys -import pandas as pd -import json -import wandb -from typing import NoReturn - -from arguments import DataTrainingArguments, ModelArguments -from datasets import DatasetDict, load_from_disk, load_metric, Dataset -from trainer_qa import QuestionAnsweringTrainer -from transformers import ( - AutoConfig, - AutoModelForQuestionAnswering, - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - HfArgumentParser, - TrainingArguments, - set_seed, -) -from utils_qa import check_no_error, postprocess_qa_predictions - -logger = logging.getLogger(__name__) - - -def main(): - # 가능한 arguments 들은 ./arguments.py 나 transformer package 안의 src/transformers/training_args.py 에서 확인 가능합니다. - # --help flag 를 실행시켜서 확인할 수 도 있습니다. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - print(model_args.model_name_or_path) - - # [참고] argument를 manual하게 수정하고 싶은 경우에 아래와 같은 방식을 사용할 수 있습니다 - # training_args.per_device_train_batch_size = 16 - # print(training_args.per_device_train_batch_size) - # training_args.num_train_epochs=53 - # training_args.learning_rate=9e-6 - # print("learning_rate: ",training_args.learning_rate) - - print(f"model is from {model_args.model_name_or_path}") - print(f"data is from {data_args.dataset_name}") - - # logging 설정 - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - # verbosity 설정 : Transformers logger의 정보로 사용합니다 (on main process only) - logger.info("Training/evaluation parameters %s", training_args) - - # 모델을 초기화하기 전에 난수를 고정합니다. - set_seed(training_args.seed) - datasets = load_from_disk(data_args.dataset_name) - print(datasets) - - if model_args.config_name is not None: - print("modelname!: ", model_args.config_name) - else: - print("model!!", model_args.model_name_or_path) - # AutoConfig를 이용하여 pretrained model 과 tokenizer를 불러옵니다. - # argument로 원하는 모델 이름을 설정하면 옵션을 바꿀 수 있습니다. - config = AutoConfig.from_pretrained( - model_args.config_name - if model_args.config_name is not None - else model_args.model_name_or_path, - ) - - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name - if model_args.tokenizer_name is not None - else model_args.model_name_or_path, - # 'use_fast' argument를 True로 설정할 경우 rust로 구현된 tokenizer를 사용할 수 있습니다. - # False로 설정할 경우 python으로 구현된 tokenizer를 사용할 수 있으며, - # rust version이 비교적 속도가 빠릅니다. 
- # use_fast=True, - ) - model = AutoModelForQuestionAnswering.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - ) - - print( - type(training_args), - type(model_args), - type(datasets), - type(tokenizer), - type(model), - ) - - wandb.watch(model) - # do_train mrc model 혹은 do_eval mrc model - if training_args.do_train or training_args.do_eval: - run_mrc(data_args, training_args, model_args, datasets, tokenizer, model) - - -def run_mrc( - data_args: DataTrainingArguments, - training_args: TrainingArguments, - model_args: ModelArguments, - datasets: DatasetDict, - tokenizer, - model, -) -> NoReturn: - training_args.save_total_limit = 3 - # training_args.eval_steps=500 - # training_args.evaluation_strategy='steps' - # training_args.load_best_model_at_end = True - # training_args.metric_for_best_model='em' - training_args.report_to = ["wandb"] - - # dataset을 전처리합니다. - # training과 evaluation에서 사용되는 전처리는 아주 조금 다른 형태를 가집니다. - if training_args.do_train: - column_names = datasets["train"].column_names - else: - column_names = datasets["validation"].column_names - - question_column_name = "question" if "question" in column_names else column_names[0] - context_column_name = "context" if "context" in column_names else column_names[1] - answer_column_name = "answers" if "answers" in column_names else column_names[2] - - # Padding에 대한 옵션을 설정합니다. - # (question|context) 혹은 (context|question)로 세팅 가능합니다. - pad_on_right = tokenizer.padding_side == "right" - - # 오류가 있는지 확인합니다. - last_checkpoint, max_seq_length = check_no_error(data_args, training_args, datasets, tokenizer) - - # Train preprocessing / 전처리를 진행합니다. - def prepare_train_features(examples): - # truncation과 padding(length가 짧을때만)을 통해 toknization을 진행하며, stride를 이용하여 overflow를 유지합니다. - # 각 example들은 이전의 context와 조금씩 겹치게됩니다. - tokenized_examples = tokenizer( - examples[question_column_name if pad_on_right else context_column_name], - examples[context_column_name if pad_on_right else question_column_name], - truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=data_args.doc_stride, - return_overflowing_tokens=True, - return_offsets_mapping=True, - return_token_type_ids=not model_args.is_roberta, # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다. - padding="max_length" if data_args.pad_to_max_length else False, - ) - - # 길이가 긴 context가 등장할 경우 truncate를 진행해야하므로, 해당 데이터셋을 찾을 수 있도록 mapping 가능한 값이 필요합니다. - sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") - # token의 캐릭터 단위 position를 찾을 수 있도록 offset mapping을 사용합니다. - # start_positions과 end_positions을 찾는데 도움을 줄 수 있습니다. - offset_mapping = tokenized_examples.pop("offset_mapping") - - # 데이터셋에 "start position", "enc position" label을 부여합니다. - tokenized_examples["start_positions"] = [] - tokenized_examples["end_positions"] = [] - - for i, offsets in enumerate(offset_mapping): - input_ids = tokenized_examples["input_ids"][i] - cls_index = input_ids.index(tokenizer.cls_token_id) # cls index - - # sequence id를 설정합니다 (to know what is the context and what is the question). - sequence_ids = tokenized_examples.sequence_ids(i) - - # 하나의 example이 여러개의 span을 가질 수 있습니다. - sample_index = sample_mapping[i] - answers = examples[answer_column_name][sample_index] - - # answer가 없을 경우 cls_index를 answer로 설정합니다(== example에서 정답이 없는 경우 존재할 수 있음). 
- if len(answers["answer_start"]) == 0: - tokenized_examples["start_positions"].append(cls_index) - tokenized_examples["end_positions"].append(cls_index) - else: - # text에서 정답의 Start/end character index - start_char = answers["answer_start"][0] - end_char = start_char + len(answers["text"][0]) - - # text에서 current span의 Start token index - token_start_index = 0 - while sequence_ids[token_start_index] != (1 if pad_on_right else 0): - token_start_index += 1 - - # text에서 current span의 End token index - token_end_index = len(input_ids) - 1 - while sequence_ids[token_end_index] != (1 if pad_on_right else 0): - token_end_index -= 1 - - # 정답이 span을 벗어났는지 확인합니다(정답이 없는 경우 CLS index로 label되어있음). - if not ( - offsets[token_start_index][0] <= start_char - and offsets[token_end_index][1] >= end_char - ): - tokenized_examples["start_positions"].append(cls_index) - tokenized_examples["end_positions"].append(cls_index) - else: - # token_start_index 및 token_end_index를 answer의 끝으로 이동합니다. - # Note: answer가 마지막 단어인 경우 last offset을 따라갈 수 있습니다(edge case). - while ( - token_start_index < len(offsets) - and offsets[token_start_index][0] <= start_char - ): - token_start_index += 1 - tokenized_examples["start_positions"].append(token_start_index - 1) - while offsets[token_end_index][1] >= end_char: - token_end_index -= 1 - tokenized_examples["end_positions"].append(token_end_index + 1) - - return tokenized_examples - - if training_args.do_train: - if "train" not in datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = datasets["train"] - - # dataset에서 train feature를 생성합니다. - train_dataset = train_dataset.map( - prepare_train_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - ) - - # Validation preprocessing - def prepare_validation_features(examples): - # truncation과 padding(length가 짧을때만)을 통해 toknization을 진행하며, stride를 이용하여 overflow를 유지합니다. - # 각 example들은 이전의 context와 조금씩 겹치게됩니다. - tokenized_examples = tokenizer( - examples[question_column_name if pad_on_right else context_column_name], - examples[context_column_name if pad_on_right else question_column_name], - truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=data_args.doc_stride, - return_overflowing_tokens=True, - return_offsets_mapping=True, - return_token_type_ids=not model_args.is_roberta, # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다. - padding="max_length" if data_args.pad_to_max_length else False, - ) - - # 길이가 긴 context가 등장할 경우 truncate를 진행해야하므로, 해당 데이터셋을 찾을 수 있도록 mapping 가능한 값이 필요합니다. - sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") - - # evaluation을 위해, prediction을 context의 substring으로 변환해야합니다. - # corresponding example_id를 유지하고 offset mappings을 저장해야합니다. - tokenized_examples["example_id"] = [] - - for i in range(len(tokenized_examples["input_ids"])): - # sequence id를 설정합니다 (to know what is the context and what is the question). - sequence_ids = tokenized_examples.sequence_ids(i) - context_index = 1 if pad_on_right else 0 - - # 하나의 example이 여러개의 span을 가질 수 있습니다. - sample_index = sample_mapping[i] - tokenized_examples["example_id"].append(examples["id"][sample_index]) - - # Set to None the offset_mapping을 None으로 설정해서 token position이 context의 일부인지 쉽게 판별 할 수 있습니다. 
- tokenized_examples["offset_mapping"][i] = [ - (o if sequence_ids[k] == context_index else None) - for k, o in enumerate(tokenized_examples["offset_mapping"][i]) - ] - return tokenized_examples - - if training_args.do_eval: - eval_dataset = datasets["validation"] - - # Validation Feature 생성 - eval_dataset = eval_dataset.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - ) - - # Data collator - # flag가 True이면 이미 max length로 padding된 상태입니다. - # 그렇지 않다면 data collator에서 padding을 진행해야합니다. - data_collator = DataCollatorWithPadding( - tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None - ) - - # Post-processing: - def post_processing_function(examples, features, predictions, training_args): - # Post-processing: start logits과 end logits을 original context의 정답과 match시킵니다. - predictions = postprocess_qa_predictions( - examples=examples, - features=features, - predictions=predictions, - max_answer_length=data_args.max_answer_length, - output_dir=training_args.output_dir, - ) - # Metric을 구할 수 있도록 Format을 맞춰줍니다. - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] - if training_args.do_predict: - return formatted_predictions - - elif training_args.do_eval: - references = [ - {"id": ex["id"], "answers": ex[answer_column_name]} for ex in datasets["validation"] - ] - return EvalPrediction(predictions=formatted_predictions, label_ids=references) - - metric = load_metric("squad") - - def compute_metrics(p: EvalPrediction): - return metric.compute(predictions=p.predictions, references=p.label_ids) - - # Trainer 초기화 - trainer = QuestionAnsweringTrainer( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - eval_examples=datasets["validation"] if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - post_process_function=post_processing_function, - compute_metrics=compute_metrics, - ) - - # Training - if training_args.do_train: - if last_checkpoint is not None: - checkpoint = last_checkpoint - elif os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - metrics["train_samples"] = len(train_dataset) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - output_train_file = os.path.join(training_args.output_dir, "train_results.txt") - - with open(output_train_file, "w") as writer: - logger.info("***** Train results *****") - for key, value in sorted(train_result.metrics.items()): - logger.info(f" {key} = {value}") - writer.write(f"{key} = {value}\n") - - # State 저장 - trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json")) - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - - metrics["eval_samples"] = len(eval_dataset) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - -if __name__ == "__main__": - # TODO - wandb.init(project="YOUR_PROJECT", name="pretrain_more", entity="nlp-08-mrc") - main() +import logging +import os +import sys +import pandas as pd +import json +import 
wandb +from typing import NoReturn + +from arguments import DataTrainingArguments, ModelArguments +from datasets import DatasetDict, load_from_disk, load_metric, Dataset +from trainer_qa import QuestionAnsweringTrainer +from transformers import ( + AutoConfig, + AutoModelForQuestionAnswering, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + HfArgumentParser, + TrainingArguments, + set_seed, +) +from utils_qa import check_no_error, postprocess_qa_predictions + +logger = logging.getLogger(__name__) + + +def main(): + # 가능한 arguments 들은 ./arguments.py 나 transformer package 안의 src/transformers/training_args.py 에서 확인 가능합니다. + # --help flag 를 실행시켜서 확인할 수 도 있습니다. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + print(model_args.model_name_or_path) + + # [참고] argument를 manual하게 수정하고 싶은 경우에 아래와 같은 방식을 사용할 수 있습니다 + # training_args.per_device_train_batch_size = 16 + # print(training_args.per_device_train_batch_size) + # training_args.num_train_epochs=53 + # training_args.learning_rate=9e-6 + # print("learning_rate: ",training_args.learning_rate) + + print(f"model is from {model_args.model_name_or_path}") + print(f"data is from {data_args.dataset_name}") + + # logging 설정 + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + # verbosity 설정 : Transformers logger의 정보로 사용합니다 (on main process only) + logger.info("Training/evaluation parameters %s", training_args) + + # 모델을 초기화하기 전에 난수를 고정합니다. + set_seed(training_args.seed) + datasets = load_from_disk(data_args.dataset_name) + print(datasets) + + if model_args.config_name is not None: + print("modelname!: ", model_args.config_name) + else: + print("model!!", model_args.model_name_or_path) + # AutoConfig를 이용하여 pretrained model 과 tokenizer를 불러옵니다. + # argument로 원하는 모델 이름을 설정하면 옵션을 바꿀 수 있습니다. + config = AutoConfig.from_pretrained( + model_args.config_name + if model_args.config_name is not None + else model_args.model_name_or_path, + ) + + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name + if model_args.tokenizer_name is not None + else model_args.model_name_or_path, + # 'use_fast' argument를 True로 설정할 경우 rust로 구현된 tokenizer를 사용할 수 있습니다. + # False로 설정할 경우 python으로 구현된 tokenizer를 사용할 수 있으며, + # rust version이 비교적 속도가 빠릅니다. + # use_fast=True, + ) + model = AutoModelForQuestionAnswering.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + ) + + print( + type(training_args), + type(model_args), + type(datasets), + type(tokenizer), + type(model), + ) + + wandb.watch(model) + # do_train mrc model 혹은 do_eval mrc model + if training_args.do_train or training_args.do_eval: + run_mrc(data_args, training_args, model_args, datasets, tokenizer, model) + + +def run_mrc( + data_args: DataTrainingArguments, + training_args: TrainingArguments, + model_args: ModelArguments, + datasets: DatasetDict, + tokenizer, + model, +) -> NoReturn: + training_args.save_total_limit = 3 + # training_args.eval_steps=500 + # training_args.evaluation_strategy='steps' + # training_args.load_best_model_at_end = True + # training_args.metric_for_best_model='em' + training_args.report_to = ["wandb"] + + # dataset을 전처리합니다. + # training과 evaluation에서 사용되는 전처리는 아주 조금 다른 형태를 가집니다. 
+ if training_args.do_train: + column_names = datasets["train"].column_names + else: + column_names = datasets["validation"].column_names + + question_column_name = "question" if "question" in column_names else column_names[0] + context_column_name = "context" if "context" in column_names else column_names[1] + answer_column_name = "answers" if "answers" in column_names else column_names[2] + + # Padding에 대한 옵션을 설정합니다. + # (question|context) 혹은 (context|question)로 세팅 가능합니다. + pad_on_right = tokenizer.padding_side == "right" + + # 오류가 있는지 확인합니다. + last_checkpoint, max_seq_length = check_no_error(data_args, training_args, datasets, tokenizer) + + # Train preprocessing / 전처리를 진행합니다. + def prepare_train_features(examples): + # truncation과 padding(length가 짧을때만)을 통해 toknization을 진행하며, stride를 이용하여 overflow를 유지합니다. + # 각 example들은 이전의 context와 조금씩 겹치게됩니다. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_token_type_ids=not model_args.is_roberta, # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다. + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # 길이가 긴 context가 등장할 경우 truncate를 진행해야하므로, 해당 데이터셋을 찾을 수 있도록 mapping 가능한 값이 필요합니다. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # token의 캐릭터 단위 position를 찾을 수 있도록 offset mapping을 사용합니다. + # start_positions과 end_positions을 찾는데 도움을 줄 수 있습니다. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # 데이터셋에 "start position", "enc position" label을 부여합니다. + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(tokenizer.cls_token_id) # cls index + + # sequence id를 설정합니다 (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # 하나의 example이 여러개의 span을 가질 수 있습니다. + sample_index = sample_mapping[i] + answers = examples[answer_column_name][sample_index] + + # answer가 없을 경우 cls_index를 answer로 설정합니다(== example에서 정답이 없는 경우 존재할 수 있음). + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # text에서 정답의 Start/end character index + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # text에서 current span의 Start token index + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # text에서 current span의 End token index + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # 정답이 span을 벗어났는지 확인합니다(정답이 없는 경우 CLS index로 label되어있음). + if not ( + offsets[token_start_index][0] <= start_char + and offsets[token_end_index][1] >= end_char + ): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # token_start_index 및 token_end_index를 answer의 끝으로 이동합니다. + # Note: answer가 마지막 단어인 경우 last offset을 따라갈 수 있습니다(edge case). 
+ while ( + token_start_index < len(offsets) + and offsets[token_start_index][0] <= start_char + ): + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + + if training_args.do_train: + if "train" not in datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = datasets["train"] + + # dataset에서 train feature를 생성합니다. + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + + # Validation preprocessing + def prepare_validation_features(examples): + # truncation과 padding(length가 짧을때만)을 통해 toknization을 진행하며, stride를 이용하여 overflow를 유지합니다. + # 각 example들은 이전의 context와 조금씩 겹치게됩니다. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_token_type_ids=not model_args.is_roberta, # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다. + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # 길이가 긴 context가 등장할 경우 truncate를 진행해야하므로, 해당 데이터셋을 찾을 수 있도록 mapping 가능한 값이 필요합니다. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # evaluation을 위해, prediction을 context의 substring으로 변환해야합니다. + # corresponding example_id를 유지하고 offset mappings을 저장해야합니다. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # sequence id를 설정합니다 (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # 하나의 example이 여러개의 span을 가질 수 있습니다. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping을 None으로 설정해서 token position이 context의 일부인지 쉽게 판별 할 수 있습니다. + tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + return tokenized_examples + + if training_args.do_eval: + eval_dataset = datasets["validation"] + + # Validation Feature 생성 + eval_dataset = eval_dataset.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + + # Data collator + # flag가 True이면 이미 max length로 padding된 상태입니다. + # 그렇지 않다면 data collator에서 padding을 진행해야합니다. + data_collator = DataCollatorWithPadding( + tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None + ) + + # Post-processing: + def post_processing_function(examples, features, predictions, training_args): + # Post-processing: start logits과 end logits을 original context의 정답과 match시킵니다. + predictions = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + max_answer_length=data_args.max_answer_length, + output_dir=training_args.output_dir, + ) + # Metric을 구할 수 있도록 Format을 맞춰줍니다. 
+ formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + if training_args.do_predict: + return formatted_predictions + + elif training_args.do_eval: + references = [ + {"id": ex["id"], "answers": ex[answer_column_name]} for ex in datasets["validation"] + ] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + + metric = load_metric("squad") + + def compute_metrics(p: EvalPrediction): + return metric.compute(predictions=p.predictions, references=p.label_ids) + + # Trainer 초기화 + trainer = QuestionAnsweringTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + eval_examples=datasets["validation"] if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + post_process_function=post_processing_function, + compute_metrics=compute_metrics, + ) + + # Training + if training_args.do_train: + if last_checkpoint is not None: + checkpoint = last_checkpoint + elif os.path.isdir(model_args.model_name_or_path): + checkpoint = model_args.model_name_or_path + else: + checkpoint = None + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + metrics["train_samples"] = len(train_dataset) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + output_train_file = os.path.join(training_args.output_dir, "train_results.txt") + + with open(output_train_file, "w") as writer: + logger.info("***** Train results *****") + for key, value in sorted(train_result.metrics.items()): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # State 저장 + trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json")) + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + + metrics["eval_samples"] = len(eval_dataset) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + +if __name__ == "__main__": + # TODO + wandb.init(project="YOUR_PROJECT", name="pretrain_more", entity="nlp-08-mrc") + main()
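For reference, a minimal, self-contained sketch (not part of the patch) of the character-to-token span mapping performed by prepare_train_features in train.py above, using the tokenizer that arguments.py now defaults to. The toy question/context and the 64-token limit are illustrative assumptions, and the out-of-span/CLS fallback from train.py is omitted for brevity.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

question = "Where does the old tower stand?"
context = "The old tower stands in Paris, near the river."
answer_text = "Paris"
answer_start = context.index(answer_text)     # character-level start of the answer
answer_end = answer_start + len(answer_text)  # character-level end (exclusive)

enc = tokenizer(
    question,
    context,
    truncation="only_second",
    max_length=64,
    return_offsets_mapping=True,
)
offsets = enc["offset_mapping"]
sequence_ids = enc.sequence_ids(0)

# Walk in from both ends of the context segment (sequence id 1), as the train.py loop does.
token_start = 0
while sequence_ids[token_start] != 1:
    token_start += 1
token_end = len(enc["input_ids"]) - 1
while sequence_ids[token_end] != 1:
    token_end -= 1

# Shrink the window onto the answer span via the character offsets.
while token_start < len(offsets) and offsets[token_start][0] <= answer_start:
    token_start += 1
start_position = token_start - 1
while offsets[token_end][1] >= answer_end:
    token_end -= 1
end_position = token_end + 1

print(start_position, end_position)
print(tokenizer.decode(enc["input_ids"][start_position : end_position + 1]))  # roughly "Paris"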