Skip to content

Commit

Permalink
Merge pull request #20 from boostcampaitech4lv23nlp2/develop_siryuon
Browse files Browse the repository at this point in the history
Develop siryuon
  • Loading branch information
coderJoon authored Jan 6, 2023
2 parents 5abd94f + 9bfec56 commit 94954e2
Show file tree
Hide file tree
Showing 6 changed files with 912 additions and 366 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ data.tar.gz
.ipynb_checkpoints
__pycache__
outputs
*.json
elastic_setting.py
elastic_test.py
*.csv
*.zip
*.sh
wandb
pretrain
moretrain
Expand Down
29 changes: 29 additions & 0 deletions kfold_ensemble_hard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import json

# Load the n-best predictions written for each of the five fold models.
# The files are opened explicitly as UTF-8 so that non-ASCII answer text is
# decoded the same way on every platform instead of relying on the locale's
# default encoding.
_NBEST_PATH = './outputs/test_dataset_fold{}/nbest_predictions.json'

_fold_predictions = []
for _fold_no in range(1, 6):
    with open(_NBEST_PATH.format(_fold_no), encoding='utf-8') as f:
        _fold_predictions.append(json.load(f))

# Keep the per-fold names that the rest of the script refers to.
fold1, fold2, fold3, fold4, fold5 = _fold_predictions

def most_frequent(data):
    """Return the item that occurs most often in ``data``.

    Each item is ranked by ``data.count(item)``. When several items share the
    highest count, the one appearing earliest in ``data`` is returned, because
    ``max`` keeps the first maximal element. An empty ``data`` makes ``max``
    raise ``ValueError``.
    """
    occurrences = data.count
    return max(data, key=occurrences)

# Hard-voting ensemble: for every question id found in fold 1, take the
# top-ranked answer text (index 0 of the n-best list) from each fold and keep
# the text chosen by the most folds.
# NOTE(review): every id in fold1 is assumed to exist in the other folds with a
# non-empty n-best list; a missing id raises KeyError.
mrc_id = list(fold1)

data = {}

for _id in mrc_id:
    tmp = [fold[_id][0]['text'] for fold in (fold1, fold2, fold3, fold4, fold5)]
    data[_id] = most_frequent(tmp)

file_path = './kfold_ensemble_predictions.json'

# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding rather than the locale default.
with open(file_path, 'w', encoding='utf-8') as out:
    json.dump(data, out, indent=4, ensure_ascii=False)
41 changes: 41 additions & 0 deletions kfold_ensemble_soft.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import json
from collections import defaultdict

# Load the n-best predictions written for each of the five "pretrain" fold
# models. The files are opened explicitly as UTF-8 so that non-ASCII answer
# text is decoded the same way on every platform instead of relying on the
# locale's default encoding.
_NBEST_PATH = './outputs/test_dataset_fold{}_pretrain/nbest_predictions.json'

_fold_predictions = []
for _fold_no in range(1, 6):
    with open(_NBEST_PATH.format(_fold_no), encoding='utf-8') as f:
        _fold_predictions.append(json.load(f))

# Keep the per-fold names that the rest of the script refers to.
fold1, fold2, fold3, fold4, fold5 = _fold_predictions

def most_frequent(data):
    """Return the item of ``data`` with the highest occurrence count.

    Items are compared by ``data.count(item)``; on a tie the earliest such
    item in ``data`` wins, since ``max`` returns the first maximal element.
    Raises ``ValueError`` (from ``max``) when ``data`` is empty.
    """
    count_of = data.count
    return max(data, key=count_of)

# Soft-voting ensemble: for every question id found in fold 1, sum the
# 'probability' of each candidate answer text over all n-best entries of all
# folds, then keep the text with the largest total.
# NOTE(review): every id in fold1 is assumed to exist in the other folds; a
# missing id raises KeyError.
mrc_id = list(fold1)

foldList = [fold1, fold2, fold3, fold4, fold5]

output = {}
for _id in mrc_id:
    # A float factory lets unseen texts start at 0.0, so no try/except is
    # needed and genuine errors (e.g. a missing 'probability' key) surface
    # instead of being swallowed.
    dic = defaultdict(float)
    for fold in foldList:
        for d in fold[_id]:
            dic[d['text']] += d['probability']

    # max() returns the first entry with the highest total, which is the same
    # winner a stable descending sort would place first.
    answer = max(dic.items(), key=lambda item: item[1])[0]

    output[_id] = answer

file_path = './kfold_pretrain_ensemble_predictions_soft.json'

# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding rather than the locale default.
with open(file_path, 'w', encoding='utf-8') as out:
    json.dump(output, out, indent=4, ensure_ascii=False)
77 changes: 77 additions & 0 deletions make_folds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import warnings
import argparse
import pandas as pd

from datasets import (
load_from_disk,
concatenate_datasets,
)

from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings(action="ignore")


def main(args):
    """Write the validation part of each stratified fold to a CSV file.

    The "train" and "validation" splits stored under ``../data/train_dataset/``
    are concatenated, copied into a DataFrame, and split with
    ``StratifiedKFold`` using ``len(context)`` of each example as the
    stratification label. For fold ``k`` (1-based) the rows selected as that
    fold's validation indices are saved to ``<output_dir>/fold<k>_test.csv``.

    Args:
        args: Parsed command-line namespace providing ``num_folds`` (number of
            splits) and ``output_dir`` (directory for the CSV files). The
            directory is created if it does not exist; an empty value is
            treated as the current directory.
    """
    # Imported locally so this function does not depend on a file-level import.
    import os

    org_dataset = load_from_disk("../data/train_dataset/")
    full_ds = concatenate_datasets(
        [
            org_dataset["train"].flatten_indices(),
            org_dataset["validation"].flatten_indices(),
        ]
    )

    # Output column name -> field name in the dataset examples.
    column_sources = {
        "id": "id",
        "doc_id": "document_id",
        "title": "title",
        "context": "context",
        "question": "question",
        "answers": "answers",
    }

    rows = list(full_ds)
    train_dict = {
        column: [row[source] for row in rows]
        for column, source in column_sources.items()
    }
    # Length of each context; used below as the stratification label.
    train_dict["context_len"] = [len(row["context"]) for row in rows]

    train_df = pd.DataFrame(train_dict)

    kfold = StratifiedKFold(n_splits=args.num_folds, shuffle=True, random_state=42)
    folds = kfold.split(train_df, train_df["context_len"].values)

    # Make sure the target directory exists before the first write, so a
    # missing directory does not abort the run at to_csv().
    output_dir = args.output_dir or "."
    os.makedirs(output_dir, exist_ok=True)

    for fold_no, (_, val_idx) in enumerate(folds, start=1):
        csv_path = os.path.join(output_dir, "fold{}_test.csv".format(fold_no))
        train_df.iloc[val_idx].to_csv(csv_path, index=False)


if __name__ == "__main__":
    # Command-line entry point: parse the fold count and the output
    # directory, then hand the parsed namespace to main().
    cli = argparse.ArgumentParser()
    cli.add_argument("--num_folds", type=int, default=5)
    cli.add_argument("--output_dir", type=str, default=".")

    main(cli.parse_args())
Loading

0 comments on commit 94954e2

Please sign in to comment.