# experiments.py
from sklearn.model_selection import train_test_split
import pandas as pd
from absl import flags, logging, app
import numpy as np
import classifiers
from sklearn.metrics import roc_auc_score
from scipy.stats import sem
import tensorflow as tf
import os, sys
import json
import datetime
# Following is a dependency on the ssig package:
#! git clone https://github.com/ipavlopoulos/ssig.git
from ssig import ci
FLAGS = flags.FLAGS
flags.DEFINE_string("model_name", None, "name:OOC' for Out Of Context architecture (or the respective context-aware schema).") # name , default, help
flags.DEFINE_integer("with_context_data", 0, "False for context-less training data.") # name , default, help
flags.DEFINE_integer("oversample", 1, "Oversample the positive class, e.g., 99/1 (enter 99)")
flags.DEFINE_integer("repeat", 0, "Repetitions of the experiment. Default is 0.")
flags.DEFINE_integer("at_split", 0, "Operate on specific split. Default is 0.")
flags.DEFINE_integer("epochs", 100, "Epochs. Default is 100.")
flags.DEFINE_integer("confidence_intervals", 1, "Show Confidence Intervals along with AUROCC") # name , default, help
flags.DEFINE_integer("create_random_splits", 0, "Create random splits. Default number is 0, which means: 'do not split'.")
flags.DEFINE_integer("patience", 3, "Waiting epochs for the best performance. Default is 10.") # name , default, help
flags.DEFINE_integer("seed", 42, "The seed to initialise the random state. Default is 42.")
flags.DEFINE_string("experiment_version_name", f"version-{datetime.datetime.now().strftime('%d%B%Y-%H%M')}", "The name of the splits directory. Default is 'standard_ten'.")
def evaluate_perspective(dataset_path="data/CAT_LARGE", splits=10):
    """Compute the AUROC of the Perspective API scores (the `api` column) on each split's in-context validation set."""
    scores = []
    for i in range(splits):
        ic = pd.read_csv(f"{dataset_path}/{i}/ic.val.csv")
        scores.append(roc_auc_score(ic.label, ic.api))
    return scores
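# A minimal usage sketch for the helper above (assumes the per-split ic.val.csv files already exist):
#   perspective_scores = evaluate_perspective()
#   print(f"Perspective AUROC: {np.mean(perspective_scores):.3f} ± {sem(perspective_scores):.3f}")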
def split_to_random_sets(splits=10, test_size=0.2):
    """
    Split the datasets into random train/dev/val sets (Monte Carlo Cross Validation).
    :param splits: number of random splits to create.
    :param test_size: fraction of the data held out for validation (the dev set gets the same number of rows).
    :return: None; the splits are written under data/CAT_LARGE/MCCV.
    """
    path_name = "data/CAT_LARGE/MCCV"
    if os.path.exists(path_name):
        sys.exit(f"ERROR: {path_name} already exists.")
    os.makedirs(path_name)
    for split_num in range(splits):
        os.makedirs(f"{path_name}/{split_num}")
        for setting in ("gc", "gn"):
            data_pd = pd.read_csv(f"data/CAT_LARGE/{setting}.csv")
            train_pd, val_pd = train_test_split(data_pd, test_size=test_size, random_state=FLAGS.seed + split_num)
            train_pd, dev_pd = train_test_split(train_pd, test_size=val_pd.shape[0], random_state=FLAGS.seed + split_num)
            train_pd.to_csv(f"{path_name}/{split_num}/{setting}.train.csv", index=False)
            dev_pd.to_csv(f"{path_name}/{split_num}/{setting}.dev.csv", index=False)
            val_pd.to_csv(f"{path_name}/{split_num}/{setting}.val.csv", index=False)
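# Illustrative layout produced by split_to_random_sets (e.g., with splits=10):
#   data/CAT_LARGE/MCCV/<0..9>/gc.{train,dev,val}.csv  # context-aware variant ('c', cf. ctx_id in train())
#   data/CAT_LARGE/MCCV/<0..9>/gn.{train,dev,val}.csv  # context-less variant ('n')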
def train(with_context, verbose=1, splits_path="data/CAT_LARGE/MCCV", the_split_to_use=9):
    """Train the classifier selected by FLAGS.model_name on one MCCV split and evaluate it on that split's (context-aware) validation set."""
    print(f"Loading the data: Using the '{splits_path}/{the_split_to_use}' split.")
    ctx_id = 'c' if with_context > 0 else 'n'
    print(f"Operating w/{'' if with_context > 0 else 'o'} context")
    train_pd = pd.read_csv(f"{splits_path}/{the_split_to_use}/g{ctx_id}.train.csv")
    dev_pd = pd.read_csv(f"{splits_path}/{the_split_to_use}/g{ctx_id}.dev.csv")
    val_pd = pd.read_csv(f"{splits_path}/{the_split_to_use}/gc.val.csv")
    print("Loading the embeddings...")
    class_weights = {0: 1, 1: FLAGS.oversample}
    embeddings = classifiers.load_embeddings_index()
    print("Creating the model...")
    with tf.compat.v1.Session() as sess:
        if FLAGS.model_name == "RNN:OOC":
            model = classifiers.LSTM_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs)
        else:
            if FLAGS.model_name == "RNN:INC1":
                model = classifiers.LSTM_IC1_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs, patience=FLAGS.patience)
            elif FLAGS.model_name == "RNN:INC2":
                model = classifiers.LSTM_IC2_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs, patience=FLAGS.patience)
            elif "RNN" in FLAGS.model_name:
                sys.exit("ERROR: Not implemented yet...")
            else:
                if "BERT" in FLAGS.model_name:
                    os.environ['TFHUB_CACHE_DIR'] = 'embeddings'
                    lr = 2e-05
                    if FLAGS.model_name == "BERT:OOC":
                        print("Training BERT with no context mechanism added.")
                        model = classifiers.BERT_MLP(patience=FLAGS.patience, lr=lr, epochs=FLAGS.epochs, session=sess)
                    elif FLAGS.model_name == "BERT:INC1":
                        print("Training BERT with the parent concatenated to the text.")
                        model = classifiers.BERT_MLP(patience=FLAGS.patience, lr=lr, DATA2_COLUMN="parent", epochs=FLAGS.epochs, session=sess)
                    elif FLAGS.model_name == "BERT:INC2":
                        print("Training BERT with a context-reading mechanism added.")
                        model = classifiers.BERT_MLP_CA(patience=FLAGS.patience, lr=lr, epochs=FLAGS.epochs, session=sess)
                    elif FLAGS.model_name == "BERT:CCTK":
                        print("Training BERT over CCTK")
                        model = classifiers.BERT_MLP(patience=FLAGS.patience, lr=lr, epochs=FLAGS.epochs, session=sess)
                        cctk = pd.read_csv("data/CCTK.csv.zip", nrows=100000)
                        x_train_pd, x_dev_pd = train_test_split(
                            pd.DataFrame({"text": cctk.comment_text, "label": cctk.target.apply(round)}),
                            test_size=0.1,
                            random_state=FLAGS.seed
                        )
                        model.fit(train=x_train_pd,
                                  dev=x_dev_pd,
                                  class_weights=class_weights,
                                  pretrained_embeddings=embeddings)
                        # Score the CCTK-trained model on each MCCV in-context validation set and store the predictions.
                        cctk_preds_pd = pd.DataFrame()
                        for i in range(10):
                            x_val_pd = pd.read_csv(f"data/CAT_LARGE/{i}/ic.val.csv")
                            gold, predictions = x_val_pd.label.to_numpy(), model.predict(x_val_pd).flatten()
                            score = roc_auc_score(gold, predictions)
                            print(f"ROC-AUC@{i}: {score}")
                            cctk_preds_pd[f"MCCV_{i}"] = predictions
                        cctk_preds_pd.to_csv("cctk.csv")
                        model.model.save_weights("bert_weights.h5")
                else:
                    sys.exit("ERROR: Not implemented yet...")
        print(f"Training {model.name}...")
        model.fit(train=train_pd, dev=dev_pd, class_weights=class_weights, pretrained_embeddings=embeddings)
        gold, predictions = val_pd.label.to_numpy(), model.predict(val_pd).flatten()
        score = roc_auc_score(gold, predictions)
        print("Evaluating...")
        print(f"ROC-AUC: {score}")
        print(f"STATS: toxicity (%) at predicted: {np.mean(predictions)} vs at gold: {np.mean(gold)}")
        if FLAGS.confidence_intervals != 0:
            score, intervals = ci.AUC(gold_truth=list(gold), predictions=list(predictions)).evaluate()
            print(f"ROC-AUC ± CIs: {score} ± {intervals}")
        return score, predictions, model
def repeat_experiment():
    """Repeat the training over all MCCV splits and aggregate the validation AUROC scores."""
    scores = []
    predictions_pd = pd.DataFrame()
    model_name = ""
    splits_path = "data/CAT_LARGE/MCCV"
    if not os.path.exists(splits_path):
        sys.exit(f"ERROR: {splits_path} does not exist! Make sure the desired dataset has been created successfully.")
    os.mkdir(FLAGS.experiment_version_name)
    for i in range(FLAGS.repeat):
        print(f"REPETITION: {i}")
        score, predictions, model = train(FLAGS.with_context_data, splits_path=splits_path, the_split_to_use=i)
        scores.append(score)
        predictions_pd[f"split{i}"] = predictions
        model_name = model.name
    return np.mean(scores), sem(scores), predictions_pd, model_name  # the last model name used; the same for all runs
def model_train(at_split):
    """Train and evaluate the selected model on a single MCCV split."""
    splits_path = "data/CAT_LARGE/MCCV"
    score, predictions, model = train(FLAGS.with_context_data,
                                      splits_path=splits_path,
                                      the_split_to_use=at_split)
    # model.save()  # TODO: fix.
    return score, predictions
def main(argv):
    if FLAGS.create_random_splits > 0:
        # Prepare the data for Monte Carlo Cross Validation
        print(f"Splitting the data randomly into {FLAGS.create_random_splits} splits")
        split_to_random_sets(splits=FLAGS.create_random_splits)
    elif FLAGS.repeat == 0:
        # Run on a single split
        score, predictions = model_train(FLAGS.at_split)
        print(f"{score}")
        pd.DataFrame(predictions).to_csv(f"{FLAGS.model_name}.predictions.csv")
    elif FLAGS.repeat > 0:
        # Perform Monte Carlo Cross Validation
        # INFO: Remember to set "repeat" to the correct number of folds
        score, score_sem, predictions_pd, model_name = repeat_experiment()
        predictions_pd.to_csv(f"{FLAGS.experiment_version_name}/{model_name}.predictions.csv")
        print(f"{score} ± {score_sem}")
if __name__ == "__main__":
    app.run(main)