-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
90 lines (73 loc) · 2.94 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import re
import os
import errno
import pandas as pd
# Make sure the output directory is present before anything is written.
# A directory that already exists is fine; every other OS error propagates.
try:
    os.makedirs("processed_data")
except OSError as mkdir_error:
    already_present = mkdir_error.errno == errno.EEXIST
    if not already_present:
        raise
def decontracted(phrase):
    """Expand common English contractions and reduce *phrase* to letters.

    The rewrite rules are applied one after another, in the order listed:
    whole-word special cases first, then generic apostrophe suffixes, then
    clean-up. After the clean-up only ASCII letters, ``$`` and single
    spaces remain; every other character has been turned into a space and
    runs of spaces have been collapsed.

    Returns the rewritten string.
    """
    rewrite_rules = (
        # Irregular forms that the generic suffix rules would get wrong.
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"gonna", "going to"),
        (r"wanna", "want to"),
        # Generic apostrophe suffixes.
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        # Clean-up: dots, then anything that is not a letter or "$",
        # then repeated spaces.
        (r"\.+", " "),
        (r"[^A-Za-z$]", " "),
        (r" +", " "),
    )
    for pattern, substitute in rewrite_rules:
        phrase = re.sub(pattern, substitute, phrase)
    return phrase
def _read_lines(path):
    """Return the lines of the text file *path* without trailing newlines."""
    with open(path) as handle:
        # rstrip("\n") instead of [:-1]: a final line that lacks a newline
        # must not lose its last real character.
        return [raw.rstrip("\n") for raw in handle]


def _decontract_preserving_target(target, text):
    """Run ``decontracted`` on *text* while keeping *target* intact.

    When ``decontracted`` would alter *target* itself, every literal
    occurrence of *target* in *text* is swapped for a backslash-dollar
    placeholder before cleaning and restored afterwards. ``decontracted``
    keeps ``$`` and turns the backslash into a space.

    NOTE: in that case a ``$`` that was already present in *text* is
    replaced by *target* as well.
    """
    if target == decontracted(target):
        return decontracted(text)
    # Literal str.replace rather than re.sub: the target is data, not a
    # regular expression, and it is only masked when it contains non-letter
    # characters, which are often regex metacharacters.
    masked = text.replace(target, "\\$")
    return decontracted(masked).replace("$", target)


def get_df(pref, testing=False):
    """Assemble one data split into a pandas DataFrame.

    Reads ``<pref>_examples.txt``, ``<pref>_hypernyms.txt`` and
    ``<pref>_definitions.txt`` line by line, plus ``<pref>_labels.txt``
    unless *testing* is true. Line *i* of each file contributes to row *i*.

    * examples: lower-cased, tab-separated. The first field is used as the
      target and the third as the sentence, which is decontracted with the
      target preserved.
    * hypernyms: lower-cased, tab-separated fields re-joined with ``;``.
    * definitions: lower-cased, split on ``;``. Each part is stripped and
      decontracted with the row's target preserved, then re-joined with ``;``.
    * labels: appended verbatim.

    Args:
        pref: Path prefix shared by the input files of the split.
        testing: When true, no labels file is read and the frame has no
            ``label`` column.

    Returns:
        A DataFrame with the columns ``target``, ``position``, ``sentence``,
        ``hypernym``, ``definition`` and, unless *testing*, ``label``.

    Raises:
        IndexError: If an example line has fewer than three fields, or a
            companion file has more lines than the examples file.
    """
    dataset = []
    for raw in _read_lines(pref + "_" + "examples.txt"):
        example = raw.lower().split("\t")
        example[2] = _decontract_preserving_target(example[0], example[2])
        dataset.append(example)

    for index, raw in enumerate(_read_lines(pref + "_" + "hypernyms.txt")):
        dataset[index].append(";".join(raw.lower().split("\t")))

    for index, raw in enumerate(_read_lines(pref + "_" + "definitions.txt")):
        target = dataset[index][0]
        cleaned = [
            _decontract_preserving_target(target, part.strip())
            for part in raw.lower().split(";")
        ]
        dataset[index].append(";".join(cleaned))

    cols = ["target", "position", "sentence", "hypernym", "definition"]
    if not testing:
        cols.append("label")
        for index, raw in enumerate(_read_lines(pref + "_" + "labels.txt")):
            dataset[index].append(raw)

    return pd.DataFrame(dataset, columns=cols)
# Load every split first (development, training, test), then write them out.
# The test split is loaded with testing=True, i.e. without a labels file.
dev_df, train_df, test_df = (
    get_df("data/Development/dev"),
    get_df("data/Training/train"),
    get_df("data/Test/test", testing=True),
)

# Export each frame as CSV without the index column.
for _frame, _csv_path in (
    (dev_df, "processed_data/dev.csv"),
    (train_df, "processed_data/train.csv"),
    (test_df, "processed_data/test.csv"),
):
    _frame.to_csv(_csv_path, index=None)