Skip to content

Commit c9f5a88

Browse files
update
1 parent f539f0d commit c9f5a88

File tree

1 file changed

+225
-0
lines changed

1 file changed

+225
-0
lines changed

ner.py

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
from collections import Counter
2+
from itertools import product
3+
from collections import defaultdict
4+
from sklearn.metrics import f1_score
5+
import random
6+
import operator
7+
import sys
8+
import time
9+
10+
11+
def load_dataset_sents(file_path, as_zip=True, to_idx=False, token_vocab=None, target_vocab=None):
    """Load a tab-separated sentence/NER-tag corpus.

    Each line of the file holds a sentence and its tag sequence separated by
    a single tab; tokens within each half are whitespace-separated.  When
    *to_idx* is true, tokens and tags are mapped through *token_vocab* and
    *target_vocab* respectively.

    Returns a list of per-sentence (word, tag) pair lists when *as_zip* is
    true, otherwise the tuple (inputs, targets) of parallel lists.
    """
    inputs, targets, zip_inps = [], [], []
    with open(file_path) as f:
        for line in f:
            sent, tags = line.split('\t')
            if to_idx:
                words = [token_vocab[tok.strip()] for tok in sent.split()]
                ner_tags = [target_vocab[tag.strip()] for tag in tags.split()]
            else:
                words = [tok.strip() for tok in sent.split()]
                ner_tags = [tag.strip() for tag in tags.split()]
            inputs.append(words)
            targets.append(ner_tags)
            zip_inps.append(list(zip(words, ner_tags)))
    if as_zip:
        return zip_inps
    return inputs, targets
24+
25+
#Get the word_label counts in the corpus
def get_current_word_current_label_counts(train_data):
    """Count (word, label) pair frequencies across the whole corpus.

    train_data -- list of sentences, each a list of (word, label) tuples.

    Returns a Counter mapping each (word, label) pair to its corpus count.
    Fix: drops the dead ``counts = {}`` initialization and replaces the
    index-based extend loop with a single-pass Counter over a flattening
    generator.
    """
    return Counter(pair for sentence in train_data for pair in sentence)
34+
35+
36+
def viterbi(words, w, features):
    """Viterbi-style decoding of the highest-scoring label sequence.

    words    -- list of tokens in one sentence
    w        -- weight dict keyed by (word, label) features
    features -- corpus (word, label) counts consumed by phi_1

    NOTE(review): phi_1 produces emission features only, so the per-step
    score does not actually depend on the previous label; the max over
    prev_label below is degenerate but kept for structural fidelity.

    Returns the predicted label list, one label per word.
    Fix: the running maximum now starts at -inf instead of the arbitrary
    constant -100; with -100, sentences whose cumulative scores all fell
    below -100 left ``counts`` empty and crashed later with a KeyError.
    """
    labels = ["O", "PER", "LOC", "ORG", "MISC"]
    counts_list = []   # per-word dict: label -> best cumulative score
    best_label = []    # per-word dict: label -> best previous label (backpointers)
    for word in words:
        counts = {}
        best = {}
        # Score every candidate label for this word.
        for label in labels:
            phi = phi_1([word], [label], features)
            count_phi = 0
            for key in phi:
                count_phi += w[key] * phi[key]

            if counts_list:
                # -inf so the first comparison always wins, even when every
                # cumulative score is very negative.
                maxVal = float('-inf')
                for prev_label in labels:
                    count = counts_list[-1][prev_label] + count_phi
                    if count > maxVal:
                        counts[label] = count
                        maxVal = count
                        best[label] = prev_label
            else:
                # First word: no previous column to extend.
                counts[label] = count_phi
        counts_list.append(counts)
        best_label.append(best)

    # Follow backpointers from the best final label to recover the path.
    last_label = max(counts_list[-1].items(), key=operator.itemgetter(1))[0]
    final_labels = [last_label]
    for i in range(len(words) - 1):
        final_labels.insert(0, best_label[-1 - i][final_labels[-1 - i]])
    return final_labels
70+
71+
def beam(words, w, features):
    """Beam-search decoding (beam width 5) of a label sequence for *words*.

    Identical in structure to viterbi(), except that at each step only the
    top-5 scoring labels of the previous column are considered as
    predecessors.  Change the [:5] slice below to any smaller number to get
    beam search with that beam size.

    Returns the predicted label list, one label per word.
    Fix: the running maximum now starts at -inf instead of the arbitrary
    constant -100; with -100, sentences whose cumulative scores all fell
    below -100 left ``counts`` empty and crashed later with a KeyError.
    """
    labels = ["O", "PER", "LOC", "ORG", "MISC"]
    counts_list = []   # per-word dict: label -> best cumulative score
    best_label = []    # per-word dict: label -> best previous label (backpointers)
    top_labels = []    # beam: best labels of the previous column
    for word in words:
        counts = {}
        best = {}
        # Score every candidate label for this word.
        for label in labels:
            phi = phi_1([word], [label], features)
            count_phi = 0
            for key in phi:
                count_phi += w[key] * phi[key]

            # If this is not the first word, extend the previous column.
            if counts_list:
                # -inf so the first comparison always wins, even when every
                # cumulative score is very negative.
                maxVal = float('-inf')
                for prev_label in top_labels:
                    count = counts_list[-1][prev_label] + count_phi
                    if count > maxVal:
                        counts[label] = count
                        maxVal = count
                        best[label] = prev_label
            else:
                # First word: no previous column to extend.
                counts[label] = count_phi
        counts_list.append(counts)
        # Keep only the top-5 labels as the beam for the next word.
        top_labels = sorted(counts, key=counts.get, reverse=True)[:5]
        best_label.append(best)

    # Follow backpointers from the best final label to recover the path.
    last_label = max(counts_list[-1].items(), key=operator.itemgetter(1))[0]
    final_labels = [last_label]
    for i in range(len(words) - 1):
        final_labels.insert(0, best_label[-1 - i][final_labels[-1 - i]])
    return final_labels
109+
110+
#Implementation for PHI1
def phi_1(words, labels, cw_cl_counts):
    """Build the phi_1 (word, label) feature vector for one labelling.

    words        -- list of tokens
    labels       -- list of labels, parallel to *words*
    cw_cl_counts -- corpus (word, label) counts; pairs absent from it get an
                    explicit 0 entry (preserved deliberately)

    Returns a defaultdict(int) mapping (word, label) -> occurrence count
    within this sentence.
    """
    feats = defaultdict(int)
    for word, label in zip(words, labels):
        if (word, label) in cw_cl_counts:
            feats[word, label] += 1
        else:
            feats[word, label] = 0
    return feats
120+
121+
#Perceptron train of PHI1
def phi1_perceptron_train(train_data, features, maxIter, scheme):
    """Train structured-perceptron weights using phi_1 features.

    train_data -- list of sentences, each a list of (word, label) pairs;
                  shuffled in place each iteration
    features   -- corpus (word, label) counts consumed by phi_1
    maxIter    -- number of passes over the training data
    scheme     -- '-v' for Viterbi decoding, '-b' for beam search

    Returns the learned weight dict.
    Fixes: removes the unused ``labels`` list, and raises ValueError for an
    unknown scheme instead of failing later with an unbound ``predict_label``.
    """
    w = defaultdict(int)
    for iterr in range(maxIter):
        print("Iteration #: ", iterr + 1, " for Phi1 Train")
        random.shuffle(train_data)
        for sentence in train_data:
            # Split the (word, label) pairs into parallel lists.
            words = [word for word, _ in sentence]
            sentence_labels = [label for _, label in sentence]

            if scheme == '-v':
                predict_label = viterbi(words, w, features)
            elif scheme == '-b':
                predict_label = beam(words, w, features)
            else:
                raise ValueError("scheme must be '-v' or '-b', got %r" % (scheme,))

            # Standard perceptron update: reward gold features, penalise
            # predicted features, only when the prediction is wrong.
            # (phi_1 is pure, so computing it only on a mismatch is safe.)
            if predict_label != sentence_labels:
                correct_phi = phi_1(words, sentence_labels, features)
                predict_phi = phi_1(words, predict_label, features)
                for key in correct_phi:
                    w[key] += correct_phi[key]
                for key in predict_phi:
                    w[key] -= predict_phi[key]
    return w
153+
154+
def phi1_perceptron_test(test_data, w, features, scheme):
    """Decode *test_data* with trained weights and collect label sequences.

    test_data -- list of sentences, each a list of (word, label) pairs
    w         -- trained weight dict
    features  -- corpus (word, label) counts consumed by phi_1
    scheme    -- '-v' for Viterbi decoding, '-b' for beam search

    Returns (flat_cor, flat_pre): the flattened gold and predicted label
    lists, suitable for sklearn's f1_score.

    Fix: the original built ``list(product(labels, repeat=len(sentence)))``
    for every sentence — O(5^n) time and memory — and never used it; that
    computation and the now-unused ``labels`` list are removed.
    """
    correct = []
    predicted = []
    for sentence in test_data:
        # Split the (word, label) pairs into parallel lists.
        words = [word for word, _ in sentence]
        sentence_labels = [label for _, label in sentence]
        correct.append(sentence_labels)

        #Choosing the Scheme (Viterbi, Beam)
        if scheme == '-v':
            predict_label = viterbi(words, w, features)
        elif scheme == '-b':
            predict_label = beam(words, w, features)
        predicted.append(predict_label)

    # Flatten the per-sentence lists into single label sequences.
    flat_cor = [tag for sent in correct for tag in sent]
    flat_pre = [tag for sent in predicted for tag in sent]
    return flat_cor, flat_pre
187+
188+
def main():
    """Entry point: train and evaluate a phi_1 structured perceptron.

    Usage: python ner.py (-v|-b) <train_file> <test_file>

    Fixes: validates argv length up front (the original raised IndexError on
    short command lines), uses sys.exit() instead of the interactive-only
    exit() builtin, and drops the dead flat_cor/flat_pre locals.
    """
    # Validate command-line arguments before indexing into sys.argv.
    if len(sys.argv) < 4:
        print("\nUsage: python ner.py (-v|-b) train_file test_file\n")
        sys.exit(1)
    scheme = sys.argv[1]
    train_path = sys.argv[2]
    test_path = sys.argv[3]
    maxIter = 5
    train_data = load_dataset_sents(train_path)
    test_data = load_dataset_sents(test_path)
    # Fixed seed so the shuffles in training are reproducible.
    random.seed(1)

    start = time.time()
    if scheme == '-v':
        print("\nUsing..." + "\t Viterbi" + " and Using ", maxIter, " Iterations and Seed = 1\n")
    elif scheme == '-b':
        print("\nUsing..." + "\t Beam Search" + " and Using ", maxIter, " Iterations and Seed = 1\n")
    else:
        print("\nWrong arguments... Exiting Program\n")
        sys.exit(1)

    #getting word, tag counts in the corpus
    cw_cl_counts = get_current_word_current_label_counts(train_data)

    #Getting results for PHI1
    weights_phi1 = phi1_perceptron_train(train_data, cw_cl_counts, maxIter, scheme)
    flat_cor_phi1, flat_pre_phi1 = phi1_perceptron_test(test_data, weights_phi1, cw_cl_counts, scheme)

    print("\n ---------------------------------------------------------------------------")
    # Micro-averaged F1 over the entity labels only ('O' excluded).
    f1_micro = f1_score(flat_cor_phi1, flat_pre_phi1, average='micro', labels=['ORG', 'MISC', 'PER', 'LOC'])
    print('F1 Score for PHI 1: ', round(f1_micro, 5))
    print("--------------------------------------------------------------------------- \n")

    end = time.time()
    print("Total Time Elapsed: ", (end - start), " seconds\n")


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)