
Commit 7135dca

wip
1 parent cb6e381 commit 7135dca

File tree

8 files changed: 49707 additions, 0 deletions

nlp_class2/ner.txt

Lines changed: 48863 additions & 0 deletions
Large diffs are not rendered by default.

nlp_class2/ner_baseline.py

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
# data from https://github.com/aritter/twitter_nlp/blob/master/data/annotated/ner.txt
# data2 from http://schwa.org/projects/resources/wiki/Wikiner#WikiGold

import numpy as np
from sklearn.utils import shuffle
from pos_baseline import LogisticRegression

def get_data(split_sequences=False):
    word2idx = {}
    tag2idx = {}
    word_idx = 0
    tag_idx = 0
    Xtrain = []
    Ytrain = []
    currentX = []
    currentY = []
    for line in open('ner.txt'):
        line = line.rstrip()
        if line:
            r = line.split()
            word, tag = r
            word = word.lower()
            if word not in word2idx:
                word2idx[word] = word_idx
                word_idx += 1
            currentX.append(word2idx[word])

            if tag not in tag2idx:
                tag2idx[tag] = tag_idx
                tag_idx += 1
            currentY.append(tag2idx[tag])
        elif split_sequences:
            Xtrain.append(currentX)
            Ytrain.append(currentY)
            currentX = []
            currentY = []

    if not split_sequences:
        Xtrain = currentX
        Ytrain = currentY

    print "number of samples:", len(Xtrain)
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ntest = int(0.3*len(Xtrain))
    Xtest = Xtrain[:Ntest]
    Ytest = Ytrain[:Ntest]
    Xtrain = Xtrain[Ntest:]
    Ytrain = Ytrain[Ntest:]
    print "number of classes:", len(tag2idx)
    return Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx

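# Illustrative sketch of the ner.txt layout get_data() above assumes,
# inferred from the parser rather than the file itself, so verify against
# the real data: one "token tag" pair per line, with a blank line ending
# each sentence, e.g.
#
#   Vikings    B-ORG
#   won        O
#   yesterday  O
#
#   (blank line between sentences)
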
def get_data2(split_sequences=False):
    word2idx = {}
    tag2idx = {}
    word_idx = 0
    tag_idx = 0
    Xtrain = []
    Ytrain = []
    for line in open('../large_files/aij-wikiner-en-wp3'):
        # each line is a full sentence
        currentX = []
        currentY = []
        line = line.rstrip()
        if not line:
            continue
        triples = line.split()
        for triple in triples:
            word, _, tag = triple.split('|')
            if word not in word2idx:
                word2idx[word] = word_idx
                word_idx += 1
            currentX.append(word2idx[word])

            if tag not in tag2idx:
                tag2idx[tag] = tag_idx
                tag_idx += 1
            currentY.append(tag2idx[tag])

        Xtrain.append(currentX)
        Ytrain.append(currentY)

    if not split_sequences:
        Xtrain = np.concatenate(Xtrain)
        Ytrain = np.concatenate(Ytrain)

    print "number of samples:", len(Xtrain)
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ntest = int(0.3*len(Xtrain))
    Xtest = Xtrain[:Ntest]
    Ytest = Ytrain[:Ntest]
    Xtrain = Xtrain[Ntest:]
    Ytrain = Ytrain[Ntest:]
    print "number of classes:", len(tag2idx)
    return Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx

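# Illustrative sketch of the WikiNER format get_data2() parses, inferred
# from the split('|') call above, so check against the actual file: one
# sentence per line, as space-separated word|POS|entity triples, e.g.
#
#   Paris|NNP|I-LOC is|VBZ|O in|IN|O France|NNP|I-LOC
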
def main():
    Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx = get_data2()

    V = len(word2idx)
    print "vocabulary size:", V
    K = len(tag2idx)

    # train and score
    model = LogisticRegression()
    model.fit(Xtrain, Ytrain, V=V, K=K, epochs=5)
    print "training complete"
    print "train score:", model.score(Xtrain, Ytrain)
    print "train f1 score:", model.f1_score(Xtrain, Ytrain)
    print "test score:", model.score(Xtest, Ytest)
    print "test f1 score:", model.f1_score(Xtest, Ytest)

if __name__ == '__main__':
    main()

nlp_class2/ner_rnn.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
from ner_baseline import get_data, get_data2
from pos_rnn import RNN

def main():
    Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx = get_data2(split_sequences=True)
    V = len(word2idx)
    rnn = RNN(50, [50], V)
    rnn.fit(Xtrain, Ytrain, epochs=30)
    print "train f1 score:", rnn.f1_score(Xtrain, Ytrain)
    print "test f1 score:", rnn.f1_score(Xtest, Ytest)


if __name__ == '__main__':
    main()
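As a quick sanity check on the pipeline above: with split_sequences=True, get_data2 returns one list of word indices and one list of tag indices per sentence. A minimal sketch for inspecting a sample, in the same Python 2 style as the repo (assumes the WikiNER file is in place):

from ner_baseline import get_data2

Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx = get_data2(split_sequences=True)

# invert the vocabularies to recover readable tokens and tags
idx2word = dict((i, w) for w, i in word2idx.iteritems())
idx2tag = dict((i, t) for t, i in tag2idx.iteritems())

print "first sentence:", [idx2word[i] for i in Xtrain[0]]
print "its tags:", [idx2tag[i] for i in Ytrain[0]]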

nlp_class2/pos_baseline.py

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
# not considering context

import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
# from sklearn.linear_model import LogisticRegression


class LogisticRegression:
    def __init__(self):
        pass

    def fit(self, X, Y, V=None, K=None, D=50, lr=10e-1, mu=0.99, batch_sz=100, epochs=6):
        if V is None:
            V = len(set(X))
        if K is None:
            K = len(set(Y))
        N = len(X)

        # We = np.random.randn(V, D) / np.sqrt(V + D)
        # W = np.random.randn(D, K) / np.sqrt(D + K)
        W = np.random.randn(V, K) / np.sqrt(V + K)
        b = np.zeros(K)
        self.W = theano.shared(W)
        self.b = theano.shared(b)
        self.params = [self.W, self.b]

        thX = T.ivector('X')
        thY = T.ivector('Y')

        py_x = T.nnet.softmax(self.W[thX] + self.b)
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]
        self.cost_predict_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        updates = [
            (p, p + mu*dp - lr*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - lr*g) for dp, g in zip(dparams, grads)
        ]
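        # The two update lists implement classical momentum. Theano applies
        # all updates simultaneously, so each pair reads the *old* velocity dp:
        #   dp <- mu*dp - lr*g      (velocity update)
        #   p  <- p + mu*dp - lr*g  (equivalently, p + new dp)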
        train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates,
            allow_input_downcast=True
        )

        costs = []
        n_batches = N / batch_sz
        for i in xrange(epochs):
            X, Y = shuffle(X, Y)
            print "epoch:", i
            for j in xrange(n_batches):
                Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)]
                Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)]

                c, p = train_op(Xbatch, Ybatch)
                costs.append(c)
                if j % 200 == 0:
                    print "i:", i, "j:", j, "n_batches:", n_batches, "cost:", c, "error:", np.mean(p != Ybatch)
        plt.plot(costs)
        plt.show()

    def score(self, X, Y):
        _, p = self.cost_predict_op(X, Y)
        return np.mean(p == Y)

    def f1_score(self, X, Y):
        _, p = self.cost_predict_op(X, Y)
        # average=None gives one F1 per class; the mean is the macro-averaged F1
        return f1_score(Y, p, average=None).mean()

def get_data(split_sequences=False):
    word2idx = {}
    tag2idx = {}
    word_idx = 0
    tag_idx = 0
    Xtrain = []
    Ytrain = []
    currentX = []
    currentY = []
    for line in open('chunking/train.txt'):
        line = line.rstrip()
        if line:
            r = line.split()
            word, tag, _ = r
            if word not in word2idx:
                word2idx[word] = word_idx
                word_idx += 1
            currentX.append(word2idx[word])

            if tag not in tag2idx:
                tag2idx[tag] = tag_idx
                tag_idx += 1
            currentY.append(tag2idx[tag])
        elif split_sequences:
            Xtrain.append(currentX)
            Ytrain.append(currentY)
            currentX = []
            currentY = []

    if not split_sequences:
        Xtrain = currentX
        Ytrain = currentY

    # load test data (scored later in main)
    Xtest = []
    Ytest = []
    currentX = []
    currentY = []
    for line in open('chunking/test.txt'):
        line = line.rstrip()
        if line:
            r = line.split()
            word, tag, _ = r
            if word in word2idx:
                currentX.append(word2idx[word])
            else:
                currentX.append(word_idx) # use this as unknown
            currentY.append(tag2idx[tag])
        elif split_sequences:
            Xtest.append(currentX)
            Ytest.append(currentY)
            currentX = []
            currentY = []
    if not split_sequences:
        Xtest = currentX
        Ytest = currentY

    return Xtrain, Ytrain, Xtest, Ytest, word2idx

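# Illustrative sketch of the chunking/train.txt layout the parser above
# assumes (inferred from the code; three whitespace-separated columns per
# token, blank line between sentences, only the first two columns used):
#
#   Confidence NN B-NP
#   in IN B-PP
#   the DT B-NP
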
def main():
    Xtrain, Ytrain, Xtest, Ytest, word2idx = get_data()

    # convert to numpy arrays
    Xtrain = np.array(Xtrain)
    Ytrain = np.array(Ytrain)

    # convert Xtrain to indicator matrix
    N = len(Xtrain)
    V = len(word2idx) + 1 # +1 for the unknown-word index used in get_data
    print "vocabulary size:", V
    # Xtrain_indicator = np.zeros((N, V))
    # Xtrain_indicator[np.arange(N), Xtrain] = 1

    # decision tree
    from sklearn.tree import DecisionTreeClassifier
    dt = DecisionTreeClassifier()

    # without indicator
    dt.fit(Xtrain.reshape(N, 1), Ytrain)
    print "dt score:", dt.score(Xtrain.reshape(N, 1), Ytrain)

    # with indicator -- too slow!!
    # dt.fit(Xtrain_indicator, Ytrain)
    # print "dt score:", dt.score(Xtrain_indicator, Ytrain)

    # train and score
    model = LogisticRegression()
    model.fit(Xtrain, Ytrain, V=V)
    print "training complete"
    print "train score:", model.score(Xtrain, Ytrain)


    Ntest = len(Xtest)
    Xtest = np.array(Xtest)
    Ytest = np.array(Ytest)
    # convert Xtest to indicator
    # Xtest_indicator = np.zeros((Ntest, V))
    # Xtest_indicator[np.arange(Ntest), Xtest] = 1

    # decision tree test score
    print "dt test score:", dt.score(Xtest.reshape(Ntest, 1), Ytest)
    # print "dt test score:", dt.score(Xtest_indicator, Ytest) # too slow!

    # logistic regression test score
    print "test score:", model.score(Xtest, Ytest)

if __name__ == '__main__':
    main()
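A side note on the commented-out indicator code above: indexing rows of the weight matrix, as W[thX] does in fit, yields the same logits as multiplying a one-hot indicator matrix by W, without ever materializing the N x V matrix (part of why the indicator variants are marked too slow). A minimal numpy sketch of the equivalence, with made-up sizes:

import numpy as np

N, V, K = 4, 10, 3                           # samples, vocab size, classes
x = np.array([2, 7, 7, 0])                   # word indices, one per sample
W = np.random.randn(V, K)

indicator = np.zeros((N, V))                 # the explicit one-hot route
indicator[np.arange(N), x] = 1

assert np.allclose(indicator.dot(W), W[x])   # identical logits, no N x V matrix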
