Commit a4718a0

split out theano
1 parent 3a85787 commit a4718a0

File tree: 1 file changed, +242 −0 lines changed


nlp_class2/glove_theano.py

@@ -0,0 +1,242 @@
# Course URL:
# https://deeplearningcourses.com/c/natural-language-processing-with-deep-learning-in-python
# https://udemy.com/natural-language-processing-with-deep-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


import os
import json
import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.utils import shuffle
from word2vec import get_wikipedia_data, find_analogies, get_sentences_with_word2idx_limit_vocab

# using ALS, what's the least # files to get correct analogies?
# use this for word2vec training to make it faster
# first tried 20 files --> not enough
# how about 30 files --> some correct but still not enough
# 40 files --> half right but 50 is better
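
# A quick note on the update rule implemented by momentum_updates below
# (classical momentum, where lr is the learning rate and mu the momentum coefficient):
#   v <- mu*v - lr*grad
#   p <- p + v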
def momentum_updates(cost, params, lr=1e-4, mu=0.9):
    grads = T.grad(cost, params)
    velocities = [theano.shared(
        np.zeros_like(p.get_value()).astype(np.float32)
    ) for p in params]
    # updates = [(p, p - learning_rate*g) for p, g in zip(params, grads)]
    updates = []
    for p, v, g in zip(params, velocities, grads):
        newv = mu*v - lr*g
        newp = p + newv
        updates.append((p, newp))
        updates.append((v, newv))
    return updates


class Glove:
    def __init__(self, D, V, context_sz):
        self.D = D
        self.V = V
        self.context_sz = context_sz

    def fit(self, sentences, cc_matrix=None, learning_rate=1e-4, reg=0.1, xmax=100, alpha=0.75, epochs=10, gd=False, use_theano=False, use_tensorflow=False):
        # note: gd, use_theano and use_tensorflow are accepted but not used in this Theano version

        # build the co-occurrence matrix
        # the paper calls it X, so we call it X here too,
        # instead of using X for the training data
        # TODO: would it be better to use a sparse matrix?
        t0 = datetime.now()
        V = self.V
        D = self.D
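
        # X[wi, wj] accumulates a weighted count for every co-occurrence of the
        # words at positions i and j inside the context window; as in the GloVe
        # paper, a pair that is d positions apart contributes 1/d to the count.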
        if not os.path.exists(cc_matrix):
            X = np.zeros((V, V))
            N = len(sentences)
            print("number of sentences to process:", N)
            it = 0
            for sentence in sentences:
                it += 1
                if it % 10000 == 0:
                    print("processed", it, "/", N)
                n = len(sentence)
                for i in range(n):
                    # i and j are positions within the sentence, NOT word indices;
                    # the word index at position i is sentence[i]
                    wi = sentence[i]

                    start = max(0, i - self.context_sz)
                    end = min(n, i + self.context_sz)

                    # we can either choose only one side as context, or both
                    # here we are doing both

                    # make sure "start" and "end" tokens are part of some context
                    # otherwise their f(X) will be 0 (denominator in bias update)
                    # (indices 0 and 1 are assumed to be the START and END tokens)
                    if i - self.context_sz < 0:
                        points = 1.0 / (i + 1)
                        X[wi,0] += points
                        X[0,wi] += points
                    if i + self.context_sz > n:
                        points = 1.0 / (n - i)
                        X[wi,1] += points
                        X[1,wi] += points

                    # left side
                    for j in range(start, i):
                        wj = sentence[j]
                        points = 1.0 / (i - j) # this is positive since j < i
                        X[wi,wj] += points
                        X[wj,wi] += points

                    # right side
                    for j in range(i + 1, end):
                        wj = sentence[j]
                        points = 1.0 / (j - i) # this is positive since j > i
                        X[wi,wj] += points
                        X[wj,wi] += points

            # save the cc matrix because it takes forever to create
            np.save(cc_matrix, X)
        else:
            X = np.load(cc_matrix)

        print("max in X:", X.max())
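
        # GloVe's weighting function caps the influence of very frequent pairs:
        #   f(x) = (x / xmax)**alpha   if x < xmax
        #   f(x) = 1                   otherwise
        # with xmax=100 and alpha=0.75 by default, as in the paper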
        # weighting
        fX = np.zeros((V, V))
        fX[X < xmax] = (X[X < xmax] / float(xmax)) ** alpha
        fX[X >= xmax] = 1

        print("max in f(X):", fX.max())

        # target: log(1 + X), so zero counts map to a target of 0
        logX = np.log(X + 1)

        # cast to float32 for Theano
        fX = fX.astype(np.float32)
        logX = logX.astype(np.float32)

        print("max in log(X):", logX.max())

        print("time to build co-occurrence matrix:", (datetime.now() - t0))

        # initialize weights
        W = np.random.randn(V, D) / np.sqrt(V + D)
        b = np.zeros(V)
        U = np.random.randn(V, D) / np.sqrt(V + D)
        c = np.zeros(V)
        mu = logX.mean()

        # Theano shared variables (weights) and input placeholders
        thW = theano.shared(W.astype(np.float32))
        thb = theano.shared(b.astype(np.float32))
        thU = theano.shared(U.astype(np.float32))
        thc = theano.shared(c.astype(np.float32))
        thLogX = T.matrix('logX')
        thfX = T.matrix('fX')

        params = [thW, thb, thU, thc]
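
        # weighted least-squares objective from the GloVe paper, with an extra
        # global offset mu = mean(log X) to center the targets:
        #   J = sum_ij f(X[i,j]) * (W[i].dot(U[j]) + b[i] + c[j] + mu - log X[i,j])**2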
        thDelta = thW.dot(thU.T) + T.reshape(thb, (V, 1)) + T.reshape(thc, (1, V)) + mu - thLogX
        thCost = ( thfX * thDelta * thDelta ).sum()

        # regularization (L2 penalty on both embedding matrices)
        regularized_cost = thCost + reg*((thW * thW).sum() + (thU * thU).sum())

        # plain gradient descent would be:
        # grads = T.grad(regularized_cost, params)
        # updates = [(p, p - learning_rate*g) for p, g in zip(params, grads)]
        updates = momentum_updates(regularized_cost, params, learning_rate)

        train_op = theano.function(
            inputs=[thfX, thLogX],
            updates=updates,
        )

        cost_op = theano.function(inputs=[thfX, thLogX], outputs=thCost)

        costs = []
        sentence_indexes = range(len(sentences))  # not used in this full-batch version
        for epoch in range(epochs):
            # each epoch is one full-batch momentum update on the whole V x V matrices
            train_op(fX, logX)
            cost = cost_op(fX, logX)
            costs.append(cost)
            print("epoch:", epoch, "cost:", cost)

        self.W = thW.get_value()
        self.U = thU.get_value()

        plt.plot(costs)
        plt.show()
    def save(self, fn):
        # find_analogies expects a (V, D) matrix and a (D, V) matrix
        arrays = [self.W, self.U.T]
        np.savez(fn, *arrays)
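        # reading the file back is then just (sketch):
        #   npz = np.load(fn)
        #   W1 = npz['arr_0']   # shape (V, D)
        #   W2 = npz['arr_1']   # shape (D, V)
        # (np.savez with positional args stores arrays under 'arr_0', 'arr_1', ...)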


def main(we_file, w2i_file, use_brown=True, n_files=50):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)
    model.fit(
        sentences,
        cc_matrix=cc_matrix,
        learning_rate=1e-4,
        reg=0.1,
        epochs=200,
    )
    model.save(we_file)


if __name__ == '__main__':
    we = 'glove_model_50.npz'
    w2i = 'glove_word2idx_50.json'
    # we = 'glove_model_brown.npz'
    # w2i = 'glove_word2idx_brown.json'
    main(we, w2i, use_brown=False)
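    # sanity checks: find_analogies (from the word2vec helper module) is expected
    # to print the word closest to vec(a) - vec(b) + vec(c); "concat" presumably
    # switches between concatenating and averaging the two embedding matrices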
    for concat in (True, False):
        print("** concat:", concat)
        find_analogies('king', 'man', 'woman', concat, we, w2i)
        find_analogies('france', 'paris', 'london', concat, we, w2i)
        find_analogies('france', 'paris', 'rome', concat, we, w2i)
        find_analogies('paris', 'france', 'italy', concat, we, w2i)
        find_analogies('france', 'french', 'english', concat, we, w2i)
        find_analogies('japan', 'japanese', 'chinese', concat, we, w2i)
        find_analogies('japan', 'japanese', 'italian', concat, we, w2i)
        find_analogies('japan', 'japanese', 'australian', concat, we, w2i)
        find_analogies('december', 'november', 'june', concat, we, w2i)
