Commit 7bee4fa

add word2vec and glove
1 parent 7135dca commit 7bee4fa

2 files changed: +534, -276 lines

nlp_class2/glove.py

Lines changed: 300 additions & 0 deletions
@@ -0,0 +1,300 @@
import os
import json
import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.utils import shuffle
from word2vec import get_wikipedia_data, find_analogies

# Experiments
# previous results did not make sense b/c X was built incorrectly
# redo b/c b and c were not being added correctly as 2-D objects
# can get < 200k cost

# using coordinate descent, what's the least # files to get correct analogies?
# use this for word2vec training to make it faster
# first tried 20 files --> not enough
# how about 30 files --> some correct but still not enough
# 40 files --> half right but 50 is better

class Glove:
    def __init__(self, D, V, context_sz):
        self.D = D
        self.V = V
        self.context_sz = context_sz

    def fit(self, sentences, cc_matrix=None, learning_rate=10e-5, reg=0.1, xmax=100, alpha=0.75, epochs=10, gd=False, use_theano=True):
        # build co-occurrence matrix
        # the paper calls it X, so we will call it X, instead of calling
        # the training data X
        # TODO: would it be better to use a sparse matrix?
        t0 = datetime.now()
        V = self.V
        D = self.D

        if not os.path.exists(cc_matrix):
            X = np.zeros((V, V))
            N = len(sentences)
            print "number of sentences to process:", N
            it = 0
            for sentence in sentences:
                it += 1
                if it % 10000 == 0:
                    print "processed", it, "/", N
                n = len(sentence)
                for i in xrange(n):
                    # i is not the word index!!!
                    # j is not the word index!!!
                    # i just points to which element of the sequence (sentence) we're looking at
                    wi = sentence[i]

                    start = max(0, i - self.context_sz)
                    end = min(n, i + self.context_sz)

                    # we can either choose only one side as context, or both
                    # here we are doing both

                    # make sure "start" and "end" tokens are part of some context
                    # otherwise their f(X) will be 0 (denominator in bias update)
                    if i - self.context_sz < 0:
                        points = 1.0 / (i + 1)
                        X[wi,0] += points
                        X[0,wi] += points
                    if i + self.context_sz > n:
                        points = 1.0 / (n - i)
                        X[wi,1] += points
                        X[1,wi] += points

                    # left side
                    for j in xrange(start, i):
                        wj = sentence[j]
                        points = 1.0 / (i - j) # this is +ve
                        X[wi,wj] += points
                        X[wj,wi] += points

                    # right side
                    for j in xrange(i + 1, end):
                        wj = sentence[j]
                        points = 1.0 / (j - i) # this is +ve
                        X[wi,wj] += points
                        X[wj,wi] += points

            # save the cc matrix because it takes forever to create
            np.save(cc_matrix, X)
        else:
            X = np.load(cc_matrix)
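
        # note: when X is built above, each co-occurring pair within context_sz
        # contributes 1/distance to X, so adjacent words add 1.0, words two
        # positions apart add 0.5, and so on, symmetrically in both directions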

        print "max in X:", X.max()

        # weighting
        fX = np.zeros((V, V))
        fX[X < xmax] = (X[X < xmax] / float(xmax)) ** alpha
        fX[X >= xmax] = 1
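        # the above is the weighting function f(x) from the GloVe paper:
        #   f(x) = (x / xmax)**alpha if x < xmax, else 1
        # the defaults here (xmax=100, alpha=0.75) match the paper; it downweights
        # rare co-occurrences and caps the influence of very frequent ones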

        print "max in f(X):", fX.max()

        # target
        logX = np.log(X + 1)

        print "max in log(X):", logX.max()

        print "time to build co-occurrence matrix:", (datetime.now() - t0)

        # initialize weights
        W = np.random.randn(V, D) / np.sqrt(V + D)
        b = np.zeros(V)
        U = np.random.randn(V, D) / np.sqrt(V + D)
        c = np.zeros(V)
        mu = logX.mean()
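
        # the model being fit is:
        #   logX[i,j] ~ W[i].dot(U[j]) + b[i] + c[j] + mu
        # by minimizing the weighted squared error
        #   J = sum_ij fX[i,j] * (W[i].dot(U[j]) + b[i] + c[j] + mu - logX[i,j])**2
        # plus L2 regularization; mu, the global mean of logX, is a fixed offset
        # used here in addition to the objective given in the GloVe paper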

        if gd and use_theano:
            thW = theano.shared(W)
            thb = theano.shared(b)
            thU = theano.shared(U)
            thc = theano.shared(c)
            thLogX = T.matrix('logX')
            thfX = T.matrix('fX')

            params = [thW, thb, thU, thc]

            thDelta = thW.dot(thU.T) + T.reshape(thb, (V, 1)) + T.reshape(thc, (1, V)) + mu - thLogX
            thCost = ( thfX * thDelta * thDelta ).sum()

            grads = T.grad(thCost, params)

            updates = [(p, p - learning_rate*g) for p, g in zip(params, grads)]

            train_op = theano.function(
                inputs=[thfX, thLogX],
                updates=updates,
            )
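            # thDelta / thCost are the symbolic versions of the numpy delta / cost
            # computed in the training loop below; each call to train_op(fX, logX)
            # takes one full-batch gradient step on all four shared parameters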

        costs = []
        sentence_indexes = range(len(sentences))
        for epoch in xrange(epochs):
            delta = W.dot(U.T) + b.reshape(V, 1) + c.reshape(1, V) + mu - logX
            cost = ( fX * delta * delta ).sum()
            costs.append(cost)
            print "epoch:", epoch, "cost:", cost

            if gd:
                # gradient descent method

                if use_theano:
                    train_op(fX, logX)
                    W = thW.get_value()
                    b = thb.get_value()
                    U = thU.get_value()
                    c = thc.get_value()

                else:
                    # update W
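                    # the vectorized updates below correspond to the commented
                    # per-element loops: e.g. the gradient of sum_j fX[i,j]*delta[i,j]**2
                    # w.r.t. W[i] is 2*sum_j fX[i,j]*delta[i,j]*U[j], with the factor
                    # of 2 absorbed into the learning rate and delta taken from the
                    # top of the epoch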
                    oldW = W.copy()
                    for i in xrange(V):
                        # for j in xrange(V):
                        #     W[i] -= learning_rate*fX[i,j]*(W[i].dot(U[j]) + b[i] + c[j] + mu - logX[i,j])*U[j]
                        W[i] -= learning_rate*(fX[i,:]*delta[i,:]).dot(U)
                    W -= learning_rate*reg*W
                    # print "updated W"

                    # update b
                    for i in xrange(V):
                        # for j in xrange(V):
                        #     b[i] -= learning_rate*fX[i,j]*(W[i].dot(U[j]) + b[i] + c[j] + mu - logX[i,j])
                        b[i] -= learning_rate*fX[i,:].dot(delta[i,:])
                    b -= learning_rate*reg*b
                    # print "updated b"

                    # update U
                    for j in xrange(V):
                        # for i in xrange(V):
                        #     U[j] -= learning_rate*fX[i,j]*(W[i].dot(U[j]) + b[i] + c[j] + mu - logX[i,j])*W[i]
                        U[j] -= learning_rate*(fX[:,j]*delta[:,j]).dot(oldW)
                    U -= learning_rate*reg*U
                    # print "updated U"

                    # update c
                    for j in xrange(V):
                        # for i in xrange(V):
                        #     c[j] -= learning_rate*fX[i,j]*(W[i].dot(U[j]) + b[i] + c[j] + mu - logX[i,j])
                        c[j] -= learning_rate*fX[:,j].dot(delta[:,j])
                    c -= learning_rate*reg*c
                    # print "updated c"

            else:
                # coordinate descent method

                # update W
                # fast way
                # t0 = datetime.now()
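                # coordinate descent: holding U, b and c fixed, setting the gradient
                # of the objective w.r.t. W[i] to zero gives a DxD linear system
                #   (reg*np.eye(D) + sum_j fX[i,j]*np.outer(U[j], U[j])).dot(W[i])
                #       = sum_j fX[i,j]*(logX[i,j] - b[i] - c[j] - mu)*U[j]
                # which is exactly what np.linalg.solve is solving row by row below;
                # the U, b and c updates follow by symmetry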
                for i in xrange(V):
                    # matrix = reg*np.eye(D) + np.sum((fX[i,j]*np.outer(U[j], U[j]) for j in xrange(V)), axis=0)
                    matrix = reg*np.eye(D) + (fX[i,:]*U.T).dot(U)
                    # assert(np.abs(matrix - matrix2).sum() < 10e-5)
                    vector = (fX[i,:]*(logX[i,:] - b[i] - c - mu)).dot(U)
                    W[i] = np.linalg.solve(matrix, vector)
                # print "fast way took:", (datetime.now() - t0)

                # slow way
                # t0 = datetime.now()
                # for i in xrange(V):
                #     matrix2 = reg*np.eye(D)
                #     vector2 = 0
                #     for j in xrange(V):
                #         # coordinate descent method
                #         matrix2 += fX[i,j]*np.outer(U[j], U[j])
                #         vector2 += fX[i,j]*(logX[i,j] - b[i] - c[j])*U[j]
                # print "slow way took:", (datetime.now() - t0)

                # assert(np.abs(matrix - matrix2).sum() < 10e-5)
                # assert(np.abs(vector - vector2).sum() < 10e-5)
                # W[i] = np.linalg.solve(matrix, vector)
                # print "updated W"

                # update b
                for i in xrange(V):
                    denominator = fX[i,:].sum()
                    # assert(denominator > 0)
                    numerator = fX[i,:].dot(logX[i,:] - W[i].dot(U.T) - c - mu)
                    # for j in xrange(V):
                    #     numerator += fX[i,j]*(logX[i,j] - W[i].dot(U[j]) - c[j])
                    b[i] = numerator / denominator / (1 + reg)
                # print "updated b"

                # update U
                for j in xrange(V):
                    # matrix = reg*np.eye(D) + np.sum((fX[i,j]*np.outer(W[i], W[i]) for i in xrange(V)), axis=0)
                    matrix = reg*np.eye(D) + (fX[:,j]*W.T).dot(W)
                    # assert(np.abs(matrix - matrix2).sum() < 10e-8)
                    vector = (fX[:,j]*(logX[:,j] - b - c[j] - mu)).dot(W)
                    # matrix = reg*np.eye(D)
                    # vector = 0
                    # for i in xrange(V):
                    #     matrix += fX[i,j]*np.outer(W[i], W[i])
                    #     vector += fX[i,j]*(logX[i,j] - b[i] - c[j])*W[i]
                    U[j] = np.linalg.solve(matrix, vector)
                # print "updated U"

                # update c
                for j in xrange(V):
                    denominator = fX[:,j].sum()
                    numerator = fX[:,j].dot(logX[:,j] - W.dot(U[j]) - b - mu)
                    # for i in xrange(V):
                    #     numerator += fX[i,j]*(logX[i,j] - W[i].dot(U[j]) - b[i])
                    c[j] = numerator / denominator / (1 + reg)
                # print "updated c"

        self.W = W
        self.U = U

        plt.plot(costs)
        plt.show()

    def save(self, fn):
        # function word_analogies expects a (V,D) matrix and a (D,V) matrix
        arrays = [self.W, self.U.T]
        np.savez(fn, *arrays)


def main(we_file, w2i_file, n_files=50):
    cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = [] # dummy - we won't actually use it
    else:
        sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)
    # model.fit(sentences, cc_matrix=cc_matrix, epochs=20) # coordinate descent
    model.fit(
        sentences,
        cc_matrix=cc_matrix,
        learning_rate=3*10e-5,
        reg=0.01,
        epochs=2000,
        gd=True,
        use_theano=False
    ) # gradient descent
    model.save(we_file)


if __name__ == '__main__':
    we = 'glove_model_50.npz'
    w2i = 'glove_word2idx_50.json'
    main(we, w2i)
    for concat in (True, False):
        print "** concat:", concat
        find_analogies('king', 'man', 'woman', concat, we, w2i)
        find_analogies('france', 'paris', 'london', concat, we, w2i)
        find_analogies('france', 'paris', 'rome', concat, we, w2i)
        find_analogies('paris', 'france', 'italy', concat, we, w2i)
