
Commit 13aab2a

compartmentalize gru,lstm add decent embeddings
1 parent d064f64 commit 13aab2a

File tree

5 files changed (+326, -0 lines)


rnn_class/gru.py

+55
@@ -0,0 +1,55 @@
import numpy as np
import theano
import theano.tensor as T

from util import init_weight


class GRU:
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.f = activation

        # numpy init
        Wxr = init_weight(Mi, Mo)
        Whr = init_weight(Mo, Mo)
        br = np.zeros(Mo)
        Wxz = init_weight(Mi, Mo)
        Whz = init_weight(Mo, Mo)
        bz = np.zeros(Mo)
        Wxh = init_weight(Mi, Mo)
        Whh = init_weight(Mo, Mo)
        bh = np.zeros(Mo)
        h0 = np.zeros(Mo)

        # theano vars
        self.Wxr = theano.shared(Wxr)
        self.Whr = theano.shared(Whr)
        self.br = theano.shared(br)
        self.Wxz = theano.shared(Wxz)
        self.Whz = theano.shared(Whz)
        self.bz = theano.shared(bz)
        self.Wxh = theano.shared(Wxh)
        self.Whh = theano.shared(Whh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.params = [self.Wxr, self.Whr, self.br, self.Wxz, self.Whz, self.bz, self.Wxh, self.Whh, self.bh, self.h0]

    def recurrence(self, x_t, h_t1):
        r = T.nnet.sigmoid(x_t.dot(self.Wxr) + h_t1.dot(self.Whr) + self.br)  # reset gate
        z = T.nnet.sigmoid(x_t.dot(self.Wxz) + h_t1.dot(self.Whz) + self.bz)  # update gate
        hhat = self.f(x_t.dot(self.Wxh) + (r * h_t1).dot(self.Whh) + self.bh)  # candidate state
        h = (1 - z) * h_t1 + z * hhat
        return h

    def output(self, x):
        # input X should be a matrix (2-D)
        # rows index time
        h, _ = theano.scan(
            fn=self.recurrence,
            sequences=x,
            outputs_info=[self.h0],
            n_steps=x.shape[0],
        )
        return h
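
For reference, a minimal sketch of driving this unit standalone (the sizes and inputs here are illustrative, not from the commit): compile output into a Theano function and feed it a (T, Mi) sequence.

import numpy as np
import theano
import theano.tensor as T
from gru import GRU

X = T.matrix('X')        # (T, Mi): rows index time
gru = GRU(4, 8, T.tanh)  # Mi=4, Mo=8; activation is used for the candidate state
h = gru.output(X)        # (T, Mo) hidden states via theano.scan
f = theano.function(inputs=[X], outputs=h, allow_input_downcast=True)

x = np.random.randn(10, 4)  # toy sequence of length 10
print(f(x).shape)           # (10, 8)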

rnn_class/gru_nonorm_part1_wikipedia_word2idx.json

+1 (large diff not rendered by default)

1.22 MB binary file not shown.

rnn_class/lstm.py

+88
@@ -0,0 +1,88 @@
import numpy as np
import theano
import theano.tensor as T

from util import init_weight


class LSTM:
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.f = activation

        # numpy init
        Wxi = init_weight(Mi, Mo)
        Whi = init_weight(Mo, Mo)
        Wci = init_weight(Mo, Mo)
        bi = np.zeros(Mo)
        Wxf = init_weight(Mi, Mo)
        Whf = init_weight(Mo, Mo)
        Wcf = init_weight(Mo, Mo)
        bf = np.zeros(Mo)
        Wxc = init_weight(Mi, Mo)
        Whc = init_weight(Mo, Mo)
        bc = np.zeros(Mo)
        Wxo = init_weight(Mi, Mo)
        Who = init_weight(Mo, Mo)
        Wco = init_weight(Mo, Mo)
        bo = np.zeros(Mo)
        c0 = np.zeros(Mo)
        h0 = np.zeros(Mo)

        # theano vars
        self.Wxi = theano.shared(Wxi)
        self.Whi = theano.shared(Whi)
        self.Wci = theano.shared(Wci)
        self.bi = theano.shared(bi)
        self.Wxf = theano.shared(Wxf)
        self.Whf = theano.shared(Whf)
        self.Wcf = theano.shared(Wcf)
        self.bf = theano.shared(bf)
        self.Wxc = theano.shared(Wxc)
        self.Whc = theano.shared(Whc)
        self.bc = theano.shared(bc)
        self.Wxo = theano.shared(Wxo)
        self.Who = theano.shared(Who)
        self.Wco = theano.shared(Wco)
        self.bo = theano.shared(bo)
        self.c0 = theano.shared(c0)
        self.h0 = theano.shared(h0)
        self.params = [
            self.Wxi,
            self.Whi,
            self.Wci,
            self.bi,
            self.Wxf,
            self.Whf,
            self.Wcf,
            self.bf,
            self.Wxc,
            self.Whc,
            self.bc,
            self.Wxo,
            self.Who,
            self.Wco,
            self.bo,
            self.c0,
            self.h0,
        ]

    def recurrence(self, x_t, h_t1, c_t1):
        # input, forget, and output gates, with peephole connections to the cell state
        i_t = T.nnet.sigmoid(x_t.dot(self.Wxi) + h_t1.dot(self.Whi) + c_t1.dot(self.Wci) + self.bi)
        f_t = T.nnet.sigmoid(x_t.dot(self.Wxf) + h_t1.dot(self.Whf) + c_t1.dot(self.Wcf) + self.bf)
        c_t = f_t * c_t1 + i_t * T.tanh(x_t.dot(self.Wxc) + h_t1.dot(self.Whc) + self.bc)
        o_t = T.nnet.sigmoid(x_t.dot(self.Wxo) + h_t1.dot(self.Who) + c_t.dot(self.Wco) + self.bo)
        h_t = o_t * T.tanh(c_t)
        return h_t, c_t

    def output(self, x):
        # input X should be a matrix (2-D)
        # rows index time
        [h, c], _ = theano.scan(
            fn=self.recurrence,
            sequences=x,
            outputs_info=[self.h0, self.c0],
            n_steps=x.shape[0],
        )
        return h
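
Because both units expose the same (Mi, Mo, activation) constructor and output(x) method, they can be stacked or swapped freely; this is exactly what wiki.py's hidden-layer loop relies on. A minimal sketch (layer sizes are illustrative):

import theano.tensor as T
from gru import GRU
from lstm import LSTM

X = T.matrix('X')  # (T, D) input sequence
layers = []
Mi = 30
for Mo, Unit in [(30, GRU), (30, LSTM)]:  # mix unit types freely
    layers.append(Unit(Mi, Mo, T.nnet.relu))
    Mi = Mo

Z = X
for layer in layers:
    Z = layer.output(Z)  # still (T, Mo); rows keep indexing time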

rnn_class/wiki.py

+182
@@ -0,0 +1,182 @@
import sys
import theano
import theano.tensor as T
import numpy as np
import matplotlib.pyplot as plt
import json

from datetime import datetime
from sklearn.utils import shuffle
from gru import GRU
from lstm import LSTM
from util import init_weight, get_wikipedia_data


class RNN:
    def __init__(self, D, hidden_layer_sizes, V):
        self.hidden_layer_sizes = hidden_layer_sizes
        self.D = D
        self.V = V

    def fit(self, X, learning_rate=10e-5, mu=0.99, epochs=10, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU, normalize=True):
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, V)
        bo = np.zeros(V)

        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        thX = T.ivector('X')
        thY = T.ivector('Y')

        Z = self.We[thX]  # embedding lookup: (T,) word indices -> (T, D)
        for ru in self.hidden_layers:
            Z = ru.output(Z)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        # let's return py_x too so we can draw a sample instead
        self.predict_op = theano.function(
            inputs=[thX],
            outputs=[py_x, prediction],
            allow_input_downcast=True,
        )

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]  # momentum terms

        # the embedding matrix gets its own update so it can be normalized
        dWe = theano.shared(self.We.get_value()*0)
        gWe = T.grad(cost, self.We)
        dWe_update = mu*dWe - learning_rate*gWe
        We_update = self.We + dWe_update
        if normalize:
            # divide each embedding row by its sum after the update
            We_update /= We_update.sum(axis=1).dimshuffle(0, 'x')

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ] + [
            (self.We, We_update), (dWe, dWe_update)
        ]

        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        for i in xrange(epochs):
            t0 = datetime.now()
            X = shuffle(X)
            n_correct = 0
            n_total = 0
            cost = 0
            for j in xrange(N):
                # we set 0 to START and 1 to END; occasionally train on
                # predicting the END token, otherwise just predict the next word
                if np.random.random() < 0.01 or len(X[j]) <= 1:
                    input_sequence = [0] + X[j]
                    output_sequence = X[j] + [1]
                else:
                    input_sequence = [0] + X[j][:-1]
                    output_sequence = X[j]
                n_total += len(output_sequence)

                try:
                    c, p = self.train_op(input_sequence, output_sequence)
                except Exception as e:
                    PYX, pred = self.predict_op(input_sequence)
                    print "input_sequence len:", len(input_sequence)
                    print "PYX.shape:", PYX.shape
                    print "pred.shape:", pred.shape
                    raise e
                cost += c
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
                if j % 200 == 0:
                    sys.stdout.write("j/N: %d/%d correct rate so far: %f\r" % (j, N, float(n_correct)/n_total))
                    sys.stdout.flush()
            print "i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0)
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()


def train_wikipedia(we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json', RecurrentUnit=GRU):
    # there are 32 files
    sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    print "finished retrieving data"
    print "vocab size:", len(word2idx), "number of sentences:", len(sentences)
    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate=10e-6, epochs=10, show_fig=True, activation=T.nnet.relu, RecurrentUnit=RecurrentUnit)

    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)


def generate_wikipedia():
    pass


def find_analogies(w1, w2, w3, we_file='word_embeddings.npy', w2i_file='wikipedia_word2idx.json'):
    We = np.load(we_file)
    with open(w2i_file) as f:
        word2idx = json.load(f)

    king = We[word2idx[w1]]
    man = We[word2idx[w2]]
    woman = We[word2idx[w3]]
    v0 = king - man + woman

    def dist1(a, b):
        return np.linalg.norm(a - b)

    def dist2(a, b):
        return 1 - a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

    for dist, name in [(dist1, 'Euclidean'), (dist2, 'cosine')]:
        min_dist = float('inf')
        best_word = ''
        for word, idx in word2idx.iteritems():
            if word not in (w1, w2, w3):
                v1 = We[idx]
                d = dist(v0, v1)
                if d < min_dist:
                    min_dist = d
                    best_word = word
        print "closest match by", name, "distance:", best_word
        print w1, "-", w2, "=", best_word, "-", w3


if __name__ == '__main__':
    train_wikipedia()  # GRU
    # train_wikipedia(RecurrentUnit=LSTM)
    find_analogies('king', 'man', 'woman')
    find_analogies('france', 'paris', 'london')
    find_analogies('france', 'paris', 'rome')
    find_analogies('paris', 'france', 'italy')
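
util.init_weight and get_wikipedia_data are imported above but are not part of this diff. A hedged stand-in for init_weight, enough to exercise gru.py and lstm.py on their own (the fan-in/fan-out scaling here is an assumption, not necessarily what the repo's util.py does):

import numpy as np

def init_weight(Mi, Mo):
    # hypothetical stand-in for util.init_weight: small random weights
    # scaled by the layer sizes; the repo's exact scaling may differ
    return np.random.randn(Mi, Mo) / np.sqrt(Mi + Mo)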
