
Commit 2720588

passes grad checks with beta. just circular conv and gamma left
1 parent cc695eb commit 2720588

3 files changed (+91, -17 lines)

ntm/addressing.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ def K(u):
     Given the key vector k_t, compute our sim
     function between k_t and u and exponentiate.
     """
-    return np.exp(np.array([[1.0]]) * cosine_sim(u, k_t))
+    return np.exp(b_t * cosine_sim(u, k_t))

  # Apply above function to every row in the matrix
  # This is surely much slower than it needs to be
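With b_t in place of the hard-coded 1.0, K(u) computes the exponentiated, strength-scaled similarity of Section 3.3.1 of the NTM paper. Normalizing over the memory rows then gives the content weighting

    w_c(i) = exp(b_t * K[k_t, M(i)]) / sum_j exp(b_t * K[k_t, M(j)])

i.e. equations (5)-(6), which is also what the new K_focus helper in util/util.py below computes from precomputed similarities.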

ntm/ntm.py

Lines changed: 46 additions & 12 deletions
@@ -3,8 +3,8 @@
 """
 import math
 import autograd.numpy as np
-from autograd import grad
-from util.util import rando, sigmoid, softmax, softplus, unwrap, sigmoid_prime, tanh_prime, compare_deltas, dKdu, softmax_grads
+from autograd import grad, jacobian
+from util.util import rando, sigmoid, softmax, softplus, unwrap, sigmoid_prime, tanh_prime, compare_deltas, dKdu, softmax_grads, beta_grads, K_focus
 import memory
 import addressing
 from addressing import cosine_sim
@@ -107,10 +107,11 @@ def l():
         return {}

       rs = l()
-      zk_rs = l()
+      zk_rs = l() # TODO: why only z for ks?
       k_rs, beta_rs, g_rs, s_rs, gamma_rs = l(),l(),l(),l(),l()
       k_ws, beta_ws, g_ws, s_ws, gamma_ws = l(),l(),l(),l(),l()
       adds, erases = l(),l()
+      zbeta_rs, zbeta_ws = l(),l()
       w_ws, w_rs = l(),l() # read weights and write weights
       wc_ws, wc_rs = l(),l() # read and write content weights
       rs[-1] = self.W['rsInit'] # stores values read from memory
@@ -135,17 +136,17 @@ def l():
         # parameters to the read head
         zk_rs[t] = np.dot(W['ok_r'],os[t]) + W['bk_r']
         k_rs[t] = np.tanh(zk_rs[t])
-        beta_rs[t] = softplus(np.dot(W['obeta_r'],os[t])
-                              + W['bbeta_r'])
+        zbeta_rs[t] = np.dot(W['obeta_r'],os[t]) + W['bbeta_r']
+        beta_rs[t] = softplus(zbeta_rs[t])
         g_rs[t] = sigmoid(np.dot(W['og_r'],os[t]) + W['bg_r'])
         s_rs[t] = softmax(np.dot(W['os_r'],os[t]) + W['bs_r'])
         gamma_rs[t] = 1 + sigmoid(np.dot(W['ogamma_r'], os[t])
                                   + W['bgamma_r'])

         # parameters to the write head
         k_ws[t] = np.tanh(np.dot(W['ok_w'],os[t]) + W['bk_w'])
-        beta_ws[t] = softplus(np.dot(W['obeta_w'], os[t])
-                              + W['bbeta_w'])
+        zbeta_ws[t] = np.dot(W['obeta_w'],os[t]) + W['bbeta_w']
+        beta_ws[t] = softplus(zbeta_ws[t])
         g_ws[t] = sigmoid(np.dot(W['og_w'],os[t]) + W['bg_w'])
         s_ws[t] = softmax(np.dot(W['os_w'],os[t]) + W['bs_w'])
         gamma_ws[t] = 1 + sigmoid(np.dot(W['ogamma_w'], os[t])
@@ -191,7 +192,8 @@ def l():
         mems[t] = memory.write(mems[t-1],w_ws[t],erases[t],adds[t])

       self.stats = [loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs,
-                    w_ws, adds, erases, k_rs, k_ws, g_rs, g_ws, wc_rs, wc_ws]
+                    w_ws, adds, erases, k_rs, k_ws, g_rs, g_ws, wc_rs, wc_ws,
+                    zbeta_rs, zbeta_ws]
       return np.sum(loss)

    def manual_grads(params):
@@ -204,7 +206,8 @@ def manual_grads(params):
        deltas[key] = np.zeros_like(val)

      [loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs,
-      w_ws, adds, erases, k_rs, k_ws, g_rs, g_ws, wc_rs, wc_ws] = self.stats
+      w_ws, adds, erases, k_rs, k_ws, g_rs, g_ws, wc_rs, wc_ws,
+      zbeta_rs, zbeta_ws] = self.stats
      dd = {}
      drs = {}
      dzh = {}
@@ -305,8 +308,8 @@ def manual_grads(params):
          for i in range(self.N):
            # for every element in the weighting
            for j in range(self.N):
-             dwdK_r[i,j] += softmax_grads(K_rs, i, j)
-             dwdK_w[i,j] += softmax_grads(K_ws, i, j)
+             dwdK_r[i,j] += softmax_grads(K_rs, softplus(zbeta_rs[t]), i, j)
+             dwdK_w[i,j] += softmax_grads(K_ws, softplus(zbeta_ws[t]), i, j)

          # compute dK for all i in N
          # K is the evaluated cosine similarity for the i-th row of mem matrix
@@ -317,6 +320,7 @@ def manual_grads(params):
          for i in range(self.N):
            # for every j in N (for every elt of the weighting)
            for j in range(self.N):
+             # specifically, dwdK_r will change, and for write as well
              dK_r[i] += dwc_r[j] * dwdK_r[i,j]
              dK_w[i] += dwc_w[j] * dwdK_w[i,j]

@@ -397,6 +401,32 @@ def manual_grads(params):
          deltas['bg_r'] += dzg_r
          deltas['bg_w'] += dzg_w

+         # compute dbeta, which affects w_content through interaction with Ks
+
+         dwcdbeta_r = np.zeros_like(w_rs[0])
+         dwcdbeta_w = np.zeros_like(w_ws[0])
+         for i in range(self.N):
+           dwcdbeta_r[i] = beta_grads(K_rs, softplus(zbeta_rs[t]), i)
+           dwcdbeta_w[i] = beta_grads(K_ws, softplus(zbeta_ws[t]), i)
+
+         # import pdb; pdb.set_trace()
+         dbeta_r = np.zeros_like(zbeta_rs[0])
+         dbeta_w = np.zeros_like(zbeta_ws[0])
+         for i in range(self.N):
+           dbeta_r[0] += dwc_r[i] * dwcdbeta_r[i]
+           dbeta_w[0] += dwc_w[i] * dwcdbeta_w[i]
+
+
+         # beta is activated from zbeta by softplus, grad of which is sigmoid
+         dzbeta_r = dbeta_r * sigmoid(zbeta_rs[t])
+         dzbeta_w = dbeta_w * sigmoid(zbeta_ws[t])
+
+         deltas['obeta_r'] += np.dot(dzbeta_r, os[t].T)
+         deltas['obeta_w'] += np.dot(dzbeta_w, os[t].T)
+
+         deltas['bbeta_r'] += dzbeta_r
+         deltas['bbeta_w'] += dzbeta_w
+
        else:
          drs[t] = np.zeros_like(rs[0])
          dmemtilde[t] = np.zeros_like(mems[0])
@@ -417,6 +447,9 @@ def manual_grads(params):
        # and also through the interpolators
        do += np.dot(params['og_r'].T, dzg_r)
        do += np.dot(params['og_w'].T, dzg_w)
+       # and also through beta
+       do += np.dot(params['obeta_r'].T, dzbeta_r)
+       do += np.dot(params['obeta_w'].T, dzbeta_w)


        # compute deriv w.r.t. pre-activation of o
@@ -479,6 +512,7 @@ def bprop(params, manual_grad):

    deltas = bprop(self.W, manual_grad)
    [loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs,
-    w_ws, adds, erases, k_rs, k_ws, g_rs, g_ws, wc_rs, wc_ws] = map(unwrap, self.stats)
+    w_ws, adds, erases, k_rs, k_ws, g_rs, g_ws, wc_rs, wc_ws,
+    zbeta_rs, zbeta_ws] = map(unwrap, self.stats)

    return loss, deltas, ps, w_rs, w_ws, adds, erases
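For reference, the new beta block in manual_grads follows the chain rule through the content weighting w_c = softmax(beta * K):

    dL/dbeta  = sum_i dL/dwc[i] * dwc[i]/dbeta     (accumulated via beta_grads)
    dL/dzbeta = dL/dbeta * sigmoid(zbeta)          (beta = softplus(zbeta), and softplus'(z) = sigmoid(z))
    deltas['obeta_*'] += dL/dzbeta . o_t^T,  deltas['bbeta_*'] += dL/dzbeta

which matches the order of the added statements above, and explains why dzbeta_* also feeds back into do through the 'obeta_*' weights.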

util/util.py

Lines changed: 44 additions & 4 deletions
@@ -234,14 +234,54 @@ def dKdu(u, v):
  c = cosine_sim(u,v)
  return a - b*c

-def softmax_grads(Ks, i, j):
+def softmax_grads(Ks, beta, i, j):
  """
  return the grad of the ith element of weighting w.r.t. j-th element of Ks
  """
  if j == i:
-   num = np.exp(Ks[i]) * (np.sum(np.exp(Ks)) - np.exp(Ks[i]))
+   num = beta*np.exp(Ks[i]*beta) * (np.sum(np.exp(Ks*beta)) - np.exp(Ks[i]*beta))
  else:
-   num = -np.exp(Ks[i] + Ks[j])
- den1 = np.sum(np.exp(Ks))
+   num = -beta*np.exp(Ks[i]*beta + Ks[j]*beta)
+ den1 = np.sum(np.exp(Ks*beta))
  return num / (den1 * den1)

+def beta_grads(Ks, beta, i):
+  Karr = np.array(Ks)
+  anum = Ks[i] * np.exp(Ks[i] * beta)
+  aden = np.sum(np.exp(beta * Karr))
+  a = anum / aden
+
+  bnum = np.exp(Ks[i] * beta) * (np.sum(np.multiply(Karr, np.exp(Karr * beta))))
+  bden = aden * aden
+  b = bnum / bden
+  return a - b
+
+def K_focus(Ks, b_t):
+  """
+  The content-addressing method described in 3.3.1.
+  Specifically, this is equations (5) and (6).
+  k_t is the similarity key vector.
+  b_t is the similarity key strength.
+  memObject is a ref to our NTM memory object.
+  """
+  def F(K):
+    """
+    Given the key vector k_t, compute our sim
+    function between k_t and u and exponentiate.
+    """
+    return np.exp(b_t * K)
+
+  # Apply above function to every row in the matrix
+  # This is surely much slower than it needs to be
+  l = []
+  for K in Ks:
+    l.append(F(K))
+
+  # Return the normalized similarity weights
+  # This is essentially a softmax over the similarities
+  # with an extra degree of freedom parametrized by b_t
+  sims = np.array(l)
+
+  n = sims
+  d = np.sum(sims)
+  return n/d
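A finite-difference comparison of the kind the commit message alludes to might look like the sketch below. It assumes util/util.py from this commit is importable; the test values and helper names (eps, fd) are illustrative, not part of the repo.

# Finite-difference check of the beta-aware gradients (illustrative sketch).
import numpy as np
from util.util import softmax_grads, beta_grads, K_focus

eps = 1e-5
Ks = np.random.randn(8)    # stand-in cosine similarities for N = 8 memory rows
beta = 1.7                 # a positive key strength

for i in range(len(Ks)):
  # dw[i]/dK[j]: compare softmax_grads against central differences on K_focus
  for j in range(len(Ks)):
    Kp, Km = Ks.copy(), Ks.copy()
    Kp[j] += eps
    Km[j] -= eps
    fd = (K_focus(Kp, beta)[i] - K_focus(Km, beta)[i]) / (2 * eps)
    assert abs(fd - softmax_grads(Ks, beta, i, j)) < 1e-5

  # dw[i]/dbeta: compare beta_grads the same way
  fd = (K_focus(Ks, beta + eps)[i] - K_focus(Ks, beta - eps)[i]) / (2 * eps)
  assert abs(fd - beta_grads(Ks, beta, i)) < 1e-5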
