# Course URL:
# https://deeplearningcourses.com/c/natural-language-processing-with-deep-learning-in-python
# https://udemy.com/natural-language-processing-with-deep-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


import os
import json
import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.utils import shuffle
from word2vec import get_wikipedia_data, find_analogies, get_sentences_with_word2idx_limit_vocab

# using ALS, what's the least # of files needed to get correct analogies?
# use this for word2vec training to make it faster
# first tried 20 files --> not enough
# how about 30 files --> some correct but still not enough
# 40 files --> half right but 50 is better


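# classical momentum: each parameter keeps a velocity that accumulates
# gradients, i.e. v <- mu*v - lr*grad and p <- p + v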
def momentum_updates(cost, params, lr=1e-4, mu=0.9):
    grads = T.grad(cost, params)
    velocities = [theano.shared(
        np.zeros_like(p.get_value()).astype(np.float32)
    ) for p in params]
    # updates = [(p, p - learning_rate*g) for p, g in zip(params, grads)]
    updates = []
    for p, v, g in zip(params, velocities, grads):
        newv = mu*v - lr*g
        newp = p + newv
        updates.append((p, newp))
        updates.append((v, newv))
    return updates


class Glove:
    def __init__(self, D, V, context_sz):
        self.D = D
        self.V = V
        self.context_sz = context_sz

    def fit(self, sentences, cc_matrix=None, learning_rate=1e-4, reg=0.1, xmax=100, alpha=0.75, epochs=10, gd=False, use_theano=False, use_tensorflow=False):
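        # xmax=100 and alpha=0.75 are the weighting-function defaults from the
        # GloVe paper; note that gd, use_theano, and use_tensorflow are not
        # actually used in this Theano version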
        # build the co-occurrence matrix
        # the paper calls it X, so we call it X too (rather than using X
        # for the training data)
        # TODO: would it be better to use a sparse matrix?
        t0 = datetime.now()
        V = self.V
        D = self.D

        if not os.path.exists(cc_matrix):
            X = np.zeros((V, V))
            N = len(sentences)
            print("number of sentences to process:", N)
            it = 0
            for sentence in sentences:
                it += 1
                if it % 10000 == 0:
                    print("processed", it, "/", N)
                n = len(sentence)
                for i in range(n):
                    # i and j are positions within the sentence, not word indices;
                    # the word indices are sentence[i] and sentence[j]
                    wi = sentence[i]

                    start = max(0, i - self.context_sz)
                    end = min(n, i + self.context_sz)

                    # we could use only one side as context, but here we use both

                    # make sure the "start" and "end" tokens are part of some context,
                    # otherwise their f(X) will be 0 (denominator in bias update)
                    if i - self.context_sz < 0:
                        points = 1.0 / (i + 1)
                        X[wi,0] += points
                        X[0,wi] += points
                    if i + self.context_sz > n:
                        points = 1.0 / (n - i)
                        X[wi,1] += points
                        X[1,wi] += points

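                    # closer context words contribute more: each co-occurrence is
                    # weighted by 1 / distance, matching the decreasing weighting
                    # described in the GloVe paper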
                    # left side
                    for j in range(start, i):
                        wj = sentence[j]
                        points = 1.0 / (i - j)  # this is +ve
                        X[wi,wj] += points
                        X[wj,wi] += points

                    # right side
                    for j in range(i + 1, end):
                        wj = sentence[j]
                        points = 1.0 / (j - i)  # this is +ve
                        X[wi,wj] += points
                        X[wj,wi] += points

            # save the cc matrix because it takes forever to create
            np.save(cc_matrix, X)
        else:
            X = np.load(cc_matrix)

        print("max in X:", X.max())

        # weighting
        fX = np.zeros((V, V))
        fX[X < xmax] = (X[X < xmax] / float(xmax)) ** alpha
        fX[X >= xmax] = 1

        print("max in f(X):", fX.max())

        # target
        logX = np.log(X + 1)
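        # note: log(X + 1) keeps the target finite where X is 0; those entries
        # don't affect training anyway, because f(0) = 0 zeroes out their cost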

        # cast
        fX = fX.astype(np.float32)
        logX = logX.astype(np.float32)

        print("max in log(X):", logX.max())

        print("time to build co-occurrence matrix:", (datetime.now() - t0))

        # initialize weights
        W = np.random.randn(V, D) / np.sqrt(V + D)
        b = np.zeros(V)
        U = np.random.randn(V, D) / np.sqrt(V + D)
        c = np.zeros(V)
        mu = logX.mean()

        # theano shared variables for the weights/biases, and symbolic inputs
        # for the weighting matrix f(X) and the target log(X)
        thW = theano.shared(W.astype(np.float32))
        thb = theano.shared(b.astype(np.float32))
        thU = theano.shared(U.astype(np.float32))
        thc = theano.shared(c.astype(np.float32))
        thLogX = T.matrix('logX')
        thfX = T.matrix('fX')

        params = [thW, thb, thU, thc]

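        # weighted least-squares objective from the GloVe paper:
        #   J = sum_ij f(X_ij) * (w_i . u_j + b_i + c_j + mu - log X_ij)^2
        # where mu (the mean of log X) is used as a fixed global offset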
        thDelta = thW.dot(thU.T) + T.reshape(thb, (V, 1)) + T.reshape(thc, (1, V)) + mu - thLogX
        thCost = ( thfX * thDelta * thDelta ).sum()

        # regularization
        regularized_cost = thCost + reg*((thW * thW).sum() + (thU * thU).sum())

        # grads = T.grad(regularized_cost, params)
        # updates = [(p, p - learning_rate*g) for p, g in zip(params, grads)]
        updates = momentum_updates(regularized_cost, params, learning_rate)

        train_op = theano.function(
            inputs=[thfX, thLogX],
            updates=updates,
        )

        # track the unregularized cost so the plot reflects the data fit only
        cost_op = theano.function(inputs=[thfX, thLogX], outputs=thCost)

        costs = []
        for epoch in range(epochs):
            # full-batch training: one gradient step per epoch over all (i, j) pairs
            train_op(fX, logX)
            cost = cost_op(fX, logX)
            costs.append(cost)
            print("epoch:", epoch, "cost:", cost)

        self.W = thW.get_value()
        self.U = thU.get_value()

        plt.plot(costs)
        plt.show()

    def save(self, fn):
        # find_analogies expects a (V, D) matrix and a (D, V) matrix
        arrays = [self.W, self.U.T]
        np.savez(fn, *arrays)

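
# a minimal loading sketch (not part of the original course code): np.savez
# with positional arguments stores the arrays under 'arr_0' and 'arr_1',
# so the saved file holds W with shape (V, D) and U.T with shape (D, V)
def load_embeddings(fn):
    npz = np.load(fn)
    W = npz['arr_0']   # word embedding matrix, (V, D)
    UT = npz['arr_1']  # context embeddings, transposed, (D, V)
    return W, UT
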

def main(we_file, w2i_file, use_brown=True, n_files=50):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking whether we need to re-load the raw data
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)  # D=100, context size of 10
    model.fit(
        sentences,
        cc_matrix=cc_matrix,
        learning_rate=1e-4,
        reg=0.1,
        epochs=200,
    )
    model.save(we_file)


if __name__ == '__main__':
    we = 'glove_model_50.npz'
    w2i = 'glove_word2idx_50.json'
    # we = 'glove_model_brown.npz'
    # w2i = 'glove_word2idx_brown.json'
    main(we, w2i, use_brown=False)
    for concat in (True, False):
        print("** concat:", concat)
        find_analogies('king', 'man', 'woman', concat, we, w2i)
        find_analogies('france', 'paris', 'london', concat, we, w2i)
        find_analogies('france', 'paris', 'rome', concat, we, w2i)
        find_analogies('paris', 'france', 'italy', concat, we, w2i)
        find_analogies('france', 'french', 'english', concat, we, w2i)
        find_analogies('japan', 'japanese', 'chinese', concat, we, w2i)
        find_analogies('japan', 'japanese', 'italian', concat, we, w2i)
        find_analogies('japan', 'japanese', 'australian', concat, we, w2i)
        find_analogies('december', 'november', 'june', concat, we, w2i)