# not considering context
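#
# Baseline tagger: each word is classified from its identity alone, ignoring
# the surrounding words (hence "not considering context"). The data files
# 'chunking/train.txt' and 'chunking/test.txt' are read in what appears to be
# the CoNLL-2000 chunking format: one "word tag chunk" triple per line, with
# blank lines separating sentences.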

import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
# from sklearn.linear_model import LogisticRegression


class LogisticRegression:
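    """
    Multinomial logistic regression over word identity, implemented in Theano
    and trained by minibatch gradient descent with momentum. The name mirrors
    the commented-out sklearn import above, which this class stands in for.
    """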
    def __init__(self):
        pass

    def fit(self, X, Y, V=None, K=None, D=50, lr=10e-1, mu=0.99, batch_sz=100, epochs=6):
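        """
        X: 1-D array of word indices; Y: 1-D array of tag indices.
        V (vocabulary size) and K (number of tags) are inferred from the data
        when not given. D is an embedding dimension used only by the
        commented-out embedding variant below. Note lr=10e-1 is just 1.0.
        """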
        if V is None:
            V = len(set(X))
        if K is None:
            K = len(set(Y))
        N = len(X)

        # We = np.random.randn(V, D) / np.sqrt(V + D)
        # W = np.random.randn(D, K) / np.sqrt(D + K)
        W = np.random.randn(V, K) / np.sqrt(V + K)
        b = np.zeros(K)
        self.W = theano.shared(W)
        self.b = theano.shared(b)
        self.params = [self.W, self.b]

        thX = T.ivector('X')
        thY = T.ivector('Y')

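        # Indexing W by thX selects one row of W per input word; for a
        # one-hot input this is equivalent to (and far cheaper than)
        # multiplying an N x V indicator matrix by W.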
        py_x = T.nnet.softmax(self.W[thX] + self.b)
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))  # mean cross-entropy
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]  # momentum velocities
        self.cost_predict_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

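        # classical momentum: new velocity dp <- mu*dp - lr*g, and the
        # parameter step p <- p + (new velocity) is applied in the same update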
        updates = [
            (p, p + mu*dp - lr*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - lr*g) for dp, g in zip(dparams, grads)
        ]
        train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates,
            allow_input_downcast=True,
        )

        costs = []
        n_batches = N // batch_sz  # integer division; any final partial batch is dropped
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            print("epoch:", i)
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)]
                Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)]

                c, p = train_op(Xbatch, Ybatch)
                costs.append(c)
                if j % 200 == 0:
                    print("i:", i, "j:", j, "n_batches:", n_batches, "cost:", c, "error:", np.mean(p != Ybatch))
        plt.plot(costs)
        plt.show()

    def score(self, X, Y):
        _, p = self.cost_predict_op(X, Y)
        return np.mean(p == Y)

    def f1_score(self, X, Y):
        _, p = self.cost_predict_op(X, Y)
        # average=None returns per-class F1 scores; the mean macro-averages them
        return f1_score(Y, p, average=None).mean()


def get_data(split_sequences=False):
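    """
    Maps words and tags to integer indices. Returns flat index lists, or,
    with split_sequences=True, one list of indices per sentence. Test words
    never seen in training all map to a single reserved unknown index.
    """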
    word2idx = {}
    tag2idx = {}
    word_idx = 0
    tag_idx = 0
    Xtrain = []
    Ytrain = []
    currentX = []
    currentY = []
    for line in open('chunking/train.txt'):
        line = line.rstrip()
        if line:
            r = line.split()
            word, tag, _ = r
            if word not in word2idx:
                word2idx[word] = word_idx
                word_idx += 1
            currentX.append(word2idx[word])

            if tag not in tag2idx:
                tag2idx[tag] = tag_idx
                tag_idx += 1
            currentY.append(tag2idx[tag])
        elif split_sequences:
            # blank line = sentence boundary (assumes the file ends with a
            # blank line; otherwise the final sentence is dropped)
            Xtrain.append(currentX)
            Ytrain.append(currentY)
            currentX = []
            currentY = []

    if not split_sequences:
        Xtrain = currentX
        Ytrain = currentY

    # load the test data the same way, reusing the training index maps
    Xtest = []
    Ytest = []
    currentX = []
    currentY = []
    for line in open('chunking/test.txt'):
        line = line.rstrip()
        if line:
            r = line.split()
            word, tag, _ = r
            if word in word2idx:
                currentX.append(word2idx[word])
            else:
                currentX.append(word_idx)  # reserved index for unknown words
            currentY.append(tag2idx[tag])  # assumes every test tag was seen in training
        elif split_sequences:
            Xtest.append(currentX)
            Ytest.append(currentY)
            currentX = []
            currentY = []
    if not split_sequences:
        Xtest = currentX
        Ytest = currentY
    return Xtrain, Ytrain, Xtest, Ytest, word2idx


def main():
    Xtrain, Ytrain, Xtest, Ytest, word2idx = get_data()

    # convert to numpy arrays
    Xtrain = np.array(Xtrain)
    Ytrain = np.array(Ytrain)

    N = len(Xtrain)
    V = len(word2idx) + 1  # +1 for the reserved unknown-word index
    print("vocabulary size:", V)

    # convert Xtrain to an indicator (one-hot) matrix
    # Xtrain_indicator = np.zeros((N, V))
    # Xtrain_indicator[np.arange(N), Xtrain] = 1
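    # (the indicator matrix is left commented out: it would be a dense N x V
    # array, while the Theano model gets the same effect by row-indexing W)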

    # decision tree
    from sklearn.tree import DecisionTreeClassifier
    dt = DecisionTreeClassifier()

    # without indicator
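    # note: word indices are arbitrary integer IDs, so threshold splits on
    # them carry no linguistic meaning; the tree can still score well on
    # training data by memorizing index ranges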
    dt.fit(Xtrain.reshape(N, 1), Ytrain)
    print("dt score:", dt.score(Xtrain.reshape(N, 1), Ytrain))

    # with indicator -- too slow!!
    # dt.fit(Xtrain_indicator, Ytrain)
    # print("dt score:", dt.score(Xtrain_indicator, Ytrain))

    # train and score the logistic regression model
    model = LogisticRegression()
    model.fit(Xtrain, Ytrain, V=V)
    print("training complete")
    print("train score:", model.score(Xtrain, Ytrain))

    Ntest = len(Xtest)
    Xtest = np.array(Xtest)
    Ytest = np.array(Ytest)
    # convert Xtest to an indicator matrix
    # Xtest_indicator = np.zeros((Ntest, V))
    # Xtest_indicator[np.arange(Ntest), Xtest] = 1

    # decision tree test score
    print("dt test score:", dt.score(Xtest.reshape(Ntest, 1), Ytest))
    # print("dt test score:", dt.score(Xtest_indicator, Ytest))  # too slow!

    # logistic regression test score
    print("test score:", model.score(Xtest, Ytest))


if __name__ == '__main__':
    main()