Updated Chapter06
dinesh-packt authored Apr 25, 2017
1 parent f558d37 commit c10bb70
Showing 7 changed files with 852 additions and 0 deletions.
106 changes: 106 additions & 0 deletions Chapter06/alice_chargen_rnn.py
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
# Adapted from lstm_text_generation.py in keras/examples
from __future__ import print_function
from keras.layers.recurrent import SimpleRNN
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy as np

INPUT_FILE = "../data/alice_in_wonderland.txt"

# extract the input as a stream of characters
print("Extracting text from input...")
fin = open(INPUT_FILE, 'rb')
lines = []
for line in fin:
    line = line.strip().lower()
    line = line.decode("ascii", "ignore")
    if len(line) == 0:
        continue
    lines.append(line)
fin.close()
text = " ".join(lines)

# create lookup tables
# chars is the set of distinct characters in the text; its size nb_chars
# is the number of features in our character "vocabulary"
chars = set([c for c in text])
nb_chars = len(chars)
char2index = dict((c, i) for i, c in enumerate(chars))
index2char = dict((i, c) for i, c in enumerate(chars))

# create inputs and labels from the text. We do this by stepping
# through the text ${step} character at a time, and extracting a
# sequence of size ${seqlen} and the next output char. For example,
# assuming an input text "The sky was falling", we would get the
# following sequence of input_chars and label_chars (first 5 only)
#   "The sky wa" -> "s"
#   "he sky was" -> " "
#   "e sky was " -> "f"
#   " sky was f" -> "a"
#   "sky was fa" -> "l"
print("Creating input and label text...")
SEQLEN = 10
STEP = 1

input_chars = []
label_chars = []
for i in range(0, len(text) - SEQLEN, STEP):
    input_chars.append(text[i:i + SEQLEN])
    label_chars.append(text[i + SEQLEN])

# vectorize the input and label chars
# Each row of the input is a sequence of SEQLEN characters, each
# represented as a 1-hot encoding of size nb_chars. There are
# len(input_chars) such rows, so shape(X) is (len(input_chars),
# SEQLEN, nb_chars).
# Each row of the output is a single character, also represented as a
# 1-hot encoding of size nb_chars. Hence shape(y) is (len(input_chars),
# nb_chars).
print("Vectorizing input and label text...")
X = np.zeros((len(input_chars), SEQLEN, nb_chars), dtype=np.bool)
y = np.zeros((len(input_chars), nb_chars), dtype=np.bool)
for i, input_char in enumerate(input_chars):
    for j, ch in enumerate(input_char):
        X[i, j, char2index[ch]] = 1
    y[i, char2index[label_chars[i]]] = 1

# Build the model. We use a single RNN with a fully connected layer
# to compute the most likely predicted output char
HIDDEN_SIZE = 128
BATCH_SIZE = 128
NUM_ITERATIONS = 25
NUM_EPOCHS_PER_ITERATION = 1
NUM_PREDS_PER_EPOCH = 100

model = Sequential()
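# single SimpleRNN layer; return_sequences=False emits only the output at
# the last timestep, and unroll=True unrolls the recurrence over the fixed
# SEQLEN timesteps (faster, at the cost of more memory)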
model.add(SimpleRNN(HIDDEN_SIZE, return_sequences=False,
                    input_shape=(SEQLEN, nb_chars),
                    unroll=True))
model.add(Dense(nb_chars))
model.add(Activation("softmax"))

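# the output layer is a softmax over the character vocabulary, so
# categorical cross-entropy is the natural loss; RMSprop is a common
# optimizer choice for RNNs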
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

# We train the model in batches and test output generated at each step
for iteration in range(NUM_ITERATIONS):
    print("=" * 50)
    print("Iteration #: %d" % (iteration))
    model.fit(X, y, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS_PER_ITERATION)

    # testing model
    # randomly choose a row from input_chars, then use it to
    # generate text from model for next 100 chars
    test_idx = np.random.randint(len(input_chars))
    test_chars = input_chars[test_idx]
    print("Generating from seed: %s" % (test_chars))
    print(test_chars, end="")
    for i in range(NUM_PREDS_PER_EPOCH):
        Xtest = np.zeros((1, SEQLEN, nb_chars))
        for j, ch in enumerate(test_chars):
            Xtest[0, j, char2index[ch]] = 1
        pred = model.predict(Xtest, verbose=0)[0]
        ypred = index2char[np.argmax(pred)]
        print(ypred, end="")
        # move forward with test_chars + ypred
        test_chars = test_chars[1:] + ypred
    print()
32 changes: 32 additions & 0 deletions Chapter06/econs_data.py
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
import os
import re

DATA_DIR = "../data"

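# LD2011_2014.txt (UCI ElectricityLoadDiagrams20112014) has one row per
# 15-minute timestamp; the first column is the timestamp and the remaining
# 370 semicolon-separated columns are per-client readings written with
# decimal commas, hence the re.sub(",", ".") below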
fld = open(os.path.join(DATA_DIR, "LD2011_2014.txt"), "rb")
data = []
line_num = 0
#cid = np.random.randint(0, 370, 1)
cid = 250
for line in fld:
    if line.startswith("\"\";"):
        continue
    if line_num % 100 == 0:
        print("{:d} lines read".format(line_num))
    cols = [float(re.sub(",", ".", x)) for x in
            line.strip().split(";")[1:]]
    data.append(cols[cid])
    line_num += 1
fld.close()

NUM_ENTRIES = 1000
plt.plot(range(NUM_ENTRIES), data[0:NUM_ENTRIES])
plt.ylabel("electricity consumption")
plt.xlabel("time (1pt = 15 mins)")
plt.show()

np.save(os.path.join(DATA_DIR, "LD_250.npy"), np.array(data))
82 changes: 82 additions & 0 deletions Chapter06/econs_stateful.py
@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
from __future__ import division, print_function
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import math
import os

DATA_DIR = "../data"

data = np.load(os.path.join(DATA_DIR, "LD_250.npy"))

STATELESS = False
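# STATELESS = True trains a plain (stateless) LSTM; with STATELESS = False
# a stateful LSTM is used, which carries its cell state across batches and
# therefore needs a fixed batch_input_shape and an explicit reset_states()
# between epochs (see the training loop below)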

NUM_TIMESTEPS = 20
HIDDEN_SIZE = 10
BATCH_SIZE = 96 # 24 hours (15 min intervals)
NUM_EPOCHS = 5

# scale the data to be in the range (0, 1)
data = data.reshape(-1, 1)
scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
data = scaler.fit_transform(data)

# transform to NUM_TIMESTEPS inputs -> 1 label format
X = np.zeros((data.shape[0], NUM_TIMESTEPS))
Y = np.zeros((data.shape[0], 1))
for i in range(len(data) - NUM_TIMESTEPS - 1):
    X[i] = data[i:i + NUM_TIMESTEPS].T
    Y[i] = data[i + NUM_TIMESTEPS + 1]

# reshape X to three dimensions (samples, timesteps, features)
X = np.expand_dims(X, axis=2)

# split into training (70%) and test (30%) sets
sp = int(0.7 * len(data))
Xtrain, Xtest, Ytrain, Ytest = X[0:sp], X[sp:], Y[0:sp], Y[sp:]
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

if STATELESS:
    # stateless
    model = Sequential()
    model.add(LSTM(HIDDEN_SIZE, input_shape=(NUM_TIMESTEPS, 1),
                   return_sequences=False))
    model.add(Dense(1))
else:
    # stateful
    model = Sequential()
    model.add(LSTM(HIDDEN_SIZE, stateful=True,
                   batch_input_shape=(BATCH_SIZE, NUM_TIMESTEPS, 1),
                   return_sequences=False))
    model.add(Dense(1))

model.compile(loss="mean_squared_error", optimizer="adam",
              metrics=["mean_squared_error"])

if STATELESS:
    # stateless
    model.fit(Xtrain, Ytrain, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
              validation_data=(Xtest, Ytest),
              shuffle=False)
else:
    # stateful
    # need to truncate training and test data to a multiple of BATCH_SIZE
    train_size = (Xtrain.shape[0] // BATCH_SIZE) * BATCH_SIZE
    test_size = (Xtest.shape[0] // BATCH_SIZE) * BATCH_SIZE
    Xtrain, Ytrain = Xtrain[0:train_size], Ytrain[0:train_size]
    Xtest, Ytest = Xtest[0:test_size], Ytest[0:test_size]
    print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)
    for i in range(NUM_EPOCHS):
        print("Epoch {:d}/{:d}".format(i + 1, NUM_EPOCHS))
        model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, epochs=1,
                  validation_data=(Xtest, Ytest),
                  shuffle=False)
        # clear the carried-over state between epochs
        model.reset_states()

score, _ = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
rmse = math.sqrt(score)
print("\nMSE: {:.3f}, RMSE: {:.3f}".format(score, rmse))
170 changes: 170 additions & 0 deletions Chapter06/pos-tagging-explore.py
@@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-
from __future__ import division, print_function
from keras.layers.core import Activation, Dense, Dropout, RepeatVector, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU, LSTM
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.utils import np_utils
import collections
import matplotlib.pyplot as plt
import numpy as np
import os

def explore_data(datadir, datafiles):
    counter = collections.Counter()
    maxlen = 0
    for datafile in datafiles:
        fdata = open(os.path.join(datadir, datafile), "rb")
        for line in fdata:
            words = line.strip().split()
            if len(words) > maxlen:
                maxlen = len(words)
            for word in words:
                counter[word] += 1
        fdata.close()
    return maxlen, counter

def build_tensor(filename, numrecs, word2index, maxlen,
                 make_categorical=False):
    data = np.empty((numrecs, ), dtype=list)
    fin = open(filename, "rb")
    i = 0
    for line in fin:
        wids = []
        for word in line.strip().split():
            if word in word2index:
                wids.append(word2index[word])
            else:
                wids.append(word2index["UNK"])
        if make_categorical:
            data[i] = np_utils.to_categorical(
                wids, num_classes=len(word2index))
        else:
            data[i] = wids
        i += 1
    fin.close()
    pdata = sequence.pad_sequences(data, maxlen=maxlen)
    return pdata

def evaluate_model(model, Xtest, Ytest, batch_size):
    pass
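# (evaluate_model is an unused stub; evaluation is done inline further down)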

DATA_DIR = "../data"

s_maxlen, s_counter = explore_data(DATA_DIR, ["babi-sent-train.txt",
                                              "babi-sent-test.txt"])
t_maxlen, t_counter = explore_data(DATA_DIR, ["babi-pos-train.txt",
                                              "babi-pos-test.txt"])

print(s_maxlen, len(s_counter), t_maxlen, len(t_counter))
# 7 21 7 9
# maxlen: 7
# size of source vocab: 21
# size of target vocab: 9

# lookup tables
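# ids from most_common() are shifted by 1 so that id 0 can be reserved
# for the PAD token in both the word and POS vocabularies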
s_word2id = {k:v+1 for v, (k, _) in enumerate(s_counter.most_common())}
s_word2id["PAD"] = 0
s_id2word = {v:k for k, v in s_word2id.items()}
t_pos2id = {k:v+1 for v, (k, _) in enumerate(t_counter.most_common())}
t_pos2id["PAD"] = 0
t_id2pos = {v:k for k, v in t_pos2id.items()}

# vectorize data
MAX_SEQLEN = 10

Xtrain = build_tensor(os.path.join(DATA_DIR, "babi-sent-train.txt"),
                      30000, s_word2id, MAX_SEQLEN)
Xtest = build_tensor(os.path.join(DATA_DIR, "babi-sent-test.txt"),
                     3000, s_word2id, MAX_SEQLEN)
Ytrain = build_tensor(os.path.join(DATA_DIR, "babi-pos-train.txt"),
                      30000, t_pos2id, MAX_SEQLEN, make_categorical=True)
Ytest = build_tensor(os.path.join(DATA_DIR, "babi-pos-test.txt"),
                     3000, t_pos2id, MAX_SEQLEN, make_categorical=True)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

# define network
EMBED_SIZE = 32
HIDDEN_SIZE = 32

BATCH_SIZE = 32
NUM_EPOCHS = 5

model = Sequential()
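# encoder-decoder setup: the first LSTM encodes the padded input sentence
# into a single vector, RepeatVector copies that vector MAX_SEQLEN times,
# and the second LSTM decodes it into one POS-tag distribution per timestep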
model.add(Embedding(len(s_word2id), EMBED_SIZE,
                    input_length=MAX_SEQLEN))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
#model.add(GRU(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
#model.add(Bidirectional(LSTM(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2)))
model.add(RepeatVector(MAX_SEQLEN))
model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
#model.add(GRU(HIDDEN_SIZE, return_sequences=True))
#model.add(Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True)))
model.add(TimeDistributed(Dense(len(t_pos2id))))
model.add(Activation("softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])

history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
                    validation_data=[Xtest, Ytest])

# plot loss and accuracy
plt.subplot(211)
plt.title("Accuracy")
plt.plot(history.history["acc"], color="g", label="Train")
plt.plot(history.history["val_acc"], color="b", label="Validation")
plt.legend(loc="best")

plt.subplot(212)
plt.title("Loss")
plt.plot(history.history["loss"], color="g", label="Train")
plt.plot(history.history["val_loss"], color="b", label="Validation")
plt.legend(loc="best")

plt.tight_layout()
plt.show()

# evaluate model
score, acc = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
print("Test score: %.3f, accuracy: %.3f" % (score, acc))

# custom evaluate
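# model.evaluate() scores every timestep, including the PAD positions, which
# tends to inflate the accuracy; here we score only the non-PAD (nonzero)
# positions of each sentence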
hit_rates = []
num_iters = Xtest.shape[0] // BATCH_SIZE
for i in range(num_iters - 1):
    xtest = Xtest[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
    ytest = np.argmax(Ytest[i * BATCH_SIZE : (i + 1) * BATCH_SIZE], axis=2)
    ytest_ = np.argmax(model.predict(xtest), axis=2)
    # print(ytest.shape, ytest_.shape)
    for j in range(BATCH_SIZE):
        # print("sentence: " + " ".join([s_id2word[x] for x in xtest[j].tolist()]))
        # print("predicted: " + " ".join([t_id2pos[y] for y in ytest_[j].tolist()]))
        # print("label: " + " ".join([t_id2pos[y] for y in ytest[j].tolist()]))
        word_indices = np.nonzero(xtest[j])
        pos_labels = ytest[j][word_indices]
        pos_pred = ytest_[j][word_indices]
        hit_rates.append(np.sum(pos_labels == pos_pred) / len(pos_pred))
    break

accuracy = sum(hit_rates) / len(hit_rates)
print("accuracy: {:.3f}".format(accuracy))

# prediction
pred_ids = np.random.randint(0, 3000, 5)
for pred_id in pred_ids:
    xtest = Xtest[pred_id].reshape(1, 10)
    ytest_ = np.argmax(model.predict(xtest), axis=2)
    ytest = np.argmax(Ytest[pred_id], axis=1)
    print("sentence: " + " ".join([s_id2word[x] for x in xtest[0].tolist()]))
    print("predicted: " + " ".join([t_id2pos[y] for y in ytest_[0].tolist()]))
    print("label: " + " ".join([t_id2pos[y] for y in ytest.tolist()]))
    word_indices = np.nonzero(xtest)[1]
    ypred_tags = ytest_[0][word_indices]
    ytrue_tags = ytest[word_indices]
    hit_rate = np.sum(ypred_tags == ytrue_tags) / len(ypred_tags)
    print("hit rate: {:.3f}".format(hit_rate))
    print()
(3 more changed files not shown)