diff --git a/Chapter06/alice_chargen_rnn.py b/Chapter06/alice_chargen_rnn.py
new file mode 100644
index 0000000..a6d4c41
--- /dev/null
+++ b/Chapter06/alice_chargen_rnn.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+# Adapted from lstm_text_generation.py in keras/examples
+from __future__ import print_function
+from keras.layers.recurrent import SimpleRNN
+from keras.models import Sequential
+from keras.layers import Dense, Activation
+import numpy as np
+
+INPUT_FILE = "../data/alice_in_wonderland.txt"
+
+# extract the input as a stream of characters
+print("Extracting text from input...")
+fin = open(INPUT_FILE, 'rb')
+lines = []
+for line in fin:
+    line = line.strip().lower()
+    line = line.decode("ascii", "ignore")
+    if len(line) == 0:
+        continue
+    lines.append(line)
+fin.close()
+text = " ".join(lines)
+
+# creating lookup tables
+# chars is the set of distinct characters in the text (our character
+# "vocabulary"); nb_chars is its size, i.e. the number of 1-hot features
+chars = set([c for c in text])
+nb_chars = len(chars)
+char2index = dict((c, i) for i, c in enumerate(chars))
+index2char = dict((i, c) for i, c in enumerate(chars))
+
+# create inputs and labels from the text. We do this by stepping
+# through the text ${step} character at a time, and extracting a
+# sequence of size ${seqlen} and the next output char. For example,
+# assuming an input text "The sky was falling", we would get the
+# following sequence of input_chars and label_chars (first 5 only)
+#   The sky wa -> s
+#   he sky was -> (space)
+#   e sky was  -> f
+#    sky was f -> a
+#   sky was fa -> l
print("Creating input and label text...")
+SEQLEN = 10
+STEP = 1
+
+input_chars = []
+label_chars = []
+for i in range(0, len(text) - SEQLEN, STEP):
+    input_chars.append(text[i:i + SEQLEN])
+    label_chars.append(text[i + SEQLEN])
+
+# vectorize the input and label chars
+# Each row of the input is represented by SEQLEN characters, each
+# represented as a 1-hot encoding of size nb_chars. There are
+# len(input_chars) such rows, so shape(X) is (len(input_chars),
+# SEQLEN, nb_chars).
+# Each row of output is a single character, also represented as a
+# 1-hot encoding of size nb_chars. Hence shape(y) is (len(input_chars),
+# nb_chars).
+print("Vectorizing input and label text...")
+X = np.zeros((len(input_chars), SEQLEN, nb_chars), dtype=bool)
+y = np.zeros((len(input_chars), nb_chars), dtype=bool)
+for i, input_char in enumerate(input_chars):
+    for j, ch in enumerate(input_char):
+        X[i, j, char2index[ch]] = 1
+    y[i, char2index[label_chars[i]]] = 1
+
+# Build the model. We use a single RNN with a fully connected layer
+# to compute the most likely predicted output char
+HIDDEN_SIZE = 128
+BATCH_SIZE = 128
+NUM_ITERATIONS = 25
+NUM_EPOCHS_PER_ITERATION = 1
+NUM_PREDS_PER_EPOCH = 100
+
+model = Sequential()
+model.add(SimpleRNN(HIDDEN_SIZE, return_sequences=False,
+                    input_shape=(SEQLEN, nb_chars),
+                    unroll=True))
+model.add(Dense(nb_chars))
+model.add(Activation("softmax"))
+
+model.compile(loss="categorical_crossentropy", optimizer="rmsprop")
+
+# We train the model in batches and test output generated at each step
+for iteration in range(NUM_ITERATIONS):
+    print("=" * 50)
+    print("Iteration #: %d" % (iteration))
+    model.fit(X, y, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS_PER_ITERATION)
+
+    # testing model
+    # randomly choose a row from input_chars, then use it to
+    # generate text from model for next 100 chars
+    test_idx = np.random.randint(len(input_chars))
+    test_chars = input_chars[test_idx]
+    print("Generating from seed: %s" % (test_chars))
+    print(test_chars, end="")
+    for i in range(NUM_PREDS_PER_EPOCH):
+        Xtest = np.zeros((1, SEQLEN, nb_chars))
+        for j, ch in enumerate(test_chars):
+            Xtest[0, j, char2index[ch]] = 1
+        pred = model.predict(Xtest, verbose=0)[0]
+        ypred = index2char[np.argmax(pred)]
+        print(ypred, end="")
+        # move forward with test_chars + ypred
+        test_chars = test_chars[1:] + ypred
+    print()
diff --git a/Chapter06/econs_data.py b/Chapter06/econs_data.py
new file mode 100644
index 0000000..2f79c76
--- /dev/null
+++ b/Chapter06/econs_data.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+from __future__ import division, print_function
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+import re
+
+DATA_DIR = "../data"
+
+fld = open(os.path.join(DATA_DIR, "LD2011_2014.txt"), "rb")
+data = []
+line_num = 0
+#cid = np.random.randint(0, 370, 1)
+cid = 250
+for line in fld:
+    if line.startswith("\"\";"):
+        continue
+    if line_num % 100 == 0:
+        print("{:d} lines read".format(line_num))
+    cols = [float(re.sub(",", ".", x)) for x in
+            line.strip().split(";")[1:]]
+    data.append(cols[cid])
+    line_num += 1
+fld.close()
+
+NUM_ENTRIES = 1000
+plt.plot(range(NUM_ENTRIES), data[0:NUM_ENTRIES])
+plt.ylabel("electricity consumption")
+plt.xlabel("time (1pt = 15 mins)")
+plt.show()
+
+np.save(os.path.join(DATA_DIR, "LD_250.npy"), np.array(data))
diff --git a/Chapter06/econs_stateful.py b/Chapter06/econs_stateful.py
new file mode 100644
index 0000000..b554000
--- /dev/null
+++ b/Chapter06/econs_stateful.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+from __future__ import division, print_function
+from keras.layers.core import Dense
+from keras.layers.recurrent import LSTM
+from keras.models import Sequential
+from sklearn.preprocessing import MinMaxScaler
+import numpy as np
+import math
+import os
+
+DATA_DIR = "../data"
+
+data = np.load(os.path.join(DATA_DIR, "LD_250.npy"))
+
+STATELESS = False
+
+NUM_TIMESTEPS = 20
+HIDDEN_SIZE = 10
+BATCH_SIZE = 96  # 24 hours (15 min intervals)
+NUM_EPOCHS = 5
+
+# scale the data to be in the range (0, 1)
+data = data.reshape(-1, 1)
+scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
+data = scaler.fit_transform(data)
+
+# transform to NUM_TIMESTEPS inputs -> 1 label format
+X = np.zeros((data.shape[0], NUM_TIMESTEPS))
+Y = np.zeros((data.shape[0], 1))
+for i in range(len(data) - NUM_TIMESTEPS - 1):
+    X[i] = data[i:i + NUM_TIMESTEPS].T
+    Y[i] = data[i + NUM_TIMESTEPS + 1]
+
+# reshape X to three dimensions (samples, timesteps, features)
+X = np.expand_dims(X, axis=2)
+
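+# Sanity check on the windowing above: row i of X holds the NUM_TIMESTEPS
+# readings data[i:i + NUM_TIMESTEPS] and Y[i] holds the reading at index
+# i + NUM_TIMESTEPS + 1, so after expand_dims X has shape
+# (len(data), NUM_TIMESTEPS, 1) and Y has shape (len(data), 1).
+#print(X.shape, Y.shape)
+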
+# split into training and test sets
+sp = int(0.7 * len(data))
+Xtrain, Xtest, Ytrain, Ytest = X[0:sp], X[sp:], Y[0:sp], Y[sp:]
+print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)
+
+if STATELESS:
+    # stateless
+    model = Sequential()
+    model.add(LSTM(HIDDEN_SIZE, input_shape=(NUM_TIMESTEPS, 1),
+                   return_sequences=False))
+    model.add(Dense(1))
+else:
+    # stateful
+    model = Sequential()
+    model.add(LSTM(HIDDEN_SIZE, stateful=True,
+                   batch_input_shape=(BATCH_SIZE, NUM_TIMESTEPS, 1),
+                   return_sequences=False))
+    model.add(Dense(1))
+
+model.compile(loss="mean_squared_error", optimizer="adam",
+              metrics=["mean_squared_error"])
+
+if STATELESS:
+    # stateless
+    model.fit(Xtrain, Ytrain, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
+              validation_data=(Xtest, Ytest),
+              shuffle=False)
+else:
+    # stateful
+    # need to make training and test data a multiple of BATCH_SIZE
+    train_size = (Xtrain.shape[0] // BATCH_SIZE) * BATCH_SIZE
+    test_size = (Xtest.shape[0] // BATCH_SIZE) * BATCH_SIZE
+    Xtrain, Ytrain = Xtrain[0:train_size], Ytrain[0:train_size]
+    Xtest, Ytest = Xtest[0:test_size], Ytest[0:test_size]
+    print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)
+    for i in range(NUM_EPOCHS):
+        print("Epoch {:d}/{:d}".format(i+1, NUM_EPOCHS))
+        model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, epochs=1,
+                  validation_data=(Xtest, Ytest),
+                  shuffle=False)
+        model.reset_states()
+
+score, _ = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
+rmse = math.sqrt(score)
+print("\nMSE: {:.3f}, RMSE: {:.3f}".format(score, rmse))
diff --git a/Chapter06/pos-tagging-explore.py b/Chapter06/pos-tagging-explore.py
new file mode 100644
index 0000000..e170cf4
--- /dev/null
+++ b/Chapter06/pos-tagging-explore.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+from __future__ import division, print_function
+from keras.layers.core import Activation, Dense, Dropout, RepeatVector, SpatialDropout1D
+from keras.layers.embeddings import Embedding
+from keras.layers.recurrent import GRU, LSTM
+from keras.layers.wrappers import TimeDistributed, Bidirectional
+from keras.models import Sequential
+from keras.preprocessing import sequence
+from keras.utils import np_utils
+import collections
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+
+def explore_data(datadir, datafiles):
+    counter = collections.Counter()
+    maxlen = 0
+    for datafile in datafiles:
+        fdata = open(os.path.join(datadir, datafile), "rb")
+        for line in fdata:
+            words = line.strip().split()
+            if len(words) > maxlen:
+                maxlen = len(words)
+            for word in words:
+                counter[word] += 1
+        fdata.close()
+    return maxlen, counter
+
+def build_tensor(filename, numrecs, word2index, maxlen,
+                 make_categorical=False):
+    data = np.empty((numrecs, ), dtype=list)
+    fin = open(filename, "rb")
+    i = 0
+    for line in fin:
+        wids = []
+        for word in line.strip().split():
+            if word in word2index:
+                wids.append(word2index[word])
+            else:
+                wids.append(word2index["UNK"])
+        if make_categorical:
+            data[i] = np_utils.to_categorical(
+                wids, num_classes=len(word2index))
+        else:
+            data[i] = wids
+        i += 1
+    fin.close()
+    pdata = sequence.pad_sequences(data, maxlen=maxlen)
+    return pdata
+
+def evaluate_model(model, Xtest, Ytest, batch_size):
+    pass
+
+DATA_DIR = "../data"
+
+s_maxlen, s_counter = explore_data(DATA_DIR, ["babi-sent-train.txt",
+                                              "babi-sent-test.txt"])
+t_maxlen, t_counter = explore_data(DATA_DIR, ["babi-pos-train.txt",
+                                              "babi-pos-test.txt"])
+
+print(s_maxlen, len(s_counter), t_maxlen, len(t_counter))
+# 7 21 7 9
+# maxlen: 7
+# size of source vocab: 21
+# size of target vocab: 9
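+
+# As an illustration (not output from this script): for a toy corpus,
+# collections.Counter("a a a b b c".split()).most_common() returns
+# [('a', 3), ('b', 2), ('c', 1)], so the tables below would map
+# a -> 1, b -> 2, c -> 3, with id 0 reserved for "PAD".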
+
+# lookup tables
+s_word2id = {k:v+1 for v, (k, _) in enumerate(s_counter.most_common())}
+s_word2id["PAD"] = 0
+s_id2word = {v:k for k, v in s_word2id.items()}
+t_pos2id = {k:v+1 for v, (k, _) in enumerate(t_counter.most_common())}
+t_pos2id["PAD"] = 0
+t_id2pos = {v:k for k, v in t_pos2id.items()}
+
+# vectorize data
+MAX_SEQLEN = 10
+
+Xtrain = build_tensor(os.path.join(DATA_DIR, "babi-sent-train.txt"),
+                      30000, s_word2id, MAX_SEQLEN)
+Xtest = build_tensor(os.path.join(DATA_DIR, "babi-sent-test.txt"),
+                     3000, s_word2id, MAX_SEQLEN)
+Ytrain = build_tensor(os.path.join(DATA_DIR, "babi-pos-train.txt"),
+                      30000, t_pos2id, MAX_SEQLEN, make_categorical=True)
+Ytest = build_tensor(os.path.join(DATA_DIR, "babi-pos-test.txt"),
+                     3000, t_pos2id, MAX_SEQLEN, make_categorical=True)
+print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)
+
+# define network
+EMBED_SIZE = 32
+HIDDEN_SIZE = 32
+
+BATCH_SIZE = 32
+NUM_EPOCHS = 5
+
+model = Sequential()
+model.add(Embedding(len(s_word2id), EMBED_SIZE,
+                    input_length=MAX_SEQLEN))
+model.add(SpatialDropout1D(0.2))
+model.add(LSTM(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
+#model.add(GRU(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
+#model.add(Bidirectional(LSTM(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2)))
+model.add(RepeatVector(MAX_SEQLEN))
+model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
+#model.add(GRU(HIDDEN_SIZE, return_sequences=True))
+#model.add(Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True)))
+model.add(TimeDistributed(Dense(len(t_pos2id))))
+model.add(Activation("softmax"))
+
+model.compile(loss="categorical_crossentropy", optimizer="adam",
+              metrics=["accuracy"])
+
+history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
+                    validation_data=[Xtest, Ytest])
+
+# plot loss and accuracy
+plt.subplot(211)
+plt.title("Accuracy")
+plt.plot(history.history["acc"], color="g", label="Train")
+plt.plot(history.history["val_acc"], color="b", label="Validation")
+plt.legend(loc="best")
+
+plt.subplot(212)
+plt.title("Loss")
+plt.plot(history.history["loss"], color="g", label="Train")
+plt.plot(history.history["val_loss"], color="b", label="Validation")
+plt.legend(loc="best")
+
+plt.tight_layout()
+plt.show()
+
+# evaluate model
+score, acc = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
+print("Test score: %.3f, accuracy: %.3f" % (score, acc))
+
+# custom evaluate
+hit_rates = []
+num_iters = Xtest.shape[0] // BATCH_SIZE
+for i in range(num_iters - 1):
+    xtest = Xtest[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
+    ytest = np.argmax(Ytest[i * BATCH_SIZE : (i + 1) * BATCH_SIZE], axis=2)
+    ytest_ = np.argmax(model.predict(xtest), axis=2)
+#    print(ytest.shape, ytest_.shape)
+    for j in range(BATCH_SIZE):
+#        print("sentence: " + " ".join([s_id2word[x] for x in xtest[j].tolist()]))
+#        print("predicted: " + " ".join([t_id2pos[y] for y in ytest_[j].tolist()]))
+#        print("label: " + " ".join([t_id2pos[y] for y in ytest[j].tolist()]))
+        word_indices = np.nonzero(xtest[j])
+        pos_labels = ytest[j][word_indices]
+        pos_pred = ytest_[j][word_indices]
+        hit_rates.append(np.sum(pos_labels == pos_pred) / len(pos_pred))
+    break
+
+accuracy = sum(hit_rates) / len(hit_rates)
+print("accuracy: {:.3f}".format(accuracy))
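+
+# Note: the accuracy reported by model.evaluate() above is computed over all
+# MAX_SEQLEN positions, PAD included, which flatters the model because PAD
+# positions are trivial to predict; the loop above therefore recomputes a hit
+# rate restricted to the non-PAD word positions.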
".join([s_id2word[x] for x in xtest[0].tolist()])) + print("predicted: " + " ".join([t_id2pos[y] for y in ytest_[0].tolist()])) + print("label: " + " ".join([t_id2pos[y] for y in ytest.tolist()])) + word_indices = np.nonzero(xtest)[1] + ypred_tags = ytest_[0][word_indices] + ytrue_tags = ytest[word_indices] + hit_rate = np.sum(ypred_tags == ytrue_tags) / len(ypred_tags) + print("hit rate: {:.3f}".format(hit_rate)) + print() diff --git a/Chapter06/pos_tagging_data.py b/Chapter06/pos_tagging_data.py new file mode 100644 index 0000000..2cd14ea --- /dev/null +++ b/Chapter06/pos_tagging_data.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +# Copied from: Out of core classification of Text Documents +# from the scikit-learn documentation. +# http://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html +# +from __future__ import division, print_function +from sklearn.externals.six.moves import html_parser +from glob import glob +import collections +import nltk +import os +import re + +class ReutersParser(html_parser.HTMLParser): + """ Utility class to parse a SGML file and yield documents one at + a time. + """ + def __init__(self, encoding='latin-1'): + html_parser.HTMLParser.__init__(self) + self._reset() + self.encoding = encoding + + def handle_starttag(self, tag, attrs): + method = 'start_' + tag + getattr(self, method, lambda x: None)(attrs) + + def handle_endtag(self, tag): + method = 'end_' + tag + getattr(self, method, lambda: None)() + + def _reset(self): + self.in_title = 0 + self.in_body = 0 + self.in_topics = 0 + self.in_topic_d = 0 + self.title = "" + self.body = "" + self.topics = [] + self.topic_d = "" + + def parse(self, fd): + self.docs = [] + for chunk in fd: + self.feed(chunk.decode(self.encoding)) + for doc in self.docs: + yield doc + self.docs = [] + self.close() + + def handle_data(self, data): + if self.in_body: + self.body += data + elif self.in_title: + self.title += data + elif self.in_topic_d: + self.topic_d += data + + def start_reuters(self, attributes): + pass + + def end_reuters(self): + self.body = re.sub(r'\s+', r' ', self.body) + self.docs.append({'title': self.title, + 'body': self.body, + 'topics': self.topics}) + self._reset() + + def start_title(self, attributes): + self.in_title = 1 + + def end_title(self): + self.in_title = 0 + + def start_body(self, attributes): + self.in_body = 1 + + def end_body(self): + self.in_body = 0 + + def start_topics(self, attributes): + self.in_topics = 1 + + def end_topics(self): + self.in_topics = 0 + + def start_d(self, attributes): + self.in_topic_d = 1 + + def end_d(self): + self.in_topic_d = 0 + self.topics.append(self.topic_d) + self.topic_d = "" + + +def stream_reuters_documents(reuters_dir): + """ Iterate over documents of the Reuters dataset. + + The Reuters archive will automatically be downloaded and uncompressed if + the `data_path` directory does not exist. + + Documents are represented as dictionaries with 'body' (str), + 'title' (str), 'topics' (list(str)) keys. 
+ + """ + parser = ReutersParser() + for filename in glob(os.path.join(reuters_dir, "*.sgm")): + for doc in parser.parse(open(filename, 'rb')): + yield doc + + +##################### main ###################### + +DATA_DIR = "../data" +REUTERS_DIR = os.path.join(DATA_DIR, "reuters-21578") + +num_read = 0 +num_sents = 0 + +fsent = open(os.path.join(DATA_DIR, "reuters-sent.txt"), "wb") +fpos = open(os.path.join(DATA_DIR, "reuters-pos.txt"), "wb") +tagger = nltk.tag.PerceptronTagger() + +for doc in stream_reuters_documents(REUTERS_DIR): + # skip docs without specified topic + topics = doc["topics"] + if len(topics) == 0: + continue + title = doc["title"] + body = doc["body"] + sents = nltk.sent_tokenize(body) + for sent in sents: + if num_sents % 100 == 0: + print("{:d} sentences written".format(num_sents)) + if len(sent) <= 20: + continue + sent = sent.encode("utf8").decode("ascii", "ignore") + words = nltk.word_tokenize(sent) + fsent.write("{:s}\n".format(" ".join(words))) + tokentags = nltk.tag._pos_tag(words, None, tagger) + fpos.write("{:s}\n".format(" ".join([x[1] for x in tokentags]))) + num_sents += 1 + +fsent.close() +fpos.close() +print("{:d} sentences written, COMPLETE".format(num_sents)) diff --git a/Chapter06/pos_tagging_gru.py b/Chapter06/pos_tagging_gru.py new file mode 100644 index 0000000..9dc7b6e --- /dev/null +++ b/Chapter06/pos_tagging_gru.py @@ -0,0 +1,196 @@ +# -*- coding: utf-8 -*- +from __future__ import division, print_function +from keras.layers.core import Activation, Dense, RepeatVector +from keras.layers.embeddings import Embedding +from keras.layers.recurrent import GRU, LSTM +from keras.layers.wrappers import TimeDistributed, Bidirectional +from keras.metrics import top_k_categorical_accuracy +from keras.models import Sequential +from keras.optimizers import Adam +from keras.preprocessing import sequence +from keras.utils import np_utils +from sklearn.model_selection import train_test_split +import collections +import matplotlib.pyplot as plt +import numpy as np +import os + +def parse_sentences(filename): + sents = [] + word_freqs = collections.Counter() + fin = open(filename, "rb") + for line in fin: + words = line.strip().lower().split() + for word in words: + word_freqs[word] += 1 + sents.append(words) + fin.close() + return sents, word_freqs + +def get_or_else(dictionary, key, default_value): + try: + return dictionary[key] + except KeyError: + return default_value + +def generate_batch(s_sents, s_word2index, t_sents, t_word2index, + batch_size, maxlen): + while True: + # shuffle the input + indices = np.random.permutation(np.arange(len(s_sents))) + ss_sents = [s_sents[ix] for ix in indices] + ts_sents = [t_sents[ix] for ix in indices] + # convert to word indices + si_sents = [[get_or_else(s_word2index, word, s_word2index["UNK"]) + for word in sent] + for sent in ss_sents] + ti_sents = [[t_word2index[word] for word in sent] + for sent in ts_sents] + # inner loop should run for an epoch + num_batches = len(s_sents) // batch_size + for i in range(num_batches): + s_batch = si_sents[i * batch_size : (i + 1) * batch_size] + t_batch = ti_sents[i * batch_size : (i + 1) * batch_size] + sp_batch = sequence.pad_sequences(s_batch, maxlen=maxlen) + tp_batch = sequence.pad_sequences(t_batch, maxlen=maxlen) + tpc_batch = np_utils.to_categorical(tp_batch.reshape(-1, 1), + num_classes=len(t_word2index)).reshape(batch_size, + -1, len(t_word2index)) + yield sp_batch, tpc_batch + + +def top_3_categorical_accuracy(ytrue, ypred): + return top_k_categorical_accuracy(ytrue, 
+
+
+def top_3_categorical_accuracy(ytrue, ypred):
+    return top_k_categorical_accuracy(ytrue, ypred, k=3)
+
+
+########################## main ##########################
+
+DATA_DIR = "../data"
+
+# data exploration, set constants
+s_sents, s_wordfreqs = parse_sentences(os.path.join(DATA_DIR, "reuters-sent.txt"))
+t_sents, t_wordfreqs = parse_sentences(os.path.join(DATA_DIR, "reuters-pos.txt"))
+sent_lengths = np.array([len(sent) for sent in s_sents])
+
+print("# records: {:d}".format(len(s_sents)))
+print("# unique words: {:d}".format(len(s_wordfreqs)))
+print("# unique POS tags: {:d}".format(len(t_wordfreqs)))
+print("# words/sentence: min: {:d}, max: {:d}, mean: {:.3f}, median: {:.0f}"
+      .format(np.min(sent_lengths), np.max(sent_lengths),
+              np.mean(sent_lengths), np.median(sent_lengths)))
+
+## records: 103126
+## unique words: 67749
+## unique POS tags: 44
+## words/sentence: min: 3, max: 429, mean: 26.694, median: 26
+## np.where(sent_lengths <= 50)[0].shape
+## (100343,)
+## Gives rise to the following constants
+
+MAX_SEQLEN = 50
+S_MAX_FEATURES = 50000
+T_MAX_FEATURES = 45
+
+EMBED_SIZE = 300
+HIDDEN_SIZE = 100
+
+BATCH_SIZE = 64
+
+# train for NUM_ITERATIONS blocks of NUM_EPOCHS epochs (1000 epochs in all),
+# showing sample predictions after each block of 50 epochs
+NUM_EPOCHS = 50
+NUM_ITERATIONS = 20
+
+# lookup tables
+s_vocabsize = min(len(s_wordfreqs), S_MAX_FEATURES) + 2
+s_word2index = {x[0]:i+2 for i, x in
+                enumerate(s_wordfreqs.most_common(S_MAX_FEATURES))}
+s_word2index["PAD"] = 0
+s_word2index["UNK"] = 1
+s_index2word = {v:k for k, v in s_word2index.items()}
+
+t_vocabsize = len(t_wordfreqs) + 1
+t_word2index = {x[0]:i+1 for i, x in
+                enumerate(t_wordfreqs.most_common(T_MAX_FEATURES))}
+t_word2index["PAD"] = 0
+t_index2word = {v:k for k, v in t_word2index.items()}
+
+# split into train and test
+test_size = int(0.3 * len(s_sents))
+s_sents_train, s_sents_test = s_sents[0:-test_size], s_sents[-test_size:]
+t_sents_train, t_sents_test = t_sents[0:-test_size], t_sents[-test_size:]
+train_gen = generate_batch(s_sents_train, s_word2index, t_sents_train,
+                           t_word2index, BATCH_SIZE, MAX_SEQLEN)
+test_gen = generate_batch(s_sents_test, s_word2index, t_sents_test,
+                          t_word2index, BATCH_SIZE, MAX_SEQLEN)
+print(len(s_sents_train), len(s_sents_test))
+
+# define network
+model = Sequential()
+model.add(Embedding(s_vocabsize, EMBED_SIZE,
+                    input_length=MAX_SEQLEN,
+                    embeddings_initializer="glorot_uniform"))
+#model.add(GRU(HIDDEN_SIZE))
+model.add(LSTM(HIDDEN_SIZE))
+#model.add(Bidirectional(LSTM(HIDDEN_SIZE, dropout_W=0.2, dropout_U=0.2)))
+model.add(RepeatVector(MAX_SEQLEN))
+#model.add(GRU(HIDDEN_SIZE, return_sequences=True))
+model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
+#model.add(Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True)))
+model.add(TimeDistributed(Dense(t_vocabsize)))
+model.add(Activation("softmax"))
+
+model.compile(loss="categorical_crossentropy", optimizer="adam",
+              metrics=["accuracy"])
+
+num_train_samples = len(s_sents_train) // BATCH_SIZE
+num_test_samples = len(s_sents_test) // BATCH_SIZE
+
+hist_acc, hist_val_acc, hist_loss, hist_val_loss = [], [], [], []
+for i in range(NUM_ITERATIONS):
+    history = model.fit_generator(train_gen,
+                                  steps_per_epoch=num_train_samples,
+                                  epochs=NUM_EPOCHS,
+                                  validation_data=test_gen,
+                                  validation_steps=num_test_samples)
+    # save off history data
+    hist_acc.extend(history.history["acc"])
+    hist_val_acc.extend(history.history["val_acc"])
+    hist_loss.extend(history.history["loss"])
+    hist_val_loss.extend(history.history["val_loss"])
+    # show some predictions
+    Xtest, Ytest = next(test_gen)
+    Ytest_ = model.predict(Xtest)
+    ytest = np.argmax(Ytest, axis=2)
+    ytest_ = np.argmax(Ytest_, axis=2)
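+    # (illustrative addition, not in the original script) a PAD-masked hit
+    # rate for this validation batch, along the lines of the custom
+    # evaluation in pos-tagging-explore.py, could be computed as:
+    #mask = Xtest > 0
+    #print("non-PAD hit rate: {:.3f}".format(
+    #    np.sum((ytest == ytest_) & mask) / np.sum(mask)))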
+    print("=" * 80)
+    print("Iteration # {:d}".format(i + 1))
+    print("-" * 80)
+    for j in range(min(5, Ytest.shape[0])):
+        sent_ids = Xtest[j]
+        sent_words = [s_index2word[x] for x in sent_ids.tolist()]
+        pos_labels = [t_index2word[x] for x in ytest[j].tolist()]
+        pos_preds = [t_index2word[x] for x in ytest_[j].tolist()]
+        triples = [x for x in zip(sent_words, pos_labels, pos_preds)
+                   if x[0] != "PAD"]
+        print("label: " + " ".join([x[0] + "/" + x[1].upper()
+                                    for x in triples]))
+        print("predicted: " + " ".join([x[0] + "/" + x[2].upper()
+                                        for x in triples]))
+    print("-" * 80)
+
+# plot loss and accuracy
+plt.subplot(211)
+plt.title("Accuracy")
+plt.plot(hist_acc, color="g", label="Train")
+plt.plot(hist_val_acc, color="b", label="Validation")
+plt.legend(loc="best")
+
+plt.subplot(212)
+plt.title("Loss")
+plt.plot(hist_loss, color="g", label="Train")
+plt.plot(hist_val_loss, color="b", label="Validation")
+plt.legend(loc="best")
+
+plt.tight_layout()
+plt.show()
+
diff --git a/Chapter06/umich_sentiment_lstm.py b/Chapter06/umich_sentiment_lstm.py
new file mode 100644
index 0000000..38f5410
--- /dev/null
+++ b/Chapter06/umich_sentiment_lstm.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
+from keras.layers.embeddings import Embedding
+from keras.layers.recurrent import LSTM
+from keras.models import Sequential
+from keras.preprocessing import sequence
+from sklearn.model_selection import train_test_split
+import collections
+import matplotlib.pyplot as plt
+import nltk
+import numpy as np
+import os
+
+DATA_DIR = "../data"
+
+MAX_FEATURES = 2000
+MAX_SENTENCE_LENGTH = 40
+
+EMBEDDING_SIZE = 128
+HIDDEN_LAYER_SIZE = 64
+BATCH_SIZE = 32
+NUM_EPOCHS = 10
+
+# Read training data and generate vocabulary
+maxlen = 0
+word_freqs = collections.Counter()
+num_recs = 0
+ftrain = open(os.path.join(DATA_DIR, "umich-sentiment-train.txt"), 'rb')
+for line in ftrain:
+    label, sentence = line.strip().split("\t")
+    words = nltk.word_tokenize(sentence.decode("ascii", "ignore").lower())
+    if len(words) > maxlen:
+        maxlen = len(words)
+    for word in words:
+        word_freqs[word] += 1
+    num_recs += 1
+ftrain.close()
+
+## Get some information about our corpus
+#print maxlen            # 42
+#print len(word_freqs)   # 2313
+
+# 0 is reserved for PAD and 1 for UNK
+# We keep the MAX_FEATURES most frequent words and add 2 to the vocabulary
+# size to account for the PAD and UNK pseudo-words
+vocab_size = min(MAX_FEATURES, len(word_freqs)) + 2
+word2index = {x[0]: i+2 for i, x in
+              enumerate(word_freqs.most_common(MAX_FEATURES))}
+word2index["PAD"] = 0
+word2index["UNK"] = 1
+index2word = {v:k for k, v in word2index.items()}
+
+# convert sentences to sequences
+X = np.empty((num_recs, ), dtype=list)
+y = np.zeros((num_recs, ))
+i = 0
+ftrain = open(os.path.join(DATA_DIR, "umich-sentiment-train.txt"), 'rb')
+for line in ftrain:
+    label, sentence = line.strip().split("\t")
+    words = nltk.word_tokenize(sentence.decode("ascii", "ignore").lower())
+    seqs = []
+    for word in words:
+        if word in word2index:
+            seqs.append(word2index[word])
+        else:
+            seqs.append(word2index["UNK"])
+    X[i] = seqs
+    y[i] = int(label)
+    i += 1
+ftrain.close()
+
+# Pad the sequences (left padded with zeros)
+X = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)
+
+# Split input into training and test
+Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2,
+                                                random_state=42)
+print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)
+
+# Build model
+model = Sequential()
+model.add(Embedding(vocab_size, EMBEDDING_SIZE,
+                    input_length=MAX_SENTENCE_LENGTH))
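+# SpatialDropout1D drops entire embedding channels (whole columns of the
+# (timesteps, embedding_size) output) instead of individual activations,
+# which is generally a better fit for the correlated outputs of an
+# Embedding layer than plain Dropout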
+model.add(SpatialDropout1D(0.2))
+model.add(LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2))
+model.add(Dense(1))
+model.add(Activation("sigmoid"))
+
+model.compile(loss="binary_crossentropy", optimizer="adam",
+              metrics=["accuracy"])
+
+history = model.fit(Xtrain, ytrain, batch_size=BATCH_SIZE,
+                    epochs=NUM_EPOCHS,
+                    validation_data=(Xtest, ytest))
+
+# plot loss and accuracy
+plt.subplot(211)
+plt.title("Accuracy")
+plt.plot(history.history["acc"], color="g", label="Train")
+plt.plot(history.history["val_acc"], color="b", label="Validation")
+plt.legend(loc="best")
+
+plt.subplot(212)
+plt.title("Loss")
+plt.plot(history.history["loss"], color="g", label="Train")
+plt.plot(history.history["val_loss"], color="b", label="Validation")
+plt.legend(loc="best")
+
+plt.tight_layout()
+plt.show()
+
+# evaluate
+score, acc = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)
+print("Test score: %.3f, accuracy: %.3f" % (score, acc))
+
+for i in range(5):
+    idx = np.random.randint(len(Xtest))
+    xtest = Xtest[idx].reshape(1,40)
+    ylabel = ytest[idx]
+    ypred = model.predict(xtest)[0][0]
+    sent = " ".join([index2word[x] for x in xtest[0].tolist() if x != 0])
+    print("%.0f\t%d\t%s" % (ypred, ylabel, sent))
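+
+# Usage sketch (an addition, not part of the original script): score a single
+# made-up sentence with the trained model, reusing word2index, "UNK",
+# MAX_SENTENCE_LENGTH and the left-padding convention from above
+new_sent = "i really loved this movie"
+new_words = nltk.word_tokenize(new_sent.lower())
+new_seq = [word2index[w] if w in word2index else word2index["UNK"]
+           for w in new_words]
+new_X = sequence.pad_sequences([new_seq], maxlen=MAX_SENTENCE_LENGTH)
+print("predicted sentiment score for %r: %.3f" %
+      (new_sent, model.predict(new_X)[0][0]))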