Updated Chapter06
dinesh-packt authored Apr 25, 2017
1 parent f558d37 commit c10bb70
Showing 7 changed files with 852 additions and 0 deletions.
106 changes: 106 additions & 0 deletions Chapter06/alice_chargen_rnn.py
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
# Adapted from lstm_text_generation.py in keras/examples
from __future__ import print_function
from keras.layers.recurrent import SimpleRNN
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy as np

INPUT_FILE = "../data/alice_in_wonderland.txt"

# extract the input as a stream of characters
print("Extracting text from input...")
fin = open(INPUT_FILE, 'rb')
lines = []
for line in fin:
    line = line.strip().lower()
    line = line.decode("ascii", "ignore")
    if len(line) == 0:
        continue
    lines.append(line)
fin.close()
text = " ".join(lines)

# create lookup tables
# chars is the set of distinct characters in the text; its size nb_chars
# is the number of features in our character "vocabulary"
chars = set([c for c in text])
nb_chars = len(chars)
char2index = dict((c, i) for i, c in enumerate(chars))
index2char = dict((i, c) for i, c in enumerate(chars))

# create inputs and labels from the text. We do this by stepping
# through the text ${step} character at a time, and extracting a
# sequence of size ${seqlen} and the next output char. For example,
# assuming an input text "The sky was falling", we would get the
# following sequence of input_chars and label_chars (first 5 only)
#   "The sky wa" -> "s"
#   "he sky was" -> " "
#   "e sky was " -> "f"
#   " sky was f" -> "a"
#   "sky was fa" -> "l"
print("Creating input and label text...")
SEQLEN = 10
STEP = 1

input_chars = []
label_chars = []
for i in range(0, len(text) - SEQLEN, STEP):
    input_chars.append(text[i:i + SEQLEN])
    label_chars.append(text[i + SEQLEN])

# vectorize the input and label chars
# Each row of the input is a sequence of SEQLEN characters, each
# represented as a 1-hot encoding of size nb_chars. There are
# len(input_chars) such rows, so shape(X) is (len(input_chars),
# SEQLEN, nb_chars).
# Each row of the output is a single character, also represented as a
# 1-hot encoding of size nb_chars. Hence shape(y) is (len(input_chars),
# nb_chars).
print("Vectorizing input and label text...")
X = np.zeros((len(input_chars), SEQLEN, nb_chars), dtype=np.bool)
y = np.zeros((len(input_chars), nb_chars), dtype=np.bool)
for i, input_char in enumerate(input_chars):
    for j, ch in enumerate(input_char):
        X[i, j, char2index[ch]] = 1
    y[i, char2index[label_chars[i]]] = 1

# Build the model. We use a single RNN with a fully connected layer
# to compute the most likely predicted output char
HIDDEN_SIZE = 128
BATCH_SIZE = 128
NUM_ITERATIONS = 25
NUM_EPOCHS_PER_ITERATION = 1
NUM_PREDS_PER_EPOCH = 100

model = Sequential()
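# single SimpleRNN layer; return_sequences=False emits only the output at
# the last timestep, and unroll=True unrolls the recurrence over the fixed
# SEQLEN timesteps (faster, at the cost of more memory)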
model.add(SimpleRNN(HIDDEN_SIZE, return_sequences=False,
                    input_shape=(SEQLEN, nb_chars),
                    unroll=True))
model.add(Dense(nb_chars))
model.add(Activation("softmax"))

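# the output layer is a softmax over the character vocabulary, so
# categorical cross-entropy is the natural loss; RMSprop is a common
# optimizer choice for RNNs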
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

# We train the model in batches and test output generated at each step
for iteration in range(NUM_ITERATIONS):
    print("=" * 50)
    print("Iteration #: %d" % (iteration))
    model.fit(X, y, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS_PER_ITERATION)

    # testing model
    # randomly choose a row from input_chars, then use it to
    # generate text from model for next 100 chars
    test_idx = np.random.randint(len(input_chars))
    test_chars = input_chars[test_idx]
    print("Generating from seed: %s" % (test_chars))
    print(test_chars, end="")
    for i in range(NUM_PREDS_PER_EPOCH):
        Xtest = np.zeros((1, SEQLEN, nb_chars))
        for j, ch in enumerate(test_chars):
            Xtest[0, j, char2index[ch]] = 1
        pred = model.predict(Xtest, verbose=0)[0]
        ypred = index2char[np.argmax(pred)]
        print(ypred, end="")
        # move forward with test_chars + ypred
        test_chars = test_chars[1:] + ypred
    print()
32 changes: 32 additions & 0 deletions Chapter06/econs_data.py
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
import os
import re

DATA_DIR = "../data"

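# LD2011_2014.txt (UCI ElectricityLoadDiagrams20112014) has one row per
# 15-minute timestamp; the first column is the timestamp and the remaining
# 370 semicolon-separated columns are per-client readings written with
# decimal commas, hence the re.sub(",", ".") below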
fld = open(os.path.join(DATA_DIR, "LD2011_2014.txt"), "rb")
data = []
line_num = 0
#cid = np.random.randint(0, 370, 1)
cid = 250
for line in fld:
    if line.startswith("\"\";"):
        continue
    if line_num % 100 == 0:
        print("{:d} lines read".format(line_num))
    cols = [float(re.sub(",", ".", x)) for x in
            line.strip().split(";")[1:]]
    data.append(cols[cid])
    line_num += 1
fld.close()

NUM_ENTRIES = 1000
plt.plot(range(NUM_ENTRIES), data[0:NUM_ENTRIES])
plt.ylabel("electricity consumption")
plt.xlabel("time (1pt = 15 mins)")
plt.show()

np.save(os.path.join(DATA_DIR, "LD_250.npy"), np.array(data))
82 changes: 82 additions & 0 deletions Chapter06/econs_stateful.py
@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
from __future__ import division, print_function
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import math
import os

DATA_DIR = "../data"

data = np.load(os.path.join(DATA_DIR, "LD_250.npy"))

STATELESS = False
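# STATELESS = True trains a plain (stateless) LSTM; with STATELESS = False
# a stateful LSTM is used, which carries its cell state across batches and
# therefore needs a fixed batch_input_shape and an explicit reset_states()
# between epochs (see the training loop below)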

NUM_TIMESTEPS = 20
HIDDEN_SIZE = 10
BATCH_SIZE = 96 # 24 hours (15 min intervals)
NUM_EPOCHS = 5

# scale the data to be in the range (0, 1)
data = data.reshape(-1, 1)
scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
data = scaler.fit_transform(data)

# transform to NUM_TIMESTEPS inputs -> 1 label format
X = np.zeros((data.shape[0], NUM_TIMESTEPS))
Y = np.zeros((data.shape[0], 1))
for i in range(len(data) - NUM_TIMESTEPS - 1):
    X[i] = data[i:i + NUM_TIMESTEPS].T
    Y[i] = data[i + NUM_TIMESTEPS + 1]

# reshape X to three dimensions (samples, timesteps, features)
X = np.expand_dims(X, axis=2)

# split into training (70%) and test (30%) sets
sp = int(0.7 * len(data))
Xtrain, Xtest, Ytrain, Ytest = X[0:sp], X[sp:], Y[0:sp], Y[sp:]
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

if STATELESS:
    # stateless
    model = Sequential()
    model.add(LSTM(HIDDEN_SIZE, input_shape=(NUM_TIMESTEPS, 1),
                   return_sequences=False))
    model.add(Dense(1))
else:
    # stateful
    model = Sequential()
    model.add(LSTM(HIDDEN_SIZE, stateful=True,
                   batch_input_shape=(BATCH_SIZE, NUM_TIMESTEPS, 1),
                   return_sequences=False))
    model.add(Dense(1))

model.compile(loss="mean_squared_error", optimizer="adam",
              metrics=["mean_squared_error"])

if STATELESS:
    # stateless
    model.fit(Xtrain, Ytrain, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
              validation_data=(Xtest, Ytest),
              shuffle=False)
else:
    # stateful
    # need to truncate training and test data to a multiple of BATCH_SIZE
    train_size = (Xtrain.shape[0] // BATCH_SIZE) * BATCH_SIZE
    test_size = (Xtest.shape[0] // BATCH_SIZE) * BATCH_SIZE
    Xtrain, Ytrain = Xtrain[0:train_size], Ytrain[0:train_size]
    Xtest, Ytest = Xtest[0:test_size], Ytest[0:test_size]
    print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)
    for i in range(NUM_EPOCHS):
        print("Epoch {:d}/{:d}".format(i + 1, NUM_EPOCHS))
        model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, epochs=1,
                  validation_data=(Xtest, Ytest),
                  shuffle=False)
        # clear the carried-over state between epochs
        model.reset_states()

score, _ = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
rmse = math.sqrt(score)
print("\nMSE: {:.3f}, RMSE: {:.3f}".format(score, rmse))
170 changes: 170 additions & 0 deletions Chapter06/pos-tagging-explore.py
@@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-
from __future__ import division, print_function
from keras.layers.core import Activation, Dense, Dropout, RepeatVector, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU, LSTM
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.utils import np_utils
import collections
import matplotlib.pyplot as plt
import numpy as np
import os

def explore_data(datadir, datafiles):
    counter = collections.Counter()
    maxlen = 0
    for datafile in datafiles:
        fdata = open(os.path.join(datadir, datafile), "rb")
        for line in fdata:
            words = line.strip().split()
            if len(words) > maxlen:
                maxlen = len(words)
            for word in words:
                counter[word] += 1
        fdata.close()
    return maxlen, counter

def build_tensor(filename, numrecs, word2index, maxlen,
                 make_categorical=False):
    data = np.empty((numrecs, ), dtype=list)
    fin = open(filename, "rb")
    i = 0
    for line in fin:
        wids = []
        for word in line.strip().split():
            if word in word2index:
                wids.append(word2index[word])
            else:
                wids.append(word2index["UNK"])
        if make_categorical:
            data[i] = np_utils.to_categorical(
                wids, num_classes=len(word2index))
        else:
            data[i] = wids
        i += 1
    fin.close()
    pdata = sequence.pad_sequences(data, maxlen=maxlen)
    return pdata

def evaluate_model(model, Xtest, Ytest, batch_size):
    pass
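# (evaluate_model is an unused stub; evaluation is done inline further down)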

DATA_DIR = "../data"

s_maxlen, s_counter = explore_data(DATA_DIR, ["babi-sent-train.txt",
                                              "babi-sent-test.txt"])
t_maxlen, t_counter = explore_data(DATA_DIR, ["babi-pos-train.txt",
                                              "babi-pos-test.txt"])

print(s_maxlen, len(s_counter), t_maxlen, len(t_counter))
# 7 21 7 9
# maxlen: 7
# size of source vocab: 21
# size of target vocab: 9

# lookup tables
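# ids from most_common() are shifted by 1 so that id 0 can be reserved
# for the PAD token in both the word and POS vocabularies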
s_word2id = {k:v+1 for v, (k, _) in enumerate(s_counter.most_common())}
s_word2id["PAD"] = 0
s_id2word = {v:k for k, v in s_word2id.items()}
t_pos2id = {k:v+1 for v, (k, _) in enumerate(t_counter.most_common())}
t_pos2id["PAD"] = 0
t_id2pos = {v:k for k, v in t_pos2id.items()}

# vectorize data
MAX_SEQLEN = 10

Xtrain = build_tensor(os.path.join(DATA_DIR, "babi-sent-train.txt"),
                      30000, s_word2id, MAX_SEQLEN)
Xtest = build_tensor(os.path.join(DATA_DIR, "babi-sent-test.txt"),
                     3000, s_word2id, MAX_SEQLEN)
Ytrain = build_tensor(os.path.join(DATA_DIR, "babi-pos-train.txt"),
                      30000, t_pos2id, MAX_SEQLEN, make_categorical=True)
Ytest = build_tensor(os.path.join(DATA_DIR, "babi-pos-test.txt"),
                     3000, t_pos2id, MAX_SEQLEN, make_categorical=True)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

# define network
EMBED_SIZE = 32
HIDDEN_SIZE = 32

BATCH_SIZE = 32
NUM_EPOCHS = 5

model = Sequential()
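# encoder-decoder setup: the first LSTM encodes the padded input sentence
# into a single vector, RepeatVector copies that vector MAX_SEQLEN times,
# and the second LSTM decodes it into one POS-tag distribution per timestep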
model.add(Embedding(len(s_word2id), EMBED_SIZE,
                    input_length=MAX_SEQLEN))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
#model.add(GRU(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
#model.add(Bidirectional(LSTM(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2)))
model.add(RepeatVector(MAX_SEQLEN))
model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
#model.add(GRU(HIDDEN_SIZE, return_sequences=True))
#model.add(Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True)))
model.add(TimeDistributed(Dense(len(t_pos2id))))
model.add(Activation("softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])

history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
                    validation_data=[Xtest, Ytest])

# plot loss and accuracy
plt.subplot(211)
plt.title("Accuracy")
plt.plot(history.history["acc"], color="g", label="Train")
plt.plot(history.history["val_acc"], color="b", label="Validation")
plt.legend(loc="best")

plt.subplot(212)
plt.title("Loss")
plt.plot(history.history["loss"], color="g", label="Train")
plt.plot(history.history["val_loss"], color="b", label="Validation")
plt.legend(loc="best")

plt.tight_layout()
plt.show()

# evaluate model
score, acc = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
print("Test score: %.3f, accuracy: %.3f" % (score, acc))

# custom evaluate
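# model.evaluate() scores every timestep, including the PAD positions, which
# tends to inflate the accuracy; here we score only the non-PAD (nonzero)
# positions of each sentence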
hit_rates = []
num_iters = Xtest.shape[0] // BATCH_SIZE
for i in range(num_iters - 1):
    xtest = Xtest[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
    ytest = np.argmax(Ytest[i * BATCH_SIZE : (i + 1) * BATCH_SIZE], axis=2)
    ytest_ = np.argmax(model.predict(xtest), axis=2)
    # print(ytest.shape, ytest_.shape)
    for j in range(BATCH_SIZE):
        # print("sentence: " + " ".join([s_id2word[x] for x in xtest[j].tolist()]))
        # print("predicted: " + " ".join([t_id2pos[y] for y in ytest_[j].tolist()]))
        # print("label: " + " ".join([t_id2pos[y] for y in ytest[j].tolist()]))
        word_indices = np.nonzero(xtest[j])
        pos_labels = ytest[j][word_indices]
        pos_pred = ytest_[j][word_indices]
        hit_rates.append(np.sum(pos_labels == pos_pred) / len(pos_pred))
    break

accuracy = sum(hit_rates) / len(hit_rates)
print("accuracy: {:.3f}".format(accuracy))

# prediction
pred_ids = np.random.randint(0, 3000, 5)
for pred_id in pred_ids:
    xtest = Xtest[pred_id].reshape(1, 10)
    ytest_ = np.argmax(model.predict(xtest), axis=2)
    ytest = np.argmax(Ytest[pred_id], axis=1)
    print("sentence: " + " ".join([s_id2word[x] for x in xtest[0].tolist()]))
    print("predicted: " + " ".join([t_id2pos[y] for y in ytest_[0].tolist()]))
    print("label: " + " ".join([t_id2pos[y] for y in ytest.tolist()]))
    word_indices = np.nonzero(xtest)[1]
    ypred_tags = ytest_[0][word_indices]
    ytrue_tags = ytest[word_indices]
    hit_rate = np.sum(ypred_tags == ytrue_tags) / len(ypred_tags)
    print("hit rate: {:.3f}".format(hit_rate))
    print()
(3 more changed files not shown)