Forked from PacktPublishing/Deep-Learning-with-Keras
Commit f558d37 (1 parent: c1b0367), committed by dinesh-packt on Apr 25, 2017.
Showing 11 changed files with 890 additions and 0 deletions.
New file (111 lines): a 1D-CNN sentiment classifier on the UMICH dataset, with the embedding layer initialized from pre-trained GloVe vectors and fine-tuned during training.
from keras.layers.core import Dense, SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np

np.random.seed(42)

INPUT_FILE = "../data/umich-sentiment-train.txt"
GLOVE_MODEL = "../data/glove.6B.300d.txt"
VOCAB_SIZE = 5000
EMBED_SIZE = 300
NUM_FILTERS = 256
NUM_WORDS = 3
BATCH_SIZE = 64
NUM_EPOCHS = 10

# first pass over the data: build word counts and find the longest sentence
# (text mode so that split("\t") works under Python 3)
counter = collections.Counter()
fin = open(INPUT_FILE, "r")
maxlen = 0
for line in fin:
    _, sent = line.strip().split("\t")
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    if len(words) > maxlen:
        maxlen = len(words)
    for word in words:
        counter[word] += 1
fin.close()

# map the VOCAB_SIZE most frequent words to ids 1..VOCAB_SIZE;
# id 0 is reserved for padding and out-of-vocabulary words
word2index = collections.defaultdict(int)
for wid, word in enumerate(counter.most_common(VOCAB_SIZE)):
    word2index[word[0]] = wid + 1
vocab_sz = len(word2index) + 1
index2word = {v: k for k, v in word2index.items()}

# second pass: convert each sentence to a padded sequence of word ids
xs, ys = [], []
fin = open(INPUT_FILE, "r")
for line in fin:
    label, sent = line.strip().split("\t")
    ys.append(int(label))
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    wids = [word2index[word] for word in words]
    xs.append(wids)
fin.close()
X = pad_sequences(xs, maxlen=maxlen)
Y = np_utils.to_categorical(ys)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3,
                                                random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

# load GloVe vectors (text mode so the keys are str, matching word2index)
word2emb = {}
fglove = open(GLOVE_MODEL, "r")
for line in fglove:
    cols = line.strip().split()
    word = cols[0]
    embedding = np.array(cols[1:], dtype="float32")
    word2emb[word] = embedding
fglove.close()

# build the embedding matrix; words without a GloVe vector stay all-zero
embedding_weights = np.zeros((vocab_sz, EMBED_SIZE))
for word, index in word2index.items():
    try:
        embedding_weights[index, :] = word2emb[word]
    except KeyError:
        pass

# 1D CNN over GloVe-initialized embeddings, fine-tuned during training
# (SpatialDropout1D takes a dropout rate, not a Dropout layer)
model = Sequential()
model.add(Embedding(vocab_sz, EMBED_SIZE, input_length=maxlen,
                    weights=[embedding_weights],
                    trainable=True))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS,
                 activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(2, activation="softmax"))

||
model.compile(optimizer="adam", loss="categorical_crossentropy", | ||
metrics=["accuracy"]) | ||
|
||
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, | ||
epochs=NUM_EPOCHS, | ||
validation_data=(Xtest, Ytest)) | ||
|
||
|
||
# plot training curves: accuracy on top, loss below
plt.subplot(211)
plt.title("accuracy")
plt.plot(history.history["acc"], color="r", label="train")
plt.plot(history.history["val_acc"], color="b", label="validation")
plt.legend(loc="best")

plt.subplot(212)
plt.title("loss")
plt.plot(history.history["loss"], color="r", label="train")
plt.plot(history.history["val_loss"], color="b", label="validation")
plt.legend(loc="best")

plt.tight_layout()
plt.show()

# evaluate model
score = model.evaluate(Xtest, Ytest, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))
New file (103 lines): the same 1D-CNN classifier, but with the embedding layer initialized from pre-trained word2vec vectors loaded through gensim.
from gensim.models import KeyedVectors
from keras.layers.core import Dense, SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np

np.random.seed(42)

INPUT_FILE = "../data/umich-sentiment-train.txt"
WORD2VEC_MODEL = "../data/GoogleNews-vectors-negative300.bin.gz"
VOCAB_SIZE = 5000
EMBED_SIZE = 300
NUM_FILTERS = 256
NUM_WORDS = 3
BATCH_SIZE = 64
NUM_EPOCHS = 10

# first pass: word counts and longest sentence (text mode for Python 3)
counter = collections.Counter()
fin = open(INPUT_FILE, "r")
maxlen = 0
for line in fin:
    _, sent = line.strip().split("\t")
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    if len(words) > maxlen:
        maxlen = len(words)
    for word in words:
        counter[word] += 1
fin.close()

# most frequent words get ids 1..VOCAB_SIZE; 0 is padding/OOV
word2index = collections.defaultdict(int)
for wid, word in enumerate(counter.most_common(VOCAB_SIZE)):
    word2index[word[0]] = wid + 1
vocab_sz = len(word2index) + 1
index2word = {v: k for k, v in word2index.items()}

# second pass: sentences to padded id sequences
xs, ys = [], []
fin = open(INPUT_FILE, "r")
for line in fin:
    label, sent = line.strip().split("\t")
    ys.append(int(label))
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    wids = [word2index[word] for word in words]
    xs.append(wids)
fin.close()
X = pad_sequences(xs, maxlen=maxlen)
Y = np_utils.to_categorical(ys)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3,
                                                random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

# load the pre-trained word2vec model; words missing from it keep
# all-zero rows in the embedding matrix
word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_MODEL, binary=True)
embedding_weights = np.zeros((vocab_sz, EMBED_SIZE))
for word, index in word2index.items():
    try:
        embedding_weights[index, :] = word2vec[word]
    except KeyError:
        pass

# same 1D CNN as before, initialized with word2vec and fine-tuned
# (SpatialDropout1D takes a dropout rate, not a Dropout layer)
model = Sequential()
model.add(Embedding(vocab_sz, EMBED_SIZE, input_length=maxlen,
                    weights=[embedding_weights],
                    trainable=True))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS,
                 activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(2, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(Xtest, Ytest))

# plot training curves: accuracy on top, loss below
plt.subplot(211)
plt.title("accuracy")
plt.plot(history.history["acc"], color="r", label="train")
plt.plot(history.history["val_acc"], color="b", label="validation")
plt.legend(loc="best")

plt.subplot(212)
plt.title("loss")
plt.plot(history.history["loss"], color="r", label="train")
plt.plot(history.history["val_loss"], color="b", label="validation")
plt.legend(loc="best")

plt.tight_layout()
plt.show()

# evaluate model
score = model.evaluate(Xtest, Ytest, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))
New file (23 lines): a CBOW model skeleton in which the context words are embedded, averaged, and fed to a softmax over the vocabulary.
# -*- coding: utf-8 -*-
from __future__ import division, print_function
from keras.models import Sequential
from keras.layers.core import Dense, Lambda
from keras.layers.embeddings import Embedding
import keras.backend as K

vocab_size = 5000
embed_size = 300
window_size = 1

# CBOW: the 2*window_size context words are embedded, averaged, and used
# to predict the center word
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_size,
                    embeddings_initializer='glorot_uniform',
                    input_length=window_size*2))
model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
model.add(Dense(vocab_size, kernel_initializer='glorot_uniform',
                activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer="adadelta")

# the learned word vectors are the weights of the embedding layer
weights = model.layers[0].get_weights()[0]
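The file defines only the architecture; the commit does not include training-pair generation. Below is a minimal sketch of how (context, center) pairs could be produced and fed in. The cbow_pairs helper and the toy wids list are mine, not part of the book's code; the sketch assumes the model, vocab_size, and window_size defined above.

import numpy as np
from keras.utils import np_utils

def cbow_pairs(wids, window_size):
    # build ([left context..., right context...], center word) pairs
    xs, ys = [], []
    for i in range(window_size, len(wids) - window_size):
        context = (wids[i - window_size:i] +
                   wids[i + 1:i + window_size + 1])
        xs.append(context)
        ys.append(wids[i])
    return np.array(xs), np.array(ys)

# hypothetical usage with the model above
wids = [1, 12, 7, 42, 5, 9]  # made-up word ids
Xc, yc = cbow_pairs(wids, window_size)
Yc = np_utils.to_categorical(yc, vocab_size)
model.fit(Xc, Yc, epochs=1, verbose=0)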
New file (33 lines): a skip-gram model skeleton that embeds a (word, context) pair and scores it with a dot product followed by a sigmoid.
# -*- coding: utf-8 -*-
from __future__ import division, print_function
from keras.layers import Merge
from keras.layers.core import Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential

vocab_size = 5000
embed_size = 300

# skip-gram as a binary classifier: embed a (word, context) pair and
# predict whether the pair was really observed together
word_model = Sequential()
word_model.add(Embedding(vocab_size, embed_size,
                         embeddings_initializer="glorot_uniform",
                         input_length=1))
word_model.add(Reshape((embed_size,)))

context_model = Sequential()
context_model.add(Embedding(vocab_size, embed_size,
                            embeddings_initializer="glorot_uniform",
                            input_length=1))
context_model.add(Reshape((embed_size,)))

# NOTE: the Merge layer is Keras 1 API and was removed in later releases;
# a functional-API equivalent is sketched below this listing
model = Sequential()
model.add(Merge([word_model, context_model], mode="dot", dot_axes=0))
model.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))

model.compile(loss="mean_squared_error", optimizer="adam")

# the word vectors live in the embedding layer of the word branch
merge_layer = model.layers[0]
word_model = merge_layer.layers[0]
word_embed_layer = word_model.layers[0]
weights = word_embed_layer.get_weights()[0]
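Since Merge is gone from current Keras, here is a hedged functional-API sketch of the same dot-product scorer. The layer wiring and names are mine, under the assumption of Keras 2-style Input/Dot/Model; it is a sketch, not the book's implementation.

from keras.layers import Dense, Dot, Embedding, Input, Reshape
from keras.models import Model

word_in = Input(shape=(1,))
ctx_in = Input(shape=(1,))
word_emb = Reshape((embed_size,))(
    Embedding(vocab_size, embed_size)(word_in))
ctx_emb = Reshape((embed_size,))(
    Embedding(vocab_size, embed_size)(ctx_in))
dot = Dot(axes=-1)([word_emb, ctx_emb])  # dot product of the two embeddings
pred = Dense(1, activation="sigmoid")(dot)
sg_model = Model(inputs=[word_in, ctx_in], outputs=pred)
sg_model.compile(loss="mean_squared_error", optimizer="adam")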
New file (90 lines): the same 1D-CNN classifier with the embedding layer learned from scratch (no pre-trained vectors, EMBED_SIZE=100, 20 epochs).
from keras.layers.core import Dense, SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np

np.random.seed(42)

INPUT_FILE = "../data/umich-sentiment-train.txt"
VOCAB_SIZE = 5000
EMBED_SIZE = 100
NUM_FILTERS = 256
NUM_WORDS = 3
BATCH_SIZE = 64
NUM_EPOCHS = 20

# first pass: word counts and longest sentence (text mode for Python 3)
counter = collections.Counter()
fin = open(INPUT_FILE, "r")
maxlen = 0
for line in fin:
    _, sent = line.strip().split("\t")
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    if len(words) > maxlen:
        maxlen = len(words)
    for word in words:
        counter[word] += 1
fin.close()

# most frequent words get ids 1..VOCAB_SIZE; 0 is padding/OOV
word2index = collections.defaultdict(int)
for wid, word in enumerate(counter.most_common(VOCAB_SIZE)):
    word2index[word[0]] = wid + 1
vocab_sz = len(word2index) + 1
index2word = {v: k for k, v in word2index.items()}

# second pass: sentences to padded id sequences
xs, ys = [], []
fin = open(INPUT_FILE, "r")
for line in fin:
    label, sent = line.strip().split("\t")
    ys.append(int(label))
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    wids = [word2index[word] for word in words]
    xs.append(wids)
fin.close()
X = pad_sequences(xs, maxlen=maxlen)
Y = np_utils.to_categorical(ys)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3,
                                                random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

# same 1D CNN, but the embedding is learned from scratch
# (SpatialDropout1D takes a dropout rate, not a Dropout layer)
model = Sequential()
model.add(Embedding(vocab_sz, EMBED_SIZE, input_length=maxlen))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS, activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(2, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(Xtest, Ytest))

# plot training curves: accuracy on top, loss below
plt.subplot(211)
plt.title("accuracy")
plt.plot(history.history["acc"], color="r", label="train")
plt.plot(history.history["val_acc"], color="b", label="validation")
plt.legend(loc="best")

plt.subplot(212)
plt.title("loss")
plt.plot(history.history["loss"], color="r", label="train")
plt.plot(history.history["val_loss"], color="b", label="validation")
plt.legend(loc="best")

plt.tight_layout()
plt.show()

# evaluate model
score = model.evaluate(Xtest, Ytest, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))
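Because this variant learns its embedding from scratch, the trained vectors can be pulled back out of the model for inspection. A minimal sketch, assuming the trained model and index mappings above are in scope and the query word is in the vocabulary; the cosine-similarity helper is mine, not part of the commit.

# hypothetical: inspect the embedding learned from scratch
emb = model.layers[0].get_weights()[0]  # shape (vocab_sz, EMBED_SIZE)

def most_similar(word, topn=5):
    # rank vocabulary words by cosine similarity to the query word
    v = emb[word2index[word]]
    sims = emb.dot(v) / (np.linalg.norm(emb, axis=1) * np.linalg.norm(v) + 1e-8)
    best = np.argsort(-sims)[1:topn + 1]  # skip the word itself
    return [(index2word.get(i, "?"), float(sims[i])) for i in best]

print(most_similar("good"))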
New file (21 lines): a short demo of Keras's skipgrams generator on a toy sentence.
# -*- coding: utf-8 -*-
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import skipgrams

text = "I love green eggs and ham ."

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}

# generate (target, context) pairs labeled 1 for observed pairs and
# 0 for randomly sampled negative pairs
wids = [word2id[w] for w in text_to_word_sequence(text)]
pairs, labels = skipgrams(wids, len(word2id))
print(len(pairs), len(labels))
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        id2word[pairs[i][0]], pairs[i][0],
        id2word[pairs[i][1]], pairs[i][1],
        labels[i]))
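These pairs are exactly the training data the dot-product skip-gram model expects. A hypothetical glue snippet, assuming a two-input model such as the sg_model from the functional-API sketch earlier; it is not part of the commit.

import numpy as np

# split pairs into the two model inputs and train on the 0/1 labels
X_word = np.array([p[0] for p in pairs])
X_ctx = np.array([p[1] for p in pairs])
y = np.array(labels)
sg_model.fit([X_word, X_ctx], y, batch_size=8, epochs=1, verbose=0)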