Revert to keras provided features
Ioannis Chalkiadakis committed Aug 14, 2017
1 parent d113f89 commit 32efdee
Showing 3 changed files with 111 additions and 46 deletions.
sbatch_run.sh (2 changes: 1 addition & 1 deletion)
@@ -10,7 +10,7 @@ module load easybuild
module load lang/Python/3.5.2-foss-2016b

#execute application
python3.5 /home/icha/tRustNN/src/dynamic_lstm_TF.py with 'seed=27105828' 'db="file"' 'run_id="lstm_big_embedding_nwords10k"' 'n_epoch=10' 'net_arch_layers = ["lstm","fc","output"]' 'net_arch.lstm.return_state=True' 'net_arch.lstm.return_seq=True' 'batch_size = 32' 'save_path = "/home/icha/sacred_models/"' 'tensorboard_dir = "/home/icha/sacred_models/tf_logs/"' 'embedding_dim = 300' 'internals = "all"' 'save_mode="pickle"' 'test_size=0.005' 'net_arch.lstm.n_units=128' 'n_words=10000' 'embedding_layer=1'
python3.5 /home/icha/tRustNN/src/dynamic_lstm_TF.py with 'seed=27105828' 'db="file"' 'run_id="lstm_embedding_newLRP"' 'n_epoch=100' 'net_arch_layers = ["lstm","fc","output"]' 'net_arch.lstm.return_state=True' 'net_arch.lstm.return_seq=True' 'batch_size = 32' 'save_path = "/home/icha/sacred_models/"' 'tensorboard_dir = "/home/icha/sacred_models/tf_logs/"' 'embedding_dim = 150' 'internals = "all"' 'save_mode="pickle"' 'test_size=0.005' 'net_arch.lstm.n_units=128' 'n_words=10000' 'embedding_layer=1'


#python3.5 /home/icha/tRustNN/src/dynamic_lstm_TF.py with 'seed=73467635' 'db="file"' 'run_id="lstm_small_embedding_toy_noInitMat"' 'net_arch.lstm.n_units=20' 'net_arch_layers = ["lstm","fc","output"]' 'net_arch.lstm.return_state=True' 'net_arch.lstm.return_seq=True' 'batch_size = 32' 'save_path = "/home/icha/sacred_models/"' 'tensorboard_dir = "/home/icha/sacred_models/tf_logs/"' 'embedding_dim = 300' 'internals = "all"' 'save_mode="pickle"' 'test_size=0.005' 'n_words=1000' 'embedding_layer=1' 'n_epoch = 10'
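For reference, the with 'key=value' arguments in the commands above are Sacred command-line config overrides: each quoted pair replaces one entry of the experiment's config before the run starts. The sketch below is a minimal, hypothetical experiment illustrating that pattern; the real experiment and its full config live in src/dynamic_lstm_TF.py, and only a few of the parameters used above are reproduced here.

# Minimal, hypothetical Sacred experiment illustrating the "with 'key=value'"
# override syntax used in the sbatch command above (not the repo's actual script).
from sacred import Experiment
from sacred.observers import FileStorageObserver

ex = Experiment("lstm_demo")
ex.observers.append(FileStorageObserver.create("./sacred_models/"))

@ex.config
def config():
    n_epoch = 100                      # overridden on the CLI with 'n_epoch=10'
    embedding_dim = 150                # 'embedding_dim = 300'
    n_words = 10000                    # 'n_words=10000'
    run_id = "lstm_embedding_newLRP"   # 'run_id="..."'

@ex.automain
def run(n_epoch, embedding_dim, n_words, run_id):
    print(run_id, n_epoch, embedding_dim, n_words)

# Invocation mirroring the command above:
#   python3.5 demo.py with 'n_epoch=10' 'embedding_dim=300' 'run_id="test"'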
src/IMDB_dataset/imdb_preprocess.py (113 changes: 95 additions & 18 deletions)
@@ -11,6 +11,9 @@
import re
import pickle

from keras.datasets import imdb as imdb
from keras.preprocessing import sequence

def get_input_json(filenames,w2v=None,token=None,feed=None):

if w2v==None:
@@ -148,21 +151,6 @@ def tokenize_and_remove_unk(X,n_words,dictionary):
return X_tokenized


def get_initial_embeddings_from_dictionary(n_words,embedding_dim,dictionary):

model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
w2v = dict(zip(model.index2word, model.syn0))
inv_dict = {v: k for k, v in dictionary.items()}

ebd_init = np.zeros((n_words,embedding_dim))
w2v_w = list(w2v.keys())
for i in range(n_words):
if inv_dict[i] in w2v_w:
ebd_init[i,:] = w2v[inv_dict[i]]
else:
ebd_init[i,:] = np.zeros((embedding_dim))

return ebd_init


def extract_features(filenames_train_valid,filenames_test,seed,test_size,save_test,n_words,dictionary,embedding_dim):
@@ -274,9 +262,93 @@ def extract_labels(filenames_train,filenames_valid,filenames_test):
return trainY,validY,testY


def get_initial_embeddings_from_dictionary(n_words,embedding_dim,dictionary):

model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
w2v = dict(zip(model.index2word, model.syn0))
inv_dict = {v: k for k, v in dictionary.items()}

ebd_init = np.zeros((n_words,embedding_dim))
w2v_w = list(w2v.keys())
for i in range(n_words):
if inv_dict[i] in w2v_w:
ebd_init[i,:] = w2v[inv_dict[i]]
else:
ebd_init[i,:] = np.zeros((embedding_dim))

return ebd_init
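This helper keys the pre-trained GoogleNews word2vec vectors (which are 300-dimensional) by the token dictionary and returns an n_words x embedding_dim matrix, leaving rows for out-of-vocabulary tokens at zero. In this commit the call to it stays commented out, so embedding_initMat remains None. Purely as an illustration, and not how build_network in this repo wires it in, the sketch below shows one way such a matrix could seed a Keras Embedding layer; it assumes the function above, a dictionary_w mapping, and the GoogleNews .bin file are available, and embedding_dim must be 300 to match the vectors.

# Hypothetical illustration only: seeding a Keras Embedding layer with the
# matrix returned by the helper above. Assumes Keras >= 2.0 and that
# dictionary_w and GoogleNews-vectors-negative300.bin are available locally.
from keras.layers import Embedding

ebd_init = get_initial_embeddings_from_dictionary(n_words=10000,
                                                  embedding_dim=300,
                                                  dictionary=dictionary_w)
embedding = Embedding(input_dim=10000,
                      output_dim=300,
                      weights=[ebd_init],  # one initial weight matrix for the layer
                      trainable=True)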


def get_review_from_token(rev_matrix,inv_dictionary_w,save_mode,save_dir,n_words,embedding_dim,dictionary_w):

review_num = rev_matrix.shape[0]
texts = collections.OrderedDict()

for i in range(review_num):
x = rev_matrix[i,:].tolist()
texts[i] = ' '.join(inv_dictionary_w[id] for id in x)

if save_mode=="pickle":
with open(save_dir+"test_data_text.pickle", "wb") as f:
pickle.dump(texts,f)
else:
with open(save_dir+"test_data_text.json", "w") as f:
json.dump(texts, f)

print("Exported test id:review dictionary...")
embedding_initMat = None
#embedding_initMat = get_initial_embeddings_from_dictionary(n_words,embedding_dim,dictionary_w)

print("Got initial word embeddings...")

return embedding_initMat


def get_ready_features(NUM_WORDS,INDEX_FROM,test_samples_num,save_mode,save_dir,embedding_dim,maxlen=None):

train,test = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)
train_X,train_Y = train
valid_X,valid_Y = test

trainY = to_categorical(train_Y,2)
validY = to_categorical(valid_Y,2)

def preprocess_IMDBdata(seed,filenames_train_valid,filenames_test,n_words=None,dictionary=None,embedding_dim=300,test_size=0.1,save_test=None):
dictionary_w = imdb.get_word_index()
dictionary_w = {k:(v+INDEX_FROM) for k,v in dictionary_w.items()}
dictionary_w["<PAD>"] = 0
dictionary_w["<START>"] = 1
dictionary_w["<UNK>"] = 2
inv_dictionary_w = {value:key for key,value in dictionary_w.items()}

lengthsTr = np.max([len(s) for s in train_X])
lengthsVd = np.max([len(s) for s in valid_X])
if maxlen is None:
maxlen = np.max(np.array([lengthsTr,lengthsVd]))

trainX = sequence.pad_sequences(train_X, maxlen=maxlen)
validX = sequence.pad_sequences(valid_X, maxlen=maxlen)

testX = np.zeros((test_samples_num,maxlen))
testY = np.zeros((test_samples_num,validY.shape[1]))
test_idx = np.array([random.randrange(0,validX.shape[0],1) for k in range(test_samples_num)])
testX[:,:] = validX[test_idx,:]
testY[:,:] = validY[test_idx,:]
valid_idx = [item for item in [k for k in range(validX.shape[0])] if item not in test_idx.tolist()]
validdX = np.zeros((len(valid_idx),maxlen))
validdY = np.zeros((len(valid_idx),validY.shape[1]))
validdX[:,:] = validX[valid_idx,:]
validdY[:,:] = validY[valid_idx,:]

embedding_initMat = get_review_from_token(testX,inv_dictionary_w,save_mode,save_dir,NUM_WORDS,embedding_dim,dictionary_w)


return trainX,trainY,validdX,validdY,testX,testY,embedding_initMat,dictionary_w,inv_dictionary_w
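get_ready_features pulls the Keras-bundled IMDB data, shifts the bundled word index by INDEX_FROM while reserving ids 0/1/2 for <PAD>/<START>/<UNK>, pads every review to a common length, and carves a random test subset out of the Keras test split. The snippet below is a small stand-alone sketch, with illustrative parameter values, of the same index-shifting and padding steps; it decodes one padded review back to text much as get_review_from_token does for the exported test set.

# Stand-alone sketch of the index handling above (illustrative parameters).
from keras.datasets import imdb
from keras.preprocessing import sequence

INDEX_FROM = 3
(train_X, train_Y), _ = imdb.load_data(num_words=10000, index_from=INDEX_FROM)

word_index = imdb.get_word_index()
word_index = {w: i + INDEX_FROM for w, i in word_index.items()}
word_index["<PAD>"], word_index["<START>"], word_index["<UNK>"] = 0, 1, 2
inv_index = {i: w for w, i in word_index.items()}

padded = sequence.pad_sequences(train_X[:1], maxlen=400)  # pad/truncate to 400 tokens
print(' '.join(inv_index[i] for i in padded[0]))          # decoded review, "<PAD>"-prefixed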



def preprocess_IMDBdata(n_words=None,INDEX_FROM=3,embedding_dim=300,test_samples_num=100,save_dir="/tmp/",save_mode="pickle"):

"""
trainX,validX,testX,filenames_train,filenames_valid,filenames_test,test_dict,test_dict_token,embedding_initMat = extract_features(filenames_train_valid,filenames_test,seed,test_size,save_test,n_words,dictionary,embedding_dim)
# extract_features_w2v(filenames,seed,test_size,save_test=None)
@@ -286,5 +358,10 @@ def preprocess_IMDBdata(seed,filenames_train_valid,filenames_test,n_words=None,d
testX = np.array(testX)
trainY,validY,testY = extract_labels(filenames_train,filenames_valid,filenames_test)

return trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test,maxlen,test_dict,test_dict_token,embedding_initMat
"""
trainX,trainY,validdX,validdY,testX,testY,embedding_initMat,dictionary_w,inv_dictionary_w = get_ready_features(n_words,INDEX_FROM,test_samples_num,save_mode,save_dir,embedding_dim,maxlen=None)

return trainX,trainY,validdX,validdY,testX,testY,embedding_initMat,dictionary_w,inv_dictionary_w
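For clarity, the reworked entry point no longer takes filenames or a seed; a call now looks like the sketch below. The argument values are illustrative and mostly mirror the signature's defaults; test_data_text.<save_mode> is written under save_dir.

# Illustrative call of the reworked entry point.
trainX, trainY, validdX, validdY, testX, testY, embedding_initMat, dictionary_w, inv_dictionary_w = \
    preprocess_IMDBdata(n_words=10000,       # passed to imdb.load_data as num_words
                        INDEX_FROM=3,
                        embedding_dim=300,
                        test_samples_num=100,
                        save_dir="/tmp/",
                        save_mode="pickle")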



src/dynamic_lstm_TF.py (42 changes: 15 additions & 27 deletions)
@@ -12,7 +12,7 @@
- http://ai.stanford.edu/~amaas/data/sentiment/
"""
from __future__ import division, print_function, absolute_import
from IMDB_dataset.textData import filenames
from IMDB_dataset.textData_cluster_BKP import filenames_train_valid,filenames_test
from parameter_persistence import export_serial_model,export_serial_lstm_data
from sacred.observers import FileStorageObserver
import IMDB_dataset.imdb_preprocess as imdb_pre
@@ -53,8 +53,8 @@ def config():
tensorboard_dir = "./sacred_models/tf_logs/"
run_id = "runID_newOutput"
n_words = 10000 #89527
dictionary = "/home/yannis/Desktop/tRustNN/imdb_dict.pickle" #"/home/icha/tRustNN/imdb_dict.pickle"
embedding_dim = 300
dictionary = "/home/icha/tRustNN/imdb_dict.pickle" #"/home/yannis/Desktop/tRustNN/imdb_dict.pickle"
embedding_dim = 150
ckp_path = None #"./sacred_models/ckp/"
internals = "all"
save_mode = "pickle"
@@ -149,21 +149,20 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
dictionary_w = pickle.load(handle)
inv_dictionary_w = {v: k for k, v in dictionary_w.items()}


print("Extracting features...")

#Train, valid and test sets. Have to return filenames_test as we have now shuffled them
"""
trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat = imdb_pre.preprocess_IMDBdata(seed=seed,filenames=filenames,n_words=n_words,dictionary=dictionary_w,embedding_dim=embedding_dim,test_size=test_size,save_test="save_test")

trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat = imdb_pre.preprocess_IMDBdata(seed=seed,filenames_train_valid=filenames_train_valid,filenames_test=filenames_test,n_words=n_words,dictionary=dictionary_w,embedding_dim=embedding_dim,test_size=test_size,save_test="save_test")

"""
with open('trainValidtestNew.pickle','rb') as handle:
(trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat) = pickle.load(handle)
"""
with open(save_dir+"embedding_initMat.pickle", "wb") as f:
pickle.dump(embedding_initMat,f)
"""

d = test_dict
if save_mode=="pickle":
with open(save_dir+"test_data_input.pickle", "wb") as f:
@@ -181,36 +180,27 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
json.dump(d, f)
print("Exported test data token dictionary...")

"""

"""
with open('trainValidtestNew.pickle','wb') as handle:
pickle.dump((trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat),handle)
"""

print("Training model...")

model, layer_outputs = build_network(net_arch,net_arch_layers,tensorboard_verbose,trainX.shape[1],embedding_dim,tensorboard_dir,batch_size,n_words,embedding_layer,ckp_path,embedding_initMat)

model.fit(trainX, trainY, validation_set=(validX, validY), show_metric=show_metric, batch_size=batch_size) #n_epoch=n_epoch,
"""
model.fit(trainX, trainY, validation_set=(validX, validY), n_epoch=n_epoch,show_metric=show_metric, batch_size=batch_size)

print("Evaluating trained model on test set...")
score = model.evaluate(testX,testY)
print("Accuracy on test set: %0.4f%%" % (score[0] * 100))
"""


#Save model to json format
export_serial_model(model,net_arch_layers,save_dir)

#Get model's internals for 'feed' input
"""
feed = trainX
input_files = filenames_train
export_serial_lstm_data(model,layer_outputs,feed,input_files,internals,save_dir+"train_")
feed = validX
input_files = filenames_valid
export_serial_lstm_data(model,layer_outputs,feed,input_files,internals,save_dir+"valid_")
"""

feed = testX
input_files = filenames_test_sfd

@@ -222,14 +212,12 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
del tf.get_collection_ref(tf.GraphKeys.TRAIN_OPS)[:]
model.save(save_dir+"tf_model.tfl")
print("Saved model...")

predicted_tgs = model.predict_label(feed)



LRP = lrp.lrp_full(model,embedding_layer,n_words,input_files,net_arch,net_arch_layers,save_dir+"test_data_input_token."+save_mode,save_dir+"test_data_input."+save_mode,save_dir+"test_model_internals_fc."+save_mode,save_dir+"test_model_internals_lstm_hidden."+save_mode,save_dir+"test_model_internals_lstm_states."+save_mode,save_dir+"test_model_internals_ebd."+save_mode,inv_dictionary_w,eps=0.001,delta=0.0,save_dir=save_dir,lstm_actv1=expit,lstm_actv2=np.tanh,topN=5,debug=False,predictions=predicted_tgs)




with open(save_dir+"lstm_predictions.pickle","wb") as handle:
pickle.dump(predicted_tgs,handle)
print("Finished with LRP and related data...now exiting...")
