
Commit 82983c6
Merge branch 'master' of https://gitlab.com/yhalkiad/tRustNN
Ioannis Chalkiadakis committed Aug 10, 2017
2 parents: 41d2399 + 90c928b
Showing 5 changed files with 54 additions and 31 deletions.
bokeh_vis/clustering.py (1 change: 1 addition & 0 deletions)

@@ -102,6 +102,7 @@ def apply_cluster(data,algorithm,n_clusters,review=None,neuronData=None,mode="nn
if algorithm == "DBSCAN - selected review":
reviewData_name = [s for s in list(neuronData.keys()) if review_part in s][0]
dstMat = neuronData[reviewData_name]
+ print(dstMat.shape)
db = cluster.DBSCAN(eps=0.2,metric='precomputed').fit(dstMat)
y_pred = db.labels_.astype(np.int)
elif algorithm == "DBSCAN - all reviews":
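For context, a minimal, self-contained sketch (not part of the commit) of the pattern the hunk above relies on: scikit-learn's DBSCAN clustering a precomputed pairwise-distance matrix. The matrix values are toy stand-ins for neuronData[reviewData_name].

import numpy as np
from sklearn import cluster

# Toy square, symmetric matrix of pairwise neuron distances;
# stands in for the dstMat whose shape the new print() reports.
dstMat = np.array([[0.0, 0.1, 0.9],
                   [0.1, 0.0, 0.8],
                   [0.9, 0.8, 0.0]])

# metric='precomputed' tells DBSCAN to read the input as distances,
# not feature vectors; eps=0.2 mirrors the committed call.
db = cluster.DBSCAN(eps=0.2, metric='precomputed').fit(dstMat)
y_pred = db.labels_.astype(int)  # cluster id per neuron; -1 marks noise
print(y_pred)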
bokeh_vis/main.py (27 changes: 18 additions & 9 deletions)

@@ -2,7 +2,7 @@
from bokeh.plotting import figure, show, output_file
from bokeh.io import curdoc
from bokeh.layouts import widgetbox , layout
- from bokeh.models.widgets import Select, Slider
+ from bokeh.models.widgets import Select, Slider, Button
import dim_reduction
import numpy as np
import clustering
@@ -22,6 +22,11 @@
import heatmap as hmap
from lrp import get_lrp_timedata


+ def button_callback():
+     text_src = re.sub('/home/icha/','/home/yannis/Desktop/tRustNN/',rawInput_selections.value)
+     text_banner.text = open(text_src,"r").read()

def get_wc_colourGroups(rawInput_source):

words = rawInput_source.data['w']
@@ -107,6 +112,11 @@ def update_source(attrname, old, new):

x = data[lstm_layer_name][gate_value]

+ #update raw input
+ text_src = re.sub('/home/icha/','/home/yannis/Desktop/tRustNN/',rawInput_selections.value)
+ text_banner.text = open(text_src,"r").read()
+ label_banner.text = "Network decision : POSITIVE" if predicted_tgs[list(keys_raw).index(rawInput_selections.value)][1] == 1 else "Network decision : NEGATIVE"

#update dimension reduction source
algorithm = projection_selections.value
knn = 5
@@ -140,7 +150,7 @@ def update_source(attrname, old, new):
text_set.text = "KMeans: Clusters neurons based on their gate values after training."
elif algorithm_cl_neurons=="DBSCAN - selected review":
text_set.text = "DBSCAN - selected review: Clusters neurons based on how related their most activating words are. List of activating words generated from seleceted review."
-     neuronData = similarityMatrix_PerReview
+ neuronData = similarityMatrix_PerReview
cluster_labels, colors, _ = clustering.apply_cluster(x,algorithm_cl_neurons,n_clusters,review=rawInput_selections.value,neuronData=neuronData,mode="nn")


@@ -152,10 +162,6 @@ def update_source(attrname, old, new):
project_plot.title.text = algorithm
"""

- #update raw input
- text_src = re.sub('/home/icha/','/home/yannis/Desktop/tRustNN/',rawInput_selections.value)
- text_banner.text = open(text_src,"r").read()
- label_banner.text = "Network decision : POSITIVE" if predicted_tgs[list(keys_raw).index(rawInput_selections.value)][1] == 1 else "Network decision : NEGATIVE"

text_data,text_words = get_rawText_data(rawInput_selections.value,keys_raw,data_raw) ###LOADS EMBEDDINGS HERE
w2v_labels, w2v_colors, _ = clustering.apply_cluster(text_data,"KMeans - selected gate",n_clusters,mode="wc")
@@ -164,7 +170,6 @@ def update_source(attrname, old, new):
if gate_value=="input_gate":
wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="in",text=text_banner.text)
elif gate_value=="forget_gate":
- print(LRP)
wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="forget")
elif gate_value=="output_gate":
wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="out")
@@ -196,6 +201,7 @@ def update_source(attrname, old, new):
predicted_tgs = pickle.load(handle)
with open(load_dir+"exploratoryDataFull.pickle", 'rb') as f:
excitingWords_fullSet,similarityMatrix_AllReviews,similarityMatrix_PerReview,neuron_types,totalLRP,LRP = pickle.load(f)


#neuronExcitingWords_AllReviews = list((excitingWords_fullSet.values()))
_,lstm_hidden = data_format.get_data(load_dir+"test_model_internals_lstm_hidden.pickle")
@@ -240,6 +246,8 @@ def update_source(attrname, old, new):
text_banner = Div(text=open(text_src,"r").read(), width=1300, height=100)
label_banner = Paragraph(text="Network decision : POSITIVE" if predicted_tgs[list(keys_raw).index(rawInput_selections.value)][1] == 1 else "Network decision : NEGATIVE", width=200, height=30)

+ button = Button(label="Reset text")
+ button.on_click(button_callback)

#WordCloud
color_dict = get_wc_colourGroups(rawInput_source) #Colors based on similarity in embedding space
@@ -308,7 +316,8 @@ def update_source(attrname, old, new):


lrp_timedata = get_lrp_timedata(LRP)
- lrptime_source = ColumnDataSource(dict(lrptime = lrp_timedata,time=[i for i in range(len(lrp_timedata))]))
+ time = [i for i in range(len(lrp_timedata))]
+ lrptime_source = ColumnDataSource(dict(lrptime = lrp_timedata,time=time))
lrp_plot = figure(title="Total normalized LRP per timestep",plot_width=300, plot_height=50)
lrp_plot.scatter('time','lrptime', marker='circle', size=5, alpha=0.5, source=lrptime_source)
lrp_plot.xaxis.axis_label = 'Time'
@@ -322,7 +331,7 @@ def update_source(attrname, old, new):
attr.on_change('value', update_source)
rawInput_selections.on_change('value', update_source)

- gp = layout([project_plot, wc_plot, widgetbox(gate_selections,projection_selections,rawInput_selections,clustering_selections[0],clustering_selections[1],text_0,text_set,label_banner)],
+ gp = layout([project_plot, wc_plot, widgetbox(rawInput_selections,gate_selections,projection_selections,clustering_selections[0],clustering_selections[1],text_0,text_set,label_banner,button)],
[lrp_plot],
[text_banner],
responsive=True)
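For readers unfamiliar with Bokeh server callbacks, here is a minimal sketch of the reset-button pattern this commit adds. The widget wiring follows the diff; the option list and file paths are hypothetical placeholders. Run with `bokeh serve --show app.py`.

import re
from bokeh.io import curdoc
from bokeh.layouts import widgetbox, layout
from bokeh.models.widgets import Select, Button, Div

# Hypothetical review selector; values mimic the stored /home/icha/ paths.
rawInput_selections = Select(title="Review", value="/home/icha/review_0.txt",
                             options=["/home/icha/review_0.txt"])
text_banner = Div(text="(review text appears here)", width=600)

def button_callback():
    # Remap the stored path to the local checkout, then reload the raw
    # review text into the banner.
    text_src = re.sub('/home/icha/', '/home/yannis/Desktop/tRustNN/',
                      rawInput_selections.value)
    text_banner.text = open(text_src, "r").read()

button = Button(label="Reset text")
button.on_click(button_callback)  # plain buttons use on_click, not on_change

curdoc().add_root(layout([widgetbox(rawInput_selections, button)], [text_banner]))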
src/dynamic_lstm_TF.py (16 changes: 8 additions & 8 deletions)

@@ -12,7 +12,7 @@
- http://ai.stanford.edu/~amaas/data/sentiment/
"""
from __future__ import division, print_function, absolute_import
- from IMDB_dataset.textData_cluster import filenames
+ from IMDB_dataset.textData import filenames
from parameter_persistence import export_serial_model,export_serial_lstm_data
from sacred.observers import FileStorageObserver
import IMDB_dataset.imdb_preprocess as imdb_pre
@@ -53,12 +53,12 @@ def config():
tensorboard_dir = "./sacred_models/tf_logs/"
run_id = "runID_newOutput"
n_words = 10000 #89527
dictionary = "/home/icha/tRustNN/imdb_dict.pickle"
dictionary = "/home/yannis/Desktop/tRustNN/imdb_dict.pickle" #"/home/icha/tRustNN/imdb_dict.pickle"
embedding_dim = 300
ckp_path = None #"./sacred_models/ckp/"
internals = "all"
save_mode = "pickle"
- n_epoch = 2
+ n_epoch = 10
test_size = 0.05 # -1 for whole test set
embedding_layer = 1

@@ -153,18 +153,17 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
print("Extracting features...")

#Train, valid and test sets. Have to return filenames_test as we have now shuffled them

"""
trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat = imdb_pre.preprocess_IMDBdata(seed=seed,filenames=filenames,n_words=n_words,dictionary=dictionary_w,embedding_dim=embedding_dim,test_size=test_size,save_test="save_test")
"""
with open('trainValidtestNew.pickle','rb') as handle:
(trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat) = pickle.load(handle)

"""
"""
with open(save_dir+"embedding_initMat.pickle", "wb") as f:
pickle.dump(embedding_initMat,f)
"""
d = test_dict
if save_mode=="pickle":
with open(save_dir+"test_data_input.pickle", "wb") as f:
@@ -182,6 +181,7 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
json.dump(d, f)
print("Exported test data token dictionary...")
"""
"""
with open('trainValidtestNew.pickle','wb') as handle:
pickle.dump((trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat),handle)
@@ -191,11 +191,11 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
model, layer_outputs = build_network(net_arch,net_arch_layers,tensorboard_verbose,trainX.shape[1],embedding_dim,tensorboard_dir,batch_size,n_words,embedding_layer,ckp_path,embedding_initMat)

model.fit(trainX, trainY, validation_set=(validX, validY), show_metric=show_metric, batch_size=batch_size) #n_epoch=n_epoch,

"""
print("Evaluating trained model on test set...")
score = model.evaluate(testX,testY)
print("Accuracy on test set: %0.4f%%" % (score[0] * 100))

"""

#Save model to json format
export_serial_model(model,net_arch_layers,save_dir)
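The train() changes above swap the expensive IMDB preprocessing for a cached pickle ('trainValidtestNew.pickle'). A hedged sketch of that cache-or-compute pattern, with a hypothetical make_splits() standing in for imdb_pre.preprocess_IMDBdata:

import os
import pickle

CACHE = 'trainValidtestNew.pickle'

def make_splits():
    # Stand-in for imdb_pre.preprocess_IMDBdata(...); returns whatever
    # tuple of train/valid/test arrays the training code expects.
    return ([1, 2], [3], [4], [0, 1], [1], [0])

if os.path.exists(CACHE):
    # Later runs: reload the cached splits instead of recomputing them.
    with open(CACHE, 'rb') as handle:
        splits = pickle.load(handle)
else:
    # First run: compute once and persist to disk.
    splits = make_splits()
    with open(CACHE, 'wb') as handle:
        pickle.dump(splits, handle)

trainX, validX, testX, trainY, validY, testY = splits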
src/lrp.py (37 changes: 25 additions & 12 deletions)

@@ -28,7 +28,7 @@ def get_lrp_timedata(LRP):
for k in kkeys:
if lens[j]-1-i>=0:
normalize_factor = normalize_factor + 1
- lrp = list(LRP[k]['scores'])[lens[j]-1-i]
+ lrp = abs(list(LRP[k]['scores'])[lens[j]-1-i]) #abs, since we want the total LRP, either positive or negative
lrp_t = lrp_t + lrp

j = j + 1
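A hedged reading of what get_lrp_timedata computes after this fix, as a standalone toy: timesteps are counted from the end of each review, and each timestep's total |LRP| is normalized by the number of reviews long enough to reach it.

def lrp_per_timestep(score_lists):
    max_len = max(len(s) for s in score_lists)
    totals = []
    for i in range(max_len):
        lrp_t, normalize_factor = 0.0, 0
        for scores in score_lists:
            if len(scores) - 1 - i >= 0:
                normalize_factor += 1
                # abs: total relevance, whether it pushed positive or negative
                lrp_t += abs(scores[len(scores) - 1 - i])
        totals.append(lrp_t / normalize_factor)
    return totals

print(lrp_per_timestep([[0.5, -0.2, 0.1], [-0.4, 0.3]]))  # -> [0.2, 0.3, 0.5]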
@@ -38,9 +38,17 @@

def get_PosNegNeurons_dict(i,predictions,lrp_neurons):
# Get neurons that trigger exclusively for positive or negative reviews according to the network. Assign them to neutral if activate for both types of reviews.

reviewLRP_data = {"pos":[],"neg":[],"neutral":[]}

+ if not predictions[i].any():
+     pred = -1
+ elif predictions[i,0]==1:
+     pred = 0
+ elif predictions[i,0]==0:
+     pred = 1


if pred==0:
for j in lrp_neurons:
if reviewLRP_data["neg"]==[]:
reviewLRP_data["neg"] = [j]
@@ -50,7 +58,7 @@ def get_PosNegNeurons_dict(i,predictions,lrp_neurons):
reviewLRP_data["neutral"].append(j)
elif j not in reviewLRP_data["neg"]:
reviewLRP_data["neg"].append(j)
- elif predictions[i]==1:
+ elif pred==1:
for j in lrp_neurons:
if reviewLRP_data["pos"]==[]:
reviewLRP_data["pos"] = [j]
@@ -60,19 +68,18 @@ def get_PosNegNeurons_dict(i,predictions,lrp_neurons):
reviewLRP_data["neutral"].append(j)
elif j not in reviewLRP_data["pos"]:
reviewLRP_data["pos"].append(j)

return reviewLRP_data

- def get_NeuronType(reviewLRP_data):
+ def get_NeuronType(reviewLRP_data,neuron_num):
# Assign a label to each neuron based on whether it activates on positive-,negative-only or both types of reviews.

- neuron_num = len(reviewLRP_data["pos"])+len(reviewLRP_data["neg"])+len(reviewLRP_data["neutral"])
posNeg_predictionLabel = np.zeros((neuron_num,))

for i in range(neuron_num):
if i in reviewLRP_data["pos"]:
posNeg_predictionLabel[i] = 1
elif i in reviewLRP_data["neutral"]:
elif i in reviewLRP_data["neg"]:
posNeg_predictionLabel[i] = 2

return posNeg_predictionLabel
@@ -187,7 +194,13 @@ def get_NeuronExcitingWords_dict(lstm_hidden_json,kkeys,k,save_dir,topN=5):
d[kkeys[i]] = ord_cells[-(topN+1):-1].tolist()

NtoW = invert_dict_nonunique(d,topN)

+ NtoW_keys = set(map(int,NtoW.keys()))  # a set: a bare map() iterator would be exhausted after the first membership test
+ for i in range(kdata.shape[1]):
+     if i not in NtoW_keys:
+         NtoW[str(i)] = []
+ 
+ print(sorted(map(int,NtoW.keys())))  # debug: every neuron index should now appear

with open(save_dir+re.sub('/', '_', k[-18:-4])+"_ActCells.json", 'w') as f:
json.dump(NtoW, f)
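The loop added above guarantees the inverted neuron-to-words dict has a key for every neuron, not just those with top-activating words. A toy version of the same fill-in step:

def fill_missing_neurons(NtoW, n_neurons):
    present = set(map(int, NtoW.keys()))  # a set survives repeated membership tests
    for i in range(n_neurons):
        if i not in present:
            NtoW[str(i)] = []  # neuron i excites no top-N words
    return NtoW

NtoW = {"0": ["good"], "2": ["bad", "awful"]}
print(fill_missing_neurons(NtoW, 4))
# -> {'0': ['good'], '2': ['bad', 'awful'], '1': [], '3': []}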

@@ -400,16 +413,16 @@ def lrp_full(model,embedding_layer,n_words,input_filename,net_arch,net_arch_laye
lrp_input,lrp_fc,lstm_lrp_x,(lstm_lrp_h,lstm_lrp_g,lstm_lrp_c) = lrp_single_input(model,embedding_layer,n_words,net_arch_layers,k,kdata,data_token,eps,delta,fc_out_json,lstm_hidden_json,lstm_cell_json,ebd_json,dictionary,target_class=1,T=T,classes=2,lstm_actv1=expit,lstm_actv2=np.tanh,debug=debug)

lrp_neurons = get_topLRP_cells(lrp_fc,k,save_dir,topN)
-     reviewLRP_data = get_PosNegNeurons_dict(i,predictions,lrp_neurons)
+ reviewLRP_data = get_PosNegNeurons_dict(i,predictions,lrp_neurons)
review_filename, _ = get_NeuronExcitingWords_dict(lstm_hidden_json,kkeys,k,save_dir,topN)
dstMat = get_DstMatrix_singleReview(save_dir+review_filename,test_data_json,k)
neuronWords_jsons.append(review_filename)
- similarityMatrix_PerReview[review_filename] = dstMat
+ similarityMatrix_PerReview[k] = dstMat

LRP[k] = lrp_input # contains LRP of input words
totalLRP[k] = collections.OrderedDict(words=kkeys,lrp=lrp_fc) # contains LRP halfway through network, i.e. LRP of LSTM neurons

- neuron_types = get_NeuronType(reviewLRP_data)
+ neuron_types = get_NeuronType(reviewLRP_data,lrp_fc.shape[0])
excitingWords_fullSet = get_MostExcitingWords_allReviews(save_dir,neuronWords_jsons,topN=5)
similarityMatrix_AllReviews = get_NeuronSimilarity_AllReviews(excitingWords_fullSet)
with open(save_dir+"exploratoryDataFull.pickle", 'wb') as f:
src/wcloud_standalone.py (4 changes: 2 additions & 2 deletions)

@@ -97,8 +97,8 @@ def get_wcloud(LRP,k,save_dir,color_dict=None,gate="out",text=None):
wc = WordCloud(
background_color="white",
max_words=2000,
- width = 400,
- height = 400,
+ width = 500,
+ height = 550,
stopwords=stopwords.words("english")
)
wc.generate(text)
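For reference, a minimal sketch of the resized word cloud (toy review text; requires the wordcloud and nltk packages, with nltk's stopwords corpus downloaded):

from wordcloud import WordCloud
from nltk.corpus import stopwords

text = "great plot great acting a genuinely moving film"
wc = WordCloud(
    background_color="white",
    max_words=2000,
    width=500,    # was 400 before this commit
    height=550,   # was 400 before this commit
    stopwords=stopwords.words("english")
)
wc.generate(text)
wc.to_file("wcloud.png")  # render to disk for a quick look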
