
Commit 82983c6
Merge branch 'master' of https://gitlab.com/yhalkiad/tRustNN
Ioannis Chalkiadakis committed Aug 10, 2017
2 parents: 41d2399 + 90c928b
Showing 5 changed files with 54 additions and 31 deletions.
bokeh_vis/clustering.py (1 change: 1 addition & 0 deletions)

@@ -102,6 +102,7 @@ def apply_cluster(data,algorithm,n_clusters,review=None,neuronData=None,mode="nn
if algorithm == "DBSCAN - selected review":
reviewData_name = [s for s in list(neuronData.keys()) if review_part in s][0]
dstMat = neuronData[reviewData_name]
+ print(dstMat.shape)
db = cluster.DBSCAN(eps=0.2,metric='precomputed').fit(dstMat)
y_pred = db.labels_.astype(np.int)
elif algorithm == "DBSCAN - all reviews":
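For context, a minimal, self-contained sketch (not part of the commit) of the pattern the hunk above relies on: scikit-learn's DBSCAN clustering a precomputed pairwise-distance matrix. The matrix values are toy stand-ins for neuronData[reviewData_name].

import numpy as np
from sklearn import cluster

# Toy square, symmetric matrix of pairwise neuron distances;
# stands in for the dstMat whose shape the new print() reports.
dstMat = np.array([[0.0, 0.1, 0.9],
                   [0.1, 0.0, 0.8],
                   [0.9, 0.8, 0.0]])

# metric='precomputed' tells DBSCAN to read the input as distances,
# not feature vectors; eps=0.2 mirrors the committed call.
db = cluster.DBSCAN(eps=0.2, metric='precomputed').fit(dstMat)
y_pred = db.labels_.astype(int)  # cluster id per neuron; -1 marks noise
print(y_pred)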
bokeh_vis/main.py (27 changes: 18 additions & 9 deletions)

@@ -2,7 +2,7 @@
from bokeh.plotting import figure, show, output_file
from bokeh.io import curdoc
from bokeh.layouts import widgetbox , layout
- from bokeh.models.widgets import Select, Slider
+ from bokeh.models.widgets import Select, Slider, Button
import dim_reduction
import numpy as np
import clustering
@@ -22,6 +22,11 @@
import heatmap as hmap
from lrp import get_lrp_timedata


+ def button_callback():
+     text_src = re.sub('/home/icha/','/home/yannis/Desktop/tRustNN/',rawInput_selections.value)
+     text_banner.text = open(text_src,"r").read()

def get_wc_colourGroups(rawInput_source):

words = rawInput_source.data['w']
@@ -107,6 +112,11 @@ def update_source(attrname, old, new):

x = data[lstm_layer_name][gate_value]

+ #update raw input
+ text_src = re.sub('/home/icha/','/home/yannis/Desktop/tRustNN/',rawInput_selections.value)
+ text_banner.text = open(text_src,"r").read()
+ label_banner.text = "Network decision : POSITIVE" if predicted_tgs[list(keys_raw).index(rawInput_selections.value)][1] == 1 else "Network decision : NEGATIVE"

#update dimension reduction source
algorithm = projection_selections.value
knn = 5
@@ -140,7 +150,7 @@ def update_source(attrname, old, new):
text_set.text = "KMeans: Clusters neurons based on their gate values after training."
elif algorithm_cl_neurons=="DBSCAN - selected review":
text_set.text = "DBSCAN - selected review: Clusters neurons based on how related their most activating words are. List of activating words generated from seleceted review."
-     neuronData = similarityMatrix_PerReview
+ neuronData = similarityMatrix_PerReview
cluster_labels, colors, _ = clustering.apply_cluster(x,algorithm_cl_neurons,n_clusters,review=rawInput_selections.value,neuronData=neuronData,mode="nn")


@@ -152,10 +162,6 @@ def update_source(attrname, old, new):
project_plot.title.text = algorithm
"""

- #update raw input
- text_src = re.sub('/home/icha/','/home/yannis/Desktop/tRustNN/',rawInput_selections.value)
- text_banner.text = open(text_src,"r").read()
- label_banner.text = "Network decision : POSITIVE" if predicted_tgs[list(keys_raw).index(rawInput_selections.value)][1] == 1 else "Network decision : NEGATIVE"

text_data,text_words = get_rawText_data(rawInput_selections.value,keys_raw,data_raw) ###LOADS EMBEDDINGS HERE
w2v_labels, w2v_colors, _ = clustering.apply_cluster(text_data,"KMeans - selected gate",n_clusters,mode="wc")
@@ -164,7 +170,6 @@ def update_source(attrname, old, new):
if gate_value=="input_gate":
wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="in",text=text_banner.text)
elif gate_value=="forget_gate":
- print(LRP)
wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="forget")
elif gate_value=="output_gate":
wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="out")
@@ -196,6 +201,7 @@ def update_source(attrname, old, new):
predicted_tgs = pickle.load(handle)
with open(load_dir+"exploratoryDataFull.pickle", 'rb') as f:
excitingWords_fullSet,similarityMatrix_AllReviews,similarityMatrix_PerReview,neuron_types,totalLRP,LRP = pickle.load(f)


#neuronExcitingWords_AllReviews = list((excitingWords_fullSet.values()))
_,lstm_hidden = data_format.get_data(load_dir+"test_model_internals_lstm_hidden.pickle")
@@ -240,6 +246,8 @@ def update_source(attrname, old, new):
text_banner = Div(text=open(text_src,"r").read(), width=1300, height=100)
label_banner = Paragraph(text="Network decision : POSITIVE" if predicted_tgs[list(keys_raw).index(rawInput_selections.value)][1] == 1 else "Network decision : NEGATIVE", width=200, height=30)

+ button = Button(label="Reset text")
+ button.on_click(button_callback)

#WordCloud
color_dict = get_wc_colourGroups(rawInput_source) #Colors based on similarity in embedding space
@@ -308,7 +316,8 @@ def update_source(attrname, old, new):


lrp_timedata = get_lrp_timedata(LRP)
- lrptime_source = ColumnDataSource(dict(lrptime = lrp_timedata,time=[i for i in range(len(lrp_timedata))]))
+ time = [i for i in range(len(lrp_timedata))]
+ lrptime_source = ColumnDataSource(dict(lrptime = lrp_timedata,time=time))
lrp_plot = figure(title="Total normalized LRP per timestep",plot_width=300, plot_height=50)
lrp_plot.scatter('time','lrptime', marker='circle', size=5, alpha=0.5, source=lrptime_source)
lrp_plot.xaxis.axis_label = 'Time'
@@ -322,7 +331,7 @@ def update_source(attrname, old, new):
attr.on_change('value', update_source)
rawInput_selections.on_change('value', update_source)

- gp = layout([project_plot, wc_plot, widgetbox(gate_selections,projection_selections,rawInput_selections,clustering_selections[0],clustering_selections[1],text_0,text_set,label_banner)],
+ gp = layout([project_plot, wc_plot, widgetbox(rawInput_selections,gate_selections,projection_selections,clustering_selections[0],clustering_selections[1],text_0,text_set,label_banner,button)],
[lrp_plot],
[text_banner],
responsive=True)
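For readers unfamiliar with Bokeh server callbacks, here is a minimal sketch of the reset-button pattern this commit adds. The widget wiring follows the diff; the option list and file paths are hypothetical placeholders. Run with `bokeh serve --show app.py`.

import re
from bokeh.io import curdoc
from bokeh.layouts import widgetbox, layout
from bokeh.models.widgets import Select, Button, Div

# Hypothetical review selector; values mimic the stored /home/icha/ paths.
rawInput_selections = Select(title="Review", value="/home/icha/review_0.txt",
                             options=["/home/icha/review_0.txt"])
text_banner = Div(text="(review text appears here)", width=600)

def button_callback():
    # Remap the stored path to the local checkout, then reload the raw
    # review text into the banner.
    text_src = re.sub('/home/icha/', '/home/yannis/Desktop/tRustNN/',
                      rawInput_selections.value)
    text_banner.text = open(text_src, "r").read()

button = Button(label="Reset text")
button.on_click(button_callback)  # plain buttons use on_click, not on_change

curdoc().add_root(layout([widgetbox(rawInput_selections, button)], [text_banner]))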
src/dynamic_lstm_TF.py (16 changes: 8 additions & 8 deletions)

@@ -12,7 +12,7 @@
- http://ai.stanford.edu/~amaas/data/sentiment/
"""
from __future__ import division, print_function, absolute_import
- from IMDB_dataset.textData_cluster import filenames
+ from IMDB_dataset.textData import filenames
from parameter_persistence import export_serial_model,export_serial_lstm_data
from sacred.observers import FileStorageObserver
import IMDB_dataset.imdb_preprocess as imdb_pre
@@ -53,12 +53,12 @@ def config():
tensorboard_dir = "./sacred_models/tf_logs/"
run_id = "runID_newOutput"
n_words = 10000 #89527
dictionary = "/home/icha/tRustNN/imdb_dict.pickle"
dictionary = "/home/yannis/Desktop/tRustNN/imdb_dict.pickle" #"/home/icha/tRustNN/imdb_dict.pickle"
embedding_dim = 300
ckp_path = None #"./sacred_models/ckp/"
internals = "all"
save_mode = "pickle"
- n_epoch = 2
+ n_epoch = 10
test_size = 0.05 # -1 for whole test set
embedding_layer = 1

@@ -153,18 +153,17 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
print("Extracting features...")

#Train, valid and test sets. Have to return filenames_test as we have now shuffled them

"""
trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat = imdb_pre.preprocess_IMDBdata(seed=seed,filenames=filenames,n_words=n_words,dictionary=dictionary_w,embedding_dim=embedding_dim,test_size=test_size,save_test="save_test")
"""
with open('trainValidtestNew.pickle','rb') as handle:
(trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat) = pickle.load(handle)

"""
"""
with open(save_dir+"embedding_initMat.pickle", "wb") as f:
pickle.dump(embedding_initMat,f)
"""
d = test_dict
if save_mode=="pickle":
with open(save_dir+"test_data_input.pickle", "wb") as f:
@@ -182,6 +181,7 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
json.dump(d, f)
print("Exported test data token dictionary...")
"""
"""
with open('trainValidtestNew.pickle','wb') as handle:
pickle.dump((trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat),handle)
@@ -191,11 +191,11 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
model, layer_outputs = build_network(net_arch,net_arch_layers,tensorboard_verbose,trainX.shape[1],embedding_dim,tensorboard_dir,batch_size,n_words,embedding_layer,ckp_path,embedding_initMat)

model.fit(trainX, trainY, validation_set=(validX, validY), show_metric=show_metric, batch_size=batch_size) #n_epoch=n_epoch,

"""
print("Evaluating trained model on test set...")
score = model.evaluate(testX,testY)
print("Accuracy on test set: %0.4f%%" % (score[0] * 100))

"""

#Save model to json format
export_serial_model(model,net_arch_layers,save_dir)
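The train() changes above swap the expensive IMDB preprocessing for a cached pickle ('trainValidtestNew.pickle'). A hedged sketch of that cache-or-compute pattern, with a hypothetical make_splits() standing in for imdb_pre.preprocess_IMDBdata:

import os
import pickle

CACHE = 'trainValidtestNew.pickle'

def make_splits():
    # Stand-in for imdb_pre.preprocess_IMDBdata(...); returns whatever
    # tuple of train/valid/test arrays the training code expects.
    return ([1, 2], [3], [4], [0, 1], [1], [0])

if os.path.exists(CACHE):
    # Later runs: reload the cached splits instead of recomputing them.
    with open(CACHE, 'rb') as handle:
        splits = pickle.load(handle)
else:
    # First run: compute once and persist to disk.
    splits = make_splits()
    with open(CACHE, 'wb') as handle:
        pickle.dump(splits, handle)

trainX, validX, testX, trainY, validY, testY = splits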
src/lrp.py (37 changes: 25 additions & 12 deletions)

@@ -28,7 +28,7 @@ def get_lrp_timedata(LRP):
for k in kkeys:
if lens[j]-1-i>=0:
normalize_factor = normalize_factor + 1
- lrp = list(LRP[k]['scores'])[lens[j]-1-i]
+ lrp = abs(list(LRP[k]['scores'])[lens[j]-1-i]) #abs, since we want the total LRP, either positive or negative
lrp_t = lrp_t + lrp

j = j + 1
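A hedged reading of what get_lrp_timedata computes after this fix, as a standalone toy: timesteps are counted from the end of each review, and each timestep's total |LRP| is normalized by the number of reviews long enough to reach it.

def lrp_per_timestep(score_lists):
    max_len = max(len(s) for s in score_lists)
    totals = []
    for i in range(max_len):
        lrp_t, normalize_factor = 0.0, 0
        for scores in score_lists:
            if len(scores) - 1 - i >= 0:
                normalize_factor += 1
                # abs: total relevance, whether it pushed positive or negative
                lrp_t += abs(scores[len(scores) - 1 - i])
        totals.append(lrp_t / normalize_factor)
    return totals

print(lrp_per_timestep([[0.5, -0.2, 0.1], [-0.4, 0.3]]))  # -> [0.2, 0.3, 0.5]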
@@ -38,9 +38,17 @@

def get_PosNegNeurons_dict(i,predictions,lrp_neurons):
# Get neurons that trigger exclusively for positive or negative reviews according to the network. Assign them to neutral if activate for both types of reviews.

reviewLRP_data = {"pos":[],"neg":[],"neutral":[]}

+ if not predictions[i].any():
+     pred = -1
+ elif predictions[i,0]==1:
+     pred = 0
+ elif predictions[i,0]==0:
+     pred = 1


if pred==0:
for j in lrp_neurons:
if reviewLRP_data["neg"]==[]:
reviewLRP_data["neg"] = [j]
@@ -50,7 +58,7 @@ def get_PosNegNeurons_dict(i,predictions,lrp_neurons):
reviewLRP_data["neutral"].append(j)
elif j not in reviewLRP_data["neg"]:
reviewLRP_data["neg"].append(j)
- elif predictions[i]==1:
+ elif pred==1:
for j in lrp_neurons:
if reviewLRP_data["pos"]==[]:
reviewLRP_data["pos"] = [j]
@@ -60,19 +68,18 @@ def get_PosNegNeurons_dict(i,predictions,lrp_neurons):
reviewLRP_data["neutral"].append(j)
elif j not in reviewLRP_data["pos"]:
reviewLRP_data["pos"].append(j)

return reviewLRP_data

- def get_NeuronType(reviewLRP_data):
+ def get_NeuronType(reviewLRP_data,neuron_num):
# Assign a label to each neuron based on whether it activates on positive-,negative-only or both types of reviews.

- neuron_num = len(reviewLRP_data["pos"])+len(reviewLRP_data["neg"])+len(reviewLRP_data["neutral"])
posNeg_predictionLabel = np.zeros((neuron_num,))

for i in range(neuron_num):
if i in reviewLRP_data["pos"]:
posNeg_predictionLabel[i] = 1
elif i in reviewLRP_data["neutral"]:
elif i in reviewLRP_data["neg"]:
posNeg_predictionLabel[i] = 2

return posNeg_predictionLabel
@@ -187,7 +194,13 @@ def get_NeuronExcitingWords_dict(lstm_hidden_json,kkeys,k,save_dir,topN=5):
d[kkeys[i]] = ord_cells[-(topN+1):-1].tolist()

NtoW = invert_dict_nonunique(d,topN)

+ NtoW_keys = set(map(int,NtoW.keys()))  # a set: a bare map() iterator would be exhausted after the first membership test
+ for i in range(kdata.shape[1]):
+     if i not in NtoW_keys:
+         NtoW[str(i)] = []
+ 
+ print(sorted(map(int,NtoW.keys())))  # debug: every neuron index should now appear

with open(save_dir+re.sub('/', '_', k[-18:-4])+"_ActCells.json", 'w') as f:
json.dump(NtoW, f)
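The loop added above guarantees the inverted neuron-to-words dict has a key for every neuron, not just those with top-activating words. A toy version of the same fill-in step:

def fill_missing_neurons(NtoW, n_neurons):
    present = set(map(int, NtoW.keys()))  # a set survives repeated membership tests
    for i in range(n_neurons):
        if i not in present:
            NtoW[str(i)] = []  # neuron i excites no top-N words
    return NtoW

NtoW = {"0": ["good"], "2": ["bad", "awful"]}
print(fill_missing_neurons(NtoW, 4))
# -> {'0': ['good'], '2': ['bad', 'awful'], '1': [], '3': []}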

@@ -400,16 +413,16 @@ def lrp_full(model,embedding_layer,n_words,input_filename,net_arch,net_arch_laye
lrp_input,lrp_fc,lstm_lrp_x,(lstm_lrp_h,lstm_lrp_g,lstm_lrp_c) = lrp_single_input(model,embedding_layer,n_words,net_arch_layers,k,kdata,data_token,eps,delta,fc_out_json,lstm_hidden_json,lstm_cell_json,ebd_json,dictionary,target_class=1,T=T,classes=2,lstm_actv1=expit,lstm_actv2=np.tanh,debug=debug)

lrp_neurons = get_topLRP_cells(lrp_fc,k,save_dir,topN)
-     reviewLRP_data = get_PosNegNeurons_dict(i,predictions,lrp_neurons)
+ reviewLRP_data = get_PosNegNeurons_dict(i,predictions,lrp_neurons)
review_filename, _ = get_NeuronExcitingWords_dict(lstm_hidden_json,kkeys,k,save_dir,topN)
dstMat = get_DstMatrix_singleReview(save_dir+review_filename,test_data_json,k)
neuronWords_jsons.append(review_filename)
- similarityMatrix_PerReview[review_filename] = dstMat
+ similarityMatrix_PerReview[k] = dstMat

LRP[k] = lrp_input # contains LRP of input words
totalLRP[k] = collections.OrderedDict(words=kkeys,lrp=lrp_fc) # contains LRP halfway through network, i.e. LRP of LSTM neurons

- neuron_types = get_NeuronType(reviewLRP_data)
+ neuron_types = get_NeuronType(reviewLRP_data,lrp_fc.shape[0])
excitingWords_fullSet = get_MostExcitingWords_allReviews(save_dir,neuronWords_jsons,topN=5)
similarityMatrix_AllReviews = get_NeuronSimilarity_AllReviews(excitingWords_fullSet)
with open(save_dir+"exploratoryDataFull.pickle", 'wb') as f:
src/wcloud_standalone.py (4 changes: 2 additions & 2 deletions)

@@ -97,8 +97,8 @@ def get_wcloud(LRP,k,save_dir,color_dict=None,gate="out",text=None):
wc = WordCloud(
background_color="white",
max_words=2000,
- width = 400,
- height = 400,
+ width = 500,
+ height = 550,
stopwords=stopwords.words("english")
)
wc.generate(text)
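For reference, a minimal sketch of the resized word cloud (toy review text; requires the wordcloud and nltk packages, with nltk's stopwords corpus downloaded):

from wordcloud import WordCloud
from nltk.corpus import stopwords

text = "great plot great acting a genuinely moving film"
wc = WordCloud(
    background_color="white",
    max_words=2000,
    width=500,    # was 400 before this commit
    height=550,   # was 400 before this commit
    stopwords=stopwords.words("english")
)
wc.generate(text)
wc.to_file("wcloud.png")  # render to disk for a quick look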
