Commit f5c504e

Merge branch 'master' of https://gitlab.com/yhalkiad/tRustNN

Ioannis Chalkiadakis committed Aug 7, 2017
2 parents d5376b3 + 6821963
Showing 6 changed files with 58 additions and 35 deletions.
2 changes: 1 addition & 1 deletion bokeh_vis/clustering.py
@@ -32,7 +32,7 @@ def clustering(X, algorithm, n_clusters=2):
     bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
 
     # connectivity matrix for structured Ward
-    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
+    connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)
 
     # make connectivity symmetric
     connectivity = 0.5 * (connectivity + connectivity.T)
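For context, the symmetrized k-NN graph built above is what scikit-learn's structured Ward clustering consumes; lowering n_neighbors from 10 to 5 makes that graph sparser, so merges are constrained to tighter local neighborhoods. A minimal sketch of the downstream call (the data matrix and cluster count are placeholders, not the repository's actual call site):

    import numpy as np
    from sklearn import cluster
    from sklearn.neighbors import kneighbors_graph

    X = np.random.rand(100, 8)  # stand-in for the per-neuron gate-value matrix

    # Same construction as in the diff: k-NN connectivity, then symmetrization,
    # so Ward sees an undirected neighborhood graph.
    connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)

    ward = cluster.AgglomerativeClustering(n_clusters=2, linkage="ward",
                                           connectivity=connectivity)
    labels = ward.fit_predict(X)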
48 changes: 26 additions & 22 deletions bokeh_vis/main.py
@@ -20,7 +20,7 @@
 import data_format
 from wcloud_standalone import get_wcloud
 import heatmap as hmap
-
+from lrp import get_lrp_timedata
 
 def get_wc_colourGroups(rawInput_source):
 
@@ -109,7 +109,7 @@ def update_source(attrname, old, new):
 
     #update dimension reduction source
     algorithm = projection_selections.value
-    knn = 10
+    knn = 5
     x_pr,performance_metric = dim_reduction.project(x, algorithm, knn, labels)
 
     #update clustering
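The knn value is handed to dim_reduction.project, where it can only matter for neighbor-graph projections (PCA ignores it). A hypothetical stand-in for what such a helper typically wraps; project_sketch, the algorithm names, and the return shape are assumptions, not the repository's code:

    from sklearn.decomposition import PCA
    from sklearn.manifold import Isomap

    def project_sketch(x, algorithm, n_neighbors, labels=None):
        # Reduce x to 2-D; n_neighbors only affects neighbor-based methods.
        if algorithm == "PCA":
            model = PCA(n_components=2)
        elif algorithm == "Isomap":
            model = Isomap(n_neighbors=n_neighbors, n_components=2)
        else:
            raise ValueError(algorithm)
        return model.fit_transform(x), None  # second slot: a performance metric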
@@ -164,6 +164,7 @@ def update_source(attrname, old, new):
     if gate_value=="input_gate":
         wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="in",text=text_banner.text)
     elif gate_value=="forget_gate":
+        print(LRP)
         wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="forget")
     elif gate_value=="output_gate":
         wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="out")
@@ -212,12 +213,12 @@ def update_source(attrname, old, new):
 #Raw input clustering
 rawInput_selections = get_rawInput_selections()
 
-tools = "pan,wheel_zoom,box_zoom,reset,hover"
+tools = "pan,wheel_zoom,box_zoom,reset"
 
 #Dimensionality reduction
 labels = None
 data_pr = data[lstm_layer_name][gate_selections.value]
-X, performance_metric = dim_reduction.project(data_pr, "PCA", n_neighbors=10, labels=labels)
+X, performance_metric = dim_reduction.project(data_pr, "PCA", n_neighbors=5, labels=labels)
 X_cluster_labels, X_colors, _ = clustering.apply_cluster(data_pr,algorithm=clustering_selections[0].value,n_clusters=int(clustering_selections[1].value),mode="nn")
 proj_source = ColumnDataSource(dict(x=X[:,0],y=X[:,1],z=X_colors))
 # + performance_metric[0] + performance_metric[1]
@@ -246,19 +247,21 @@ def update_source(attrname, old, new):
 
 #ONLY if wc from "out" gate?????
 words_to_be_highlighted = list(set(wc_words).intersection(totalLRP[rawInput_selections.value]['words']))
+print(words_to_be_highlighted) #[i for i in wc_words and totalLRP[rawInput_selections.value]['words']]
 
 highlight_source = ColumnDataSource(dict(scores=[]))
-tap_source = ColumnDataSource(dict(wc_words=words_to_be_highlighted,lrp=totalLRP[rawInput_selections.value]['lrp'].tolist()))
+tap_source = ColumnDataSource(dict(wc_words=words_to_be_highlighted))
+lrp_source = ColumnDataSource(dict(lrp=totalLRP[rawInput_selections.value]['lrp'].tolist()))
 #totalLRP : how relevant is each LSTM neuron
 
-taptool.callback = CustomJS(args=dict(source=tap_source,high=highlight_source,div=text_banner),
+
+taptool.callback = CustomJS(args=dict(source=tap_source,lrp=lrp_source,high=highlight_source,div=text_banner),
 code="""
 cell = cb_obj.selected['1d']['indices'][0]
 var d = high.data;
 d['scores'] = []
-e = []
-for(var i=0; i<source.data['lrp'].length; i++){
-d['scores'].push(source.data['lrp'][i][cell])
+for(var i=0; i<source.data['wc_words'].length; i++){
+d['scores'].push(lrp.data['lrp'][cell]*1e4)
 }
 high.change.emit();
 ws = div.text.split(" ");
@@ -268,15 +271,15 @@ def update_source(attrname, old, new):
 if (w_idx>=0){
 if (d['scores'][w_idx]>0){
 if (d['scores'][w_idx]<1){
-ws_out.push("<span style='background-color: rgba(255,0,0,d['scores'][w_idx])'>"+ws[j]+"</span>")
+ws_out.push("<span style='background-color: rgba(255,0,0,"+d['scores'][w_idx]+")'>"+ws[j]+"</span>")
 }
 else {
 ws_out.push("<span style='background-color: rgba(255,0,0,0.98)'>"+ws[j]+"</span>")
 }
 }
 if (d['scores'][w_idx]<0){
 if (Math.abs(d['scores'][w_idx])<1){
-ws_out.push("<span style='background-color: rgba(0,255,0,Math.abs(d['scores'][w_idx]))'>"+ws[j]+"</span>")
+ws_out.push("<span style='background-color: rgba(0,255,0,"+Math.abs(d['scores'][w_idx])+")'>"+ws[j]+"</span>")
 }
 else {
 ws_out.push("<span style='background-color: rgba(0,255,0,0.98)'>"+ws[j]+"</span>")
@@ -288,17 +291,9 @@ def update_source(attrname, old, new):
 }
 }
 div.text = ws_out.join(" ")
+console.log(ws_out)
 """)
 
-"""
-dmax = Math.max.apply(Math, d['scores']);
-for(var i=0; i<source.data['lrp'].length; i++){
-e.push((d['scores'][i]/dmax)*1e10)
-}
-console.log(e)
-"""
-
 
 
 img_source = ColumnDataSource(dict(url = [load_dir+wc_filename]))
 xdr = Range1d(start=0, end=600)
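The two rgba fixes above correct a real bug: the old callback pasted the JavaScript expression into the color string as literal text (rgba(255,0,0,d['scores'][w_idx])), so the browser saw an invalid alpha channel; the fixed version concatenates the numeric score into the string. A rough Python equivalent of the intended highlighting logic (hypothetical helper, for illustration only):

    def highlight(words, scores):
        """Wrap each word in a <span> whose alpha channel encodes |LRP score|."""
        out = []
        for w, s in zip(words, scores):
            if s == 0:
                out.append(w)  # unscored words stay unhighlighted
                continue
            alpha = min(abs(s), 0.98)                  # clamp, as the JS does for |s| >= 1
            color = "255,0,0" if s > 0 else "0,255,0"  # red: positive LRP, green: negative
            out.append(f"<span style='background-color: rgba({color},{alpha})'>{w}</span>")
        return " ".join(out)

    print(highlight(["great", "movie"], [0.7, -0.2]))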
@@ -312,6 +307,14 @@ def update_source(attrname, old, new):
 text_set = Paragraph(text="KMeans: Clusters neurons based on their gate values after training.", width=250, height=100)
 
 
+lrp_timedata = get_lrp_timedata(LRP)
+lrptime_source = ColumnDataSource(dict(lrptime = lrp_timedata,time=[i for i in range(len(lrp_timedata))]))
+lrp_plot = figure(title="Total normalized LRP per timestep",plot_width=300, plot_height=50)
+lrp_plot.scatter('time','lrptime', marker='circle', size=5, alpha=0.5, source=lrptime_source)
+lrp_plot.xaxis.axis_label = 'Time'
+lrp_plot.yaxis.axis_label = 'Total normalized LRP'
+
+
 #Layout
 gate_selections.on_change('value', update_source)
 projection_selections.on_change('value', update_source)
@@ -320,6 +323,7 @@ def update_source(attrname, old, new):
 rawInput_selections.on_change('value', update_source)
 
 gp = layout([project_plot, wc_plot, widgetbox(gate_selections,projection_selections,rawInput_selections,clustering_selections[0],clustering_selections[1],text_0,text_set,label_banner)],
+            [lrp_plot],
             [text_banner],
             responsive=True)
 curdoc().add_root(gp)
1 change: 1 addition & 0 deletions src/IMDB_dataset/imdb_preprocess.py
@@ -216,6 +216,7 @@ def extract_features(filenames,seed,test_size,save_test,n_words,dictionary,embed
     testX = tokenize_and_remove_unk(test_X_tokenized,n_words,dictionary)
     validX = tokenize_and_remove_unk(valid_X_tokenized,n_words,dictionary)
 
+
     test_dict = None
     test_dict_token = None
     if save_test!=None:
13 changes: 3 additions & 10 deletions src/dynamic_lstm_TF.py
@@ -86,14 +86,8 @@ def build_network(net_arch,net_arch_layers,tensorboard_verbose,sequence_length,e
     # Network building
     if embedding_layer:
         net = tflearn.input_data([None,sequence_length])
-        """
-        W = tf.Variable(tf.constant(embedding_initMat.astype(np.float32)), trainable=True, name="W")
-        embedding_placeholder = tf.placeholder(tf.float32, [n_words, embedding_dim])
-        embedding_init = W.assign(embedding_placeholder)
-        """
         W = tf.constant(embedding_initMat, dtype=np.float32,name="W")
         ebd_output = tflearn.embedding(net, input_dim=n_words, output_dim=embedding_dim,weights_init=W, name='embedding')
-
         n = "embedding_output"
         layer_outputs[n] = ebd_output
         prev_incoming = ebd_output
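The deleted block was the trainable-embedding path; what remains bakes the pretrained matrix into the graph as a constant, so the embeddings receive no gradient updates. For reference, the two strategies side by side (a sketch under assumed TF 1.x semantics, reusing build_network's embedding_initMat, n_words and embedding_dim):

    import numpy as np
    import tensorflow as tf

    # Frozen embeddings (kept): a graph constant, never updated during training.
    W_frozen = tf.constant(embedding_initMat, dtype=tf.float32, name="W")

    # Trainable embeddings (removed): a Variable initialized from the same
    # matrix through a placeholder, then fine-tuned with the rest of the network.
    W_var = tf.Variable(tf.constant(embedding_initMat.astype(np.float32)),
                        trainable=True, name="W")
    embedding_placeholder = tf.placeholder(tf.float32, [n_words, embedding_dim])
    embedding_init = W_var.assign(embedding_placeholder)
    # later, once a session exists:
    # model.session.run(embedding_init, feed_dict={embedding_placeholder: embedding_initMat})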
@@ -164,6 +158,7 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
     """
     with open('trainValidtestNew.pickle','rb') as handle:
         (trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat) = pickle.load(handle)
+    """
 
     d = test_dict
     if save_mode=="pickle":
@@ -190,8 +185,6 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
 
     model, layer_outputs = build_network(net_arch,net_arch_layers,tensorboard_verbose,trainX.shape[1],embedding_dim,tensorboard_dir,batch_size,n_words,embedding_layer,ckp_path,embedding_initMat)
 
-    #model.session.run(embedding_init, feed_dict={embedding_placeholder: embedding_initMat})
-
     model.fit(trainX, trainY, validation_set=(validX, validY), n_epoch=n_epoch, show_metric=show_metric, batch_size=batch_size)
 
     print("Evaluating trained model on test set...")
@@ -215,11 +208,11 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
 
     feed = testX
     input_files = filenames_test_sfd
-
+    """
     export_serial_lstm_data(model,layer_outputs,feed,input_files,internals,save_dir+"test_",save_mode=save_mode)
     print("Exported internals...")
-
+    """
 
     #Delete part that creates problem in restoring model - should still be able to evaluate, but tricky for continuing training
     del tf.get_collection_ref(tf.GraphKeys.TRAIN_OPS)[:]
     model.save(save_dir+"tf_model.tfl")
26 changes: 25 additions & 1 deletion src/lrp.py
@@ -13,6 +13,29 @@
 import gensim
 
 
+def get_lrp_timedata(LRP):
+
+    out_reversed = []
+    kkeys = list(LRP.keys())
+    lens = []
+    for i in kkeys:
+        lens.append(len(list(LRP[i]['words'])))
+    max_len = np.max(lens)
+    for i in range(max_len):
+        j = 0
+        normalize_factor = 0
+        lrp_t = 0
+        for k in kkeys:
+            if lens[j]-1-i>=0:
+                normalize_factor = normalize_factor + 1
+                lrp = list(LRP[k]['scores'])[lens[j]-1-i]
+                lrp_t = lrp_t + lrp
+
+            j = j + 1
+        out_reversed.append(lrp_t/normalize_factor)
+
+    return out_reversed[::-1] #reverse for time = 0...T
+
 def get_PosNegNeurons_dict(i,predictions,lrp_neurons):
     # Get neurons that trigger exclusively for positive or negative reviews according to the network. Assign them to neutral if activate for both types of reviews.
     reviewLRP_data = {"pos":[],"neg":[],"neutral":[]}
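The new get_lrp_timedata helper aligns all reviews at their final timestep: at each offset from the end it sums LRP over the reviews long enough to contribute and divides by their count, then reverses the result so it runs t = 0..T. A quick sanity check with toy values (hypothetical data, not from the corpus):

    LRP_toy = {
        "a": {"words": ["x", "y", "z"], "scores": [1.0, 2.0, 3.0]},
        "b": {"words": ["u", "v"],      "scores": [4.0, 5.0]},
    }
    print(get_lrp_timedata(LRP_toy))
    # offset 0 (last words): (3.0 + 5.0) / 2 = 4.0
    # offset 1:              (2.0 + 4.0) / 2 = 3.0
    # offset 2:               1.0 / 1        = 1.0
    # reversed for time = 0..T -> [1.0, 3.0, 4.0]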
@@ -215,6 +238,7 @@ def lrp_embedding(model,emb_name,n_words,feed,lstm_first_input,lrp_lstm,dictiona
     Rout = lrp_lstm
     ws = []
     scores = []
+
     for t in range(sequence_len):
         zj = lstm_first_input[t,:]
         zi = np.zeros((W.shape[0]))
@@ -223,7 +247,7 @@ def lrp_embedding(model,emb_name,n_words,feed,lstm_first_input,lrp_lstm,dictiona
         lrp_ebd = lrp_linear(zi, W, b, zj, R_t, N, eps, delta, debug)
         ws.append(dictionary[z[t]])
         scores.append(lrp_ebd[z[t]])
-
+
     LRP = collections.OrderedDict(words=ws,scores=scores)
 
     return LRP
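lrp_embedding above delegates to lrp_linear(zi, W, b, zj, R_t, N, eps, delta, debug), whose argument list matches the epsilon-LRP rule for linear layers from Arras et al. (2017). A sketch of that rule under this assumption; the repository's actual implementation may differ in details:

    import numpy as np

    def lrp_linear_sketch(hin, w, b, hout, Rout, bias_nb_units, eps=0.001, bias_factor=0.0):
        """Redistribute the relevance Rout of hout = hin @ w + b onto the inputs hin."""
        sign_out = np.where(hout >= 0, 1.0, -1.0)  # stabilizer follows the output sign
        numer = w * hin[:, None] + bias_factor * (b + eps * sign_out)[None, :] / bias_nb_units
        denom = (hout + eps * sign_out)[None, :]   # eps keeps the division stable
        return ((numer / denom) * Rout[None, :]).sum(axis=1)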
3 changes: 2 additions & 1 deletion src/wcloud_standalone.py
@@ -92,6 +92,7 @@ def get_wcloud(LRP,k,save_dir,color_dict=None,gate="out",text=None):
 
     ws = LRP[k]['words']
     scs = LRP[k]['scores']
+
     if gate=="in":
         wc = WordCloud(
             background_color="white",
@@ -132,7 +133,7 @@ def get_wcloud(LRP,k,save_dir,color_dict=None,gate="out",text=None):
     weights=collections.OrderedDict()
     for i in range(len(ws)):
         if ws[i] not in out_words:
-            weights[ws[i]] = scs[i]*1e+80
+            weights[ws[i]] = scs[i]
     wc = WordCloud(
         background_color="white",
         max_words=2000,
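Dropping the 1e+80 factor is safe for positive scores because WordCloud.generate_from_frequencies rescales weights relative to the maximum, so only the ratios between word weights matter. A minimal demonstration (hypothetical weights):

    from wordcloud import WordCloud

    weights = {"great": 3.0, "boring": 1.5, "plot": 0.5}
    wc = WordCloud(background_color="white", max_words=2000, random_state=0)
    wc.generate_from_frequencies(weights)
    wc.to_file("wcloud_demo.png")  # same image if every weight is scaled by a constant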
