Commit f5c504e

Merge branch 'master' of https://gitlab.com/yhalkiad/tRustNN

Ioannis Chalkiadakis committed Aug 7, 2017
2 parents d5376b3 + 6821963
Showing 6 changed files with 58 additions and 35 deletions.
2 changes: 1 addition & 1 deletion bokeh_vis/clustering.py
@@ -32,7 +32,7 @@ def clustering(X, algorithm, n_clusters=2):
     bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
 
     # connectivity matrix for structured Ward
-    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
+    connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)
 
     # make connectivity symmetric
     connectivity = 0.5 * (connectivity + connectivity.T)
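For context, the symmetrized k-NN graph built above is what scikit-learn's structured Ward clustering consumes; lowering n_neighbors from 10 to 5 makes that graph sparser, so merges are constrained to tighter local neighborhoods. A minimal sketch of the downstream call (the data matrix and cluster count are placeholders, not the repository's actual call site):

    import numpy as np
    from sklearn import cluster
    from sklearn.neighbors import kneighbors_graph

    X = np.random.rand(100, 8)  # stand-in for the per-neuron gate-value matrix

    # Same construction as in the diff: k-NN connectivity, then symmetrization,
    # so Ward sees an undirected neighborhood graph.
    connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)

    ward = cluster.AgglomerativeClustering(n_clusters=2, linkage="ward",
                                           connectivity=connectivity)
    labels = ward.fit_predict(X)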
48 changes: 26 additions & 22 deletions bokeh_vis/main.py
@@ -20,7 +20,7 @@
 import data_format
 from wcloud_standalone import get_wcloud
 import heatmap as hmap
-
+from lrp import get_lrp_timedata
 
 def get_wc_colourGroups(rawInput_source):
 
@@ -109,7 +109,7 @@ def update_source(attrname, old, new):
 
     #update dimension reduction source
     algorithm = projection_selections.value
-    knn = 10
+    knn = 5
     x_pr,performance_metric = dim_reduction.project(x, algorithm, knn, labels)
 
     #update clustering
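The knn value is handed to dim_reduction.project, where it can only matter for neighbor-graph projections (PCA ignores it). A hypothetical stand-in for what such a helper typically wraps; project_sketch, the algorithm names, and the return shape are assumptions, not the repository's code:

    from sklearn.decomposition import PCA
    from sklearn.manifold import Isomap

    def project_sketch(x, algorithm, n_neighbors, labels=None):
        # Reduce x to 2-D; n_neighbors only affects neighbor-based methods.
        if algorithm == "PCA":
            model = PCA(n_components=2)
        elif algorithm == "Isomap":
            model = Isomap(n_neighbors=n_neighbors, n_components=2)
        else:
            raise ValueError(algorithm)
        return model.fit_transform(x), None  # second slot: a performance metric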
@@ -164,6 +164,7 @@ def update_source(attrname, old, new):
     if gate_value=="input_gate":
         wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="in",text=text_banner.text)
     elif gate_value=="forget_gate":
+        print(LRP)
         wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="forget")
     elif gate_value=="output_gate":
         wc_filename,wc_img,wc_words = get_wcloud(LRP,rawInput_selections.value,load_dir,color_dict=color_dict,gate="out")
@@ -212,12 +213,12 @@ def update_source(attrname, old, new):
 #Raw input clustering
 rawInput_selections = get_rawInput_selections()
 
-tools = "pan,wheel_zoom,box_zoom,reset,hover"
+tools = "pan,wheel_zoom,box_zoom,reset"
 
 #Dimensionality reduction
 labels = None
 data_pr = data[lstm_layer_name][gate_selections.value]
-X, performance_metric = dim_reduction.project(data_pr, "PCA", n_neighbors=10, labels=labels)
+X, performance_metric = dim_reduction.project(data_pr, "PCA", n_neighbors=5, labels=labels)
 X_cluster_labels, X_colors, _ = clustering.apply_cluster(data_pr,algorithm=clustering_selections[0].value,n_clusters=int(clustering_selections[1].value),mode="nn")
 proj_source = ColumnDataSource(dict(x=X[:,0],y=X[:,1],z=X_colors))
 # + performance_metric[0] + performance_metric[1]
@@ -246,19 +247,21 @@ def update_source(attrname, old, new):
 
 #ONLY if wc from "out" gate?????
 words_to_be_highlighted = list(set(wc_words).intersection(totalLRP[rawInput_selections.value]['words']))
+print(words_to_be_highlighted) #[i for i in wc_words and totalLRP[rawInput_selections.value]['words']]
 
 highlight_source = ColumnDataSource(dict(scores=[]))
-tap_source = ColumnDataSource(dict(wc_words=words_to_be_highlighted,lrp=totalLRP[rawInput_selections.value]['lrp'].tolist()))
+tap_source = ColumnDataSource(dict(wc_words=words_to_be_highlighted))
+lrp_source = ColumnDataSource(dict(lrp=totalLRP[rawInput_selections.value]['lrp'].tolist()))
 #totalLRP : how relevant is each LSTM neuron
 
-taptool.callback = CustomJS(args=dict(source=tap_source,high=highlight_source,div=text_banner),
+
+taptool.callback = CustomJS(args=dict(source=tap_source,lrp=lrp_source,high=highlight_source,div=text_banner),
 code="""
 cell = cb_obj.selected['1d']['indices'][0]
 var d = high.data;
 d['scores'] = []
-e = []
-for(var i=0; i<source.data['lrp'].length; i++){
-d['scores'].push(source.data['lrp'][i][cell])
+for(var i=0; i<source.data['wc_words'].length; i++){
+d['scores'].push(lrp.data['lrp'][cell]*1e4)
 }
 high.change.emit();
 ws = div.text.split(" ");
@@ -268,15 +271,15 @@ def update_source(attrname, old, new):
 if (w_idx>=0){
 if (d['scores'][w_idx]>0){
 if (d['scores'][w_idx]<1){
-ws_out.push("<span style='background-color: rgba(255,0,0,d['scores'][w_idx])'>"+ws[j]+"</span>")
+ws_out.push("<span style='background-color: rgba(255,0,0,"+d['scores'][w_idx]+")'>"+ws[j]+"</span>")
 }
 else {
 ws_out.push("<span style='background-color: rgba(255,0,0,0.98)'>"+ws[j]+"</span>")
 }
 }
 if (d['scores'][w_idx]<0){
 if (Math.abs(d['scores'][w_idx])<1){
-ws_out.push("<span style='background-color: rgba(0,255,0,Math.abs(d['scores'][w_idx]))'>"+ws[j]+"</span>")
+ws_out.push("<span style='background-color: rgba(0,255,0,"+Math.abs(d['scores'][w_idx])+")'>"+ws[j]+"</span>")
 }
 else {
 ws_out.push("<span style='background-color: rgba(0,255,0,0.98)'>"+ws[j]+"</span>")
@@ -288,17 +291,9 @@ def update_source(attrname, old, new):
 }
 }
 div.text = ws_out.join(" ")
+console.log(ws_out)
 """)
 
-"""
-dmax = Math.max.apply(Math, d['scores']);
-for(var i=0; i<source.data['lrp'].length; i++){
-e.push((d['scores'][i]/dmax)*1e10)
-}
-console.log(e)
-"""
-
 
 
 img_source = ColumnDataSource(dict(url = [load_dir+wc_filename]))
 xdr = Range1d(start=0, end=600)
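The two rgba fixes above correct a real bug: the old callback pasted the JavaScript expression into the color string as literal text (rgba(255,0,0,d['scores'][w_idx])), so the browser saw an invalid alpha channel; the fixed version concatenates the numeric score into the string. A rough Python equivalent of the intended highlighting logic (hypothetical helper, for illustration only):

    def highlight(words, scores):
        """Wrap each word in a <span> whose alpha channel encodes |LRP score|."""
        out = []
        for w, s in zip(words, scores):
            if s == 0:
                out.append(w)  # unscored words stay unhighlighted
                continue
            alpha = min(abs(s), 0.98)                  # clamp, as the JS does for |s| >= 1
            color = "255,0,0" if s > 0 else "0,255,0"  # red: positive LRP, green: negative
            out.append(f"<span style='background-color: rgba({color},{alpha})'>{w}</span>")
        return " ".join(out)

    print(highlight(["great", "movie"], [0.7, -0.2]))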
@@ -312,6 +307,14 @@ def update_source(attrname, old, new):
 text_set = Paragraph(text="KMeans: Clusters neurons based on their gate values after training.", width=250, height=100)
 
 
+lrp_timedata = get_lrp_timedata(LRP)
+lrptime_source = ColumnDataSource(dict(lrptime = lrp_timedata,time=[i for i in range(len(lrp_timedata))]))
+lrp_plot = figure(title="Total normalized LRP per timestep",plot_width=300, plot_height=50)
+lrp_plot.scatter('time','lrptime', marker='circle', size=5, alpha=0.5, source=lrptime_source)
+lrp_plot.xaxis.axis_label = 'Time'
+lrp_plot.yaxis.axis_label = 'Total normalized LRP'
+
+
 #Layout
 gate_selections.on_change('value', update_source)
 projection_selections.on_change('value', update_source)
@@ -320,6 +323,7 @@ def update_source(attrname, old, new):
 rawInput_selections.on_change('value', update_source)
 
 gp = layout([project_plot, wc_plot, widgetbox(gate_selections,projection_selections,rawInput_selections,clustering_selections[0],clustering_selections[1],text_0,text_set,label_banner)],
+            [lrp_plot],
             [text_banner],
             responsive=True)
 curdoc().add_root(gp)
1 change: 1 addition & 0 deletions src/IMDB_dataset/imdb_preprocess.py
@@ -216,6 +216,7 @@ def extract_features(filenames,seed,test_size,save_test,n_words,dictionary,embed
     testX = tokenize_and_remove_unk(test_X_tokenized,n_words,dictionary)
     validX = tokenize_and_remove_unk(valid_X_tokenized,n_words,dictionary)
 
+
     test_dict = None
     test_dict_token = None
     if save_test!=None:
13 changes: 3 additions & 10 deletions src/dynamic_lstm_TF.py
@@ -86,14 +86,8 @@ def build_network(net_arch,net_arch_layers,tensorboard_verbose,sequence_length,e
     # Network building
     if embedding_layer:
         net = tflearn.input_data([None,sequence_length])
-        """
-        W = tf.Variable(tf.constant(embedding_initMat.astype(np.float32)), trainable=True, name="W")
-        embedding_placeholder = tf.placeholder(tf.float32, [n_words, embedding_dim])
-        embedding_init = W.assign(embedding_placeholder)
-        """
         W = tf.constant(embedding_initMat, dtype=np.float32,name="W")
         ebd_output = tflearn.embedding(net, input_dim=n_words, output_dim=embedding_dim,weights_init=W, name='embedding')
-
         n = "embedding_output"
         layer_outputs[n] = ebd_output
         prev_incoming = ebd_output
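The deleted block was the trainable-embedding path; what remains bakes the pretrained matrix into the graph as a constant, so the embeddings receive no gradient updates. For reference, the two strategies side by side (a sketch under assumed TF 1.x semantics, reusing build_network's embedding_initMat, n_words and embedding_dim):

    import numpy as np
    import tensorflow as tf

    # Frozen embeddings (kept): a graph constant, never updated during training.
    W_frozen = tf.constant(embedding_initMat, dtype=tf.float32, name="W")

    # Trainable embeddings (removed): a Variable initialized from the same
    # matrix through a placeholder, then fine-tuned with the rest of the network.
    W_var = tf.Variable(tf.constant(embedding_initMat.astype(np.float32)),
                        trainable=True, name="W")
    embedding_placeholder = tf.placeholder(tf.float32, [n_words, embedding_dim])
    embedding_init = W_var.assign(embedding_placeholder)
    # later, once a session exists:
    # model.session.run(embedding_init, feed_dict={embedding_placeholder: embedding_initMat})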
@@ -164,6 +158,7 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
     """
     with open('trainValidtestNew.pickle','rb') as handle:
         (trainX,validX,testX,trainY,validY,testY,filenames_train,filenames_valid,filenames_test_sfd,maxlen,test_dict,test_dict_token,embedding_initMat) = pickle.load(handle)
+    """
 
     d = test_dict
     if save_mode=="pickle":
@@ -190,8 +185,6 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
 
     model, layer_outputs = build_network(net_arch,net_arch_layers,tensorboard_verbose,trainX.shape[1],embedding_dim,tensorboard_dir,batch_size,n_words,embedding_layer,ckp_path,embedding_initMat)
 
-    #model.session.run(embedding_init, feed_dict={embedding_placeholder: embedding_initMat})
-
     model.fit(trainX, trainY, validation_set=(validX, validY), n_epoch=n_epoch, show_metric=show_metric, batch_size=batch_size)
 
     print("Evaluating trained model on test set...")
@@ -215,11 +208,11 @@ def train(seed,net_arch,net_arch_layers,save_path,n_epoch,tensorboard_verbose,sh
 
     feed = testX
     input_files = filenames_test_sfd
-
+    """
     export_serial_lstm_data(model,layer_outputs,feed,input_files,internals,save_dir+"test_",save_mode=save_mode)
     print("Exported internals...")
-
+    """
 
     #Delete part that creates problem in restoring model - should still be able to evaluate, but tricky for continuing training
     del tf.get_collection_ref(tf.GraphKeys.TRAIN_OPS)[:]
     model.save(save_dir+"tf_model.tfl")
26 changes: 25 additions & 1 deletion src/lrp.py
@@ -13,6 +13,29 @@
 import gensim
 
 
+def get_lrp_timedata(LRP):
+
+    out_reversed = []
+    kkeys = list(LRP.keys())
+    lens = []
+    for i in kkeys:
+        lens.append(len(list(LRP[i]['words'])))
+    max_len = np.max(lens)
+    for i in range(max_len):
+        j = 0
+        normalize_factor = 0
+        lrp_t = 0
+        for k in kkeys:
+            if lens[j]-1-i>=0:
+                normalize_factor = normalize_factor + 1
+                lrp = list(LRP[k]['scores'])[lens[j]-1-i]
+                lrp_t = lrp_t + lrp
+
+            j = j + 1
+        out_reversed.append(lrp_t/normalize_factor)
+
+    return out_reversed[::-1] #reverse for time = 0...T
+
 def get_PosNegNeurons_dict(i,predictions,lrp_neurons):
     # Get neurons that trigger exclusively for positive or negative reviews according to the network. Assign them to neutral if activate for both types of reviews.
     reviewLRP_data = {"pos":[],"neg":[],"neutral":[]}
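The new get_lrp_timedata helper aligns all reviews at their final timestep: at each offset from the end it sums LRP over the reviews long enough to contribute and divides by their count, then reverses the result so it runs t = 0..T. A quick sanity check with toy values (hypothetical data, not from the corpus):

    LRP_toy = {
        "a": {"words": ["x", "y", "z"], "scores": [1.0, 2.0, 3.0]},
        "b": {"words": ["u", "v"],      "scores": [4.0, 5.0]},
    }
    print(get_lrp_timedata(LRP_toy))
    # offset 0 (last words): (3.0 + 5.0) / 2 = 4.0
    # offset 1:              (2.0 + 4.0) / 2 = 3.0
    # offset 2:               1.0 / 1        = 1.0
    # reversed for time = 0..T -> [1.0, 3.0, 4.0]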
@@ -215,6 +238,7 @@ def lrp_embedding(model,emb_name,n_words,feed,lstm_first_input,lrp_lstm,dictiona
     Rout = lrp_lstm
     ws = []
     scores = []
+
     for t in range(sequence_len):
         zj = lstm_first_input[t,:]
         zi = np.zeros((W.shape[0]))
@@ -223,7 +247,7 @@ def lrp_embedding(model,emb_name,n_words,feed,lstm_first_input,lrp_lstm,dictiona
         lrp_ebd = lrp_linear(zi, W, b, zj, R_t, N, eps, delta, debug)
         ws.append(dictionary[z[t]])
         scores.append(lrp_ebd[z[t]])
-
+
     LRP = collections.OrderedDict(words=ws,scores=scores)
 
     return LRP
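lrp_embedding above delegates to lrp_linear(zi, W, b, zj, R_t, N, eps, delta, debug), whose argument list matches the epsilon-LRP rule for linear layers from Arras et al. (2017). A sketch of that rule under this assumption; the repository's actual implementation may differ in details:

    import numpy as np

    def lrp_linear_sketch(hin, w, b, hout, Rout, bias_nb_units, eps=0.001, bias_factor=0.0):
        """Redistribute the relevance Rout of hout = hin @ w + b onto the inputs hin."""
        sign_out = np.where(hout >= 0, 1.0, -1.0)  # stabilizer follows the output sign
        numer = w * hin[:, None] + bias_factor * (b + eps * sign_out)[None, :] / bias_nb_units
        denom = (hout + eps * sign_out)[None, :]   # eps keeps the division stable
        return ((numer / denom) * Rout[None, :]).sum(axis=1)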
3 changes: 2 additions & 1 deletion src/wcloud_standalone.py
@@ -92,6 +92,7 @@ def get_wcloud(LRP,k,save_dir,color_dict=None,gate="out",text=None):
 
     ws = LRP[k]['words']
     scs = LRP[k]['scores']
+
     if gate=="in":
         wc = WordCloud(
             background_color="white",
@@ -132,7 +133,7 @@ def get_wcloud(LRP,k,save_dir,color_dict=None,gate="out",text=None):
     weights=collections.OrderedDict()
     for i in range(len(ws)):
         if ws[i] not in out_words:
-            weights[ws[i]] = scs[i]*1e+80
+            weights[ws[i]] = scs[i]
     wc = WordCloud(
         background_color="white",
         max_words=2000,
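Dropping the 1e+80 factor is safe for positive scores because WordCloud.generate_from_frequencies rescales weights relative to the maximum, so only the ratios between word weights matter. A minimal demonstration (hypothetical weights):

    from wordcloud import WordCloud

    weights = {"great": 3.0, "boring": 1.5, "plot": 0.5}
    wc = WordCloud(background_color="white", max_words=2000, random_state=0)
    wc.generate_from_frequencies(weights)
    wc.to_file("wcloud_demo.png")  # same image if every weight is scaled by a constant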
