-
-
Notifications
You must be signed in to change notification settings - Fork 27
/
modularity_nmf.py
148 lines (139 loc) · 7.44 KB
/
modularity_nmf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""M-NMF module."""
import os
import time
import community
import numpy as np
import pandas as pd
from tqdm import tqdm
import networkx as nx
import tensorflow as tf
from calculation_helper import modularity_generator, overlap_generator
from calculation_helper import graph_reader, json_dumper, log_setup, log_updater, loss_printer
class MNMF:
    """
    Modularity regularized non-negative matrix factorization machine class.
    The calculations use Tensorflow (1.x graph/session API).
    """
    def __init__(self, args):
        """
        Method to parse the graph and set up the similarity matrices,
        the embedding matrices and the cluster centers.
        :param args: Object with parameters (input/output paths, dimensions,
                     clusters, eta, alpha, beta, lambd, lower_control,
                     iteration_number, early_stopping, dump_matrices).
        """
        print("Model initialization started.\n")
        self.computation_graph = tf.Graph()
        with self.computation_graph.as_default():
            self.args = args
            self.G = graph_reader(args.input)
            self.number_of_nodes = len(nx.nodes(self.G))
            # Dense similarity matrices for large graphs do not fit on GPU
            # memory, so fall back to CPU execution.
            if self.number_of_nodes > 10000:
                os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
            # S_0: neighborhood-overlap similarity; B1: adjacency matrix;
            # B2: modularity matrix (all fed at run time).
            self.S_0 = tf.placeholder(tf.float64, shape=(None, None))
            self.B1 = tf.placeholder(tf.float64, shape=(None, None))
            self.B2 = tf.placeholder(tf.float64, shape=(None, None))
            # M, U: basis and node-embedding factors; H: cluster membership
            # indicators; C: cluster centers. Uniform random init in [0, 1).
            self.M = tf.Variable(tf.random_uniform([self.number_of_nodes, self.args.dimensions],
                                                   0, 1, dtype=tf.float64))
            self.U = tf.Variable(tf.random_uniform([self.number_of_nodes, self.args.dimensions],
                                                   0, 1, dtype=tf.float64))
            self.H = tf.Variable(tf.random_uniform([self.number_of_nodes, self.args.clusters],
                                                   0, 1, dtype=tf.float64))
            self.C = tf.Variable(tf.random_uniform([self.args.clusters, self.args.dimensions],
                                                   0, 1, dtype=tf.float64))
            # Joint similarity: eta-weighted second-order proximity plus
            # first-order proximity (the adjacency matrix).
            self.S = np.float64(self.args.eta)*self.S_0 + self.B1
            self.init = tf.global_variables_initializer()

    def build_graph(self):
        """
        Defining the M-NMF computation graph based on the power iteration method.
        The procedure has 4 separate phases:
        1. Updating the base matrix.
        2. Updating the embedding.
        3. Updating the cluster centers.
        4. Updating the membership of nodes.

        Each phase is a multiplicative update; denominators are clipped from
        below by ``args.lower_control`` to avoid division by zero, and rows
        are L2-normalized after every update.
        """
        #---------------------------------
        # 1. Phase
        #---------------------------------
        self.enum_1 = tf.matmul(self.S, self.U, a_is_sparse=True)
        self.denom_1 = tf.matmul(self.M, tf.matmul(self.U, self.U, transpose_a=True))
        self.denom_2 = tf.maximum(np.float64(self.args.lower_control), self.denom_1)
        self.M = self.M.assign(tf.nn.l2_normalize(tf.multiply(self.M, self.enum_1/self.denom_2), 1))
        #---------------------------------
        # 2. Phase
        #---------------------------------
        self.enum_2 = tf.matmul(self.S, self.M, transpose_a=True, a_is_sparse=True)+self.args.alpha*tf.matmul(self.H, self.C)
        self.denom_3 = tf.matmul(self.U, tf.matmul(self.M, self.M, transpose_a=True)+self.args.alpha*tf.matmul(self.C, self.C, transpose_a=True))
        self.denom_4 = tf.maximum(np.float64(self.args.lower_control), self.denom_3)
        # FIX: was np.multiply -- use tf.multiply like the other phases so the
        # elementwise product is built as a graph op instead of relying on
        # NumPy's ufunc dispatch to tf.Tensor operator overloads.
        self.U = self.U.assign(tf.nn.l2_normalize(tf.multiply(self.U, self.enum_2/self.denom_4), 1))
        #---------------------------------
        # 3. Phase
        #---------------------------------
        self.enum_3 = tf.matmul(self.H, self.U, transpose_a=True)
        self.denom_5 = tf.matmul(self.C, tf.matmul(self.U, self.U, transpose_a=True))
        self.denom_6 = tf.maximum(np.float64(self.args.lower_control), self.denom_5)
        self.C = self.C.assign(tf.nn.l2_normalize(tf.multiply(self.C, self.enum_3/self.denom_6), 1))
        #---------------------------------
        # 4. Phase
        #---------------------------------
        # Closed-form multiplicative update for H from the quadratic in the
        # objective (root of the discriminant), scaled by the lambd penalty.
        self.B1H = tf.matmul(self.B1, self.H, a_is_sparse=True)
        self.B2H = tf.matmul(self.B2, self.H, a_is_sparse=True)
        self.HHH = tf.matmul(self.H, (tf.matmul(self.H, self.H, transpose_a=True)))
        self.UC = tf.matmul(self.U, self.C, transpose_b=True)
        self.rooted = tf.square(np.float64(2*self.args.beta)*self.B2H)+tf.multiply(np.float64(16*self.args.lambd)*self.HHH, (np.float64(2*self.args.beta)*self.B1H+np.float64(2*self.args.alpha)*self.UC+(np.float64(4*self.args.lambd-2*self.args.alpha))*self.H))
        self.sqroot_1 = tf.sqrt(self.rooted)
        self.enum_4 = np.float64(-2*self.args.beta)*self.B2H+self.sqroot_1
        self.denom_7 = np.float64(8*self.args.lambd)*self.HHH
        self.denom_8 = tf.maximum(np.float64(self.args.lower_control), self.denom_7)
        self.sqroot_2 = tf.sqrt(self.enum_4/self.denom_8)
        self.H = self.H.assign(tf.nn.l2_normalize(tf.multiply(self.H, self.sqroot_2), 1))

    def update_state(self, H):
        """
        Procedure to calculate the cluster memberships and modularity.
        Tracks the best modularity seen so far and counts non-improving
        iterations in ``self.stop_index`` for early stopping.
        :param H: Cluster membership indicator (node x cluster array).
        :return current_modularity: Modularity based on the cluster memberships.
        """
        # Hard assignment: each node joins its highest-scoring cluster.
        indices = np.argmax(H, axis=1)
        indices = {int(i): int(indices[i]) for i, _ in enumerate(indices)}
        current_modularity = community.modularity(indices, self.G)
        if current_modularity > self.best_modularity:
            self.best_modularity = current_modularity
            self.optimal_indices = indices
            self.stop_index = 0
        else:
            self.stop_index = self.stop_index + 1
        return current_modularity

    def initiate_dump(self, session, feed_dict):
        """
        Method to save the clusters, node representations, cluster memberships and logs.
        :param session: Live TF session used to evaluate C and U.
        :param feed_dict: Placeholder feed used during optimization.
        """
        json_dumper(self.optimal_indices, self.args.assignment_output)
        json_dumper(self.logs, self.args.log_output)
        loss_printer(self.logs)
        if self.args.dump_matrices:
            cols = ["X_" + str(x) for x in range(self.args.dimensions)]
            self.optimal_clusters = pd.DataFrame(session.run(self.C, feed_dict=feed_dict),
                                                 columns=cols)
            self.optimal_node_representations = pd.DataFrame(session.run(self.U, feed_dict=feed_dict), columns=cols)
            # index=False (was index=None, which only worked because None is
            # falsy): do not write the row index column.
            self.optimal_clusters.to_csv(self.args.cluster_mean_output, index=False)
            self.optimal_node_representations.to_csv(self.args.embedding_output, index=False)

    def optimize(self):
        """
        Method to run the optimization and halt it when overfitting started.
        The output matrices are all saved when optimization has finished.
        """
        self.best_modularity = 0
        self.stop_index = 0
        with tf.Session(graph=self.computation_graph) as session:
            self.init.run()
            self.logs = log_setup(self.args)
            print("Optimization started.\n")
            self.build_graph()
            # Feed the dense similarity, adjacency and modularity matrices once;
            # the same feed is reused on every iteration.
            feed_dict = {self.S_0: overlap_generator(self.G), self.B1: np.array(nx.adjacency_matrix(self.G).todense()), self.B2: modularity_generator(self.G)}
            for i in tqdm(range(self.args.iteration_number)):
                start = time.time()
                # Running self.H executes the whole chained update (phases 1-4)
                # because each assign depends on the previous one.
                H = session.run(self.H, feed_dict=feed_dict)
                current_modularity = self.update_state(H)
                end = time.time()
                log_updater(self.logs, i, end-start, current_modularity)
                # Stop when modularity has not improved for early_stopping rounds.
                if self.stop_index > self.args.early_stopping:
                    break
            self.initiate_dump(session, feed_dict)