Placn Edits #5

Draft
wants to merge 27 commits into base: master
fixes
Jae committed Aug 9, 2021
commit da2f3a0e10b9d7f70dfe6582e3922d35888337ff
49 changes: 34 additions & 15 deletions subgraph_extraction/datasets.py
@@ -7,6 +7,7 @@
import json
import pickle
import dgl
+import dgl.contrib.sampling
from utils.graph_utils import ssp_multigraph_to_dgl, incidence_matrix
from utils.data_utils import process_files, save_to_file, plot_rel_dist
from .graph_sampler import *
@@ -53,9 +54,10 @@ def get_kge_embeddings(dataset, kge_model):
    return node_features, kge_entity2id


+
class SubgraphDataset(Dataset):
    """Extracted, labeled, subgraph dataset -- DGL Only"""


    def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_relations=None, add_traspose_rels=False, num_neg_samples_per_link=1, use_kge_embeddings=False, dataset='', kge_model='', file_name='', placn_size=20):

@@ -73,7 +75,9 @@ def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_r
        if add_traspose_rels:
            ssp_graph_t = [adj.T for adj in ssp_graph]
            ssp_graph += ssp_graph_t


+        A_incidence = incidence_matrix(ssp_graph)
+        A_incidence += A_incidence.T
        # the effective number of relations after adding symmetric adjacency matrices and/or self connections
        self.aug_num_rels = len(ssp_graph)
        self.graph = ssp_multigraph_to_dgl(ssp_graph)
@@ -83,24 +87,39 @@ def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_r
        n_nodes = self.graph.number_of_nodes()
        # tensor of features, used to look up features by node pair (i, j)
        self.placn_features = np.zeros((n_nodes, n_nodes, 5))
-        for i in range(0,n_nodes):
-            i_nei = dgl.sampling.sample_neighbors(self.graph, np.array([i]), -1).nodes()
+        neighborCache = {}
+        for i in tqdm(range(0, n_nodes)):
+            if i in neighborCache:
+                i_nei = neighborCache[i]
+            else:
+                i_nei = get_neighbor_nodes(set([i]), A_incidence, 1, None)
+                neighborCache[i] = i_nei

            for j in range(0, n_nodes):
                if i == j: continue
-                j_nei = dgl.sampling.sample_neighbors(self.graph, np.array([j]), -1).nodes()
+                if j in neighborCache:
+                    j_nei = neighborCache[j]
+                else:
+                    j_nei = get_neighbor_nodes(set([j]), A_incidence, 1, None)
+                    neighborCache[j] = j_nei

                cn_set = set(i_nei)
                cn_set.intersection_update(set(j_nei))
-                placn_features[i][j][0] = len(cn_set)#Common neighboiurs
+                self.placn_features[i][j][0] = len(cn_set)  # common neighbours

                all_nei = set(i_nei)
                all_nei.update(j_nei)  # union() returns a new set; update() grows all_nei in place
-                placn_features[i][j][1] = len(cn_set) / len(all_nei) #Jerard coefficient
+                self.placn_features[i][j][1] = len(cn_set) / len(all_nei)  # Jaccard coefficient

                aa_sum = 0  # Adamic-Adar-style sum (note: classic Adamic-Adar sums 1/log(deg) over common neighbours only)
                for k in all_nei:
-                    aa_sum = aa_sum + len(dgl.sampling.sample_neighbors(self.graph, np.array([k]), -1).nodes())
-                placn_features[i][j][2] = aa_sum #adamic-adair
+                    if k in neighborCache:
+                        k_nei = neighborCache[k]
+                    else:
+                        k_nei = get_neighbor_nodes(set([k]), A_incidence, 1, None)
+                        neighborCache[k] = k_nei
+                    aa_sum = aa_sum + len(k_nei)
+                self.placn_features[i][j][2] = aa_sum  # Adamic-Adar

        self.ssp_graph = ssp_graph
@@ -145,7 +164,7 @@ def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_r
    def __getitem__(self, index):
        with self.main_env.begin(db=self.db_pos) as txn:
            str_id = '{:08}'.format(index).encode('ascii')
-            nodes_pos, r_label_pos, g_label_pos, n_labels_pos, placn_features = deserialize(txn.get(str_id)).values()
+            nodes_pos, r_label_pos, g_label_pos, n_labels_pos = deserialize(txn.get(str_id)).values()
            subgraph_pos = self._prepare_subgraphs(nodes_pos, r_label_pos, n_labels_pos)
        subgraphs_neg = []
        r_labels_neg = []
@@ -158,7 +177,7 @@ def __getitem__(self, index):
                r_labels_neg.append(r_label_neg)
                g_labels_neg.append(g_label_neg)

-        return subgraph_pos, g_label_pos, r_label_pos, subgraphs_neg, g_labels_neg, r_labels_neg, placn_features
+        return subgraph_pos, g_label_pos, r_label_pos, subgraphs_neg, g_labels_neg, r_labels_neg

    def __len__(self):
        return self.num_graphs_pos
@@ -188,18 +207,18 @@ def _prepare_features_placn(self, nodes, subgraph, n_labels, n_feats=None):
        n_nodes = subgraph.number_of_nodes()
        label_feats = np.zeros((n_nodes, len(n_labels)))
        label_feats[np.array(np.arange(n_nodes)), n_labels] = 1
-        placn_subfeats=[]
+        placn_subfeats = np.zeros((n_nodes, self.placn_size))
        for i in range(0, n_nodes):
            ith = np.zeros((n_nodes * 3))
            for j in range(0, n_nodes):
                # We always assign zero to the positive target link in the adjacency matrix of the
                # weighted graph: at test time a positive link must not carry any information about
                # its own existence.
                for f in range(0, 3):
-                    ith[3*j + f] = placn_features[i][j][f] if i!=j else 0
-            placn_subfeats[] = ith
+                    ith[3*j + f] = self.placn_features[i][j][f] if i != j else 0
+            # store node i's flattened pair features as row i (assumes placn_size == 3 * n_nodes)
+            placn_subfeats[i] = ith
        n_feats = np.concatenate((label_feats, n_feats), axis=1) if n_feats is not None else label_feats
        n_feats = np.concatenate((n_feats, placn_subfeats), axis=1)  # keep 2-D: (n_nodes, d)
        subgraph.ndata['feat'] = torch.FloatTensor(n_feats)

        head_id = np.argwhere([label == 0 for label in n_labels])
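Note: the three pair features filled in above are standard link-prediction heuristics. A minimal self-contained sketch for reference (the toy adj dict and pair_features function are illustrative only, not part of this repo; also note that classic Adamic-Adar sums 1/log(deg) over the common neighbours, which differs from the degree sum over the union computed in the commit):

import numpy as np

# toy undirected graph as an adjacency dict (illustrative only)
adj = {0: {1, 2}, 1: {0, 2, 3}, 2: {0, 1}, 3: {1}}

def pair_features(i, j):
    """Common neighbours, Jaccard coefficient, and classic Adamic-Adar for (i, j)."""
    cn = adj[i] & adj[j]                   # common neighbours
    union = adj[i] | adj[j]
    jaccard = len(cn) / len(union) if union else 0.0
    # Adamic-Adar down-weights high-degree shared neighbours
    aa = sum(1.0 / np.log(len(adj[k])) for k in cn if len(adj[k]) > 1)
    return len(cn), jaccard, aa

print(pair_features(0, 1))  # (1, 0.25, 1.4426...)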
9 changes: 6 additions & 3 deletions subgraph_extraction/graph_sampler.py
@@ -235,8 +235,11 @@ def placn_node_label(subgraph, k):
        h_j = dist_to_roots[1][r]
        # weights not available, just use distance
        d = (h_i + h_j) / 2
-        if(d > .5 && d <= k) #worse case is K hops if graph is a straight line of nodes
+        if d > .5 and d <= k:  # worst case is k hops if the graph is a straight line of nodes
            node_map += [d]
-        else
+        else:
            node_map += [k]
-    return np.argsort(np.argsort(node_map)), range(subgraph.shape[0])
+    r = np.argsort(np.argsort(node_map))  # double argsort converts distances to ranks
+    print(r)
+
+    return r, range(subgraph.shape[0])
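Note: np.argsort(np.argsort(x)) is the usual NumPy idiom for turning scores into ranks, which is what the relabeling above relies on. A quick illustration with made-up distances:

import numpy as np

node_map = np.array([2.0, 0.5, 1.5])       # per-node average distance to the two roots
ranks = np.argsort(np.argsort(node_map))   # rank of each entry, smallest distance -> rank 0
print(ranks)                               # [2 0 1]: node 1 sits closest to the roots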
2 changes: 0 additions & 2 deletions train.py
@@ -22,9 +22,7 @@ def main(params):

    params.db_path = os.path.join(params.main_dir, f'data/{params.dataset}/subgraphs_en_{params.enclosing_sub_graph}_neg_{params.num_neg_samples_per_link}_hop_{params.hop}')

-    logging.info("omg here we go??")
    if not os.path.isdir(params.db_path):
-        logging.info("omg here we go")
        generate_subgraph_datasets(params)

    train = SubgraphDataset(params.db_path, 'train_pos', 'train_neg', params.file_paths,