Placn Edits #5

Draft: wants to merge 27 commits into base: master

Commit: placn
Jae committed Aug 1, 2021
commit 5ee8b287afa6450ec05780ec62e8da60856586aa
33 changes: 9 additions & 24 deletions subgraph_extraction/datasets.py
@@ -80,10 +80,9 @@ def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_r
      self.id2entity = id2entity
      self.id2relation = id2relation

-     self.max_n_label = np.array([0, 0])
+     self.max_n_label = 0
      with self.main_env.begin() as txn:
-         self.max_n_label[0] = int.from_bytes(txn.get('max_n_label_sub'.encode()), byteorder='little')
-         self.max_n_label[1] = int.from_bytes(txn.get('max_n_label_obj'.encode()), byteorder='little')
+         self.max_n_label = int.from_bytes(txn.get('max_n_label'.encode()), byteorder='little')

          self.avg_subgraph_size = struct.unpack('f', txn.get('avg_subgraph_size'.encode()))
          self.min_subgraph_size = struct.unpack('f', txn.get('min_subgraph_size'.encode()))
@@ -100,7 +99,7 @@ def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_r
          self.max_num_pruned_nodes = struct.unpack('f', txn.get('max_num_pruned_nodes'.encode()))
          self.std_num_pruned_nodes = struct.unpack('f', txn.get('std_num_pruned_nodes'.encode()))

- logging.info(f"Max distance from sub : {self.max_n_label[0]}, Max distance from obj : {self.max_n_label[1]}")
+ logging.info(f"Max node label distance: {self.max_n_label}")

  # logging.info('=====================')
  # logging.info(f"Subgraph size stats: \n Avg size {self.avg_subgraph_size}, \n Min size {self.min_subgraph_size}, \n Max size {self.max_subgraph_size}, \n Std {self.std_subgraph_size}")
@@ -158,31 +157,17 @@ def _prepare_subgraphs(self, nodes, r_label, n_labels):

      return subgraph

- def _prepare_features(self, subgraph, n_labels, n_feats=None):
+ def _prepare_features_placn(self, subgraph, n_labels, n_feats=None):
      # One-hot encode the node label feature and concat to n_feats
      n_nodes = subgraph.number_of_nodes()
-     label_feats = np.zeros((n_nodes, self.max_n_label[0] + 1))
-     label_feats[np.arange(n_nodes), n_labels] = 1
-     label_feats[np.arange(n_nodes), self.max_n_label[0] + 1 + n_labels[:, 1]] = 1
-     n_feats = np.concatenate((label_feats, n_feats), axis=1) if n_feats else label_feats
-     subgraph.ndata['feat'] = torch.FloatTensor(n_feats)
-     self.n_feat_dim = n_feats.shape[1]  # Find cleaner way to do this -- i.e. set the n_feat_dim
-     return subgraph
-
- def _prepare_features_new(self, subgraph, n_labels, n_feats=None):
-     # One-hot encode the node label feature and concat to n_feats
-     n_nodes = subgraph.number_of_nodes()
-     label_feats = np.zeros((n_nodes, self.max_n_label[0] + 1 + self.max_n_label[1] + 1))
-     label_feats[np.arange(n_nodes), n_labels[:, 0]] = 1
-     label_feats[np.arange(n_nodes), self.max_n_label[0] + 1 + n_labels[:, 1]] = 1
-     # label_feats = np.zeros((n_nodes, self.max_n_label[0] + 1 + self.max_n_label[1] + 1))
-     # label_feats[np.arange(n_nodes), 0] = 1
-     # label_feats[np.arange(n_nodes), self.max_n_label[0] + 1] = 1
+     # Width 2 * (max_n_label + 1): the second one-hot assignment below would overflow a width of max_n_label + 1.
+     label_feats = np.zeros((n_nodes, 2 * (self.max_n_label + 1)))
+     label_feats[np.arange(n_nodes), n_labels[:]] = 1
+     label_feats[np.arange(n_nodes), self.max_n_label + 1 + n_labels[:]] = 1
      n_feats = np.concatenate((label_feats, n_feats), axis=1) if n_feats is not None else label_feats
      subgraph.ndata['feat'] = torch.FloatTensor(n_feats)

-     head_id = np.argwhere([label[0] == 0 and label[1] == 1 for label in n_labels])
-     tail_id = np.argwhere([label[0] == 1 and label[1] == 0 for label in n_labels])
+     head_id = 0  # roots are placed first by the PLACN labeling
+     tail_id = 1
      n_ids = np.zeros(n_nodes)
      n_ids[head_id] = 1  # head
      n_ids[tail_id] = 2  # tail
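
Review note: a minimal sketch of what the new single-label one-hot in `_prepare_features_placn` computes, assuming `max_n_label` is the scalar max distance and `n_labels` holds one integer label per node (all values below are toy data). It also shows why the feature width must cover both one-hot segments:

```python
import numpy as np

max_n_label = 3                   # scalar max node-label distance (toy value)
n_labels = np.array([0, 1, 2])    # one distance label per node (toy values)
n_nodes = len(n_labels)

# Two segments of width max_n_label + 1, so the second assignment stays in bounds.
label_feats = np.zeros((n_nodes, 2 * (max_n_label + 1)))
label_feats[np.arange(n_nodes), n_labels] = 1
label_feats[np.arange(n_nodes), max_n_label + 1 + n_labels] = 1
print(label_feats.shape)  # (3, 8)
```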
140 changes: 54 additions & 86 deletions subgraph_extraction/graph_sampler.py
@@ -70,7 +70,7 @@ def links2subgraphs(A, graphs, params, max_label_value=None):
      '''
      extract enclosing subgraphs, write map mode + named dbs
      '''
-     max_n_label = {'value': np.array([0, 0])}
+     max_n_label = {'value': 0}
      subgraph_sizes = []
      enc_ratios = []
      num_pruned_nodes = []
@@ -91,7 +91,7 @@ def extraction_helper(A, links, g_labels, split_env):
      with mp.Pool(processes=None, initializer=intialize_worker, initargs=(A, params, max_label_value)) as p:
          args_ = zip(range(len(links)), links, g_labels)
          for (str_id, datum) in tqdm(p.imap(extract_save_subgraph, args_), total=len(links)):
-             max_n_label['value'] = np.maximum(np.max(datum['n_labels'], axis=0), max_n_label['value'])
+             max_n_label['value'] = np.maximum(np.max(datum['n_labels']), max_n_label['value'])
              subgraph_sizes.append(datum['subgraph_size'])
              enc_ratios.append(datum['enc_ratio'])
              num_pruned_nodes.append(datum['num_pruned_nodes'])
@@ -115,10 +115,8 @@ def extraction_helper(A, links, g_labels, split_env):
      max_n_label['value'] = max_label_value if max_label_value is not None else max_n_label['value']

      with env.begin(write=True) as txn:
-         bit_len_label_sub = int.bit_length(int(max_n_label['value'][0]))
-         bit_len_label_obj = int.bit_length(int(max_n_label['value'][1]))
-         txn.put('max_n_label_sub'.encode(), (int(max_n_label['value'][0])).to_bytes(bit_len_label_sub, byteorder='little'))
-         txn.put('max_n_label_obj'.encode(), (int(max_n_label['value'][1])).to_bytes(bit_len_label_obj, byteorder='little'))
+         bit_len_label = int.bit_length(int(max_n_label['value']))
+         txn.put('max_n_label'.encode(), (int(max_n_label['value'])).to_bytes(bit_len_label, byteorder='little'))

          txn.put('avg_subgraph_size'.encode(), struct.pack('f', float(np.mean(subgraph_sizes))))
          txn.put('min_subgraph_size'.encode(), struct.pack('f', float(np.min(subgraph_sizes))))
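
Review note: the serialization round-trips, but `int.bit_length` counts bits while `to_bytes` expects a byte count, so the buffer is over-allocated. A standalone sketch (no LMDB needed) showing the behavior and a tighter alternative:

```python
for value in (0, 5, 300):
    n = int.bit_length(int(value))                       # bits, used here as a byte count
    encoded = int(value).to_bytes(n, byteorder='little')
    assert int.from_bytes(encoded, byteorder='little') == value  # round-trips, just wastefully

# Tighter: allocate whole bytes, at least one.
tight = lambda v: v.to_bytes(max(1, (v.bit_length() + 7) // 8), byteorder='little')
```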
@@ -155,7 +153,7 @@ def extract_save_subgraph(args_):
      nodes, n_labels, subgraph_size, enc_ratio, num_pruned_nodes = subgraph_extraction_labeling((n1, n2), r_label, A_, params_.hop, params_.enclosing_sub_graph, params_.max_nodes_per_hop, None, params_)

      # max_label_value_ sets the maximum possible node label value during double-radius labelling.
-     if max_label_value_ is not None:
+     if max_label_value_ is not None and not params_.placn_subgraph_size:
          n_labels = np.array([np.minimum(label, max_label_value_).tolist() for label in n_labels])

      datum = {'nodes': nodes, 'r_label': r_label, 'g_label': g_label, 'n_labels': n_labels, 'subgraph_size': subgraph_size, 'enc_ratio': enc_ratio, 'num_pruned_nodes': num_pruned_nodes}
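
Review note: the new guard relies on truthiness, so a `placn_subgraph_size` of 0 or None silently falls back to the old clipping path. A small sketch with a hypothetical `params_` object:

```python
from types import SimpleNamespace

max_label_value_ = 4  # pretend a label cap is configured
for size in (None, 0, 10):
    params_ = SimpleNamespace(placn_subgraph_size=size)
    if max_label_value_ is not None and not params_.placn_subgraph_size:
        print(size, '-> labels clipped to', max_label_value_)
    else:
        print(size, '-> PLACN path, labels kept as-is')
```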
@@ -176,13 +174,29 @@ def get_neighbor_nodes(roots, adj, h=1, max_nodes_per_hop=None):


  def subgraph_extraction_labeling(ind, rel, A_list, h=1, enclosing_sub_graph=False, max_nodes_per_hop=None, max_node_label_value=None, params=[]):
-     if not params.placn_subgraphs:
-         # extract the h-hop enclosing subgraphs around link 'ind'
+     # Proof that we don't need to union the expanding subgraphs on each iteration, as stated in the PLACN paper:
+     # L_h(i) ⊆ L_{h+1}(i) and L_h(j) ⊆ L_{h+1}(j), so for any node x:
+     #   x ∈ L_h(i) ∩ L_h(j)
+     #   => x ∈ L_h(i) and x ∈ L_h(j)
+     #   => x ∈ L_{h+1}(i) and x ∈ L_{h+1}(j)
+     #   => x ∈ L_{h+1}(i) ∩ L_{h+1}(j)
+     # i.e. each hop's intersection is contained in the next hop's, so growing h alone is enough.
+
+     hop = 1
+     subgraph_nodes = []
+     while len(subgraph_nodes) < params.placn_subgraph_size and hop < 10:
          A_incidence = incidence_matrix(A_list)
          A_incidence += A_incidence.T

-         root1_nei = get_neighbor_nodes(set([ind[0]]), A_incidence, h, max_nodes_per_hop)
-         root2_nei = get_neighbor_nodes(set([ind[1]]), A_incidence, h, max_nodes_per_hop)
+         root1_nei = get_neighbor_nodes(set([ind[0]]), A_incidence, hop, None)
+         root2_nei = get_neighbor_nodes(set([ind[1]]), A_incidence, hop, None)

          subgraph_nei_nodes_int = root1_nei.intersection(root2_nei)
          subgraph_nei_nodes_un = root1_nei.union(root2_nei)
@@ -192,88 +206,42 @@ def subgraph_extraction_labeling(ind, rel, A_list, h=1, enclosing_sub_graph=False, max_nodes_per_hop=None, max_node_label_value=None, params=[]):
          if enclosing_sub_graph:
              subgraph_nodes = list(ind) + list(subgraph_nei_nodes_int)
          else:
              subgraph_nodes = list(ind) + list(subgraph_nei_nodes_un)
+         hop = hop + 1

-     subgraph = [adj[subgraph_nodes, :][:, subgraph_nodes] for adj in A_list]
-
-     labels, enclosing_subgraph_nodes = node_label(incidence_matrix(subgraph), max_distance=h)
-
-     pruned_subgraph_nodes = np.array(subgraph_nodes)[enclosing_subgraph_nodes].tolist()
-     pruned_labels = labels[enclosing_subgraph_nodes]
-     # pruned_subgraph_nodes = subgraph_nodes
-     # pruned_labels = labels
+     subgraph = [adj[subgraph_nodes, :][:, subgraph_nodes] for adj in A_list]

-     if max_node_label_value is not None:
-         pruned_labels = np.array([np.minimum(label, max_node_label_value).tolist() for label in pruned_labels])
-
-     subgraph_size = len(pruned_subgraph_nodes)
-     enc_ratio = len(subgraph_nei_nodes_int) / (len(subgraph_nei_nodes_un) + 1e-3)
-     num_pruned_nodes = len(subgraph_nodes) - len(pruned_subgraph_nodes)
-
-     return pruned_subgraph_nodes, pruned_labels, subgraph_size, enc_ratio, num_pruned_nodes
-     else:
-         # proof we don't need to union the expanding subgraphs on each iteration as listed in the PLACN paper
-         # L_h(i) ⊆ L_{h+1}(i)
-         # L_h(j) ⊆ L_{h+1}(j)
-
-         # x ∈ L_h(i) ∩ L_h(j)
-         # x ∈ L_h(i) and x ∈ L_h(j)
-
-         # x ∈ L_h(i) => x ∈ L_{h+1}(i)
-         # x ∈ L_h(j) => x ∈ L_{h+1}(j)
-
-         # x ∈ L_{h+1}(i) and x ∈ L_{h+1}(j)
-         # x ∈ L_{h+1}(i) ∩ L_{h+1}(j)
-
-         hop = 1
-         subgraph_nodes = []
-         while len(subgraph_nodes) < params.placn_subgraph_size and hop < 10:
-             A_incidence = incidence_matrix(A_list)
-             A_incidence += A_incidence.T
+     labels, enclosing_subgraph_nodes = placn_node_label(incidence_matrix(subgraph), max_distance=hop)

-             root1_nei = get_neighbor_nodes(set([ind[0]]), A_incidence, hop, None)
-             root2_nei = get_neighbor_nodes(set([ind[1]]), A_incidence, hop, None)
+     pruned_subgraph_nodes = np.array(subgraph_nodes)[enclosing_subgraph_nodes].tolist()[:params.placn_subgraph_size]  # guarantee K size (placn)
+     pruned_labels = labels[enclosing_subgraph_nodes]

-             subgraph_nei_nodes_int = root1_nei.intersection(root2_nei)
-             subgraph_nei_nodes_un = root1_nei.union(root2_nei)
+     subgraph_size = len(pruned_subgraph_nodes)
+     enc_ratio = len(subgraph_nei_nodes_int) / (len(subgraph_nei_nodes_un) + 1e-3)
+     num_pruned_nodes = len(subgraph_nodes) - len(pruned_subgraph_nodes)

-             # Extract subgraph | Roots being in the front is essential for labelling and the model to work properly.
-             if enclosing_sub_graph:
-                 subgraph_nodes = list(ind) + list(subgraph_nei_nodes_int)
-             else:
-                 subgraph_nodes = list(ind) + list(subgraph_nei_nodes_un)
-             hop = hop + 1
-
-         subgraph = [adj[subgraph_nodes, :][:, subgraph_nodes] for adj in A_list]
-
-         labels, enclosing_subgraph_nodes = node_label(incidence_matrix(subgraph), max_distance=hop)
-
-         pruned_subgraph_nodes = np.array(subgraph_nodes)[enclosing_subgraph_nodes].tolist()
-         pruned_labels = labels[enclosing_subgraph_nodes]
-         # pruned_subgraph_nodes = subgraph_nodes
-         # pruned_labels = labels
-
-         if max_node_label_value is not None:
-             pruned_labels = np.array([np.minimum(label, max_node_label_value).tolist() for label in pruned_labels])
-
-         subgraph_size = len(pruned_subgraph_nodes)
-         enc_ratio = len(subgraph_nei_nodes_int) / (len(subgraph_nei_nodes_un) + 1e-3)
-         num_pruned_nodes = len(subgraph_nodes) - len(pruned_subgraph_nodes)
-
-         return pruned_subgraph_nodes, pruned_labels, subgraph_size, enc_ratio, num_pruned_nodes
+     return pruned_subgraph_nodes, pruned_labels, subgraph_size, enc_ratio, num_pruned_nodes
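
Review note: the monotonicity claim in the comments (the h-hop intersection only grows with h, so no union across iterations is needed) checks out empirically; a self-contained sketch on a toy path graph, using the same scipy dijkstra the repo relies on:

```python
import scipy.sparse as ssp
from scipy.sparse.csgraph import dijkstra

# Toy path graph 0-2-3-4-5-1; the link roots are nodes 0 and 1.
edges = [(0, 2), (2, 3), (3, 4), (4, 5), (5, 1)]
A = ssp.lil_matrix((6, 6))
for u, v in edges:
    A[u, v] = A[v, u] = 1
dist = dijkstra(A.tocsr(), directed=False, unweighted=True, indices=[0, 1])

def hop_neighbors(root_row, h):
    return {n for n in range(6) if 0 < dist[root_row][n] <= h}

prev = set()
for h in range(1, 6):
    inter = hop_neighbors(0, h) & hop_neighbors(1, h)
    assert prev <= inter  # L_h(i) ∩ L_h(j) ⊆ L_{h+1}(i) ∩ L_{h+1}(j)
    prev = inter
```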



- def node_label(subgraph, max_distance=1):
-     # implementation of the node labeling scheme described in the paper
+ def placn_node_label(subgraph, max_distance=1, k=6):
+     # implementation of the node labeling scheme described in PLACN
+     # k: target subgraph size (unused in this draft)

      roots = [0, 1]
-     sgs_single_root = [remove_nodes(subgraph, [root]) for root in roots]
-     dist_to_roots = [np.clip(ssp.csgraph.dijkstra(sg, indices=[0], directed=False, unweighted=True, limit=1e6)[:, 1:], 0, 1e7) for r, sg in enumerate(sgs_single_root)]
-     dist_to_roots = np.array(list(zip(dist_to_roots[0][0], dist_to_roots[1][0])), dtype=int)
-
-     target_node_labels = np.array([[0, 1], [1, 0]])
-     labels = np.concatenate((target_node_labels, dist_to_roots)) if dist_to_roots.size else target_node_labels
-
-     enclosing_subgraph_nodes = np.where(np.max(labels, axis=1) <= max_distance)[0]
+     rk = remove_nodes(subgraph, roots)  # currently unused
+     ordered_nodes = [root for root in roots]
+     node_map = []
+     # distances from each root (rows 0 and 1) to every node (columns)
+     dist_to_roots = np.clip(ssp.csgraph.dijkstra(subgraph, indices=[0, 1], directed=False, unweighted=True, min_only=False, limit=1e6), 0, 1e7)
+
+     for r in range(2, subgraph.shape[0]):  # every node except the two roots
+         h_i = dist_to_roots[0][r]
+         h_j = dist_to_roots[1][r]
+         # weights not available, just use distance
+         d = (h_i + h_j) / 2
+         node_map.append(d)
+
+     order = np.argsort(node_map)           # non-root nodes, closest-on-average first
+     ordered_nodes += (order + 2).tolist()  # +2 shifts back to original node indices
+     # NOTE: the draft returned an undefined `labels`; we assume here that the averaged
+     # distances (roots labelled 0) serve as the per-node scalar labels consumed downstream.
+     labels = np.concatenate(([0, 0], np.array(node_map, dtype=int)))
+     enclosing_subgraph_nodes = np.array([n for n in ordered_nodes if labels[n] <= max_distance], dtype=int)
      return labels, enclosing_subgraph_nodes
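
Review note: a self-contained sketch of the ordering the fixed `placn_node_label` is meant to produce, i.e. nodes ranked by average hop distance to the two roots (assumes the roots sit at indices 0 and 1; the 5-node graph is toy data):

```python
import numpy as np
import scipy.sparse as ssp
from scipy.sparse.csgraph import dijkstra

A = ssp.csr_matrix(np.array([
    [0, 0, 1, 1, 0],
    [0, 0, 1, 0, 1],
    [1, 1, 0, 0, 0],
    [1, 0, 0, 0, 1],
    [0, 1, 0, 1, 0]], dtype=float))

dist = np.clip(dijkstra(A, directed=False, unweighted=True, indices=[0, 1], min_only=False, limit=1e6), 0, 1e7)
avg = (dist[0, 2:] + dist[1, 2:]) / 2   # average distance of each candidate to both roots
order = np.argsort(avg) + 2             # original node indices, closest-on-average first
print(order.tolist())                   # [2, 3, 4] for this toy graph
```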
13 changes: 12 additions & 1 deletion utils/data_utils.py
@@ -58,7 +58,18 @@ def process_files(files, saved_relation2id=None):
      adj_list = []
      for i in range(len(relation2id)):
          idx = np.argwhere(triplets['train'][:, 2] == i)
-         adj_list.append(csc_matrix((np.ones(len(idx), dtype=np.uint8), (triplets['train'][:, 0][idx].squeeze(1), triplets['train'][:, 1][idx].squeeze(1))), shape=(len(entity2id), len(entity2id))))
+         adj_list.append(
+             csc_matrix(
+                 (
+                     np.ones(len(idx), dtype=np.uint8),
+                     (
+                         triplets['train'][:, 0][idx].squeeze(1),
+                         triplets['train'][:, 1][idx].squeeze(1)
+                     )
+                 ),
+                 shape=(len(entity2id), len(entity2id))
+             )
+         )

      return adj_list, triplets, entity2id, relation2id, id2entity, id2relation

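
Review note: the reformatted `csc_matrix` call is behavior-identical; a runnable sketch on toy triplets showing what `adj_list` ends up holding, one sparse adjacency matrix per relation (entity/relation counts are toy values):

```python
import numpy as np
from scipy.sparse import csc_matrix

train = np.array([[0, 1, 0],   # (head, tail, relation) toy triplets
                  [1, 2, 0],
                  [2, 3, 1]])
num_entities, num_relations = 4, 2

adj_list = []
for i in range(num_relations):
    idx = np.argwhere(train[:, 2] == i)
    adj_list.append(csc_matrix(
        (np.ones(len(idx), dtype=np.uint8),
         (train[:, 0][idx].squeeze(1), train[:, 1][idx].squeeze(1))),
        shape=(num_entities, num_entities)))

print([a.nnz for a in adj_list])  # [2, 1]
```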