Placn Edits #5

Draft: wants to merge 27 commits into base: master

Commit: placn
Jae committed Aug 1, 2021
commit 5ee8b287afa6450ec05780ec62e8da60856586aa
33 changes: 9 additions & 24 deletions subgraph_extraction/datasets.py
@@ -80,10 +80,9 @@ def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_r
      self.id2entity = id2entity
      self.id2relation = id2relation

-     self.max_n_label = np.array([0, 0])
+     self.max_n_label = 0
      with self.main_env.begin() as txn:
-         self.max_n_label[0] = int.from_bytes(txn.get('max_n_label_sub'.encode()), byteorder='little')
-         self.max_n_label[1] = int.from_bytes(txn.get('max_n_label_obj'.encode()), byteorder='little')
+         self.max_n_label = int.from_bytes(txn.get('max_n_label'.encode()), byteorder='little')

          self.avg_subgraph_size = struct.unpack('f', txn.get('avg_subgraph_size'.encode()))
          self.min_subgraph_size = struct.unpack('f', txn.get('min_subgraph_size'.encode()))
@@ -100,7 +99,7 @@ def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_r
          self.max_num_pruned_nodes = struct.unpack('f', txn.get('max_num_pruned_nodes'.encode()))
          self.std_num_pruned_nodes = struct.unpack('f', txn.get('std_num_pruned_nodes'.encode()))

- logging.info(f"Max distance from sub : {self.max_n_label[0]}, Max distance from obj : {self.max_n_label[1]}")
+ logging.info(f"Max node label distance: {self.max_n_label}")

  # logging.info('=====================')
  # logging.info(f"Subgraph size stats: \n Avg size {self.avg_subgraph_size}, \n Min size {self.min_subgraph_size}, \n Max size {self.max_subgraph_size}, \n Std {self.std_subgraph_size}")
@@ -158,31 +157,17 @@ def _prepare_subgraphs(self, nodes, r_label, n_labels):

      return subgraph

- def _prepare_features(self, subgraph, n_labels, n_feats=None):
+ def _prepare_features_placn(self, subgraph, n_labels, n_feats=None):
      # One-hot encode the node label feature and concat to n_feats
      n_nodes = subgraph.number_of_nodes()
-     label_feats = np.zeros((n_nodes, self.max_n_label[0] + 1))
-     label_feats[np.arange(n_nodes), n_labels] = 1
-     label_feats[np.arange(n_nodes), self.max_n_label[0] + 1 + n_labels[:, 1]] = 1
-     n_feats = np.concatenate((label_feats, n_feats), axis=1) if n_feats else label_feats
-     subgraph.ndata['feat'] = torch.FloatTensor(n_feats)
-     self.n_feat_dim = n_feats.shape[1]  # Find cleaner way to do this -- i.e. set the n_feat_dim
-     return subgraph
-
- def _prepare_features_new(self, subgraph, n_labels, n_feats=None):
-     # One-hot encode the node label feature and concat to n_feats
-     n_nodes = subgraph.number_of_nodes()
-     label_feats = np.zeros((n_nodes, self.max_n_label[0] + 1 + self.max_n_label[1] + 1))
-     label_feats[np.arange(n_nodes), n_labels[:, 0]] = 1
-     label_feats[np.arange(n_nodes), self.max_n_label[0] + 1 + n_labels[:, 1]] = 1
-     # label_feats = np.zeros((n_nodes, self.max_n_label[0] + 1 + self.max_n_label[1] + 1))
-     # label_feats[np.arange(n_nodes), 0] = 1
-     # label_feats[np.arange(n_nodes), self.max_n_label[0] + 1] = 1
+     # Width 2 * (max_n_label + 1): the second one-hot assignment below would overflow a width of max_n_label + 1.
+     label_feats = np.zeros((n_nodes, 2 * (self.max_n_label + 1)))
+     label_feats[np.arange(n_nodes), n_labels[:]] = 1
+     label_feats[np.arange(n_nodes), self.max_n_label + 1 + n_labels[:]] = 1
      n_feats = np.concatenate((label_feats, n_feats), axis=1) if n_feats is not None else label_feats
      subgraph.ndata['feat'] = torch.FloatTensor(n_feats)

-     head_id = np.argwhere([label[0] == 0 and label[1] == 1 for label in n_labels])
-     tail_id = np.argwhere([label[0] == 1 and label[1] == 0 for label in n_labels])
+     head_id = 0  # roots are placed first by the PLACN labeling
+     tail_id = 1
      n_ids = np.zeros(n_nodes)
      n_ids[head_id] = 1  # head
      n_ids[tail_id] = 2  # tail
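
Review note: a minimal sketch of what the new single-label one-hot in `_prepare_features_placn` computes, assuming `max_n_label` is the scalar max distance and `n_labels` holds one integer label per node (all values below are toy data). It also shows why the feature width must cover both one-hot segments:

```python
import numpy as np

max_n_label = 3                   # scalar max node-label distance (toy value)
n_labels = np.array([0, 1, 2])    # one distance label per node (toy values)
n_nodes = len(n_labels)

# Two segments of width max_n_label + 1, so the second assignment stays in bounds.
label_feats = np.zeros((n_nodes, 2 * (max_n_label + 1)))
label_feats[np.arange(n_nodes), n_labels] = 1
label_feats[np.arange(n_nodes), max_n_label + 1 + n_labels] = 1
print(label_feats.shape)  # (3, 8)
```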
140 changes: 54 additions & 86 deletions subgraph_extraction/graph_sampler.py
@@ -70,7 +70,7 @@ def links2subgraphs(A, graphs, params, max_label_value=None):
      '''
      extract enclosing subgraphs, write map mode + named dbs
      '''
-     max_n_label = {'value': np.array([0, 0])}
+     max_n_label = {'value': 0}
      subgraph_sizes = []
      enc_ratios = []
      num_pruned_nodes = []
@@ -91,7 +91,7 @@ def extraction_helper(A, links, g_labels, split_env):
      with mp.Pool(processes=None, initializer=intialize_worker, initargs=(A, params, max_label_value)) as p:
          args_ = zip(range(len(links)), links, g_labels)
          for (str_id, datum) in tqdm(p.imap(extract_save_subgraph, args_), total=len(links)):
-             max_n_label['value'] = np.maximum(np.max(datum['n_labels'], axis=0), max_n_label['value'])
+             max_n_label['value'] = np.maximum(np.max(datum['n_labels']), max_n_label['value'])
              subgraph_sizes.append(datum['subgraph_size'])
              enc_ratios.append(datum['enc_ratio'])
              num_pruned_nodes.append(datum['num_pruned_nodes'])
@@ -115,10 +115,8 @@ def extraction_helper(A, links, g_labels, split_env):
      max_n_label['value'] = max_label_value if max_label_value is not None else max_n_label['value']

      with env.begin(write=True) as txn:
-         bit_len_label_sub = int.bit_length(int(max_n_label['value'][0]))
-         bit_len_label_obj = int.bit_length(int(max_n_label['value'][1]))
-         txn.put('max_n_label_sub'.encode(), (int(max_n_label['value'][0])).to_bytes(bit_len_label_sub, byteorder='little'))
-         txn.put('max_n_label_obj'.encode(), (int(max_n_label['value'][1])).to_bytes(bit_len_label_obj, byteorder='little'))
+         bit_len_label = int.bit_length(int(max_n_label['value']))
+         txn.put('max_n_label'.encode(), (int(max_n_label['value'])).to_bytes(bit_len_label, byteorder='little'))

          txn.put('avg_subgraph_size'.encode(), struct.pack('f', float(np.mean(subgraph_sizes))))
          txn.put('min_subgraph_size'.encode(), struct.pack('f', float(np.min(subgraph_sizes))))
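
Review note: the serialization round-trips, but `int.bit_length` counts bits while `to_bytes` expects a byte count, so the buffer is over-allocated. A standalone sketch (no LMDB needed) showing the behavior and a tighter alternative:

```python
for value in (0, 5, 300):
    n = int.bit_length(int(value))                       # bits, used here as a byte count
    encoded = int(value).to_bytes(n, byteorder='little')
    assert int.from_bytes(encoded, byteorder='little') == value  # round-trips, just wastefully

# Tighter: allocate whole bytes, at least one.
tight = lambda v: v.to_bytes(max(1, (v.bit_length() + 7) // 8), byteorder='little')
```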
@@ -155,7 +153,7 @@ def extract_save_subgraph(args_):
      nodes, n_labels, subgraph_size, enc_ratio, num_pruned_nodes = subgraph_extraction_labeling((n1, n2), r_label, A_, params_.hop, params_.enclosing_sub_graph, params_.max_nodes_per_hop, None, params_)

      # max_label_value_ sets the maximum possible node label value during double-radius labelling.
-     if max_label_value_ is not None:
+     if max_label_value_ is not None and not params_.placn_subgraph_size:
          n_labels = np.array([np.minimum(label, max_label_value_).tolist() for label in n_labels])

      datum = {'nodes': nodes, 'r_label': r_label, 'g_label': g_label, 'n_labels': n_labels, 'subgraph_size': subgraph_size, 'enc_ratio': enc_ratio, 'num_pruned_nodes': num_pruned_nodes}
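
Review note: the new guard relies on truthiness, so a `placn_subgraph_size` of 0 or None silently falls back to the old clipping path. A small sketch with a hypothetical `params_` object:

```python
from types import SimpleNamespace

max_label_value_ = 4  # pretend a label cap is configured
for size in (None, 0, 10):
    params_ = SimpleNamespace(placn_subgraph_size=size)
    if max_label_value_ is not None and not params_.placn_subgraph_size:
        print(size, '-> labels clipped to', max_label_value_)
    else:
        print(size, '-> PLACN path, labels kept as-is')
```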
@@ -176,13 +174,29 @@ def get_neighbor_nodes(roots, adj, h=1, max_nodes_per_hop=None):


  def subgraph_extraction_labeling(ind, rel, A_list, h=1, enclosing_sub_graph=False, max_nodes_per_hop=None, max_node_label_value=None, params=[]):
-     if not params.placn_subgraphs:
-         # extract the h-hop enclosing subgraphs around link 'ind'
+     # Proof that we don't need to union the expanding subgraphs on each iteration, as stated in the PLACN paper:
+     # L_h(i) ⊆ L_{h+1}(i) and L_h(j) ⊆ L_{h+1}(j), so for any node x:
+     #   x ∈ L_h(i) ∩ L_h(j)
+     #   => x ∈ L_h(i) and x ∈ L_h(j)
+     #   => x ∈ L_{h+1}(i) and x ∈ L_{h+1}(j)
+     #   => x ∈ L_{h+1}(i) ∩ L_{h+1}(j)
+     # i.e. each hop's intersection is contained in the next hop's, so growing h alone is enough.
+
+     hop = 1
+     subgraph_nodes = []
+     while len(subgraph_nodes) < params.placn_subgraph_size and hop < 10:
          A_incidence = incidence_matrix(A_list)
          A_incidence += A_incidence.T

-         root1_nei = get_neighbor_nodes(set([ind[0]]), A_incidence, h, max_nodes_per_hop)
-         root2_nei = get_neighbor_nodes(set([ind[1]]), A_incidence, h, max_nodes_per_hop)
+         root1_nei = get_neighbor_nodes(set([ind[0]]), A_incidence, hop, None)
+         root2_nei = get_neighbor_nodes(set([ind[1]]), A_incidence, hop, None)

          subgraph_nei_nodes_int = root1_nei.intersection(root2_nei)
          subgraph_nei_nodes_un = root1_nei.union(root2_nei)
@@ -192,88 +206,42 @@ def subgraph_extraction_labeling(ind, rel, A_list, h=1, enclosing_sub_graph=False, max_nodes_per_hop=None, max_node_label_value=None, params=[]):
          if enclosing_sub_graph:
              subgraph_nodes = list(ind) + list(subgraph_nei_nodes_int)
          else:
              subgraph_nodes = list(ind) + list(subgraph_nei_nodes_un)
+         hop = hop + 1

-     subgraph = [adj[subgraph_nodes, :][:, subgraph_nodes] for adj in A_list]
-
-     labels, enclosing_subgraph_nodes = node_label(incidence_matrix(subgraph), max_distance=h)
-
-     pruned_subgraph_nodes = np.array(subgraph_nodes)[enclosing_subgraph_nodes].tolist()
-     pruned_labels = labels[enclosing_subgraph_nodes]
-     # pruned_subgraph_nodes = subgraph_nodes
-     # pruned_labels = labels
+     subgraph = [adj[subgraph_nodes, :][:, subgraph_nodes] for adj in A_list]

-     if max_node_label_value is not None:
-         pruned_labels = np.array([np.minimum(label, max_node_label_value).tolist() for label in pruned_labels])
-
-     subgraph_size = len(pruned_subgraph_nodes)
-     enc_ratio = len(subgraph_nei_nodes_int) / (len(subgraph_nei_nodes_un) + 1e-3)
-     num_pruned_nodes = len(subgraph_nodes) - len(pruned_subgraph_nodes)
-
-     return pruned_subgraph_nodes, pruned_labels, subgraph_size, enc_ratio, num_pruned_nodes
-     else:
-         # proof we don't need to union the expanding subgraphs on each iteration as listed in the PLACN paper
-         # L_h(i) ⊆ L_{h+1}(i)
-         # L_h(j) ⊆ L_{h+1}(j)
-
-         # x ∈ L_h(i) ∩ L_h(j)
-         # x ∈ L_h(i) and x ∈ L_h(j)
-
-         # x ∈ L_h(i) => x ∈ L_{h+1}(i)
-         # x ∈ L_h(j) => x ∈ L_{h+1}(j)
-
-         # x ∈ L_{h+1}(i) and x ∈ L_{h+1}(j)
-         # x ∈ L_{h+1}(i) ∩ L_{h+1}(j)
-
-         hop = 1
-         subgraph_nodes = []
-         while len(subgraph_nodes) < params.placn_subgraph_size and hop < 10:
-             A_incidence = incidence_matrix(A_list)
-             A_incidence += A_incidence.T
+     labels, enclosing_subgraph_nodes = placn_node_label(incidence_matrix(subgraph), max_distance=hop)

-             root1_nei = get_neighbor_nodes(set([ind[0]]), A_incidence, hop, None)
-             root2_nei = get_neighbor_nodes(set([ind[1]]), A_incidence, hop, None)
+     pruned_subgraph_nodes = np.array(subgraph_nodes)[enclosing_subgraph_nodes].tolist()[:params.placn_subgraph_size]  # guarantee K size (placn)
+     pruned_labels = labels[enclosing_subgraph_nodes]

-             subgraph_nei_nodes_int = root1_nei.intersection(root2_nei)
-             subgraph_nei_nodes_un = root1_nei.union(root2_nei)
+     subgraph_size = len(pruned_subgraph_nodes)
+     enc_ratio = len(subgraph_nei_nodes_int) / (len(subgraph_nei_nodes_un) + 1e-3)
+     num_pruned_nodes = len(subgraph_nodes) - len(pruned_subgraph_nodes)

-             # Extract subgraph | Roots being in the front is essential for labelling and the model to work properly.
-             if enclosing_sub_graph:
-                 subgraph_nodes = list(ind) + list(subgraph_nei_nodes_int)
-             else:
-                 subgraph_nodes = list(ind) + list(subgraph_nei_nodes_un)
-             hop = hop + 1
-
-         subgraph = [adj[subgraph_nodes, :][:, subgraph_nodes] for adj in A_list]
-
-         labels, enclosing_subgraph_nodes = node_label(incidence_matrix(subgraph), max_distance=hop)
-
-         pruned_subgraph_nodes = np.array(subgraph_nodes)[enclosing_subgraph_nodes].tolist()
-         pruned_labels = labels[enclosing_subgraph_nodes]
-         # pruned_subgraph_nodes = subgraph_nodes
-         # pruned_labels = labels
-
-         if max_node_label_value is not None:
-             pruned_labels = np.array([np.minimum(label, max_node_label_value).tolist() for label in pruned_labels])
-
-         subgraph_size = len(pruned_subgraph_nodes)
-         enc_ratio = len(subgraph_nei_nodes_int) / (len(subgraph_nei_nodes_un) + 1e-3)
-         num_pruned_nodes = len(subgraph_nodes) - len(pruned_subgraph_nodes)
-
-         return pruned_subgraph_nodes, pruned_labels, subgraph_size, enc_ratio, num_pruned_nodes
+     return pruned_subgraph_nodes, pruned_labels, subgraph_size, enc_ratio, num_pruned_nodes
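
Review note: the monotonicity claim in the comments (the h-hop intersection only grows with h, so no union across iterations is needed) checks out empirically; a self-contained sketch on a toy path graph, using the same scipy dijkstra the repo relies on:

```python
import scipy.sparse as ssp
from scipy.sparse.csgraph import dijkstra

# Toy path graph 0-2-3-4-5-1; the link roots are nodes 0 and 1.
edges = [(0, 2), (2, 3), (3, 4), (4, 5), (5, 1)]
A = ssp.lil_matrix((6, 6))
for u, v in edges:
    A[u, v] = A[v, u] = 1
dist = dijkstra(A.tocsr(), directed=False, unweighted=True, indices=[0, 1])

def hop_neighbors(root_row, h):
    return {n for n in range(6) if 0 < dist[root_row][n] <= h}

prev = set()
for h in range(1, 6):
    inter = hop_neighbors(0, h) & hop_neighbors(1, h)
    assert prev <= inter  # L_h(i) ∩ L_h(j) ⊆ L_{h+1}(i) ∩ L_{h+1}(j)
    prev = inter
```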



- def node_label(subgraph, max_distance=1):
-     # implementation of the node labeling scheme described in the paper
+ def placn_node_label(subgraph, max_distance=1, k=6):
+     # implementation of the node labeling scheme described in PLACN
+     # k: target subgraph size (unused in this draft)

      roots = [0, 1]
-     sgs_single_root = [remove_nodes(subgraph, [root]) for root in roots]
-     dist_to_roots = [np.clip(ssp.csgraph.dijkstra(sg, indices=[0], directed=False, unweighted=True, limit=1e6)[:, 1:], 0, 1e7) for r, sg in enumerate(sgs_single_root)]
-     dist_to_roots = np.array(list(zip(dist_to_roots[0][0], dist_to_roots[1][0])), dtype=int)
-
-     target_node_labels = np.array([[0, 1], [1, 0]])
-     labels = np.concatenate((target_node_labels, dist_to_roots)) if dist_to_roots.size else target_node_labels
-
-     enclosing_subgraph_nodes = np.where(np.max(labels, axis=1) <= max_distance)[0]
+     rk = remove_nodes(subgraph, roots)  # currently unused
+     ordered_nodes = [root for root in roots]
+     node_map = []
+     # distances from each root (rows 0 and 1) to every node (columns)
+     dist_to_roots = np.clip(ssp.csgraph.dijkstra(subgraph, indices=[0, 1], directed=False, unweighted=True, min_only=False, limit=1e6), 0, 1e7)
+
+     for r in range(2, subgraph.shape[0]):  # every node except the two roots
+         h_i = dist_to_roots[0][r]
+         h_j = dist_to_roots[1][r]
+         # weights not available, just use distance
+         d = (h_i + h_j) / 2
+         node_map.append(d)
+
+     order = np.argsort(node_map)           # non-root nodes, closest-on-average first
+     ordered_nodes += (order + 2).tolist()  # +2 shifts back to original node indices
+     # NOTE: the draft returned an undefined `labels`; we assume here that the averaged
+     # distances (roots labelled 0) serve as the per-node scalar labels consumed downstream.
+     labels = np.concatenate(([0, 0], np.array(node_map, dtype=int)))
+     enclosing_subgraph_nodes = np.array([n for n in ordered_nodes if labels[n] <= max_distance], dtype=int)
      return labels, enclosing_subgraph_nodes
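
Review note: a self-contained sketch of the ordering the fixed `placn_node_label` is meant to produce, i.e. nodes ranked by average hop distance to the two roots (assumes the roots sit at indices 0 and 1; the 5-node graph is toy data):

```python
import numpy as np
import scipy.sparse as ssp
from scipy.sparse.csgraph import dijkstra

A = ssp.csr_matrix(np.array([
    [0, 0, 1, 1, 0],
    [0, 0, 1, 0, 1],
    [1, 1, 0, 0, 0],
    [1, 0, 0, 0, 1],
    [0, 1, 0, 1, 0]], dtype=float))

dist = np.clip(dijkstra(A, directed=False, unweighted=True, indices=[0, 1], min_only=False, limit=1e6), 0, 1e7)
avg = (dist[0, 2:] + dist[1, 2:]) / 2   # average distance of each candidate to both roots
order = np.argsort(avg) + 2             # original node indices, closest-on-average first
print(order.tolist())                   # [2, 3, 4] for this toy graph
```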
13 changes: 12 additions & 1 deletion utils/data_utils.py
@@ -58,7 +58,18 @@ def process_files(files, saved_relation2id=None):
      adj_list = []
      for i in range(len(relation2id)):
          idx = np.argwhere(triplets['train'][:, 2] == i)
-         adj_list.append(csc_matrix((np.ones(len(idx), dtype=np.uint8), (triplets['train'][:, 0][idx].squeeze(1), triplets['train'][:, 1][idx].squeeze(1))), shape=(len(entity2id), len(entity2id))))
+         adj_list.append(
+             csc_matrix(
+                 (
+                     np.ones(len(idx), dtype=np.uint8),
+                     (
+                         triplets['train'][:, 0][idx].squeeze(1),
+                         triplets['train'][:, 1][idx].squeeze(1)
+                     )
+                 ),
+                 shape=(len(entity2id), len(entity2id))
+             )
+         )

      return adj_list, triplets, entity2id, relation2id, id2entity, id2relation

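
Review note: the reformatted `csc_matrix` call is behavior-identical; a runnable sketch on toy triplets showing what `adj_list` ends up holding, one sparse adjacency matrix per relation (entity/relation counts are toy values):

```python
import numpy as np
from scipy.sparse import csc_matrix

train = np.array([[0, 1, 0],   # (head, tail, relation) toy triplets
                  [1, 2, 0],
                  [2, 3, 1]])
num_entities, num_relations = 4, 2

adj_list = []
for i in range(num_relations):
    idx = np.argwhere(train[:, 2] == i)
    adj_list.append(csc_matrix(
        (np.ones(len(idx), dtype=np.uint8),
         (train[:, 0][idx].squeeze(1), train[:, 1][idx].squeeze(1))),
        shape=(num_entities, num_entities)))

print([a.nnz for a in adj_list])  # [2, 1]
```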