Placn Edits #5

Draft
wants to merge 27 commits into base: master
fixes
Jae committed Aug 9, 2021
commit da2f3a0e10b9d7f70dfe6582e3922d35888337ff
49 changes: 34 additions & 15 deletions subgraph_extraction/datasets.py
@@ -7,6 +7,7 @@
import json
import pickle
import dgl
+import dgl.contrib.sampling
from utils.graph_utils import ssp_multigraph_to_dgl, incidence_matrix
from utils.data_utils import process_files, save_to_file, plot_rel_dist
from .graph_sampler import *
@@ -53,9 +54,10 @@ def get_kge_embeddings(dataset, kge_model):
    return node_features, kge_entity2id


+
class SubgraphDataset(Dataset):
    """Extracted, labeled, subgraph dataset -- DGL Only"""


    def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_relations=None, add_traspose_rels=False, num_neg_samples_per_link=1, use_kge_embeddings=False, dataset='', kge_model='', file_name='', placn_size=20):

@@ -73,7 +75,9 @@ def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_r
        if add_traspose_rels:
            ssp_graph_t = [adj.T for adj in ssp_graph]
            ssp_graph += ssp_graph_t


+        A_incidence = incidence_matrix(ssp_graph)
+        A_incidence += A_incidence.T
        # the effective number of relations after adding symmetric adjacency matrices and/or self connections
        self.aug_num_rels = len(ssp_graph)
        self.graph = ssp_multigraph_to_dgl(ssp_graph)
@@ -83,24 +87,39 @@ def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_r
        n_nodes = self.graph.number_of_nodes()
        # tensor of features, used to look up features by node pair (i, j)
        self.placn_features = np.zeros((n_nodes, n_nodes, 5))
-        for i in range(0,n_nodes):
-            i_nei = dgl.sampling.sample_neighbors(self.graph, np.array([i]), -1).nodes()
+        neighborCache = {}
+        for i in tqdm(range(0, n_nodes)):
+            if i in neighborCache:
+                i_nei = neighborCache[i]
+            else:
+                i_nei = get_neighbor_nodes(set([i]), A_incidence, 1, None)
+                neighborCache[i] = i_nei

            for j in range(0, n_nodes):
                if i == j: continue
-                j_nei = dgl.sampling.sample_neighbors(self.graph, np.array([j]), -1).nodes()
+                if j in neighborCache:
+                    j_nei = neighborCache[j]
+                else:
+                    j_nei = get_neighbor_nodes(set([j]), A_incidence, 1, None)
+                    neighborCache[j] = j_nei

                cn_set = set(i_nei)
                cn_set.intersection_update(set(j_nei))
-                placn_features[i][j][0] = len(cn_set)#Common neighboiurs
+                self.placn_features[i][j][0] = len(cn_set)  # common neighbours

                all_nei = set(i_nei)
                all_nei.update(j_nei)  # union() returns a new set; update() grows all_nei in place
-                placn_features[i][j][1] = len(cn_set) / len(all_nei) #Jerard coefficient
+                self.placn_features[i][j][1] = len(cn_set) / len(all_nei)  # Jaccard coefficient

                aa_sum = 0  # Adamic-Adar-style sum (note: classic Adamic-Adar sums 1/log(deg) over common neighbours only)
                for k in all_nei:
-                    aa_sum = aa_sum + len(dgl.sampling.sample_neighbors(self.graph, np.array([k]), -1).nodes())
-                placn_features[i][j][2] = aa_sum #adamic-adair
+                    if k in neighborCache:
+                        k_nei = neighborCache[k]
+                    else:
+                        k_nei = get_neighbor_nodes(set([k]), A_incidence, 1, None)
+                        neighborCache[k] = k_nei
+                    aa_sum = aa_sum + len(k_nei)
+                self.placn_features[i][j][2] = aa_sum  # Adamic-Adar

        self.ssp_graph = ssp_graph
@@ -145,7 +164,7 @@ def __init__(self, db_path, db_name_pos, db_name_neg, raw_data_paths, included_r
    def __getitem__(self, index):
        with self.main_env.begin(db=self.db_pos) as txn:
            str_id = '{:08}'.format(index).encode('ascii')
-            nodes_pos, r_label_pos, g_label_pos, n_labels_pos, placn_features = deserialize(txn.get(str_id)).values()
+            nodes_pos, r_label_pos, g_label_pos, n_labels_pos = deserialize(txn.get(str_id)).values()
            subgraph_pos = self._prepare_subgraphs(nodes_pos, r_label_pos, n_labels_pos)
        subgraphs_neg = []
        r_labels_neg = []
@@ -158,7 +177,7 @@ def __getitem__(self, index):
                r_labels_neg.append(r_label_neg)
                g_labels_neg.append(g_label_neg)

-        return subgraph_pos, g_label_pos, r_label_pos, subgraphs_neg, g_labels_neg, r_labels_neg, placn_features
+        return subgraph_pos, g_label_pos, r_label_pos, subgraphs_neg, g_labels_neg, r_labels_neg

    def __len__(self):
        return self.num_graphs_pos
@@ -188,18 +207,18 @@ def _prepare_features_placn(self, nodes, subgraph, n_labels, n_feats=None):
        n_nodes = subgraph.number_of_nodes()
        label_feats = np.zeros((n_nodes, len(n_labels)))
        label_feats[np.array(np.arange(n_nodes)), n_labels] = 1
-        placn_subfeats=[]
+        placn_subfeats = np.zeros((n_nodes, self.placn_size))
        for i in range(0, n_nodes):
            ith = np.zeros((n_nodes * 3))
            for j in range(0, n_nodes):
                # We always assign zero to the positive target link in the adjacency matrix of the
                # weighted graph: at test time a positive link must not carry any information about
                # its own existence.
                for f in range(0, 3):
-                    ith[3*j + f] = placn_features[i][j][f] if i!=j else 0
-            placn_subfeats[] = ith
+                    ith[3*j + f] = self.placn_features[i][j][f] if i != j else 0
+            # store node i's flattened pair features as row i (assumes placn_size == 3 * n_nodes)
+            placn_subfeats[i] = ith
        n_feats = np.concatenate((label_feats, n_feats), axis=1) if n_feats is not None else label_feats
        n_feats = np.concatenate((n_feats, placn_subfeats), axis=1)  # keep 2-D: (n_nodes, d)
        subgraph.ndata['feat'] = torch.FloatTensor(n_feats)

        head_id = np.argwhere([label == 0 for label in n_labels])
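Note: the three pair features filled in above are standard link-prediction heuristics. A minimal self-contained sketch for reference (the toy adj dict and pair_features function are illustrative only, not part of this repo; also note that classic Adamic-Adar sums 1/log(deg) over the common neighbours, which differs from the degree sum over the union computed in the commit):

import numpy as np

# toy undirected graph as an adjacency dict (illustrative only)
adj = {0: {1, 2}, 1: {0, 2, 3}, 2: {0, 1}, 3: {1}}

def pair_features(i, j):
    """Common neighbours, Jaccard coefficient, and classic Adamic-Adar for (i, j)."""
    cn = adj[i] & adj[j]                   # common neighbours
    union = adj[i] | adj[j]
    jaccard = len(cn) / len(union) if union else 0.0
    # Adamic-Adar down-weights high-degree shared neighbours
    aa = sum(1.0 / np.log(len(adj[k])) for k in cn if len(adj[k]) > 1)
    return len(cn), jaccard, aa

print(pair_features(0, 1))  # (1, 0.25, 1.4426...)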
9 changes: 6 additions & 3 deletions subgraph_extraction/graph_sampler.py
@@ -235,8 +235,11 @@ def placn_node_label(subgraph, k):
        h_j = dist_to_roots[1][r]
        # weights not available, just use distance
        d = (h_i + h_j) / 2
-        if(d > .5 && d <= k) #worse case is K hops if graph is a straight line of nodes
+        if d > .5 and d <= k:  # worst case is k hops if the graph is a straight line of nodes
            node_map += [d]
-        else
+        else:
            node_map += [k]
-    return np.argsort(np.argsort(node_map)), range(subgraph.shape[0])
+    r = np.argsort(np.argsort(node_map))  # double argsort converts distances to ranks
+    print(r)
+
+    return r, range(subgraph.shape[0])
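Note: np.argsort(np.argsort(x)) is the usual NumPy idiom for turning scores into ranks, which is what the relabeling above relies on. A quick illustration with made-up distances:

import numpy as np

node_map = np.array([2.0, 0.5, 1.5])       # per-node average distance to the two roots
ranks = np.argsort(np.argsort(node_map))   # rank of each entry, smallest distance -> rank 0
print(ranks)                               # [2 0 1]: node 1 sits closest to the roots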
2 changes: 0 additions & 2 deletions train.py
@@ -22,9 +22,7 @@ def main(params):

    params.db_path = os.path.join(params.main_dir, f'data/{params.dataset}/subgraphs_en_{params.enclosing_sub_graph}_neg_{params.num_neg_samples_per_link}_hop_{params.hop}')

-    logging.info("omg here we go??")
    if not os.path.isdir(params.db_path):
-        logging.info("omg here we go")
        generate_subgraph_datasets(params)

    train = SubgraphDataset(params.db_path, 'train_pos', 'train_neg', params.file_paths,