diff --git a/docs/api/tgb.linkproppred.md b/docs/api/tgb.linkproppred.md index 21b2106..e47dd75 100644 --- a/docs/api/tgb.linkproppred.md +++ b/docs/api/tgb.linkproppred.md @@ -5,3 +5,7 @@ ::: tgb.linkproppred.evaluate ::: tgb.linkproppred.negative_sampler ::: tgb.linkproppred.negative_generator +::: tgb.linkproppred.tkg_negative_generator +::: tgb.linkproppred.tkg_negative_sampler +::: tgb.linkproppred.thg_negative_generator +::: tgb.linkproppred.thg_negative_sampler diff --git a/tgb/linkproppred/thg_negative_generator.py b/tgb/linkproppred/thg_negative_generator.py index 82c84be..da7832b 100644 --- a/tgb/linkproppred/thg_negative_generator.py +++ b/tgb/linkproppred/thg_negative_generator.py @@ -29,7 +29,7 @@ def __init__( edge_data: TemporalData = None, ) -> None: r""" - Negative Edge Sampler class + Negative Edge Generator class for Temporal Heterogeneous Graphs this is a class for generating negative samples for a specific datasets the set of the positive samples are provided, the negative samples are generated with specific strategies and are saved for consistent evaluation across different methods @@ -39,11 +39,10 @@ def __init__( first_node_id: the first node id last_node_id: the last node id node_type: the node type of each node - num_neg_e: number of negative edges being generated per each positive edge - strategy: specifies which strategy should be used for generating the negatives - rnd_seed: random seed for reproducibility - edge_data: the positive edges to generate the negatives for, assuming sorted temporally - + strategy: the strategy to generate negative samples + num_neg_e: number of negative samples to generate + rnd_seed: random seed + edge_data: the edge data object containing the positive edges Returns: None """ @@ -72,7 +71,6 @@ def get_destinations_based_on_node_type(self, node_type: np.ndarray) -> dict: r""" get the destination node id arrays based on the node type - Parameters: first_node_id: the first node id last_node_id: the last node id diff --git a/tgb/linkproppred/thg_negative_sampler.py b/tgb/linkproppred/thg_negative_sampler.py index 1ab281b..9b4ffa0 100644 --- a/tgb/linkproppred/thg_negative_sampler.py +++ b/tgb/linkproppred/thg_negative_sampler.py @@ -26,7 +26,7 @@ def __init__( r""" Negative Edge Sampler Loads and query the negative batches based on the positive batches provided. - constructor for the negative edge sampler class + constructor for the negative edge sampler class Parameters: dataset_name: name of the dataset @@ -124,16 +124,6 @@ def query_batch(self, neg_samples.append( neg_d_arr ) - - # conflict_set, d_node_type = conflict_dict[(pos_t, pos_s, e_type)] - - # all_dst = self.node_type_dict[d_node_type] - # # filtered_all_dst = np.delete(all_dst, conflict_set, axis=0) - # filtered_all_dst = np.setdiff1d(all_dst, conflict_set) - # neg_d_arr = filtered_all_dst - # neg_samples.append( - # neg_d_arr - # ) #? can't convert to numpy array due to different lengths of negative samples return neg_samples diff --git a/tgb/linkproppred/tkg_negative_generator.py b/tgb/linkproppred/tkg_negative_generator.py index 2f22525..6f2da6f 100644 --- a/tgb/linkproppred/tkg_negative_generator.py +++ b/tgb/linkproppred/tkg_negative_generator.py @@ -28,13 +28,8 @@ def __init__( edge_data: TemporalData = None, ) -> None: r""" - Negative Edge Sampler class - this is a class for generating negative samples for a specific datasets - the set of the positive samples are provided, the negative samples are generated with specific strategies - and are saved for consistent evaluation across different methods - negative edges are sampled with 'oen_vs_many' strategy. - it is assumed that the destination nodes are indexed sequentially with 'first_dst_id' - and 'last_dst_id' being the first and last index, respectively. + Negative Edge Generator class for Temporal Knowledge Graphs + constructor for the negative edge generator class Parameters: dataset_name: name of the dataset @@ -121,13 +116,6 @@ def generate_dst_dict(self, edge_data: TemporalData, dst_name: str) -> dict: edge_type_size = [] for key in dst_track_dict: dst = np.array(list(dst_track_dict[key].keys())) - # #* if there are too few dst, sample up to 1000 - # if len(dst) < 1000: - # dst_sampled = np.random.choice(np.arange(min_dst_idx, max_dst_idx+1), 1000, replace=False) - # while np.intersect1d(dst, dst_sampled).shape[0] != 0: - # dst_sampled = np.random.choice(np.arange(min_dst_idx, max_dst_idx+1), 1000, replace=False) - # dst_sampled[0:len(dst)] = dst[:] - # dst = dst_sampled edge_type_size.append(len(dst)) dst_dict[key] = dst print ('destination candidates generated for all edge types ', len(dst_dict)) @@ -401,118 +389,4 @@ def generate_negative_samples_random(self, evaluation_set[(pos_t, pos_s, edge_type)] = neg_d_arr save_pkl(evaluation_set, filename) - - - - - # def generate_negative_samples_ftr(self, - # data: TemporalData, - # split_mode: str, - # filename: str, - # ) -> None: - # r""" - # now we consider (s, d, t, edge_type) as a unique edge - # Generate negative samples based on the random strategy: - # - for each positive edge, sample a batch of negative edges from all possible edges with the same source node - # - filter actual positive edges at the same timestamp with the same edge type - - # Parameters: - # data: an object containing positive edges information - # split_mode: specifies whether to generate negative edges for 'validation' or 'test' splits - # filename: name of the file containing the generated negative edges - # """ - # print( - # f"INFO: Negative Sampling Strategy: {self.strategy}, Data Split: {split_mode}" - # ) - # assert split_mode in [ - # "val", - # "test", - # ], "Invalid split-mode! It should be `val` or `test`!" - - # if os.path.exists(filename): - # print( - # f"INFO: negative samples for '{split_mode}' evaluation are already generated!" - # ) - # else: - # print(f"INFO: Generating negative samples for '{split_mode}' evaluation!") - # # retrieve the information from the batch - # pos_src, pos_dst, pos_timestamp, edge_type = ( - # data.src.cpu().numpy(), - # data.dst.cpu().numpy(), - # data.t.cpu().numpy(), - # data.edge_type.cpu().numpy(), - # ) - - # # all possible destinations - # all_dst = np.arange(self.first_dst_id, self.last_dst_id + 1) - # evaluation_set = {} - # # generate a list of negative destinations for each positive edge - # pos_edge_tqdm = tqdm( - # zip(pos_src, pos_dst, pos_timestamp, edge_type), total=len(pos_src) - # ) - - # edge_t_dict = {} # {(t, u, edge_type): {v_1, v_2, ..} } - # #! iterate once to put all edges into a dictionary for reference - # for ( - # pos_s, - # pos_d, - # pos_t, - # edge_type, - # ) in pos_edge_tqdm: - # if (pos_t, pos_s, edge_type) not in edge_t_dict: - # edge_t_dict[(pos_t, pos_s, edge_type)] = {pos_d:1} - # else: - # edge_t_dict[(pos_t, pos_s, edge_type)][pos_d] = 1 - - # conflict_dict = {} - # for key in edge_t_dict: - # conflict_dict[key] = np.array(list(edge_t_dict[key].keys())) - - # print ("conflict sets for ns samples for ", len(conflict_dict), " positive edges are generated") - - # # save the generated evaluation set to disk - # save_pkl(conflict_dict, filename) - - # # pos_src, pos_dst, pos_timestamp, edge_type = ( - # # data.src.cpu().numpy(), - # # data.dst.cpu().numpy(), - # # data.t.cpu().numpy(), - # # data.edge_type.cpu().numpy(), - # # ) - - - # # # generate a list of negative destinations for each positive edge - # # pos_edge_tqdm = tqdm( - # # zip(pos_src, pos_dst, pos_timestamp, edge_type), total=len(pos_src) - # # ) - - - # # for ( - # # pos_s, - # # pos_d, - # # pos_t, - # # edge_type, - # # ) in pos_edge_tqdm: - - # # #! generate all negatives unless restricted - # # conflict_set = list(edge_t_dict[(pos_t, pos_s, edge_type)].keys()) - - # # # filter out positive destination - # # conflict_set = np.array(conflict_set) - # # filtered_all_dst = np.setdiff1d(all_dst, conflict_set) - - # # ''' - # # when num_neg_e is larger than all possible destinations simple return all possible destinations - # # ''' - # # if (self.num_neg_e < 0): - # # neg_d_arr = filtered_all_dst - # # elif (self.num_neg_e > len(filtered_all_dst)): - # # neg_d_arr = filtered_all_dst - # # else: - # # neg_d_arr = np.random.choice( - # # filtered_all_dst, self.num_neg_e, replace=False) #never replace negatives - - # # evaluation_set[(pos_s, pos_d, pos_t, edge_type)] = neg_d_arr - - # # # save the generated evaluation set to disk - # # save_pkl(evaluation_set, filename) + \ No newline at end of file diff --git a/tgb/linkproppred/tkg_negative_sampler.py b/tgb/linkproppred/tkg_negative_sampler.py index 1e6fd0d..38106d1 100644 --- a/tgb/linkproppred/tkg_negative_sampler.py +++ b/tgb/linkproppred/tkg_negative_sampler.py @@ -44,17 +44,7 @@ def __init__( self.last_dst_id = last_dst_id self.strategy = strategy self.dst_dict = None - # if self.strategy in ["dst-time-filtered"]: - # dst_dict_name = ( - # partial_path - # + "_" - # + "dst_dict" - # + ".pkl" - # ) - # if not os.path.exists(dst_dict_name): - # raise FileNotFoundError(f"File not found at {dst_dict_name}, dst_time_filtered strategy requires the dst_dict file") - # self.dst_dict = load_pkl(dst_dict_name) - + def load_eval_set( self, fname: str,