Merge pull request #51 from JuliaGast/julia_new
Julia new: update comments for methods and modules
shenyangHuang committed Jun 10, 2024
2 parents 09d9cf5 + 9a6e939 commit e3737f6
Showing 33 changed files with 409 additions and 271 deletions.
16 changes: 14 additions & 2 deletions examples/linkproppred/thgl-forum/recurrencybaseline.py
@@ -33,6 +33,11 @@

def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,
perf_list_all, hits_list_all, window, neg_sampler, split_mode):
""" create predictions for each relation on test or valid set and compute mrr
:return perf_list_all: list of mrrs for each test query
:return hits_list_all: list of hits for each test query
"""

first_ts = data_c_rel[0][3]
## use this if you want to use ray:
num_queries = len(data_c_rel) // num_processes
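For orientation (not part of this commit): a minimal sketch of how a per-query reciprocal-rank list such as perf_list_all can be computed when the true object is scored against sampled negatives. mrr_per_query and its arguments are illustrative names, not this repository's API.

import numpy as np

def mrr_per_query(pos_scores, neg_scores):
    """Return one reciprocal rank per query.
    pos_scores: shape (num_queries,), score of the true object per query.
    neg_scores: shape (num_queries, num_negatives), scores of the sampled negatives."""
    pos = np.asarray(pos_scores)[:, None]
    neg = np.asarray(neg_scores)
    # rank = 1 + number of negatives scoring strictly higher, ties counted half
    ranks = (neg > pos).sum(axis=1) + 0.5 * (neg == pos).sum(axis=1) + 1.0
    return (1.0 / ranks).tolist()

# two queries, three negatives each; the mean of the list is the MRR
perf_list = mrr_per_query([0.9, 0.2], [[0.1, 0.5, 0.3], [0.4, 0.6, 0.1]])
print(perf_list, sum(perf_list) / len(perf_list))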
@@ -78,7 +83,7 @@ def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,

## test
def test(best_config, all_relations,test_data_prel, all_data_prel, neg_sampler, num_processes, window, split_mode='test'):
""" create predictions for each relation on test or valid set and compute mrr
""" create predictions by looping through all relations on the test or valid set and compute mrr
:return perf_list_all: list of mrrs for each test query
:return hits_list_all: list of hits for each test query
"""
@@ -121,6 +126,10 @@ def test(best_config, all_relations,test_data_prel, all_data_prel, neg_sampler,
return perf_list_all, hits_list_all

def read_dict_compute_mrr(split_mode='test'):
""" read the results per relation from a previously created file and compute mrr
:return mrr_per_rel: dictionary of mrrs for each relation
:return all_mrrs: list of mrrs for all relations
"""
csv_file = f'{perrel_results_path}/{MODEL_NAME}_NONE_{DATA}_results_{SEED}'+split_mode+'.csv'
# Initialize an empty dictionary to store the data
results_per_rel_dict = {}
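A hedged sketch of the kind of reader read_dict_compute_mrr describes, assuming (the diff does not show it) that each CSV row holds a relation id followed by one reciprocal rank; read_results_csv is an illustrative name.

import csv
from collections import defaultdict

def read_results_csv(csv_file):
    """Collect reciprocal ranks per relation and return per-relation MRRs plus all ranks."""
    per_rel = defaultdict(list)
    with open(csv_file, newline='') as f:
        for row in csv.reader(f):
            if not row:
                continue
            rel, rr = row[0], row[1]  # assumed column layout
            per_rel[int(rel)].append(float(rr))
    mrr_per_rel = {rel: sum(v) / len(v) for rel, v in per_rel.items()}
    all_rrs = [rr for v in per_rel.values() for rr in v]
    return mrr_per_rel, all_rrs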
@@ -153,7 +162,9 @@ def read_dict_compute_mrr(split_mode='test'):

## train
def train(params_dict, rels,val_data_prel, trainval_data_prel, neg_sampler, num_processes, window):
""" optional, find best values for lambda and alpha
""" optional, find best values for lambda and alpha by looping through all relations and testing a fixed set of params
based on validation mrr
:return best_config: dictionary of best params for each relation
"""
best_config= {}
best_mrr = 0
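A hypothetical sketch of the per-relation parameter search the train docstring describes; evaluate_relation stands in for the actual prediction code and is not part of the repository.

from itertools import product

def grid_search_per_relation(rels, params_dict, evaluate_relation):
    """params_dict is assumed to map 'alpha' and 'lmbda_psi' to lists of candidate values;
    evaluate_relation(rel, alpha, lmbda_psi) is assumed to return a validation MRR."""
    best_config = {}
    for rel in rels:
        best_mrr = -1.0
        for alpha, lmbda_psi in product(params_dict['alpha'], params_dict['lmbda_psi']):
            mrr = evaluate_relation(rel, alpha, lmbda_psi)
            if mrr > best_mrr:  # keep the best-scoring combination for this relation
                best_mrr = mrr
                best_config[rel] = {'alpha': alpha, 'lmbda_psi': lmbda_psi, 'mrr': mrr}
    return best_config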
@@ -243,6 +254,7 @@ def train(params_dict, rels,val_data_prel, trainval_data_prel, neg_sampler, num_

## args
def get_args():
"""parse all arguments for the script"""
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", "-d", default="thgl-forum", type=str)
parser.add_argument("--window", "-w", default=0, type=int) # set to e.g. 200 if only the most recent 200 timesteps should be considered. set to -2 if multistep
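The comment on --window suggests restricting history to the most recent timesteps; a hypothetical illustration of such a filter (apply_window is an illustrative name, and the repository's actual handling may differ).

def apply_window(history_quads, current_ts, window):
    """history_quads: iterable of (subject, relation, object, timestamp) tuples.
    window > 0 keeps only quadruples from the last `window` timesteps; window <= 0 keeps everything."""
    if window <= 0:
        return list(history_quads)
    return [q for q in history_quads if current_ts - q[3] < window]

print(apply_window([(0, 1, 2, 5), (0, 1, 3, 9)], current_ts=10, window=2))  # keeps only the second quad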
16 changes: 14 additions & 2 deletions examples/linkproppred/thgl-github/recurrencybaseline.py
@@ -33,6 +33,11 @@

def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,
perf_list_all, hits_list_all, window, neg_sampler, split_mode):
""" create predictions for each relation on test or valid set and compute mrr
:return perf_list_all: list of mrrs for each test query
:return hits_list_all: list of hits for each test query
"""

first_ts = data_c_rel[0][3]
## use this if you want to use ray:
num_queries = len(data_c_rel) // num_processes
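The num_queries = len(data_c_rel) // num_processes line and the ray comment point at chunked parallel prediction; a minimal, hypothetical sketch of that pattern with the standard-library multiprocessing module (score_chunk stands in for the real per-query scoring).

from multiprocessing import Pool

def split_into_chunks(queries, num_processes):
    """Split queries into num_processes near-equal chunks; the last chunk absorbs the remainder."""
    size = len(queries) // num_processes
    return [queries[i * size: (i + 1) * size] if i < num_processes - 1 else queries[i * size:]
            for i in range(num_processes)]

def score_chunk(chunk):
    return [1.0 for _ in chunk]  # dummy reciprocal rank per query

if __name__ == "__main__":
    with Pool(2) as pool:
        parts = pool.map(score_chunk, split_into_chunks(list(range(10)), 2))
    perf_list_all = [rr for part in parts for rr in part]
    print(len(perf_list_all))  # 10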
@@ -78,7 +83,7 @@ def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,

## test
def test(best_config, all_relations,test_data_prel, all_data_prel, neg_sampler, num_processes, window, split_mode='test'):
""" create predictions for each relation on test or valid set and compute mrr
""" create predictions by looping through all relations on the test or valid set and compute mrr
:return perf_list_all: list of mrrs for each test query
:return hits_list_all: list of hits for each test query
"""
@@ -121,6 +126,10 @@ def test(best_config, all_relations,test_data_prel, all_data_prel, neg_sampler,
return perf_list_all, hits_list_all

def read_dict_compute_mrr(split_mode='test'):
""" read the results per relation from a previously created file and compute mrr
:return mrr_per_rel: dictionary of mrrs for each relation
:return all_mrrs: list of mrrs for all relations
"""
csv_file = f'{perrel_results_path}/{MODEL_NAME}_NONE_{DATA}_results_{SEED}'+split_mode+'.csv'
# Initialize an empty dictionary to store the data
results_per_rel_dict = {}
@@ -153,7 +162,9 @@ def read_dict_compute_mrr(split_mode='test'):

## train
def train(params_dict, rels,val_data_prel, trainval_data_prel, neg_sampler, num_processes, window):
""" optional, find best values for lambda and alpha
""" optional, find best values for lambda and alpha by looping through all relations and testing a fixed set of params
based on validation mrr
:return best_config: dictionary of best params for each relation
"""
best_config= {}
best_mrr = 0
@@ -243,6 +254,7 @@ def train(params_dict, rels,val_data_prel, trainval_data_prel, neg_sampler, num_

## args
def get_args():
"""parse all arguments for the script"""
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", "-d", default="thgl-github", type=str)
parser.add_argument("--window", "-w", default=0, type=int) # set to e.g. 200 if only the most recent 200 timesteps should be considered. set to -2 if multistep
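Alongside the MRR lists, the functions above also return hits lists; a generic hits@k sketch for reference (hits_at_k is an illustrative name, not this repository's API).

import numpy as np

def hits_at_k(pos_scores, neg_scores, k=10):
    """Fraction of queries whose true object ranks within the top k (pessimistic tie handling)."""
    pos = np.asarray(pos_scores)[:, None]
    neg = np.asarray(neg_scores)
    ranks = (neg >= pos).sum(axis=1) + 1
    return float((ranks <= k).mean())

print(hits_at_k([0.9, 0.2], [[0.1, 0.5, 0.3], [0.4, 0.6, 0.1]], k=1))  # 0.5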
16 changes: 14 additions & 2 deletions examples/linkproppred/thgl-myket/recurrencybaseline.py
@@ -33,6 +33,11 @@

def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,
perf_list_all, hits_list_all, window, neg_sampler, split_mode):
""" create predictions for each relation on test or valid set and compute mrr
:return perf_list_all: list of mrrs for each test query
:return hits_list_all: list of hits for each test query
"""

first_ts = data_c_rel[0][3]
## use this if you want to use ray:
num_queries = len(data_c_rel) // num_processes
@@ -78,7 +83,7 @@ def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,

## test
def test(best_config, all_relations,test_data_prel, all_data_prel, neg_sampler, num_processes, window, split_mode='test'):
""" create predictions for each relation on test or valid set and compute mrr
""" create predictions by looping through all relations on the test or valid set and compute mrr
:return perf_list_all: list of mrrs for each test query
:return hits_list_all: list of hits for each test query
"""
@@ -121,6 +126,10 @@ def test(best_config, all_relations,test_data_prel, all_data_prel, neg_sampler,
return perf_list_all, hits_list_all

def read_dict_compute_mrr(split_mode='test'):
""" read the results per relation from a previously created file and compute mrr
:return mrr_per_rel: dictionary of mrrs for each relation
:return all_mrrs: list of mrrs for all relations
"""
csv_file = f'{perrel_results_path}/{MODEL_NAME}_NONE_{DATA}_results_{SEED}'+split_mode+'.csv'
# Initialize an empty dictionary to store the data
results_per_rel_dict = {}
@@ -153,7 +162,9 @@ def read_dict_compute_mrr(split_mode='test'):

## train
def train(params_dict, rels,val_data_prel, trainval_data_prel, neg_sampler, num_processes, window):
""" optional, find best values for lambda and alpha
""" optional, find best values for lambda and alpha by looping through all relations and testing a fixed set of params
based on validation mrr
:return best_config: dictionary of best params for each relation
"""
best_config= {}
best_mrr = 0
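For context, alpha and lmbda_psi in a recurrency baseline typically weight a time-decayed strict-recurrency score against a relaxed one; a rough, hypothetical sketch of that idea (the exact scoring in this repository may differ).

def recurrency_score(strict_ts, relaxed_count, current_ts, alpha, lmbda_psi):
    """strict_ts: timestamps at which this exact (subject, relation, object) was seen before;
    relaxed_count: how often the candidate object occurred with this relation at all (any subject)."""
    psi = min(1.0, sum(2 ** (-lmbda_psi * (current_ts - t)) for t in strict_ts))  # decayed strict recurrency
    xi = 1.0 if relaxed_count > 0 else 0.0  # indicator-style relaxed recurrency
    return alpha * psi + (1 - alpha) * xi

print(recurrency_score([8, 9], relaxed_count=3, current_ts=10, alpha=0.99, lmbda_psi=0.1))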
@@ -243,6 +254,7 @@ def train(params_dict, rels,val_data_prel, trainval_data_prel, neg_sampler, num_

## args
def get_args():
"""parse all arguments for the script"""
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", "-d", default="thgl-myket", type=str)
parser.add_argument("--window", "-w", default=0, type=int) # set to e.g. 200 if only the most recent 200 timesteps should be considered. set to -2 if multistep
16 changes: 14 additions & 2 deletions examples/linkproppred/thgl-software/recurrencybaseline.py
@@ -33,6 +33,11 @@

def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,
perf_list_all, hits_list_all, window, neg_sampler, split_mode):
""" create predictions for each relation on test or valid set and compute mrr
:return perf_list_all: list of mrrs for each test query
:return hits_list_all: list of hits for each test query
"""

first_ts = data_c_rel[0][3]
## use this if you want to use ray:
num_queries = len(data_c_rel) // num_processes
@@ -78,7 +83,7 @@ def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,

## test
def test(best_config, all_relations,test_data_prel, all_data_prel, neg_sampler, num_processes, window, split_mode='test'):
""" create predictions for each relation on test or valid set and compute mrr
""" create predictions by looping through all relations on the test or valid set and compute mrr
:return perf_list_all: list of mrrs for each test query
:return hits_list_all: list of hits for each test query
"""
@@ -121,6 +126,10 @@ def test(best_config, all_relations,test_data_prel, all_data_prel, neg_sampler,
return perf_list_all, hits_list_all

def read_dict_compute_mrr(split_mode='test'):
""" read the results per relation from a previously created file and compute mrr
:return mrr_per_rel: dictionary of mrrs for each relation
:return all_mrrs: list of mrrs for all relations
"""
csv_file = f'{perrel_results_path}/{MODEL_NAME}_NONE_{DATA}_results_{SEED}'+split_mode+'.csv'
# Initialize an empty dictionary to store the data
results_per_rel_dict = {}
@@ -153,7 +162,9 @@ def read_dict_compute_mrr(split_mode='test'):

## train
def train(params_dict, rels,val_data_prel, trainval_data_prel, neg_sampler, num_processes, window):
""" optional, find best values for lambda and alpha
""" optional, find best values for lambda and alpha by looping through all relations and testing a fixed set of params
based on validation mrr
:return best_config: dictionary of best params for each relation
"""
best_config= {}
best_mrr = 0
@@ -243,6 +254,7 @@ def train(params_dict, rels,val_data_prel, trainval_data_prel, neg_sampler, num_

## args
def get_args():
"""parse all arguments for the script"""
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", "-d", default="thgl-software", type=str)
parser.add_argument("--window", "-w", default=0, type=int) # set to e.g. 200 if only the most recent 200 timesteps should be considered. set to -2 if multistep
10 changes: 9 additions & 1 deletion examples/linkproppred/tkgl-icews/cen.py
@@ -27,6 +27,7 @@

def test(model, history_len, history_list, test_list, num_rels, num_nodes, use_cuda, model_name, mode, split_mode):
"""
Test the model
:param model: model used to test
:param history_list: all input history snapshot list; does not include the output label train list or valid list
:param test_list: test triple snapshot list
@@ -101,7 +102,14 @@ def test(model, history_len, history_list, test_list, num_rels, num_nodes, use_c

def run_experiment(args, trainvalidtest_id=0, n_hidden=None, n_layers=None, dropout=None, n_bases=None):
'''
trainvalidtest_id: -1: pretrainig, 0: curriculum training (to find best test history len), 1: test on valid set, 2: test on test set
Run experiment for CEN model
:param args: arguments for the model
:param trainvalidtest_id: -1: pretraining, 0: curriculum training (to find best test history len), 1: test on valid set, 2: test on test set
:param n_hidden: number of hidden units
:param n_layers: number of layers
:param dropout: dropout rate
:param n_bases: number of bases
:return: mrr, perf_per_rel (mean reciprocal rank, performance per relation)
'''
# 1) load configuration for grid searching the best configuration
if n_hidden:
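The trainvalidtest_id stages documented above imply a driver that calls run_experiment several times; a hypothetical sketch of that flow (run_all_stages is not part of the repository).

def run_all_stages(args, run_experiment):
    """Pretrain, curriculum-train, then evaluate on the valid and test splits."""
    run_experiment(args, trainvalidtest_id=-1)                          # pretraining
    run_experiment(args, trainvalidtest_id=0)                           # curriculum training (best history len)
    val_mrr, _ = run_experiment(args, trainvalidtest_id=1)              # evaluate on the validation set
    test_mrr, perf_per_rel = run_experiment(args, trainvalidtest_id=2)  # evaluate on the test set
    return val_mrr, test_mrr, perf_per_rel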
16 changes: 14 additions & 2 deletions examples/linkproppred/tkgl-icews/recurrencybaseline.py
@@ -33,6 +33,11 @@

def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,
perf_list_all, hits_list_all, window, neg_sampler, split_mode):
""" create predictions for each relation on test or valid set and compute mrr
:return perf_list_all: list of mrrs for each test query
:return hits_list_all: list of hits for each test query
"""

first_ts = data_c_rel[0][3]
## use this if you want to use ray:
num_queries = len(data_c_rel) // num_processes
Expand Down Expand Up @@ -78,7 +83,7 @@ def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,

## test
def test(best_config, all_relations,test_data_prel, all_data_prel, neg_sampler, num_processes, window, split_mode='test'):
""" create predictions for each relation on test or valid set and compute mrr
""" create predictions by looping through all relations on the test or valid set and compute mrr
:return perf_list_all: list of mrrs for each test query
:return hits_list_all: list of hits for each test query
"""
@@ -121,6 +126,10 @@ def test(best_config, all_relations,test_data_prel, all_data_prel, neg_sampler,
return perf_list_all, hits_list_all

def read_dict_compute_mrr(split_mode='test'):
""" read the results per relation from a previously created file and compute mrr
:return mrr_per_rel: dictionary of mrrs for each relation
:return all_mrrs: list of mrrs for all relations
"""
csv_file = f'{perrel_results_path}/{MODEL_NAME}_NONE_{DATA}_results_{SEED}'+split_mode+'.csv'
# Initialize an empty dictionary to store the data
results_per_rel_dict = {}
@@ -153,7 +162,9 @@ def read_dict_compute_mrr(split_mode='test'):

## train
def train(params_dict, rels,val_data_prel, trainval_data_prel, neg_sampler, num_processes, window):
""" optional, find best values for lambda and alpha
""" optional, find best values for lambda and alpha by looping through all relations and testing a fixed set of params
based on validation mrr
:return best_config: dictionary of best params for each relation
"""
best_config= {}
best_mrr = 0
@@ -243,6 +254,7 @@ def train(params_dict, rels,val_data_prel, trainval_data_prel, neg_sampler, num_

## args
def get_args():
"""parse all arguments for the script"""
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", "-d", default="tkgl-icews", type=str)
parser.add_argument("--window", "-w", default=0, type=int) # set to e.g. 200 if only the most recent 200 timesteps should be considered. set to -2 if multistep
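read_dict_compute_mrr in the files above expects a per-relation results CSV under perrel_results_path; a hypothetical sketch of producing such a file, assuming the same simple relation-id/reciprocal-rank row layout as the reader sketch earlier.

import csv

def write_results_csv(csv_file, per_query_results):
    """per_query_results: iterable of (relation_id, reciprocal_rank) pairs (assumed layout)."""
    with open(csv_file, 'w', newline='') as f:
        writer = csv.writer(f)
        for rel, rr in per_query_results:
            writer.writerow([rel, rr])

write_results_csv('example_results_valid.csv', [(0, 1.0), (0, 0.5), (3, 0.25)])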
46 changes: 17 additions & 29 deletions examples/linkproppred/tkgl-icews/regcn.py
@@ -28,6 +28,7 @@

def test(model, history_list, test_list, num_rels, num_nodes, use_cuda, model_name, static_graph, mode, split_mode):
"""
Test the model on either test or validation set
:param model: model used to test
:param history_list: all input history snapshot list; does not include the output label train list or valid list
:param test_list: test triple snapshot list
@@ -75,7 +76,7 @@ def test(model, history_list, test_list, num_rels, num_nodes, use_cuda, model_na
pos_samples_batch = test_triples_input[:,2]

_, perf_list = model.predict(history_glist, num_rels, static_graph, test_triples_input, use_cuda, neg_samples_batch, pos_samples_batch,
evaluator, METRIC) # TODO: num_rels, static_graph different!
evaluator, METRIC)

perf_list_all.extend(perf_list)
if split_mode == "test":
@@ -100,6 +101,15 @@ def test(model, history_list, test_list, num_rels, num_nodes, use_cuda, model_na


def run_experiment(args, n_hidden=None, n_layers=None, dropout=None, n_bases=None):
"""
Run the experiment with the given configuration
:param args: arguments
:param n_hidden: hidden dimension
:param n_layers: number of layers
:param dropout: dropout rate
:param n_bases: number of bases
:return: mrr, perf_per_rel (mean reciprocal rank, performance per relation)
"""
# load configuration for grid searching the best configuration
if n_hidden:
args.n_hidden = n_hidden
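run_experiment accepts optional hyperparameters that override args, which is what a grid-search driver could use (the script itself currently only prints that grid search is not implemented); a hypothetical sketch with example values.

from itertools import product

def grid_search(args, run_experiment,
                hidden_dims=(100, 200), layer_counts=(1, 2), dropouts=(0.2,), bases=(100,)):
    """Try each combination and keep the one with the best validation MRR (values are examples only)."""
    best_params, best_mrr = None, -1.0
    for n_hidden, n_layers, dropout, n_bases in product(hidden_dims, layer_counts, dropouts, bases):
        mrr, _ = run_experiment(args, n_hidden=n_hidden, n_layers=n_layers,
                                dropout=dropout, n_bases=n_bases)
        if mrr > best_mrr:
            best_params, best_mrr = dict(n_hidden=n_hidden, n_layers=n_layers,
                                         dropout=dropout, n_bases=n_bases), mrr
    return best_params, best_mrr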
Expand All @@ -120,19 +130,10 @@ def run_experiment(args, n_hidden=None, n_layers=None, dropout=None, n_bases=Non
perf_per_rel = {}
use_cuda = args.gpu >= 0 and torch.cuda.is_available()


# if args.add_static_graph:
# static_triples = np.array(_read_triplets_as_list("../data/" + args.dataset + "/e-w-graph.txt", {}, {}, load_time=False))
# num_static_rels = len(np.unique(static_triples[:, 1]))
# num_words = len(np.unique(static_triples[:, 2]))
# static_triples[:, 2] = static_triples[:, 2] + num_nodes
# static_node_id = torch.from_numpy(np.arange(num_words + data.num_nodes)).view(-1, 1).long().cuda(args.gpu) \
# if use_cuda else torch.from_numpy(np.arange(num_words + data.num_nodes)).view(-1, 1).long()
# else:
num_static_rels, num_words, static_triples, static_graph = 0, 0, [], None

# create stat
model = RecurrentRGCNREGCN(args.decoder, #TODO: this has slightly different args than CEN
model = RecurrentRGCNREGCN(args.decoder,
args.encoder,
num_nodes,
int(num_rels/2),
@@ -166,9 +167,6 @@ def run_experiment(args, n_hidden=None, n_layers=None, dropout=None, n_bases=Non
torch.cuda.set_device(args.gpu)
model.cuda()

# if args.add_static_graph: # TODO: what to do about this part:
# static_graph = build_sub_graph(len(static_node_id), num_static_rels, static_triples, use_cuda, args.gpu)

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)

@@ -248,16 +246,6 @@ def run_experiment(args, n_hidden=None, n_layers=None, dropout=None, n_bases=Non
best_mrr = mrr
torch.save({'state_dict': model.state_dict(), 'epoch': epoch}, model_state_file)

# mrr = test(model,
# train_list+valid_list,
# test_list,
# num_rels,
# num_nodes,
# use_cuda,
# model_state_file,
# static_graph,
# mode="test", split_mode='test')

return best_mrr, perf_per_rel
# ==================
# ==================
@@ -311,20 +299,20 @@ def run_experiment(args, n_hidden=None, n_layers=None, dropout=None, n_bases=Non
dataset.load_val_ns()
dataset.load_test_ns()

## run training and testing
val_mrr, test_mrr = 0, 0
if args.grid_search:
print("TODO: implement hyperparameter grid search")
print("hyperparameter grid search not implemented. Exiting.")
# single run
else:
#TODO: differentiate between train, valid, test
start_train = timeit.default_timer()
if args.test == False:
if args.test == False: # if args.test is True, directly test on a previously trained and stored model
print('start training')
val_mrr, perf_per_rel = run_experiment(args)
val_mrr, perf_per_rel = run_experiment(args) # do training
start_test = timeit.default_timer()
args.test = True
print('start testing')
test_mrr, perf_per_rel = run_experiment(args)
test_mrr, perf_per_rel = run_experiment(args) # do testing


test_time = timeit.default_timer() - start_test
(remaining changed files not loaded)
