diff --git a/examples/linkproppred/thgl-software/edgebank.py b/examples/linkproppred/thgl-software/edgebank.py
new file mode 100644
index 0000000..3f5b981
--- /dev/null
+++ b/examples/linkproppred/thgl-software/edgebank.py
@@ -0,0 +1,197 @@
+"""
+Dynamic Link Prediction with EdgeBank
+NOTE: This implementation is based purely on `numpy`
+
+Reference:
+    - https://github.com/fpour/DGB/tree/main
+"""
+
+import timeit
+import numpy as np
+from sklearn.metrics import average_precision_score, roc_auc_score
+from torch_geometric.loader import TemporalDataLoader
+from tqdm import tqdm
+import math
+import os
+import os.path as osp
+from pathlib import Path
+import sys
+import argparse
+
+# internal imports
+tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
+sys.path.append(tgb_modules_path)
+from tgb.linkproppred.evaluate import Evaluator
+from tgb_modules.edgebank_predictor import EdgeBankPredictor
+from tgb.utils.utils import set_random_seed, save_results
+from tgb.linkproppred.dataset import LinkPropPredDataset
+
+# ==================
+# ==================
+# ==================
+
+def test(data, test_mask, neg_sampler, split_mode):
+    r"""
+    Evaluate dynamic link prediction.
+    Evaluation happens as 'one vs. many': each positive edge is evaluated against many negative edges.
+
+    Parameters:
+        data: a dataset object
+        test_mask: required masks to load the test set edges
+        neg_sampler: an object that gives the negative edges corresponding to each positive edge
+        split_mode: specifies whether it is the 'validation' or 'test' set, to correctly load the negatives
+    Returns:
+        perf_metric: the result of the performance evaluation
+    """
+    num_batches = math.ceil(len(data['sources'][test_mask]) / BATCH_SIZE)
+    perf_list = []
+    for batch_idx in tqdm(range(num_batches)):
+        start_idx = batch_idx * BATCH_SIZE
+        end_idx = min(start_idx + BATCH_SIZE, len(data['sources'][test_mask]))
+        pos_src, pos_dst, pos_t, pos_edge = (
+            data['sources'][test_mask][start_idx: end_idx],
+            data['destinations'][test_mask][start_idx: end_idx],
+            data['timestamps'][test_mask][start_idx: end_idx],
+            data['edge_type'][test_mask][start_idx: end_idx],
+        )
+        neg_batch_list = neg_sampler.query_batch(pos_src, pos_dst, pos_t, pos_edge, split_mode=split_mode)
+
+        for idx, neg_batch in enumerate(neg_batch_list):
+            query_src = np.array([int(pos_src[idx]) for _ in range(len(neg_batch) + 1)])
+            query_dst = np.concatenate([np.array([int(pos_dst[idx])]), neg_batch])
+
+            y_pred = edgebank.predict_link(query_src, query_dst)
+            # compute MRR
+            input_dict = {
+                "y_pred_pos": np.array([y_pred[0]]),
+                "y_pred_neg": np.array(y_pred[1:]),
+                "eval_metric": [metric],
+            }
+            perf_list.append(evaluator.eval(input_dict)[metric])
+
+        # update edgebank memory after each positive batch
+        edgebank.update_memory(pos_src, pos_dst, pos_t)
+
+    perf_metrics = float(np.mean(perf_list))
+
+    return perf_metrics
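The `input_dict` above hands the evaluator one positive score plus the scores of its negatives; the metric is the rank of the positive among them. A minimal sketch of the reciprocal rank for a single query (the tie convention here is an assumption; `tgb.linkproppred.evaluate.Evaluator` defines the exact rule):

```python
import numpy as np

def reciprocal_rank(y_pred_pos: float, y_pred_neg: np.ndarray) -> float:
    # average the optimistic and pessimistic rank of the positive edge
    optimistic = int((y_pred_neg > y_pred_pos).sum()) + 1    # ties broken in favor
    pessimistic = int((y_pred_neg >= y_pred_pos).sum()) + 1  # ties broken against
    return 1.0 / (0.5 * (optimistic + pessimistic))

# one negative ties the positive at 0.9: rank = (1 + 2) / 2, so RR = 2/3
print(reciprocal_rank(0.9, np.array([0.8, 0.9, 0.1])))
# the value test() reports is the mean of these reciprocal ranks over all queries
```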
+def get_args():
+    parser = argparse.ArgumentParser('*** TGB: EdgeBank ***')
+    parser.add_argument('-d', '--data', type=str, help='Dataset name', default='thgl-software')
+    parser.add_argument('--bs', type=int, help='Batch size', default=200)
+    parser.add_argument('--k_value', type=int, help='k_value for computing ranking metrics', default=10)
+    parser.add_argument('--seed', type=int, help='Random seed', default=1)
+    parser.add_argument('--mem_mode', type=str, help='Memory mode', default='unlimited', choices=['unlimited', 'fixed_time_window'])
+    parser.add_argument('--time_window_ratio', type=float, help='Test window ratio', default=0.15)
+
+    try:
+        args = parser.parse_args()
+    except:
+        parser.print_help()
+        sys.exit(0)
+    return args, sys.argv
+
+# ==================
+# ==================
+# ==================
+
+start_overall = timeit.default_timer()
+
+# set hyperparameters
+args, _ = get_args()
+
+SEED = args.seed  # set the random seed for consistency
+set_random_seed(SEED)
+MEMORY_MODE = args.mem_mode  # `unlimited` or `fixed_time_window`
+BATCH_SIZE = args.bs
+K_VALUE = args.k_value
+TIME_WINDOW_RATIO = args.time_window_ratio
+DATA = args.data
+MODEL_NAME = 'EdgeBank'
+
+# data loading with `numpy`
+dataset = LinkPropPredDataset(name=DATA, root="datasets", preprocess=True)
+data = dataset.full_data
+metric = dataset.eval_metric
+
+# get masks
+train_mask = dataset.train_mask
+val_mask = dataset.val_mask
+test_mask = dataset.test_mask
+
+# data for the edgebank memory
+hist_src = np.concatenate([data['sources'][train_mask]])
+hist_dst = np.concatenate([data['destinations'][train_mask]])
+hist_ts = np.concatenate([data['timestamps'][train_mask]])
+
+# set up EdgeBank with its memory updater
+edgebank = EdgeBankPredictor(
+    hist_src,
+    hist_dst,
+    hist_ts,
+    memory_mode=MEMORY_MODE,
+    time_window_ratio=TIME_WINDOW_RATIO)
+
+print("==========================================================")
+print(f"============*** {MODEL_NAME}: {MEMORY_MODE}: {DATA} ***==============")
+print("==========================================================")
+
+evaluator = Evaluator(name=DATA)
+neg_sampler = dataset.negative_sampler
+
+# for saving the results...
+results_path = f'{osp.dirname(osp.abspath(__file__))}/saved_results'
+if not osp.exists(results_path):
+    os.mkdir(results_path)
+    print('INFO: Create directory {}'.format(results_path))
+Path(results_path).mkdir(parents=True, exist_ok=True)
+results_filename = f'{results_path}/{MODEL_NAME}_{MEMORY_MODE}_{DATA}_results.json'
+
+# ==================================================== Validation
+# loading the validation negative samples
+dataset.load_val_ns()
+
+# validating ...
+start_val = timeit.default_timer()
+perf_metric_val = test(data, val_mask, neg_sampler, split_mode='val')
+end_val = timeit.default_timer()
+
+print(f"INFO: val: Evaluation Setting: >>> ONE-VS-MANY <<<")
+print(f"\tval: {metric}: {perf_metric_val: .4f}")
+val_time = end_val - start_val
+print(f"\tval: Elapsed Time (s): {val_time: .4f}")
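Note that `test()` updates the EdgeBank memory while it sweeps the stream, so by this point the validation edges are already memorized before the test pass below. For context, a simplified sketch of what the two `--mem_mode` settings mean for that memory (assumed behavior; the actual logic lives in `tgb_modules.edgebank_predictor.EdgeBankPredictor`):

```python
import numpy as np

def build_memory(srcs, dsts, ts, memory_mode="unlimited", time_window_ratio=0.15):
    if memory_mode == "unlimited":
        # remember every edge ever observed
        return set(zip(srcs.tolist(), dsts.tolist()))
    # fixed_time_window: keep only edges in the trailing fraction of the time span
    window_start = ts.max() - time_window_ratio * (ts.max() - ts.min())
    keep = ts >= window_start
    return set(zip(srcs[keep].tolist(), dsts[keep].tolist()))

memory = build_memory(np.array([0, 1, 2]), np.array([1, 2, 0]),
                      np.array([10, 50, 90]), memory_mode="fixed_time_window")
print(memory)  # {(2, 0)}: only the most recent 15% of the time span survives
# a query edge is then scored 1.0 if (src, dst) is in memory and 0.0 otherwise
```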
+
+# ==================================================== Test
+# loading the test negative samples
+dataset.load_test_ns()
+
+# testing ...
+start_test = timeit.default_timer()
+perf_metric_test = test(data, test_mask, neg_sampler, split_mode='test')
+end_test = timeit.default_timer()
+
+print(f"INFO: Test: Evaluation Setting: >>> ONE-VS-MANY <<<")
+print(f"\tTest: {metric}: {perf_metric_test: .4f}")
+test_time = end_test - start_test
+print(f"\tTest: Elapsed Time (s): {test_time: .4f}")
+
+save_results({'model': MODEL_NAME,
+              'memory_mode': MEMORY_MODE,
+              'data': DATA,
+              'run': 1,
+              'seed': SEED,
+              metric: perf_metric_test,
+              'val_mrr': perf_metric_val,
+              'test_time': test_time,
+              'tot_train_val_time': test_time + val_time},
+             results_filename)
diff --git a/examples/linkproppred/thgl-software/recurrencybaseline.py b/examples/linkproppred/thgl-software/recurrencybaseline.py
new file mode 100644
index 0000000..70eecac
--- /dev/null
+++ b/examples/linkproppred/thgl-software/recurrencybaseline.py
@@ -0,0 +1,404 @@
+"""From the paper: "History repeats itself: A Baseline for Temporal Knowledge Graph Forecasting",
+Julia Gastinger, Christian Meilicke, Federico Errica, Timo Sztyler, Anett Schuelke, Heiner Stuckenschmidt (IJCAI 2024)
+
+@inproceedings{gastinger2024baselines,
+    title={History repeats itself: A Baseline for Temporal Knowledge Graph Forecasting},
+    author={Gastinger, Julia and Meilicke, Christian and Errica, Federico and Sztyler, Timo and Schuelke, Anett and Stuckenschmidt, Heiner},
+    booktitle={33rd International Joint Conference on Artificial Intelligence (IJCAI 2024)},
+    year={2024},
+    organization={International Joint Conferences on Artificial Intelligence Organization}
+}
+"""
+
+## imports
+import timeit
+import argparse
+import numpy as np
+from copy import copy
+from pathlib import Path
+import ray
+import sys
+import os
+import os.path as osp
+import json
+
+# internal imports
+tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
+sys.path.append(tgb_modules_path)
+from tgb_modules.recurrencybaseline_predictor import baseline_predict, baseline_predict_remote
+from tgb.linkproppred.evaluate import Evaluator
+from tgb.linkproppred.dataset import LinkPropPredDataset
+from tgb.utils.utils import set_random_seed, save_results
+from tgb_modules.tkg_utils import create_basis_dict, group_by, reformat_ts
+
+def predict(num_processes, data_c_rel, all_data_c_rel, alpha, lmbda_psi,
+            perf_list_all, hits_list_all, window, neg_sampler, split_mode):
+    first_ts = data_c_rel[0][3]
+    ## ray is used if num_processes > 1:
+    num_queries = len(data_c_rel) // num_processes
+    if num_queries < num_processes:  # if we do not have enough queries for all the processes
+        num_processes_tmp = 1
+        num_queries = len(data_c_rel)
+    else:
+        num_processes_tmp = num_processes
+    if num_processes > 1:
+        object_references = []
+
+        for i in range(num_processes_tmp):
+            num_test_queries = len(data_c_rel) - (i + 1) * num_queries
+            if num_test_queries >= num_queries:
+                test_queries_idx = [i * num_queries, (i + 1) * num_queries]
+            else:
+                test_queries_idx = [i * num_queries, len(data_c_rel)]
+
+            valid_data_b = data_c_rel[test_queries_idx[0]:test_queries_idx[1]]
+
+            ob = baseline_predict_remote.remote(num_queries, valid_data_b, all_data_c_rel, window,
+                                                basis_dict,
+                                                num_nodes, num_rels, lmbda_psi,
+                                                alpha, evaluator, first_ts, neg_sampler, split_mode)
+            object_references.append(ob)
+
+        output = ray.get(object_references)
+
+        # update the scores and the logging dict for each process
+        for proc_loop in range(num_processes_tmp):
+            perf_list_all.extend(output[proc_loop][0])
+            hits_list_all.extend(output[proc_loop][1])
+
+    else:
+        perf_list, hits_list = baseline_predict(len(data_c_rel), data_c_rel, all_data_c_rel,
+                                                window, basis_dict,
+                                                num_nodes, num_rels, lmbda_psi,
+                                                alpha, evaluator, first_ts, neg_sampler, split_mode)
+        perf_list_all.extend(perf_list)
+        hits_list_all.extend(hits_list)
+
+    return perf_list_all, hits_list_all
+
+## test
+def test(best_config, all_relations, test_data_prel, all_data_prel, neg_sampler, num_processes, window, split_mode='test'):
+    """Create predictions for each relation on the test or valid set and compute the MRR.
+    :return perf_list_all: list of MRRs, one per test query
+    :return hits_list_all: list of hits values, one per test query
+    """
+    perf_list_all = []
+    hits_list_all = []
+
+    csv_file = f'{perrel_results_path}/{MODEL_NAME}_NONE_{DATA}_results_{SEED}' + split_mode + '.csv'
+
+    ## loop through relations and apply baselines
+    for rel in all_relations:
+        start = timeit.default_timer()
+        if rel in test_data_prel.keys():
+            lmbda_psi = best_config[str(rel)]['lmbda_psi'][0]
+            alpha = best_config[str(rel)]['alpha'][0]
+
+            # test data for this relation
+            test_data_c_rel = test_data_prel[rel]
+            timesteps_test = list(set(test_data_c_rel[:, 3]))
+            timesteps_test.sort()
+            all_data_c_rel = all_data_prel[rel]
+            perf_list_rel = []
+            hits_list_rel = []
+            perf_list_rel, hits_list_rel = predict(num_processes, test_data_c_rel,
+                                                   all_data_c_rel, alpha, lmbda_psi, perf_list_rel, hits_list_rel,
+                                                   window, neg_sampler, split_mode)
+            perf_list_all.extend(perf_list_rel)
+            hits_list_all.extend(hits_list_rel)
+        else:
+            perf_list_rel = []
+
+        end = timeit.default_timer()
+        total_time = round(end - start, 6)
+        print("Relation {} finished in {} seconds.".format(rel, total_time))
+
+        with open(csv_file, 'a') as f:
+            f.write("{},{}\n".format(rel, perf_list_rel))
+
+    return perf_list_all, hits_list_all
+
+def read_dict_compute_mrr(split_mode='test'):
+    csv_file = f'{perrel_results_path}/{MODEL_NAME}_NONE_{DATA}_results_{SEED}' + split_mode + '.csv'
+    # initialize an empty dictionary to store the data
+    results_per_rel_dict = {}
+    mrr_per_rel = {}
+    all_mrrs = []
+    # open the file for reading
+    with open(csv_file, 'r') as f:
+        # read each line in the file
+        for line in f:
+            # split the line at the comma
+            parts = line.strip().split(',')
+            # extract the key (the first part)
+            key = int(parts[0])
+            # extract the values (the rest of the parts), removing square brackets
+            values = [float(value.strip('[]')) for value in parts[1:]]
+            # add the key-value pair to the dictionary
+            if key in results_per_rel_dict.keys():
+                print(f"Key {key} already exists in the dictionary!!! there might be duplicate entries in the results csv")
+            results_per_rel_dict[key] = values
+            all_mrrs.extend(values)
+            mrr_per_rel[key] = np.mean(values)
+
+    if len(list(results_per_rel_dict.keys())) != num_rels:
+        print("we do not have entries for each relation in the results csv file. number of entries: ", len(list(results_per_rel_dict.keys())))
+
+    print("Split mode: " + split_mode + " Mean MRR: ", np.mean(all_mrrs))
+    print("MRR per relation: ", mrr_per_rel)
+
+## train
+def train(params_dict, rels, val_data_prel, trainval_data_prel, neg_sampler, num_processes, window):
+    """Optional: find the best values for lambda and alpha.
+    """
+    best_config = {}
+    best_mrr = 0
+    for rel in rels:  # loop through relations.
for each relation, apply rules with selected params, compute valid mrr + start = timeit.default_timer() + rel_key = int(rel) + + best_config[str(rel_key)] = {} + best_config[str(rel_key)]['not_trained'] = 'True' + best_config[str(rel_key)]['lmbda_psi'] = [default_lmbda_psi,0] #default + best_config[str(rel_key)]['other_lmbda_mrrs'] = list(np.zeros(len(params_dict['lmbda_psi']))) + best_config[str(rel_key)]['alpha'] = [default_alpha,0] #default + best_config[str(rel_key)]['other_alpha_mrrs'] = list(np.zeros(len(params_dict['alpha']))) + + if rel in val_data_prel.keys(): + # valid data for this relation + val_data_c_rel = val_data_prel[rel] + timesteps_valid = list(set(val_data_c_rel[:,3])) + timesteps_valid.sort() + trainval_data_c_rel = trainval_data_prel[rel] + + ###### 1) select lambda ############### + lmbdas_psi = params_dict['lmbda_psi'] + + alpha = 1 + best_lmbda_psi = 0.1 + best_mrr_psi = 0 + lmbda_mrrs = [] + + best_config[str(rel_key)]['num_app_valid'] = copy(len(val_data_c_rel)) + best_config[str(rel_key)]['num_app_train_valid'] = copy(len(trainval_data_c_rel)) + best_config[str(rel_key)]['not_trained'] = 'False' + + for lmbda_psi in lmbdas_psi: + perf_list_r = [] + hits_list_r = [] + perf_list_r, hits_list_r = predict(num_processes, val_data_c_rel, + trainval_data_c_rel, alpha, lmbda_psi,perf_list_r, hits_list_r, + window, neg_sampler, split_mode='val') + # compute mrr + mrr = np.mean(perf_list_r) + # # is new mrr better than previous best? if yes: store lmbda + if mrr > best_mrr_psi: + best_mrr_psi = float(mrr) + best_lmbda_psi = lmbda_psi + + + lmbda_mrrs.append(float(mrr)) + best_config[str(rel_key)]['lmbda_psi'] = [best_lmbda_psi, best_mrr_psi] + best_config[str(rel_key)]['other_lmbda_mrrs'] = lmbda_mrrs + best_mrr = best_mrr_psi + ##### 2) select alpha ############### + best_config[str(rel_key)]['not_trained'] = 'False' + alphas = params_dict['alpha'] + lmbda_psi = best_config[str(rel_key)]['lmbda_psi'][0] # use the best lmbda psi + + alpha_mrrs = [] + # perf_list_all = [] + best_mrr_alpha = 0 + best_alpha=0.99 + for alpha in alphas: + perf_list_r = [] + hits_list_r = [] + + perf_list_r, hits_list_r = predict(num_processes, val_data_c_rel, + trainval_data_c_rel, alpha, lmbda_psi,perf_list_r, hits_list_r, + window, neg_sampler, split_mode='val') + # compute mrr + mrr_alpha = np.mean(perf_list_r) + + # is new mrr better than previous best? if yes: store alpha + if mrr_alpha > best_mrr_alpha: + best_mrr_alpha = float(mrr_alpha) + best_alpha = alpha + best_mrr = best_mrr_alpha + alpha_mrrs.append(float(mrr_alpha)) + + best_config[str(rel_key)]['alpha'] = [best_alpha, best_mrr_alpha] + best_config[str(rel_key)]['other_alpha_mrrs'] = alpha_mrrs + + end = timeit.default_timer() + total_time = round(end - start, 6) + print("Relation {} finished in {} seconds.".format(rel, total_time)) + return best_config + + + +## args +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--dataset", "-d", default="thgl-software", type=str) + parser.add_argument("--window", "-w", default=0, type=int) # set to e.g. 200 if only the most recent 200 timesteps should be considered. set to -2 if multistep + parser.add_argument("--num_processes", "-p", default=1, type=int) + parser.add_argument("--lmbda", "-l", default=0.1, type=float) # fix lambda. used if trainflag == false + parser.add_argument("--alpha", "-alpha", default=0.99, type=float) # fix alpha. 
used if trainflag == false + parser.add_argument("--train_flag", "-tr", default='False') # do we need training, ie selection of lambda and alpha + parser.add_argument("--load_flag", "-lo", default='False') # if train_flag set to True: do you want to load best_config? + parser.add_argument("--save_config", "-c", default='True') # do we need to save the selection of lambda and alpha in config file? + parser.add_argument('--seed', type=int, help='Random seed', default=1) # not needed + parsed = vars(parser.parse_args()) + return parsed + +start_o = timeit.default_timer() + +parsed = get_args() +if parsed['num_processes']>1: + ray.init(num_cpus=parsed["num_processes"], num_gpus=0) +MODEL_NAME = 'RecurrencyBaseline' +SEED = parsed['seed'] # set the random seed for consistency +set_random_seed(SEED) +perrel_results_path = f'{osp.dirname(osp.abspath(__file__))}/saved_models' +if not osp.exists(perrel_results_path): + os.mkdir(perrel_results_path) + print('INFO: Create directory {}'.format(perrel_results_path)) +Path(perrel_results_path).mkdir(parents=True, exist_ok=True) + +## load dataset and prepare it accordingly +name = parsed["dataset"] +dataset = LinkPropPredDataset(name=name, root="datasets", preprocess=True) +DATA = name + +relations = dataset.edge_type +num_rels = dataset.num_rels +rels = np.arange(0,num_rels) +subjects = dataset.full_data["sources"] +objects= dataset.full_data["destinations"] +num_nodes = dataset.num_nodes +timestamps_orig = dataset.full_data["timestamps"] +timestamps = reformat_ts(timestamps_orig, DATA) # stepsize:1 + +print("split train valid test data") +all_quads = np.stack((subjects, relations, objects, timestamps, timestamps_orig), axis=1) +train_data = all_quads[dataset.train_mask] +val_data = all_quads[dataset.val_mask] +test_data = all_quads[dataset.test_mask] + +metric = dataset.eval_metric +evaluator = Evaluator(name=name) +neg_sampler = dataset.negative_sampler + +train_val_data = np.concatenate([train_data, val_data]) +all_data = np.concatenate([train_data, val_data, test_data]) + +# create dicts with key: relation id, values: triples for that relation id +print("grouping data by relation") +test_data_prel = group_by(test_data, 1) +all_data_prel = group_by(all_data, 1) +val_data_prel = group_by(val_data, 1) +trainval_data_prel = group_by(train_val_data, 1) + +#load the ns samples +# if parsed['train_flag']: +print("loading negative samples") +dataset.load_val_ns() +dataset.load_test_ns() + +# parameter options +if parsed['train_flag'] == 'True': + params_dict = {} + params_dict['lmbda_psi'] = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.5, 0.9, 1.0001] + params_dict['alpha'] = [0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999, 0.9999, 0.99999, 1] + default_lmbda_psi = params_dict['lmbda_psi'][-1] + default_alpha = params_dict['alpha'][-2] + +## load rules +print("creating rules") +basis_dict = create_basis_dict(train_val_data) +print("done with creating rules") +## init +# rb_predictor = RecurrencyBaselinePredictor(rels) +## train to find best lambda and alpha +start_train = timeit.default_timer() +if parsed['train_flag'] == 'True': + if parsed['load_flag'] == 'True': + with open('best_config.json', 'r') as infile: + best_config = json.load(infile) + else: + print('start training') + best_config = train(params_dict, rels, val_data_prel, trainval_data_prel, neg_sampler, parsed['num_processes'], + parsed['window']) + if parsed['save_config'] == 'True': + import json + with open('best_config.json', 'w') as outfile: + 
json.dump(best_config, outfile) + +else: # use preset lmbda and alpha; same for all relations + best_config = {} + for rel in rels: + best_config[str(rel)] = {} + best_config[str(rel)]['lmbda_psi'] = [parsed['lmbda']] + best_config[str(rel)]['alpha'] = [parsed['alpha']] + +end_train = timeit.default_timer() + +# compute validation mrr +print("Computing validation MRR") +perf_list_all_val, hits_list_all_val = test(best_config,rels, val_data_prel, + trainval_data_prel, neg_sampler, parsed['num_processes'], + parsed['window'], split_mode='val') +val_mrr = float(np.mean(perf_list_all_val)) + +# compute test mrr +print("Computing test MRR") +start_test = timeit.default_timer() +perf_list_all, hits_list_all = test(best_config,rels, test_data_prel, + all_data_prel, neg_sampler, parsed['num_processes'], + parsed['window']) + +end_o = timeit.default_timer() +train_time_o = round(end_train- start_train, 6) +test_time_o = round(end_o- start_test, 6) +total_time_o = round(end_o- start_o, 6) +print("Running Training to find best configs finished in {} seconds.".format(train_time_o)) +print("Running testing with best configs finished in {} seconds.".format(test_time_o)) +print("Running all steps finished in {} seconds.".format(total_time_o)) + +print(f"The test MRR is {np.mean(perf_list_all)}") +print(f"The valid MRR is {val_mrr}") +print(f"The Hits@10 is {np.mean(hits_list_all)}") +print(f"We have {len(perf_list_all)} predictions") +print(f"The test set has len {len(test_data)} ") + +# for saving the results... +results_path = f'{osp.dirname(osp.abspath(__file__))}/saved_results' +if not osp.exists(results_path): + os.mkdir(results_path) + print('INFO: Create directory {}'.format(results_path)) +Path(results_path).mkdir(parents=True, exist_ok=True) +results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json' +metric = dataset.eval_metric +save_results({'model': MODEL_NAME, + 'train_flag': parsed['train_flag'], + 'data': DATA, + 'run': 1, + 'seed': SEED, + metric: float(np.mean(perf_list_all)), + 'val_mrr': val_mrr, + 'hits10': float(np.mean(hits_list_all)), + 'test_time': test_time_o, + 'tot_train_val_time': total_time_o + }, + results_filename) + +if parsed['num_processes']>1: + ray.shutdown() + + + diff --git a/examples/linkproppred/tkgl-icews/cen.py b/examples/linkproppred/tkgl-icews/cen.py index b7d84b0..6fb5ace 100644 --- a/examples/linkproppred/tkgl-icews/cen.py +++ b/examples/linkproppred/tkgl-icews/cen.py @@ -20,7 +20,8 @@ sys.path.append(tgb_modules_path) from tgb_modules.rrgcn import RecurrentRGCNCEN from tgb.utils.utils import set_random_seed, split_by_time, save_results -from tgb_modules.tkg_utils import get_args_cen, build_sub_graph, reformat_ts +from tgb_modules.tkg_utils import get_args_cen, reformat_ts +from tgb_modules.tkg_utils_dgl import build_sub_graph from tgb.linkproppred.evaluate import Evaluator from tgb.linkproppred.dataset import LinkPropPredDataset @@ -351,7 +352,7 @@ def run_experiment(args, trainvalidtest_id=0, n_hidden=None, n_layers=None, drop MODEL_NAME = 'CEN' print("logging mrrs per relation: ", args.log_per_rel) -print("do train? do only test no validation?: ", args.trainflag, args.test_only) +print("do test and valid? 
do only test no validation?: ", args.validtest, args.test_only) # load data dataset = LinkPropPredDataset(name=DATA, root="datasets", preprocess=True) @@ -393,22 +394,24 @@ def run_experiment(args, trainvalidtest_id=0, n_hidden=None, n_layers=None, drop else: start_train = timeit.default_timer() - if args.trainflag: + if args.validtest: + print('directly start testing') + if args.test_history_len_2 != args.test_history_len: + args.test_history_len = args.test_history_len_2 # hyperparameter value as given in original paper + else: print('running pretrain and train') # pretrain mrr, _, _ = run_experiment(args, trainvalidtest_id=-1) # train - mrr, args.test_history_len = run_experiment(args, trainvalidtest_id=0) # overwrite test_history_len with - # the best history len (for valid mrr) - else: - print('directly start testing') - if args.test_history_len_2 != args.test_history_len: - args.test_history_len = args.test_history_len_2 # hyperparameter value as given in original paper + mrr, args.test_history_len, _ = run_experiment(args, trainvalidtest_id=0) # overwrite test_history_len with + # the best history len (for valid mrr) if args.test_only == False: print("running test (on val and test dataset) with test_history_len of: ", args.test_history_len) # test on val set val_mrr, _, _ = run_experiment(args, trainvalidtest_id=1) + else: + val_mrr = 0 # test on test set start_test = timeit.default_timer() diff --git a/examples/linkproppred/tkgl-icews/regcn.py b/examples/linkproppred/tkgl-icews/regcn.py index b2e49e2..2785ade 100644 --- a/examples/linkproppred/tkgl-icews/regcn.py +++ b/examples/linkproppred/tkgl-icews/regcn.py @@ -20,7 +20,8 @@ sys.path.append(tgb_modules_path) from tgb_modules.rrgcn import RecurrentRGCNREGCN from tgb.utils.utils import set_random_seed, split_by_time, save_results -from tgb_modules.tkg_utils import get_args_regcn, build_sub_graph, reformat_ts +from tgb_modules.tkg_utils import get_args_regcn, reformat_ts +from tgb_modules.tkg_utils_dgl import build_sub_graph from tgb.linkproppred.evaluate import Evaluator from tgb.linkproppred.dataset import LinkPropPredDataset import json diff --git a/examples/linkproppred/tkgl-icews/tlogic.py b/examples/linkproppred/tkgl-icews/tlogic.py index 0b9d36f..9048cd7 100644 --- a/examples/linkproppred/tkgl-icews/tlogic.py +++ b/examples/linkproppred/tkgl-icews/tlogic.py @@ -72,7 +72,8 @@ def learn_rules(i, num_relations): return rl.rules_dict -def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edges, all_quads, args, split_mode): +def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edges, all_quads, args, split_mode, + log_per_rel=False, num_rels=0): """ Apply rules (multiprocessing possible). 
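These tlogic changes (repeated below for each tkgl dataset) thread two new parameters, `log_per_rel` and `num_rels`, through `apply_rules` so that per-sample MRRs can be grouped by relation on the test split. Stripped of the surrounding loop, the bookkeeping added in the next hunk amounts to this (a hypothetical standalone helper, not part of the diff):

```python
from collections import defaultdict
import numpy as np

def mrr_per_relation(rel_ids, mrrs):
    """Average per-sample MRRs by relation id; relations without test samples are dropped."""
    buckets = defaultdict(list)
    for rel, mrr in zip(rel_ids, mrrs):
        buckets[int(rel)].append(mrr)
    return {rel: float(np.mean(vals)) for rel, vals in buckets.items()}

print(mrr_per_relation([0, 0, 3], [0.5, 1.0, 0.25]))  # {0: 0.75, 3: 0.25}
```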
@@ -84,7 +85,9 @@ def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edg hits_list (list): hits list (hits@10 per sample) perf_list (list): performance list (mrr per sample) """ - + perf_per_rel = {} + for rel in range(num_rels): + perf_per_rel[rel] = [] print("Start process", i, "...") all_candidates = [dict() for _ in range(len(args))] no_cands_counter = 0 @@ -207,9 +210,20 @@ def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edg predictions = evaluator.eval(input_dict) perf_list[index] = predictions['mrr'] hits_list[index] = predictions['hits@10'] + if split_mode == "test": + if log_per_rel: + perf_per_rel[test_query[1]].append(perf_list[index]) #test_query[1] is the relation index + + if split_mode == "test": + if log_per_rel: + for rel in range(num_rels): + if len(perf_per_rel[rel]) > 0: + perf_per_rel[rel] = float(np.mean(perf_per_rel[rel])) + else: + perf_per_rel.pop(rel) - return perf_list, hits_list + return perf_list, hits_list, perf_per_rel ## args @@ -229,6 +243,8 @@ def get_args(): parser.add_argument('--run_nr', type=int, help='Run Number', default=1) parser.add_argument('--learn_rules_flag', type=bool, help='Do we want to learn the rules', default=True) parser.add_argument('--rule_filename', type=str, help='if rules not learned: where are they stored', default='0_r[3]_n100_exp_s1_rules.json') + parser.add_argument('--log_per_rel', type=bool, help='Do we want to log mrr per relation', default=False) + parser.add_argument('--compute_valid_mrr', type=bool, help='Do we want to compute mrr for valid set', default=True) parsed = vars(parser.parse_args()) return parsed @@ -244,6 +260,7 @@ def get_args(): num_processes = parsed["num_processes"] window = parsed["window"] top_k = parsed["top_k"] +log_per_rel = parsed['log_per_rel'] MODEL_NAME = 'TLogic' SEED = parsed['seed'] # set the random seed for consistency @@ -251,6 +268,7 @@ def get_args(): ## load dataset and prepare it accordingly name = parsed["dataset"] +compute_valid_mrr = parsed["compute_valid_mrr"] dataset = LinkPropPredDataset(name=name, root="datasets", preprocess=True) DATA = name @@ -334,25 +352,33 @@ def get_args(): args = [[0.1, 0.5]] # compute valid mrr -print('Computing valid MRR') start_valid = timeit.default_timer() -num_queries = len(val_data) // num_processes +if compute_valid_mrr: + print('Computing valid MRR') -output = Parallel(n_jobs=num_processes)( - delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, val_data, window, learn_edges, - all_quads, args, split_mode='val') for i in range(num_processes)) -end = timeit.default_timer() + num_queries = len(val_data) // num_processes -perf_list_val = [] -hits_list_val = [] + output = Parallel(n_jobs=num_processes)( + delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, val_data, window, learn_edges, + all_quads, args, split_mode='val') for i in range(num_processes)) + end = timeit.default_timer() -for i in range(num_processes): - perf_list_val.extend(output[i][0]) - hits_list_val.extend(output[i][1]) + perf_list_val = [] + hits_list_val = [] + + for i in range(num_processes): + perf_list_val.extend(output[i][0]) + hits_list_val.extend(output[i][1]) +else: + perf_list_val = [0] + hits_list_val = [0] + end_valid = timeit.default_timer() # compute test mrr +if log_per_rel ==True: + num_processes = 1 #otherwise logging per rel does not work for our implementation start_test = timeit.default_timer() print('Computing test MRR') start = timeit.default_timer() @@ -360,15 +386,19 @@ def get_args(): 
output = Parallel(n_jobs=num_processes)( delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, test_data, window, learn_edges, - all_quads, args, split_mode='test') for i in range(num_processes)) + all_quads, args, split_mode='test', log_per_rel=log_per_rel, num_rels=num_rels) for i in range(num_processes)) end = timeit.default_timer() perf_list_all = [] hits_list_all = [] + for i in range(num_processes): perf_list_all.extend(output[i][0]) hits_list_all.extend(output[i][1]) +if log_per_rel == True: + perf_per_rel = output[0][2] + total_time = round(end - start, 6) total_valid_time = round(end_valid - start_valid, 6) @@ -393,9 +423,13 @@ def get_args(): os.mkdir(results_path) print('INFO: Create directory {}'.format(results_path)) Path(results_path).mkdir(parents=True, exist_ok=True) -results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json' +if log_per_rel == True: + results_filename = f'{results_path}/{MODEL_NAME}_{DATA}_results_per_rel.json' + with open(results_filename, 'w') as json_file: + json.dump(perf_per_rel, json_file) +results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json' metric = dataset.eval_metric save_results({'model': MODEL_NAME, 'train_flag': None, diff --git a/examples/linkproppred/tkgl-polecat/cen.py b/examples/linkproppred/tkgl-polecat/cen.py index 363e17c..4e6a219 100644 --- a/examples/linkproppred/tkgl-polecat/cen.py +++ b/examples/linkproppred/tkgl-polecat/cen.py @@ -20,7 +20,8 @@ sys.path.append(tgb_modules_path) from tgb_modules.rrgcn import RecurrentRGCNCEN from tgb.utils.utils import set_random_seed, split_by_time, save_results -from tgb_modules.tkg_utils import get_args_cen, build_sub_graph, reformat_ts +from tgb_modules.tkg_utils import get_args_cen, reformat_ts +from tgb_modules.tkg_utils_dgl import build_sub_graph from tgb.linkproppred.evaluate import Evaluator from tgb.linkproppred.dataset import LinkPropPredDataset @@ -351,7 +352,7 @@ def run_experiment(args, trainvalidtest_id=0, n_hidden=None, n_layers=None, drop MODEL_NAME = 'CEN' print("logging mrrs per relation: ", args.log_per_rel) -print("do train? do only test no validation?: ", args.trainflag, args.test_only) +print("do test and valid? 
do only test no validation?: ", args.validtest, args.test_only) # load data dataset = LinkPropPredDataset(name=DATA, root="datasets", preprocess=True) @@ -393,22 +394,24 @@ def run_experiment(args, trainvalidtest_id=0, n_hidden=None, n_layers=None, drop else: start_train = timeit.default_timer() - if args.trainflag: + if args.validtest: + print('directly start testing') + if args.test_history_len_2 != args.test_history_len: + args.test_history_len = args.test_history_len_2 # hyperparameter value as given in original paper + else: print('running pretrain and train') # pretrain mrr, _, _ = run_experiment(args, trainvalidtest_id=-1) # train - mrr, args.test_history_len = run_experiment(args, trainvalidtest_id=0) # overwrite test_history_len with - # the best history len (for valid mrr) - else: - print('directly start testing') - if args.test_history_len_2 != args.test_history_len: - args.test_history_len = args.test_history_len_2 # hyperparameter value as given in original paper + mrr, args.test_history_len, _ = run_experiment(args, trainvalidtest_id=0) # overwrite test_history_len with + # the best history len (for valid mrr) if args.test_only == False: print("running test (on val and test dataset) with test_history_len of: ", args.test_history_len) # test on val set val_mrr, _, _ = run_experiment(args, trainvalidtest_id=1) + else: + val_mrr = 0 # test on test set start_test = timeit.default_timer() diff --git a/examples/linkproppred/tkgl-polecat/regcn.py b/examples/linkproppred/tkgl-polecat/regcn.py index 80e3277..6014d68 100644 --- a/examples/linkproppred/tkgl-polecat/regcn.py +++ b/examples/linkproppred/tkgl-polecat/regcn.py @@ -20,7 +20,8 @@ sys.path.append(tgb_modules_path) from tgb_modules.rrgcn import RecurrentRGCNREGCN from tgb.utils.utils import set_random_seed, split_by_time, save_results -from tgb_modules.tkg_utils import get_args_regcn, build_sub_graph, reformat_ts +from tgb_modules.tkg_utils import get_args_regcn, reformat_ts +from tgb_modules.tkg_utils_dgl import build_sub_graph from tgb.linkproppred.evaluate import Evaluator from tgb.linkproppred.dataset import LinkPropPredDataset import json diff --git a/examples/linkproppred/tkgl-polecat/tlogic.py b/examples/linkproppred/tkgl-polecat/tlogic.py index efb22ee..97ee49a 100644 --- a/examples/linkproppred/tkgl-polecat/tlogic.py +++ b/examples/linkproppred/tkgl-polecat/tlogic.py @@ -72,7 +72,8 @@ def learn_rules(i, num_relations): return rl.rules_dict -def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edges, all_quads, args, split_mode): +def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edges, all_quads, args, split_mode, + log_per_rel=False, num_rels=0): """ Apply rules (multiprocessing possible). 
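One caveat for the `--log_per_rel` and `--compute_valid_mrr` options these patches add: argparse's `type=bool` converts any non-empty string to `True`, so `--log_per_rel False` on the command line still enables the logging. A common workaround is an explicit converter (a sketch, not part of this diff):

```python
import argparse

def str2bool(value: str) -> bool:
    # map the usual spellings explicitly instead of relying on bool(str)
    if value.lower() in ('true', '1', 'yes'):
        return True
    if value.lower() in ('false', '0', 'no'):
        return False
    raise argparse.ArgumentTypeError(f'boolean expected, got {value!r}')

parser = argparse.ArgumentParser()
parser.add_argument('--log_per_rel', type=str2bool, default=False)
print(parser.parse_args(['--log_per_rel', 'False']).log_per_rel)  # False
```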
@@ -84,7 +85,9 @@ def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edg hits_list (list): hits list (hits@10 per sample) perf_list (list): performance list (mrr per sample) """ - + perf_per_rel = {} + for rel in range(num_rels): + perf_per_rel[rel] = [] print("Start process", i, "...") all_candidates = [dict() for _ in range(len(args))] no_cands_counter = 0 @@ -207,9 +210,20 @@ def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edg predictions = evaluator.eval(input_dict) perf_list[index] = predictions['mrr'] hits_list[index] = predictions['hits@10'] + if split_mode == "test": + if log_per_rel: + perf_per_rel[test_query[1]].append(perf_list[index]) #test_query[1] is the relation index + + if split_mode == "test": + if log_per_rel: + for rel in range(num_rels): + if len(perf_per_rel[rel]) > 0: + perf_per_rel[rel] = float(np.mean(perf_per_rel[rel])) + else: + perf_per_rel.pop(rel) - return perf_list, hits_list + return perf_list, hits_list, perf_per_rel ## args @@ -229,6 +243,8 @@ def get_args(): parser.add_argument('--run_nr', type=int, help='Run Number', default=1) parser.add_argument('--learn_rules_flag', type=bool, help='Do we want to learn the rules', default=True) parser.add_argument('--rule_filename', type=str, help='if rules not learned: where are they stored', default='0_r[3]_n100_exp_s1_rules.json') + parser.add_argument('--log_per_rel', type=bool, help='Do we want to log mrr per relation', default=False) + parser.add_argument('--compute_valid_mrr', type=bool, help='Do we want to compute mrr for valid set', default=True) parsed = vars(parser.parse_args()) return parsed @@ -244,6 +260,7 @@ def get_args(): num_processes = parsed["num_processes"] window = parsed["window"] top_k = parsed["top_k"] +log_per_rel = parsed['log_per_rel'] MODEL_NAME = 'TLogic' SEED = parsed['seed'] # set the random seed for consistency @@ -251,6 +268,7 @@ def get_args(): ## load dataset and prepare it accordingly name = parsed["dataset"] +compute_valid_mrr = parsed["compute_valid_mrr"] dataset = LinkPropPredDataset(name=name, root="datasets", preprocess=True) DATA = name @@ -334,25 +352,33 @@ def get_args(): args = [[0.1, 0.5]] # compute valid mrr -print('Computing valid MRR') start_valid = timeit.default_timer() -num_queries = len(val_data) // num_processes +if compute_valid_mrr: + print('Computing valid MRR') -output = Parallel(n_jobs=num_processes)( - delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, val_data, window, learn_edges, - all_quads, args, split_mode='val') for i in range(num_processes)) -end = timeit.default_timer() + num_queries = len(val_data) // num_processes -perf_list_val = [] -hits_list_val = [] + output = Parallel(n_jobs=num_processes)( + delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, val_data, window, learn_edges, + all_quads, args, split_mode='val') for i in range(num_processes)) + end = timeit.default_timer() -for i in range(num_processes): - perf_list_val.extend(output[i][0]) - hits_list_val.extend(output[i][1]) + perf_list_val = [] + hits_list_val = [] + + for i in range(num_processes): + perf_list_val.extend(output[i][0]) + hits_list_val.extend(output[i][1]) +else: + perf_list_val = [0] + hits_list_val = [0] + end_valid = timeit.default_timer() # compute test mrr +if log_per_rel ==True: + num_processes = 1 #otherwise logging per rel does not work for our implementation start_test = timeit.default_timer() print('Computing test MRR') start = timeit.default_timer() @@ -360,15 +386,19 @@ def get_args(): 
output = Parallel(n_jobs=num_processes)( delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, test_data, window, learn_edges, - all_quads, args, split_mode='test') for i in range(num_processes)) + all_quads, args, split_mode='test', log_per_rel=log_per_rel, num_rels=num_rels) for i in range(num_processes)) end = timeit.default_timer() perf_list_all = [] hits_list_all = [] + for i in range(num_processes): perf_list_all.extend(output[i][0]) hits_list_all.extend(output[i][1]) +if log_per_rel == True: + perf_per_rel = output[0][2] + total_time = round(end - start, 6) total_valid_time = round(end_valid - start_valid, 6) @@ -393,9 +423,13 @@ def get_args(): os.mkdir(results_path) print('INFO: Create directory {}'.format(results_path)) Path(results_path).mkdir(parents=True, exist_ok=True) -results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json' +if log_per_rel == True: + results_filename = f'{results_path}/{MODEL_NAME}_{DATA}_results_per_rel.json' + with open(results_filename, 'w') as json_file: + json.dump(perf_per_rel, json_file) +results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json' metric = dataset.eval_metric save_results({'model': MODEL_NAME, 'train_flag': None, diff --git a/examples/linkproppred/tkgl-smallpedia/cen.py b/examples/linkproppred/tkgl-smallpedia/cen.py index 1107337..03f3b0f 100644 --- a/examples/linkproppred/tkgl-smallpedia/cen.py +++ b/examples/linkproppred/tkgl-smallpedia/cen.py @@ -20,7 +20,8 @@ sys.path.append(tgb_modules_path) from tgb_modules.rrgcn import RecurrentRGCNCEN from tgb.utils.utils import set_random_seed, split_by_time, save_results -from tgb_modules.tkg_utils import get_args_cen, build_sub_graph, reformat_ts +from tgb_modules.tkg_utils import get_args_cen, reformat_ts +from tgb_modules.tkg_utils_dgl import build_sub_graph from tgb.linkproppred.evaluate import Evaluator from tgb.linkproppred.dataset import LinkPropPredDataset @@ -351,7 +352,7 @@ def run_experiment(args, trainvalidtest_id=0, n_hidden=None, n_layers=None, drop MODEL_NAME = 'CEN' print("logging mrrs per relation: ", args.log_per_rel) -print("do train? do only test no validation?: ", args.trainflag, args.test_only) +print("do test and valid? 
do only test no validation?: ", args.validtest, args.test_only) # load data dataset = LinkPropPredDataset(name=DATA, root="datasets", preprocess=True) @@ -393,22 +394,24 @@ def run_experiment(args, trainvalidtest_id=0, n_hidden=None, n_layers=None, drop else: start_train = timeit.default_timer() - if args.trainflag: + if args.validtest: + print('directly start testing') + if args.test_history_len_2 != args.test_history_len: + args.test_history_len = args.test_history_len_2 # hyperparameter value as given in original paper + else: print('running pretrain and train') # pretrain mrr, _, _ = run_experiment(args, trainvalidtest_id=-1) # train - mrr, args.test_history_len = run_experiment(args, trainvalidtest_id=0) # overwrite test_history_len with - # the best history len (for valid mrr) - else: - print('directly start testing') - if args.test_history_len_2 != args.test_history_len: - args.test_history_len = args.test_history_len_2 # hyperparameter value as given in original paper + mrr, args.test_history_len, _ = run_experiment(args, trainvalidtest_id=0) # overwrite test_history_len with + # the best history len (for valid mrr) if args.test_only == False: print("running test (on val and test dataset) with test_history_len of: ", args.test_history_len) # test on val set val_mrr, _, _ = run_experiment(args, trainvalidtest_id=1) + else: + val_mrr = 0 # test on test set start_test = timeit.default_timer() diff --git a/examples/linkproppred/tkgl-smallpedia/regcn.py b/examples/linkproppred/tkgl-smallpedia/regcn.py index 0513f6d..8236319 100644 --- a/examples/linkproppred/tkgl-smallpedia/regcn.py +++ b/examples/linkproppred/tkgl-smallpedia/regcn.py @@ -20,7 +20,8 @@ sys.path.append(tgb_modules_path) from tgb_modules.rrgcn import RecurrentRGCNREGCN from tgb.utils.utils import set_random_seed, split_by_time, save_results -from tgb_modules.tkg_utils import get_args_regcn, build_sub_graph, reformat_ts +from tgb_modules.tkg_utils import get_args_regcn, reformat_ts +from tgb_modules.tkg_utils_dgl import build_sub_graph from tgb.linkproppred.evaluate import Evaluator from tgb.linkproppred.dataset import LinkPropPredDataset import json diff --git a/examples/linkproppred/tkgl-smallpedia/tlogic.py b/examples/linkproppred/tkgl-smallpedia/tlogic.py index 87f4b91..5a42df9 100644 --- a/examples/linkproppred/tkgl-smallpedia/tlogic.py +++ b/examples/linkproppred/tkgl-smallpedia/tlogic.py @@ -72,7 +72,8 @@ def learn_rules(i, num_relations): return rl.rules_dict -def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edges, all_quads, args, split_mode): +def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edges, all_quads, args, split_mode, + log_per_rel=False, num_rels=0): """ Apply rules (multiprocessing possible). 
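The evaluation in these scripts is parallelized by giving each worker a contiguous slice of the queries: `num_queries = len(data) // num_processes`, then joblib's `Parallel`/`delayed` fans out `apply_rules`. A self-contained sketch of that chunking convention (assuming the last worker also picks up the integer-division remainder, which is not visible in this diff):

```python
from joblib import Parallel, delayed

def process_chunk(i, num_queries, num_processes, data):
    # worker i handles queries [i * num_queries, (i + 1) * num_queries);
    # the last worker additionally takes the remainder
    start = i * num_queries
    end = len(data) if i == num_processes - 1 else (i + 1) * num_queries
    return data[start:end]

data = list(range(11))
num_processes = 3
num_queries = len(data) // num_processes
chunks = Parallel(n_jobs=num_processes)(
    delayed(process_chunk)(i, num_queries, num_processes, data)
    for i in range(num_processes))
assert sum(chunks, []) == data  # every query is covered exactly once
```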
@@ -84,7 +85,9 @@ def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edg hits_list (list): hits list (hits@10 per sample) perf_list (list): performance list (mrr per sample) """ - + perf_per_rel = {} + for rel in range(num_rels): + perf_per_rel[rel] = [] print("Start process", i, "...") all_candidates = [dict() for _ in range(len(args))] no_cands_counter = 0 @@ -207,9 +210,20 @@ def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edg predictions = evaluator.eval(input_dict) perf_list[index] = predictions['mrr'] hits_list[index] = predictions['hits@10'] + if split_mode == "test": + if log_per_rel: + perf_per_rel[test_query[1]].append(perf_list[index]) #test_query[1] is the relation index + + if split_mode == "test": + if log_per_rel: + for rel in range(num_rels): + if len(perf_per_rel[rel]) > 0: + perf_per_rel[rel] = float(np.mean(perf_per_rel[rel])) + else: + perf_per_rel.pop(rel) - return perf_list, hits_list + return perf_list, hits_list, perf_per_rel ## args @@ -229,6 +243,8 @@ def get_args(): parser.add_argument('--run_nr', type=int, help='Run Number', default=1) parser.add_argument('--learn_rules_flag', type=bool, help='Do we want to learn the rules', default=True) parser.add_argument('--rule_filename', type=str, help='if rules not learned: where are they stored', default='0_r[3]_n100_exp_s1_rules.json') + parser.add_argument('--log_per_rel', type=bool, help='Do we want to log mrr per relation', default=False) + parser.add_argument('--compute_valid_mrr', type=bool, help='Do we want to compute mrr for valid set', default=True) parsed = vars(parser.parse_args()) return parsed @@ -239,11 +255,13 @@ def get_args(): dataset = parsed["dataset"] rule_lengths = parsed["rule_lengths"] rule_lengths = [rule_lengths] if (type(rule_lengths) == int) else rule_lengths +print('rule_lengths', rule_lengths) num_walks = parsed["num_walks"] transition_distr = parsed["transition_distr"] num_processes = parsed["num_processes"] window = parsed["window"] top_k = parsed["top_k"] +log_per_rel = parsed['log_per_rel'] MODEL_NAME = 'TLogic' SEED = parsed['seed'] # set the random seed for consistency @@ -251,6 +269,7 @@ def get_args(): ## load dataset and prepare it accordingly name = parsed["dataset"] +compute_valid_mrr = parsed["compute_valid_mrr"] dataset = LinkPropPredDataset(name=name, root="datasets", preprocess=True) DATA = name @@ -334,25 +353,33 @@ def get_args(): args = [[0.1, 0.5]] # compute valid mrr -print('Computing valid MRR') start_valid = timeit.default_timer() -num_queries = len(val_data) // num_processes +if compute_valid_mrr: + print('Computing valid MRR') -output = Parallel(n_jobs=num_processes)( - delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, val_data, window, learn_edges, - all_quads, args, split_mode='val') for i in range(num_processes)) -end = timeit.default_timer() + num_queries = len(val_data) // num_processes -perf_list_val = [] -hits_list_val = [] + output = Parallel(n_jobs=num_processes)( + delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, val_data, window, learn_edges, + all_quads, args, split_mode='val') for i in range(num_processes)) + end = timeit.default_timer() -for i in range(num_processes): - perf_list_val.extend(output[i][0]) - hits_list_val.extend(output[i][1]) + perf_list_val = [] + hits_list_val = [] + + for i in range(num_processes): + perf_list_val.extend(output[i][0]) + hits_list_val.extend(output[i][1]) +else: + perf_list_val = [0] + hits_list_val = [0] + end_valid = timeit.default_timer() # 
compute test mrr +if log_per_rel ==True: + num_processes = 1 #otherwise logging per rel does not work for our implementation start_test = timeit.default_timer() print('Computing test MRR') start = timeit.default_timer() @@ -360,15 +387,19 @@ def get_args(): output = Parallel(n_jobs=num_processes)( delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, test_data, window, learn_edges, - all_quads, args, split_mode='test') for i in range(num_processes)) + all_quads, args, split_mode='test', log_per_rel=log_per_rel, num_rels=num_rels) for i in range(num_processes)) end = timeit.default_timer() perf_list_all = [] hits_list_all = [] + for i in range(num_processes): perf_list_all.extend(output[i][0]) hits_list_all.extend(output[i][1]) +if log_per_rel == True: + perf_per_rel = output[0][2] + total_time = round(end - start, 6) total_valid_time = round(end_valid - start_valid, 6) @@ -393,9 +424,13 @@ def get_args(): os.mkdir(results_path) print('INFO: Create directory {}'.format(results_path)) Path(results_path).mkdir(parents=True, exist_ok=True) -results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json' +if log_per_rel == True: + results_filename = f'{results_path}/{MODEL_NAME}_{DATA}_results_per_rel.json' + with open(results_filename, 'w') as json_file: + json.dump(perf_per_rel, json_file) +results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json' metric = dataset.eval_metric save_results({'model': MODEL_NAME, 'train_flag': None, diff --git a/examples/linkproppred/tkgl-wikidata/regcn.py b/examples/linkproppred/tkgl-wikidata/regcn.py index 19aea92..d4282d4 100644 --- a/examples/linkproppred/tkgl-wikidata/regcn.py +++ b/examples/linkproppred/tkgl-wikidata/regcn.py @@ -20,7 +20,8 @@ sys.path.append(tgb_modules_path) from tgb_modules.rrgcn import RecurrentRGCNREGCN from tgb.utils.utils import set_random_seed, split_by_time, save_results -from tgb_modules.tkg_utils import get_args_regcn, build_sub_graph, reformat_ts +from tgb_modules.tkg_utils import get_args_regcn, reformat_ts +from tgb_modules.tkg_utils_dgl import build_sub_graph from tgb.linkproppred.evaluate import Evaluator from tgb.linkproppred.dataset import LinkPropPredDataset import json diff --git a/examples/linkproppred/tkgl-wikidata/tkgl-wikidata_example.py b/examples/linkproppred/tkgl-wikidata/tkgl-wikidata_example.py index f0ee4f4..41b84f5 100644 --- a/examples/linkproppred/tkgl-wikidata/tkgl-wikidata_example.py +++ b/examples/linkproppred/tkgl-wikidata/tkgl-wikidata_example.py @@ -39,7 +39,7 @@ evaluator = Evaluator(name=DATA) neg_sampler = dataset.negative_sampler -BATCH_SIZE = 200 +BATCH_SIZE = 1 ## 200 val_loader = TemporalDataLoader(val_data, batch_size=BATCH_SIZE) test_loader = TemporalDataLoader(test_data, batch_size=BATCH_SIZE) @@ -49,6 +49,9 @@ for batch in tqdm(val_loader): src, pos_dst, t, msg, rel = batch.src, batch.dst, batch.t, batch.msg, batch.edge_type neg_batch_list = neg_sampler.query_batch(src.detach().cpu().numpy(), pos_dst.detach().cpu().numpy(), t.detach().cpu().numpy(), rel.detach().cpu().numpy(), split_mode='val') + + if len(neg_batch_list[0]) > 1500: + print(rel, len(neg_batch_list[0])) print ("loading ns samples from validation", timeit.default_timer() - start_time) start_time = timeit.default_timer() diff --git a/examples/linkproppred/tkgl-wikidata/tlogic.py b/examples/linkproppred/tkgl-wikidata/tlogic.py index eb07b07..e10605c 100644 --- a/examples/linkproppred/tkgl-wikidata/tlogic.py +++ b/examples/linkproppred/tkgl-wikidata/tlogic.py @@ -72,7 +72,8 @@ def learn_rules(i, 
num_relations): return rl.rules_dict -def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edges, all_quads, args, split_mode): +def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edges, all_quads, args, split_mode, + log_per_rel=False, num_rels=0): """ Apply rules (multiprocessing possible). @@ -84,7 +85,9 @@ def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edg hits_list (list): hits list (hits@10 per sample) perf_list (list): performance list (mrr per sample) """ - + perf_per_rel = {} + for rel in range(num_rels): + perf_per_rel[rel] = [] print("Start process", i, "...") all_candidates = [dict() for _ in range(len(args))] no_cands_counter = 0 @@ -207,9 +210,20 @@ def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edg predictions = evaluator.eval(input_dict) perf_list[index] = predictions['mrr'] hits_list[index] = predictions['hits@10'] + if split_mode == "test": + if log_per_rel: + perf_per_rel[test_query[1]].append(perf_list[index]) #test_query[1] is the relation index + + if split_mode == "test": + if log_per_rel: + for rel in range(num_rels): + if len(perf_per_rel[rel]) > 0: + perf_per_rel[rel] = float(np.mean(perf_per_rel[rel])) + else: + perf_per_rel.pop(rel) - return perf_list, hits_list + return perf_list, hits_list, perf_per_rel ## args @@ -229,6 +243,8 @@ def get_args(): parser.add_argument('--run_nr', type=int, help='Run Number', default=1) parser.add_argument('--learn_rules_flag', type=bool, help='Do we want to learn the rules', default=True) parser.add_argument('--rule_filename', type=str, help='if rules not learned: where are they stored', default='0_r[3]_n100_exp_s1_rules.json') + parser.add_argument('--log_per_rel', type=bool, help='Do we want to log mrr per relation', default=False) + parser.add_argument('--compute_valid_mrr', type=bool, help='Do we want to compute mrr for valid set', default=True) parsed = vars(parser.parse_args()) return parsed @@ -244,6 +260,7 @@ def get_args(): num_processes = parsed["num_processes"] window = parsed["window"] top_k = parsed["top_k"] +log_per_rel = parsed['log_per_rel'] MODEL_NAME = 'TLogic' SEED = parsed['seed'] # set the random seed for consistency @@ -251,6 +268,7 @@ def get_args(): ## load dataset and prepare it accordingly name = parsed["dataset"] +compute_valid_mrr = parsed["compute_valid_mrr"] dataset = LinkPropPredDataset(name=name, root="datasets", preprocess=True) DATA = name @@ -334,25 +352,33 @@ def get_args(): args = [[0.1, 0.5]] # compute valid mrr -print('Computing valid MRR') start_valid = timeit.default_timer() -num_queries = len(val_data) // num_processes +if compute_valid_mrr: + print('Computing valid MRR') -output = Parallel(n_jobs=num_processes)( - delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, val_data, window, learn_edges, - all_quads, args, split_mode='val') for i in range(num_processes)) -end = timeit.default_timer() + num_queries = len(val_data) // num_processes -perf_list_val = [] -hits_list_val = [] + output = Parallel(n_jobs=num_processes)( + delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, val_data, window, learn_edges, + all_quads, args, split_mode='val') for i in range(num_processes)) + end = timeit.default_timer() -for i in range(num_processes): - perf_list_val.extend(output[i][0]) - hits_list_val.extend(output[i][1]) + perf_list_val = [] + hits_list_val = [] + + for i in range(num_processes): + perf_list_val.extend(output[i][0]) + hits_list_val.extend(output[i][1]) +else: + 
perf_list_val = [0] + hits_list_val = [0] + end_valid = timeit.default_timer() # compute test mrr +if log_per_rel ==True: + num_processes = 1 #otherwise logging per rel does not work for our implementation start_test = timeit.default_timer() print('Computing test MRR') start = timeit.default_timer() @@ -360,15 +386,19 @@ def get_args(): output = Parallel(n_jobs=num_processes)( delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, test_data, window, learn_edges, - all_quads, args, split_mode='test') for i in range(num_processes)) + all_quads, args, split_mode='test', log_per_rel=log_per_rel, num_rels=num_rels) for i in range(num_processes)) end = timeit.default_timer() perf_list_all = [] hits_list_all = [] + for i in range(num_processes): perf_list_all.extend(output[i][0]) hits_list_all.extend(output[i][1]) +if log_per_rel == True: + perf_per_rel = output[0][2] + total_time = round(end - start, 6) total_valid_time = round(end_valid - start_valid, 6) @@ -393,9 +423,13 @@ def get_args(): os.mkdir(results_path) print('INFO: Create directory {}'.format(results_path)) Path(results_path).mkdir(parents=True, exist_ok=True) -results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json' +if log_per_rel == True: + results_filename = f'{results_path}/{MODEL_NAME}_{DATA}_results_per_rel.json' + with open(results_filename, 'w') as json_file: + json.dump(perf_per_rel, json_file) +results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json' metric = dataset.eval_metric save_results({'model': MODEL_NAME, 'train_flag': None, diff --git a/examples/linkproppred/tkgl-yago/cen.py b/examples/linkproppred/tkgl-yago/cen.py index 8200e1d..46caeb5 100644 --- a/examples/linkproppred/tkgl-yago/cen.py +++ b/examples/linkproppred/tkgl-yago/cen.py @@ -20,7 +20,8 @@ sys.path.append(tgb_modules_path) from tgb_modules.rrgcn import RecurrentRGCNCEN from tgb.utils.utils import set_random_seed, split_by_time, save_results -from tgb_modules.tkg_utils import get_args_cen, build_sub_graph, reformat_ts +from tgb_modules.tkg_utils import get_args_cen, reformat_ts +from tgb_modules.tkg_utils_dgl import build_sub_graph from tgb.linkproppred.evaluate import Evaluator from tgb.linkproppred.dataset import LinkPropPredDataset @@ -351,7 +352,7 @@ def run_experiment(args, trainvalidtest_id=0, n_hidden=None, n_layers=None, drop MODEL_NAME = 'CEN' print("logging mrrs per relation: ", args.log_per_rel) -print("do train? do only test no validation?: ", args.trainflag, args.test_only) +print("do test and valid? 
do only test no validation?: ", args.validtest, args.test_only) # load data dataset = LinkPropPredDataset(name=DATA, root="datasets", preprocess=True) @@ -393,22 +394,24 @@ def run_experiment(args, trainvalidtest_id=0, n_hidden=None, n_layers=None, drop else: start_train = timeit.default_timer() - if args.trainflag: + if args.validtest: + print('directly start testing') + if args.test_history_len_2 != args.test_history_len: + args.test_history_len = args.test_history_len_2 # hyperparameter value as given in original paper + else: print('running pretrain and train') # pretrain mrr, _, _ = run_experiment(args, trainvalidtest_id=-1) # train - mrr, args.test_history_len = run_experiment(args, trainvalidtest_id=0) # overwrite test_history_len with - # the best history len (for valid mrr) - else: - print('directly start testing') - if args.test_history_len_2 != args.test_history_len: - args.test_history_len = args.test_history_len_2 # hyperparameter value as given in original paper + mrr, args.test_history_len, _ = run_experiment(args, trainvalidtest_id=0) # overwrite test_history_len with + # the best history len (for valid mrr) if args.test_only == False: print("running test (on val and test dataset) with test_history_len of: ", args.test_history_len) # test on val set val_mrr, _, _ = run_experiment(args, trainvalidtest_id=1) + else: + val_mrr = 0 # test on test set start_test = timeit.default_timer() diff --git a/examples/linkproppred/tkgl-yago/regcn.py b/examples/linkproppred/tkgl-yago/regcn.py index ff502b1..29206ff 100644 --- a/examples/linkproppred/tkgl-yago/regcn.py +++ b/examples/linkproppred/tkgl-yago/regcn.py @@ -20,7 +20,8 @@ sys.path.append(tgb_modules_path) from tgb_modules.rrgcn import RecurrentRGCNREGCN from tgb.utils.utils import set_random_seed, split_by_time, save_results -from tgb_modules.tkg_utils import get_args_regcn, build_sub_graph, reformat_ts +from tgb_modules.tkg_utils import get_args_regcn, reformat_ts +from tgb_modules.tkg_utils_dgl import build_sub_graph from tgb.linkproppred.evaluate import Evaluator from tgb.linkproppred.dataset import LinkPropPredDataset import json diff --git a/examples/linkproppred/tkgl-yago/tlogic.py b/examples/linkproppred/tkgl-yago/tlogic.py index 3461344..5fcb5d4 100644 --- a/examples/linkproppred/tkgl-yago/tlogic.py +++ b/examples/linkproppred/tkgl-yago/tlogic.py @@ -72,7 +72,8 @@ def learn_rules(i, num_relations): return rl.rules_dict -def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edges, all_quads, args, split_mode): +def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edges, all_quads, args, split_mode, + log_per_rel=False, num_rels=0): """ Apply rules (multiprocessing possible). 
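For orientation, the `args = [[0.1, 0.5]]` pair that appears in all of these tlogic scripts parameterizes TLogic's candidate scoring. To our understanding (an assumed form following the TLogic paper; the exact function is not shown in this diff), each `(lmbda, a)` pair blends a rule's confidence with the recency of the walk supporting the candidate:

```python
import math

# hypothetical reconstruction of TLogic-style candidate scoring (assumption)
def candidate_score(rule_conf: float, t_query: int, t_walk: int,
                    lmbda: float = 0.1, a: float = 0.5) -> float:
    # convex combination of rule confidence and an exponential time decay
    return a * rule_conf + (1 - a) * math.exp(lmbda * (t_walk - t_query))

print(candidate_score(0.8, t_query=100, t_walk=95))  # recent supporting walk
print(candidate_score(0.8, t_query=100, t_walk=20))  # stale walk scores lower
```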
@@ -84,7 +85,9 @@ def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edg
         hits_list (list): hits list (hits@10 per sample)
         perf_list (list): performance list (mrr per sample)
     """
-
+    perf_per_rel = {rel: [] for rel in range(num_rels)}
     print("Start process", i, "...")
     all_candidates = [dict() for _ in range(len(args))]
     no_cands_counter = 0
@@ -207,9 +210,20 @@ def apply_rules(i, num_queries, rules_dict, neg_sampler, data, window, learn_edg
             predictions = evaluator.eval(input_dict)
             perf_list[index] = predictions['mrr']
             hits_list[index] = predictions['hits@10']
+            if split_mode == "test" and log_per_rel:
+                perf_per_rel[test_query[1]].append(perf_list[index])  # test_query[1] is the relation index
+
+    if split_mode == "test" and log_per_rel:
+        for rel in range(num_rels):
+            if len(perf_per_rel[rel]) > 0:
+                perf_per_rel[rel] = float(np.mean(perf_per_rel[rel]))
+            else:
+                perf_per_rel.pop(rel)
 
-    return perf_list, hits_list
+    return perf_list, hits_list, perf_per_rel
 
 ## args
@@ -229,6 +243,8 @@ def get_args():
     parser.add_argument('--run_nr', type=int, help='Run Number', default=1)
     parser.add_argument('--learn_rules_flag', type=bool, help='Do we want to learn the rules', default=True)
     parser.add_argument('--rule_filename', type=str, help='if rules not learned: where are they stored', default='0_r[3]_n100_exp_s1_rules.json')
+    parser.add_argument('--log_per_rel', type=bool, help='Do we want to log mrr per relation', default=False)
+    parser.add_argument('--compute_valid_mrr', type=bool, help='Do we want to compute mrr for valid set', default=True)
+    # caveat: argparse's type=bool treats any non-empty string (including 'False') as True
 
     parsed = vars(parser.parse_args())
     return parsed
@@ -244,6 +260,7 @@ def get_args():
 num_processes = parsed["num_processes"]
 window = parsed["window"]
 top_k = parsed["top_k"]
+log_per_rel = parsed['log_per_rel']
 MODEL_NAME = 'TLogic'
 
 SEED = parsed['seed']  # set the random seed for consistency
@@ -251,6 +268,7 @@ def get_args():
 ## load dataset and prepare it accordingly
 name = parsed["dataset"]
+compute_valid_mrr = parsed["compute_valid_mrr"]
 dataset = LinkPropPredDataset(name=name, root="datasets", preprocess=True)
 DATA = name
@@ -334,25 +352,33 @@ def get_args():
     args = [[0.1, 0.5]]
 
 # compute valid mrr
-print('Computing valid MRR')
 start_valid = timeit.default_timer()
-num_queries = len(val_data) // num_processes
+if compute_valid_mrr:
+    print('Computing valid MRR')
 
-output = Parallel(n_jobs=num_processes)(
-    delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, val_data, window, learn_edges,
-                         all_quads, args, split_mode='val') for i in range(num_processes))
-end = timeit.default_timer()
+    num_queries = len(val_data) // num_processes
 
-perf_list_val = []
-hits_list_val = []
+    output = Parallel(n_jobs=num_processes)(
+        delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, val_data, window, learn_edges,
+                             all_quads, args, split_mode='val') for i in range(num_processes))
+    end = timeit.default_timer()
 
-for i in range(num_processes):
-    perf_list_val.extend(output[i][0])
-    hits_list_val.extend(output[i][1])
+    perf_list_val = []
+    hits_list_val = []
+
+    for i in range(num_processes):
+        perf_list_val.extend(output[i][0])
+        hits_list_val.extend(output[i][1])
+else:
+    perf_list_val = [0]
+    hits_list_val = [0]
+
+end_valid = timeit.default_timer()
 
 # compute test mrr
+if log_per_rel:
+    num_processes = 1  # otherwise logging per rel does not work for our implementation
 start_test = timeit.default_timer()
 print('Computing test MRR')
 start = timeit.default_timer()
@@ -360,15 +386,19 @@ def get_args():
output = Parallel(n_jobs=num_processes)(
     delayed(apply_rules)(i, num_queries,rules_dict, neg_sampler, test_data, window, learn_edges,
-                         all_quads, args, split_mode='test') for i in range(num_processes))
+                         all_quads, args, split_mode='test', log_per_rel=log_per_rel, num_rels=num_rels) for i in range(num_processes))
 end = timeit.default_timer()
 
 perf_list_all = []
 hits_list_all = []
+
 for i in range(num_processes):
     perf_list_all.extend(output[i][0])
     hits_list_all.extend(output[i][1])
 
+if log_per_rel:
+    perf_per_rel = output[0][2]
+
 total_time = round(end - start, 6)
 total_valid_time = round(end_valid - start_valid, 6)
@@ -393,9 +423,13 @@ def get_args():
     os.mkdir(results_path)
     print('INFO: Create directory {}'.format(results_path))
 Path(results_path).mkdir(parents=True, exist_ok=True)
-results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json'
+if log_per_rel:
+    results_filename = f'{results_path}/{MODEL_NAME}_{DATA}_results_per_rel.json'
+    with open(results_filename, 'w') as json_file:
+        json.dump(perf_per_rel, json_file)
+results_filename = f'{results_path}/{MODEL_NAME}_NONE_{DATA}_results.json'
 
 metric = dataset.eval_metric
 save_results({'model': MODEL_NAME,
               'train_flag': None,
diff --git a/stats_figures/README.md b/stats_figures/README.md
new file mode 100644
index 0000000..603fd85
--- /dev/null
+++ b/stats_figures/README.md
@@ -0,0 +1,31 @@
+# How to compute stats and figures
+For each script, you can specify the datasets of interest in the list at the beginning.
+The same applies to the methods of interest when correlating method results with dataset stats.
+
+For *creating the figures* you only need to run 2. and 5., provided that you have previously stored all stats in the respective dataset subfolder. A typical run order is sketched below.
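+
+All five scripts are configured by editing the lists at the top of each file (they take no
+command-line arguments), so a full, hypothetical end-to-end pass would be:
+
+```bash
+python stats_figures/compute_dataset_stats.py           # 1. dataset stats + edges per timestep
+python stats_figures/create_edges_figures.py            # 2. figures: edges per timestep
+python stats_figures/compute_relation_dataset_stats.py  # 3. per-relation stats
+python stats_figures/compute_relation_results_df.py     # 4. merge per-relation MRRs of the methods
+python stats_figures/create_relation_figures.py         # 5. figures: pie charts, MRR per relation
+```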
+
+## 1. compute_dataset_stats.py
+- loads the datasets, computes all stats that we report in the paper table, and writes them to stats_figures/dataset_name/dataset_stats.csv
+- computes the number of edges and nodes per timestep and stores them in stats_figures/dataset_name/figs/numedges_datasetname.json (used to create the figures)
+- saves the timestamps in timestamps.csv
+- this can be a bit slow (a few hours for all datasets; especially computing seasonality is slow)
+
+## 2. create_edges_figures.py
+- makes the *figures with number of edges per timestep* (bins)
+- input needed: dataset_name/numedges_datasetname.json and dataset_name/dataset_stats.csv (output from 1.)
+- output figures stored in dataset_name/num_edges_discretized_numbin_datasetname.pdf and png, where numbin is the number of bins
+- I usually use num_edges_discretized_{num_bars}_{dataset_name}2.pdf
+
+## 3. compute_relation_dataset_stats.py
+- computes the statistics for each relation based on the dataset, e.g. number of occurrences, recurrency degree
+- outputs: csv file with relation stats in dataset_name/stats/relation_statistics_dataset_name.csv
+- comment: for icews and polecat I manually added the strings for the 10 most frequently occurring relations for the plots
+
+## 4. compute_relation_results_df.py
+- adds the results for selected methods for each relation, in head and tail direction, as new columns to dataset_name/stats/relation_statistics_dataset_name.csv
+- input needed: relation_statistics_dataset_name.csv (from 3.) and the results_per_relation files, e.g. examples/linkpropprediction/tkgl-polecat/saved_results/REGCN_tkgl-polecat_results_per_rel.json
+
+## 5. create_relation_figures.py
+- creates the figures for MRR per relation
+- outputs: figures (*pie charts*)
+- input needed:
\ No newline at end of file
diff --git a/stats_figures/compute_dataset_stats.py b/stats_figures/compute_dataset_stats.py
new file mode 100644
index 0000000..a99da30
--- /dev/null
+++ b/stats_figures/compute_dataset_stats.py
@@ -0,0 +1,350 @@
+"""
+This script computes statistics for all datasets in TGB2.
+Basically everything that we report in the paper table, as well as some additional statistics like the number of edges per timestep.
+Needed:
+datasets in the dataset folder (no preprocessing needed)
+Output:
+dataset_stats.csv  # statistics for a dataset - stored in the respective dataset folder
+numedges_datasetname.json  # number of edges per timestep (to create the figures)
+"""
+
+## imports
+import numpy as np
+import sys
+import os
+import os.path as osp
+tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..'))
+sys.path.append(tgb_modules_path)
+import json
+from datetime import datetime
+import matplotlib.pyplot as plt
+
+# internal imports
+from tgb.linkproppred.dataset import LinkPropPredDataset
+from tgb_modules.tkg_utils import reformat_ts, get_original_ts
+import stats_figures.dataset_utils as du
+
+
+# specify all datasets
+names = ['thgl-myket'] #, 'tkgl-smallpedia','tkgl-polecat', 'thgl-software', 'tkgl-icews','thgl-github', 'thgl-forum', 'tkgl-wikidata', 'thgl-myket','tkgl-yago']
+for dataset_name in names:
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    # Navigate one folder up
+    parent_dir = os.path.dirname(current_dir)
+    figs_dir = os.path.join( current_dir, dataset_name, 'figs')
+    if not os.path.exists(figs_dir):
+        os.makedirs(figs_dir)
+
+    # load dataset
+    dataset = LinkPropPredDataset(name=dataset_name, root="datasets", preprocess=True)
+
+    relations = dataset.edge_type
+    num_rels = dataset.num_rels
+    if 'tkgl' in dataset_name:
+        num_rels_without_inv = int(num_rels/2)
+    else:
+        num_rels_without_inv = num_rels
+
+    rels = np.arange(0,num_rels)
+    subjects = dataset.full_data["sources"]
+    objects= dataset.full_data["destinations"]
+    num_nodes = dataset.num_nodes
+    timestamps_orig = dataset.full_data["timestamps"]
+    timestamps = reformat_ts(timestamps_orig, dataset_name)  # stepsize:1
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    csv_dir = os.path.join( current_dir, dataset_name)
+    np.savetxt(csv_dir +"/"+dataset_name+"timestamps.csv", timestamps,fmt='%i', delimiter=",")
+    all_quads = np.stack((subjects, relations, objects, timestamps, timestamps_orig), axis=1)
+    train_data = all_quads[dataset.train_mask]
+    val_data = all_quads[dataset.val_mask]
+    test_data = all_quads[dataset.test_mask]
+    collision_trainval = np.intersect1d(list(set(timestamps_orig[dataset.train_mask])), list(set(timestamps_orig[dataset.val_mask])))
+    collision_valtest = np.intersect1d(list(set(timestamps_orig[dataset.val_mask])), list(set(timestamps_orig[dataset.test_mask])))
+    if len(collision_trainval) > 0:  # check if there is a collision between train and val set
+        print("!!!!!!!!!Collision between train and val set!!!!!!!!!")
+    if len(collision_valtest) > 0:  # check if there is a collision between val and test set
+        print("!!!!!!!!!Collision between val and test set!!!!!!!!!")
+    print(subjects.shape)
+
+    first_ts = timestamps_orig[0]
+    last_ts = timestamps_orig[-1]
+
+    # timestamp strings for figure (first and last timestamp in dataset)
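+    # (plain year numbers for the Wikidata-derived tkgl sets, Unix epoch seconds
+    # otherwise: full datetime resolution for thgl, day resolution for the rest)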
+    if 'wikidata' in dataset_name or 'smallpedia' in dataset_name or 'yago' in dataset_name:
+        first_ts_string = str(first_ts)
+        last_ts_string = str(last_ts)
+    elif 'thgl' in dataset_name:
+        first_ts_string = datetime.utcfromtimestamp(first_ts).strftime('%Y-%m-%d %H:%M:%S')
+        last_ts_string = datetime.utcfromtimestamp(last_ts).strftime('%Y-%m-%d %H:%M:%S')
+    else:
+        first_ts_string = datetime.utcfromtimestamp(first_ts).strftime('%Y-%m-%d')
+        last_ts_string = datetime.utcfromtimestamp(last_ts).strftime('%Y-%m-%d')
+
+    print(dataset_name, "first timestamp:", first_ts_string, "last timestamp:", last_ts_string)
+
+
+    # compute number of quads in train/val/test set
+    num_train_quads = train_data.shape[0]
+    num_val_quads = val_data.shape[0]
+    num_test_quads = test_data.shape[0]
+    num_all_quads = num_train_quads + num_val_quads + num_test_quads
+    print(num_all_quads)
+
+    # compute inductive nodes
+    test_ind_nodes = du.num_nodes_not_in_train(train_data, test_data)
+    val_ind_nodes = du.num_nodes_not_in_train(train_data, val_data)
+    test_ind_nodes_perc = test_ind_nodes/num_nodes
+    val_ind_nodes_perc = val_ind_nodes/num_nodes
+
+    # compute number of timesteps in train/val/test set
+    num_train_timesteps = len(np.unique(train_data[:,-1]))
+    num_val_timesteps = len(np.unique(val_data[:,-1]))
+    num_test_timesteps = len(np.unique(test_data[:,-1]))
+    num_all_ts = num_train_timesteps + num_val_timesteps + num_test_timesteps
+
+    # compute number of nodes in the valid or test set that have not been seen in the train set
+    # compute recurrency degree
+    # compute average duration of facts
+    timestep_range = 1+np.max(timestamps) - np.min(timestamps)
+    all_possible_timestep_indices = [i for i in range(timestep_range)]
+    ts_all = du.TripleSet()
+    ts_all.add_triples(all_quads, num_rels_without_inv, timestep_range)
+    ts_all.compute_stat()
+    ts_test = du.TripleSet()
+    ts_test.add_triples(test_data, num_rels_without_inv, timestep_range)
+    ts_test.compute_stat()
+
+    lens = []
+    for timesteps in ts_all.timestep_lists:
+        lens.append(len(timesteps))
+
+    count_previous = 0
+    count_sometime = 0
+    count_all = 0
+    for qtriple in ts_test.triples:
+        (s,r,o,t) = qtriple
+        k = ts_all.get_latest_ts(s,r,o, t)
+        count_all += 1
+        if k + 1 == t: count_previous += 1
+        if k > -1 and k < t: count_sometime += 1
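+    # count_previous: test triple was already present at the directly preceding
+    # timestep (-> direct recurrency degree, DRec); count_sometime: present at
+    # any earlier timestep (-> recurrency degree, Rec)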
+    print("DATASET: " + dataset_name)
+    print("all: " + str(count_all))
+    print("previous: " + str(count_previous))
+    print("sometime: " + str(count_sometime))
+    print("f-direct (DRec): " + str(count_previous / count_all))
+    print("f-sometime (Rec): " + str(count_sometime / count_all))
+
+    print(f"the mean number of timesteps that a triple appears in is {np.mean(lens)}")
+    print(f"the median number of timesteps that a triple appears in is {np.median(lens)}")
+    print(f"the maximum number of timesteps that a triple appears in is {np.max(lens)}")
+
+    # Compute max consecutive timesteps per triple
+    results = [du.max_consecutive_numbers(inner_list) for inner_list in ts_all.timestep_lists]
+    print(f"number of timesteps is {ts_all.num_timesteps}")
+    print(f"number of total triples is {ts_all.num_triples}")
+    print(f"number of distinct triples is {len(ts_all.timestep_lists)}")
+    print(f"the mean max number of 100*consecutive timesteps/number of timesteps that a triple appears in is {100*np.mean(results)/ts_all.num_timesteps}")
+    print(f"the median max number of 100*consecutive timesteps/number of timesteps that a triple appears in is {100*np.median(results)/ts_all.num_timesteps}")
+    print(f"the maximum max number of 100*consecutive timesteps/number of timesteps that a triple appears in is {100*np.max(results)/ts_all.num_timesteps}")
+    print(f"the mean max number of consecutive timesteps that a triple appears in is {np.mean(results)}")
+    print(f"the median max number of consecutive timesteps that a triple appears in is {np.median(results)}")
+    print(f"the maximum max number of consecutive timesteps that a triple appears in is {np.max(results)}")
+    print(f"the std for max number of consecutive timesteps that a triple appears in is {np.std(results)}")
+
+    direct_recurrency_degree = count_previous / count_all
+    recurrency_degree = count_sometime / count_all
+    consecutiveness_degree = np.mean(results)  # the mean max number of consecutive timesteps that a triple appears in
+
+
+    # compute number of triples per timestep
+    n_nodes_list = []
+    n_edges_list = []
+
+    ts_set = list(set(timestamps_orig))
+    ts_set.sort()
+    ts_dist = ts_set[1] - ts_set[0]
+    if 'tkg' in dataset_name:
+        all_possible_orig_timestamps = get_original_ts(all_possible_timestep_indices, ts_dist, np.min(ts_set))
+
+    no_nodes_list = []
+    no_nodes_list_orig = []
+    no_nodes_datetime = []
+    for t in ts_all.t_2_triple.keys():
+        num_nodes_ts = len(ts_all.unique_nodes(ts_all.t_2_triple[t]))
+        n_nodes_list.append(num_nodes_ts)
+        n_edges_list.append(len(ts_all.t_2_triple[t]))
+        if 'tkg' in dataset_name:
+            if num_nodes_ts == 0:
+                if t not in no_nodes_list:
+                    no_nodes_list.append(t)
+                    no_nodes_list_orig.append(all_possible_orig_timestamps[t])
+                    no_nodes_datetime.append(datetime.utcfromtimestamp(all_possible_orig_timestamps[t]))
+    # compute seasonality of num nodes over time:
+    seasonal_value = du.estimate_seasons(n_nodes_list)
+    if seasonal_value == 1:
+        print('no seasonality was found for the number of nodes')
+    else:
+        print(f'the seasonality for number of nodes is {seasonal_value}')
+    if 'tkgl' in dataset_name:
+        print('we have 0 nodes for ' + str(len(no_nodes_list)) + ' timesteps')
+        print('0 nodes for timesteps: ', no_nodes_list)
+        print('this is original unix timestamps: ', no_nodes_list_orig)
+        print('this is datetime: ', no_nodes_datetime)
+    else:
+        print('we have 0 nodes for ' + str(len(no_nodes_list)) + ' timesteps')
+
+
+    print(f"average number of triples per ts is {np.mean(n_edges_list)}")
+    print(f"std for average number of triples per ts is {np.std(n_edges_list)}")
+    print(f"min/max number of triples per ts is {np.min(n_edges_list), np.max(n_edges_list)}")
+
+    print(f"average number of nodes per ts is {np.mean(n_nodes_list)}")
+    print(f"std for average number of nodes per ts is {np.std(n_nodes_list)}")
+    print(f"min/max number of nodes per ts is {np.min(n_nodes_list), np.max(n_nodes_list)}")
+    # colortgb = '#60ab84'
+    # fontsize =12
+    # labelsize=12
+    # bars_list = [20]
+    # for num_bars in bars_list:
+    #     if num_bars < 100:
+    #         capsize=2
+    #         capthick=2
+    #         elinewidth=2
+    #     else:
+    #         capsize=1
+    #         capthick=1
+    #         elinewidth=1
+    #     ts_discretized_mean, ts_discretized_sum, ts_discretized_min, ts_discretized_max, start_indices, end_indices, mid_indices = du.discretize_values(n_edges_list, num_bars)
+    #     plt.figure()
+    #     plt.tick_params(axis='both', which='major', labelsize=labelsize)
+    #     # plt.bar(mid_indices, ts_discretized_mean, width=(len(n_edges_list) // num_bars), label ='Mean Value', color =colortgb)
+    #     plt.step(mid_indices, ts_discretized_mean, where='mid', linestyle='-', label ='Mean Value', color=colortgb)
+    #     plt.scatter(mid_indices, ts_discretized_min, label ='min value')
+    #     plt.scatter(mid_indices, ts_discretized_max, label ='max value')
+    #     plt.xlabel('Timestep', fontsize=fontsize)
+    #
plt.ylabel('Number of Edges', fontsize=fontsize) + # plt.legend() + # #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps') + # modified_dataset_name = dataset_name.replace('-', '_') + # current_dir = os.path.dirname(os.path.abspath(__file__)) + # # Navigate one folder up + # parent_dir = os.path.dirname(current_dir) + # figs_dir = os.path.join( parent_dir, modified_dataset_name, 'figs') + # # Create the 'figs' directory if it doesn't exist + # if not os.path.exists(figs_dir): + # os.makedirs(figs_dir) + # save_path = (os.path.join(figs_dir, f"num_edges_discretized_{num_bars}_{dataset_name}.png")) + # plt.savefig(save_path, bbox_inches='tight') + # save_path = (os.path.join(figs_dir, f"num_edges_discretized_{num_bars}_{dataset_name}.pdf")) + # plt.savefig(save_path, bbox_inches='tight') + + # plt.figure() + # plt.tick_params(axis='both', which='major', labelsize=labelsize) + # mins = np.array(ts_discretized_min) + # maxs = np.array(ts_discretized_max) + # means = np.array(ts_discretized_mean) + # # plt.bar(mid_indices, ts_discretized_mean, width=(len(n_edges_list) // num_bars), label='Mean', color =colortgb) + # plt.step(mid_indices, ts_discretized_mean, where='mid', linestyle='-', label ='Mean Value', color=colortgb, linewidth=2) + # #plt.scatter(mid_indices, ts_discretized_mean, label ='Mean Value', color=colortgb) + # plt.errorbar(mid_indices, maxs, yerr=[maxs-mins, maxs-maxs], fmt='none', alpha=0.9, color='grey',capsize=capsize, capthick=capthick, elinewidth=elinewidth, label='Min-Max Range') + # plt.xlabel('Timestep', fontsize=fontsize) + # plt.ylabel('Number of Edges', fontsize=fontsize) + # plt.legend() + # #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps') + # plt.show() + # save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2.png")) + # plt.savefig(save_path2, bbox_inches='tight') + # save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2.pdf")) + # plt.savefig(save_path2, bbox_inches='tight') + + # plt.figure() + # plt.tick_params(axis='both', which='major', labelsize=labelsize) + # mins = np.array(ts_discretized_min) + # maxs = np.array(ts_discretized_max) + # means = np.array(ts_discretized_mean) + # plt.bar(mid_indices, ts_discretized_sum, width=(len(n_edges_list) // num_bars), label='Sum', color =colortgb) + # # plt.step(mid_indices, ts_discretized_mean, where='mid', linestyle='-', label ='Mean Value', color=colortgb) + # # plt.errorbar(mid_indices, sums, yerr=[mins, maxs], fmt='none', alpha=0.9, color='grey',capsize=1.5, capthick=1.5, elinewidth=2, label='Min-Max Range') + # plt.xlabel('Timestep', fontsize=fontsize) + # plt.ylabel('Number of Edges', fontsize=fontsize) + # plt.legend() + # #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps') + # plt.show() + # save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}3.png")) + # plt.savefig(save_path2, bbox_inches='tight') + # save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}3.pdf")) + # plt.savefig(save_path2, bbox_inches='tight') + + # try: + # plt.figure() + # plt.tick_params(axis='both', which='major', labelsize=labelsize) + # mins = np.array(ts_discretized_min) + # maxs = np.array(ts_discretized_max) + # means = np.array(ts_discretized_mean) + # # plt.bar(mid_indices, ts_discretized_mean, width=(len(n_edges_list) // num_bars), label='Mean', color =colortgb) + # plt.step(mid_indices, 
ts_discretized_mean, where='mid', linestyle='-', label ='Mean Value', color=colortgb)
    #         #plt.scatter(mid_indices, ts_discretized_mean, label ='Mean Value', color=colortgb)
    #         plt.errorbar(mid_indices, maxs, yerr=[maxs-mins, maxs-maxs], fmt='none', alpha=0.9, color='grey',capsize=capsize, capthick=capthick, elinewidth=elinewidth, label='Min-Max Range')
    #         plt.xlabel('Timestep', fontsize=fontsize)
    #         plt.ylabel('Number of Edges', fontsize=fontsize)
    #         #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps')
    #         plt.yscale('log')
    #         plt.legend(fontsize=fontsize)
    #         plt.show()
    #         save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2log.png"))
    #         plt.savefig(save_path2, bbox_inches='tight')
    #         save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2log.pdf"))
    #         plt.savefig(save_path2, bbox_inches='tight')
    #     except:
    #         print('Could not plot log scale')
    #     plt.close('all')

    #     plt.figure()
    #     plt.tick_params(axis='both', which='major', labelsize=labelsize)
    #     plt.scatter(range(ts_all.num_timesteps), n_edges_list, s=0.2)
    #     plt.xlabel('Timestep', fontsize=fontsize)
    #     plt.ylabel('number of triples', fontsize=fontsize)
    #     #plt.title(f'Number of triples per timestep for {dataset_name}')
    #     # save
    #     # Get the current directory of the script
    #     current_dir = os.path.dirname(os.path.abspath(__file__))
    #     # Navigate one folder up
    #     parent_dir = os.path.dirname(current_dir)
    #     # Save stats_dict as CSV
    #     modified_dataset_name = dataset_name.replace('-', '_')
    #     save_path = (os.path.join(figs_dir,f"num_edges_per_ts_{dataset_name}.png"))
    #     plt.savefig(save_path, bbox_inches='tight')


    # create a dict with the number of edges and the number of nodes per timestep
    to_be_saved_dict = {}
    to_be_saved_dict['num_edges'] = n_edges_list
    to_be_saved_dict['num_nodes'] = n_nodes_list
    parent_dir = os.path.dirname(current_dir)
    save_path = (os.path.join(figs_dir,f"numedges_{dataset_name}.json"))
    save_file = open(save_path, "w")
    json.dump(to_be_saved_dict, save_file)
    save_file.close()

    # plt.figure()
    # plt.scatter(range(ts_all.num_timesteps), n_nodes_list, s=0.2)
    # plt.xlabel('Timestep', fontsize=fontsize)
    # plt.ylabel('number of nodes', fontsize=fontsize)
    # #plt.title(f'Number of nodes per timestep for {dataset_name}')
    # save_path = (os.path.join(figs_dir,f"num_nodes_per_ts_{dataset_name}.png"))
    # plt.savefig(save_path, bbox_inches='tight')
    # plt.close('all')


    # save the statistics in a dictionary
    du.create_dict_and_save(dataset_name, num_rels_without_inv, num_nodes, num_train_quads, num_val_quads, num_test_quads,
                            num_all_quads, num_train_timesteps, num_val_timesteps, num_test_timesteps, num_all_ts,
                            test_ind_nodes, test_ind_nodes_perc, val_ind_nodes, val_ind_nodes_perc,
                            direct_recurrency_degree, recurrency_degree, consecutiveness_degree,
                            np.mean(n_edges_list), np.std(n_edges_list), np.min(n_edges_list), np.max(n_edges_list),
                            np.mean(n_nodes_list), np.std(n_nodes_list), np.min(n_nodes_list), np.max(n_nodes_list),
                            seasonal_value, collision_trainval, collision_valtest, first_ts_string, last_ts_string)
diff --git a/tgb/datasets/dataset_scripts/compute_relation_dataset_stats.py b/stats_figures/compute_relation_dataset_stats.py
similarity index 64%
rename from tgb/datasets/dataset_scripts/compute_relation_dataset_stats.py
rename to stats_figures/compute_relation_dataset_stats.py
index 042c27c..eaeac1c 100644
--- a/tgb/datasets/dataset_scripts/compute_relation_dataset_stats.py
+++ b/stats_figures/compute_relation_dataset_stats.py
@@ -5,7 +5,7 @@ import os
 import os.path as osp
 from pathlib import Path
-tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
+tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..'))
 sys.path.append(tgb_modules_path)
 import matplotlib.pyplot as plt
 import numpy as np
@@ -13,37 +13,34 @@
 #internal imports
 from tgb.linkproppred.dataset import LinkPropPredDataset
-from tgb_modules.tkg_utils import reformat_ts
-import tgb.datasets.dataset_scripts.dataset_utils as du
+from tgb_modules.tkg_utils import reformat_ts
+import stats_figures.dataset_utils as du
 
 # specify params
-names = [ 'tkgl-wikidata'] #'tkgl-polecat','tkgl-smallpedia', 'tkgl-yago', 'tkgl-icews' ,'tkgl-smallpedia','thgl-myket','tkgl-yago', 'tkgl-icews','thgl-github', 'thgl-forum', 'tkgl-wikidata']
-colortgb = '#60ab84'
+names = [ 'tkgl-polecat'] #'tkgl-polecat','tkgl-smallpedia', 'tkgl-yago', 'tkgl-icews' ,'tkgl-smallpedia','thgl-myket','tkgl-yago', 'tkgl-icews','thgl-github', 'thgl-forum', 'tkgl-wikidata']
+colortgb = '#60ab84'  # tgb logo colors
 colortgb2 = '#eeb641'
 colortgb3 = '#dd613a'
-#colortgb4 ='#bce9ef'
-#colortgb5 ='#d6e9d9'
 colors = [colortgb,colortgb2,colortgb3] # from tgb logo
 capsize=1.5
 capthick=1.5
 elinewidth=1.5
 occ_threshold = 5
-k=10 # how many slices in the cake +1
+k=10 # how many slices in the cake (+1 will be added for "others")
 plots_flag = True
 
 # run through each dataset
 for dataset_name in names:
     ############################## LOAD DATA ##############################
-    print(dataset_name)
-    modified_dataset_name = dataset_name.replace('-', '_')
+    print(dataset_name)
     current_dir = os.path.dirname(os.path.abspath(__file__))
     # Navigate one folder up
-    parent_dir = os.path.dirname(current_dir)
-    figs_dir = os.path.join( parent_dir, modified_dataset_name, 'figs')
+
+    figs_dir = os.path.join( current_dir, dataset_name, 'figs')
     # Create the 'figs' directory if it doesn't exist
     if not os.path.exists(figs_dir):
         os.makedirs(figs_dir)
-    stats_dir = os.path.join( parent_dir, modified_dataset_name, 'stats')
+    stats_dir = os.path.join( current_dir, dataset_name, 'stats')
     if not os.path.exists(stats_dir):
         os.makedirs(stats_dir)
@@ -63,9 +60,7 @@
     timestamps_orig = dataset.full_data["timestamps"]
     timestamps = reformat_ts(timestamps_orig, dataset_name) # stepsize:1
     current_dir = os.path.dirname(os.path.abspath(__file__))
-    parent_dir = os.path.dirname(current_dir)
-    modified_dataset_name = dataset_name.replace('-', '_')
-    csv_dir = os.path.join( parent_dir, modified_dataset_name)
+    csv_dir = os.path.join( current_dir, dataset_name)
     np.savetxt(csv_dir +"/"+dataset_name+"timestamps.csv", timestamps,fmt='%i', delimiter=",")
     all_quads = np.stack((subjects, relations, objects, timestamps, timestamps_orig), axis=1)
     train_data = all_quads[dataset.train_mask]
@@ -75,8 +70,8 @@
     # Read the CSV file into a DataFrame
     rel_type2id_dict = {}
     rel_id2type_dict = {}
-    if 'wikidata' in dataset_name or 'smallpedia' in dataset_name:
-        csv_dir = os.path.join( parent_dir, modified_dataset_name, dataset_name+'_edgelist.csv')
+    if 'wikidata' in dataset_name or 'smallpedia' in dataset_name:  # otherwise I add it manually
+        csv_dir = os.path.join( current_dir, dataset_name, dataset_name+'_edgelist.csv')
         df = pd.read_csv(csv_dir)
         # Create a dictionary mapping the entries in the 'relation_type' column to IDs
@@ -214,68 +209,70 @@
     df_sorted.to_csv(os.path.join(stats_dir,
f"relation_statistics_{dataset_name}.csv"), index=False) - ###################### Figures ############################## - ##PIE CHART - # Repeat the colors to match the number of slices - if plots_flag: - num_slices = len(plot_names) - repeated_colors = (colors * ((num_slices // len(colors)) + 1))[:num_slices] - plt.figure(figsize=(8, 8)) - plt.pie(plot_names.values(), labels=plot_names.keys(), autopct='%1.f%%', startangle=140, - colors=repeated_colors) - #plt.title(f'Pie Chart of Top {k} Relations and "Others"') - plt.axis('equal') - save_path = (os.path.join(figs_dir, f"rel_pie_{dataset_name}.png")) - plt.savefig(save_path, bbox_inches='tight') + # ###################### Figures ############################## I moved them to create_relation_figures.py + # ##PIE CHART + # # Repeat the colors to match the number of slices + # if plots_flag: + # num_slices = len(plot_names) + # repeated_colors = (colors * ((num_slices // len(colors)) + 1))[:num_slices] + # plt.figure(figsize=(8, 8)) + # plt.pie(plot_names.values(), labels=plot_names.keys(), autopct='%1.f%%', startangle=140, + # colors=repeated_colors) + # #plt.title(f'Pie Chart of Top {k} Relations and "Others"') + # plt.axis('equal') + # save_path = (os.path.join(figs_dir, f"rel_pie_{dataset_name}.png")) + # plt.savefig(save_path, bbox_inches='tight') - ## TRIPLES PER RELATION - plt.figure() - plt.bar(rels_occurences.keys(), rels_occurences.values(), color=colortgb) - plt.xlabel('Relation') - plt.ylabel('Number of Triples') - #plt.title('Number of Triples per Relation') - save_path = (os.path.join(figs_dir, f"rel_tripperrel_{dataset_name}.png")) - plt.savefig(save_path, bbox_inches='tight') + # ## TRIPLES PER RELATION + # plt.figure() + # plt.bar(rels_occurences.keys(), rels_occurences.values(), color=colortgb) + # plt.xlabel('Relation') + # plt.ylabel('Number of Triples') + # #plt.title('Number of Triples per Relation') + # save_path = (os.path.join(figs_dir, f"rel_tripperrel_{dataset_name}.png")) + # plt.savefig(save_path, bbox_inches='tight') + + # ## NUMBER OF OCCURENCES OF TRIPLES PER RELATION + # plt.figure() + # mins = np.array([x[3] for x in statistics_dict_prominent.values()]) + # maxs = np.array([x[2] for x in statistics_dict_prominent.values()]) + # mean = np.array([x[0] for x in statistics_dict_prominent.values()]) + # std = np.array([x[1] for x in statistics_dict_prominent.values()]) + # # plt.bar(mean_max_min_dict.keys(), [x[0] for x in mean_max_min_dict.values()], color=colortgb) + # plt.scatter(statistics_dict_prominent.keys(), [x[0] for x in statistics_dict_prominent.values()], label ='mean value', color=colortgb) + # plt.scatter(statistics_dict_prominent.keys(), [x[4] for x in statistics_dict_prominent.values()], label ='median value', color='orange') + # # plt.errorbar(mean_std_max_min_dict.keys(), mean, yerr=std, fmt='none', alpha=0.9, color='grey',capsize=capsize, capthick=capthick, elinewidth=elinewidth, label='Std') + # plt.errorbar(statistics_dict_prominent.keys(), maxs, yerr=[maxs-mins, maxs-maxs], fmt='none', alpha=0.9, color='grey',capsize=capsize, capthick=capthick, elinewidth=elinewidth, label='Min-Max Range') + # plt.xlabel('Relation') + # plt.ylabel('Mean Number of Occurences of [subject, object]') + # #plt.title('Mean Number of Occurences of [subject, object] per Relation') + # plt.legend() + # #plt.yscale('log') + # save_path = (os.path.join(figs_dir, f"rel_mean_occurences_{dataset_name}.png")) + # plt.savefig(save_path, bbox_inches='tight') - ## NUMBER OF OCCURENCES OF TRIPLES PER RELATION - plt.figure() 
- mins = np.array([x[3] for x in statistics_dict_prominent.values()]) - maxs = np.array([x[2] for x in statistics_dict_prominent.values()]) - mean = np.array([x[0] for x in statistics_dict_prominent.values()]) - std = np.array([x[1] for x in statistics_dict_prominent.values()]) - # plt.bar(mean_max_min_dict.keys(), [x[0] for x in mean_max_min_dict.values()], color=colortgb) - plt.scatter(statistics_dict_prominent.keys(), [x[0] for x in statistics_dict_prominent.values()], label ='mean value', color=colortgb) - plt.scatter(statistics_dict_prominent.keys(), [x[4] for x in statistics_dict_prominent.values()], label ='median value', color='orange') - # plt.errorbar(mean_std_max_min_dict.keys(), mean, yerr=std, fmt='none', alpha=0.9, color='grey',capsize=capsize, capthick=capthick, elinewidth=elinewidth, label='Std') - plt.errorbar(statistics_dict_prominent.keys(), maxs, yerr=[maxs-mins, maxs-maxs], fmt='none', alpha=0.9, color='grey',capsize=capsize, capthick=capthick, elinewidth=elinewidth, label='Min-Max Range') - plt.xlabel('Relation') - plt.ylabel('Mean Number of Occurences of [subject, object]') - #plt.title('Mean Number of Occurences of [subject, object] per Relation') - plt.legend() - #plt.yscale('log') - save_path = (os.path.join(figs_dir, f"rel_mean_occurences_{dataset_name}.png")) - plt.savefig(save_path, bbox_inches='tight') + # ## bar plot that shows how many relations belong to the low occurence category vs high occurence category + # plt.figure() + # plt.bar(['Low Occurence', 'High Occurence'], [len(low_occurences), len(high_occurences)], color=colortgb) + # plt.xlabel('Occurence Category') + # plt.ylabel('Number of Relations') + # #plt.title('Number of Relations in Low and High Occurence Categories') + # save_path = (os.path.join(figs_dir, f"rel_occurence_categories_{dataset_name}.png")) + # plt.savefig(save_path, bbox_inches='tight') - ## bar plot that shows how many relations belong to the low occurence category vs high occurence category - plt.figure() - plt.bar(['Low Occurence', 'High Occurence'], [len(low_occurences), len(high_occurences)], color=colortgb) - plt.xlabel('Occurence Category') - plt.ylabel('Number of Relations') - #plt.title('Number of Relations in Low and High Occurence Categories') - save_path = (os.path.join(figs_dir, f"rel_occurence_categories_{dataset_name}.png")) - plt.savefig(save_path, bbox_inches='tight') + # ## bar plot that shows the number of triples in each occurence category + # plt.figure() + # plt.bar(['Low Occurence', 'High Occurence'], [sum([num_oc for num_oc in low_occurences.values()]), sum([num_oc for num_oc in high_occurences.values()]),], color=colortgb) + # plt.xlabel('Occurence Category') + # plt.ylabel('Number of Triples') + # #plt.title('Number of Triples in Low and High Occurence Categories') + # save_path = (os.path.join(figs_dir, f"rel_occurence_triples_categories_{dataset_name}.png")) + # plt.savefig(save_path, bbox_inches='tight') - ## bar plot that shows the number of triples in each occurence category - plt.figure() - plt.bar(['Low Occurence', 'High Occurence'], [sum([num_oc for num_oc in low_occurences.values()]), sum([num_oc for num_oc in high_occurences.values()]),], color=colortgb) - plt.xlabel('Occurence Category') - plt.ylabel('Number of Triples') - #plt.title('Number of Triples in Low and High Occurence Categories') - save_path = (os.path.join(figs_dir, f"rel_occurence_triples_categories_{dataset_name}.png")) - plt.savefig(save_path, bbox_inches='tight') + # ## bar plot that shows the mean consecutive timesteps of each 
relation
+    # plt.figure()
+    # plt.bar(mean_per_rel.keys(), mean_per_rel.values(), color=colortgb)
+    # save_path = (os.path.join(figs_dir, f"rel_conperrel_{dataset_name}.png"))
+    # plt.savefig(save_path, bbox_inches='tight')
 
-    ## bar plot that shows the mean consecutive timesteps of each relation
-    plt.figure()
-    plt.bar(mean_per_rel.keys(), mean_per_rel.values(), color=colortgb)
-    save_path = (os.path.join(figs_dir, f"rel_conperrel_{dataset_name}.png"))
-    plt.savefig(save_path, bbox_inches='tight')
+print('done')
\ No newline at end of file
diff --git a/stats_figures/compute_relation_results_df.py b/stats_figures/compute_relation_results_df.py
new file mode 100644
index 0000000..443caa9
--- /dev/null
+++ b/stats_figures/compute_relation_results_df.py
@@ -0,0 +1,95 @@
+"""
+For every method and dataset: load the MRR per relation and add it as new columns to the
+dataset_name/stats/relation_statistics_dataset_name.csv dataframe.
+This is done separately for the head and tail direction.
+It requires that the MRR per relation has been extracted beforehand for each method and dataset.
+"""
+
+## imports
+import numpy as np
+import sys
+import os
+import os.path as osp
+tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..'))
+sys.path.append(tgb_modules_path)
+import json
+import pandas as pd
+import stats_figures.dataset_utils as du
+
+
+# specify params
+names = [ 'tkgl-polecat'] #'tkgl-icews', 'tkgl-polecat', 'tkgl-smallpedia'] #'tkgl-polecat','tkgl-smallpedia', 'tkgl-yago', 'tkgl-icews' ,'tkgl-smallpedia','thgl-myket','tkgl-yago', 'tkgl-icews','thgl-github', 'thgl-forum', 'tkgl-wikidata']
+methods = ['recurrency', 'regcn', 'cen'] #'recurrency'
+
+# this is where the results per relation are stored
+model_names = {'recurrency': {'tkgl-polecat': ['saved_models/RecurrencyBaseline', 1],
+                              'tkgl-icews': ['saved_models/RecurrencyBaseline', 500],
+                              'tkgl-smallpedia': ['saved_models/RecurrencyBaseline', 1]},
+               'regcn': {'tkgl-polecat': 'saved_results/REGCN_tkgl-polecat_results_per_rel.json',
+                         'tkgl-icews': 'saved_results/REGCN_tkgl-icews_results_per_rel.json',
+                         'tkgl-smallpedia': 'saved_results/REGCN_tkgl-smallpedia_results_per_rel.json'},
+               'cen': {'tkgl-polecat': 'saved_results/CEN_tkgl-polecat_results_per_rel.json',
+                       'tkgl-icews': 'saved_results/CEN_tkgl-icews_results_per_rel.json',
+                       'tkgl-smallpedia': 'saved_results/CEN_tkgl-smallpedia_results_per_rel.json'},
+               'tlogic': {'tkgl-smallpedia': 'saved_results/TLogic_tkgl-smallpedia_results_per_rel.json'}
+               }
+
+def inverse_rel(rel_id, max_id):
+    return rel_id + max_id + 1
+
+# run through each dataset
+for dataset_name in names:
+    # read dataframe with the stats for this dataset from csv
+    print(dataset_name)
+    modified_dataset_name = dataset_name.replace('-', '_')
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    # Navigate one folder up
+    parent_dir = os.path.dirname(current_dir)
+
+    tgb_dir = os.path.dirname((os.path.dirname(os.path.abspath(__file__))))
+    figs_dir = os.path.join(current_dir, dataset_name, 'figs')
+    # Create the 'figs' directory if it doesn't exist
+    if not os.path.exists(figs_dir):
+        os.makedirs(figs_dir)
+    stats_dir = os.path.join( current_dir, dataset_name, 'stats')
+    if not os.path.exists(stats_dir):
+        os.makedirs(stats_dir)
+    stats_df = pd.read_csv(os.path.join(stats_dir, f"relation_statistics_{dataset_name}.csv"))
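+    # the per-relation result files index tail queries by the relation id rel and
+    # head queries by the inverse id rel + max_id + 1 (see inverse_rel above)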
+
+    for method in methods:
+        results_dict = os.path.join(tgb_dir, 'examples', 'linkproppred', dataset_name)
+
+        # if the method is recurrency, we need to load the results from the csv and compute the mean value per relation
+        # load csv, create dict with the mean value per relation
+        if method == 'recurrency':
+            name = model_names[method][dataset_name][0]
+            seed = model_names[method][dataset_name][1]
+            results_filename = f'{results_dict}/{name}'
+
+            mrr_per_rel, full_mrr = du.read_dict_compute_mrr_per_rel(results_dict, name, dataset_name, seed, num_rels=0, split_mode='test')
+
+        else:
+            name = model_names[method][dataset_name]
+            results_filename = f'{results_dict}/{name}'
+            with open(results_filename, 'r') as json_file:
+                mrr_per_rel = json.load(json_file)
+        num_rels = len(list(set(stats_df['relation'])))
+        max_id = max(list(set(stats_df['relation'])))
+        assert num_rels == max_id+1
+
+        for rel in stats_df['relation']:
+            if str(rel) in mrr_per_rel:
+                stats_df.loc[stats_df['relation'] == rel, method+'_tail'] = mrr_per_rel[str(rel)]
+                stats_df.loc[stats_df['relation'] == rel, method+'_head'] = mrr_per_rel[str(inverse_rel(rel, max_id))]
+            elif rel in mrr_per_rel:
+                stats_df.loc[stats_df['relation'] == rel, method+'_tail'] = mrr_per_rel[rel]
+                stats_df.loc[stats_df['relation'] == rel, method+'_head'] = mrr_per_rel[inverse_rel(rel, max_id)]
+            else:
+                stats_df.loc[stats_df['relation'] == rel, method+'_tail'] = 'N/A'
+                stats_df.loc[stats_df['relation'] == rel, method+'_head'] = 'N/A'
+    # save the dataframe with the new columns
+    stats_df.to_csv(os.path.join(stats_dir, f"relation_statistics_{dataset_name}.csv"), index=False)
diff --git a/stats_figures/create_edges_figures.py b/stats_figures/create_edges_figures.py
new file mode 100644
index 0000000..c1c0d7d
--- /dev/null
+++ b/stats_figures/create_edges_figures.py
@@ -0,0 +1,143 @@
+import numpy as np
+
+import sys
+import os
+import os.path as osp
+tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..'))
+sys.path.append(tgb_modules_path)
+import json
+import csv
+
+## imports
+import matplotlib.pyplot as plt
+import stats_figures.dataset_utils as du
+
+
+# specify params
+names = ['thgl-github', 'tkgl-polecat', 'tkgl-icews', 'tkgl-smallpedia', 'tkgl-wikidata', 'thgl-myket', 'thgl-forum', 'thgl-software']
+granularity = {}  # for labels
+granularity['tkgl-polecat'] = 'days'
+granularity['tkgl-icews'] = 'days'
+granularity['tkgl-smallpedia'] = 'years'
+granularity['tkgl-wikidata'] = 'years'
+granularity['tkgl-yago'] = 'years'
+granularity['thgl-myket'] = 's.'
+granularity['thgl-github'] = 's.'
+granularity['thgl-software'] = 's.'
+granularity['thgl-forum'] = 's.'
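+# note: the strings above are only used for the axis labels in the figures;
+# 's.' abbreviates seconds, since the thgl datasets carry Unix timestamps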
+ +# colors from tgb logo +colortgb = '#60ab84' +colortgb2 = '#eeb641' +colortgb3 = '#dd613a' +#colortgb4 ='#bce9ef' +#colortgb5 ='#d6e9d9' + + +fontsize =12 +labelsize=12 +for dataset_name in names: + + # dataset_name = dataset_name.replace('-', '_') + current_dir = os.path.dirname(os.path.abspath(__file__)) + # Navigate one folder up + + figs_dir = os.path.join( current_dir, dataset_name, 'figs') + data_dir = os.path.join(current_dir, dataset_name) + save_path = (os.path.join(figs_dir,f"numedges_{dataset_name}.json")) + stats_path = (os.path.join(data_dir,f"dataset_stats.csv")) + + n_edgesnodes_list_all = json.load(open(save_path)) + n_edges_list = n_edgesnodes_list_all['num_edges'] + bars_list = [20] #number of bins + + # Read the CSV file + with open(stats_path, 'r') as file: + reader = csv.reader(file) + for row in reader: + if row[0] == 'first_ts_string': + start_date = row[1] + elif row[0] == 'last_ts_string': + end_date = row[1] + + for num_bars in bars_list: + # Create the 'figs' directory if it doesn't exist + if not os.path.exists(figs_dir): + os.makedirs(figs_dir) + if num_bars < 100: + capsize=2 + capthick=2 + elinewidth=2 + else: + capsize=1 + capthick=1 + elinewidth=1 + ts_discretized_mean, ts_discretized_sum, ts_discretized_min, ts_discretized_max, start_indices, end_indices, mid_indices = du.discretize_values(n_edges_list, num_bars) + + # line chart + plt.figure() + plt.tick_params(axis='both', which='major', labelsize=labelsize) + mins = np.array(ts_discretized_min) + maxs = np.array(ts_discretized_max) + means = np.array(ts_discretized_mean) + # plt.bar(mid_indices, ts_discretized_mean, width=(len(n_edges_list) // num_bars), label='Mean', color =colortgb) + plt.step(mid_indices, ts_discretized_mean, where='mid', linestyle='-', label ='Mean Value', color=colortgb, linewidth=2) + #plt.scatter(mid_indices, ts_discretized_mean, label ='Mean Value', color=colortgb) + plt.errorbar(mid_indices, maxs, yerr=[maxs-mins, maxs-maxs], fmt='none', alpha=0.9, color='grey',capsize=capsize, capthick=capthick, elinewidth=elinewidth, label='Min-Max Range') + plt.xlabel(f'Ts. 
[{granularity[dataset_name]}] from {start_date} to {end_date}', fontsize=fontsize) + plt.ylabel('Number of Edges', fontsize=fontsize) + plt.legend() + plt.tight_layout() + #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps') + plt.show() + save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2.png")) + plt.savefig(save_path2, bbox_inches='tight') + save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2.pdf")) + plt.savefig(save_path2, bbox_inches='tight') + + # bar chart + plt.figure() + plt.tick_params(axis='both', which='major', labelsize=labelsize) + mins = np.array(ts_discretized_min) + maxs = np.array(ts_discretized_max) + means = np.array(ts_discretized_mean) + plt.bar(mid_indices, ts_discretized_sum, width=(len(n_edges_list) // num_bars), label='Sum', color =colortgb) + # plt.step(mid_indices, ts_discretized_mean, where='mid', linestyle='-', label ='Mean Value', color=colortgb) + # plt.errorbar(mid_indices, sums, yerr=[mins, maxs], fmt='none', alpha=0.9, color='grey',capsize=1.5, capthick=1.5, elinewidth=2, label='Min-Max Range') + plt.xlabel(f'Timestep [{granularity[dataset_name]}] from {start_date} to {end_date}', fontsize=fontsize) + plt.ylabel('Number of Edges', fontsize=fontsize) + plt.legend() + #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps') + plt.show() + save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}3.png")) + plt.savefig(save_path2, bbox_inches='tight') + save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}3.pdf")) + plt.savefig(save_path2, bbox_inches='tight') + + + try: + # try log scale + plt.figure() + plt.tick_params(axis='both', which='major', labelsize=labelsize) + mins = np.array(ts_discretized_min) + maxs = np.array(ts_discretized_max) + means = np.array(ts_discretized_mean) + # plt.bar(mid_indices, ts_discretized_mean, width=(len(n_edges_list) // num_bars), label='Mean', color =colortgb) + plt.step(mid_indices, ts_discretized_mean, where='mid', linestyle='-', label ='Mean Value', color=colortgb) + #plt.scatter(mid_indices, ts_discretized_mean, label ='Mean Value', color=colortgb) + plt.errorbar(mid_indices, maxs, yerr=[maxs-mins, maxs-maxs], fmt='none', alpha=0.9, color='grey',capsize=capsize, capthick=capthick, elinewidth=elinewidth, label='Min-Max Range') + plt.xlabel(f'Timestep [{granularity[dataset_name]}] from {start_date} to {end_date}', fontsize=fontsize) + plt.ylabel('Number of Edges', fontsize=fontsize) + #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps') + plt.yscale('log') + plt.legend(fontsize=fontsize) + plt.show() + save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2log.png")) + plt.savefig(save_path2, bbox_inches='tight') + save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2log.pdf")) + plt.savefig(save_path2, bbox_inches='tight') + except: + print('Could not plot log scale') + plt.close('all') + diff --git a/stats_figures/create_relation_figures.py b/stats_figures/create_relation_figures.py new file mode 100644 index 0000000..796c232 --- /dev/null +++ b/stats_figures/create_relation_figures.py @@ -0,0 +1,270 @@ +""" pie charts, mrr per relation charts +""" + +## imports +import numpy as np +import sys +import os +import os.path as osp +tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..')) +sys.path.append(tgb_modules_path) 
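+# appending the repo root lets `import stats_figures.dataset_utils` below resolve
+# when the script is run directly from within the stats_figures folder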
+import matplotlib.pyplot as plt
+from matplotlib.colors import LogNorm
+from matplotlib.colors import Normalize
+import pandas as pd
+import stats_figures.dataset_utils as du
+
+
+# specify params
+# which datasets
+names = [ 'tkgl-polecat', 'tkgl-icews', 'tkgl-wikidata', 'tkgl-smallpedia'] #'tkgl-polecat','tkgl-smallpedia', 'tkgl-yago', 'tkgl-icews' ,'tkgl-smallpedia','thgl-myket','tkgl-yago', 'tkgl-icews','thgl-github', 'thgl-forum', 'tkgl-wikidata']
+# which methods for the mrr_per_rel figures
+methods = ['recurrency', 'regcn', 'cen'] #'recurrency'
+colortgb = '#60ab84'  # tgb logo colors
+colortgb2 = '#eeb641'
+colortgb3 = '#dd613a'
+head_tail_flag = False # if true, the head and tail of the relation are shown in the plot, otherwise just the mean across both directions
+
+# pie chart colors
+colors = [colortgb,colortgb2,colortgb3] # from tgb logo
+colors2= ['#8e0152', '#c51b7d', '#de77ae', '#f1b6da', '#fde0ef', '#f7f7f7', '#e6f5d0', '#b8e186', '#7fbc41', '#4d9221', '#276419']
+# from https://colorbrewer2.org/#type=diverging&scheme=PiYG&n=11 color blind friendly
+
+capsize=1.5
+capthick=1.5
+elinewidth=1.5
+occ_threshold = 5
+k=10 # how many slices in the cake (+1 will be added for "others")
+plots_flag = True
+ylimdict = {'tkgl-polecat': 0.25, 'tkgl-icews':0.6, 'tkgl-smallpedia': 1.01} # for the mrr charts the upper mrr limit
+
+overall_min = -1 # for the correlation matrix colorbar
+overall_max = 1 # for the correlation matrix colorbar
+num_rels_plot = 10 # how many relations do we want to plot in the mrr chart
+i = 0
+plot_values_list = []
+plot_names_multi_line_list = []
+for dataset_name in names:
+    print('dataset_name:', dataset_name)
+    # some directory stuff
+    modified_dataset_name = dataset_name.replace('-', '_')
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+
+    stats_dir = os.path.join( current_dir,dataset_name,'stats')
+    tgb_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+    figs_dir = os.path.join(current_dir,dataset_name,'figs_rel')
+    stats_df = pd.read_csv(os.path.join(stats_dir, f"relation_statistics_{dataset_name}.csv"))
+
+    # Create the 'figs' directory if it doesn't exist
+    if not os.path.exists(figs_dir):
+        os.makedirs(figs_dir)
+    stats_dir = os.path.join( current_dir,dataset_name,'stats')
+    if not os.path.exists(stats_dir):
+        os.makedirs(stats_dir)
+
+    ### A) pie charts: plot top k relations according to the number of occurrences plus a slice for "others"
+    plot_names = list(stats_df['rel_string_word'].iloc[:k])
+    plot_values = list(stats_df['number_total_occurences'].iloc[:k])
+    all_others = np.sum(stats_df['number_total_occurences'].iloc[k:]) # slice for "others" (sum of all other relations' occurrences)
+    plot_values.append(all_others)
+    plot_names.append('Others')
+    # for the pie chart labels to be more readable (i.e. force line break if words are long)
+    plot_names_multi_line= []
+    for name in plot_names: # add some \n to make the labels fit better into the pie chart
+        if type(name) == str:
+            words = name.split()
+            newname = words[0]
+            if len(words) > 1:
+                for i in range(len(words)-1):
+                    if not '(' in words[i+1]:
+                        if len(words[i]) > 3:
+                            newname+='\n'
+                        else:
+                            newname+=' '
+                        newname+=words[i+1]
+        else:
+            newname = str(name) # then only plot the int as is.
+ plot_names_multi_line.append(newname) + + num_slices = len(plot_names) + plt.figure(figsize=(7, 7)) + wedges, texts, autotexts =plt.pie(plot_values,autopct=lambda pct: f"{pct:.0f}%" if pct > 1.5 else '', startangle=140, colors=colors2, labeldistance=2.2) #repeated_colors) + # Increase the font size of the percentage values + for autotext in autotexts: + autotext.set_fontsize(15) + plt.axis('equal') + # Move the percentage labels further outside + for autotext, wedge in zip(autotexts, wedges): + angle = (wedge.theta2 - wedge.theta1) / 2 + wedge.theta1 + x = np.cos(np.deg2rad(angle)) + y = np.sin(np.deg2rad(angle)) + distance = 0.85 # Adjust this value to move the labels further or closer to the center + autotext.set_position((x * distance, y * distance)) + # Set the labels for each pie slice + plt.legend(wedges, plot_names_multi_line, loc="center left", bbox_to_anchor=(1, 0, 0.5, 1), fontsize=14) + save_path = (os.path.join(figs_dir, f"rel_pie_{dataset_name}.png")) + plt.savefig(save_path, bbox_inches='tight') + save_path = (os.path.join(figs_dir, f"rel_pie_{dataset_name}.pdf")) + plt.savefig(save_path, bbox_inches='tight') + + if dataset_name == 'tkgl-wikidata': #then we do not want to plot the mrr for the relations + continue + + ### B) plot the mrr for each relation for each method, different color for different number of occurences or for different recurrency degree + + # prepare the dataframe: only take the top ten relations according to number of occurences and sort by recurrency degree + # we use selected_df_sorted to plot the relations in the order of recurrency degree + rels_sorted = np.array(stats_df['relation'])[0:num_rels_plot] + mask = stats_df['relation'].isin(rels_sorted) + selected_df = stats_df[mask] #only the parts of the dataframe that contain the top ten relations according to number of occurences + selected_df_sorted = selected_df.sort_values(by='recurrency_degree', ascending=False) # Sort selected_df by 'recurrency_degree' column in descending order + rels_to_plot = list(selected_df_sorted['relation']) + labels = np.array(selected_df_sorted['relation'])# only plotting the id for space reasons + mrr_per_rel_freq = [] # list of mrr values for each relation - three lists for three methods + mrr_per_rel_freq2 = [] + mrr_per_rel_freq3 = [] + lab = [] + lab_ht = [] + lab_rel = [] + # rel_oc_dict[rel] = count_occurrences + count_occurrences_sorted = [] + rec_degree_sorted = [] + for index, r in enumerate(rels_to_plot): + if head_tail_flag: + lab_ht.append('h') + lab_ht.append('t') + lab_rel.append(str(labels[index])+' ') # add spaces to make the labels longer + else: + lab_rel.append(str(labels[index])+'') # add spaces to make the labels longer + + lab.append(labels[index]) + if head_tail_flag: # if we do head and tail separately we need the value for head and tail direction + mrr_per_rel_freq.append(selected_df_sorted['recurrency_head'].iloc[index]) + mrr_per_rel_freq.append(selected_df_sorted['recurrency_tail'].iloc[index]) + mrr_per_rel_freq2.append(selected_df_sorted['regcn_head'].iloc[index]) + mrr_per_rel_freq2.append(selected_df_sorted['regcn_tail'].iloc[index]) + mrr_per_rel_freq3.append(selected_df_sorted['cen_head'].iloc[index]) + mrr_per_rel_freq3.append(selected_df_sorted['cen_tail'].iloc[index]) + count_occurrences_sorted.append(selected_df_sorted['number_total_occurences'].iloc[index])#append twice for head and tail + count_occurrences_sorted.append(selected_df_sorted['number_total_occurences'].iloc[index]) + 
rec_degree_sorted.append(selected_df_sorted['recurrency_degree'].iloc[index]) #append twice for head and tail + rec_degree_sorted.append(selected_df_sorted['recurrency_degree'].iloc[index]) + else:# if we do NOT head and tail separately we need the mean value for head and tail direction + mrr_per_rel_freq.append(np.mean([selected_df_sorted['recurrency_head'].iloc[index], selected_df_sorted['recurrency_tail'].iloc[index]])) + mrr_per_rel_freq2.append(np.mean([selected_df_sorted['regcn_head'].iloc[index],selected_df_sorted['regcn_tail'].iloc[index]])) + mrr_per_rel_freq3.append(np.mean([selected_df_sorted['cen_head'].iloc[index], selected_df_sorted['cen_tail'].iloc[index]])) + count_occurrences_sorted.append(selected_df_sorted['number_total_occurences'].iloc[index])#append twice for head and tail + rec_degree_sorted.append(selected_df_sorted['recurrency_degree'].iloc[index]) + + # these are the x-values of the ticks. in case we plot head and tail separately, we need to have two ticks per relation + x_values = [] + x_values_rel = [] + for i in range(0,num_rels_plot): + if head_tail_flag: + x_values.append(i*2+0.4) + x_values.append(i*2+0.8) + else: + x_values.append(i*2+0.4) + x_values_rel.append(i*2+0.4) + + lab_lines = lab_rel #labels, for now + a = count_occurrences_sorted + + # version 1) colors are based on the reucrrency degree + plt.figure() + sca = plt.scatter(x_values, mrr_per_rel_freq2, marker='p',s=150, c = rec_degree_sorted, alpha=1, edgecolor='grey', cmap='jet', norm=Normalize(vmin=0, vmax=1), label='REGCN') # cmap='gist_rainbow', + sca = plt.scatter(x_values, mrr_per_rel_freq3 , marker='*',s=150, c = rec_degree_sorted, alpha=1, edgecolor='grey', cmap='jet', norm=Normalize(vmin=0, vmax=1), label='CEN') + sca = plt.scatter(x_values, mrr_per_rel_freq, marker='o',s=60, c = rec_degree_sorted, alpha=1, edgecolor='grey', cmap='jet', norm=Normalize(vmin=0, vmax=1), label='Recurrency Baseline') + plt.ylabel('MRR', fontsize=14) + plt.xlabel('Relation', fontsize=14) + plt.legend(fontsize=14) + cbar =plt.colorbar(sca) + plt.ylim([0,ylimdict[dataset_name]]) + cbar.ax.yaxis.label.set_color('gray') + + if head_tail_flag: + plt.xticks(x_values, lab_ht, size=13) #, verticalalignment="center") # ha='right', + plt.xticks(x_values_rel, lab_lines, size=14, minor=True) + plt.tick_params(axis='x', which='minor', rotation=90, length=0) + else: + plt.xticks(x_values_rel, lab_lines, size=14) + plt.tick_params(axis='x', rotation=90, length=0) + plt.yticks(size=13) + # Create a locator for the second set of x-ticks + # plt.secondary_xaxis('top', x_values_rel) + + plt.grid() + save_path = (os.path.join(figs_dir, f"rel_mrrperrel_recdeg_{dataset_name}.png")) + plt.savefig(save_path, bbox_inches='tight') + save_path = (os.path.join(figs_dir, f"rel_mrrperrel_recdeg_{dataset_name}.pdf")) + plt.savefig(save_path, bbox_inches='tight') + print('saved in ', save_path) + + # version 2) colors are the number of occurences + plt.figure() + sca = plt.scatter(x_values, mrr_per_rel_freq2, marker='p',s=150, c = a, alpha=1, edgecolor='grey', norm=LogNorm(), cmap='jet', label='REGCN') + sca = plt.scatter(x_values, mrr_per_rel_freq3 , marker='*',s=150, c = a, alpha=1, edgecolor='grey', norm=LogNorm(), cmap='jet', label='CEN') + sca = plt.scatter(x_values, mrr_per_rel_freq, marker='o',s=60, c = a, alpha=1, edgecolor='grey', norm=LogNorm(), cmap='jet', label='Recurrency Baseline') + plt.ylabel('MRR', fontsize=14) + plt.xlabel('Relation', fontsize=14) + plt.legend(fontsize=14) + cbar =plt.colorbar(sca) + 
+    plt.ylim([0, ylimdict[dataset_name]])
+    cbar.ax.yaxis.label.set_color('gray')
+
+    if head_tail_flag:
+        plt.xticks(x_values, lab_ht, size=13)  # , verticalalignment="center")  # ha='right',
+        plt.xticks(x_values_rel, lab_lines, size=14, minor=True)
+        plt.tick_params(axis='x', which='minor', rotation=90, length=0)
+    else:
+        plt.xticks(x_values_rel, lab_lines, size=14)
+        plt.tick_params(axis='x', rotation=90, length=0)
+    plt.yticks(size=13)
+    # Create a locator for the second set of x-ticks
+    # plt.secondary_xaxis('top', x_values_rel)
+    plt.grid()
+    save_path = os.path.join(figs_dir, f"rel_mrrperrel_occ_{dataset_name}.png")
+    plt.savefig(save_path, bbox_inches='tight')
+
+    ### C) plot correlation matrices; each plot uses a different subset of the columns
+    df = stats_df[['recurrency_degree', 'direct_recurrency-degree', 'recurrency_tail', 'recurrency_head', 'regcn_tail', 'regcn_head', 'cen_tail', 'cen_head']]
+    corrmat = df.corr()
+    f = plt.figure(figsize=(19, 15))
+    plt.matshow(corrmat, fignum=f.number, vmin=overall_min, vmax=overall_max)
+    plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16, rotation=90)
+    plt.yticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16)
+    cb = plt.colorbar()
+    cb.ax.tick_params(labelsize=16)
+    save_path = os.path.join(figs_dir, f"corr_rec_meth_{dataset_name}.png")
+    plt.savefig(save_path, bbox_inches='tight')
+
+    df = stats_df[['consecutiveness_value', 'recurrency_tail', 'recurrency_head', 'regcn_tail', 'regcn_head', 'cen_tail', 'cen_head']]
+    corrmat = df.corr()
+    f = plt.figure(figsize=(19, 15))
+    plt.matshow(corrmat, fignum=f.number, vmin=overall_min, vmax=overall_max)
+    plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16, rotation=90)
+    plt.yticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16)
+    cb = plt.colorbar()
+    cb.ax.tick_params(labelsize=16)
+    save_path = os.path.join(figs_dir, f"corr_con_meth_{dataset_name}.png")
+    plt.savefig(save_path, bbox_inches='tight')
+
+    df = stats_df[['recurrency_degree', 'direct_recurrency-degree', 'consecutiveness_value', 'mean_occurence_per_triple', 'number_total_occurences', 'recurrency_tail', 'recurrency_head', 'regcn_tail', 'regcn_head', 'cen_tail', 'cen_head']]
+    corrmat = df.corr()
+    f = plt.figure(figsize=(19, 15))
+    plt.matshow(corrmat, fignum=f.number, vmin=overall_min, vmax=overall_max)
+    plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16, rotation=90)
+    plt.yticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16)
+    for i in range(corrmat.shape[0]):
+        for j in range(corrmat.shape[1]):
+            plt.text(j, i, "{:.2f}".format(corrmat.iloc[i, j]), ha='center', va='center', color='black', fontsize=16)
+    cb = plt.colorbar()
+    # fig.colorbar(cax, ticks=[-1,0,1], shrink=0.8)
+    cb.ax.tick_params(labelsize=16)
+    save_path = os.path.join(figs_dir, f"corr_all_meth_{dataset_name}.png")
+    plt.savefig(save_path, bbox_inches='tight')
+    plt.close('all')
+
+
+print('done with creating the figs')
+
diff --git a/tgb/datasets/dataset_scripts/dataset_utils.py b/stats_figures/dataset_utils.py
similarity index 96%
rename from tgb/datasets/dataset_scripts/dataset_utils.py
rename to stats_figures/dataset_utils.py
index 78a6d63..be0ab62 100644
--- a/tgb/datasets/dataset_scripts/dataset_utils.py
+++ b/stats_figures/dataset_utils.py
@@ -4,7 +4,7 @@
 import os
 import os.path as osp
-tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
+tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..'))
 sys.path.append(tgb_modules_path)
 import json
@@ -294,14 +294,16 @@ def estimate_seasons(train_data):
     return Nbseason
 
-# create a dictionary with all the stats and save to json and csv
+
 def create_dict_and_save(dataset_name, num_rels, num_nodes, num_train_quads, num_val_quads, num_test_quads,
                          num_all_quads, num_train_timesteps, num_val_timesteps, num_test_timesteps, num_all_timesteps,
                          test_ind_nodes, test_ind_nodes_perc, val_ind_nodes, val_ind_nodes_perc,
                          direct_recurrency_degree, recurrency_degree, consecutiveness_degree,
                          mean_edge_per_ts, std_edge_per_ts, min_edge_per_ts, max_edge_per_ts,
                          mean_node_per_ts, std_node_per_ts, min_node_per_ts, max_node_per_ts,
-                         seasonal_value, collision_trainval, collision_valtest):
+                         seasonal_value, collision_trainval, collision_valtest, first_ts_string, last_ts_string):
+    """
+    Create a dictionary with the statistics of the dataset and save it as a CSV file."""
     if 'tkgl' in dataset_name:
         num_train_quads = int(num_train_quads/2)
         num_val_quads = int(num_val_quads/2)
@@ -337,7 +339,9 @@ def create_dict_and_save(dataset_name, num_rels, num_nodes, num_train_quads, num
         "max_node_per_ts": max_node_per_ts,
         "seasonal_value": seasonal_value,
         "collision_trainval": collision_trainval,
-        "collision_valtest": collision_valtest
+        "collision_valtest": collision_valtest,
+        "first_ts_string": first_ts_string,
+        "last_ts_string": last_ts_string
         # "train_nodes": train_nodes
     }
@@ -347,15 +351,10 @@ def create_dict_and_save(dataset_name, num_rels, num_nodes, num_train_quads, num
     # Get the current directory of the script
     current_dir = os.path.dirname(os.path.abspath(__file__))
-    # Navigate one folder up
-    parent_dir = os.path.dirname(current_dir)
-    # Save stats_dict as CSV
-    modified_dataset_name = dataset_name.replace('-', '_')
-    save_path = (os.path.join(parent_dir, modified_dataset_name, "dataset_stats.csv"))
+    save_path = os.path.join(current_dir, dataset_name, "dataset_stats.csv")
     df.to_csv(save_path)
-
-    print("Stats saved to csv and json in folder: ", save_path)
+    print("Stats saved as CSV to: ", save_path)
 def num_nodes_not_in_train(train_data, test_data):
     """ Calculate the number of nodes in the test set that are not in the train set.
@@ -439,7 +438,13 @@ def read_dict_compute_mrr_per_rel(perrel_results_path, model_name, dataset_name,
             # Extract the key (the first part)
             key = int(parts[0])
             # Extract the values (the rest of the parts), remove square brackets
-            values = [float(value.strip('[]')) for value in parts[1:]]
+            values = []
+            for value in parts[1:]:
+                if value == '[]':
+                    print(f"Rel {key} has an empty list as its value; skipping")
+
+                else:
+                    values.append(float(value.strip('[]')))
             # Add the key-value pair to the dictionary
             if key in results_per_rel_dict.keys():
                 print(f"Key {key} already exists in the dictionary!!! 
might have duplicate entries in results csv - skipping") @@ -498,6 +503,8 @@ def set_plot_names(top_k, sorted_dict, dataset_name, rel_id2type_dict): return plot_names + + import requests import re diff --git a/tgb/datasets/dataset_scripts/compute_dataset_stats.py b/tgb/datasets/dataset_scripts/compute_dataset_stats.py deleted file mode 100644 index 6b335b0..0000000 --- a/tgb/datasets/dataset_scripts/compute_dataset_stats.py +++ /dev/null @@ -1,311 +0,0 @@ -import numpy as np - -import sys -import os -import os.path as osp -tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) -sys.path.append(tgb_modules_path) -import json - -## imports -import numpy as np -from datetime import datetime -#internal imports -from tgb.linkproppred.dataset import LinkPropPredDataset -from tgb_modules.tkg_utils import reformat_ts, get_original_ts -import tgb.datasets.dataset_scripts.dataset_utils as du - -import networkx as nx -import matplotlib.pyplot as plt - - - - -names = ['tkgl-smallpedia'] #'tkgl-polecat', 'thgl-myket','tkgl-yago', 'tkgl-icews','thgl-github', 'thgl-forum', 'tkgl-wikidata'] -for dataset_name in names: - dataset = LinkPropPredDataset(name=dataset_name, root="datasets", preprocess=True) - - relations = dataset.edge_type - num_rels = dataset.num_rels - if 'tkgl' in dataset_name: - num_rels_without_inv = int(num_rels/2) - else: - num_rels_without_inv = num_rels - - rels = np.arange(0,num_rels) - subjects = dataset.full_data["sources"] - objects= dataset.full_data["destinations"] - num_nodes = dataset.num_nodes - timestamps_orig = dataset.full_data["timestamps"] - timestamps = reformat_ts(timestamps_orig, dataset_name) # stepsize:1 - current_dir = os.path.dirname(os.path.abspath(__file__)) - parent_dir = os.path.dirname(current_dir) - modified_dataset_name = dataset_name.replace('-', '_') - csv_dir = os.path.join( parent_dir, modified_dataset_name) - np.savetxt(csv_dir +"/"+dataset_name+"timestamps.csv", timestamps,fmt='%i', delimiter=",") - all_quads = np.stack((subjects, relations, objects, timestamps, timestamps_orig), axis=1) - train_data = all_quads[dataset.train_mask] - val_data = all_quads[dataset.val_mask] - test_data = all_quads[dataset.test_mask] - collision_trainval = np.intersect1d(list(set(timestamps_orig[dataset.train_mask])), list(set(timestamps_orig[dataset.val_mask]))) - collision_valtest = np.intersect1d(list(set(timestamps_orig[dataset.val_mask])), list(set(timestamps_orig[dataset.test_mask]))) - if len(collision_trainval) > 0: - print("!!!!!!!!!Collision between train and val set!!!!!!!!!") - if len(collision_valtest) > 0: - print("!!!!!!!!!Collision between val and test set!!!!!!!!!") - print(subjects.shape) - - # compute number of quads in train/val/test set - num_train_quads = train_data.shape[0] - num_val_quads = val_data.shape[0] - num_test_quads = test_data.shape[0] - num_all_quads = num_train_quads + num_val_quads + num_test_quads - print(num_all_quads) - - # compute inductive nodes - test_ind_nodes = du.num_nodes_not_in_train(train_data, test_data) - val_ind_nodes = du.num_nodes_not_in_train(train_data, val_data) - test_ind_nodes_perc = test_ind_nodes/num_nodes - val_ind_nodes_perc = val_ind_nodes/num_nodes - - # compute number of timesteps in train/val/test set - num_train_timesteps = len(np.unique(train_data[:,-1])) - num_val_timesteps = len(np.unique(val_data[:,-1])) - num_test_timesteps = len(np.unique(test_data[:,-1])) - num_all_ts = num_train_timesteps + num_val_timesteps + num_test_timesteps - - # compute number on nodes in 
valid set or test set that have not been seen in train set - # compute recurrency factor - # compute average duration of facts - timestep_range = 1+np.max(timestamps) - np.min(timestamps) - all_possible_timestep_indices = [i for i in range(timestep_range)] - ts_all = du.TripleSet() - ts_all.add_triples(all_quads, num_rels_without_inv, timestep_range) - ts_all.compute_stat() - ts_test = du.TripleSet() - ts_test.add_triples(test_data, num_rels_without_inv, timestep_range) - ts_test.compute_stat() - - lens = [] - for timesteps in ts_all.timestep_lists: - lens.append(len(timesteps)) - - count_previous = 0 - count_sometime = 0 - count_all = 0 - for qtriple in ts_test.triples: - (s,r,o,t) = qtriple - k = ts_all.get_latest_ts(s,r,o, t) - count_all += 1 - if k + 1 == t: count_previous += 1 - if k > -1 and k < t: count_sometime += 1 - - print("DATATSET: " + dataset_name) - print("all: " + str(count_all)) - print("previous: " + str(count_previous)) - print("sometime: " + str(count_sometime)) - print("f-direct (DRec): " + str(count_previous / count_all)) - print("f-sometime (Rec): " + str(count_sometime / count_all)) - - print(f"the mean number of timesteps that a triple appears in is {np.mean(lens)}") - print(f"the median number of timesteps that a triple appears in is {np.median(lens)}") - print(f"the maximum number of timesteps that a triple appears in is {np.max(lens)}") - - # Compute max consecutive timesteps per triple - results = [du.max_consecutive_numbers(inner_list) for inner_list in ts_all.timestep_lists] - print(f"number of timesteps is {ts_all.num_timesteps}") - print(f"number of total triples is {ts_all.num_triples}") - print(f"number of distinct triples is {len(ts_all.timestep_lists)}") - print(f"the mean max number of 100*consecutive timesteps/number of timesteps that a triple appears in is {100*np.mean(results)/ts_all.num_timesteps}") - print(f"the median max number of 100*consecutive timesteps/number of timesteps that a triple appears in is {100*np.median(results)/ts_all.num_timesteps}") - print(f"the maximum max number of 100*consecutive timesteps/number of timesteps that a triple appears in is {100*np.max(results)/ts_all.num_timesteps}") - print(f"the mean max number of consecutive timesteps that a triple appears in is {np.mean(results)}") - print(f"the median max number of consecutive timesteps that a triple appears in is {np.median(results)}") - print(f"the maximum max number of consecutive timesteps that a triple appears in is {np.max(results)}") - print(f"the std for max number of consecutive timesteps that a triple appears in is {np.std(results)}") - - direct_recurrency_degree = count_previous / count_all - recurrency_degree = count_sometime / count_all - consecutiveness_degree = np.mean(results) # the mean max number of consecutive timesteps that a triple appears in - # compute graph parameters (density and such stuff) - - # compute number of triples per timestep - n_nodes_list = [] - n_edges_list = [] - - ts_set = list(set(timestamps_orig)) - ts_set.sort() - ts_dist = ts_set[1] - ts_set[0] - if 'tkg' in dataset_name: - all_possible_orig_timestamps =get_original_ts(all_possible_timestep_indices, ts_dist, np.min(ts_set)) - - no_nodes_list = [] - no_nodes_list_orig = [] - no_nodes_datetime = [] - for t in ts_all.t_2_triple.keys(): - num_nodes_ts = len(ts_all.unique_nodes(ts_all.t_2_triple[t])) - n_nodes_list.append(num_nodes_ts) - n_edges_list.append(len(ts_all.t_2_triple[t])) - if 'tkg' in dataset_name: - if num_nodes_ts == 0: - if t not in no_nodes_list: - 
no_nodes_list.append(t) - no_nodes_list_orig.append(all_possible_orig_timestamps[t]) - no_nodes_datetime.append(datetime.utcfromtimestamp(all_possible_orig_timestamps[t])) - # compute seasonality of num nodes over time: - seasonal_value =1 - seasonal_value = du.estimate_seasons(n_nodes_list) - if seasonal_value == 1: - print('there was no seasonality for number of nodes found') - else: - print(f'the seasonality for number of nodes is {seasonal_value}') - if 'tkgl' in dataset_name: - print('we have 0 nodes for' + str(len(no_nodes_list)) + ' timesteps') - print('0 nodes for timesteps: ', no_nodes_list) - print('this is original unix timestamps: ', no_nodes_list_orig) - print('this is datetime: ', no_nodes_datetime) - else: - print('we have 0 nodes for' + str(len(no_nodes_list)) + ' timesteps') - - - print(f"average number of triples per ts is {np.mean(n_edges_list)}") - print(f"std for average number of triples per ts is {np.std(n_edges_list)}") - print(f"min/max number of triples per ts is {np.min(n_edges_list), np.max(n_edges_list)}") - - print(f"average number of nodes per ts is {np.mean(n_nodes_list)}") - print(f"std for average number of nodes per ts is {np.std(n_nodes_list)}") - print(f"min/max number of nodes per ts is {np.min(n_nodes_list), np.max(n_nodes_list)}") - colortgb = '#60ab84' - bars_list = [20, 50, 100] - for num_bars in bars_list: - if num_bars < 100: - capsize=1.5 - capthick=1.5 - elinewidth=1.5 - else: - capsize=1 - capthick=1 - elinewidth=1 - ts_discretized_mean, ts_discretized_sum, ts_discretized_min, ts_discretized_max, start_indices, end_indices, mid_indices = du.discretize_values(n_edges_list, num_bars) - plt.figure() - # plt.bar(mid_indices, ts_discretized_mean, width=(len(n_edges_list) // num_bars), label ='mean value', color =colortgb) - plt.step(mid_indices, ts_discretized_mean, where='mid', linestyle='-', label ='mean value', color=colortgb) - plt.scatter(mid_indices, ts_discretized_min, label ='min value') - plt.scatter(mid_indices, ts_discretized_max, label ='max value') - plt.xlabel('Timestep (bins)') - plt.ylabel('Number of Edges') - plt.legend() - #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps') - modified_dataset_name = dataset_name.replace('-', '_') - current_dir = os.path.dirname(os.path.abspath(__file__)) - # Navigate one folder up - parent_dir = os.path.dirname(current_dir) - figs_dir = os.path.join( parent_dir, modified_dataset_name, 'figs') - # Create the 'figs' directory if it doesn't exist - if not os.path.exists(figs_dir): - os.makedirs(figs_dir) - save_path = (os.path.join(figs_dir, f"num_edges_discretized_{num_bars}_{dataset_name}.png")) - plt.savefig(save_path, bbox_inches='tight') - save_path = (os.path.join(figs_dir, f"num_edges_discretized_{num_bars}_{dataset_name}.pdf")) - plt.savefig(save_path, bbox_inches='tight') - - plt.figure() - mins = np.array(ts_discretized_min) - maxs = np.array(ts_discretized_max) - means = np.array(ts_discretized_mean) - # plt.bar(mid_indices, ts_discretized_mean, width=(len(n_edges_list) // num_bars), label='Mean', color =colortgb) - # plt.step(mid_indices, ts_discretized_mean, where='mid', linestyle='-', label ='mean value', color=colortgb) - plt.scatter(mid_indices, ts_discretized_mean, label ='mean value', color=colortgb) - plt.errorbar(mid_indices, maxs, yerr=[maxs-mins, maxs-maxs], fmt='none', alpha=0.9, color='grey',capsize=capsize, capthick=capthick, elinewidth=elinewidth, label='Min-Max Range') - plt.xlabel('Timestep (bins)') - plt.ylabel('Number of Edges') - 
plt.legend() - #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps') - plt.show() - save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2.png")) - plt.savefig(save_path2, bbox_inches='tight') - save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2.pdf")) - plt.savefig(save_path2, bbox_inches='tight') - - plt.figure() - mins = np.array(ts_discretized_min) - maxs = np.array(ts_discretized_max) - means = np.array(ts_discretized_mean) - plt.bar(mid_indices, ts_discretized_sum, width=(len(n_edges_list) // num_bars), label='Sum', color =colortgb) - # plt.step(mid_indices, ts_discretized_mean, where='mid', linestyle='-', label ='mean value', color=colortgb) - # plt.errorbar(mid_indices, sums, yerr=[mins, maxs], fmt='none', alpha=0.9, color='grey',capsize=1.5, capthick=1.5, elinewidth=2, label='Min-Max Range') - plt.xlabel('Timestep (bins)') - plt.ylabel('Number of Edges') - plt.legend() - #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps') - plt.show() - save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}3.png")) - plt.savefig(save_path2, bbox_inches='tight') - save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}3.pdf")) - plt.savefig(save_path2, bbox_inches='tight') - - try: - plt.figure() - mins = np.array(ts_discretized_min) - maxs = np.array(ts_discretized_max) - means = np.array(ts_discretized_mean) - # plt.bar(mid_indices, ts_discretized_mean, width=(len(n_edges_list) // num_bars), label='Mean', color =colortgb) - # plt.step(mid_indices, ts_discretized_mean, where='mid', linestyle='-', label ='mean value', color=colortgb) - plt.scatter(mid_indices, ts_discretized_mean, label ='mean value', color=colortgb) - plt.errorbar(mid_indices, maxs, yerr=[maxs-mins, maxs-maxs], fmt='none', alpha=0.9, color='grey',capsize=capsize, capthick=capthick, elinewidth=elinewidth, label='Min-Max Range') - plt.xlabel('Timestep (bins)') - plt.ylabel('Number of Edges') - #plt.title(dataset_name+ ' - Number of Edges aggregated across multiple timesteps') - plt.yscale('log') - plt.legend() - plt.show() - save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2log.png")) - plt.savefig(save_path2, bbox_inches='tight') - save_path2 = (os.path.join(figs_dir,f"num_edges_discretized_{num_bars}_{dataset_name}2log.pdf")) - plt.savefig(save_path2, bbox_inches='tight') - except: - print('Could not plot log scale') - plt.close('all') - - plt.figure() - plt.scatter(range(ts_all.num_timesteps), n_edges_list, s=0.2) - plt.xlabel('timestep') - plt.ylabel('number of triples') - #plt.title(f'Number of triples per timestep for {dataset_name}') - # save - # Get the current directory of the script - current_dir = os.path.dirname(os.path.abspath(__file__)) - # Navigate one folder up - parent_dir = os.path.dirname(current_dir) - # Save stats_dict as CSV - modified_dataset_name = dataset_name.replace('-', '_') - save_path = (os.path.join(figs_dir,f"num_edges_per_ts_{dataset_name}.png")) - plt.savefig(save_path, bbox_inches='tight') - - to_be_saved_dict = {} - to_be_saved_dict['num_edges'] = n_edges_list - to_be_saved_dict['num_nodes'] = n_nodes_list - parent_dir = os.path.dirname(current_dir) - save_path = (os.path.join(figs_dir,f"numedges_{dataset_name}.json")) - save_file = open(save_path, "w") - json.dump(to_be_saved_dict, save_file) - save_file.close() - - plt.figure() - plt.scatter(range(ts_all.num_timesteps), 
n_nodes_list, s=0.2) - plt.xlabel('timestep') - plt.ylabel('number of nodes') - #plt.title(f'Number of nodes per timestep for {dataset_name}') - save_path = (os.path.join(figs_dir,f"num_nodes_per_ts_{dataset_name}.png")) - plt.savefig(save_path, bbox_inches='tight') - plt.close('all') - - - - du.create_dict_and_save(dataset_name, num_rels_without_inv, num_nodes, num_train_quads, num_val_quads, num_test_quads, - num_all_quads, num_train_timesteps, num_val_timesteps, num_test_timesteps, num_all_ts, - test_ind_nodes, test_ind_nodes_perc, val_ind_nodes, val_ind_nodes_perc, - direct_recurrency_degree, recurrency_degree, consecutiveness_degree, - np.mean(n_edges_list), np.std(n_edges_list), np.min(n_edges_list), np.max(n_edges_list), - np.mean(n_nodes_list), np.std(n_nodes_list), np.min(n_nodes_list), np.max(n_nodes_list), - seasonal_value, collision_trainval, collision_valtest) diff --git a/tgb/datasets/dataset_scripts/compute_relation_results_df.py b/tgb/datasets/dataset_scripts/compute_relation_results_df.py deleted file mode 100644 index 329f937..0000000 --- a/tgb/datasets/dataset_scripts/compute_relation_results_df.py +++ /dev/null @@ -1,100 +0,0 @@ -import numpy as np - -import sys -import os -import os.path as osp -tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) -sys.path.append(tgb_modules_path) -import json - -## imports -import numpy as np -import pandas as pd -import tgb.datasets.dataset_scripts.dataset_utils as du - - - - -# specify params -names = [ 'tkgl-icews', 'tkgl-polecat'] #'tkgl-polecat','tkgl-smallpedia', 'tkgl-yago', 'tkgl-icews' ,'tkgl-smallpedia','thgl-myket','tkgl-yago', 'tkgl-icews','thgl-github', 'thgl-forum', 'tkgl-wikidata'] -methods = [ 'regcn'] #'recurrency' -colortgb = '#60ab84' -colortgb2 = '#eeb641' -colortgb3 = '#dd613a' -#colortgb4 ='#bce9ef' -#colortgb5 ='#d6e9d9' - -colors = [colortgb,colortgb2,colortgb3] # from tgb logo -capsize=1.5 -capthick=1.5 -elinewidth=1.5 -occ_threshold = 5 -k=10 # how many slices in the cake +1 -plots_flag = True - -model_names = {'recurrency': {'tkgl-polecat': ['saved_models/RecurrencyBaseline', 1], - 'tkgl-icews': ['saved_models/RecurrencyBaseline', 500]}, - 'regcn': {'tkgl-polecat': 'saved_results/REGCN_tkgl-polecat_results_per_rel.json', - 'tkgl-icews': 'saved_results/REGCN_tkgl-icews_results_per_rel.json'}} -# run through each datasest -for dataset_name in names: - # read dataframe with the stats for this dataset from csv - print(dataset_name) - modified_dataset_name = dataset_name.replace('-', '_') - current_dir = os.path.dirname(os.path.abspath(__file__)) - # Navigate one folder up - parent_dir = os.path.dirname(current_dir) - - tgb_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) - figs_dir = os.path.join(parent_dir, modified_dataset_name, 'figs') - # Create the 'figs' directory if it doesn't exist - if not os.path.exists(figs_dir): - os.makedirs(figs_dir) - stats_dir = os.path.join( parent_dir, modified_dataset_name, 'stats') - if not os.path.exists(stats_dir): - os.makedirs(stats_dir) - stats_df = pd.read_csv(os.path.join(stats_dir, f"relation_statistics_{dataset_name}.csv")) - - for method in methods: - results_dict = os.path.join(tgb_dir, 'examples', 'linkproppred', dataset_name) - - - # if the method is recurrency, we need to load the results from the csv and compute the mean value per relation - # load csv, create dict with the mean value per relation - if method == 'recurrency': - name = model_names[method][dataset_name][0] - 
seed = model_names[method][dataset_name][1] - results_filename = f'{results_dict}/{name}' - # results_df = pd.read_csv(results_filename) - # for each relation, compute the mean value of the relation_mrr - # csv i - mrr_per_rel, full_mrr = du.read_dict_compute_mrr_per_rel(results_dict, name, dataset_name, seed, num_rels=0, split_mode='test') - - else: - name = model_names[method][dataset_name] - results_filename = f'{results_dict}/{name}' - with open(results_filename, 'r') as json_file: - mrr_per_rel = json.load(json_file) - - - # else we can just load the results from the json file - # load json where for each relation the results are stored - - # for each entry in the stats dataframe: append a column for each method with the mean value of the relation_mrr - # if the column is not present, add it - # if the column is present, append the value - if method in stats_df.columns: - print('Column already present') - else: - # each line of the original dataframe has a relation id - for rel in stats_df['relation']: - if str(rel) in mrr_per_rel: - stats_df.loc[stats_df['relation'] == rel, method] = mrr_per_rel[str(rel)] - elif rel in mrr_per_rel: - stats_df.loc[stats_df['relation'] == rel, method] = mrr_per_rel[rel] - else: - stats_df.loc[stats_df['relation'] == rel, method] = 'N/A' - # save the dataframe with the new columns - stats_df.to_csv(os.path.join(stats_dir, f"relation_statistics_{dataset_name}.csv"), index=False) - - # save dataframe with the new columns diff --git a/tgb_modules/recurrencybaseline_predictor.py b/tgb_modules/recurrencybaseline_predictor.py index 63b959d..be84b35 100644 --- a/tgb_modules/recurrencybaseline_predictor.py +++ b/tgb_modules/recurrencybaseline_predictor.py @@ -104,12 +104,8 @@ def baseline_predict(num_queries, test_data, all_data, window, basis_dict, num_n # Find quadruples that match the rule (starting from the test query subject) # Find edges whose subject match the query subject and the relation matches # the relation in the rule body. np array with [[sub, obj, ts]] - if 0 not in [len(x) for x in walk_edges]: # if we found at least one potential rule - if len(neg_sample_el) < num_nodes-2: - cands_subset = neg_sample_el + pos_sample_el - else: - cands_subset = [] - cands_dict_psi = get_candidates_psi(walk_edges[0][:,1:3], cur_ts, cands_dict, lmbda_psi, sum_delta_t, cands_subset) + if 0 not in [len(x) for x in walk_edges]: # if we found at least one potential rule + cands_dict_psi = get_candidates_psi(walk_edges[0][:,1:3], cur_ts, cands_dict, lmbda_psi, sum_delta_t) if len(cands_dict_psi)>0: # predictions_psi = create_scores_tensor(cands_dict_psi, num_nodes) predictions_psi = create_scores_array(cands_dict_psi, num_nodes) @@ -240,7 +236,7 @@ def quads_per_rel(quads): edges[rel] = quads[quads[:, 1] == rel] return edges -def get_candidates_psi(rule_walks, test_query_ts, cands_dict,lmbda, sum_delta_t,cands_subset): +def get_candidates_psi(rule_walks, test_query_ts, cands_dict,lmbda, sum_delta_t): """ Get answer candidates from the walks that follow the rule. Add the confidence of the rule that leads to these candidates. 
@@ -257,12 +253,8 @@
         cands_dict (dict): keys: candidates, values: score for the candidates
     """
     cands = set(rule_walks[:,0])
-    if len(cands_subset) > 0:
-        cands_subset = set(cands_subset)
-        cands_of_interest = cands.intersection(cands_subset)
-    else:
-        cands_of_interest = cands
-    for cand in cands_of_interest:
+
+    for cand in cands:
         cands_walks = rule_walks[rule_walks[:,0] == cand]
         score = score_psi(cands_walks, test_query_ts, lmbda, sum_delta_t).astype(np.float64)
         cands_dict[cand] = score
@@ -355,5 +347,4 @@ def update_delta_t(min_ts, max_ts, cur_ts, lmbda):
     now = np.ones(len(timesteps))*cur_ts
     delta_all = score_delta(timesteps, now, lmbda)
     delta_all = np.sum(delta_all)
-    return delta_all
-
+    return delta_all
\ No newline at end of file
diff --git a/tgb_modules/tkg_utils.py b/tgb_modules/tkg_utils.py
index 00352ac..3e5922f 100644
--- a/tgb_modules/tkg_utils.py
+++ b/tgb_modules/tkg_utils.py
@@ -109,8 +109,8 @@ def get_args_cen():
                         help="dataset to use")
     parser.add_argument("--test", type=int, default=0,
                         help="1: formal test 2: continual test")
-    parser.add_argument("--trainflag", type=bool, default=True,
-                        help="do we want to train or directly test")
+    parser.add_argument("--validtest", default=False,
+                        help="load the saved model state from dir and directly run validation and test")
     parser.add_argument("--test-only", type=bool, default=False,
                         help="do we want to compute valid mrr or only test")
     parser.add_argument("--run-statistic", action='store_true', default=False,
diff --git a/tgb_modules/tkg_utils_dgl.py b/tgb_modules/tkg_utils_dgl.py
new file mode 100644
index 0000000..bcc64b8
--- /dev/null
+++ b/tgb_modules/tkg_utils_dgl.py
@@ -0,0 +1,48 @@
+
+import dgl
+import torch
+import numpy as np
+
+
+def build_sub_graph(num_nodes, num_rels, triples, use_cuda, gpu, mode='dyn'):
+    """
+    Build a DGL graph from (src, rel, dst) triples and attach in-degree norms.
+    Adapted from https://github.com/Lee-zix/CEN/blob/main/rgcn/utils.py
+    :param num_nodes: number of nodes in the graph
+    :param num_rels: number of relations
+    :param triples: np.array of (src, rel, dst) triples
+    :param use_cuda: whether to move the graph to the GPU
+    :param gpu: gpu id
+    :param mode: 'dyn' or 'static'; 'static' also adds inverse edges with relation ids shifted by num_rels
+    :return: the DGL graph with node and edge norms attached
+    """
+    def comp_deg_norm(g):
+        in_deg = g.in_degrees(range(g.number_of_nodes())).float()
+        in_deg[torch.nonzero(in_deg == 0).view(-1)] = 1
+        norm = 1.0 / in_deg
+        return norm
+
+    src, rel, dst = triples.transpose()
+    if mode == 'static':
+        src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
+        rel = np.concatenate((rel, rel + num_rels))
+    g = dgl.DGLGraph()
+    g.add_nodes(num_nodes)
+    # g.ndata['original_id'] = np.unique(np.concatenate((np.unique(triples[:,0]), np.unique(triples[:,2]))))
+    g.add_edges(src, dst)
+    norm = comp_deg_norm(g)
+    # node_id = torch.arange(0, g.num_nodes(), dtype=torch.long).view(-1, 1)  # updated because not only the first k nodes of our graph have static info
+    node_id = torch.arange(0, num_nodes, dtype=torch.long).view(-1, 1)
+    g.ndata.update({'id': node_id, 'norm': norm.view(-1, 1)})
+    g.apply_edges(lambda edges: {'norm': edges.dst['norm'] * edges.src['norm']})
+    g.edata['type'] = torch.LongTensor(rel)
+
+    uniq_r, r_len, r_to_e = r2e(triples, num_rels)  # r2e (grouping edge ids by relation) is expected from CEN's rgcn utils
+    g.uniq_r = uniq_r
+    g.r_to_e = r_to_e
+    g.r_len = r_len
+
+    if use_cuda:
+        g = g.to(gpu)
+        g.r_to_e = torch.from_numpy(np.array(r_to_e))
+    return g
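For context on the `get_candidates_psi` simplification above: with the `cands_subset` shortcut removed, every candidate object reached by a rule walk is scored, and the score decays with the time gap between the supporting walk and the query. A minimal sketch of that scoring idea in plain numpy, assuming the usual exponential decay 2 ** (-lmbda * |t_query - t_walk|) for `score_delta` (the module's `score_psi` additionally normalizes with `sum_delta_t`); `rule_walks` and all values below are illustrative, not the module's exact API:

import numpy as np

def score_delta(ts_walk, ts_query, lmbda):
    # assumed decay: walks closer in time to the query count more
    return 2.0 ** (-lmbda * np.abs(ts_query - ts_walk))

# rows of (candidate_object, timestamp), mirroring rule_walks above
rule_walks = np.array([[7, 3], [7, 9], [4, 8]])
test_query_ts, lmbda = 10, 0.1

scores = {}
for cand in set(rule_walks[:, 0]):
    cands_walks = rule_walks[rule_walks[:, 0] == cand]
    scores[int(cand)] = score_delta(cands_walks[:, 1], test_query_ts, lmbda).sum()
print(scores)  # every reachable candidate is scored; no subset filtering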
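Similarly, the normalization that the new `build_sub_graph` helper attaches is plain inverse in-degree per node, multiplied per edge as dst_norm * src_norm. A DGL-free sketch of just that computation, with `toy_edges` made up for illustration and not part of the patch:

import torch

toy_edges = torch.tensor([[0, 1], [2, 1], [1, 3]])  # (src, dst) pairs
num_nodes = 4

# per-node norm = 1 / in-degree, zero in-degrees clamped to 1 (as in comp_deg_norm)
in_deg = torch.bincount(toy_edges[:, 1], minlength=num_nodes).float()
in_deg[in_deg == 0] = 1.0
node_norm = 1.0 / in_deg  # tensor([1.0000, 0.5000, 1.0000, 1.0000])

# per-edge norm = dst_norm * src_norm, as in the g.apply_edges(...) call
edge_norm = node_norm[toy_edges[:, 1]] * node_norm[toy_edges[:, 0]]
print(edge_norm)  # tensor([0.5000, 0.5000, 0.5000])

Clamping zero in-degrees to 1 keeps the norm finite for isolated nodes, which is exactly what `comp_deg_norm` does before the R-GCN message passing.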