From 5e72e0bee46280dda626cbde675ed11b26d080c1 Mon Sep 17 00:00:00 2001
From: Shenyang Huang
Date: Wed, 29 May 2024 16:31:13 -0400
Subject: [PATCH 1/5] adding recurrency baseline to run

---
 examples/linkproppred/thgl-github/recurrencybaseline.py | 1 +
 tgb_modules/tkg_utils.py                                | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/linkproppred/thgl-github/recurrencybaseline.py b/examples/linkproppred/thgl-github/recurrencybaseline.py
index f42b1f2..5f764fe 100644
--- a/examples/linkproppred/thgl-github/recurrencybaseline.py
+++ b/examples/linkproppred/thgl-github/recurrencybaseline.py
@@ -9,6 +9,7 @@
 organization={International Joint Conferences on Artificial Intelligence Organization}
 }
+python recurrencybaseline.py --seed 1 --num_processes 1 -tr False
 """
 
 ## imports
diff --git a/tgb_modules/tkg_utils.py b/tgb_modules/tkg_utils.py
index 00352ac..1c45c69 100644
--- a/tgb_modules/tkg_utils.py
+++ b/tgb_modules/tkg_utils.py
@@ -2,7 +2,7 @@
 from itertools import groupby
 from operator import itemgetter
 from collections import defaultdict
-import dgl
+# import dgl
 import sys
 import argparse
 import torch

From 64bf43a28103a82bbb6194e7c0486210a3d4aca5 Mon Sep 17 00:00:00 2001
From: Shenyang Huang
Date: Mon, 3 Jun 2024 11:18:18 -0400
Subject: [PATCH 2/5] adding gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index d21b06e..33157d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 !requirements*.txt
+get_croissant.py
 #dataset
 *.xz
 *.dict

From 40174df33ef848ac747be16a851f60fca608a801 Mon Sep 17 00:00:00 2001
From: Shenyang Huang
Date: Mon, 10 Jun 2024 15:43:45 -0400
Subject: [PATCH 3/5] small change

---
 stats_figures/create_relation_figures.py | 550 +++++++++++------------
 1 file changed, 275 insertions(+), 275 deletions(-)

diff --git a/stats_figures/create_relation_figures.py b/stats_figures/create_relation_figures.py
index 16164f4..6b136f3 100644
--- a/stats_figures/create_relation_figures.py
+++ b/stats_figures/create_relation_figures.py
@@ -1,275 +1,275 @@
-""" pie charts, mrr per relation charts
-"""
-
-## imports
-import numpy as np
-import sys
-import os
-import os.path as osp
-tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(tgb_modules_path)
-import matplotlib.pyplot as plt
-from matplotlib.colors import LogNorm
-from matplotlib.colors import Normalize
-import numpy as np
-import pandas as pd
-import stats_figures.dataset_utils as du
-
-
-# specify params
-# which datasets
-names = ['thgl-software']#['tkgl-smallpedia','tkgl-polecat', 'tkgl-icews']#[ 'tkgl-polecat', 'tkgl-icews', 'tkgl-wikidata', 'tkgl-smallpedia', 'thgl-software'] #['thgl-software']#
-# names = [ 'tkgl-polecat', 'tkgl-icews', 'tkgl-wikidata', 'tkgl-smallpedia','tkgl-polecat'] #'tkgl-polecat','tkgl-smallpedia', 'tkgl-yago', 'tkgl-icews' ,'tkgl-smallpedia','thgl-myket','tkgl-yago', 'tkgl-icews','thgl-github', 'thgl-forum', 'tkgl-wikidata']
-# which methods for the mrr_per_rel figures
-methods = ['recurrency', 'regcn', 'cen'] #'recurrency'
-colortgb = '#60ab84' #tgb logo colors
-colortgb2 = '#eeb641'
-colortgb3 = '#dd613a'
-head_tail_flag = False # if true, the head and tail of the relation are shown in the plot, otherwise just the mean across both directions
-
-# pie chart colors
-colors = [colortgb,colortgb2,colortgb3] # from tgb logo
-colors2= ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#ffff99'] #from https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=11
-
-# colors2= ['#8e0152', '#c51b7d', '#de77ae', '#f1b6da', '#fde0ef', '#f7f7f7', '#e6f5d0', '#b8e186', '#7fbc41', '#4d9221', '#276419']
-# from https://colorbrewer2.org/#type=diverging&scheme=PiYG&n=11 color blind friendly
-
-capsize=1.5
-capthick=1.5
-elinewidth=1.5
-occ_threshold = 5
-k=10 # how many slices in the pie, +1 for 'Others'
-# k = 14
-plots_flag = True
-ylimdict = {'tkgl-polecat': 0.25, 'tkgl-icews':0.6, 'tkgl-smallpedia': 1.01} # the upper mrr limit for the mrr charts
-
-overall_min = -1 # for the correlation matrix colorbar
-overall_max =1 # for the correlation matrix colorbar
-num_rels_plot = 10 # how many relations we want to plot in the mrr chart
-i = 0
-plot_values_list = []
-plot_names_multi_line_list =[]
-for dataset_name in names:
-    print('dataset_name:', dataset_name)
-    # some directory stuff
-    modified_dataset_name = dataset_name.replace('-', '_')
-    current_dir = os.path.dirname(os.path.abspath(__file__))
-
-    stats_dir = os.path.join( current_dir,dataset_name,'stats')
-    tgb_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-    figs_dir = os.path.join(current_dir,dataset_name,'figs_rel')
-    stats_df = pd.read_csv(os.path.join(stats_dir, f"relation_statistics_{dataset_name}.csv"))
-
-    # Create the 'figs' directory if it doesn't exist
-    if not os.path.exists(figs_dir):
-        os.makedirs(figs_dir)
-    stats_dir = os.path.join( current_dir,dataset_name,'stats')
-    if not os.path.exists(stats_dir):
-        os.makedirs(stats_dir)
-
-    ### A) pie charts #plot top k relations according to the number of occurrences plus a slice for "others"
-    plot_names = list(stats_df['rel_string_word'].iloc[:k])
-    plot_values = list(stats_df['number_total_occurences'].iloc[:k])
-    all_others = np.sum(stats_df['number_total_occurences'].iloc[k:]) #slice for "others" (sum of all other relations occurrences)
-    plot_values.append(all_others)
-    plot_names.append('Others')
-    # for the pie chart labels to be more readable (i.e. force line break if words are long)
-    plot_names_multi_line= []
-    for name in plot_names: # add some \n so the labels fit better into the pie chart
-        if type(name) == str:
-            words = name.split()
-            newname = words[0]
-            if len(words) > 1:
-                for i in range(len(words)-1):
-                    if not '(' in words[i+1]:
-                        if len(words[i]) > 3:
-                            newname+='\n'
-                        else:
-                            newname+=' '
-                        newname+=words[i+1]
-        else:
-            newname = str(name) #then only plot the int as is.
-        plot_names_multi_line.append(newname)
-
-    num_slices = len(plot_names)
-    plt.figure(figsize=(7, 7))
-    wedges, texts, autotexts =plt.pie(plot_values,autopct=lambda pct: f"{pct:.0f}%" if pct > 1.5 else '', startangle=140, colors=colors2, labeldistance=2.2) #repeated_colors)
-    # Increase the font size of the percentage values
-    for autotext in autotexts:
-        autotext.set_fontsize(20) #15
-    plt.axis('equal')
-    # Move the percentage labels further outside
-    for autotext, wedge in zip(autotexts, wedges):
-        angle = (wedge.theta2 - wedge.theta1) / 2 + wedge.theta1
-        x = np.cos(np.deg2rad(angle))
-        y = np.sin(np.deg2rad(angle))
-        distance = 0.85 # Adjust this value to move the labels further or closer to the center
-        autotext.set_position((x * distance, y * distance))
-    # Set the labels for each pie slice
-    # plt.legend(wedges, plot_names_multi_line, loc="center left", bbox_to_anchor=(1, 0, 0.5, 1), fontsize=14)
-    plt.legend(wedges, plot_names_multi_line, loc="center left", bbox_to_anchor=(1, 0, 0.5, 1), fontsize=14)
-    save_path = (os.path.join(figs_dir, f"rel_pie_{dataset_name}.png"))
-    plt.savefig(save_path, bbox_inches='tight')
-    save_path = (os.path.join(figs_dir, f"rel_pie_{dataset_name}.pdf"))
-    plt.savefig(save_path, bbox_inches='tight')
-
-    if dataset_name == 'tkgl-wikidata': #then we do not want to plot the mrr for the relations
-        continue
-
-    ### B) plot the mrr for each relation for each method, different color for different number of occurrences or for different recurrency degree
-
-    # prepare the dataframe: only take the top ten relations according to number of occurrences and sort by recurrency degree
-    # we use selected_df_sorted to plot the relations in the order of recurrency degree
-    rels_sorted = np.array(stats_df['relation'])[0:num_rels_plot]
-    mask = stats_df['relation'].isin(rels_sorted)
-    selected_df = stats_df[mask] #only the parts of the dataframe that contain the top ten relations according to number of occurrences
-    selected_df_sorted = selected_df.sort_values(by='recurrency_degree', ascending=False) # Sort selected_df by 'recurrency_degree' column in descending order
-    rels_to_plot = list(selected_df_sorted['relation'])
-    labels = np.array(selected_df_sorted['relation'])# only plotting the id for space reasons
-    mrr_per_rel_freq = [] # list of mrr values for each relation - three lists for three methods
-    mrr_per_rel_freq2 = []
-    mrr_per_rel_freq3 = []
-    lab = []
-    lab_ht = []
-    lab_rel = []
-    # rel_oc_dict[rel] = count_occurrences
-    count_occurrences_sorted = []
-    rec_degree_sorted = []
-    for index, r in enumerate(rels_to_plot):
-        if head_tail_flag:
-            lab_ht.append('h')
-            lab_ht.append('t')
-            lab_rel.append(str(labels[index])+' ') # add spaces to make the labels longer
-        else:
-            lab_rel.append(str(labels[index])+'') # add spaces to make the labels longer
-
-        lab.append(labels[index])
-        if head_tail_flag: # if we do head and tail separately we need the value for head and tail direction
-            mrr_per_rel_freq.append(selected_df_sorted['recurrency_head'].iloc[index])
-            mrr_per_rel_freq.append(selected_df_sorted['recurrency_tail'].iloc[index])
-            mrr_per_rel_freq2.append(selected_df_sorted['regcn_head'].iloc[index])
-            mrr_per_rel_freq2.append(selected_df_sorted['regcn_tail'].iloc[index])
-            mrr_per_rel_freq3.append(selected_df_sorted['cen_head'].iloc[index])
-            mrr_per_rel_freq3.append(selected_df_sorted['cen_tail'].iloc[index])
-            count_occurrences_sorted.append(selected_df_sorted['number_total_occurences'].iloc[index])#append twice for head and tail
-            count_occurrences_sorted.append(selected_df_sorted['number_total_occurences'].iloc[index])
-            rec_degree_sorted.append(selected_df_sorted['recurrency_degree'].iloc[index]) #append twice for head and tail
-            rec_degree_sorted.append(selected_df_sorted['recurrency_degree'].iloc[index])
-        else:# if we do NOT head and tail separately we need the mean value for head and tail direction
-            mrr_per_rel_freq.append(np.mean([selected_df_sorted['recurrency_head'].iloc[index], selected_df_sorted['recurrency_tail'].iloc[index]]))
-            mrr_per_rel_freq2.append(np.mean([selected_df_sorted['regcn_head'].iloc[index],selected_df_sorted['regcn_tail'].iloc[index]]))
-            mrr_per_rel_freq3.append(np.mean([selected_df_sorted['cen_head'].iloc[index], selected_df_sorted['cen_tail'].iloc[index]]))
-            count_occurrences_sorted.append(selected_df_sorted['number_total_occurences'].iloc[index])#append twice for head and tail
-            rec_degree_sorted.append(selected_df_sorted['recurrency_degree'].iloc[index])
-
-    # these are the x-values of the ticks. in case we plot head and tail separately, we need to have two ticks per relation
-    x_values = []
-    x_values_rel = []
-    for i in range(0,num_rels_plot):
-        if head_tail_flag:
-            x_values.append(i*2+0.4)
-            x_values.append(i*2+0.8)
-        else:
-            x_values.append(i*2+0.4)
-        x_values_rel.append(i*2+0.4)
-
-    lab_lines = lab_rel #labels, for now
-    a = count_occurrences_sorted
-
-    # version 1) colors are based on the recurrency degree
-    plt.figure()
-    sca = plt.scatter(x_values, mrr_per_rel_freq2, marker='p',s=150, c = rec_degree_sorted, alpha=1, edgecolor='grey', cmap='jet', norm=Normalize(vmin=0, vmax=1), label='REGCN') # cmap='gist_rainbow',
-    sca = plt.scatter(x_values, mrr_per_rel_freq3 , marker='*',s=150, c = rec_degree_sorted, alpha=1, edgecolor='grey', cmap='jet', norm=Normalize(vmin=0, vmax=1), label='CEN')
-    sca = plt.scatter(x_values, mrr_per_rel_freq, marker='o',s=60, c = rec_degree_sorted, alpha=1, edgecolor='grey', cmap='jet', norm=Normalize(vmin=0, vmax=1), label='Recurrency Baseline')
-    plt.ylabel('MRR', fontsize=14)
-    plt.xlabel('Relation', fontsize=14)
-    plt.legend(fontsize=14)
-    cbar =plt.colorbar(sca)
-    plt.ylim([0,ylimdict[dataset_name]])
-    cbar.ax.yaxis.label.set_color('gray')
-
-    if head_tail_flag:
-        plt.xticks(x_values, lab_ht, size=13) #, verticalalignment="center") # ha='right',
-        plt.xticks(x_values_rel, lab_lines, size=14, minor=True)
-        plt.tick_params(axis='x', which='minor', rotation=90, length=0)
-    else:
-        plt.xticks(x_values_rel, lab_lines, size=14)
-        plt.tick_params(axis='x', rotation=90, length=0)
-    plt.yticks(size=13)
-    # Create a locator for the second set of x-ticks
-    # plt.secondary_xaxis('top', x_values_rel)
-
-    plt.grid()
-    save_path = (os.path.join(figs_dir, f"rel_mrrperrel_recdeg_{dataset_name}.png"))
-    plt.savefig(save_path, bbox_inches='tight')
-    save_path = (os.path.join(figs_dir, f"rel_mrrperrel_recdeg_{dataset_name}.pdf"))
-    plt.savefig(save_path, bbox_inches='tight')
-    print('saved in ', save_path)
-
-    # version 2) colors are the number of occurrences
-    plt.figure()
-    sca = plt.scatter(x_values, mrr_per_rel_freq2, marker='p',s=150, c = a, alpha=1, edgecolor='grey', norm=LogNorm(), cmap='jet', label='REGCN')
-    sca = plt.scatter(x_values, mrr_per_rel_freq3 , marker='*',s=150, c = a, alpha=1, edgecolor='grey', norm=LogNorm(), cmap='jet', label='CEN')
-    sca = plt.scatter(x_values, mrr_per_rel_freq, marker='o',s=60, c = a, alpha=1, edgecolor='grey', norm=LogNorm(), cmap='jet', label='Recurrency Baseline')
-    plt.ylabel('MRR', fontsize=14)
-    plt.xlabel('Relation', fontsize=14)
-    plt.legend(fontsize=14)
-    cbar =plt.colorbar(sca)
-    plt.ylim([0,ylimdict[dataset_name]])
-    cbar.ax.yaxis.label.set_color('gray')
-
-    plt.xticks(x_values, lab_ht, size=13) #, verticalalignment="center") # ha='right',
-    plt.yticks(size=13)
-    # Create a locator for the second set of x-ticks
-    # plt.secondary_xaxis('top', x_values_rel)
-    plt.xticks(x_values_rel, lab_lines, size=14, minor=True)
-    plt.tick_params(axis='x', which='minor', rotation=90, length=0)
-    plt.grid()
-    save_path = (os.path.join(figs_dir, f"rel_mrrperrel_occ_{dataset_name}.png"))
-    plt.savefig(save_path, bbox_inches='tight')
-
-
-    ### C) plot various correlation matrices. I specify different columns for the different plots
-    df = stats_df[['recurrency_degree', 'direct_recurrency-degree', 'recurrency_tail', 'recurrency_head', 'regcn_tail', 'regcn_head', 'cen_tail', 'cen_head']]
-    corrmat= df.corr()
-    f = plt.figure(figsize=(19, 15))
-    plt.matshow(corrmat, fignum=f.number, vmin=overall_min, vmax=overall_max)
-    plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16, rotation=90)
-    plt.yticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16)
-    cb = plt.colorbar()
-    cb.ax.tick_params(labelsize=16)
-    save_path = (os.path.join(figs_dir, f"corr_rec_meth_{dataset_name}.png"))
-    plt.savefig(save_path, bbox_inches='tight')
-
-    df = stats_df[['consecutiveness_value', 'recurrency_tail', 'recurrency_head', 'regcn_tail', 'regcn_head', 'cen_tail', 'cen_head']]
-    corrmat= df.corr()
-    f = plt.figure(figsize=(19, 15))
-    plt.matshow(corrmat, fignum=f.number, vmin=overall_min, vmax=overall_max)
-    plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16, rotation=90)
-    plt.yticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16)
-    cb = plt.colorbar()
-    cb.ax.tick_params(labelsize=16)
-    save_path = (os.path.join(figs_dir, f"corr_con_meth_{dataset_name}.png"))
-    plt.savefig(save_path, bbox_inches='tight')
-
-    df = stats_df[['recurrency_degree', 'direct_recurrency-degree', 'consecutiveness_value', 'mean_occurence_per_triple','number_total_occurences', 'recurrency_tail', 'recurrency_head', 'regcn_tail', 'regcn_head', 'cen_tail', 'cen_head']]
-    corrmat= df.corr()
-    f = plt.figure(figsize=(19, 15))
-    plt.matshow(corrmat, fignum=f.number, vmin=overall_min, vmax=overall_max)
-    plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16, rotation=90)
-    plt.yticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16)
-    for i in range(corrmat.shape[0]):
-        for j in range(corrmat.shape[1]):
-            plt.text(j, i, "{:.2f}".format(corrmat.iloc[i, j]), ha='center', va='center', color='black', fontsize=16)
-    cb = plt.colorbar()
-    # fig.colorbar(cax, ticks=[-1,0,1], shrink=0.8)
-    cb.ax.tick_params(labelsize=16)
-    # Plot the correlation matrix
-    save_path = (os.path.join(figs_dir, f"corr_all_meth_{dataset_name}.png"))
-    plt.savefig(save_path, bbox_inches='tight')
-    plt.close('all')
-
-
-
-print('done with creating the figs')
-
+""" pie charts, mrr per relation charts
+"""
+
+## imports
+import numpy as np
+import sys
+import os
+import os.path as osp
+tgb_modules_path = osp.abspath(os.path.join(os.path.dirname(__file__), '..'))
+sys.path.append(tgb_modules_path)
+import matplotlib.pyplot as plt
+from matplotlib.colors import LogNorm
+from matplotlib.colors import Normalize
+import numpy as np
+import pandas as pd
+import stats_figures.dataset_utils as du
+
+
+# specify params
+# which datasets
+names = ['thgl-software']#['tkgl-smallpedia','tkgl-polecat', 'tkgl-icews']#[ 'tkgl-polecat', 'tkgl-icews', 'tkgl-wikidata', 'tkgl-smallpedia', 'thgl-software'] #['thgl-software']#
+# names = [ 'tkgl-polecat', 'tkgl-icews', 'tkgl-wikidata', 'tkgl-smallpedia','tkgl-polecat'] #'tkgl-polecat','tkgl-smallpedia', 'tkgl-yago', 'tkgl-icews' ,'tkgl-smallpedia','thgl-myket','tkgl-yago', 'tkgl-icews','thgl-github', 'thgl-forum', 'tkgl-wikidata']
+# which methods for the mrr_per_rel figures
+methods = ['recurrency', 'regcn', 'cen'] #'recurrency'
+colortgb = '#60ab84' #tgb logo colors
+colortgb2 = '#eeb641'
+colortgb3 = '#dd613a'
+head_tail_flag = False # if true, the head and tail of the relation are shown in the plot, otherwise just the mean across both directions
+
+# pie chart colors
+colors = [colortgb,colortgb2,colortgb3] # from tgb logo
+colors2= ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#ffff99'] #from https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=11
+
+# colors2= ['#8e0152', '#c51b7d', '#de77ae', '#f1b6da', '#fde0ef', '#f7f7f7', '#e6f5d0', '#b8e186', '#7fbc41', '#4d9221', '#276419']
+# from https://colorbrewer2.org/#type=diverging&scheme=PiYG&n=11 color blind friendly
+
+capsize=1.5
+capthick=1.5
+elinewidth=1.5
+occ_threshold = 5
+k=10 # how many slices in the pie, +1 for 'Others'
+# k = 14
+plots_flag = True
+ylimdict = {'tkgl-polecat': 0.25, 'tkgl-icews':0.6, 'tkgl-smallpedia': 1.01} # the upper mrr limit for the mrr charts
+
+overall_min = -1 # for the correlation matrix colorbar
+overall_max =1 # for the correlation matrix colorbar
+num_rels_plot = 10 # how many relations we want to plot in the mrr chart
+i = 0
+plot_values_list = []
+plot_names_multi_line_list =[]
+for dataset_name in names:
+    print('dataset_name:', dataset_name)
+    # some directory stuff
+    modified_dataset_name = dataset_name.replace('-', '_')
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+
+    stats_dir = os.path.join( current_dir,dataset_name,'stats')
+    tgb_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+    figs_dir = os.path.join(current_dir,dataset_name,'figs_rel')
+    stats_df = pd.read_csv(os.path.join(stats_dir, f"relation_statistics_{dataset_name}.csv"))
+
+    # Create the 'figs' directory if it doesn't exist
+    if not os.path.exists(figs_dir):
+        os.makedirs(figs_dir)
+    stats_dir = os.path.join( current_dir,dataset_name,'stats')
+    if not os.path.exists(stats_dir):
+        os.makedirs(stats_dir)
+
+    ### A) pie charts #plot top k relations according to the number of occurrences plus a slice for "others"
+    plot_names = list(stats_df['rel_string_word'].iloc[:k])
+    plot_values = list(stats_df['number_total_occurences'].iloc[:k])
+    all_others = np.sum(stats_df['number_total_occurences'].iloc[k:]) #slice for "others" (sum of all other relations occurrences)
+    plot_values.append(all_others)
+    plot_names.append('Others')
+    # for the pie chart labels to be more readable (i.e. force line break if words are long)
+    plot_names_multi_line= []
+    for name in plot_names: # add some \n so the labels fit better into the pie chart
+        if type(name) == str:
+            words = name.split()
+            newname = words[0]
+            if len(words) > 1:
+                for i in range(len(words)-1):
+                    if not '(' in words[i+1]:
+                        if len(words[i]) > 3:
+                            newname+='\n'
+                        else:
+                            newname+=' '
+                        newname+=words[i+1]
+        else:
+            newname = str(name) #then only plot the int as is.
+        plot_names_multi_line.append(newname)
+
+    num_slices = len(plot_names)
+    plt.figure(figsize=(7, 7))
+    wedges, texts, autotexts =plt.pie(plot_values,autopct=lambda pct: f"{pct:.0f}%" if pct > 1.5 else '', startangle=140, colors=colors2, labeldistance=2.2) #repeated_colors)
+    # Increase the font size of the percentage values
+    for autotext in autotexts:
+        autotext.set_fontsize(20) #15
+    plt.axis('equal')
+    # Move the percentage labels further outside
+    for autotext, wedge in zip(autotexts, wedges):
+        angle = (wedge.theta2 - wedge.theta1) / 2 + wedge.theta1
+        x = np.cos(np.deg2rad(angle))
+        y = np.sin(np.deg2rad(angle))
+        distance = 0.85 # Adjust this value to move the labels further or closer to the center
+        autotext.set_position((x * distance, y * distance))
+    # Set the labels for each pie slice
+    # plt.legend(wedges, plot_names_multi_line, loc="center left", bbox_to_anchor=(1, 0, 0.5, 1), fontsize=14)
+    plt.legend(wedges, plot_names_multi_line, loc="center left", bbox_to_anchor=(1, 0, 0.5, 1), fontsize=14)
+    save_path = (os.path.join(figs_dir, f"rel_pie_{dataset_name}.png"))
+    plt.savefig(save_path, bbox_inches='tight')
+    save_path = (os.path.join(figs_dir, f"rel_pie_{dataset_name}.pdf"))
+    plt.savefig(save_path, bbox_inches='tight')
+
+    if dataset_name == 'tkgl-wikidata': #then we do not want to plot the mrr for the relations
+        continue
+
+    ### B) plot the mrr for each relation for each method, different color for different number of occurrences or for different recurrency degree
+
+    # prepare the dataframe: only take the top ten relations according to number of occurrences and sort by recurrency degree
+    # we use selected_df_sorted to plot the relations in the order of recurrency degree
+    rels_sorted = np.array(stats_df['relation'])[0:num_rels_plot]
+    mask = stats_df['relation'].isin(rels_sorted)
+    selected_df = stats_df[mask] #only the parts of the dataframe that contain the top ten relations according to number of occurrences
+    selected_df_sorted = selected_df.sort_values(by='recurrency_degree', ascending=False) # Sort selected_df by 'recurrency_degree' column in descending order
+    rels_to_plot = list(selected_df_sorted['relation'])
+    labels = np.array(selected_df_sorted['relation'])# only plotting the id for space reasons
+    mrr_per_rel_freq = [] # list of mrr values for each relation - three lists for three methods
+    mrr_per_rel_freq2 = []
+    mrr_per_rel_freq3 = []
+    lab = []
+    lab_ht = []
+    lab_rel = []
+    # rel_oc_dict[rel] = count_occurrences
+    count_occurrences_sorted = []
+    rec_degree_sorted = []
+    for index, r in enumerate(rels_to_plot):
+        if head_tail_flag:
+            lab_ht.append('h')
+            lab_ht.append('t')
+            lab_rel.append(str(labels[index])+' ') # add spaces to make the labels longer
+        else:
+            lab_rel.append(str(labels[index])+'') # add spaces to make the labels longer
+
+        lab.append(labels[index])
+        if head_tail_flag: # if we do head and tail separately we need the value for head and tail direction
+            mrr_per_rel_freq.append(selected_df_sorted['recurrency_head'].iloc[index])
+            mrr_per_rel_freq.append(selected_df_sorted['recurrency_tail'].iloc[index])
+            mrr_per_rel_freq2.append(selected_df_sorted['regcn_head'].iloc[index])
+            mrr_per_rel_freq2.append(selected_df_sorted['regcn_tail'].iloc[index])
+            mrr_per_rel_freq3.append(selected_df_sorted['cen_head'].iloc[index])
+            mrr_per_rel_freq3.append(selected_df_sorted['cen_tail'].iloc[index])
+            count_occurrences_sorted.append(selected_df_sorted['number_total_occurences'].iloc[index])#append twice for head and tail
+            count_occurrences_sorted.append(selected_df_sorted['number_total_occurences'].iloc[index])
+            rec_degree_sorted.append(selected_df_sorted['recurrency_degree'].iloc[index]) #append twice for head and tail
+            rec_degree_sorted.append(selected_df_sorted['recurrency_degree'].iloc[index])
+        else:# if we do NOT head and tail separately we need the mean value for head and tail direction
+            mrr_per_rel_freq.append(np.mean([selected_df_sorted['recurrency_head'].iloc[index], selected_df_sorted['recurrency_tail'].iloc[index]]))
+            mrr_per_rel_freq2.append(np.mean([selected_df_sorted['regcn_head'].iloc[index],selected_df_sorted['regcn_tail'].iloc[index]]))
+            mrr_per_rel_freq3.append(np.mean([selected_df_sorted['cen_head'].iloc[index], selected_df_sorted['cen_tail'].iloc[index]]))
+            count_occurrences_sorted.append(selected_df_sorted['number_total_occurences'].iloc[index])#append twice for head and tail
+            rec_degree_sorted.append(selected_df_sorted['recurrency_degree'].iloc[index])
+
+    # these are the x-values of the ticks. in case we plot head and tail separately, we need to have two ticks per relation
+    x_values = []
+    x_values_rel = []
+    for i in range(0,num_rels_plot):
+        if head_tail_flag:
+            x_values.append(i*2+0.4)
+            x_values.append(i*2+0.8)
+        else:
+            x_values.append(i*2+0.4)
+        x_values_rel.append(i*2+0.4)
+
+    lab_lines = lab_rel #labels, for now
+    a = count_occurrences_sorted
+
+    # version 1) colors are based on the recurrency degree
+    plt.figure()
+    sca = plt.scatter(x_values, mrr_per_rel_freq2, marker='p',s=150, c = rec_degree_sorted, alpha=1, edgecolor='grey', cmap='jet', norm=Normalize(vmin=0, vmax=1), label='REGCN') # cmap='gist_rainbow',
+    sca = plt.scatter(x_values, mrr_per_rel_freq3 , marker='*',s=150, c = rec_degree_sorted, alpha=1, edgecolor='grey', cmap='jet', norm=Normalize(vmin=0, vmax=1), label='CEN')
+    sca = plt.scatter(x_values, mrr_per_rel_freq, marker='o',s=60, c = rec_degree_sorted, alpha=1, edgecolor='grey', cmap='jet', norm=Normalize(vmin=0, vmax=1), label='Recurrency Baseline')
+    plt.ylabel('MRR', fontsize=14)
+    plt.xlabel('Relation', fontsize=14)
+    plt.legend(fontsize=14)
+    cbar =plt.colorbar(sca)
+    plt.ylim([0,ylimdict[dataset_name]])
+    cbar.ax.yaxis.label.set_color('gray')
+
+    if head_tail_flag:
+        plt.xticks(x_values, lab_ht, size=13) #, verticalalignment="center") # ha='right',
+        plt.xticks(x_values_rel, lab_lines, size=14, minor=True)
+        plt.tick_params(axis='x', which='minor', rotation=90, length=0)
+    else:
+        plt.xticks(x_values_rel, lab_lines, size=14)
+        plt.tick_params(axis='x', rotation=90, length=0)
+    plt.yticks(size=13)
+    # Create a locator for the second set of x-ticks
+    # plt.secondary_xaxis('top', x_values_rel)
+
+    plt.grid()
+    save_path = (os.path.join(figs_dir, f"rel_mrrperrel_recdeg_{dataset_name}.png"))
+    plt.savefig(save_path, bbox_inches='tight')
+    save_path = (os.path.join(figs_dir, f"rel_mrrperrel_recdeg_{dataset_name}.pdf"))
+    plt.savefig(save_path, bbox_inches='tight')
+    print('saved in ', save_path)
+
+    # version 2) colors are the number of occurrences
+    plt.figure()
+    sca = plt.scatter(x_values, mrr_per_rel_freq2, marker='p',s=150, c = a, alpha=1, edgecolor='grey', norm=LogNorm(), cmap='jet', label='REGCN')
+    sca = plt.scatter(x_values, mrr_per_rel_freq3 , marker='*',s=150, c = a, alpha=1, edgecolor='grey', norm=LogNorm(), cmap='jet', label='CEN')
+    sca = plt.scatter(x_values, mrr_per_rel_freq, marker='o',s=60, c = a, alpha=1, edgecolor='grey', norm=LogNorm(), cmap='jet', label='Recurrency Baseline')
+    plt.ylabel('MRR', fontsize=14)
+    plt.xlabel('Relation', fontsize=14)
+    plt.legend(fontsize=14)
+    cbar =plt.colorbar(sca)
+    plt.ylim([0,ylimdict[dataset_name]])
+    cbar.ax.yaxis.label.set_color('gray')
+
+    plt.xticks(x_values, lab_ht, size=13) #, verticalalignment="center") # ha='right',
+    plt.yticks(size=13)
+    # Create a locator for the second set of x-ticks
+    # plt.secondary_xaxis('top', x_values_rel)
+    plt.xticks(x_values_rel, lab_lines, size=14, minor=True)
+    plt.tick_params(axis='x', which='minor', rotation=90, length=0)
+    plt.grid()
+    save_path = (os.path.join(figs_dir, f"rel_mrrperrel_occ_{dataset_name}.png"))
+    plt.savefig(save_path, bbox_inches='tight')
+
+
+    ### C) plot various correlation matrices. I specify different columns for the different plots
+    df = stats_df[['recurrency_degree', 'direct_recurrency-degree', 'recurrency_tail', 'recurrency_head', 'regcn_tail', 'regcn_head', 'cen_tail', 'cen_head']]
+    corrmat= df.corr()
+    f = plt.figure(figsize=(19, 15))
+    plt.matshow(corrmat, fignum=f.number, vmin=overall_min, vmax=overall_max)
+    plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16, rotation=90)
+    plt.yticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16)
+    cb = plt.colorbar()
+    cb.ax.tick_params(labelsize=16)
+    save_path = (os.path.join(figs_dir, f"corr_rec_meth_{dataset_name}.png"))
+    plt.savefig(save_path, bbox_inches='tight')
+
+    df = stats_df[['consecutiveness_value', 'recurrency_tail', 'recurrency_head', 'regcn_tail', 'regcn_head', 'cen_tail', 'cen_head']]
+    corrmat= df.corr()
+    f = plt.figure(figsize=(19, 15))
+    plt.matshow(corrmat, fignum=f.number, vmin=overall_min, vmax=overall_max)
+    plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16, rotation=90)
+    plt.yticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16)
+    cb = plt.colorbar()
+    cb.ax.tick_params(labelsize=16)
+    save_path = (os.path.join(figs_dir, f"corr_con_meth_{dataset_name}.png"))
+    plt.savefig(save_path, bbox_inches='tight')
+
+    df = stats_df[['recurrency_degree', 'direct_recurrency-degree', 'consecutiveness_value', 'mean_occurence_per_triple','number_total_occurences', 'recurrency_tail', 'recurrency_head', 'regcn_tail', 'regcn_head', 'cen_tail', 'cen_head']]
+    corrmat= df.corr()
+    f = plt.figure(figsize=(19, 15))
+    plt.matshow(corrmat, fignum=f.number, vmin=overall_min, vmax=overall_max)
+    plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16, rotation=90)
+    plt.yticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=16)
+    for i in range(corrmat.shape[0]):
+        for j in range(corrmat.shape[1]):
+            plt.text(j, i, "{:.2f}".format(corrmat.iloc[i, j]), ha='center', va='center', color='black', fontsize=16)
+    cb = plt.colorbar()
+    # fig.colorbar(cax, ticks=[-1,0,1], shrink=0.8)
+    cb.ax.tick_params(labelsize=16)
+    # Plot the correlation matrix
f"corr_all_meth_{dataset_name}.png")) + plt.savefig(save_path, bbox_inches='tight') + plt.close('all') + + + +print('done with creating the figs') + From 47bf81db2a3123ee99fe43bd57c788cc05cfad69 Mon Sep 17 00:00:00 2001 From: Shenyang Huang Date: Wed, 12 Jun 2024 10:42:04 -0400 Subject: [PATCH 4/5] moving files --- .../linkproppred/thgl-software/STHN_README.md | 0 stats_figures/create_relation_figures.py | 2 +- tgb/linkproppred/evaluate.py | 4 ++-- sampler_core.cpp => tgb_modules/sampler_core.cpp | 0 sthn_sampler_setup.py => tgb_modules/sthn_sampler_setup.py | 0 5 files changed, 3 insertions(+), 3 deletions(-) rename STHN_README.md => examples/linkproppred/thgl-software/STHN_README.md (100%) rename sampler_core.cpp => tgb_modules/sampler_core.cpp (100%) rename sthn_sampler_setup.py => tgb_modules/sthn_sampler_setup.py (100%) diff --git a/STHN_README.md b/examples/linkproppred/thgl-software/STHN_README.md similarity index 100% rename from STHN_README.md rename to examples/linkproppred/thgl-software/STHN_README.md diff --git a/stats_figures/create_relation_figures.py b/stats_figures/create_relation_figures.py index 6b136f3..5f4dd72 100644 --- a/stats_figures/create_relation_figures.py +++ b/stats_figures/create_relation_figures.py @@ -18,7 +18,7 @@ # specify params # which datasets -names = ['thgl-software']#['tkgl-smallpedia','tkgl-polecat', 'tkgl-icews']#[ 'tkgl-polecat', 'tkgl-icews', 'tkgl-wikidata', 'tkgl-smallpedia', 'thgl-software'] #['thgl-software']# +names = ['tkgl-wikidata']#['tkgl-smallpedia','tkgl-polecat', 'tkgl-icews']#[ 'tkgl-polecat', 'tkgl-icews', 'tkgl-wikidata', 'tkgl-smallpedia', 'thgl-software'] #['thgl-software']# # names = [ 'tkgl-polecat', 'tkgl-icews', 'tkgl-wikidata', 'tkgl-smallpedia','tkgl-polecat'] #'tkgl-polecat','tkgl-smallpedia', 'tkgl-yago', 'tkgl-icews' ,'tkgl-smallpedia','thgl-myket','tkgl-yago', 'tkgl-icews','thgl-github', 'thgl-forum', 'tkgl-wikidata'] # which methods for the mrr_per_rel figures methods = ['recurrency', 'regcn', 'cen'] #'recurrency' diff --git a/tgb/linkproppred/evaluate.py b/tgb/linkproppred/evaluate.py index b4ca20e..91553ef 100644 --- a/tgb/linkproppred/evaluate.py +++ b/tgb/linkproppred/evaluate.py @@ -108,8 +108,8 @@ def _eval_hits_and_mrr(self, y_pred_pos, y_pred_neg, type_info, k_value): else: y_pred_pos = y_pred_pos.reshape(-1, 1) - optimistic_rank = (y_pred_neg >= y_pred_pos).sum(axis=1) - pessimistic_rank = (y_pred_neg > y_pred_pos).sum(axis=1) + optimistic_rank = (y_pred_neg > y_pred_pos).sum(axis=1) + pessimistic_rank = (y_pred_neg >= y_pred_pos).sum(axis=1) ranking_list = 0.5 * (optimistic_rank + pessimistic_rank) + 1 hitsK_list = (ranking_list <= k_value).astype(np.float32) mrr_list = 1./ranking_list.astype(np.float32) diff --git a/sampler_core.cpp b/tgb_modules/sampler_core.cpp similarity index 100% rename from sampler_core.cpp rename to tgb_modules/sampler_core.cpp diff --git a/sthn_sampler_setup.py b/tgb_modules/sthn_sampler_setup.py similarity index 100% rename from sthn_sampler_setup.py rename to tgb_modules/sthn_sampler_setup.py From 20bb069b31852a3d5dfcf19d1bb448d033310ca7 Mon Sep 17 00:00:00 2001 From: Shenyang Huang Date: Wed, 19 Jun 2024 17:47:36 -0400 Subject: [PATCH 5/5] adding the docs --- docs/api/tgb.linkproppred.md | 4 + tgb/linkproppred/thg_negative_generator.py | 12 +- tgb/linkproppred/thg_negative_sampler.py | 12 +- tgb/linkproppred/tkg_negative_generator.py | 132 +-------------------- tgb/linkproppred/tkg_negative_sampler.py | 12 +- 5 files changed, 14 insertions(+), 158 deletions(-) diff --git 
a/docs/api/tgb.linkproppred.md b/docs/api/tgb.linkproppred.md index 21b2106..e47dd75 100644 --- a/docs/api/tgb.linkproppred.md +++ b/docs/api/tgb.linkproppred.md @@ -5,3 +5,7 @@ ::: tgb.linkproppred.evaluate ::: tgb.linkproppred.negative_sampler ::: tgb.linkproppred.negative_generator +::: tgb.linkproppred.tkg_negative_generator +::: tgb.linkproppred.tkg_negative_sampler +::: tgb.linkproppred.thg_negative_generator +::: tgb.linkproppred.thg_negative_sampler diff --git a/tgb/linkproppred/thg_negative_generator.py b/tgb/linkproppred/thg_negative_generator.py index 82c84be..da7832b 100644 --- a/tgb/linkproppred/thg_negative_generator.py +++ b/tgb/linkproppred/thg_negative_generator.py @@ -29,7 +29,7 @@ def __init__( edge_data: TemporalData = None, ) -> None: r""" - Negative Edge Sampler class + Negative Edge Generator class for Temporal Heterogeneous Graphs this is a class for generating negative samples for a specific datasets the set of the positive samples are provided, the negative samples are generated with specific strategies and are saved for consistent evaluation across different methods @@ -39,11 +39,10 @@ def __init__( first_node_id: the first node id last_node_id: the last node id node_type: the node type of each node - num_neg_e: number of negative edges being generated per each positive edge - strategy: specifies which strategy should be used for generating the negatives - rnd_seed: random seed for reproducibility - edge_data: the positive edges to generate the negatives for, assuming sorted temporally - + strategy: the strategy to generate negative samples + num_neg_e: number of negative samples to generate + rnd_seed: random seed + edge_data: the edge data object containing the positive edges Returns: None """ @@ -72,7 +71,6 @@ def get_destinations_based_on_node_type(self, node_type: np.ndarray) -> dict: r""" get the destination node id arrays based on the node type - Parameters: first_node_id: the first node id last_node_id: the last node id diff --git a/tgb/linkproppred/thg_negative_sampler.py b/tgb/linkproppred/thg_negative_sampler.py index 1ab281b..9b4ffa0 100644 --- a/tgb/linkproppred/thg_negative_sampler.py +++ b/tgb/linkproppred/thg_negative_sampler.py @@ -26,7 +26,7 @@ def __init__( r""" Negative Edge Sampler Loads and query the negative batches based on the positive batches provided. - constructor for the negative edge sampler class + constructor for the negative edge sampler class Parameters: dataset_name: name of the dataset @@ -124,16 +124,6 @@ def query_batch(self, neg_samples.append( neg_d_arr ) - - # conflict_set, d_node_type = conflict_dict[(pos_t, pos_s, e_type)] - - # all_dst = self.node_type_dict[d_node_type] - # # filtered_all_dst = np.delete(all_dst, conflict_set, axis=0) - # filtered_all_dst = np.setdiff1d(all_dst, conflict_set) - # neg_d_arr = filtered_all_dst - # neg_samples.append( - # neg_d_arr - # ) #? 
can't convert to numpy array due to different lengths of negative samples return neg_samples diff --git a/tgb/linkproppred/tkg_negative_generator.py b/tgb/linkproppred/tkg_negative_generator.py index 2f22525..6f2da6f 100644 --- a/tgb/linkproppred/tkg_negative_generator.py +++ b/tgb/linkproppred/tkg_negative_generator.py @@ -28,13 +28,8 @@ def __init__( edge_data: TemporalData = None, ) -> None: r""" - Negative Edge Sampler class - this is a class for generating negative samples for a specific datasets - the set of the positive samples are provided, the negative samples are generated with specific strategies - and are saved for consistent evaluation across different methods - negative edges are sampled with 'oen_vs_many' strategy. - it is assumed that the destination nodes are indexed sequentially with 'first_dst_id' - and 'last_dst_id' being the first and last index, respectively. + Negative Edge Generator class for Temporal Knowledge Graphs + constructor for the negative edge generator class Parameters: dataset_name: name of the dataset @@ -121,13 +116,6 @@ def generate_dst_dict(self, edge_data: TemporalData, dst_name: str) -> dict: edge_type_size = [] for key in dst_track_dict: dst = np.array(list(dst_track_dict[key].keys())) - # #* if there are too few dst, sample up to 1000 - # if len(dst) < 1000: - # dst_sampled = np.random.choice(np.arange(min_dst_idx, max_dst_idx+1), 1000, replace=False) - # while np.intersect1d(dst, dst_sampled).shape[0] != 0: - # dst_sampled = np.random.choice(np.arange(min_dst_idx, max_dst_idx+1), 1000, replace=False) - # dst_sampled[0:len(dst)] = dst[:] - # dst = dst_sampled edge_type_size.append(len(dst)) dst_dict[key] = dst print ('destination candidates generated for all edge types ', len(dst_dict)) @@ -401,118 +389,4 @@ def generate_negative_samples_random(self, evaluation_set[(pos_t, pos_s, edge_type)] = neg_d_arr save_pkl(evaluation_set, filename) - - - - - # def generate_negative_samples_ftr(self, - # data: TemporalData, - # split_mode: str, - # filename: str, - # ) -> None: - # r""" - # now we consider (s, d, t, edge_type) as a unique edge - # Generate negative samples based on the random strategy: - # - for each positive edge, sample a batch of negative edges from all possible edges with the same source node - # - filter actual positive edges at the same timestamp with the same edge type - - # Parameters: - # data: an object containing positive edges information - # split_mode: specifies whether to generate negative edges for 'validation' or 'test' splits - # filename: name of the file containing the generated negative edges - # """ - # print( - # f"INFO: Negative Sampling Strategy: {self.strategy}, Data Split: {split_mode}" - # ) - # assert split_mode in [ - # "val", - # "test", - # ], "Invalid split-mode! It should be `val` or `test`!" - - # if os.path.exists(filename): - # print( - # f"INFO: negative samples for '{split_mode}' evaluation are already generated!" 
- # ) - # else: - # print(f"INFO: Generating negative samples for '{split_mode}' evaluation!") - # # retrieve the information from the batch - # pos_src, pos_dst, pos_timestamp, edge_type = ( - # data.src.cpu().numpy(), - # data.dst.cpu().numpy(), - # data.t.cpu().numpy(), - # data.edge_type.cpu().numpy(), - # ) - - # # all possible destinations - # all_dst = np.arange(self.first_dst_id, self.last_dst_id + 1) - # evaluation_set = {} - # # generate a list of negative destinations for each positive edge - # pos_edge_tqdm = tqdm( - # zip(pos_src, pos_dst, pos_timestamp, edge_type), total=len(pos_src) - # ) - - # edge_t_dict = {} # {(t, u, edge_type): {v_1, v_2, ..} } - # #! iterate once to put all edges into a dictionary for reference - # for ( - # pos_s, - # pos_d, - # pos_t, - # edge_type, - # ) in pos_edge_tqdm: - # if (pos_t, pos_s, edge_type) not in edge_t_dict: - # edge_t_dict[(pos_t, pos_s, edge_type)] = {pos_d:1} - # else: - # edge_t_dict[(pos_t, pos_s, edge_type)][pos_d] = 1 - - # conflict_dict = {} - # for key in edge_t_dict: - # conflict_dict[key] = np.array(list(edge_t_dict[key].keys())) - - # print ("conflict sets for ns samples for ", len(conflict_dict), " positive edges are generated") - - # # save the generated evaluation set to disk - # save_pkl(conflict_dict, filename) - - # # pos_src, pos_dst, pos_timestamp, edge_type = ( - # # data.src.cpu().numpy(), - # # data.dst.cpu().numpy(), - # # data.t.cpu().numpy(), - # # data.edge_type.cpu().numpy(), - # # ) - - - # # # generate a list of negative destinations for each positive edge - # # pos_edge_tqdm = tqdm( - # # zip(pos_src, pos_dst, pos_timestamp, edge_type), total=len(pos_src) - # # ) - - - # # for ( - # # pos_s, - # # pos_d, - # # pos_t, - # # edge_type, - # # ) in pos_edge_tqdm: - - # # #! generate all negatives unless restricted - # # conflict_set = list(edge_t_dict[(pos_t, pos_s, edge_type)].keys()) - - # # # filter out positive destination - # # conflict_set = np.array(conflict_set) - # # filtered_all_dst = np.setdiff1d(all_dst, conflict_set) - - # # ''' - # # when num_neg_e is larger than all possible destinations simple return all possible destinations - # # ''' - # # if (self.num_neg_e < 0): - # # neg_d_arr = filtered_all_dst - # # elif (self.num_neg_e > len(filtered_all_dst)): - # # neg_d_arr = filtered_all_dst - # # else: - # # neg_d_arr = np.random.choice( - # # filtered_all_dst, self.num_neg_e, replace=False) #never replace negatives - - # # evaluation_set[(pos_s, pos_d, pos_t, edge_type)] = neg_d_arr - - # # # save the generated evaluation set to disk - # # save_pkl(evaluation_set, filename) + \ No newline at end of file diff --git a/tgb/linkproppred/tkg_negative_sampler.py b/tgb/linkproppred/tkg_negative_sampler.py index 1e6fd0d..38106d1 100644 --- a/tgb/linkproppred/tkg_negative_sampler.py +++ b/tgb/linkproppred/tkg_negative_sampler.py @@ -44,17 +44,7 @@ def __init__( self.last_dst_id = last_dst_id self.strategy = strategy self.dst_dict = None - # if self.strategy in ["dst-time-filtered"]: - # dst_dict_name = ( - # partial_path - # + "_" - # + "dst_dict" - # + ".pkl" - # ) - # if not os.path.exists(dst_dict_name): - # raise FileNotFoundError(f"File not found at {dst_dict_name}, dst_time_filtered strategy requires the dst_dict file") - # self.dst_dict = load_pkl(dst_dict_name) - + def load_eval_set( self, fname: str,
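Note on the ranking change in tgb/linkproppred/evaluate.py (PATCH 4/5): after the swap, optimistic_rank counts only the negatives that score strictly higher than the positive edge (the best rank the positive can receive under ties), while pessimistic_rank also counts ties against it; the reported rank is the average of the two, so the order in which ties happen to be broken cannot bias Hits@K or MRR. A minimal sketch of that logic, assuming numpy-only inputs and a hypothetical standalone helper (hits_and_mrr is not part of the TGB API):

import numpy as np

def hits_and_mrr(y_pred_pos, y_pred_neg, k_value):
    # y_pred_pos: (num_edges,) positive scores; y_pred_neg: (num_edges, num_neg) negative scores
    y_pred_pos = y_pred_pos.reshape(-1, 1)
    optimistic_rank = (y_pred_neg > y_pred_pos).sum(axis=1)    # strictly better negatives only
    pessimistic_rank = (y_pred_neg >= y_pred_pos).sum(axis=1)  # ties also count against the positive
    ranking_list = 0.5 * (optimistic_rank + pessimistic_rank) + 1
    hitsK_list = (ranking_list <= k_value).astype(np.float32)
    mrr_list = 1. / ranking_list.astype(np.float32)
    return hitsK_list.mean(), mrr_list.mean()

# One positive scored 0.9 against negatives [0.9, 0.5]: optimistic rank 1,
# pessimistic rank 2, averaged rank 1.5, reciprocal rank 1/1.5.
print(hits_and_mrr(np.array([0.9]), np.array([[0.9, 0.5]]), k_value=10))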
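Similarly, the destination filtering shared by the tkg/thg negative generators and samplers above (and sketched in the commented-out code the patch deletes) reduces to: group positives by (timestamp, source, edge_type), collect the true destinations under that key as a conflict set, and draw negatives from the candidate destinations minus that set via np.setdiff1d, returning the whole filtered pool when num_neg_e is negative or exceeds its size. A toy sketch under assumed inputs; all names here are illustrative, not the TGB API:

import numpy as np

all_dst = np.arange(0, 10)                     # candidate destination ids
conflict_dict = {(5, 0, 2): np.array([1, 4])}  # {(t, src, edge_type): true destinations}

def query_negatives(t, src, edge_type, num_neg_e=-1, rnd_seed=1):
    rng = np.random.default_rng(rnd_seed)
    # remove every destination observed as a true positive for this key
    filtered = np.setdiff1d(all_dst, conflict_dict[(t, src, edge_type)])
    if num_neg_e < 0 or num_neg_e > len(filtered):
        return filtered                        # too few candidates: return them all
    return rng.choice(filtered, num_neg_e, replace=False)

print(query_negatives(5, 0, 2))     # every destination except the true ones (1 and 4)
print(query_negatives(5, 0, 2, 3))  # a reproducible sample of 3 filtered negatives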