diff --git a/README.md b/README.md
index 0fad6ab..92a8617 100644
--- a/README.md
+++ b/README.md
@@ -195,7 +195,7 @@ As you can see, the attached debiasing adapter successfully mitigates bias in th
 The following example shows how to evaluate the original metrics' perfomance on [WMT20](https://aclanthology.org/2020.wmt-1.77/):
 
 ```bash
-cd Metric-Fairness/mitigating_bias/performance_eval
+cd Metric-Fairness/mitigating_bias/performance_eval/WMT
 pip install -r requirements.txt
 python eval_bert_score.py --model_type bert-base-uncased
 python eval_bert_score.py --model_type bert-large-uncased
@@ -244,9 +244,25 @@ In like wise, each score of BERTScore (both BERT-base and BERT-large), BARTScore
 +-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+---------+
 ```
 
+##### REALSumm
+To save time, we provide the pkl files directly; run
 
-##### RealSumm
+```bash
+cd Metric-Fairness/mitigating_bias/performance_eval/REALSumm
+pip install -r requirements.txt
+python analyse_pkls.py
+```
+
+and you will get Spearman correlation scores like
+
+```
++------------------------------+----------------------+------------------------------+----------------------+-------------------------------+-----------------------+--------------------------+------------------+
+| bart_score_bart_base_adapter | bart_score_bart_base | bert_score_bert_base_adapter | bert_score_bert_base | bert_score_bert_large_adapter | bert_score_bert_large | bleurt_bert_base_adapter | bleurt_bert_base |
++------------------------------+----------------------+------------------------------+----------------------+-------------------------------+-----------------------+--------------------------+------------------+
+|            0.307             |        0.325         |            0.473             |        0.465         |             0.468             |         0.464         |           0.4            |      0.299       |
++------------------------------+----------------------+------------------------------+----------------------+-------------------------------+-----------------------+--------------------------+------------------+
+```
 
 
 If you use our data or code, please cite:
 
diff --git a/mitigating_bias/performance_eval/REALSumm/analyse_pkls.py b/mitigating_bias/performance_eval/REALSumm/analyse_pkls.py
new file mode 100644
index 0000000..2615d75
--- /dev/null
+++ b/mitigating_bias/performance_eval/REALSumm/analyse_pkls.py
@@ -0,0 +1,36 @@
+import pandas as pd
+from prettytable import PrettyTable
+from scipy.stats import spearmanr
+
+# Maps each pkl file (one per metric variant, with and without the
+# debiasing adapter) to the key its metric scores are stored under.
+KEY_VALUE = {
+    'bart_score_bart_base_adapter': 'bart_score_avg_f',
+    'bart_score_bart_base': 'bart_score_avg_f',
+    'bert_score_bert_base_adapter': 'bert_score_f',
+    'bert_score_bert_base': 'bert_score_f',
+    'bert_score_bert_large_adapter': 'bert_score_f',
+    'bert_score_bert_large': 'bert_score_f',
+    'bleurt_bert_base_adapter': 'bleurt_score',
+    'bleurt_bert_base': 'bleurt_score'
+}
+
+
+def analyse_pkls(key, value):
+    """Spearman correlation between a metric's scores and human
+    litepyramid recall over all system summaries in one pkl file."""
+    data = pd.read_pickle('pkls/' + key + '.pkl')
+    human = []
+    metric = []
+    for doc in data.values():
+        for summ in doc['sys_summs'].values():
+            human.append(summ['scores']['litepyramid_recall'])
+            metric.append(summ['scores'][value])
+    correlation, _ = spearmanr(metric, human)
+    return correlation
+
+
+pt = PrettyTable()
+for key, value in KEY_VALUE.items():
+    pt.add_column(key, [analyse_pkls(key, value)])
+print(pt)
diff --git a/mitigating_bias/performance_eval/REALSumm/pkls/bart_score_bart_base.pkl b/mitigating_bias/performance_eval/REALSumm/pkls/bart_score_bart_base.pkl
new file mode 100644
index 0000000..5e18ead
Binary files /dev/null and b/mitigating_bias/performance_eval/REALSumm/pkls/bart_score_bart_base.pkl differ
diff --git a/mitigating_bias/performance_eval/REALSumm/pkls/bart_score_bart_base_adapter.pkl b/mitigating_bias/performance_eval/REALSumm/pkls/bart_score_bart_base_adapter.pkl
new file mode 100644
index 0000000..1744637
Binary files /dev/null and b/mitigating_bias/performance_eval/REALSumm/pkls/bart_score_bart_base_adapter.pkl differ
diff --git a/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_base.pkl b/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_base.pkl
new file mode 100644
index 0000000..daaa793
Binary files /dev/null and b/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_base.pkl differ
diff --git a/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_base_adapter.pkl b/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_base_adapter.pkl
new file mode 100644
index 0000000..f55cb9b
Binary files /dev/null and b/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_base_adapter.pkl differ
diff --git a/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_large.pkl b/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_large.pkl
new file mode 100644
index 0000000..7122429
Binary files /dev/null and b/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_large.pkl differ
diff --git a/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_large_adapter.pkl b/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_large_adapter.pkl
new file mode 100644
index 0000000..04f8e3f
Binary files /dev/null and b/mitigating_bias/performance_eval/REALSumm/pkls/bert_score_bert_large_adapter.pkl differ
diff --git a/mitigating_bias/performance_eval/REALSumm/pkls/bleurt_bert_base.pkl b/mitigating_bias/performance_eval/REALSumm/pkls/bleurt_bert_base.pkl
new file mode 100644
index 0000000..23938d5
Binary files /dev/null and b/mitigating_bias/performance_eval/REALSumm/pkls/bleurt_bert_base.pkl differ
diff --git a/mitigating_bias/performance_eval/REALSumm/pkls/bleurt_bert_base_adapter.pkl b/mitigating_bias/performance_eval/REALSumm/pkls/bleurt_bert_base_adapter.pkl
new file mode 100644
index 0000000..86edac5
Binary files /dev/null and b/mitigating_bias/performance_eval/REALSumm/pkls/bleurt_bert_base_adapter.pkl differ
diff --git a/mitigating_bias/performance_eval/REALSumm/requirements.txt b/mitigating_bias/performance_eval/REALSumm/requirements.txt
new file mode 100644
index 0000000..71ea94a
--- /dev/null
+++ b/mitigating_bias/performance_eval/REALSumm/requirements.txt
@@ -0,0 +1,3 @@
+pandas==1.3.4
+prettytable==3.4.1
+scipy==1.7.1
diff --git a/mitigating_bias/performance_eval/WMT/requirements.txt b/mitigating_bias/performance_eval/WMT/requirements.txt
new file mode 100644
index 0000000..20df590
--- /dev/null
+++ b/mitigating_bias/performance_eval/WMT/requirements.txt
@@ -0,0 +1,12 @@
+absl==0.0
+adapter_transformers==3.1.0
+matplotlib==3.4.3
+numpy==1.20.3
+pandas==1.3.4
+prettytable==3.4.1
+pyemd==0.5.1
+scipy==1.7.1
+score==0.0.1a0
+setuptools==58.0.4
+tqdm==4.62.3
+transformers==4.23.1
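
For reference, a minimal sketch of how to inspect one of the provided pkl files, assuming the nested layout that `analyse_pkls.py` above walks (document id → `sys_summs` → system name → `scores`); the file path and score keys come from the diff, while the variable names are illustrative:

```python
import pandas as pd

# Load one of the provided score files (run from the REALSumm directory).
data = pd.read_pickle('pkls/bert_score_bert_base.pkl')

doc_id = next(iter(data))             # first document id
summs = data[doc_id]['sys_summs']     # per-system summaries for that document
system = next(iter(summs))            # first system name
scores = summs[system]['scores']      # score dict for that summary

print(scores['litepyramid_recall'])   # human litepyramid recall score
print(scores['bert_score_f'])         # metric score used by analyse_pkls.py
```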