From 051d00cc219cec1f8053b99caa0fe630194b2c27 Mon Sep 17 00:00:00 2001
From: mcollardanuy
Date: Thu, 27 Apr 2023 13:32:32 +0000
Subject: [PATCH] Improve docs and move functions

---
 experiments/README.md                    |  19 +-
 {geoparser => experiments}/experiment.py |   7 +-
 experiments/toponym_resolution.py        |   3 +-
 tests/test_disambiguation.py             |  10 +-
 tests/test_experiments.py                |   5 +-
 utils/REL/entity_disambiguation.py       |   8 +-
 utils/process_data.py                    | 276 +----------------------
 utils/rel_e2e.py                         | 227 ++++++++++++++++---
 8 files changed, 233 insertions(+), 322 deletions(-)
 rename {geoparser => experiments}/experiment.py (98%)

diff --git a/experiments/README.md b/experiments/README.md
index 7ec178f1..a82d6938 100644
--- a/experiments/README.md
+++ b/experiments/README.md
@@ -4,7 +4,7 @@ Follow these steps to reproduce the experiments in our paper.
 
 ### 1. Obtain the external resources [DRAFT]
 
-You will need the following resources, which are created using the code in the wiki2gaz repository ([TODO: add link]) or can be downloaded from [TODO: add link]:
+You will need the following resources, which are created using the code in the [wiki2gaz](https://github.com/Living-with-machines/wiki2gaz) repository or can be downloaded from [TODO: add link]:
 ```
 ../resources/wikidata/wikidata_gazetteer.csv
 ../resources/wikidata/entity2class.txt
@@ -14,6 +14,8 @@
 ../resources/wikipedia/wikidata2wikipedia/index_enwiki-latest.db
 ```
 
+You will also need the [word2vec embeddings](TODO: add link) trained on 19th-century data. These embeddings were created by Nilo Pedrazzini; for more information, see https://github.com/Living-with-machines/DiachronicEmb-BigHistData.
+
 ### 2. Preparing the data
 
 To create the datasets that we use in the experiments presented in the paper, run the following command:
@@ -22,4 +24,17 @@
 python prepare_data.py
 ```
 This script takes care of downloading the LwM and HIPE datasets and formats them as needed in the experiments.
-### 3. Running the experiments
\ No newline at end of file
+### 3. Running the experiments
+
+To run the experiments, run the following script:
+```bash
+python toponym_resolution.py
+```
+This script runs all the different scenarios reported in the experiments in the paper.
+
+### 4. Evaluate
+
+To evaluate the different approaches and obtain a results table such as the one provided in the paper, go to the `../evaluation/` directory. There, you should clone the [HIPE scorer](https://github.com/hipe-eval/HIPE-scorer). We use the code version at commit 50dff4e, with the line `return eval_stats` added at the end of the `get_results()` function. From `../evaluation/`, run the following script to obtain the results in LaTeX format:
+```bash
+python display_results.py
+```
\ No newline at end of file
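For reference, the evaluation setup described in step 4 of the README amounts to something like the following (a sketch: the clone location and the manual one-line edit are assumptions based on the README text above, not commands shipped with the repository):

```bash
# Run from the ../evaluation/ directory mentioned in step 4.
git clone https://github.com/hipe-eval/HIPE-scorer.git
cd HIPE-scorer
# Pin the scorer to the commit used in the experiments:
git checkout 50dff4e
# Then manually add `return eval_stats` at the end of get_results()
# in the scorer code, as described in the README.
```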
diff --git a/geoparser/experiment.py b/experiments/experiment.py
similarity index 98%
rename from geoparser/experiment.py
rename to experiments/experiment.py
index 83bf38c1..7953da3b 100644
--- a/geoparser/experiment.py
+++ b/experiments/experiment.py
@@ -75,7 +75,7 @@ def __init__(
             )
         else:
             sys.exit(
-                "\nError: The dataset has not been created, you should first run the data_processing.py script.\n"
+                "\nError: The dataset has not been created, you should first run the prepare_data.py script.\n"
             )
 
     def __str__(self):
@@ -189,6 +189,11 @@ def prepare_data(self):
         return self.processed_data
 
     def linking_experiments(self):
+        """
+        Prepares the data for the linking experiments, creating a mention-based
+        dataframe. It produces TSV files in the format required by the HIPE
+        scorer, ready to be evaluated.
+        """
         # Create a mention-based dataframe for the linking experiments:
         processed_df = process_data.create_mentions_df(self)
         self.processed_data["processed_df"] = processed_df
diff --git a/experiments/toponym_resolution.py b/experiments/toponym_resolution.py
index bcab897a..b3724c63 100644
--- a/experiments/toponym_resolution.py
+++ b/experiments/toponym_resolution.py
@@ -5,7 +5,8 @@
 # Add "../" to path to import utils
 sys.path.insert(0, os.path.abspath(os.path.pardir))
 
-from geoparser import experiment, recogniser, ranking, linking
+from geoparser import recogniser, ranking, linking
+from experiments import experiment
 
 # Choose test scenario:
 test_scenario = "test"  # "dev" while experimenting, "test" for the final numbers
diff --git a/tests/test_disambiguation.py b/tests/test_disambiguation.py
index e64b0571..b210b64d 100644
--- a/tests/test_disambiguation.py
+++ b/tests/test_disambiguation.py
@@ -60,10 +60,10 @@ def test_prepare_initial_data():
         "experiments/outputs/data/lwm/linking_df_split.tsv", sep="\t"
     ).iloc[:1]
     parsed_doc = rel_utils.prepare_initial_data(df, context_len=100)
-    assert parsed_doc["4939308"][0]["mention"] == "STALYBRIDGE"
-    assert parsed_doc["4939308"][0]["gold"][0] == "Q1398653"
-    assert parsed_doc["4939308"][3]["mention"] == "Market-street"
-    assert parsed_doc["4939308"][3]["gold"] == "NIL"
+    assert parsed_doc["4939308_1"][0]["mention"] == "STALYBRIDGE"
+    assert parsed_doc["4939308_1"][0]["gold"][0] == "Q1398653"
+    assert parsed_doc["4939308_6"][1]["mention"] == "Market-street"
+    assert parsed_doc["4939308_6"][1]["gold"] == "NIL"
 
 
 def test_train():
@@ -163,7 +163,7 @@ def test_train():
     )
 
     # assert expected performance on test set
-    assert 0.60 < mylinker.rel_params["ed_model"].best_performance["f1"]
+    assert mylinker.rel_params["ed_model"].best_performance["f1"] == 0.6583541147132169
 
 
 def test_load_eval_model():
diff --git a/tests/test_experiments.py b/tests/test_experiments.py
index c04c99c1..13d26e13 100644
--- a/tests/test_experiments.py
+++ b/tests/test_experiments.py
@@ -7,7 +7,8 @@
 # Add "../" to path to import utils
 sys.path.insert(0, os.path.abspath(os.path.pardir))
 
-from geoparser import experiment, linking, ranking, recogniser
+from geoparser import linking, ranking, recogniser
+from experiments import experiment
 
 
 def test_wrong_dataset_path():
@@ -25,7 +26,7 @@ def test_wrong_dataset_path():
 
     assert (
         cm.value.code
-        == "\nError: The dataset has not been created, you should first run the data_processing.py script.\n"
+        == "\nError: The dataset has not been created, you should first run the prepare_data.py script.\n"
     )
diff --git a/utils/REL/entity_disambiguation.py b/utils/REL/entity_disambiguation.py
index fd3dc8e5..d6ba435c 100644
--- a/utils/REL/entity_disambiguation.py
+++ b/utils/REL/entity_disambiguation.py
@@ -7,7 +7,7 @@
 import pickle
 import numpy as np
 from pathlib import Path
-from random import shuffle
+import random
 import torch.optim as optim
 from string import punctuation
 from torch.autograd import Variable
@@ -27,6 +27,9 @@
 for the ED step.
 """
 
+RANDOM_SEED = 42
+random.seed(RANDOM_SEED)
+
 
 class EntityDisambiguation:
     def __init__(self, db_embs, user_config, reset_embeddings=False):
@@ -163,7 +166,7 @@ def train(self, org_train_dataset, org_dev_dataset):
         eval_after_n_epochs = self.config["eval_after_n_epochs"]
 
         for e in range(self.config["n_epochs"]):
-            shuffle(train_dataset)
+            random.shuffle(train_dataset)
             total_loss = 0
             for dc, batch in enumerate(train_dataset):  # each document is a minibatch
@@ -397,7 +400,6 @@ def __predict(self, data, include_timing=False, eval_raw=False):
         timing = []
 
         for batch in data:  # each document is a minibatch
-
             start = time.time()
 
             token_ids = [
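The change above replaces the bare `shuffle` import with a module-level seed, so the per-epoch shuffling of `train_dataset` is deterministic across runs; this is also what lets the test suite pin an exact F1 score. A minimal, standalone illustration of the behaviour this buys (it reseeds on every call for clarity, whereas the patched module seeds once at import time, making the whole sequence of shuffles reproducible):

```python
import random

RANDOM_SEED = 42


def seeded_shuffle(items, seed=RANDOM_SEED):
    """Return a reproducibly shuffled copy of `items`."""
    random.seed(seed)
    out = list(items)
    random.shuffle(out)
    return out


# Two calls with the same seed produce the same order:
assert seeded_shuffle(range(10)) == seeded_shuffle(range(10))
```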
diff --git a/utils/process_data.py b/utils/process_data.py
index 31b3506e..26abe628 100644
--- a/utils/process_data.py
+++ b/utils/process_data.py
@@ -4,12 +4,10 @@
 from ast import literal_eval
 from pathlib import Path
 from tqdm import tqdm
-
 import pandas as pd
-import requests
 
 sys.path.insert(0, os.path.abspath(os.path.pardir))
-from utils import ner, process_wikipedia
+from utils import ner
 
 
 # ----------------------------------------------------
@@ -28,30 +26,6 @@ def eval_with_exception(str2parse, in_case=""):
         return in_case
 
 
-def get_wikidata_instance_ids(mylinker):
-    """helper to map wikidata entitiess to class ids
-    if there is more than one, we take the most general class
-    i.e. the one with lowest number
-    """
-    mylinker.linking_resources["wikidata_id2inst_id"] = {}
-    for i, row in tqdm(mylinker.linking_resources["gazetteer"].iterrows()):
-        instances = row["instance_of"]
-        if instances:
-            if len(instances) > 1:
-                instance = instances[0]
-                for i in instances[1:]:
-                    if int(i[1:]) < int(instance[1:]):
-                        instance = i
-                mylinker.linking_resources["wikidata_id2inst_id"][
-                    row.wikidata_id
-                ] = instance
-            else:
-                mylinker.linking_resources["wikidata_id2inst_id"][
-                    row.wikidata_id
-                ] = instances[0]
-    return mylinker.linking_resources["wikidata_id2inst_id"]
-
-
 # ----------------------------------------------------
 def prepare_sents(df):
     """
@@ -569,229 +543,6 @@ def store_processed_data(
     return dict_processed_data
 
 
-# ----------------------------------------------------
-def rel_end_to_end(sent):
-    """
-    REL end-to-end using the API.
-
-    Arguments:
-        sent (str): a sentence in plain text.
-
-    Returns:
-        el_result (dict): the output from REL end-to-end API
-            for the input sentence.
-    """
-    API_URL = "https://rel.cs.ru.nl/api"
-    el_result = requests.post(API_URL, json={"text": sent, "spans": []}).json()
-    return el_result
-
-
-# ----------------------------------------------------
-def get_rel_from_api(dSentences, rel_end2end_path):
-    """
-    Uses the REL API to do end-to-end entity linking.
-
-    Arguments:
-        dSentences (dict): dictionary of sentences, where the
-            key is the article-sent identifier and the value
-            is the full text of the sentence.
-        rel_end2end_path (str): the path of the file where the
-            REL results will be stored.
-
-    Returns:
-        A JSON file with the REL results.
-    """
-    # Dictionary to store REL predictions:
-    rel_preds = dict()
-    if Path(rel_end2end_path).exists():
-        with open(rel_end2end_path) as f:
-            rel_preds = json.load(f)
-    print("\nObtain REL linking from API (unless already stored):")
-    for s in tqdm(dSentences):
-        if not s in rel_preds:
-            rel_preds[s] = rel_end_to_end(dSentences[s])
-            # Append per processed sentence in case of API limit:
-            with open(rel_end2end_path, "w") as fp:
-                json.dump(rel_preds, fp)
-    with open(rel_end2end_path) as f:
-        rel_preds = json.load(f)
-
-
-# ----------------------------------------------------
-def rel_process_results(
-    mentions_dataset,
-    predictions,
-    processed,
-    include_offset=False,
-):
-    """
-    Function that can be used to process the End-to-End results.
-    :return: dictionary with results and document as key.
-    """
-    res = {}
-    for doc in mentions_dataset:
-        if doc not in predictions:
-            # No mentions found, we return empty list.
-            continue
-        pred_doc = predictions[doc]
-        ment_doc = mentions_dataset[doc]
-        text = processed[doc][0]
-        res_doc = []
-
-        for pred, ment in zip(pred_doc, ment_doc):
-            sent = ment["sentence"]
-            idx = ment["sent_idx"]
-            start_pos = ment["pos"]
-            mention_length = int(ment["end_pos"] - ment["pos"])
-
-            if pred["prediction"] != "NIL":
-                temp = (
-                    start_pos,
-                    mention_length,
-                    ment["ngram"],
-                    pred["prediction"],
-                    pred["conf_ed"],
-                    ment["conf_md"] if "conf_md" in ment else 0.0,
-                    ment["tag"] if "tag" in ment else "NULL",
-                )
-                res_doc.append(temp)
-        res[doc] = res_doc
-    return res
-
-
-# ----------------------------------------------------
-def get_rel_locally(dSentences, mention_detection, tagger_ner, linking_model):
-    """
-    Uses the REL API to do end-to-end entity linking.
-
-    Arguments:
-        dSentences (dict): dictionary of sentences, where the
-            key is the article-sent identifier and the value
-            is the full text of the sentence.
-        XXXX
-        XXXX
-
-    Returns:
-        A JSON file with the REL results.
-    """
-    # Dictionary to store REL predictions:
-    print("\nObtain REL linking locally\n")
-
-    def rel_sentence_preprocessing(s, sentence):
-        text = sentence
-        processed = {s: [text, []]}
-        return processed
-
-    dREL = dict()
-    for s in tqdm(dSentences):
-        input_text = rel_sentence_preprocessing(s, dSentences[s])
-        mentions_dataset, n_mentions = mention_detection.find_mentions(
-            input_text, tagger_ner
-        )
-        predictions, timing = linking_model.predict(mentions_dataset)
-        result = rel_process_results(mentions_dataset, predictions, input_text)
-        for k in result:
-            dREL[k] = result[k]
-    return dREL
-
-
-# ----------------------------------------------------
-def match_wikipedia_to_wikidata(wiki_title):
-    """
-    Get the Wikidata ID from a Wikipedia title.
-
-    Arguments:
-        wiki_title (str): a Wikipedia title, underscore-separated.
-
-    Returns:
-        a string, either the Wikidata QID corresponding entity, or NIL.
-    """
-    wqid = process_wikipedia.title_to_id(wiki_title, lower=False)
-    if not wqid:
-        wqid = "NIL"
-    return wqid
-
-
-# ----------------------------------------------------
-def match_ent(pred_ents, start, end, prev_ann, gazetteer_ids):
-    """
-    Function that, given the position in a sentence of a
-    specific gold standard token, finds the corresponding
-    string and prediction information returned by REL.
-
-    Arguments:
-        pred_ents (list): a list of lists, each inner list
-            corresponds to a token.
-        start (int): start character of a token in the gold standard.
-        end (int): end character of a token in the gold standard.
-        prev_ann (str): entity type of the previous token.
-
-    Returns:
-        A tuple with three elements: (1) the entity type, (2) the
-        entity link and (3) the entity type of the previous token.
-    """
-    for ent in pred_ents:
-        wqid = match_wikipedia_to_wikidata(ent[3])
-        # If entity is a LOC or linked entity is in our KB:
-        if ent[-1] == "LOC" or wqid in gazetteer_ids:
-            # Any place with coordinates is considered a location
-            # throughout our experiments:
-            ent_type = "LOC"
-            st_ent = ent[0]
-            len_ent = ent[1]
-            if start >= st_ent and end <= (st_ent + len_ent):
-                if prev_ann == ent_type:
-                    ent_pos = "I-"
-                else:
-                    ent_pos = "B-"
-                    prev_ann = ent_type
-
-                n = ent_pos + ent_type
-                try:
-                    el = ent_pos + match_wikipedia_to_wikidata(ent[3])
-                except Exception as e:
-                    print(e)
-                    # to be checked but it seems some Wikipedia pages are not in our Wikidata
-                    # see for instance Zante%2C%20California
-                    return n, "O", ""
-                return n, el, prev_ann
-    return "O", "O", ""
-
-
-# ----------------------------------------------------
-def postprocess_rel(rel_preds, dSentences, gold_tokenization, wikigaz_ids):
-    """
-    For each sentence, retokenizes the REL output to match the gold
-    standard tokenization.
-
-    Arguments:
-        XXX
-        dSentences (dict): dictionary that maps a sentence id to the text.
-        gold_tokenization (dict): dictionary that contains the tokenized
-            sentence with gold standard annotations of entity type and
-            link, per sentence.
-
-    Returns:
-        dREL (dict): dictionary that maps a sentence id with the REL predictions,
-            retokenized as in the gold standard.
-    """
-    dREL = dict()
-    for sent_id in tqdm(list(dSentences.keys())):
-        sentence_preds = []
-        prev_ann = ""
-        for token in gold_tokenization[sent_id]:
-            start = token["start"]
-            end = token["end"]
-            word = token["word"]
-            current_preds = rel_preds.get(sent_id, [])
-            n, el, prev_ann = match_ent(
-                current_preds, start, end, prev_ann, wikigaz_ids
-            )
-            sentence_preds.append([word, n, el])
-        dREL[sent_id] = sentence_preds
-    return dREL
-
-
 # ----------------------------------------------------
 def create_mentions_df(experiment):
     """
@@ -1086,28 +837,3 @@ def store_results(experiment, task, how_split, which_split):
         experiment.processed_data["skys"],
         test_articles,
     )
-
-
-def store_rel(experiment, dREL, approach, how_split, which_split):
-    hipe_scorer_results_path = os.path.join(experiment.results_path, experiment.dataset)
-    scenario_name = (
-        approach
-        + "_"
-        + experiment.myner.model  # The model name is needed due to tokenization
-        + "_"
-        + how_split
-    )
-
-    # Find article ids of the corresponding test set (e.g. 'dev' of the original split,
-    # 'test' of the Ashton1860 split, etc):
-    all = experiment.dataset_df
-    test_articles = list(all[all[how_split] == "test"].article_id.unique())
-    test_articles = [str(art) for art in test_articles]
-
-    # Store REL results formatted for CLEF-HIPE scorer:
-    store_for_scorer(
-        hipe_scorer_results_path,
-        scenario_name,
-        dREL,
-        test_articles,
-    )
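The `rel_end_to_end` helper deleted above reappears in `utils/rel_e2e.py` below; it is a thin wrapper around REL's public endpoint. A hypothetical standalone call (network access required; the per-annotation field layout is an assumption based on how this repository's own code consumes it — `match_ent` reads the Wikipedia title at index 3 and the NER tag as the last element, matching the tuple built in `rel_process_results`):

```python
import requests

API_URL = "https://rel.cs.ru.nl/api"

# Same request shape as rel_end_to_end in utils/rel_e2e.py; an empty
# "spans" list asks REL to do mention detection itself.
response = requests.post(
    API_URL, json={"text": "A fire broke out in Stalybridge.", "spans": []}
)
for annotation in response.json():
    # Index 3 is the predicted Wikipedia title; the last element is the
    # NER tag (cf. how match_ent indexes into each prediction).
    print(annotation[3], annotation[-1])
```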
diff --git a/utils/rel_e2e.py b/utils/rel_e2e.py
index d7c916c3..a4675ab2 100644
--- a/utils/rel_e2e.py
+++ b/utils/rel_e2e.py
@@ -1,23 +1,203 @@
 import json
 import os
 import sys
+import requests
 from pathlib import Path
-
-import pandas as pd
+from tqdm import tqdm
 
 # Import utils
 sys.path.insert(0, os.path.abspath(os.path.pardir))
-from utils import process_data
-from utils.REL.entity_disambiguation import EntityDisambiguation
+from utils import process_data, process_wikipedia
 
-# REL imports
-from utils.REL.mention_detection import MentionDetection
-from utils.REL.ner import load_flair_ner
 
+def rel_end_to_end(sent):
+    """
+    REL end-to-end entity linking using the API.
 
-def run_rel_experiments(self):
+    Arguments:
+        sent (str): a sentence in plain text.
+
+    Returns:
+        el_result (dict): the output from REL end-to-end API
+            for the input sentence.
+    """
+    API_URL = "https://rel.cs.ru.nl/api"
+    el_result = requests.post(API_URL, json={"text": sent, "spans": []}).json()
+    return el_result
+
+
+def get_rel_from_api(dSentences, rel_end2end_path):
+    """
+    Uses the REL API to do end-to-end entity linking.
+
+    Arguments:
+        dSentences (dict): dictionary of sentences, where the
+            key is the article-sent identifier and the value
+            is the full text of the sentence.
+        rel_end2end_path (str): the path of the file where the
+            REL results will be stored.
+
+    Returns:
+        None. The REL results are written to a JSON file at
+            ``rel_end2end_path``.
+    """
+    # Dictionary to store REL predictions:
+    rel_preds = dict()
+    if Path(rel_end2end_path).exists():
+        with open(rel_end2end_path) as f:
+            rel_preds = json.load(f)
+    print("\nObtain REL linking from API (unless already stored):")
+    for s in tqdm(dSentences):
+        if not s in rel_preds:
+            rel_preds[s] = rel_end_to_end(dSentences[s])
+            # Append per processed sentence in case of API limit:
+            with open(rel_end2end_path, "w") as fp:
+                json.dump(rel_preds, fp)
+    with open(rel_end2end_path) as f:
+        rel_preds = json.load(f)
+
+
+def match_wikipedia_to_wikidata(wiki_title):
+    """
+    Get the Wikidata ID from a Wikipedia title.
+
+    Arguments:
+        wiki_title (str): a Wikipedia title, underscore-separated.
+
+    Returns:
+        a string, either the Wikidata QID of the corresponding entity, or NIL.
+    """
+    wqid = process_wikipedia.title_to_id(
+        wiki_title,
+        lower=False,
+        path_to_db="../resources/wikipedia/index_enwiki-latest.db",
+    )
+    if not wqid:
+        wqid = "NIL"
+    return wqid
+
+
+def match_ent(pred_ents, start, end, prev_ann, gazetteer_ids):
+    """
+    Function that, given the position in a sentence of a
+    specific gold standard token, finds the corresponding
+    string and prediction information returned by REL.
+
+    Arguments:
+        pred_ents (list): a list of lists, each inner list
+            corresponds to a token.
+        start (int): start character of a token in the gold standard.
+        end (int): end character of a token in the gold standard.
+        prev_ann (str): entity type of the previous token.
+        gazetteer_ids (set): set of Wikidata IDs of entities in the gazetteer.
+
+    Returns:
+        A tuple with three elements: (1) the entity type, (2) the
+        entity link and (3) the entity type of the previous token.
+ """ + for ent in pred_ents: + wqid = match_wikipedia_to_wikidata(ent[3]) + # If entity is a LOC or linked entity is in our KB: + if ent[-1] == "LOC" or wqid in gazetteer_ids: + # Any place with coordinates is considered a location + # throughout our experiments: + ent_type = "LOC" + st_ent = ent[0] + len_ent = ent[1] + if start >= st_ent and end <= (st_ent + len_ent): + if prev_ann == ent_type: + ent_pos = "I-" + else: + ent_pos = "B-" + prev_ann = ent_type + + n = ent_pos + ent_type + try: + el = ent_pos + match_wikipedia_to_wikidata(ent[3]) + except Exception as e: + print(e) + # to be checked but it seems some Wikipedia pages are not in our Wikidata + # see for instance Zante%2C%20California + return n, "O", "" + return n, el, prev_ann + return "O", "O", "" - # Continue only is flag is True: + +def postprocess_rel(rel_preds, dSentences, gold_tokenization, wikigaz_ids): + """ + For each sentence, retokenizes the REL output to match the gold + standard tokenization. + + Arguments: + rel_preds (dict): dictionary containing the predictions using REL. + dSentences (dict): dictionary that maps a sentence id to the text. + gold_tokenization (dict): dictionary that contains the tokenized + sentence with gold standard annotations of entity type and + link, per sentence. + wikigaz_ids (set): set of Wikidata IDs of entities in the gazetteer. + + Returns: + dREL (dict): dictionary that maps a sentence id with the REL predictions, + retokenized as in the gold standard. + """ + dREL = dict() + for sent_id in tqdm(list(dSentences.keys())): + sentence_preds = [] + prev_ann = "" + for token in gold_tokenization[sent_id]: + start = token["start"] + end = token["end"] + word = token["word"] + current_preds = rel_preds.get(sent_id, []) + n, el, prev_ann = match_ent( + current_preds, start, end, prev_ann, wikigaz_ids + ) + sentence_preds.append([word, n, el]) + dREL[sent_id] = sentence_preds + return dREL + + +def store_rel(experiment, dREL, approach, how_split): + """ + Prepare the data to be stored in the format required by the HIPE scorer. + + Arguments: + experiment (Experiment object): object for the current experiment. + dREL (dict): dictionary with the results using the REL approach. + approach (str): name of the REL approach (only rel_end_to_end_api available). + how_split (str): data split to store the results for. + + Returns: + A tsv with the results in the Conll format required by the scorer. + + """ + hipe_scorer_results_path = os.path.join(experiment.results_path, experiment.dataset) + scenario_name = ( + approach + + "_" + + experiment.myner.model # The model name is needed due to tokenization + + "_" + + how_split + ) + + # Find article ids of the corresponding test set (e.g. 'dev' of the original split, + # 'test' of the Ashton1860 split, etc): + all = experiment.dataset_df + test_articles = list(all[all[how_split] == "test"].article_id.unique()) + test_articles = [str(art) for art in test_articles] + + # Store REL results formatted for CLEF-HIPE scorer: + process_data.store_for_scorer( + hipe_scorer_results_path, + scenario_name, + dREL, + test_articles, + ) + + +def run_rel_experiments(self): + """ + Function that runs the end-to-end experiments using REL. + """ + # Continue only if flag is True: if self.rel_experiments == False: return @@ -25,9 +205,7 @@ def run_rel_experiments(self): print("Start the REL experiments.\n") self.processed_data = self.load_data() - # ------------------------------------------- - # 0. 
-    # -------------------------------------------
-    # 0. LIST EVALUATION SCENARIOS
-    # -------------------------------------------
+    # List of evaluation scenarios
     dict_splits = dict()
     dict_splits["dev"] = ["originalsplit"]
     if self.dataset == "hipe":
         dict_splits["test"] = ["originalsplit"]
     elif self.dataset == "lwm":
         dict_splits["test"] = [
             "originalsplit",
             "Ashton1860",
             "Dorchester1820",
             "Dorchester1830",
             "Dorchester1860",
             "Manchester1780",
             "Manchester1800",
             "Manchester1820",
             "Manchester1860",
             "Poole1860",
         ]
 
-    # -------------------------------------------
-    # REL: SUMMARY OF APPROACHES
-    # -------------------------------------------
     dict_rel_approaches = dict()
 
-    # -------------------------------------------
-    # 1. END TO END FROM API
-    # -------------------------------------------
-    # Run REL end-to-end, as well
-    # Note: Do not move the next block of code,
-    # as REL relies on the tokenisation performed
-    # by the previous method, so it needs to be
-    # run after ther our method.
-    print("* REL: Approach 1")
+    # Run REL end-to-end:
     rel_approach_name = "rel_end_to_end_api"
     Path(self.results_path + self.dataset).mkdir(parents=True, exist_ok=True)
     rel_end2end_path = self.results_path + self.dataset + "/rel_e2d_from_api.json"
-    process_data.get_rel_from_api(self.processed_data["dSentences"], rel_end2end_path)
+    get_rel_from_api(self.processed_data["dSentences"], rel_end2end_path)
     with open(rel_end2end_path) as f:
         rel_preds = json.load(f)
 
-    dREL = process_data.postprocess_rel(
+    dREL = postprocess_rel(
         rel_preds,
         self.processed_data["dSentences"],
         self.processed_data["gold_tok"],
     )
     dict_rel_approaches[rel_approach_name] = {"results": dREL}
 
-    # -------------------------------------------
-    # N. STORE RESULTS PER EVAL SCENARIO
-    # -------------------------------------------
-    # Store results for each split
     for rel_approach_name in dict_rel_approaches:
         for test_split in dict_splits:
             for split in dict_splits[test_split]:
-                # Process REL results:
-                process_data.store_rel(
+                store_rel(
                     self,
                     dict_rel_approaches[rel_approach_name]["results"],
                     approach=rel_approach_name,
                     how_split=split,
-                    which_split=test_split,
                 )
 
     print("... done!\n")
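After `postprocess_rel`, each sentence id maps to one `[word, BIO entity-type label, BIO link label]` triple per gold token, which is the shape `store_rel` hands to the scorer. A purely illustrative example of that structure (the sentence id and the QID are taken from the test fixtures in this patch; the tokens and predictions themselves are hypothetical):

```python
# Hypothetical post-processed REL output for one sentence; the "B-"/"I-"
# prefixes and "NIL" links follow the logic of match_ent above.
dREL = {
    "4939308_1": [
        ["STALYBRIDGE", "B-LOC", "B-Q1398653"],
        ["near", "O", "O"],
        ["Market-street", "B-LOC", "B-NIL"],
    ]
}

# One token per row, as in the TSV written for the HIPE scorer:
for word, ner_label, link_label in dREL["4939308_1"]:
    print(f"{word}\t{ner_label}\t{link_label}")
```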