
Improve docs and move functions
mcollardanuy committed Apr 27, 2023
1 parent 6a7f1cf commit 051d00c
Showing 8 changed files with 233 additions and 322 deletions.
19 changes: 17 additions & 2 deletions experiments/README.md
@@ -4,7 +4,7 @@ Follow these steps to reproduce the experiments in our paper.

### 1. Obtain the external resources [DRAFT]

You will need the following resources, which are created using the code in the wiki2gaz repository ([TODO: add link]) or can be downloaded from [TODO: add link]:
You will need the following resources, which are created using the code in the [wiki2gaz](https://github.com/Living-with-machines/wiki2gaz) repository or can be downloaded from [TODO: add link]:
```
../resources/wikidata/wikidata_gazetteer.csv
../resources/wikidata/entity2class.txt
@@ -14,6 +14,8 @@ You will need the following resources, which are created using the code in the w
../resources/wikipedia/wikidata2wikipedia/index_enwiki-latest.db
```

You will also need the [word2vec embeddings](TODO: add link) trained on 19th-century data. These embeddings were created by Nilo Pedrazzini; for more information, see https://github.com/Living-with-machines/DiachronicEmb-BigHistData.
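Once downloaded, the embeddings can be sanity-checked with a few lines of Python. The sketch below assumes they are standard gensim word2vec model files; the file path and query word are hypothetical placeholders, not names taken from this repository.

```python
# Minimal sketch (not repository code): inspect the downloaded 19th-century
# word2vec embeddings, assuming they are stored as a standard gensim
# Word2Vec model. The path and query word below are hypothetical placeholders.
from gensim.models import Word2Vec

model = Word2Vec.load("../resources/models/w2v/w2v_1800s.model")  # hypothetical path
print(model.wv.most_similar("manchester", topn=5))  # nearest neighbours by cosine similarity
```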

### 2. Preparing the data

To create the datasets that we use in the experiments presented in the paper, run the following command:
@@ -22,4 +24,17 @@ python prepare_data.py
```
This script takes care of downloading the LwM and HIPE datasets and formatting them as needed for the experiments.

### 3. Running the experiments

To run the experiments, use the following script:
```bash
python toponym_resolution.py
```
This script runs all the different scenarios reported in the experiments in the paper.

### 4. Evaluate

To evaluate the different approaches and obtain a table of results like the one provided in the paper, go to the `../evaluation/` directory. There, you should clone the [HIPE scorer](https://github.com/hipe-eval/HIPE-scorer). We are using the code version at commit 50dff4e, and have added the line `return eval_stats` at the end of the `get_results()` function. From `../evaluation/`, run the following script to obtain the results in LaTeX format:
```bash
python display_results.py
```
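For reference, the scorer modification mentioned above is a one-line change. The sketch below is illustrative only, not the HIPE-scorer source: the body of `get_results()` is elided, and only the final `return` reflects the edit described in this section.

```python
# Illustrative sketch only (not the HIPE-scorer source). The local patch on top
# of commit 50dff4e adds a single return statement at the end of get_results(),
# so that the computed statistics can be reused programmatically.
def get_results(*args, **kwargs):
    eval_stats = {}  # placeholder for the statistics the scorer computes
    # ... original evaluation logic elided ...
    return eval_stats  # the line added locally, as described above
```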
7 changes: 6 additions & 1 deletion geoparser/experiment.py → experiments/experiment.py
@@ -75,7 +75,7 @@ def __init__(
)
else:
sys.exit(
"\nError: The dataset has not been created, you should first run the data_processing.py script.\n"
"\nError: The dataset has not been created, you should first run the prepare_data.py script.\n"
)

def __str__(self):
@@ -189,6 +189,11 @@ def prepare_data(self):
return self.processed_data

def linking_experiments(self):
"""
Prepares the data for the linking experiments, creating a mention-based
dataframe. It produces tsv files in the format required by the HIPE
scorer, ready to be evaluated.
"""
# Create a mention-based dataframe for the linking experiments:
processed_df = process_data.create_mentions_df(self)
self.processed_data["processed_df"] = processed_df
3 changes: 2 additions & 1 deletion experiments/toponym_resolution.py
@@ -5,7 +5,8 @@

# Add "../" to path to import utils
sys.path.insert(0, os.path.abspath(os.path.pardir))
from geoparser import experiment, recogniser, ranking, linking
from geoparser import recogniser, ranking, linking
from experiments import experiment

# Choose test scenario:
test_scenario = "test" # "dev" while experimenting, "test" for the final numbers
10 changes: 5 additions & 5 deletions tests/test_disambiguation.py
@@ -60,10 +60,10 @@ def test_prepare_initial_data():
"experiments/outputs/data/lwm/linking_df_split.tsv", sep="\t"
).iloc[:1]
parsed_doc = rel_utils.prepare_initial_data(df, context_len=100)
assert parsed_doc["4939308"][0]["mention"] == "STALYBRIDGE"
assert parsed_doc["4939308"][0]["gold"][0] == "Q1398653"
assert parsed_doc["4939308"][3]["mention"] == "Market-street"
assert parsed_doc["4939308"][3]["gold"] == "NIL"
assert parsed_doc["4939308_1"][0]["mention"] == "STALYBRIDGE"
assert parsed_doc["4939308_1"][0]["gold"][0] == "Q1398653"
assert parsed_doc["4939308_6"][1]["mention"] == "Market-street"
assert parsed_doc["4939308_6"][1]["gold"] == "NIL"


def test_train():
@@ -163,7 +163,7 @@ def test_train():
)

# assert expected performance on test set
assert 0.60 < mylinker.rel_params["ed_model"].best_performance["f1"]
assert mylinker.rel_params["ed_model"].best_performance["f1"] == 0.6583541147132169


def test_load_eval_model():
5 changes: 3 additions & 2 deletions tests/test_experiments.py
@@ -7,7 +7,8 @@

# Add "../" to path to import utils
sys.path.insert(0, os.path.abspath(os.path.pardir))
from geoparser import experiment, linking, ranking, recogniser
from geoparser import linking, ranking, recogniser
from experiments import experiment


def test_wrong_dataset_path():
@@ -25,7 +26,7 @@ def test_wrong_dataset_path():

assert (
cm.value.code
== "\nError: The dataset has not been created, you should first run the data_processing.py script.\n"
== "\nError: The dataset has not been created, you should first run the prepare_data.py script.\n"
)


8 changes: 5 additions & 3 deletions utils/REL/entity_disambiguation.py
@@ -7,7 +7,7 @@
import pickle
import numpy as np
from pathlib import Path
from random import shuffle
import random
import torch.optim as optim
from string import punctuation
from torch.autograd import Variable
@@ -27,6 +27,9 @@
for the ED step.
"""

RANDOM_SEED = 42
random.seed(RANDOM_SEED)


class EntityDisambiguation:
def __init__(self, db_embs, user_config, reset_embeddings=False):
@@ -163,7 +166,7 @@ def train(self, org_train_dataset, org_dev_dataset):
eval_after_n_epochs = self.config["eval_after_n_epochs"]

for e in range(self.config["n_epochs"]):
shuffle(train_dataset)
random.shuffle(train_dataset)

total_loss = 0
for dc, batch in enumerate(train_dataset): # each document is a minibatch
@@ -397,7 +400,6 @@ def __predict(self, data, include_timing=False, eval_raw=False):
timing = []

for batch in data: # each document is a minibatch

start = time.time()

token_ids = [
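The seeding change in this file is presumably also what allows `tests/test_disambiguation.py` (earlier in this commit) to assert an exact F1 score rather than a lower bound: seeding Python's module-level random number generator once makes `random.shuffle` reorder the training documents identically on every run. A minimal, self-contained illustration of that behaviour (not repository code):

```python
import random

# Minimal illustration (not repository code): with a fixed seed, the
# module-level RNG shuffles a list into the same order on every run,
# which is what makes the per-epoch document order reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

docs = ["doc_a", "doc_b", "doc_c", "doc_d", "doc_e"]
random.shuffle(docs)
print(docs)  # identical output across runs with the same seed
```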

