
Improve docs and move functions
mcollardanuy committed Apr 27, 2023
1 parent 6a7f1cf commit 051d00c
Showing 8 changed files with 233 additions and 322 deletions.
19 changes: 17 additions & 2 deletions experiments/README.md
@@ -4,7 +4,7 @@ Follow these steps to reproduce the experiments in our paper.

### 1. Obtain the external resources [DRAFT]

You will need the following resources, which are created using the code in the wiki2gaz repository ([TODO: add link]) or can be downloaded from [TODO: add link]:
You will need the following resources, which are created using the code in the [wiki2gaz](https://github.com/Living-with-machines/wiki2gaz) repository or can be downloaded from [TODO: add link]:
```
../resources/wikidata/wikidata_gazetteer.csv
../resources/wikidata/entity2class.txt
@@ -14,6 +14,8 @@ You will need the following resources, which are created using the code in the w
../resources/wikipedia/wikidata2wikipedia/index_enwiki-latest.db
```

You will also need the [word2vec embeddings](TODO: add link) trained on 19th-century data. These embeddings were created by Nilo Pedrazzini; for more information, see https://github.com/Living-with-machines/DiachronicEmb-BigHistData.
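Once downloaded, the embeddings can be sanity-checked with a few lines of Python. The sketch below assumes they are standard gensim word2vec model files; the file path and query word are hypothetical placeholders, not names taken from this repository.

```python
# Minimal sketch (not repository code): inspect the downloaded 19th-century
# word2vec embeddings, assuming they are stored as a standard gensim
# Word2Vec model. The path and query word below are hypothetical placeholders.
from gensim.models import Word2Vec

model = Word2Vec.load("../resources/models/w2v/w2v_1800s.model")  # hypothetical path
print(model.wv.most_similar("manchester", topn=5))  # nearest neighbours by cosine similarity
```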

### 2. Preparing the data

To create the datasets that we use in the experiments presented in the paper, run the following command:
@@ -22,4 +24,17 @@ python prepare_data.py
```
This script takes care of downloading the LwM and HIPE datasets and formatting them as needed for the experiments.

### 3. Running the experiments

To run the experiments, use the following script:
```bash
python toponym_resolution.py
```
This script runs all the different scenarios reported in the experiments in the paper.

### 4. Evaluate

To evaluate the different approaches and obtain a table of results like the one provided in the paper, go to the `../evaluation/` directory. There, you should clone the [HIPE scorer](https://github.com/hipe-eval/HIPE-scorer). We are using the code version at commit 50dff4e, and have added the line `return eval_stats` at the end of the `get_results()` function. From `../evaluation/`, run the following script to obtain the results in LaTeX format:
```bash
python display_results.py
```
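For reference, the scorer modification mentioned above is a one-line change. The sketch below is illustrative only, not the HIPE-scorer source: the body of `get_results()` is elided, and only the final `return` reflects the edit described in this section.

```python
# Illustrative sketch only (not the HIPE-scorer source). The local patch on top
# of commit 50dff4e adds a single return statement at the end of get_results(),
# so that the computed statistics can be reused programmatically.
def get_results(*args, **kwargs):
    eval_stats = {}  # placeholder for the statistics the scorer computes
    # ... original evaluation logic elided ...
    return eval_stats  # the line added locally, as described above
```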
7 changes: 6 additions & 1 deletion geoparser/experiment.py → experiments/experiment.py
@@ -75,7 +75,7 @@ def __init__(
)
else:
sys.exit(
"\nError: The dataset has not been created, you should first run the data_processing.py script.\n"
"\nError: The dataset has not been created, you should first run the prepare_data.py script.\n"
)

def __str__(self):
@@ -189,6 +189,11 @@ def prepare_data(self):
return self.processed_data

def linking_experiments(self):
"""
Prepares the data for the linking experiments, creating a mention-based
dataframe. It produces tsv files in the format required by the HIPE
scorer, ready to be evaluated.
"""
# Create a mention-based dataframe for the linking experiments:
processed_df = process_data.create_mentions_df(self)
self.processed_data["processed_df"] = processed_df
3 changes: 2 additions & 1 deletion experiments/toponym_resolution.py
@@ -5,7 +5,8 @@

# Add "../" to path to import utils
sys.path.insert(0, os.path.abspath(os.path.pardir))
from geoparser import experiment, recogniser, ranking, linking
from geoparser import recogniser, ranking, linking
from experiments import experiment

# Choose test scenario:
test_scenario = "test" # "dev" while experimenting, "test" for the final numbers
10 changes: 5 additions & 5 deletions tests/test_disambiguation.py
@@ -60,10 +60,10 @@ def test_prepare_initial_data():
"experiments/outputs/data/lwm/linking_df_split.tsv", sep="\t"
).iloc[:1]
parsed_doc = rel_utils.prepare_initial_data(df, context_len=100)
assert parsed_doc["4939308"][0]["mention"] == "STALYBRIDGE"
assert parsed_doc["4939308"][0]["gold"][0] == "Q1398653"
assert parsed_doc["4939308"][3]["mention"] == "Market-street"
assert parsed_doc["4939308"][3]["gold"] == "NIL"
assert parsed_doc["4939308_1"][0]["mention"] == "STALYBRIDGE"
assert parsed_doc["4939308_1"][0]["gold"][0] == "Q1398653"
assert parsed_doc["4939308_6"][1]["mention"] == "Market-street"
assert parsed_doc["4939308_6"][1]["gold"] == "NIL"


def test_train():
@@ -163,7 +163,7 @@ def test_train():
)

# assert expected performance on test set
assert 0.60 < mylinker.rel_params["ed_model"].best_performance["f1"]
assert mylinker.rel_params["ed_model"].best_performance["f1"] == 0.6583541147132169


def test_load_eval_model():
5 changes: 3 additions & 2 deletions tests/test_experiments.py
@@ -7,7 +7,8 @@

# Add "../" to path to import utils
sys.path.insert(0, os.path.abspath(os.path.pardir))
from geoparser import experiment, linking, ranking, recogniser
from geoparser import linking, ranking, recogniser
from experiments import experiment


def test_wrong_dataset_path():
@@ -25,7 +26,7 @@ def test_wrong_dataset_path():

assert (
cm.value.code
== "\nError: The dataset has not been created, you should first run the data_processing.py script.\n"
== "\nError: The dataset has not been created, you should first run the prepare_data.py script.\n"
)


8 changes: 5 additions & 3 deletions utils/REL/entity_disambiguation.py
@@ -7,7 +7,7 @@
import pickle
import numpy as np
from pathlib import Path
from random import shuffle
import random
import torch.optim as optim
from string import punctuation
from torch.autograd import Variable
@@ -27,6 +27,9 @@
for the ED step.
"""

RANDOM_SEED = 42
random.seed(RANDOM_SEED)


class EntityDisambiguation:
def __init__(self, db_embs, user_config, reset_embeddings=False):
@@ -163,7 +166,7 @@ def train(self, org_train_dataset, org_dev_dataset):
eval_after_n_epochs = self.config["eval_after_n_epochs"]

for e in range(self.config["n_epochs"]):
shuffle(train_dataset)
random.shuffle(train_dataset)

total_loss = 0
for dc, batch in enumerate(train_dataset): # each document is a minibatch
@@ -397,7 +400,6 @@ def __predict(self, data, include_timing=False, eval_raw=False):
timing = []

for batch in data: # each document is a minibatch

start = time.time()

token_ids = [
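The seeding change in this file is presumably also what allows `tests/test_disambiguation.py` (earlier in this commit) to assert an exact F1 score rather than a lower bound: seeding Python's module-level random number generator once makes `random.shuffle` reorder the training documents identically on every run. A minimal, self-contained illustration of that behaviour (not repository code):

```python
import random

# Minimal illustration (not repository code): with a fixed seed, the
# module-level RNG shuffles a list into the same order on every run,
# which is what makes the per-epoch document order reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

docs = ["doc_a", "doc_b", "doc_c", "doc_d", "doc_e"]
random.shuffle(docs)
print(docs)  # identical output across runs with the same seed
```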

