diff --git a/experiments/experiment.py b/experiments/experiment.py
index db578425..06855585 100644
--- a/experiments/experiment.py
+++ b/experiments/experiment.py
@@ -223,10 +223,7 @@ def prepare_data(self) -> dict:
         # Obtain candidates per sentence:
         for sentence_id in tqdm(dMentionsPred):
             pred_mentions_sent = dMentionsPred[sentence_id]
-            (
-                wk_cands,
-                self.myranker.already_collected_cands,
-            ) = self.myranker.find_candidates(pred_mentions_sent)
+            wk_cands = self.myranker.find_candidates(pred_mentions_sent)
             dCandidates[sentence_id] = wk_cands
 
         # -------------------------------------------
@@ -466,10 +463,12 @@ def create_mentions_df(self) -> pd.DataFrame:
             data=rows,
         )
 
+        print(f"Saving to {os.path.join(self.data_path,self.dataset,f'{self.myner.model}_{cand_approach}')}")
         output_path = (
-            self.data_path + self.dataset + "/" + self.myner.model + "_" + cand_approach
+            os.path.join(self.data_path,self.dataset,f"{self.myner.model}_{cand_approach}")
         )
+
 
         # List of columns to merge (i.e. columns where we have indicated
         # out data splits), and "article_id", the columns on which we
         # will merge the data:
diff --git a/pyproject.toml b/pyproject.toml
index 9a84746b..609dfd55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,3 +65,8 @@ include = '\.pyi?$'
 
 [tool.isort]
 profile = "black"
+
+[tool.pytest.ini_options]
+markers = [
+    "deezy: tests which need a deezy model",
+]
\ No newline at end of file
diff --git a/t_res/geoparser/pipeline.py b/t_res/geoparser/pipeline.py
index 648d281f..148b43d2 100644
--- a/t_res/geoparser/pipeline.py
+++ b/t_res/geoparser/pipeline.py
@@ -150,9 +150,6 @@ def __init__(
             self.myranker
         )
 
-        # Check we've actually loaded the mentions2wikidata dictionary:
-        assert self.myranker.mentions_to_wikidata["London"] is not None
-
     def run_sentence(
         self,
         sentence: str,
diff --git a/t_res/geoparser/recogniser.py b/t_res/geoparser/recogniser.py
index 2d4197e9..975f0010 100644
--- a/t_res/geoparser/recogniser.py
+++ b/t_res/geoparser/recogniser.py
@@ -270,7 +270,7 @@ def compute_metrics(p: Tuple[list, list]) -> dict:
         training_args = TrainingArguments(
             output_dir=self.model_path,
             evaluation_strategy="epoch",
-            logging_dir=self.model_path + "runs/" + self.model,
+            logging_dir=os.path.join(self.model_path,"runs/",self.model),
             learning_rate=self.training_args["learning_rate"],
             per_device_train_batch_size=self.training_args["batch_size"],
             per_device_eval_batch_size=self.training_args["batch_size"],
@@ -295,7 +295,7 @@ def compute_metrics(p: Tuple[list, list]) -> dict:
         trainer.evaluate()
 
         # Save the model:
-        trainer.save_model(self.model_path + self.model + ".model")
+        trainer.save_model(os.path.join(self.model_path,f"{self.model}.model"))
 
     # -------------------------------------------------------------
     def create_pipeline(self) -> Pipeline:
diff --git a/tests/test_data_processing.py b/tests/test_data_processing.py
index 31b5462a..6b5570ad 100644
--- a/tests/test_data_processing.py
+++ b/tests/test_data_processing.py
@@ -6,11 +6,13 @@
 from pathlib import Path
 
 import pandas as pd
+import pytest
 
-large_resources = "/resources/" # path to large resources
-small_resources = "./resources/" # path to small resources
-processed_path_lwm = "./experiments/outputs/data/lwm/" # path to processed LwM data
-processed_path_hipe = "./experiments/outputs/data/hipe/" # path to processed LwM data
+current_dir = Path(__file__).parent.resolve()
+
+small_resources = os.path.join(current_dir,"sample_files/resources/") # path to small resources
+processed_path_lwm = 
os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/") # path to processed LwM data +processed_path_hipe = os.path.join(current_dir,"sample_files/experiments/outputs/data/hipe/") # path to processed LwM data def test_publication_metadata_exists(): @@ -53,8 +55,8 @@ def test_original_lwm_data(): train_metadata = pd.read_csv(path_train_metadata, sep="\t") test_metadata = pd.read_csv(path_test_metadata, sep="\t") # Assert the size of the metadata files: - assert train_metadata.shape[0] == 343 - assert test_metadata.shape[0] == 112 + assert train_metadata.shape[0] == 1 + assert test_metadata.shape[0] == 1 assert train_metadata.shape[1] == 10 assert test_metadata.shape[1] == 10 # Items in metadata match number of files in directory, for test: @@ -98,8 +100,8 @@ def test_lwm_ner_conversion_fine(): dtype={"id": str}, ) # Assert size of the train and dev sets: - assert df_ner_train.shape == (5216, 3) - assert df_ner_dev.shape == (1304, 3) + assert df_ner_train.shape == (141, 3) + assert df_ner_dev.shape == (41, 3) # Assert number of sentences in train and dev (length of list and set should be the same): assert ( len(list(df_ner_train["id"]) + list(df_ner_dev["id"])) @@ -107,45 +109,11 @@ def test_lwm_ner_conversion_fine(): == df_ner_train.shape[0] + df_ner_dev.shape[0] ) # Assert ID is read as string: - assert type(df_ner_train["id"].iloc[0]) == str + assert isinstance(df_ner_train["id"].iloc[0],str) # Assert number of unique articles: train_articles = [x.split("_")[0] for x in list(df_ner_train["id"])] dev_articles = [x.split("_")[0] for x in list(df_ner_dev["id"])] - assert len(set(train_articles + dev_articles)) == 343 - - -def test_lwm_ner_conversion_coarse(): - """ - Test process_lwm_for_ner is not missing articles. - """ - df_ner_train = pd.read_json( - os.path.join(f"{processed_path_lwm}", "ner_coarse_train.json"), - orient="records", - lines=True, - dtype={"id": str}, - ) - df_ner_dev = pd.read_json( - os.path.join(f"{processed_path_lwm}", "ner_coarse_dev.json"), - orient="records", - lines=True, - dtype={"id": str}, - ) - # Assert size of the train and dev sets: - assert df_ner_train.shape == (5216, 3) - assert df_ner_dev.shape == (1304, 3) - # Assert number of sentences in train and dev (length of list and set should be the same): - assert ( - len(list(df_ner_train["id"]) + list(df_ner_dev["id"])) - == len(set(list(df_ner_train["id"]) + list(df_ner_dev["id"]))) - == df_ner_train.shape[0] + df_ner_dev.shape[0] - ) - # Assert ID is read as string: - assert type(df_ner_train["id"].iloc[0]) == str - # Assert number of unique articles: - train_articles = [x.split("_")[0] for x in list(df_ner_train["id"])] - dev_articles = [x.split("_")[0] for x in list(df_ner_dev["id"])] - assert len(set(train_articles + dev_articles)) == 343 - + assert len(set(train_articles + dev_articles)) == 11 def test_lwm_linking_conversion(): """ @@ -156,26 +124,26 @@ def test_lwm_linking_conversion(): sep="\t", ) # Assert size of the dataset (i.e. 
number of articles): - assert df_linking.shape[0] == 455 + assert df_linking.shape[0] == 14 # Assert if place has been filled correctly: for x in df_linking.place: - assert type(x) == str + assert isinstance(x,str) assert x != "" # Assert if place QID has been filled correctly: for x in df_linking.place_wqid: - assert type(x) == str + assert isinstance(x,str) assert x != "" for x in df_linking.annotations: x = literal_eval(x) for ann in x: assert ann["wkdt_qid"] == "NIL" or ann["wkdt_qid"].startswith("Q") - assert df_linking[df_linking["originalsplit"] == "train"].shape[0] == 229 - assert df_linking[df_linking["originalsplit"] == "dev"].shape[0] == 114 - assert df_linking[df_linking["originalsplit"] == "test"].shape[0] == 112 - assert df_linking[df_linking["withouttest"] == "train"].shape[0] == 153 - assert df_linking[df_linking["withouttest"] == "dev"].shape[0] == 76 - assert df_linking[df_linking["withouttest"] == "test"].shape[0] == 114 - assert df_linking[df_linking["withouttest"] == "left_out"].shape[0] == 112 + assert df_linking[df_linking["originalsplit"] == "train"].shape[0] == 10 + assert df_linking[df_linking["originalsplit"] == "dev"].shape[0] == 2 + assert df_linking[df_linking["originalsplit"] == "test"].shape[0] == 2 + assert df_linking[df_linking["withouttest"] == "train"].shape[0] == 8 + assert df_linking[df_linking["withouttest"] == "dev"].shape[0] == 2 + assert df_linking[df_linking["withouttest"] == "test"].shape[0] == 2 + assert df_linking[df_linking["withouttest"] == "left_out"].shape[0] == 2 test_withouttest = set( list(df_linking[df_linking["withouttest"] == "test"].article_id) ) @@ -185,7 +153,7 @@ def test_lwm_linking_conversion(): # Test articles of the original split and without test should not overlap: assert not (test_withouttest & test_originalsplit) - +@pytest.mark.skip(reason="Requires HIPE data") def test_hipe_linking_conversion(): """ Test process_hipe_for_linking is not missing articles. 
@@ -211,11 +179,11 @@ def test_hipe_linking_conversion(): assert not (test_withouttest & test_originalsplit) # Assert if place has been filled correctly: for x in df_linking.place: - assert type(x) == str + assert isinstance(x,str) assert x != "" # Assert if place QID has been filled correctly: for x in df_linking.place_wqid: - assert type(x) == str + assert isinstance(x,str) assert x != "" # Do HIPE stats match https://github.com/hipe-eval/HIPE-2022-data/blob/main/notebooks/hipe2022-datasets-stats.ipynb number_locs = 0 diff --git a/tests/test_deezy.py b/tests/test_deezy.py new file mode 100644 index 00000000..a72fa630 --- /dev/null +++ b/tests/test_deezy.py @@ -0,0 +1,44 @@ +import os +from pathlib import Path + +import pytest +from DeezyMatch import candidate_ranker + +current_dir = Path(__file__).parent.resolve() + +@pytest.mark.deezy(reason="Needs deezy model") +def test_deezy_match_deezy_candidate_ranker(tmp_path): + deezy_parameters = { + # Paths and filenames of DeezyMatch models and data: + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), + "dm_cands": "wkdtalts", + "dm_model": "w2v_ocr", + "dm_output": "deezymatch_on_the_fly", + # Ranking measures: + "ranking_metric": "faiss", + "selection_threshold": 50, + "num_candidates": 1, + "verbose": False, + # DeezyMatch training: + "overwrite_training": False, + "do_test": False, + } + + dm_path = deezy_parameters["dm_path"] + dm_cands = deezy_parameters["dm_cands"] + dm_model = deezy_parameters["dm_model"] + dm_output = deezy_parameters["dm_output"] + + query = ["-", "ST G", "• - , i", "- P", "• FERRIS"] + + candidates = candidate_ranker( + candidate_scenario=os.path.join(dm_path, "combined", dm_cands + "_" + dm_model), + query=query, + ranking_metric=deezy_parameters["ranking_metric"], + selection_threshold=deezy_parameters["selection_threshold"], + num_candidates=deezy_parameters["num_candidates"], + search_size=deezy_parameters["num_candidates"], + verbose=deezy_parameters["verbose"], + output_path=os.path.join(tmp_path,dm_output), + ) + assert len(candidates) == len(query) diff --git a/tests/test_disambiguation.py b/tests/test_disambiguation.py index 69a9dc73..010fed15 100644 --- a/tests/test_disambiguation.py +++ b/tests/test_disambiguation.py @@ -3,12 +3,14 @@ import sys from pathlib import Path +import pytest import pandas as pd from t_res.geoparser import linking, pipeline, ranking, recogniser from t_res.utils import rel_utils from t_res.utils.REL import entity_disambiguation +current_dir = Path(__file__).parent.resolve() def test_embeddings(): """ @@ -16,7 +18,7 @@ def test_embeddings(): """ # Test 1: Check glove embeddings mentions = ["in", "apple"] - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: cursor = conn.cursor() embs = rel_utils.get_db_emb(cursor, mentions, "snd") assert len(mentions) == len(embs) @@ -41,29 +43,18 @@ def test_embeddings(): embs = rel_utils.get_db_emb(cursor, mentions, "entity") assert embs == [None] - -def test_prepare_initial_data(): - df = pd.read_csv( - "experiments/outputs/data/lwm/linking_df_split.tsv", sep="\t" - ).iloc[:1] - parsed_doc = rel_utils.prepare_initial_data(df) - assert parsed_doc["4939308_1"][0]["mention"] == "STALYBRIDGE" - assert parsed_doc["4939308_1"][0]["gold"][0] == "Q1398653" - assert parsed_doc["4939308_6"][1]["mention"] == "Market-street" - assert parsed_doc["4939308_6"][1]["gold"] == "NIL" - - -def test_train(): 
+@pytest.mark.deezy(reason="Needs deezy model") +def test_train(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) + model="ner_test", # NER model name prefix (will have suffixes appended) pipe=None, # We'll store the NER pipeline here base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + model_path=str(tmp_path), # Path where the NER model is or will be stored training_args={ "batch_size": 8, - "num_train_epochs": 10, + "num_train_epochs": 1, "learning_rate": 0.00005, "weight_decay": 0.0, }, @@ -74,7 +65,7 @@ def test_train(): myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -83,13 +74,13 @@ def test_train(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": str(tmp_path), + "w2v_ocr_model": "w2v_1800_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -103,16 +94,16 @@ def test_train(): "do_test": False, }, ) - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: - cursor = conn.cursor() + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: + cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "resources/models/disambiguation/", - "data_path": "experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm"), "training_split": "originalsplit", "db_embeddings": cursor, "with_publication": False, @@ -152,18 +143,18 @@ def test_train(): # assert expected performance on test set assert mylinker.rel_params["ed_model"].best_performance["f1"] == 0.6288416075650118 - -def test_load_eval_model(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_load_eval_model(tmp_path): myner = recogniser.Recogniser( model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) pipe=None, # We'll store the NER pipeline here base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) 
- model_path="resources/models/", # Path where the NER model is or will be stored + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + model_path=str(tmp_path), # Path where the NER model is or will be stored training_args={ "batch_size": 8, - "num_train_epochs": 10, + "num_train_epochs": 1, "learning_rate": 0.00005, "weight_decay": 0.0, }, @@ -174,7 +165,7 @@ def test_load_eval_model(): myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -183,13 +174,13 @@ def test_load_eval_model(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), + "w2v_ocr_path": str(tmp_path), "w2v_ocr_model": "w2v_*_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -204,18 +195,16 @@ def test_load_eval_model(): }, ) - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: cursor = conn.cursor() - mylinker = linking.Linker( method="reldisamb", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "resources/models/disambiguation/", - "data_path": "experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm"), "training_split": "originalsplit", - "topn_candidates": 10, "db_embeddings": cursor, "with_publication": False, "without_microtoponyms": False, @@ -251,19 +240,19 @@ def test_load_eval_model(): == entity_disambiguation.EntityDisambiguation ) - -def test_predict(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_predict(tmp_path): myner = recogniser.Recogniser( model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) pipe=None, # We'll store the NER pipeline here base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + model_path=str(tmp_path), # Path where the NER model is or will be stored training_args={ "learning_rate": 5e-5, "batch_size": 16, - "num_train_epochs": 4, + "num_train_epochs": 1, "weight_decay": 0.01, }, overwrite_training=False, # Set to True if you want to overwrite model if existing @@ -273,7 +262,7 @@ def test_predict(): myranker = ranking.Ranker( method="deezymatch", - 
resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -282,13 +271,13 @@ def test_predict(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": str(tmp_path), + "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -302,23 +291,21 @@ def test_predict(): "do_test": False, }, ) - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: - cursor = conn.cursor() + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: + cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "resources/models/disambiguation/", - "data_path": "experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm"), "training_split": "originalsplit", "db_embeddings": cursor, "with_publication": True, "without_microtoponyms": True, "do_test": False, - "default_publname": "United Kingdom", - "default_publwqid": "Q145", }, overwrite_training=False, ) @@ -330,7 +317,7 @@ def test_predict(): place="London", place_wqid="Q84", ) - assert type(predictions) == list + assert isinstance(predictions,list) assert predictions[1]["prediction"] in predictions[1]["cross_cand_score"] diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 5c4a0a41..b9a002e1 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -1,23 +1,25 @@ import os import sys from ast import literal_eval +from pathlib import Path import pandas as pd import pytest # Add "../" to path to import experiment -sys.path.insert(0, os.path.abspath("../")) +current_dir = Path(__file__).parent.resolve() +sys.path.insert(0, os.path.join(current_dir,"../")) from experiments import experiment -from t_res.geoparser import linking, ranking, recogniser +from t_res.geoparser import linking, ranking, recogniser -def test_wrong_dataset_path(): +def test_experiments_wrong_dataset_path(tmp_path): with pytest.raises(SystemExit) as cm: experiment.Experiment( dataset="lwm", data_path="wrong_path/", dataset_df=pd.DataFrame(), - results_path="experiments/outputs/results/", + results_path=str(tmp_path), myner="test", myranker="test", mylinker="test", @@ -30,8 +32,8 @@ def test_wrong_dataset_path(): ) -def test_load_data(): - data = pd.read_csv("experiments/outputs/data/lwm/linking_df_split.tsv", sep="\t") +def test_load_data(tmp_path): + data = pd.read_csv(os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/linking_df_split.tsv"), sep="\t") ids = set() for idx, row in data.iterrows(): @@ -41,29 +43,34 @@ def test_load_data(): ids.add(str(article_id) + "_" + str(sent["sentence_pos"])) myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) - pipe=None, # We'll store the 
NER pipeline here - base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored - training_args=dict(), + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, + base_model="khosseini/bert_1760_1900", # Base model to fine-tune + model_path=str(tmp_path), # Path where the NER model will be stored + training_args={ + "batch_size": 8, + "num_train_epochs": 1, + "learning_rate": 0.00005, + "weight_decay": 0.0, + }, overwrite_training=False, # Set to True if you want to overwrite model if existing do_test=False, # Set to True if you want to train on test mode - load_from_hub=False, + load_from_hub=False, ) # Instantiate the ranker: myranker = ranking.Ranker( method="perfectmatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) # -------------------------------------- # Instantiate the linker: mylinker = linking.Linker( method="mostpopular", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) myner.train() @@ -78,9 +85,9 @@ def test_load_data(): # Instantiate the experiment: exp = experiment.Experiment( dataset="lwm", - data_path="experiments/outputs/data/", + data_path=os.path.join(current_dir,"sample_files/experiments/outputs/data/"), dataset_df=pd.DataFrame(), - results_path="experiments/outputs/results/", + results_path=str(tmp_path), myner=myner, myranker=myranker, mylinker=mylinker, @@ -121,18 +128,18 @@ def test_load_data(): assert len(not_empty_dMentionsPred) == len(not_empty_dCandidates) -def test_wrong_ranker_method(): +def test_wrong_ranker_method(tmp_path): ranker = ranking.Ranker( # wrong naming: it should be perfectmatch method="perfect_match", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) exp = experiment.Experiment( dataset="lwm", - data_path="experiments/outputs/data/", + data_path=os.path.join(current_dir,"sample_files/experiments/outputs/data/"), dataset_df=pd.DataFrame(), - results_path="experiments/outputs/results/", + results_path=str(tmp_path), myner="test", myranker=ranker, mylinker="test", @@ -141,32 +148,37 @@ def test_wrong_ranker_method(): exp.prepare_data() assert cm.value.code == 0 - -def test_apply(): +@pytest.mark.skip +def test_apply(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) - pipe=None, # We'll store the NER pipeline here - base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored - training_args=dict(), + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + 
test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, + base_model="khosseini/bert_1760_1900", # Base model to fine-tune + model_path=str(tmp_path), # Path where the NER model will be stored + training_args={ + "batch_size": 8, + "num_train_epochs": 1, + "learning_rate": 0.00005, + "weight_decay": 0.0, + }, overwrite_training=False, # Set to True if you want to overwrite model if existing do_test=False, # Set to True if you want to train on test mode - load_from_hub=False, + load_from_hub=False, ) # Instantiate the ranker: myranker = ranking.Ranker( method="perfectmatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) # -------------------------------------- # Instantiate the linker: mylinker = linking.Linker( method="mostpopular", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) myner.train() @@ -181,9 +193,9 @@ def test_apply(): # Instantiate the experiment: exp = experiment.Experiment( dataset="lwm", - data_path="experiments/outputs/data/", + data_path=os.path.join(current_dir,"sample_files/experiments/outputs/data/"), dataset_df=pd.DataFrame(), - results_path="experiments/outputs/results/", + results_path=str(tmp_path), myner=myner, myranker=myranker, mylinker=mylinker, diff --git a/tests/test_linking.py b/tests/test_linking.py index 3f987ea9..b5b4e2a0 100644 --- a/tests/test_linking.py +++ b/tests/test_linking.py @@ -1,31 +1,18 @@ import os import sqlite3 import sys +from pathlib import Path import numpy as np from t_res.geoparser import linking +current_dir = Path(__file__).parent.resolve() -def test_initialise_method(): - """ - Test initialisation works fine - """ +def test_linking_most_popular(): mylinker = linking.Linker( method="mostpopular", - resources_path="resources/", - linking_resources=dict(), - rel_params=dict(), - overwrite_training=False, - ) - - assert type(mylinker.__str__()) == str - - -def test_most_popular(): - mylinker = linking.Linker( - method="mostpopular", - resources_path="resources/", + resources_path=os.path.join(current_dir,"../resources/"), linking_resources=dict(), rel_params=dict(), overwrite_training=False, @@ -50,7 +37,7 @@ def test_most_popular(): def test_by_distance(): mylinker = linking.Linker( method="bydistance", - resources_path="resources/", + resources_path=os.path.join(current_dir,"../resources/"), linking_resources=dict(), rel_params=dict(), overwrite_training=False, diff --git a/tests/test_ner.py b/tests/test_ner.py index cda0d904..2c941a35 100644 --- a/tests/test_ner.py +++ b/tests/test_ner.py @@ -1,188 +1,128 @@ import os -import shutil -import sys +from pathlib import Path -import transformers +from transformers.pipelines.token_classification import TokenClassificationPipeline from t_res.geoparser import recogniser from t_res.utils import ner +current_dir = Path(__file__).parent.resolve() -def test_training(): - """ - Test that running train() generates a model folder - """ - - test_folder_path = "resources/models/blb_lwm-ner-coarse_test.model" - - if os.path.isdir(test_folder_path): - shutil.rmtree(test_folder_path) - +def test_ner_local_train(tmp_path): + model_path = os.path.join(tmp_path,"ner_test.model") + myner = recogniser.Recogniser( - model="blb_lwm-ner-coarse", # NER model name prefix (will have suffixes appended) - base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) - 
train_dataset="experiments/outputs/data/lwm/ner_coarse_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_coarse_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + model="ner_test", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + base_model="Livingwithmachines/bert_1760_1900", + model_path=f"{tmp_path}/", training_args={ "batch_size": 8, "num_train_epochs": 10, "learning_rate": 0.00005, "weight_decay": 0.0, }, - overwrite_training=True, # Set to True if you want to overwrite model if existing - do_test=True, # Set to True if you want to train on test mode + overwrite_training=False, + do_test=False, load_from_hub=False, ) - assert os.path.isdir(test_folder_path) == False + assert os.path.exists(model_path) is False myner.train() - assert os.path.isdir(test_folder_path) == True - - -def test_create_pipeline(): - """ - Test that create_pipeline returns a model folder path that exists and an Pipeline object - """ - myner = recogniser.Recogniser( - model="blb_lwm-ner-coarse", # NER model name prefix (will have suffixes appended) - base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored - training_args={ - "batch_size": 8, - "num_train_epochs": 10, - "learning_rate": 0.00005, - "weight_decay": 0.0, - }, - overwrite_training=False, # Set to True if you want to overwrite model if existing - do_test=True, # Set to True if you want to train on test mode - load_from_hub=False, - ) - pipe = myner.create_pipeline() - assert ( - type(pipe) - == transformers.pipelines.token_classification.TokenClassificationPipeline - ) + print(model_path) + print(os.listdir(tmp_path)) + assert os.path.exists(model_path) is True def test_ner_predict(): + model_path = os.path.join(current_dir,"sample_files/resources/models/ner_test.model") + assert os.path.isdir(model_path) is True + myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) - base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + model="ner_test", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + base_model="Livingwithmachines/bert_1760_1900", + model_path=os.path.join(current_dir,"sample_files/resources/models/"), training_args={ "batch_size": 8, "num_train_epochs": 10, "learning_rate": 0.00005, "weight_decay": 0.0, }, - overwrite_training=False, # Set to True if you want to overwrite model if existing - do_test=False, # Set to True if you want to train on test mode - 
load_from_hub=False, + overwrite_training=False, + do_test=False, + load_from_hub=False, # Whether the final model should be loaded from the HuggingFace hub" ) myner.pipe = myner.create_pipeline() + assert isinstance(myner.pipe, TokenClassificationPipeline) - preds = myner.ner_predict( - "I grew up in Bologna, a city near Florence, but way more interesting." - ) - assert type(preds) == list - assert (type(preds[0])) == dict - assert len(preds) == 16 - assert preds[4]["entity"] == "B-LOC" - assert preds[4]["score"] == 0.9994915723800659 + sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield." + predictions = myner.ner_predict(sentence) + assert isinstance(predictions, list) + assert len(predictions) == 15 + assert predictions[13] == {'entity': 'B-LOC', 'score': 0.7941257357597351, 'word': 'Sheffield', 'start': 74, 'end': 83} # Test that ner_predict() can handle hyphens - preds = myner.ner_predict("- I grew up in Plymouth—Kingston.") - assert preds[0]["word"] == "-" - assert preds[6]["word"] == "," + sentence = "- I grew up in Plymouth—Kingston." + predictions = myner.ner_predict(sentence) + assert predictions[0]["word"] == "-" + assert predictions[6]["word"] == "," -def test_ner_load_from_hub(): +def test_ner_from_hub(): myner = recogniser.Recogniser( model="Livingwithmachines/toponym-19thC-en", load_from_hub=True, ) - pipe = myner.create_pipeline() - assert ( - type(pipe) - == transformers.pipelines.token_classification.TokenClassificationPipeline - ) + myner.train() + myner.pipe = myner.create_pipeline() + assert isinstance(myner.pipe, TokenClassificationPipeline) + + sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield." + predictions = myner.ner_predict(sentence) + assert isinstance(predictions, list) + assert len(predictions) == 15 + assert predictions[13] == {'entity': 'B-LOC', 'score': 0.9996446371078491, 'word': 'Sheffield', 'start': 74, 'end': 83} def test_aggregate_mentions(): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) - base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored - training_args={ - "batch_size": 8, - "num_train_epochs": 10, - "learning_rate": 0.00005, - "weight_decay": 0.0, - }, - overwrite_training=False, # Set to True if you want to overwrite model if existing - do_test=False, # Set to True if you want to train on test mode - load_from_hub=False, + model="Livingwithmachines/toponym-19thC-en", + load_from_hub=True, ) myner.pipe = myner.create_pipeline() - + sentence = "I grew up in Bologna, a city near Florence, but way more interesting." 
predictions = myner.ner_predict(sentence) # Process predictions: procpreds = [ - [x["word"], x["entity"], "O", x["start"], x["end"], x["score"]] + [x["word"], x["entity"], "O", x["start"], x["end"]] for x in predictions ] # Aggregate mentions: mentions = ner.aggregate_mentions(procpreds, "pred") - assert mentions[0]["mention"] == "Bologna" + assert len(mentions) == 2 assert mentions[1]["mention"] == "Florence" + assert mentions[0] == {'mention': 'Bologna', 'start_offset': 4, 'end_offset': 4, 'start_char': 13, 'end_char': 20, 'ner_score': 20.0, 'ner_label': 'LOC', 'entity_link': 'O'} assert mentions[0]["end_char"] - mentions[0]["start_char"] == len( mentions[0]["mention"] ) - assert mentions[1]["end_char"] - mentions[1]["start_char"] == len( - mentions[1]["mention"] - ) assert mentions[0]["mention"] in sentence - assert mentions[1]["mention"] in sentence - - sentence = "I grew up in New York City, a city in the United States." - predictions = myner.ner_predict(sentence) - # Process predictions: - procpreds = [ - [x["word"], x["entity"], "O", x["start"], x["end"], x["score"]] - for x in predictions - ] - # Aggregate mentions: - mentions = ner.aggregate_mentions(procpreds, "pred") - assert mentions[0]["mention"] == "New York City" - assert mentions[1]["mention"] == "United States" - assert mentions[0]["end_char"] - mentions[0]["start_char"] == len( - mentions[0]["mention"] - ) - assert mentions[1]["end_char"] - mentions[1]["start_char"] == len( - mentions[1]["mention"] - ) - assert mentions[0]["mention"] in sentence - assert mentions[1]["mention"] in sentence sentence = "ARMITAGE, DEM’TIST, may be consulted dally, from 9 a.m., till 8 p.m., at his residence, 95, STAMFORP-9TKEET, Ashton-cnder-Ltne." predictions = myner.ner_predict(sentence) # Process predictions: procpreds = [ - [x["word"], x["entity"], "O", x["start"], x["end"], x["score"]] + [x["word"], x["entity"], "O", x["start"], x["end"]] for x in predictions ] # Aggregate mentions: mentions = ner.aggregate_mentions(procpreds, "pred") - assert mentions[-1]["mention"] == "Ashton-cnder-Ltne" - for i in range(len(mentions)): - assert mentions[i]["end_char"] - mentions[i]["start_char"] == len( - mentions[i]["mention"] + assert len(mentions) == 2 + assert mentions[1]["mention"] == "Ashton-cnder-Ltne" + assert mentions[0] == {'mention': 'STAMFORP-9TKEET', 'start_offset': 31, 'end_offset': 33, 'start_char': 92, 'end_char': 107, 'ner_score': 102.667, 'ner_label': 'STREET', 'entity_link': 'O'} + assert mentions[0]["end_char"] - mentions[0]["start_char"] == len( + mentions[0]["mention"] ) - assert mentions[i]["mention"] in sentence + assert mentions[0]["mention"] in sentence diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 2abee043..274d7267 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,22 +1,57 @@ import os import sqlite3 -import sys from pathlib import Path +import pytest + from t_res.geoparser import linking, pipeline, ranking, recogniser +current_dir = Path(__file__).parent.resolve() + +def test_pipeline_basic(): + geoparser = pipeline.Pipeline( + resources_path=os.path.join(current_dir,"sample_files/resources") + ) + + sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield." 
+ resolved = geoparser.run_text(sentence) + assert len(resolved)==1 + assert resolved[0]["mention"]=="Sheffield" + assert resolved[0]["ner_score"]==1.0 + assert resolved[0]["prediction"]=="Q42448" + +def test_pipeline_modular(): + myranker = ranking.Ranker( + method="perfectmatch", + resources_path=os.path.join(current_dir,"sample_files/resources"), + ) + + mylinker = linking.Linker( + method="mostpopular", + resources_path=os.path.join(current_dir,"sample_files/resources/"), + ) -def test_deezy_mostpopular(): + geoparser = pipeline.Pipeline(myranker=myranker, mylinker=mylinker) + + sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield." + resolved = geoparser.run_text(sentence) + assert len(resolved)==1 + assert resolved[0]["mention"]=="Sheffield" + assert resolved[0]["ner_score"]==1.0 + assert resolved[0]["prediction"]=="Q42448" + +@pytest.mark.deezy(reason="Needs deezy model") +def test_deezy_mostpopular(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "batch_size": 8, - "num_train_epochs": 10, + "num_train_epochs": 1, "learning_rate": 0.00005, "weight_decay": 0.0, }, @@ -27,7 +62,7 @@ def test_deezy_mostpopular(): myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -36,13 +71,13 @@ def test_deezy_mostpopular(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": os.path.join(tmp_path,"resources/models/"), + "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -59,14 +94,16 @@ def test_deezy_mostpopular(): mylinker = linking.Linker( method="mostpopular", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) geoparser = pipeline.Pipeline(myner=myner, myranker=myranker, mylinker=mylinker) + assert len(geoparser.myranker.mentions_to_wikidata.keys())>0 resolved = geoparser.run_text( - "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!", + "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. 
Not in London though.", ) + assert len(resolved) == 3 assert resolved[0]["mention"] == "Shefiield" assert resolved[0]["prior_cand_score"] == dict() assert resolved[0]["cross_cand_score"]["Q42448"] == 0.903 @@ -83,20 +120,19 @@ def test_deezy_mostpopular(): # asserting behaviour with • character resolved = geoparser.run_text( - " • - ST G pOllO-P• FERRIS - • - , i ", + " • - S G pOllO-P• FERRIS - • - , i ", ) - assert resolved == [] - -def test_deezy_rel_wpubl_wmtops(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_deezy_rel_wpubl_wmtops(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "batch_size": 8, "num_train_epochs": 10, @@ -112,7 +148,7 @@ def test_deezy_rel_wpubl_wmtops(): # Instantiate the ranker: myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -121,13 +157,13 @@ def test_deezy_rel_wpubl_wmtops(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": str(tmp_path), + "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -142,15 +178,15 @@ def test_deezy_rel_wpubl_wmtops(): }, ) - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "resources/models/disambiguation/", - "data_path": "experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/"), "training_split": "originalsplit", "db_embeddings": cursor, "with_publication": True, @@ -165,11 +201,12 @@ def test_deezy_rel_wpubl_wmtops(): geoparser = pipeline.Pipeline(myner=myner, myranker=myranker, mylinker=mylinker) resolved = geoparser.run_text( - "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Lancaster. Not in Nottingham though. 
Not in Ashton either, nor in Salop!", + "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. Not in London though.", place="Sheffield", place_wqid="Q42448", ) + assert len(resolved) == 3 assert resolved[0]["mention"] == "Shefiield" assert resolved[0]["prior_cand_score"]["Q42448"] == 0.891 assert resolved[0]["cross_cand_score"]["Q42448"] == 0.576 @@ -177,18 +214,18 @@ def test_deezy_rel_wpubl_wmtops(): assert resolved[0]["ed_score"] == 0.039 assert resolved[0]["ner_score"] == 1.0 - -def test_perfect_rel_wpubl_wmtops(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_perfect_rel_wpubl_wmtops(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "batch_size": 8, - "num_train_epochs": 10, + "num_train_epochs": 1, "learning_rate": 0.00005, "weight_decay": 0.0, }, @@ -201,7 +238,7 @@ def test_perfect_rel_wpubl_wmtops(): # Instantiate the ranker: myranker = ranking.Ranker( method="perfectmatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -210,13 +247,13 @@ def test_perfect_rel_wpubl_wmtops(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": str(tmp_path), + "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -231,20 +268,20 @@ def test_perfect_rel_wpubl_wmtops(): }, ) - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "resources/models/disambiguation/", - "data_path": "experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/"), "training_split": "originalsplit", "db_embeddings": cursor, "with_publication": True, "without_microtoponyms": True, - "do_test": True, + "do_test": False, "default_publname": "United Kingdom", "default_publwqid": "Q145", }, @@ -254,7 +291,7 @@ def 
test_perfect_rel_wpubl_wmtops(): geoparser = pipeline.Pipeline(myner=myner, myranker=myranker, mylinker=mylinker) resolved = geoparser.run_text( - "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!", + "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. Not in London though.", place="Sheffield", place_wqid="Q42448", ) @@ -266,18 +303,18 @@ def test_perfect_rel_wpubl_wmtops(): assert resolved[0]["ed_score"] == 0.0 assert resolved[0]["ner_score"] == 1.0 - -def test_modular_deezy_rel(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_modular_deezy_rel(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "batch_size": 8, - "num_train_epochs": 10, + "num_train_epochs": 1, "learning_rate": 0.00005, "weight_decay": 0.0, }, @@ -286,22 +323,26 @@ def test_modular_deezy_rel(): load_from_hub=False, # Bool: True if model is in HuggingFace hub ) + # -------------------------------------- + # Instantiate the ranker: myranker = ranking.Ranker( method="deezymatch", - resources_path="./resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), + mentions_to_wikidata=dict(), + wikidata_to_mentions=dict(), strvar_parameters={ # Parameters to create the string pair dataset: "ocr_threshold": 60, "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("./resources/models/w2v/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": str(tmp_path), + "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("./resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -316,15 +357,15 @@ def test_modular_deezy_rel(): }, ) - with sqlite3.connect("./resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="./resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "./resources/models/disambiguation/", - "data_path": "./experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/"), "training_split": "apply", "db_embeddings": cursor, 
"with_publication": True, diff --git a/tests/test_process_data.py b/tests/test_process_data.py index 80887621..3d02cb6a 100644 --- a/tests/test_process_data.py +++ b/tests/test_process_data.py @@ -1,5 +1,6 @@ import os import sys +from pathlib import Path import pandas as pd import pytest @@ -7,6 +8,7 @@ from t_res.geoparser import recogniser from t_res.utils import process_data +current_dir = Path(__file__).parent.resolve() def test_eval_with_exception(): # test normal behaviour @@ -15,10 +17,8 @@ def test_eval_with_exception(): list_of_dict = process_data.eval_with_exception(str_list_of_dict) assert list_of_dict != str_list_of_dict - - assert type(list_of_dict) == list - - assert type(list_of_dict[0]) == dict + assert isinstance(list_of_dict,list) + assert isinstance(list_of_dict[0],dict) # test that it returns "" if the input is None @@ -29,17 +29,15 @@ def test_eval_with_exception(): # test that it raises an error if the syntax is wrong str_list_of_dict = "[{'key_1': 1, 'key_2': 2}" - check = False + with pytest.raises(SyntaxError) as cm: - check = True process_data.eval_with_exception(str_list_of_dict) - assert check == True def test_prepare_sents(): dataset_df = pd.read_csv( - "experiments/outputs/data/lwm/linking_df_split.tsv", + os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/linking_df_split.tsv"), sep="\t", ) @@ -50,7 +48,7 @@ def test_prepare_sents(): dAnnotated, dSentences, dMetadata = process_data.prepare_sents(dataset_df) - assert dAnnotated["4428937_4"][(26, 41)] == ("LOC", "Bt. Jamess Park", "Q216914") + assert dAnnotated["3580760_2"][(0, 6)] == ('LOC', 'LONDON', 'Q84') test_data = process_data.eval_with_exception(dataset_df["annotations"][0]) test_data[0]["wkdt_qid"] = "*" @@ -59,7 +57,7 @@ def test_prepare_sents(): dAnnotated, dSentences, dMetadata = process_data.prepare_sents(dataset_df) - assert dAnnotated["4428937_4"][(26, 41)] == ("LOC", "Bt. 
Jamess Park", "Q216914") + assert dAnnotated["3580760_2"][(0, 6)] == ('LOC', 'LONDON', 'Q84') assert len(dAnnotated) == len(dSentences) == len(dMetadata) @@ -67,35 +65,37 @@ def test_prepare_sents(): assert len([x for x, y in dMetadata.items() if len(y) == 0]) == 0 -def test_align_gold(): +def test_align_gold(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "learning_rate": 5e-5, "batch_size": 16, - "num_train_epochs": 4, + "num_train_epochs": 1, "weight_decay": 0.01, }, overwrite_training=False, # Set to True if you want to overwrite model if existing do_test=False, # Set to True if you want to train on test mode + load_from_hub=False, # Bool: True if model is in HuggingFace hub ) + myner.train() myner.pipe = myner.create_pipeline() dataset_df = pd.read_csv( - "experiments/outputs/data/lwm/linking_df_split.tsv", + os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/linking_df_split.tsv"), sep="\t", ) dAnnotated, dSentences, dMetadata = process_data.prepare_sents(dataset_df) empty_list = [] for sent_id in dSentences.keys(): - if "4935585_1" == sent_id: + if "3580760_2" == sent_id: sent = dSentences[sent_id] annotations = dAnnotated[sent_id] predictions = myner.ner_predict(sent) @@ -122,27 +122,30 @@ def test_align_gold(): assert len(empty_list) == 0 -def test_ner_and_process(): +def test_ner_and_process(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "learning_rate": 5e-5, "batch_size": 16, - "num_train_epochs": 4, + "num_train_epochs": 1, "weight_decay": 0.01, }, overwrite_training=False, # Set to True if you want to overwrite model if existing do_test=False, # Set to True if you want to train on test mode + load_from_hub=False, # Bool: True if model is in HuggingFace hub ) + + myner.train() myner.pipe = myner.create_pipeline() dataset_df = pd.read_csv( - "experiments/outputs/data/lwm/linking_df_split.tsv", + 
os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/linking_df_split.tsv"), sep="\t", ) diff --git a/tests/test_ranking.py b/tests/test_ranking.py index 852d38bb..98053704 100644 --- a/tests/test_ranking.py +++ b/tests/test_ranking.py @@ -1,38 +1,24 @@ -import json import os -import sys from pathlib import Path import pytest -from DeezyMatch import candidate_ranker from t_res.geoparser import ranking +current_dir = Path(__file__).parent.resolve() -def test_initialise_method(): - """ - Test initialisation works fine - """ - myranker = ranking.Ranker( - method="perfectmatch", - resources_path="resources/", - mentions_to_wikidata=dict(), - wikidata_to_mentions=dict(), - ) - assert type(myranker.__str__()) == str - - -def test_perfect_match(): +def test_ranking_perfect_match(): """ Test that perfect_match returns only perfect matching cases """ myranker = ranking.Ranker( method="perfectmatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) myranker.load_resources() + candidates = myranker.perfect_match(["London"]) - assert candidates["London"]["London"] == 1.0 + assert candidates["London"] == {'London': 1.0} candidates = myranker.perfect_match(["Lvndon"]) assert candidates["Lvndon"] == {} @@ -41,63 +27,55 @@ def test_perfect_match(): assert candidates["Paperopoli"] == {} -def test_damlev(): +def test_ranking_damlev(): """ Test that damlev returns correctly """ myranker = ranking.Ranker( method="partialmatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) score = myranker.damlev_dist("Lvndon", {"mentions": "London"}) assert score == 0.8333333283662796 + score = myranker.damlev_dist("uityity", {"mentions": "asdasd"}) + assert score == 0.0 + with pytest.raises(TypeError): - found = True myranker.damlev_dist("Lvndon", "London") - assert found == True - - assert 0.0 == myranker.damlev_dist("uityity", {"mentions": "asdasd"}) -def test_check_if_contained(): +def test_ranking_check_if_contained(): """ Test that check_if_contained returns score only when there is an overlap """ myranker = ranking.Ranker( method="partialmatch", - resources_path="resources/", - mentions_to_wikidata=dict(), - wikidata_to_mentions=dict(), + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) + score_a = myranker.check_if_contained("New York", {"mentions": "New York City"}) score_b = myranker.check_if_contained("New York City", {"mentions": "New York"}) - assert score_a == score_b == 0.6153846153846154 with pytest.raises(TypeError): - found = True myranker.check_if_contained("Lvndon", "London") - assert found == True - assert None == myranker.check_if_contained("London", {"mentions": "New York"}) + score = myranker.check_if_contained("London", {"mentions": "New York"}) + assert score is None -def test_partial_match(): +def test_ranking_partial_match(): """ Test that partial match either returns results or {} """ myranker = ranking.Ranker( method="partialmatch", - resources_path="resources/", - mentions_to_wikidata=dict(), - wikidata_to_mentions=dict(), + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) - myranker.load_resources() - # Test that perfect_match acts before partial match myranker.mentions_to_wikidata = {"London": "Q84"} candidates = myranker.partial_match(["London"], damlev=False) @@ -105,33 +83,28 @@ def test_partial_match(): # Test that damlev works myranker.already_collected_cands = {} - candidates = myranker.partial_match(["Lvndvn"], 
damlev=True) assert candidates["Lvndvn"]["London"] == 0.6666666567325592 # Test that overlap works properly myranker.mentions_to_wikidata = {"New York City": "Q60"} myranker.already_collected_cands = {} - candidates = myranker.partial_match(["New York"], damlev=False) assert candidates["New York"]["New York City"] == 0.6153846153846154 - myranker.mentions_to_wikidata = {"New York City": "Q60"} myranker.already_collected_cands = {} - candidates = myranker.partial_match(["Lvndvn"], damlev=False) assert candidates["Lvndvn"] == {} myranker.already_collected_cands = {} - candidates = myranker.partial_match(["asdasd"], damlev=True) assert candidates["asdasd"] == {"New York City": 0.0} - -def test_deezy_on_the_fly(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_ranking_deezy_on_the_fly(tmp_path): myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -140,13 +113,13 @@ def test_deezy_on_the_fly(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": "resources/models/", + "w2v_ocr_path": str(tmp_path), "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": "resources/deezymatch/", + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -158,7 +131,7 @@ def test_deezy_on_the_fly(): "verbose": False, # DeezyMatch training: "overwrite_training": False, - "do_test": True, + "do_test": False, }, ) @@ -169,18 +142,14 @@ def test_deezy_on_the_fly(): # Test that deezy works myranker.already_collected_cands = {} - candidates = myranker.deezy_on_the_fly(["Ashton-cnderLyne"]) - assert ( - candidates["Ashton-cnderLyne"]["Ashton-under-Lyne"] > 0.0 - and candidates["Ashton-cnderLyne"]["Ashton-under-Lyne"] < 1.0 - ) + assert (0.0 < candidates["Ashton-cnderLyne"]["Ashton-under-Lyne"] < 1.0) - -def test_find_candidates(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_ranking_find_candidates(tmp_path): myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -189,13 +158,13 @@ def test_find_candidates(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": "resources/models/", + "w2v_ocr_path": str(tmp_path), "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": "resources/deezymatch/", + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -219,12 +188,8 @@ def test_find_candidates(): # Test that deezy works myranker.already_collected_cands = {} - candidates = myranker.find_candidates([{"mention": "Sheftield"}]) - assert ( - candidates["Sheftield"]["Sheffield"]["Score"] > 0.0 - and candidates["Sheftield"]["Sheffield"]["Score"] < 1.0 - ) + assert (0.0 < candidates["Sheftield"]["Sheffield"]["Score"] < 1.0) assert "Q42448" in candidates["Sheftield"]["Sheffield"]["Candidates"] # Test that Perfect Match works @@ -237,7 +202,6 @@ def test_find_candidates(): assert "Q42448" in 
candidates["Sheffield"]["Sheffield"]["Candidates"] myranker.already_collected_cands = {} - candidates = myranker.find_candidates([{"mention": "Sheftield"}]) assert candidates["Sheftield"] == {} @@ -246,13 +210,11 @@ def test_find_candidates(): # Test that perfect_match acts before partialmatch myranker.load_resources() - candidates = myranker.find_candidates([{"mention": "Sheffield"}]) assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0 assert "Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"] myranker.already_collected_cands = {} - candidates = myranker.find_candidates([{"mention": "Sheftield"}]) assert "Sheffield" not in candidates["Sheftield"] @@ -261,59 +223,11 @@ def test_find_candidates(): # Test that perfect_match acts before partialmatch myranker.load_resources() - candidates = myranker.find_candidates([{"mention": "Sheffield"}]) assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0 assert "Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"] myranker.already_collected_cands = {} - candidates = myranker.find_candidates([{"mention": "Sheftield"}]) - assert ( - candidates["Sheftield"]["Sheffield"]["Score"] > 0.0 - and candidates["Sheftield"]["Sheffield"]["Score"] < 1.0 - ) + assert (0.0 < candidates["Sheftield"]["Sheffield"]["Score"] < 1.0) assert "Q42448" in candidates["Sheftield"]["Sheffield"]["Candidates"] - - -def test_deezy_candidate_ranker(): - deezy_parameters = { - # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), - "dm_cands": "wkdtalts", - "dm_model": "w2v_ocr", - "dm_output": "deezymatch_on_the_fly", - # Ranking measures: - "ranking_metric": "faiss", - "selection_threshold": 50, - "num_candidates": 1, - "verbose": False, - # DeezyMatch training: - "overwrite_training": False, - "do_test": False, - } - - dm_path = deezy_parameters["dm_path"] - dm_cands = deezy_parameters["dm_cands"] - dm_model = deezy_parameters["dm_model"] - dm_output = deezy_parameters["dm_output"] - - query = ["-", "ST G", "• - , i", "- P", "• FERRIS"] - - candidates = candidate_ranker( - candidate_scenario=os.path.join(dm_path, "combined", dm_cands + "_" + dm_model), - query=query, - ranking_metric=deezy_parameters["ranking_metric"], - selection_threshold=deezy_parameters["selection_threshold"], - num_candidates=deezy_parameters["num_candidates"], - search_size=deezy_parameters["num_candidates"], - verbose=deezy_parameters["verbose"], - output_path=os.path.join(dm_path, "ranking", dm_output), - pretrained_model_path=os.path.join( - f"{dm_path}", "models", f"{dm_model}", f"{dm_model}" + ".model" - ), - pretrained_vocab_path=os.path.join( - f"{dm_path}", "models", f"{dm_model}", f"{dm_model}" + ".vocab" - ), - ) - assert len(candidates) == len(query)