From 8b666512c429c151257393b1c5490cfd12e8df20 Mon Sep 17 00:00:00 2001 From: mcollardanuy Date: Thu, 27 Jul 2023 12:53:13 +0000 Subject: [PATCH] Change HF base and ner model, and pipeline default NER --- examples/load_use_ner_model.ipynb | 21 ++------ examples/train_use_ner_model.ipynb | 14 ++--- experiments/toponym_resolution.py | 4 +- geoparser/pipeline.py | 86 +++++++++++++----------------- tests/test_ner.py | 22 ++------ 5 files changed, 54 insertions(+), 93 deletions(-) diff --git a/examples/load_use_ner_model.ipynb b/examples/load_use_ner_model.ipynb index a35d5c5d..750dd06d 100644 --- a/examples/load_use_ner_model.ipynb +++ b/examples/load_use_ner_model.ipynb @@ -32,7 +32,7 @@ "source": [ "Create a `myner` object of the `Recogniser` class.\n", "\n", - "We only need to pass the path to the model in `model` and set `load_from_hum` to True, as follows:" + "We only need to pass the path to the model in `model` and set `load_from_hub` to True, as follows:" ] }, { @@ -42,21 +42,8 @@ "outputs": [], "source": [ "myner = recogniser.Recogniser(\n", - " model=\"blb_lwm-ner-fine\",\n", - " pipe=None,\n", - " base_model=\"khosseini/bert_1760_1900\",\n", - " train_dataset=\"../experiments/outputs/data/lwm/ner_fine_train.json\",\n", - " test_dataset=\"../experiments/outputs/data/lwm/ner_fine_dev.json\",\n", - " model_path=\"../resources/models/\",\n", - " training_args={\n", - " \"learning_rate\": 5e-5,\n", - " \"batch_size\": 16,\n", - " \"num_train_epochs\": 4,\n", - " \"weight_decay\": 0.01,\n", - " },\n", - " overwrite_training=False,\n", - " do_test=False,\n", - " load_from_hub=False,\n", + " model=\"Livingwithmachines/toponym-19thC-en\",\n", + " load_from_hub=True,\n", ")" ] }, @@ -128,7 +115,7 @@ "sentence = \"A remarkable case of rattening has just occurred in the building trade at Sheffield.\"\n", "\n", "predictions = myner.ner_predict(sentence)\n", - "print(predictions) # Note that, if you've trained the model in the test mode, the model will probably not identify \"Sheffield\" as a location." + "print(predictions)" ] } ], diff --git a/examples/train_use_ner_model.ipynb b/examples/train_use_ner_model.ipynb index eb6930db..8bc8ea09 100644 --- a/examples/train_use_ner_model.ipynb +++ b/examples/train_use_ner_model.ipynb @@ -50,21 +50,21 @@ " train_dataset=\"../experiments/outputs/data/lwm/ner_fine_train.json\", # Path to the json file containing the training set (see note above).\n", " test_dataset=\"../experiments/outputs/data/lwm/ner_fine_dev.json\", # Path to the json file containing the test set (see note above).\n", " pipe=None, # We'll store the NER pipeline here, leave this empty.\n", - " base_model=\"khosseini/bert_1760_1900\", # Base model to fine-tune for NER. The value can be: either \n", + " base_model=\"Livingwithmachines/bert_1760_1900\", # Base model to fine-tune for NER. The value can be: either \n", " # your local path to a model or the huggingface path.\n", " # In this case, we use the huggingface path:\n", - " # https://huggingface.co/khosseini/bert_1760_1900). You can\n", + " # https://huggingface.co/Livingwithmachines/bert_1760_1900). You can\n", " # chose any other model from the HuggingFace hub, as long as it's\n", " # trained on the \"Fill-Mask\" objective (filter by task).\n", " model_path=\"../resources/models/\", # Path where the NER model will be stored.\n", " training_args={\n", - " \"learning_rate\": 5e-5,\n", - " \"batch_size\": 16,\n", - " \"num_train_epochs\": 4,\n", - " \"weight_decay\": 0.01,\n", + " \"batch_size\": 8,\n", + " \"num_train_epochs\": 10,\n", + " \"learning_rate\": 0.00005,\n", + " \"weight_decay\": 0.0,\n", " }, # Training arguments: you can change them.\n", " overwrite_training=False, # Set to True if you want to overwrite an existing model with the same name.\n", - " do_test=False, # Set to True if you want to perform the training on test mode (the string \"_test\" will be appended to your model name).\n", + " do_test=True, # Set to True if you want to perform the training on test mode (the string \"_test\" will be appended to your model name).\n", " load_from_hub=False, # Whether the model should be loaded from the HuggingFace hub\n", ")" ] diff --git a/experiments/toponym_resolution.py b/experiments/toponym_resolution.py index 42ad8c67..0fb55bd8 100644 --- a/experiments/toponym_resolution.py +++ b/experiments/toponym_resolution.py @@ -59,10 +59,10 @@ + granularity + "_dev.json", # Path to the json file containing the test set (see note above). pipe=None, # We'll store the NER pipeline here, leave this empty. - base_model="khosseini/bert_1760_1900", # Base model to fine-tune for NER. The value can be: either + base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune for NER. The value can be: either # your local path to a model or the huggingface path. # In this case, we use the huggingface path: - # https://huggingface.co/khosseini/bert_1760_1900). You can + # https://huggingface.co/Livingwithmachines/bert_1760_1900). You can # chose any other model from the HuggingFace hub, as long as it's # trained on the "Fill-Mask" objective (filter by task). model_path="../resources/models/", # Path where the NER model will be stored. diff --git a/geoparser/pipeline.py b/geoparser/pipeline.py index 4c661214..dc0d095d 100644 --- a/geoparser/pipeline.py +++ b/geoparser/pipeline.py @@ -47,21 +47,8 @@ class Pipeline: .. code-block:: python recogniser.Recogniser( - model="blb_lwm-ner-fine", - pipe=None, - base_model="khosseini/bert_1760_1900", - train_dataset="../experiments/outputs/data/lwm/ner_fine_train.json", - test_dataset="../experiments/outputs/data/lwm/ner_fine_dev.json", - model_path="../resources/models/", - training_args={ - "learning_rate": 5e-5, - "batch_size": 16, - "num_train_epochs": 4, - "weight_decay": 0.01, - }, - overwrite_training=False, - do_test=False, - load_from_hub=False, + model="Livingwithmachines/toponym-19thC-en", + load_from_hub=True, ) * The default settings for the ``Ranker``: @@ -99,23 +86,9 @@ def __init__( # If myner is None, instantiate the default Recogniser. if not self.myner: - dataset_path = "../experiments/outputs/data/lwm" self.myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", - pipe=None, - base_model="khosseini/bert_1760_1900", - train_dataset=f"{dataset_path}/ner_fine_train.json", - test_dataset=f"{dataset_path}/ner_fine_dev.json", - model_path="../resources/models/", - training_args={ - "learning_rate": 5e-5, - "batch_size": 16, - "num_train_epochs": 4, - "weight_decay": 0.01, - }, - overwrite_training=False, - do_test=False, - load_from_hub=False, + model="Livingwithmachines/toponym-19thC-en", + load_from_hub=True, ) # If myranker is None, instantiate the default Ranker. @@ -257,7 +230,15 @@ def run_sentence( mentions_dataset = dict() mentions_dataset["linking"] = [] for m in mentions: - prediction = self.format_prediction(m, sentence, wk_cands=wk_cands, context=context, sent_idx=sent_idx, place=place, place_wqid=place_wqid) + prediction = self.format_prediction( + m, + sentence, + wk_cands=wk_cands, + context=context, + sent_idx=sent_idx, + place=place, + place_wqid=place_wqid, + ) mentions_dataset["linking"].append(prediction) # If the linking method is "reldisamb", rank and format candidates, @@ -508,11 +489,7 @@ def run_text( return document_dataset - - def run_sentence_recognition( - self, - sentence - ) -> List[dict]: + def run_sentence_recognition(self, sentence) -> List[dict]: # Get predictions: predictions = self.myner.ner_predict(sentence) @@ -525,15 +502,16 @@ def run_sentence_recognition( # Aggregate mentions: mentions = ner.aggregate_mentions(procpreds, "pred") return mentions - - - def format_prediction(self, mention, - sentence: str, - wk_cands: Optional[dict] = None, - context: Optional[Tuple[str, str]] = ("", ""), - sent_idx: Optional[int] = 0, - place: Optional[str] = "", - place_wqid: Optional[str] = "" + + def format_prediction( + self, + mention, + sentence: str, + wk_cands: Optional[dict] = None, + context: Optional[Tuple[str, str]] = ("", ""), + sent_idx: Optional[int] = 0, + place: Optional[str] = "", + place_wqid: Optional[str] = "", ) -> dict: prediction = dict() prediction["mention"] = mention["mention"] @@ -551,12 +529,12 @@ def format_prediction(self, mention, prediction["place"] = place prediction["place_wqid"] = place_wqid if wk_cands: - prediction["string_match_candidates"] = wk_cands.get(mention["mention"], dict()) + prediction["string_match_candidates"] = wk_cands.get( + mention["mention"], dict() + ) prediction["candidates"] = wk_cands.get(mention["mention"], dict()) return prediction - - def run_text_recognition( self, text: str, @@ -627,7 +605,15 @@ def run_text_recognition( mentions_dataset = [] for m in mentions: - prediction = self.format_prediction(m, sentence, wk_cands=None, context=context, sent_idx=idx, place=place, place_wqid=place_wqid) + prediction = self.format_prediction( + m, + sentence, + wk_cands=None, + context=context, + sent_idx=idx, + place=place, + place_wqid=place_wqid, + ) # mentions_dataset["linking"].append(prediction) if not len(m["mention"]) == 1 and not m["mention"].islower(): mentions_dataset.append(prediction) diff --git a/tests/test_ner.py b/tests/test_ner.py index 0bc838ab..b8a9718c 100644 --- a/tests/test_ner.py +++ b/tests/test_ner.py @@ -22,8 +22,7 @@ def test_training(): myner = recogniser.Recogniser( model="blb_lwm-ner-coarse", # NER model name prefix (will have suffixes appended) - pipe=None, # We'll store the NER pipeline here - base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) + base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) train_dataset="experiments/outputs/data/lwm/ner_coarse_train.json", # Training set (part of overall training set) test_dataset="experiments/outputs/data/lwm/ner_coarse_dev.json", # Test set (part of overall training set) model_path="resources/models/", # Path where the NER model is or will be stored @@ -48,8 +47,7 @@ def test_create_pipeline(): """ myner = recogniser.Recogniser( model="blb_lwm-ner-coarse", # NER model name prefix (will have suffixes appended) - pipe=None, # We'll store the NER pipeline here - base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) + base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) model_path="resources/models/", # Path where the NER model is or will be stored @@ -73,8 +71,7 @@ def test_create_pipeline(): def test_ner_predict(): myner = recogniser.Recogniser( model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) - pipe=None, # We'll store the NER pipeline here - base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) + base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) model_path="resources/models/", # Path where the NER model is or will be stored @@ -107,15 +104,7 @@ def test_ner_predict(): def test_ner_load_from_hub(): myner = recogniser.Recogniser( - model="dslim/bert-base-NER", # Test loading from huggingface hub - pipe=None, # We'll store the NER pipeline here - base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored - training_args=dict(), - overwrite_training=False, # Set to True if you want to overwrite model if existing - do_test=False, # Set to True if you want to train on test mode + model="Livingwithmachines/toponym-19thC-en", load_from_hub=True, ) pipe = myner.create_pipeline() @@ -128,8 +117,7 @@ def test_ner_load_from_hub(): def test_aggregate_mentions(): myner = recogniser.Recogniser( model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) - pipe=None, # We'll store the NER pipeline here - base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) + base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) model_path="resources/models/", # Path where the NER model is or will be stored