
Commit 4e891a8

Authored by Jiaqiang-Ruan, hunterhector, and jasonyanwenl
Support prediction for NER new task. (#331)
* RJQ: [ner_new_task] predict process for ner task
* apply cherry-pick yanwen & RJQ: [main-train-tagging] move prediction to folder tagging

Co-authored-by: Hector <hunterhector@gmail.com>
Co-authored-by: Yanwen Lin <lyw1124278064@gmail.com>
1 parent 1e3ba68 commit 4e891a8

File tree

3 files changed (+200 -0 lines)
@@ -0,0 +1,4 @@
test_path: "data/conll03_english/test"
model_path: "best_crf_model.ckpt"
train_state_path: "train_state.pkl"
batch_size: 10
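
For orientation, here is a minimal sketch (not part of the commit) of how these four settings are consumed; it simply mirrors the loading code in the prediction script later in this diff, which opens the file as configs/config_predict.yml relative to the examples/tagging directory.

import yaml
import torch

# Read the settings listed above; the relative path follows the
# open("configs/config_predict.yml") call in the prediction script below.
with open("configs/config_predict.yml", "r") as f:
    config_predict = yaml.safe_load(f)

print(config_predict["test_path"])    # "data/conll03_english/test"
print(config_predict["batch_size"])   # 10

# The two checkpoint paths are then handed to torch.load, as the script does.
saved_model = torch.load(config_predict["model_path"])
train_state = torch.load(config_predict["train_state_path"])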

examples/tagging/evaluator.py (+124 lines)
@@ -0,0 +1,124 @@
# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable-msg=too-many-locals
"""Evaluator for Conll03 NER tag."""
import os
from pathlib import Path
from forte.data.base_pack import PackType
from forte.evaluation.base import Evaluator
from forte.data.extractor.utils import bio_tagging
from ft.onto.base_ontology import Sentence, Token, EntityMention


def _post_edit(element):
    # bio_tagging yields (entry, prefix) pairs; map them to "O" or to
    # "<prefix>-<ner_type>" strings such as "B-PER".
    if element[0] is None:
        return "O"
    return "%s-%s" % (element[1], element[0].ner_type)


def _get_tag(data, pack):
    # Rebuild BIO tags for one sentence from its Token and EntityMention ids.
    based_on = [pack.get_entry(x) for x in data["Token"]["tid"]]
    entry = [pack.get_entry(x) for x in data["EntityMention"]["tid"]]
    tag = bio_tagging(based_on, entry)
    tag = [_post_edit(x) for x in tag]
    return tag


def _write_tokens_to_file(pred_pack, pred_request,
                          refer_pack, refer_request,
                          output_filename):
    # One token per line: index, word, POS, chunk, gold tag, predicted tag;
    # sentences are separated by a blank line. This is the column layout
    # consumed by the conll03eval.v2 script below.
    opened_file = open(output_filename, "w+")
    for pred_data, refer_data in zip(
            pred_pack.get_data(**pred_request),
            refer_pack.get_data(**refer_request)):
        pred_tag = _get_tag(pred_data, pred_pack)
        refer_tag = _get_tag(refer_data, refer_pack)
        words = refer_data["Token"]["text"]
        pos = refer_data["Token"]["pos"]
        chunk = refer_data["Token"]["chunk"]

        for i, (word, position, chun, tgt, pred) in \
                enumerate(zip(words, pos, chunk, refer_tag, pred_tag), 1):
            opened_file.write(
                "%d %s %s %s %s %s\n" % (i, word, position, chun, tgt, pred)
            )
        opened_file.write("\n")
    opened_file.close()


class CoNLLNEREvaluator(Evaluator):
    """Evaluator for Conll NER task."""
    def __init__(self):
        super().__init__()
        # self.test_component = CoNLLNERPredictor().name
        self.output_file = "tmp_eval.txt"
        self.score_file = "tmp_eval.score"
        self.scores = {}

    def consume_next(self, pred_pack: PackType, ref_pack: PackType):
        pred_getdata_args = {
            "context_type": Sentence,
            "request": {
                Token: {
                    "fields": ["chunk", "pos"]
                },
                EntityMention: {
                    "fields": ["ner_type"],
                },
                Sentence: [],  # span by default
            }
        }

        refer_getdata_args = {
            "context_type": Sentence,
            "request": {
                Token: {
                    "fields": ["chunk", "pos", "ner"]
                },
                EntityMention: {
                    "fields": ["ner_type"],
                },
                Sentence: [],  # span by default
            }
        }

        _write_tokens_to_file(pred_pack=pred_pack,
                              pred_request=pred_getdata_args,
                              refer_pack=ref_pack,
                              refer_request=refer_getdata_args,
                              output_filename=self.output_file)
        # Run the official CoNLL-2003 evaluation script and redirect its
        # report to the score file.
        eval_script = \
            Path(os.path.abspath(__file__)).parents[2] / \
            "forte/utils/eval_scripts/conll03eval.v2"
        os.system(f"perl {eval_script} < {self.output_file} > "
                  f"{self.score_file}")
        # The second line of the report has the form:
        #   accuracy:  XX.XX%; precision:  XX.XX%; recall:  XX.XX%; FB1:  XX.XX
        with open(self.score_file, "r") as fin:
            fin.readline()
            line = fin.readline()
            fields = line.split(";")
            acc = float(fields[0].split(":")[1].strip()[:-1])
            precision = float(fields[1].split(":")[1].strip()[:-1])
            recall = float(fields[2].split(":")[1].strip()[:-1])
            f_1 = float(fields[3].split(":")[1].strip())

            self.scores = {
                "accuracy": acc,
                "precision": precision,
                "recall": recall,
                "f1": f_1,
            }

    def get_result(self):
        return self.scores
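
To make the score parsing in consume_next concrete, here is a small self-contained sketch (not part of the commit) that applies the same string handling to an illustrative conll03eval.v2 report line; the numbers are invented.

# Illustrative second line of a conll03eval.v2 report (values are made up).
sample_line = "accuracy:  97.84%; precision:  90.12%; recall:  89.45%; FB1:  89.78"

fields = sample_line.split(";")
scores = {
    "accuracy": float(fields[0].split(":")[1].strip()[:-1]),   # drop trailing '%'
    "precision": float(fields[1].split(":")[1].strip()[:-1]),
    "recall": float(fields[2].split(":")[1].strip()[:-1]),
    "f1": float(fields[3].split(":")[1].strip()),               # FB1 carries no '%'
}
print(scores)  # {'accuracy': 97.84, 'precision': 90.12, 'recall': 89.45, 'f1': 89.78}
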
+72 lines
@@ -0,0 +1,72 @@
# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file predicts the NER tags for the conll03 dataset."""
import sys
import yaml
import torch
from forte.pipeline import Pipeline
from forte.data.readers.conll03_reader_new import CoNLL03Reader
from forte.predictor import Predictor
from ft.onto.base_ontology import Sentence, EntityMention, Token
from examples.tagging.evaluator import CoNLLNEREvaluator


def predict_forward_fn(model, batch):
    """Use the model and batch data to predict NER tags."""
    word = batch["text_tag"]["data"]
    char = batch["char_tag"]["data"]
    word_masks = batch["text_tag"]["masks"][0]
    output = model.decode(input_word=word, input_char=char, mask=word_masks)
    output = output.numpy()
    return {"output_tag": output}


# The task ("ner" or "pos") is passed as the first command-line argument.
task = sys.argv[1]
assert task in ["ner", "pos"], \
    "Unsupported nlp task type: {}".format(task)

# Load the prediction settings, the trained model, and the saved train state.
config_predict = yaml.safe_load(open("configs/config_predict.yml", "r"))
saved_model = torch.load(config_predict["model_path"])
train_state = torch.load(config_predict["train_state_path"])

reader = CoNLL03Reader()
predictor = Predictor(batch_size=config_predict["batch_size"],
                      model=saved_model,
                      predict_forward_fn=predict_forward_fn,
                      feature_resource=train_state["feature_resource"])
evaluator = CoNLLNEREvaluator()


# Assemble the pipeline: read the test data, predict, then evaluate.
pl = Pipeline()
pl.set_reader(reader)
pl.add(predictor)
pl.add(evaluator)
pl.initialize()


# Print the predicted tags per sentence, then the overall evaluation scores.
for pack in pl.process_dataset(config_predict["test_path"]):
    print("---- pack ----")
    for instance in pack.get(Sentence):
        sent = instance.text
        output_tags = []
        if task == "ner":
            for entry in pack.get(EntityMention, instance):
                output_tags.append((entry.text, entry.ner_type))
        else:
            for entry in pack.get(Token, instance):
                output_tags.append((entry.text, entry.pos))
        print("---- example -----")
        print("sentence: ", sent)
        print("output_tags: ", output_tags)
print(evaluator.get_result())
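
This diff does not show the script's file name. Assuming it is saved under examples/tagging as, say, main_predict_tagging.py (a hypothetical name), it would be run from that directory so the relative paths in the prediction config resolve, roughly as:

cd examples/tagging
python main_predict_tagging.py ner    # hypothetical file name; pass "pos" for POS tags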
