Skip to content

Commit 2a8ded4

Browse files
committed
Sindhi NER conversion for SiNER
Updates a few broken tags from the Sindhi NER dataset. Includes notes on how to get SiNER from the GitHub repo.
1 parent 6a3a024 commit 2a8ded4

File tree

2 files changed

+95
-0
lines changed

2 files changed

+95
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""
2+
Converts the raw data from SiNER to .json for the Stanza NER system
3+
4+
https://aclanthology.org/2020.lrec-1.361.pdf
5+
"""
6+
7+
from stanza.utils.datasets.ner.utils import write_dataset
8+
9+
def fix_sentence(sentence):
    """
    Fix some of the mistags in the dataset.

    sentence is a list of (word, tag) pairs; a new list is returned and
    the input is left unmodified.

    This covers 11 sentences: 1 P-PERSON, 2 with line breaks in the middle of the tag, and 8 with no B- or I-
    """
    # Simple typo tags mapped to their intended values: P-PERSON is a
    # typo for B-PERSON, and B-OT" / B-T" are B-OTHERS / B-TITLE that
    # had a line break in the middle of the tag.
    retag = {
        'P-PERSON': 'B-PERSON',
        'B-OT"': 'B-OTHERS',
        'B-T"': 'B-TITLE',
    }
    new_sentence = []
    for word, tag in sentence:
        if tag in retag:
            new_sentence.append((word, retag[tag]))
        elif tag in ('GPE', 'LOC', 'OTHERS'):
            # Bare tags missing the B-/I- prefix: continue the previous
            # entity when it has the same type, otherwise start a new one.
            prev_tag = new_sentence[-1][1] if new_sentence else ''
            if prev_tag[:2] in ('B-', 'I-') and prev_tag[2:] == tag:
                # one example... no idea if it should be a break or
                # not, but the last word translates to "Corporation",
                # so probably not: ميٽرو پوليٽن ڪارپوريشن
                new_sentence.append((word, 'I-' + tag))
            else:
                new_sentence.append((word, 'B-' + tag))
        else:
            new_sentence.append((word, tag))
    return new_sentence
34+
35+
def convert_sindhi_siner(in_filename, out_directory, short_name, train_frac=0.8, dev_frac=0.1):
    """
    Read lines from the dataset, crudely separate sentences based on . or !, and write the dataset

    in_filename: path to the raw SiNER file, one "word<TAB>tag" pair per line
    out_directory, short_name: passed through to write_dataset
    train_frac, dev_frac: fraction of sentences to use for the train and
      dev splits; whatever remains becomes the test split

    Raises ValueError if any broken tags survive fix_sentence.
    """
    with open(in_filename, encoding="utf-8") as fin:
        lines = fin.readlines()

    # keep only well-formed "word<TAB>tag" lines
    lines = [x.strip().split("\t") for x in lines]
    lines = [(x[0].strip(), x[1].strip()) for x in lines if len(x) == 2]
    print("Read %d words from %s" % (len(lines), in_filename))

    # crude sentence segmentation on sentence-final punctuation
    sentences = []
    prev_idx = 0
    for sent_idx, line in enumerate(lines):
        # maybe also handle line[0] == '،', "Arabic comma"?
        if line[0] in ('.', '!'):
            sentences.append(lines[prev_idx:sent_idx+1])
            prev_idx = sent_idx + 1

    # in case the file doesn't end with punctuation, grab the last few lines
    if prev_idx < len(lines):
        sentences.append(lines[prev_idx:])

    print("Found %d sentences before splitting" % len(sentences))
    sentences = [fix_sentence(x) for x in sentences]
    # explicit check rather than assert so the validation is not
    # stripped when running under python -O
    for sentence in sentences:
        for word in sentence:
            if '"' in word[1] or word[1].startswith("P-") or word[1] in ("GPE", "LOC", "OTHERS"):
                raise ValueError("Found an unexpected tag after cleaning known issues: %s" % word[1])

    # split by sentence count: [0, train_len) / [train_len, dev_len) / rest
    train_len = int(len(sentences) * train_frac)
    dev_len = int(len(sentences) * (train_frac + dev_frac))
    train_sentences = sentences[:train_len]
    dev_sentences = sentences[train_len:dev_len]
    test_sentences = sentences[dev_len:]

    datasets = (train_sentences, dev_sentences, test_sentences)
    write_dataset(datasets, out_directory, short_name, suffix="bio")
69+

stanza/utils/datasets/ner/prepare_ner_dataset.py

+26
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,19 @@
313313
the 2 letter code for lcode. The tool will throw an error
314314
if the language is not supported in Masakhane.
315315
316+
SiNER is a Sindhi NER dataset
317+
- https://aclanthology.org/2020.lrec-1.361/
318+
SiNER: A Large Dataset for Sindhi Named Entity Recognition
319+
Wazir Ali, Junyu Lu, Zenglin Xu
320+
- It is available via git repository
321+
https://github.com/AliWazir/SiNER-dataset
322+
Temporarily (September 2022) there is a pull request which
323+
fixes a few tag errors, but the code should compensate for
324+
that regardless
325+
- Clone the repo to $NERBASE/sindhi
326+
- python3 -m stanza.utils.datasets.ner.prepare_ner_dataset sd_siner
327+
328+
316329
en_sample is the toy dataset included with stanza-train
317330
https://github.com/stanfordnlp/stanza-train
318331
this is not meant for any kind of actual NER use
@@ -347,6 +360,7 @@
347360
import stanza.utils.datasets.ner.convert_starlang_ner as convert_starlang_ner
348361
import stanza.utils.datasets.ner.convert_nkjp as convert_nkjp
349362
import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file
363+
import stanza.utils.datasets.ner.convert_sindhi_siner as convert_sindhi_siner
350364
import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob
351365
import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob
352366
from stanza.utils.datasets.ner.utils import convert_bio_to_json, get_tags, read_tsv, write_dataset
@@ -936,6 +950,17 @@ def process_masakhane(paths, dataset_name):
936950
raise UnknownDatasetError(dataset_name, "Found the Masakhane repo, but there was no %s in the repo at path %s" % (dataset_name, in_directory))
937951
convert_bio_to_json(in_directory, paths["NER_DATA_DIR"], "%s_masakhane" % lcode, "txt")
938952

953+
def process_sd_siner(paths, short_name):
    """
    Convert the SiNER Sindhi NER dataset to Stanza's .json format.

    Expects a clone of https://github.com/AliWazir/SiNER-dataset
    at $NERBASE/sindhi/SiNER-dataset

    Raises FileNotFoundError if the checkout or the dataset file is missing.
    """
    in_directory = os.path.join(paths["NERBASE"], "sindhi", "SiNER-dataset")
    if not os.path.exists(in_directory):
        raise FileNotFoundError("Cannot find SiNER checkout in $NERBASE/sindhi.  Please git clone the repo to that directory")
    # the dataset file has appeared under two slightly different names
    for dataset_filename in ("SiNER-dataset.txt", "SiNER dataset.txt"):
        in_filename = os.path.join(in_directory, dataset_filename)
        if os.path.exists(in_filename):
            break
    else:
        raise FileNotFoundError("Found an SiNER directory at %s but the directory did not contain the dataset" % in_directory)
    convert_sindhi_siner.convert_sindhi_siner(in_filename, paths["NER_DATA_DIR"], short_name)
963+
939964
def process_toy_dataset(paths, short_name):
940965
convert_bio_to_json(os.path.join(paths["NERBASE"], "English-SAMPLE"), paths["NER_DATA_DIR"], short_name)
941966

@@ -956,6 +981,7 @@ def process_toy_dataset(paths, short_name):
956981
"mr_l3cube": process_mr_l3cube,
957982
"my_ucsy": process_my_ucsy,
958983
"pl_nkjp": process_pl_nkjp,
984+
"sd_siner": process_sd_siner,
959985
"sv_suc3licensed": process_sv_suc3licensed,
960986
"sv_suc3shuffle": process_sv_suc3shuffle,
961987
"tr_starlang": process_starlang,

0 commit comments

Comments
 (0)