Skip to content

Commit 2a8ded4

Browse files
committed
Sindhi NER conversion for SiNER
Updates a few broken tags from the Sindhi NER dataset. Includes notes on how to get SiNER from the GitHub repo.
1 parent 6a3a024 commit 2a8ded4

File tree

2 files changed

+95
-0
lines changed

2 files changed

+95
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""
2+
Converts the raw data from SiNER to .json for the Stanza NER system
3+
4+
https://aclanthology.org/2020.lrec-1.361.pdf
5+
"""
6+
7+
from stanza.utils.datasets.ner.utils import write_dataset
8+
9+
def fix_sentence(sentence):
    """
    Fix some of the mistags in the dataset.

    sentence is a list of (word, tag) pairs; a new list is returned and
    the input is left unmodified.

    This covers 11 sentences: 1 P-PERSON, 2 with line breaks in the middle of the tag, and 8 with no B- or I-
    """
    # Simple typo tags mapped to their intended values: P-PERSON is a
    # typo for B-PERSON, and B-OT" / B-T" are B-OTHERS / B-TITLE that
    # had a line break in the middle of the tag.
    retag = {
        'P-PERSON': 'B-PERSON',
        'B-OT"': 'B-OTHERS',
        'B-T"': 'B-TITLE',
    }
    new_sentence = []
    for word, tag in sentence:
        if tag in retag:
            new_sentence.append((word, retag[tag]))
        elif tag in ('GPE', 'LOC', 'OTHERS'):
            # Bare tags missing the B-/I- prefix: continue the previous
            # entity when it has the same type, otherwise start a new one.
            prev_tag = new_sentence[-1][1] if new_sentence else ''
            if prev_tag[:2] in ('B-', 'I-') and prev_tag[2:] == tag:
                # one example... no idea if it should be a break or
                # not, but the last word translates to "Corporation",
                # so probably not: ميٽرو پوليٽن ڪارپوريشن
                new_sentence.append((word, 'I-' + tag))
            else:
                new_sentence.append((word, 'B-' + tag))
        else:
            new_sentence.append((word, tag))
    return new_sentence
34+
35+
def convert_sindhi_siner(in_filename, out_directory, short_name, train_frac=0.8, dev_frac=0.1):
    """
    Read lines from the dataset, crudely separate sentences based on . or !, and write the dataset

    in_filename: path to the raw SiNER file, one "word<TAB>tag" pair per line
    out_directory, short_name: passed through to write_dataset
    train_frac, dev_frac: fraction of sentences to use for the train and
      dev splits; whatever remains becomes the test split

    Raises ValueError if any broken tags survive fix_sentence.
    """
    with open(in_filename, encoding="utf-8") as fin:
        lines = fin.readlines()

    # keep only well-formed "word<TAB>tag" lines
    lines = [x.strip().split("\t") for x in lines]
    lines = [(x[0].strip(), x[1].strip()) for x in lines if len(x) == 2]
    print("Read %d words from %s" % (len(lines), in_filename))

    # crude sentence segmentation on sentence-final punctuation
    sentences = []
    prev_idx = 0
    for sent_idx, line in enumerate(lines):
        # maybe also handle line[0] == '،', "Arabic comma"?
        if line[0] in ('.', '!'):
            sentences.append(lines[prev_idx:sent_idx+1])
            prev_idx = sent_idx + 1

    # in case the file doesn't end with punctuation, grab the last few lines
    if prev_idx < len(lines):
        sentences.append(lines[prev_idx:])

    print("Found %d sentences before splitting" % len(sentences))
    sentences = [fix_sentence(x) for x in sentences]
    # explicit check rather than assert so the validation is not
    # stripped when running under python -O
    for sentence in sentences:
        for word in sentence:
            if '"' in word[1] or word[1].startswith("P-") or word[1] in ("GPE", "LOC", "OTHERS"):
                raise ValueError("Found an unexpected tag after cleaning known issues: %s" % word[1])

    # split by sentence count: [0, train_len) / [train_len, dev_len) / rest
    train_len = int(len(sentences) * train_frac)
    dev_len = int(len(sentences) * (train_frac + dev_frac))
    train_sentences = sentences[:train_len]
    dev_sentences = sentences[train_len:dev_len]
    test_sentences = sentences[dev_len:]

    datasets = (train_sentences, dev_sentences, test_sentences)
    write_dataset(datasets, out_directory, short_name, suffix="bio")
69+

stanza/utils/datasets/ner/prepare_ner_dataset.py

+26
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,19 @@
313313
the 2 letter code for lcode. The tool will throw an error
314314
if the language is not supported in Masakhane.
315315
316+
SiNER is a Sindhi NER dataset
317+
- https://aclanthology.org/2020.lrec-1.361/
318+
SiNER: A Large Dataset for Sindhi Named Entity Recognition
319+
Wazir Ali, Junyu Lu, Zenglin Xu
320+
- It is available via git repository
321+
https://github.com/AliWazir/SiNER-dataset
322+
Temporarily (September 2022) there is a pull request which
323+
fixes a few tag errors, but the code should compensate for
324+
that regardless
325+
- Clone the repo to $NERBASE/sindhi
326+
- python3 -m stanza.utils.datasets.ner.prepare_ner_dataset sd_siner
327+
328+
316329
en_sample is the toy dataset included with stanza-train
317330
https://github.com/stanfordnlp/stanza-train
318331
this is not meant for any kind of actual NER use
@@ -347,6 +360,7 @@
347360
import stanza.utils.datasets.ner.convert_starlang_ner as convert_starlang_ner
348361
import stanza.utils.datasets.ner.convert_nkjp as convert_nkjp
349362
import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file
363+
import stanza.utils.datasets.ner.convert_sindhi_siner as convert_sindhi_siner
350364
import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob
351365
import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob
352366
from stanza.utils.datasets.ner.utils import convert_bio_to_json, get_tags, read_tsv, write_dataset
@@ -936,6 +950,17 @@ def process_masakhane(paths, dataset_name):
936950
raise UnknownDatasetError(dataset_name, "Found the Masakhane repo, but there was no %s in the repo at path %s" % (dataset_name, in_directory))
937951
convert_bio_to_json(in_directory, paths["NER_DATA_DIR"], "%s_masakhane" % lcode, "txt")
938952

953+
def process_sd_siner(paths, short_name):
    """
    Convert the SiNER Sindhi NER dataset to Stanza's .json format.

    Expects a clone of https://github.com/AliWazir/SiNER-dataset
    at $NERBASE/sindhi/SiNER-dataset

    Raises FileNotFoundError if the checkout or the dataset file is missing.
    """
    in_directory = os.path.join(paths["NERBASE"], "sindhi", "SiNER-dataset")
    if not os.path.exists(in_directory):
        raise FileNotFoundError("Cannot find SiNER checkout in $NERBASE/sindhi.  Please git clone the repo to that directory")
    # the dataset file has appeared under two slightly different names
    for dataset_filename in ("SiNER-dataset.txt", "SiNER dataset.txt"):
        in_filename = os.path.join(in_directory, dataset_filename)
        if os.path.exists(in_filename):
            break
    else:
        raise FileNotFoundError("Found an SiNER directory at %s but the directory did not contain the dataset" % in_directory)
    convert_sindhi_siner.convert_sindhi_siner(in_filename, paths["NER_DATA_DIR"], short_name)
963+
939964
def process_toy_dataset(paths, short_name):
940965
convert_bio_to_json(os.path.join(paths["NERBASE"], "English-SAMPLE"), paths["NER_DATA_DIR"], short_name)
941966

@@ -956,6 +981,7 @@ def process_toy_dataset(paths, short_name):
956981
"mr_l3cube": process_mr_l3cube,
957982
"my_ucsy": process_my_ucsy,
958983
"pl_nkjp": process_pl_nkjp,
984+
"sd_siner": process_sd_siner,
959985
"sv_suc3licensed": process_sv_suc3licensed,
960986
"sv_suc3shuffle": process_sv_suc3shuffle,
961987
"tr_starlang": process_starlang,

0 commit comments

Comments
 (0)