313
313
the 2 letter code for lcode. The tool will throw an error
314
314
if the language is not supported in Masakhane.
315
315
316
+ SiNER is a Sindhi NER dataset
317
+ - https://aclanthology.org/2020.lrec-1.361/
318
+ SiNER: A Large Dataset for Sindhi Named Entity Recognition
319
+ Wazir Ali, Junyu Lu, Zenglin Xu
320
+ - It is available via git repository
321
+ https://github.com/AliWazir/SiNER-dataset
322
+ As of September 2022 there is an open pull request which
323
+ fixes a few tag errors, but the code should compensate for
324
+ those errors regardless of whether it has been merged
325
+ - Clone the repo to $NERBASE/sindhi
326
+ - python3 -m stanza.utils.datasets.ner.prepare_ner_dataset sd_siner
327
+
328
+
316
329
en_sample is the toy dataset included with stanza-train
317
330
https://github.com/stanfordnlp/stanza-train
318
331
this is not meant for any kind of actual NER use
347
360
import stanza .utils .datasets .ner .convert_starlang_ner as convert_starlang_ner
348
361
import stanza .utils .datasets .ner .convert_nkjp as convert_nkjp
349
362
import stanza .utils .datasets .ner .prepare_ner_file as prepare_ner_file
363
+ import stanza .utils .datasets .ner .convert_sindhi_siner as convert_sindhi_siner
350
364
import stanza .utils .datasets .ner .suc_to_iob as suc_to_iob
351
365
import stanza .utils .datasets .ner .suc_conll_to_iob as suc_conll_to_iob
352
366
from stanza .utils .datasets .ner .utils import convert_bio_to_json , get_tags , read_tsv , write_dataset
@@ -936,6 +950,17 @@ def process_masakhane(paths, dataset_name):
936
950
raise UnknownDatasetError (dataset_name , "Found the Masakhane repo, but there was no %s in the repo at path %s" % (dataset_name , in_directory ))
937
951
convert_bio_to_json (in_directory , paths ["NER_DATA_DIR" ], "%s_masakhane" % lcode , "txt" )
938
952
953
def process_sd_siner(paths, short_name):
    """
    Converts the SiNER Sindhi NER dataset using convert_sindhi_siner

    Expects a git checkout of the SiNER repo at
      $NERBASE/sindhi/SiNER-dataset
    The dataset file is looked for under two names, in this order:
      SiNER-dataset.txt
      SiNER dataset.txt
    (presumably the file name differs between versions of the repo)

    paths: dict which must contain NERBASE and NER_DATA_DIR
    short_name: dataset name passed through to the converter, eg sd_siner

    Raises FileNotFoundError if the checkout directory is missing or
    if neither spelling of the dataset file is found inside it.
    """
    in_directory = os.path.join(paths["NERBASE"], "sindhi", "SiNER-dataset")
    if not os.path.exists(in_directory):
        raise FileNotFoundError("Cannot find SiNER checkout in $NERBASE/sindhi  Please git clone the repo in that directory")

    # The first candidate which exists wins; the order matches the
    # preference for the hyphenated file name
    candidate_names = ("SiNER-dataset.txt", "SiNER dataset.txt")
    candidates = [os.path.join(in_directory, name) for name in candidate_names]
    in_filename = next((candidate for candidate in candidates if os.path.exists(candidate)), None)
    if in_filename is None:
        raise FileNotFoundError("Found an SiNER directory at %s but the directory did not contain the dataset" % in_directory)

    convert_sindhi_siner.convert_sindhi_siner(in_filename, paths["NER_DATA_DIR"], short_name)
963
+
939
964
def process_toy_dataset(paths, short_name):
    """
    Converts the toy dataset found in $NERBASE/English-SAMPLE

    paths: dict which must contain NERBASE and NER_DATA_DIR
    short_name: dataset name passed through to convert_bio_to_json
    """
    sample_directory = os.path.join(paths["NERBASE"], "English-SAMPLE")
    output_directory = paths["NER_DATA_DIR"]
    convert_bio_to_json(sample_directory, output_directory, short_name)
941
966
@@ -956,6 +981,7 @@ def process_toy_dataset(paths, short_name):
956
981
"mr_l3cube" : process_mr_l3cube ,
957
982
"my_ucsy" : process_my_ucsy ,
958
983
"pl_nkjp" : process_pl_nkjp ,
984
+ "sd_siner" : process_sd_siner ,
959
985
"sv_suc3licensed" : process_sv_suc3licensed ,
960
986
"sv_suc3shuffle" : process_sv_suc3shuffle ,
961
987
"tr_starlang" : process_starlang ,
0 commit comments