
Commit c7096bd

Adding WikiAnn NER to TFDS. The following version of the dataset has been added (https://github.com/afshinrahimi/mmner), containing 176 languages.

PiperOrigin-RevId: 362588736
TensorFlow Datasets Team authored and copybara-github committed Mar 12, 2021
1 parent 89969cf commit c7096bd
Showing 6 changed files with 334 additions and 0 deletions.
1 change: 1 addition & 0 deletions tensorflow_datasets/text/__init__.py
@@ -66,6 +66,7 @@
from tensorflow_datasets.text.tiny_shakespeare import TinyShakespeare
from tensorflow_datasets.text.trec import Trec
from tensorflow_datasets.text.wiki40b import Wiki40b
from tensorflow_datasets.text.wikiann import Wikiann
from tensorflow_datasets.text.wikipedia import Wikipedia
from tensorflow_datasets.text.wikipedia_toxicity_subtypes import WikipediaToxicitySubtypes
from tensorflow_datasets.text.winogrande import Winogrande
18 changes: 18 additions & 0 deletions tensorflow_datasets/text/wikiann/__init__.py
@@ -0,0 +1,18 @@
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""wikiann dataset."""

from tensorflow_datasets.text.wikiann.wikiann import Wikiann
1 change: 1 addition & 0 deletions tensorflow_datasets/text/wikiann/checksums.tsv
@@ -0,0 +1 @@
https://www.dropbox.com/s/12h3qqog6q4bjve/panx_dataset.tar?dl=1 234126336 e2720a94b6590d4d70e9a7725106d1e5bdaafde730e06d8101ce4981d1417cce panx_dataset.tar
Binary file not shown.
278 changes: 278 additions & 0 deletions tensorflow_datasets/text/wikiann/wikiann.py
@@ -0,0 +1,278 @@
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""wikiann dataset."""

import os
import tensorflow.compat.v2 as tf
import tensorflow_datasets.public_api as tfds

_DESCRIPTION = """\
WikiANN (sometimes called PAN-X) is a multilingual named entity recognition \
dataset consisting of Wikipedia articles annotated with LOC (location), PER \
(person), and ORG (organisation) tags in the IOB2 format. This version \
corresponds to the balanced train, dev, and test splits of Rahimi et al. \
(2019), which supports 176 of the 282 languages from the original WikiANN \
corpus.
"""

_CITATION = """
@inproceedings{rahimi-etal-2019-massively,
    title = "Massively Multilingual Transfer for {NER}",
    author = "Rahimi, Afshin and
      Li, Yuan and
      Cohn, Trevor",
    booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
    month = jul,
    year = "2019",
    address = "Florence, Italy",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/P19-1015",
    pages = "151--164",
}
"""

URL = "https://www.dropbox.com/s/12h3qqog6q4bjve/panx_dataset.tar?dl=1"

# The 176 Wikipedia language codes covered by the balanced splits of
# Rahimi et al. (2019).
LANGS = [
    "ace", "af", "als", "am", "ang", "an", "arc", "ar", "arz", "as", "ast",
    "ay", "az", "bar", "ba", "bat-smg", "be", "be-x-old", "bg", "bh", "bn",
    "bo", "br", "bs", "ca", "cbk-zam", "cdo", "ceb", "ce", "ckb", "co", "crh",
    "csb", "cs", "cv", "cy", "da", "de", "diq", "dv", "el", "eml", "en", "eo",
    "es", "et", "eu", "ext", "fa", "fi", "fiu-vro", "fo", "frr", "fr", "fur",
    "fy", "gan", "ga", "gd", "gl", "gn", "gu", "hak", "he", "hi", "hr", "hsb",
    "hu", "hy", "ia", "id", "ig", "ilo", "io", "is", "it", "ja", "jbo", "jv",
    "ka", "kk", "km", "kn", "ko", "ksh", "ku", "ky", "la", "lb", "lij", "li",
    "lmo", "ln", "lt", "lv", "map-bms", "mg", "mhr", "min", "mi", "mk", "ml",
    "mn", "mr", "ms", "mt", "mwl", "my", "mzn", "nap", "nds", "ne", "nl", "nn",
    "no", "nov", "oc", "or", "os", "pa", "pdc", "pl", "pms", "pnb", "ps", "pt",
    "qu", "rm", "ro", "ru", "rw", "sah", "sa", "scn", "sco", "sd", "sh",
    "simple", "si", "sk", "sl", "so", "sq", "sr", "su", "sv", "sw", "szl", "ta",
    "te", "tg", "th", "tk", "tl", "tr", "tt", "ug", "uk", "ur", "uz", "vec",
    "vep", "vi", "vls", "vo", "war", "wa", "wuu", "xmf", "yi", "yo", "zea",
    "zh-classical", "zh-min-nan", "zh", "zh-yue"
]


class WikiannConfig(tfds.core.BuilderConfig):
  """BuilderConfig for the wikiann dataset, parameterized by language."""

  def __init__(self, *, language, **kwargs):
    if language not in LANGS:
      raise ValueError("language must be one of {}".format(list(LANGS)))

    super(WikiannConfig, self).__init__(**kwargs)
    self.language = language


def tags_to_spans(tags):
  """Converts a tag sequence to a sorted list of (label, (start, end)) spans."""
  spans = set()
  span_start = 0
  span_end = 0
  active_conll_tag = None
  for index, string_tag in enumerate(tags):
    # Actual BIO tag.
    bio_tag = string_tag[0]
    assert bio_tag in ["B", "I", "O"], "Invalid Tag"
    conll_tag = string_tag[2:]
    if bio_tag == "O":
      # The span has ended.
      if active_conll_tag:
        spans.add((active_conll_tag, (span_start, span_end)))
      active_conll_tag = None
      # We don't care about tags we are told to ignore, so we do nothing.
      continue
    elif bio_tag == "B":
      # We are entering a new span; reset indices and active tag to new span.
      if active_conll_tag:
        spans.add((active_conll_tag, (span_start, span_end)))
      active_conll_tag = conll_tag
      span_start = index
      span_end = index
    elif bio_tag == "I" and conll_tag == active_conll_tag:
      # We're inside a span.
      span_end += 1
    else:
      # This is the case where the bio label is an "I", but either:
      # 1) the span hasn't started - i.e. an ill-formed span, or
      # 2) the tags use the IOB1 scheme.
      # We'll process the previous span if it exists, but also include this
      # span. This is important, because otherwise a model may get a perfect
      # F1 score whilst still producing false-positive ill-formed spans.
      if active_conll_tag:
        spans.add((active_conll_tag, (span_start, span_end)))
      active_conll_tag = conll_tag
      span_start = index
      span_end = index
  # The last token might have been a part of a valid span.
  if active_conll_tag:
    spans.add((active_conll_tag, (span_start, span_end)))
  # Return a sorted list of spans, ordered by start index.
  return sorted(list(spans), key=lambda x: x[1][0])
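# Examples:
#   tags_to_spans(["B-PER", "I-PER", "O", "B-LOC"]) -> [("PER", (0, 1)), ("LOC", (3, 3))]
#   tags_to_spans(["I-PER", "I-PER"]) -> [("PER", (0, 1))]  (IOB1-style input is tolerated)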


def get_spans(tokens, tags):
  """Converts tags to text spans of the form "LABEL: token ... token"."""
  spans = tags_to_spans(tags)
  text_spans = [
      x[0] + ": " + " ".join([tokens[i] for i in range(x[1][0], x[1][1] + 1)])
      for x in spans
  ]
  if not text_spans:
    text_spans = ["None"]
  return text_spans
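# Example: get_spans(["rick", "and", "morty"], ["B-PER", "O", "B-PER"])
# -> ["PER: rick", "PER: morty"]; sentences with no entities yield ["None"].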


class Wikiann(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for wikiann dataset."""

  BUILDER_CONFIGS = [
      WikiannConfig(  # pylint: disable=g-complex-comprehension
          name=language,
          description=("Wikiann {} train/dev/test splits".format(language)),
          version="1.0.0",
          language=language,
      ) for language in LANGS
  ]

  VERSION = tfds.core.Version("1.0.0")
  RELEASE_NOTES = {
      "1.0.0": "Initial release.",
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    features = tfds.features.FeaturesDict({
        "tokens":
            tfds.features.Sequence(tfds.features.Text()),
        "tags":
            tfds.features.Sequence(
                tfds.features.ClassLabel(names=[
                    "O",
                    "B-PER",
                    "I-PER",
                    "B-ORG",
                    "I-ORG",
                    "B-LOC",
                    "I-LOC",
                ])),
        "langs":
            tfds.features.Sequence(tfds.features.Text()),
        "spans":
            tfds.features.Sequence(tfds.features.Text()),
    })
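    # ClassLabel stores each tag as an integer id in the order listed above,
    # i.e. "O" -> 0, "B-PER" -> 1, ..., "I-LOC" -> 6.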
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage="https://github.com/afshinrahimi/mmner",
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    path = dl_manager.download_and_extract(URL)
    # The main archive contains one nested <language>.tar.gz per language,
    # which has to be extracted separately.
    subpath = dl_manager.extract(
        os.path.join(path, self.builder_config.language + ".tar.gz"))

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            gen_kwargs={"filepath": os.path.join(subpath, "dev")},
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={"filepath": os.path.join(subpath, "test")},
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={"filepath": os.path.join(subpath, "train")},
        ),
    ]

  def _generate_examples(self, filepath):
    """Reads the line-by-line NER format and generates examples.

    Input Format:

      en:rick B-PER
      en:and O
      en:morty B-PER
      en:are O
      en:cool O
      en:. O

    Output Format:

      {
          'tokens': ["rick", "and", "morty", "are", "cool", "."],
          'tags': ["B-PER", "O", "B-PER", "O", "O", "O"],
          'langs': ["en", "en", "en", "en", "en", "en"],
          'spans': ["PER: rick", "PER: morty"]
      }

    Args:
      filepath: Path to a file in the line-by-line NER format.

    Yields:
      Examples with the format listed above.
    """

    key = 1
    with tf.io.gfile.GFile(filepath, "r") as f:
      tokens = []
      tags = []
      langs = []
      for line in f:
        line = line.rstrip()
        # pylint: disable=g-explicit-bool-comparison
        if line.startswith("-DOCSTART-") or line == "":
          # A blank line (or document marker) ends the current sentence.
          if tokens:
            spans = get_spans(tokens, tags)
            yield key, {
                "tokens": tokens,
                "tags": tags,
                "langs": langs,
                "spans": spans
            }
            key += 1
            tokens = []
            tags = []
            langs = []
        else:
          # wikiann data is tab-separated.
          fields = line.split("\t")
          # Strip the language prefix, e.g. "en:rick" -> lang "en", token "rick".
          langs.append(fields[0].split(":")[0])
          tokens.append(":".join(fields[0].split(":")[1:]))
          if len(fields) > 1:
            tags.append(fields[-1])
          else:
            # Examples in the test set have no label.
            tags.append("O")
      # Flush the final sentence if the file doesn't end with a blank line.
      if tokens:
        spans = get_spans(tokens, tags)
        yield key, {
            "tokens": tokens,
            "tags": tags,
            "langs": langs,
            "spans": spans
        }
36 changes: 36 additions & 0 deletions tensorflow_datasets/text/wikiann/wikiann_test.py
@@ -0,0 +1,36 @@
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""wikiann dataset."""

import tensorflow_datasets.public_api as tfds
from tensorflow_datasets.text.wikiann import wikiann


class WikiannTest(tfds.testing.DatasetBuilderTestCase):
  """Tests for wikiann dataset."""
  BUILDER_CONFIG_NAMES_TO_TEST = ["en"]
  DATASET_CLASS = wikiann.Wikiann
  SPLITS = {
      "train": 3,  # Number of fake train examples
      "validation": 1,  # Number of fake dev examples
      "test": 1,  # Number of fake test examples
  }

  DL_EXTRACT_RESULT = "panx"


if __name__ == "__main__":
  tfds.testing.test_main()
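For reference, once the dataset has been prepared (e.g. via "tfds build wikiann --config en" or a call to download_and_prepare), each language is exposed as a separate config. A minimal loading sketch, assuming a standard TFDS installation:

import tensorflow_datasets as tfds

# Load the English config and inspect one validation example.
ds = tfds.load("wikiann/en", split="validation")
for example in tfds.as_numpy(ds.take(1)):
  print(example["tokens"])  # Byte-string tokens, e.g. [b"rick", b"and", ...].
  print(example["tags"])    # Integer ClassLabel ids ("O" -> 0, "B-PER" -> 1, ...).
  print(example["spans"])   # e.g. [b"PER: rick", b"PER: morty"].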
