💫 New JSON helpers, training data internals & CLI rewrite (#2932)
* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)
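
The pinned `jsonschema` dependency (added below in `requirements.txt` and `setup.py`) suggests the schemas are validated along these lines; a minimal sketch with a hypothetical stand-in schema, not the actual schemas added in this commit:

```python
from jsonschema import Draft4Validator

# Hypothetical stand-in for one of the new training-data schemas.
schema = {
    "type": "object",
    "properties": {"text": {"type": "string"}},
    "required": ["text"],
}

validator = Draft4Validator(schema)
errors = list(validator.iter_errors({"text": "Hello world"}))
assert not errors  # a valid instance yields no errors
```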

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. The method also supports serializing selected custom attributes in the doc._. space.
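
A minimal usage sketch, assuming the custom attributes are selected via an `underscore` argument (the exact parameter name isn't shown in this diff):

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
Doc.set_extension("my_attr", default=None)

doc = nlp("Hello world")
doc._.my_attr = "some value"

# Same unified format as the training data; the underscore argument
# (assumed name) picks which doc._. attributes to serialize.
data = doc.to_json(underscore=["my_attr"])
```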

* Remove outdated test

* Add write_json and write_jsonl helpers
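
A plausible sketch of these helpers, assuming they mirror the `write_json(path, data)` / `write_jsonl(path, data)` call sites in `convert.py` below (the actual implementations in `spacy.util` may differ, e.g. by using `ujson`):

```python
import json
from pathlib import Path

def write_json(path, data):
    # One pretty-printed JSON document per file.
    with Path(path).open("w", encoding="utf-8") as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False))

def write_jsonl(path, data):
    # One JSON object per line (JSONL); pairs with the read_jsonl
    # helper mentioned above.
    with Path(path).open("w", encoding="utf-8") as f:
        for line in data:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
```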

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI to use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters functions that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command
ines authored and honnibal committed Nov 30, 2018
1 parent 0369db7 commit 37c7c85
Showing 46 changed files with 2,469 additions and 1,532 deletions.
2 changes: 2 additions & 0 deletions requirements.txt
@@ -11,6 +11,8 @@ ujson>=1.35
dill>=0.2,<0.3
regex==2018.01.10
requests>=2.13.0,<3.0.0
jsonschema>=2.6.0,<3.0.0
wasabi>=0.0.8,<1.1.0
pathlib==1.0.1; python_version < "3.4"
# Development dependencies
pytest>=4.0.0,<5.0.0
2 changes: 2 additions & 0 deletions setup.py
@@ -207,6 +207,8 @@ def setup_package():
"regex==2018.01.10",
"dill>=0.2,<0.3",
"requests>=2.13.0,<3.0.0",
"jsonschema>=2.6.0,<3.0.0",
"wasabi>=0.0.8,<1.1.0",
'pathlib==1.0.1; python_version < "3.4"',
],
setup_requires=["wheel"],
49 changes: 25 additions & 24 deletions spacy/__main__.py
@@ -1,40 +1,41 @@
# coding: utf8
from __future__ import print_function

# NB! This breaks in plac on Python 2!!
# from __future__ import unicode_literals

if __name__ == '__main__':
if __name__ == "__main__":
import plac
import sys
from wasabi import Printer
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import vocab, init_model, profile, evaluate, validate
from spacy.cli import ud_train, ud_evaluate
from spacy.util import prints
from spacy.cli import init_model, profile, evaluate, validate
from spacy.cli import ud_train, ud_evaluate, debug_data

msg = Printer()

commands = {
'download': download,
'link': link,
'info': info,
'train': train,
'pretrain': pretrain,
'ud-train': ud_train,
'evaluate': evaluate,
'ud-evaluate': ud_evaluate,
'convert': convert,
'package': package,
'vocab': vocab,
'init-model': init_model,
'profile': profile,
'validate': validate
"download": download,
"link": link,
"info": info,
"train": train,
"pretrain": pretrain,
"debug-data": debug_data,
"ud-train": ud_train,
"evaluate": evaluate,
"ud-evaluate": ud_evaluate,
"convert": convert,
"package": package,
"init-model": init_model,
"profile": profile,
"validate": validate,
}
if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1)
msg.info("Available commands", ", ".join(commands), exits=1)
command = sys.argv.pop(1)
sys.argv[0] = 'spacy %s' % command
sys.argv[0] = "spacy %s" % command
if command in commands:
plac.call(commands[command], sys.argv[1:])
else:
prints(
"Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command,
exits=1)
available = "Available: {}".format(", ".join(commands))
msg.fail("Unknown command: {}".format(command), available, exits=1)
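
For reference, a small sketch of the `wasabi` messaging API as used above (assuming wasabi>=0.0.8 per the pin in this PR):

```python
from wasabi import Printer

msg = Printer()
msg.info("Available commands", "download, link, info, ...")
msg.good("Saved model to output directory")
# exits=1 prints the message and then terminates the process, as in
# the command dispatch above.
msg.fail("Unknown command: foo", "Available: download, link", exits=1)
```
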
27 changes: 13 additions & 14 deletions spacy/cli/__init__.py
@@ -1,14 +1,13 @@
from .download import download
from .info import info
from .link import link
from .package import package
from .profile import profile
from .train import train
from .pretrain import pretrain
from .evaluate import evaluate
from .convert import convert
from .vocab import make_vocab as vocab
from .init_model import init_model
from .validate import validate
from .ud_train import main as ud_train
from .conll17_ud_eval import main as ud_evaluate
from .download import download # noqa: F401
from .info import info # noqa: F401
from .link import link # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401
from .validate import validate # noqa: F401
from .ud import ud_train, ud_evaluate # noqa: F401
30 changes: 30 additions & 0 deletions spacy/cli/_messages.py
@@ -2,6 +2,8 @@
from __future__ import unicode_literals


# fmt: off

class Messages(object):
M001 = ("Download successful but linking failed")
M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
@@ -73,3 +75,31 @@ class Messages(object):
M052 = ("Not a valid meta.json format")
M053 = ("Expected dict but got: {meta_type}")
M054 = ("No --lang specified, but tokenization required.")
M055 = ("Training pipeline: {pipeline}")
M056 = ("Starting with base model '{model}'")
M057 = ("Starting with blank model '{model}'")
M058 = ("Loading vector from model '{model}'")
M059 = ("Can't use multitask objective without '{pipe}' in the pipeline")
M060 = ("Counting training words (limit={limit})")
M061 = ("\nSaving model...")
M062 = ("Output directory is not empty.")
M063 = ("Incompatible arguments")
M064 = ("The -f and -c arguments are deprecated, and not compatible with "
"the -j argument, which should specify the same information. "
"Either merge the frequencies and clusters data into the "
"JSONL-formatted file (recommended), or use only the -f and -c "
"files, without the other lexical attributes.")
M065 = ("This can lead to unintended side effects when saving the model. "
"Please use an empty directory or a different path instead. If "
"the specified output path doesn't exist, the directory will be "
"created for you.")
M066 = ("Saved model to output directory")
M067 = ("Can't find lexical data")
    M068 = ("Successfully compiled vocab and vectors, and saved model")
M069 = ("Unknown file type: '{name}'")
M070 = ("Supported file types: '{options}'")
M071 = ("Loaded pretrained tok2vec for: {components}")
M072 = ("Model language ('{model_lang}') doesn't match language specified "
"as `lang` argument ('{lang}') ")

# fmt: on
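
These templates are plain format strings consumed by the CLI's Printer calls; for example, the new language-mismatch error presumably gets filled like this (values hypothetical):

```python
from spacy.cli._messages import Messages

print(Messages.M072.format(model_lang="de", lang="en"))
# Model language ('de') doesn't match language specified as `lang` argument ('en')
```
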
86 changes: 64 additions & 22 deletions spacy/cli/convert.py
@@ -3,49 +3,91 @@

import plac
from pathlib import Path
from wasabi import Printer

from ..util import write_jsonl, write_json
from ..compat import json_dumps, path2str
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
from .converters import ner_jsonl2json
from ._messages import Messages
from ..util import prints


# Converters are matched by file extension. To add a converter, add a new
# entry to this dict with the file extension mapped to the converter function
# imported from /converters.
CONVERTERS = {
'conllubio': conllubio2json,
'conllu': conllu2json,
'conll': conllu2json,
'ner': conll_ner2json,
'iob': iob2json,
'jsonl': ner_jsonl2json
"conllubio": conllubio2json,
"conllu": conllu2json,
"conll": conllu2json,
"ner": conll_ner2json,
"iob": iob2json,
"jsonl": ner_jsonl2json,
}

# File types
FILE_TYPES = ("json", "jsonl")


@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
input_file=("Input file", "positional", None, str),
output_dir=("Output directory for converted file", "positional", None, str),
file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool))
def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto',
lang=None):
morphology=("Enable appending morphology to tags", "flag", "m", bool),
)
def convert(
input_file,
output_dir="-",
file_type="jsonl",
n_sents=1,
morphology=False,
converter="auto",
lang=None,
):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe it forward to a JSONL file:
$ spacy convert some_file.conllu > some_file.jsonl
"""
msg = Printer()
input_path = Path(input_file)
output_path = Path(output_dir)
if file_type not in FILE_TYPES:
msg.fail(
Messages.M069.format(name=file_type),
Messages.M070.format(options=", ".join(FILE_TYPES)),
exits=1,
)
if not input_path.exists():
prints(input_path, title=Messages.M028, exits=1)
if not output_path.exists():
prints(output_path, title=Messages.M029, exits=1)
if converter == 'auto':
msg.fail(Messages.M028, input_path, exits=1)
if output_dir != "-" and not Path(output_dir).exists():
msg.fail(Messages.M029, output_dir, exits=1)
if converter == "auto":
converter = input_path.suffix[1:]
if converter not in CONVERTERS:
prints(Messages.M031.format(converter=converter),
title=Messages.M030, exits=1)
msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1)
# Use converter function to convert data
func = CONVERTERS[converter]
func(input_path, output_path,
n_sents=n_sents, use_morphology=morphology, lang=lang)
input_data = input_path.open("r", encoding="utf-8").read()
    data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
if output_dir != "-":
# Export data to a file
suffix = ".{}".format(file_type)
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
if file_type == "json":
write_json(output_file, data)
elif file_type == "jsonl":
write_jsonl(output_file, data)
msg.good(
Messages.M032.format(name=path2str(output_file)),
Messages.M033.format(n_docs=len(data)),
)
else:
# Print to stdout
if file_type == "json":
print(json_dumps(data))
elif file_type == "jsonl":
for line in data:
print(json_dumps(line))
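
Because the commands are plain plac-annotated functions, `convert` can also be called directly; a sketch assuming a local `train.conllu` file (hypothetical):

```python
from spacy.cli import convert

# Write train.json into ./converted (the directory must already exist).
convert("train.conllu", "converted", file_type="json", converter="conllu")

# With the default output_dir="-", docs are printed to stdout as JSONL,
# which is what makes shell piping work:
#   $ spacy convert train.conllu > train.jsonl
convert("train.conllu", file_type="jsonl")
```
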
10 changes: 5 additions & 5 deletions spacy/cli/converters/__init__.py
@@ -1,5 +1,5 @@
from .conllu2json import conllu2json
from .conllubio2json import conllubio2json
from .iob2json import iob2json
from .conll_ner2json import conll_ner2json
from .jsonl2json import ner_jsonl2json
from .conllu2json import conllu2json # noqa: F401
from .conllubio2json import conllubio2json # noqa: F401
from .iob2json import iob2json # noqa: F401
from .conll_ner2json import conll_ner2json # noqa: F401
from .jsonl2json import ner_jsonl2json # noqa: F401
46 changes: 16 additions & 30 deletions spacy/cli/converters/conll_ner2json.py
@@ -1,52 +1,38 @@
# coding: utf8
from __future__ import unicode_literals

from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo


def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
def conll_ner2json(input_data, **kwargs):
"""
Convert files in the CoNLL-2003 NER format into JSON format for use with
train cli.
"""
docs = read_conll_ner(input_path)

output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints(Messages.M033.format(n_docs=len(docs)),
title=Messages.M032.format(name=path2str(output_file)))


def read_conll_ner(input_path):
text = input_path.open('r', encoding='utf-8').read()
i = 0
delimit_docs = '-DOCSTART- -X- O O'
delimit_docs = "-DOCSTART- -X- O O"
output_docs = []
for doc in text.strip().split(delimit_docs):
for doc in input_data.strip().split(delimit_docs):
doc = doc.strip()
if not doc:
continue
output_doc = []
for sent in doc.split('\n\n'):
for sent in doc.split("\n\n"):
sent = sent.strip()
if not sent:
continue
lines = [line.strip() for line in sent.split('\n') if line.strip()]
lines = [line.strip() for line in sent.split("\n") if line.strip()]
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
biluo_ents = iob_to_biluo(iob_ents)
output_doc.append({'tokens': [
{'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
zip(words, tags, biluo_ents)
]})
output_docs.append({
'id': len(output_docs),
'paragraphs': [{'sentences': output_doc}]
})
output_doc.append(
{
"tokens": [
{"orth": w, "tag": tag, "ner": ent}
for (w, tag, ent) in zip(words, tags, biluo_ents)
]
}
)
output_docs.append(
{"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
)
output_doc = []
return output_docs
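
Under the new contract the converter no longer touches the filesystem; a sketch of calling it directly, assuming a hypothetical local CoNLL-2003 file:

```python
from pathlib import Path
from spacy.cli.converters import conll_ner2json

# The caller owns the IO: read the raw text, pass it in, get docs back.
input_data = Path("train.conll").open("r", encoding="utf-8").read()
docs = conll_ner2json(input_data)
print("Converted {} docs".format(len(docs)))
```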