💫 New JSON helpers, training data internals & CLI rewrite (#2932)
* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)
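
The pinned `jsonschema` dependency (added below in `requirements.txt` and `setup.py`) suggests the schemas are validated along these lines; a minimal sketch with a hypothetical stand-in schema, not the actual schemas added in this commit:

```python
from jsonschema import Draft4Validator

# Hypothetical stand-in for one of the new training-data schemas.
schema = {
    "type": "object",
    "properties": {"text": {"type": "string"}},
    "required": ["text"],
}

validator = Draft4Validator(schema)
errors = list(validator.iter_errors({"text": "Hello world"}))
assert not errors  # a valid instance yields no errors
```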

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. The method also supports serializing selected custom attributes in the doc._. space.
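
A minimal usage sketch, assuming the custom attributes are selected via an `underscore` argument (the exact parameter name isn't shown in this diff):

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
Doc.set_extension("my_attr", default=None)

doc = nlp("Hello world")
doc._.my_attr = "some value"

# Same unified format as the training data; the underscore argument
# (assumed name) picks which doc._. attributes to serialize.
data = doc.to_json(underscore=["my_attr"])
```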

* Remove outdated test

* Add write_json and write_jsonl helpers
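
A plausible sketch of these helpers, assuming they mirror the `write_json(path, data)` / `write_jsonl(path, data)` call sites in `convert.py` below (the actual implementations in `spacy.util` may differ, e.g. by using `ujson`):

```python
import json
from pathlib import Path

def write_json(path, data):
    # One pretty-printed JSON document per file.
    with Path(path).open("w", encoding="utf-8") as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False))

def write_jsonl(path, data):
    # One JSON object per line (JSONL); pairs with the read_jsonl
    # helper mentioned above.
    with Path(path).open("w", encoding="utf-8") as f:
        for line in data:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
```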

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI to use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters functions that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command
ines authored and honnibal committed Nov 30, 2018
1 parent 0369db7 commit 37c7c85
Showing 46 changed files with 2,469 additions and 1,532 deletions.
2 changes: 2 additions & 0 deletions requirements.txt
@@ -11,6 +11,8 @@ ujson>=1.35
dill>=0.2,<0.3
regex==2018.01.10
requests>=2.13.0,<3.0.0
jsonschema>=2.6.0,<3.0.0
wasabi>=0.0.8,<1.1.0
pathlib==1.0.1; python_version < "3.4"
# Development dependencies
pytest>=4.0.0,<5.0.0
2 changes: 2 additions & 0 deletions setup.py
@@ -207,6 +207,8 @@ def setup_package():
"regex==2018.01.10",
"dill>=0.2,<0.3",
"requests>=2.13.0,<3.0.0",
"jsonschema>=2.6.0,<3.0.0",
"wasabi>=0.0.8,<1.1.0",
'pathlib==1.0.1; python_version < "3.4"',
],
setup_requires=["wheel"],
49 changes: 25 additions & 24 deletions spacy/__main__.py
@@ -1,40 +1,41 @@
# coding: utf8
from __future__ import print_function

# NB! This breaks in plac on Python 2!!
# from __future__ import unicode_literals

if __name__ == '__main__':
if __name__ == "__main__":
import plac
import sys
from wasabi import Printer
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import vocab, init_model, profile, evaluate, validate
from spacy.cli import ud_train, ud_evaluate
from spacy.util import prints
from spacy.cli import init_model, profile, evaluate, validate
from spacy.cli import ud_train, ud_evaluate, debug_data

msg = Printer()

commands = {
'download': download,
'link': link,
'info': info,
'train': train,
'pretrain': pretrain,
'ud-train': ud_train,
'evaluate': evaluate,
'ud-evaluate': ud_evaluate,
'convert': convert,
'package': package,
'vocab': vocab,
'init-model': init_model,
'profile': profile,
'validate': validate
"download": download,
"link": link,
"info": info,
"train": train,
"pretrain": pretrain,
"debug-data": debug_data,
"ud-train": ud_train,
"evaluate": evaluate,
"ud-evaluate": ud_evaluate,
"convert": convert,
"package": package,
"init-model": init_model,
"profile": profile,
"validate": validate,
}
if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1)
msg.info("Available commands", ", ".join(commands), exits=1)
command = sys.argv.pop(1)
sys.argv[0] = 'spacy %s' % command
sys.argv[0] = "spacy %s" % command
if command in commands:
plac.call(commands[command], sys.argv[1:])
else:
prints(
"Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command,
exits=1)
available = "Available: {}".format(", ".join(commands))
msg.fail("Unknown command: {}".format(command), available, exits=1)
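
For reference, a small sketch of the `wasabi` messaging API as used above (assuming wasabi>=0.0.8 per the pin in this PR):

```python
from wasabi import Printer

msg = Printer()
msg.info("Available commands", "download, link, info, ...")
msg.good("Saved model to output directory")
# exits=1 prints the message and then terminates the process, as in
# the command dispatch above.
msg.fail("Unknown command: foo", "Available: download, link", exits=1)
```
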
27 changes: 13 additions & 14 deletions spacy/cli/__init__.py
@@ -1,14 +1,13 @@
from .download import download
from .info import info
from .link import link
from .package import package
from .profile import profile
from .train import train
from .pretrain import pretrain
from .evaluate import evaluate
from .convert import convert
from .vocab import make_vocab as vocab
from .init_model import init_model
from .validate import validate
from .ud_train import main as ud_train
from .conll17_ud_eval import main as ud_evaluate
from .download import download # noqa: F401
from .info import info # noqa: F401
from .link import link # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401
from .validate import validate # noqa: F401
from .ud import ud_train, ud_evaluate # noqa: F401
30 changes: 30 additions & 0 deletions spacy/cli/_messages.py
@@ -2,6 +2,8 @@
from __future__ import unicode_literals


# fmt: off

class Messages(object):
M001 = ("Download successful but linking failed")
M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
@@ -73,3 +75,31 @@ class Messages(object):
M052 = ("Not a valid meta.json format")
M053 = ("Expected dict but got: {meta_type}")
M054 = ("No --lang specified, but tokenization required.")
M055 = ("Training pipeline: {pipeline}")
M056 = ("Starting with base model '{model}'")
M057 = ("Starting with blank model '{model}'")
M058 = ("Loading vector from model '{model}'")
M059 = ("Can't use multitask objective without '{pipe}' in the pipeline")
M060 = ("Counting training words (limit={limit})")
M061 = ("\nSaving model...")
M062 = ("Output directory is not empty.")
M063 = ("Incompatible arguments")
M064 = ("The -f and -c arguments are deprecated, and not compatible with "
"the -j argument, which should specify the same information. "
"Either merge the frequencies and clusters data into the "
"JSONL-formatted file (recommended), or use only the -f and -c "
"files, without the other lexical attributes.")
M065 = ("This can lead to unintended side effects when saving the model. "
"Please use an empty directory or a different path instead. If "
"the specified output path doesn't exist, the directory will be "
"created for you.")
M066 = ("Saved model to output directory")
M067 = ("Can't find lexical data")
    M068 = ("Successfully compiled vocab and vectors, and saved model")
M069 = ("Unknown file type: '{name}'")
M070 = ("Supported file types: '{options}'")
M071 = ("Loaded pretrained tok2vec for: {components}")
M072 = ("Model language ('{model_lang}') doesn't match language specified "
"as `lang` argument ('{lang}') ")

# fmt: on
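
These templates are plain format strings consumed by the CLI's Printer calls; for example, the new language-mismatch error presumably gets filled like this (values hypothetical):

```python
from spacy.cli._messages import Messages

print(Messages.M072.format(model_lang="de", lang="en"))
# Model language ('de') doesn't match language specified as `lang` argument ('en')
```
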
86 changes: 64 additions & 22 deletions spacy/cli/convert.py
@@ -3,49 +3,91 @@

import plac
from pathlib import Path
from wasabi import Printer

from ..util import write_jsonl, write_json
from ..compat import json_dumps, path2str
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
from .converters import ner_jsonl2json
from ._messages import Messages
from ..util import prints


# Converters are matched by file extension. To add a converter, add a new
# entry to this dict with the file extension mapped to the converter function
# imported from /converters.
CONVERTERS = {
'conllubio': conllubio2json,
'conllu': conllu2json,
'conll': conllu2json,
'ner': conll_ner2json,
'iob': iob2json,
'jsonl': ner_jsonl2json
"conllubio": conllubio2json,
"conllu": conllu2json,
"conll": conllu2json,
"ner": conll_ner2json,
"iob": iob2json,
"jsonl": ner_jsonl2json,
}

# File types
FILE_TYPES = ("json", "jsonl")


@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
input_file=("Input file", "positional", None, str),
output_dir=("Output directory for converted file", "positional", None, str),
file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool))
def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto',
lang=None):
morphology=("Enable appending morphology to tags", "flag", "m", bool),
)
def convert(
input_file,
output_dir="-",
file_type="jsonl",
n_sents=1,
morphology=False,
converter="auto",
lang=None,
):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe it forward to a JSONL file:
$ spacy convert some_file.conllu > some_file.jsonl
"""
msg = Printer()
input_path = Path(input_file)
output_path = Path(output_dir)
if file_type not in FILE_TYPES:
msg.fail(
Messages.M069.format(name=file_type),
Messages.M070.format(options=", ".join(FILE_TYPES)),
exits=1,
)
if not input_path.exists():
prints(input_path, title=Messages.M028, exits=1)
if not output_path.exists():
prints(output_path, title=Messages.M029, exits=1)
if converter == 'auto':
msg.fail(Messages.M028, input_path, exits=1)
if output_dir != "-" and not Path(output_dir).exists():
msg.fail(Messages.M029, output_dir, exits=1)
if converter == "auto":
converter = input_path.suffix[1:]
if converter not in CONVERTERS:
prints(Messages.M031.format(converter=converter),
title=Messages.M030, exits=1)
msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1)
# Use converter function to convert data
func = CONVERTERS[converter]
func(input_path, output_path,
n_sents=n_sents, use_morphology=morphology, lang=lang)
input_data = input_path.open("r", encoding="utf-8").read()
    data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
if output_dir != "-":
# Export data to a file
suffix = ".{}".format(file_type)
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
if file_type == "json":
write_json(output_file, data)
elif file_type == "jsonl":
write_jsonl(output_file, data)
msg.good(
Messages.M032.format(name=path2str(output_file)),
Messages.M033.format(n_docs=len(data)),
)
else:
# Print to stdout
if file_type == "json":
print(json_dumps(data))
elif file_type == "jsonl":
for line in data:
print(json_dumps(line))
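
Because the commands are plain plac-annotated functions, `convert` can also be called directly; a sketch assuming a local `train.conllu` file (hypothetical):

```python
from spacy.cli import convert

# Write train.json into ./converted (the directory must already exist).
convert("train.conllu", "converted", file_type="json", converter="conllu")

# With the default output_dir="-", docs are printed to stdout as JSONL,
# which is what makes shell piping work:
#   $ spacy convert train.conllu > train.jsonl
convert("train.conllu", file_type="jsonl")
```
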
10 changes: 5 additions & 5 deletions spacy/cli/converters/__init__.py
@@ -1,5 +1,5 @@
from .conllu2json import conllu2json
from .conllubio2json import conllubio2json
from .iob2json import iob2json
from .conll_ner2json import conll_ner2json
from .jsonl2json import ner_jsonl2json
from .conllu2json import conllu2json # noqa: F401
from .conllubio2json import conllubio2json # noqa: F401
from .iob2json import iob2json # noqa: F401
from .conll_ner2json import conll_ner2json # noqa: F401
from .jsonl2json import ner_jsonl2json # noqa: F401
46 changes: 16 additions & 30 deletions spacy/cli/converters/conll_ner2json.py
@@ -1,52 +1,38 @@
# coding: utf8
from __future__ import unicode_literals

from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo


def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
def conll_ner2json(input_data, **kwargs):
"""
Convert files in the CoNLL-2003 NER format into JSON format for use with
train cli.
"""
docs = read_conll_ner(input_path)

output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints(Messages.M033.format(n_docs=len(docs)),
title=Messages.M032.format(name=path2str(output_file)))


def read_conll_ner(input_path):
text = input_path.open('r', encoding='utf-8').read()
i = 0
delimit_docs = '-DOCSTART- -X- O O'
delimit_docs = "-DOCSTART- -X- O O"
output_docs = []
for doc in text.strip().split(delimit_docs):
for doc in input_data.strip().split(delimit_docs):
doc = doc.strip()
if not doc:
continue
output_doc = []
for sent in doc.split('\n\n'):
for sent in doc.split("\n\n"):
sent = sent.strip()
if not sent:
continue
lines = [line.strip() for line in sent.split('\n') if line.strip()]
lines = [line.strip() for line in sent.split("\n") if line.strip()]
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
biluo_ents = iob_to_biluo(iob_ents)
output_doc.append({'tokens': [
{'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
zip(words, tags, biluo_ents)
]})
output_docs.append({
'id': len(output_docs),
'paragraphs': [{'sentences': output_doc}]
})
output_doc.append(
{
"tokens": [
{"orth": w, "tag": tag, "ner": ent}
for (w, tag, ent) in zip(words, tags, biluo_ents)
]
}
)
output_docs.append(
{"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
)
output_doc = []
return output_docs
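
Under the new contract the converter no longer touches the filesystem; a sketch of calling it directly, assuming a hypothetical local CoNLL-2003 file:

```python
from pathlib import Path
from spacy.cli.converters import conll_ner2json

# The caller owns the IO: read the raw text, pass it in, get docs back.
input_data = Path("train.conll").open("r", encoding="utf-8").read()
docs = conll_ner2json(input_data)
print("Converted {} docs".format(len(docs)))
```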