-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
💫 New JSON helpers, training data internals & CLI rewrite (#2932)
* Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree. Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928). Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI to use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters functions that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as a function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
- Loading branch information
Showing
46 changed files
with
2,469 additions
and
1,532 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,40 +1,41 @@ | ||
# coding: utf8 | ||
from __future__ import print_function | ||
|
||
# NB! This breaks in plac on Python 2!! | ||
# from __future__ import unicode_literals | ||
|
||
if __name__ == '__main__': | ||
if __name__ == "__main__": | ||
import plac | ||
import sys | ||
from wasabi import Printer | ||
from spacy.cli import download, link, info, package, train, pretrain, convert | ||
from spacy.cli import vocab, init_model, profile, evaluate, validate | ||
from spacy.cli import ud_train, ud_evaluate | ||
from spacy.util import prints | ||
from spacy.cli import init_model, profile, evaluate, validate | ||
from spacy.cli import ud_train, ud_evaluate, debug_data | ||
|
||
msg = Printer() | ||
|
||
commands = { | ||
'download': download, | ||
'link': link, | ||
'info': info, | ||
'train': train, | ||
'pretrain': pretrain, | ||
'ud-train': ud_train, | ||
'evaluate': evaluate, | ||
'ud-evaluate': ud_evaluate, | ||
'convert': convert, | ||
'package': package, | ||
'vocab': vocab, | ||
'init-model': init_model, | ||
'profile': profile, | ||
'validate': validate | ||
"download": download, | ||
"link": link, | ||
"info": info, | ||
"train": train, | ||
"pretrain": pretrain, | ||
"debug-data": debug_data, | ||
"ud-train": ud_train, | ||
"evaluate": evaluate, | ||
"ud-evaluate": ud_evaluate, | ||
"convert": convert, | ||
"package": package, | ||
"init-model": init_model, | ||
"profile": profile, | ||
"validate": validate, | ||
} | ||
if len(sys.argv) == 1: | ||
prints(', '.join(commands), title="Available commands", exits=1) | ||
msg.info("Available commands", ", ".join(commands), exits=1) | ||
command = sys.argv.pop(1) | ||
sys.argv[0] = 'spacy %s' % command | ||
sys.argv[0] = "spacy %s" % command | ||
if command in commands: | ||
plac.call(commands[command], sys.argv[1:]) | ||
else: | ||
prints( | ||
"Available: %s" % ', '.join(commands), | ||
title="Unknown command: %s" % command, | ||
exits=1) | ||
available = "Available: {}".format(", ".join(commands)) | ||
msg.fail("Unknown command: {}".format(command), available, exits=1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,13 @@ | ||
from .download import download | ||
from .info import info | ||
from .link import link | ||
from .package import package | ||
from .profile import profile | ||
from .train import train | ||
from .pretrain import pretrain | ||
from .evaluate import evaluate | ||
from .convert import convert | ||
from .vocab import make_vocab as vocab | ||
from .init_model import init_model | ||
from .validate import validate | ||
from .ud_train import main as ud_train | ||
from .conll17_ud_eval import main as ud_evaluate | ||
from .download import download # noqa: F401 | ||
from .info import info # noqa: F401 | ||
from .link import link # noqa: F401 | ||
from .package import package # noqa: F401 | ||
from .profile import profile # noqa: F401 | ||
from .train import train # noqa: F401 | ||
from .pretrain import pretrain # noqa: F401 | ||
from .debug_data import debug_data # noqa: F401 | ||
from .evaluate import evaluate # noqa: F401 | ||
from .convert import convert # noqa: F401 | ||
from .init_model import init_model # noqa: F401 | ||
from .validate import validate # noqa: F401 | ||
from .ud import ud_train, ud_evaluate # noqa: F401 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
from .conllu2json import conllu2json | ||
from .conllubio2json import conllubio2json | ||
from .iob2json import iob2json | ||
from .conll_ner2json import conll_ner2json | ||
from .jsonl2json import ner_jsonl2json | ||
from .conllu2json import conllu2json # noqa: F401 | ||
from .conllubio2json import conllubio2json # noqa: F401 | ||
from .iob2json import iob2json # noqa: F401 | ||
from .conll_ner2json import conll_ner2json # noqa: F401 | ||
from .jsonl2json import ner_jsonl2json # noqa: F401 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,52 +1,38 @@ | ||
# coding: utf8 | ||
from __future__ import unicode_literals | ||
|
||
from .._messages import Messages | ||
from ...compat import json_dumps, path2str | ||
from ...util import prints | ||
from ...gold import iob_to_biluo | ||
|
||
|
||
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None): | ||
def conll_ner2json(input_data, **kwargs): | ||
""" | ||
Convert files in the CoNLL-2003 NER format into JSON format for use with | ||
train cli. | ||
""" | ||
docs = read_conll_ner(input_path) | ||
|
||
output_filename = input_path.parts[-1].replace(".conll", "") + ".json" | ||
output_file = output_path / output_filename | ||
with output_file.open('w', encoding='utf-8') as f: | ||
f.write(json_dumps(docs)) | ||
prints(Messages.M033.format(n_docs=len(docs)), | ||
title=Messages.M032.format(name=path2str(output_file))) | ||
|
||
|
||
def read_conll_ner(input_path): | ||
text = input_path.open('r', encoding='utf-8').read() | ||
i = 0 | ||
delimit_docs = '-DOCSTART- -X- O O' | ||
delimit_docs = "-DOCSTART- -X- O O" | ||
output_docs = [] | ||
for doc in text.strip().split(delimit_docs): | ||
for doc in input_data.strip().split(delimit_docs): | ||
doc = doc.strip() | ||
if not doc: | ||
continue | ||
output_doc = [] | ||
for sent in doc.split('\n\n'): | ||
for sent in doc.split("\n\n"): | ||
sent = sent.strip() | ||
if not sent: | ||
continue | ||
lines = [line.strip() for line in sent.split('\n') if line.strip()] | ||
lines = [line.strip() for line in sent.split("\n") if line.strip()] | ||
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines]) | ||
biluo_ents = iob_to_biluo(iob_ents) | ||
output_doc.append({'tokens': [ | ||
{'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in | ||
zip(words, tags, biluo_ents) | ||
]}) | ||
output_docs.append({ | ||
'id': len(output_docs), | ||
'paragraphs': [{'sentences': output_doc}] | ||
}) | ||
output_doc.append( | ||
{ | ||
"tokens": [ | ||
{"orth": w, "tag": tag, "ner": ent} | ||
for (w, tag, ent) in zip(words, tags, biluo_ents) | ||
] | ||
} | ||
) | ||
output_docs.append( | ||
{"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]} | ||
) | ||
output_doc = [] | ||
return output_docs |
Oops, something went wrong.