Skip to content

Commit

Permalink
Tests for CLI app - init config generates train-able config (#12173)
Browse files Browse the repository at this point in the history
* remove migration support form

* initial test commit

* add fixture

* add combo test

* pull out parameter example data

* fix formatting on examples

* remove unused import

* remove unnecessary fmt:off instructions

* only set logger level if verbose flag is explicitly set

---------

Co-authored-by: svlandeg <svlandeg@github.com>
  • Loading branch information
pmbaumgartner and svlandeg authored Jul 31, 2023
1 parent 186889e commit a0a1956
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 8 deletions.
3 changes: 2 additions & 1 deletion spacy/cli/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def assemble_cli(
DOCS: https://spacy.io/api/cli#assemble
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
# Make sure all files and paths exists if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
Expand Down
4 changes: 2 additions & 2 deletions spacy/cli/find_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def find_threshold_cli(
DOCS: https://spacy.io/api/cli#find-threshold
"""

util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
import_code(code_path)
find_threshold(
model=model,
Expand Down
9 changes: 6 additions & 3 deletions spacy/cli/init_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ def init_vectors_cli(
you can use in the [initialize] block of your config to initialize
a model with vectors.
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
Expand Down Expand Up @@ -87,7 +88,8 @@ def init_pipeline_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
Expand Down Expand Up @@ -116,7 +118,8 @@ def init_labels_cli(
"""Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
if not output_path.exists():
output_path.mkdir(parents=True)
overrides = parse_config_overrides(ctx.args)
Expand Down
3 changes: 2 additions & 1 deletion spacy/cli/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def train_cli(
DOCS: https://spacy.io/api/cli#train
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
Expand Down
161 changes: 160 additions & 1 deletion spacy/tests/test_cli_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typer.testing import CliRunner

from spacy.cli._util import app, get_git_version
from spacy.tokens import Doc, DocBin
from spacy.tokens import Doc, DocBin, Span

from .util import make_tempdir, normalize_whitespace

Expand Down Expand Up @@ -267,3 +267,162 @@ def test_find_function_invalid():
function = "spacy.TextCatBOW.v666"
result = CliRunner().invoke(app, ["find-function", function])
assert f"Couldn't find registered function: '{function}'" in result.stdout


# Token-level annotations for two three-token sentences. The sentences differ
# only in their last word and its lemma; every other annotation layer is shared.
example_words_1 = ["I", "like", "cats"]
example_lemmas_1 = ["I", "like", "cat"]
example_words_2 = ["I", "like", "dogs"]
example_lemmas_2 = ["I", "like", "dog"]

example_tags = ["PRP", "VBP", "NNS"]
example_pos = ["PRON", "VERB", "NOUN"]
example_deps = ["nsubj", "ROOT", "dobj"]
example_morphs = [
    "Case=Nom|Number=Sing|Person=1|PronType=Prs",
    "Tense=Pres|VerbForm=Fin",
    "Number=Plur",
]
example_ents = ["O", "O", "I-ANIMAL"]
# (start, end, label) triples; the single span covers the last token.
example_spans = [(2, 3, "ANIMAL")]

# Full annotation records. The "cats" and "spans" entries are handled
# separately by the tests below; the remaining keys are passed to `Doc(...)`
# as keyword arguments.
TRAIN_EXAMPLE_1 = {
    "words": example_words_1,
    "lemmas": example_lemmas_1,
    "tags": example_tags,
    "morphs": example_morphs,
    "deps": example_deps,
    "heads": [1, 1, 1],
    "pos": example_pos,
    "ents": example_ents,
    "spans": example_spans,
    "cats": {"CAT": 1.0, "DOG": 0.0},
}
TRAIN_EXAMPLE_2 = {
    "words": example_words_2,
    "lemmas": example_lemmas_2,
    "tags": example_tags,
    "morphs": example_morphs,
    "deps": example_deps,
    "heads": [1, 1, 1],
    "pos": example_pos,
    "ents": example_ents,
    "spans": example_spans,
    "cats": {"CAT": 0.0, "DOG": 1.0},
}


@pytest.mark.slow
@pytest.mark.parametrize(
    "component,examples",
    [
        ("tagger", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("trainable_lemmatizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("parser", [TRAIN_EXAMPLE_1] * 30),
        ("ner", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("spancat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("textcat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
    ],
)
def test_init_config_trainable(component, examples, en_vocab):
    """Check that a config from `init config` trains for a single component.

    For each parametrized component the test builds docs from `examples`,
    writes them to disk as both the train and the dev corpus, generates a
    config with `init config`, runs `train` on that config and expects both
    commands to exit with code 0 and a `model-last` directory to exist in the
    output path.

    component: Pipeline component name passed to `--pipeline`.
    examples: Annotation dicts used to build the training docs.
    en_vocab: Vocab used to construct the docs (presumably the shared pytest
        fixture from the test suite's conftest).
    """
    train_docs = []
    for example in examples:
        if component == "textcat":
            doc = Doc(en_vocab, words=example["words"])
            doc.cats = example["cats"]
        elif component == "spancat":
            doc = Doc(en_vocab, words=example["words"])
            # NOTE(review): "sc" is presumably the spans key the generated
            # spancat config reads from -- confirm against the spancat defaults.
            doc.spans["sc"] = [
                Span(doc, start, end, label) for start, end, label in example["spans"]
            ]
        else:
            # cats, spans are not valid kwargs for instantiating a Doc
            doc_kwargs = {
                k: v for k, v in example.items() if k not in ("cats", "spans")
            }
            doc = Doc(en_vocab, **doc_kwargs)
        train_docs.append(doc)

    with make_tempdir() as d_in:
        # The same docs serve as training and development data.
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=train_docs)
        dev_bin.to_disk(d_in / "dev.spacy")
        init_config_result = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                component,
            ],
        )
        # Attach the captured CLI output so a failure is diagnosable.
        assert init_config_result.exit_code == 0, init_config_result.stdout
        train_result = CliRunner().invoke(
            app,
            [
                "train",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
                "--output",
                f"{d_in}/model",
            ],
        )
        assert train_result.exit_code == 0, train_result.stdout
        assert (d_in / "model" / "model-last").exists()


@pytest.mark.slow
@pytest.mark.parametrize(
    "component,examples",
    [("tagger,parser,morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2] * 15)],
)
def test_init_config_trainable_multiple(component, examples, en_vocab):
    """Check that a config from `init config` trains for a combined pipeline.

    Builds docs from `examples`, writes them to disk as both the train and
    the dev corpus, generates a config with `init config` for the
    comma-separated pipeline in `component`, runs `train` on that config and
    expects both commands to exit with code 0 and a `model-last` directory to
    exist in the output path.

    component: Comma-separated component names passed to `--pipeline`.
    examples: Annotation dicts used to build the training docs.
    en_vocab: Vocab used to construct the docs (presumably the shared pytest
        fixture from the test suite's conftest).
    """
    # cats, spans are not valid kwargs for instantiating a Doc
    skipped_keys = ("cats", "spans")
    train_docs = [
        Doc(en_vocab, **{k: v for k, v in example.items() if k not in skipped_keys})
        for example in examples
    ]

    with make_tempdir() as d_in:
        # The same docs serve as training and development data.
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=train_docs)
        dev_bin.to_disk(d_in / "dev.spacy")
        init_config_result = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                component,
            ],
        )
        # Attach the captured CLI output so a failure is diagnosable.
        assert init_config_result.exit_code == 0, init_config_result.stdout
        train_result = CliRunner().invoke(
            app,
            [
                "train",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
                "--output",
                f"{d_in}/model",
            ],
        )
        assert train_result.exit_code == 0, train_result.stdout
        assert (d_in / "model" / "model-last").exists()

0 comments on commit a0a1956

Please sign in to comment.