From f1a418c7c571814467650f5674f31ff023aa54c5 Mon Sep 17 00:00:00 2001 From: Tobias Domhan Date: Wed, 29 Nov 2017 06:03:46 -0800 Subject: [PATCH] Regularly run system tests. (#227) - Run one system test per model for each commit. - Run all system tests as daily cron jobs. - Factored out logging and optionally disable logging in run_train_translate. --- .travis.yml | 6 ++++- CHANGELOG.md | 4 +++ sockeye/__init__.py | 2 +- sockeye/arguments.py | 19 ++++++++------ sockeye/decoder.py | 2 +- sockeye/encoder.py | 2 +- sockeye/evaluate.py | 1 + sockeye/lexicon.py | 6 +++-- sockeye/translate.py | 5 +++- test/common.py | 43 ++++++++++++++++++++++---------- test/system/test_seq_copy_sys.py | 10 ++++---- test/unit/test_arguments.py | 18 +++++++++---- 12 files changed, 80 insertions(+), 38 deletions(-) diff --git a/.travis.yml b/.travis.yml index 13f9e202f..10e099a7d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,4 +25,8 @@ script: - pylint --rcfile=pylintrc test -E - mypy --ignore-missing-imports --follow-imports=silent @typechecked-files - check-manifest --ignore sockeye/git_version.py - #- python -m pytest test/system + - if [ "$TRAVIS_EVENT_TYPE" != "cron" ]; then python -m pytest -k "Copy:lstm:lstm" test/system; fi + - if [ "$TRAVIS_EVENT_TYPE" != "cron" ]; then python -m pytest -k "Copy:transformer:transformer" test/system; fi + - if [ "$TRAVIS_EVENT_TYPE" != "cron" ]; then python -m pytest -k "Copy:cnn:cnn" test/system; fi + - if [ "$TRAVIS_EVENT_TYPE" = "cron" ]; then python -m pytest test/system; fi + diff --git a/CHANGELOG.md b/CHANGELOG.md index 10df2d127..30d44af69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ Note that Sockeye has checks in place to not translate with an old model that wa Each version section may have have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_. +## [1.14.0] +### Changed +- Downscaled fixed positional embeddings for CNN models. + ## [1.13.2] ### Added - A tool that extracts specified parameters from params.x into a .npz file for downstream applications or analysis. diff --git a/sockeye/__init__.py b/sockeye/__init__.py index e949adca3..7e372ad16 100644 --- a/sockeye/__init__.py +++ b/sockeye/__init__.py @@ -11,4 +11,4 @@ # express or implied. See the License for the specific language governing # permissions and limitations under the License. -__version__ = '1.13.2' +__version__ = '1.14.0' diff --git a/sockeye/arguments.py b/sockeye/arguments.py index b5464a3b0..6948c4986 100644 --- a/sockeye/arguments.py +++ b/sockeye/arguments.py @@ -23,6 +23,7 @@ from . import constants as C from . import data_io + def regular_file() -> Callable: """ Returns a method that can be used in argument parsing to check the argument is a regular file or a symbolic link, @@ -226,6 +227,14 @@ def add_lexicon_args(params): help="Number of target translations to keep per source. Default: %(default)s.") +def add_logging_args(params): + logging_params = params.add_argument_group("Logging") + logging_params.add_argument('--quiet', '-q', + default=False, + action="store_true", + help='Suppress console logging.') + + def add_io_args(params): data_params = params.add_argument_group("Data & I/O") @@ -283,11 +292,6 @@ def add_io_args(params): help="Statistics function to run on monitored outputs/weights/gradients. 
" "Default: %(default)s.") - data_params.add_argument('--quiet', '-q', - default=False, - action="store_true", - help='Suppress console logging.') - def add_device_args(params): device_params = params.add_argument_group("Device parameters") @@ -790,11 +794,13 @@ def add_train_cli_args(params): add_model_parameters(params) add_training_args(params) add_device_args(params) + add_logging_args(params) def add_translate_cli_args(params): add_inference_args(params) add_device_args(params) + add_logging_args(params) def add_inference_args(params): @@ -896,9 +902,6 @@ def add_evaluate_args(params): type=file_or_stdin(), default=sys.stdin, help="File with hypotheses. If none will read from stdin. Default: %(default)s.") - eval_params.add_argument('--quiet', '-q', - action="store_true", - help="Do not print logging information.") eval_params.add_argument('--metrics', nargs='+', default=[C.BLEU, C.CHRF], diff --git a/sockeye/decoder.py b/sockeye/decoder.py index 85cc5b299..88c3ee647 100644 --- a/sockeye/decoder.py +++ b/sockeye/decoder.py @@ -898,7 +898,7 @@ def __init__(self, num_embed=config.num_embed, max_seq_len=config.max_seq_len_target, fixed_pos_embed_scale_up_input=False, - fixed_pos_embed_scale_down_positions=False, + fixed_pos_embed_scale_down_positions=True, prefix=C.TARGET_POSITIONAL_EMBEDDING_PREFIX) self.layers = [convolution.ConvolutionBlock( diff --git a/sockeye/encoder.py b/sockeye/encoder.py index acdbe3210..82b981c1f 100644 --- a/sockeye/encoder.py +++ b/sockeye/encoder.py @@ -143,7 +143,7 @@ def get_convolutional_encoder(config: ConvolutionalEncoderConfig) -> 'Encoder': config.num_embed, max_seq_len=config.max_seq_len_source, fixed_pos_embed_scale_up_input=False, - fixed_pos_embed_scale_down_positions=False, + fixed_pos_embed_scale_down_positions=True, prefix=C.SOURCE_POSITIONAL_EMBEDDING_PREFIX)) encoders.append(ConvolutionalEncoder(config=config)) encoders.append(BatchMajor2TimeMajor()) diff --git a/sockeye/evaluate.py b/sockeye/evaluate.py index 35e88ea35..84b1f3128 100644 --- a/sockeye/evaluate.py +++ b/sockeye/evaluate.py @@ -46,6 +46,7 @@ def main(): params = argparse.ArgumentParser(description='Evaluate translations by calculating metrics with ' 'respect to a reference set.') arguments.add_evaluate_args(params) + arguments.add_logging_args(params) args = params.parse_args() if args.quiet: diff --git a/sockeye/lexicon.py b/sockeye/lexicon.py index 51cff0796..18ae7120b 100644 --- a/sockeye/lexicon.py +++ b/sockeye/lexicon.py @@ -269,13 +269,15 @@ def main(): """ Commandline interface for building top-k lexicons using during decoding. 
""" - logger = setup_main_logger(__name__, console=True, file_logging=False) - log_sockeye_version(logger) params = argparse.ArgumentParser(description="Build a top-k lexicon for use during decoding.") arguments.add_lexicon_args(params) + arguments.add_logging_args(params) args = params.parse_args() + logger = setup_main_logger(__name__, console=not args.quiet, file_logging=False) + log_sockeye_version(logger) + logger.info("Reading source and target vocab from \"%s\"", args.model) vocab_source = vocab.vocab_from_json_or_pickle(os.path.join(args.model, C.VOCAB_SRC_NAME)) vocab_target = vocab.vocab_from_json_or_pickle(os.path.join(args.model, C.VOCAB_TRG_NAME)) diff --git a/sockeye/translate.py b/sockeye/translate.py index 23dedd1db..72ed39273 100644 --- a/sockeye/translate.py +++ b/sockeye/translate.py @@ -45,7 +45,10 @@ def main(): if args.output is not None: global logger - logger = setup_main_logger(__name__, file_logging=True, path="%s.%s" % (args.output, C.LOG_NAME)) + logger = setup_main_logger(__name__, + console=not args.quiet, + file_logging=True, + path="%s.%s" % (args.output, C.LOG_NAME)) if args.checkpoints is not None: check_condition(len(args.checkpoints) == len(args.models), "must provide checkpoints for each model") diff --git a/test/common.py b/test/common.py index 43e7f2748..d7a2cb121 100644 --- a/test/common.py +++ b/test/common.py @@ -14,6 +14,7 @@ import os import random import sys +import logging from contextlib import contextmanager from tempfile import TemporaryDirectory from typing import Optional, Tuple @@ -33,6 +34,8 @@ from sockeye.evaluate import raw_corpus_bleu from sockeye.chrf import corpus_chrf +logger = logging.getLogger(__name__) + def gaussian_vector(shape, return_symbol=False): """ @@ -116,7 +119,7 @@ def generate_fast_align_lex(lex_path: str): print("{0}\t{0}\t0".format(digit), file=lex_out) -_LEXICON_PARAMS_COMMON = "-i {input} -m {model} -k 1 -o {json}" +_LEXICON_PARAMS_COMMON = "-i {input} -m {model} -k 1 -o {json} {quiet}" @contextmanager @@ -144,13 +147,13 @@ def tmp_digits_dataset(prefix: str, _TRAIN_PARAMS_COMMON = "--use-cpu --max-seq-len {max_len} --source {train_source} --target {train_target}" \ - " --validation-source {dev_source} --validation-target {dev_target} --output {model}" + " --validation-source {dev_source} --validation-target {dev_target} --output {model} {quiet}" -_TRANSLATE_PARAMS_COMMON = "--use-cpu --models {model} --input {input} --output {output}" +_TRANSLATE_PARAMS_COMMON = "--use-cpu --models {model} --input {input} --output {output} {quiet}" _TRANSLATE_PARAMS_RESTRICT = "--restrict-lexicon {json}" -_EVAL_PARAMS_COMMON = "--hypotheses {hypotheses} --references {references} --metrics {metrics}" +_EVAL_PARAMS_COMMON = "--hypotheses {hypotheses} --references {references} --metrics {metrics} {quiet}" def run_train_translate(train_params: str, @@ -162,7 +165,8 @@ def run_train_translate(train_params: str, dev_target_path: str, max_seq_len: int = 10, restrict_lexicon: bool = False, - work_dir: Optional[str] = None) -> Tuple[float, float, float, float]: + work_dir: Optional[str] = None, + quiet: bool = False) -> Tuple[float, float, float, float]: """ Train a model and translate a dev set. Report validation perplexity and BLEU. @@ -176,8 +180,13 @@ def run_train_translate(train_params: str, :param max_seq_len: The maximum sequence length. :param restrict_lexicon: Additional translation run with top-k lexicon-based vocabulary restriction. :param work_dir: The directory to store the model and other outputs in. 
+ :param quiet: Suppress the console output of training and decoding. :return: A tuple containing perplexity, bleu scores for standard and reduced vocab decoding, chrf score. """ + if quiet: + quiet_arg = "--quiet" + else: + quiet_arg = "" with TemporaryDirectory(dir=work_dir, prefix="test_train_translate.") as work_dir: # Train model model_path = os.path.join(work_dir, "model") @@ -187,17 +196,21 @@ def run_train_translate(train_params: str, dev_source=dev_source_path, dev_target=dev_target_path, model=model_path, - max_len=max_seq_len), + max_len=max_seq_len, + quiet=quiet_arg), train_params) + logger.info("Starting training with parameters %s.", train_params) with patch.object(sys, "argv", params.split()): sockeye.train.main() + logger.info("Translating with parameters %s.", translate_params) # Translate corpus with the 1st params out_path = os.path.join(work_dir, "out.txt") params = "{} {} {}".format(sockeye.translate.__file__, _TRANSLATE_PARAMS_COMMON.format(model=model_path, input=dev_source_path, - output=out_path), + output=out_path, + quiet=quiet_arg), translate_params) with patch.object(sys, "argv", params.split()): sockeye.translate.main() @@ -208,7 +221,8 @@ def run_train_translate(train_params: str, params = "{} {} {}".format(sockeye.translate.__file__, _TRANSLATE_PARAMS_COMMON.format(model=model_path, input=dev_source_path, - output=out_path_equiv), + output=out_path_equiv, + quiet=quiet_arg), translate_params_equiv) with patch.object(sys, "argv", params.split()): sockeye.translate.main() @@ -231,14 +245,16 @@ def run_train_translate(train_params: str, params = "{} {}".format(sockeye.lexicon.__file__, _LEXICON_PARAMS_COMMON.format(input=lex_path, model=model_path, - json=json_path)) + json=json_path, + quiet=quiet_arg)) with patch.object(sys, "argv", params.split()): sockeye.lexicon.main() # Translate corpus with restrict-lexicon params = "{} {} {} {}".format(sockeye.translate.__file__, _TRANSLATE_PARAMS_COMMON.format(model=model_path, input=dev_source_path, - output=out_restrict_path), + output=out_restrict_path, + quiet=quiet_arg), translate_params, _TRANSLATE_PARAMS_RESTRICT.format(json=json_path)) with patch.object(sys, "argv", params.split()): @@ -253,9 +269,9 @@ def run_train_translate(train_params: str, averaged_params = sockeye.average.average(points) assert averaged_params - # get last validation perplexity + # get best validation perplexity metrics = sockeye.utils.read_metrics_file(path=os.path.join(model_path, C.METRICS_NAME)) - perplexity = metrics[-1][C.PERPLEXITY + '-val'] + perplexity = min(m[C.PERPLEXITY + '-val'] for m in metrics) hypotheses = open(out_path, "r").readlines() references = open(dev_target_path, "r").readlines() @@ -272,7 +288,8 @@ def run_train_translate(train_params: str, eval_params = "{} {} ".format(sockeye.evaluate.__file__, _EVAL_PARAMS_COMMON.format(hypotheses=out_path, references=dev_target_path, - metrics="bleu chrf"), ) + metrics="bleu chrf", + quiet=quiet_arg)) with patch.object(sys, "argv", eval_params.split()): sockeye.evaluate.main() diff --git a/test/system/test_seq_copy_sys.py b/test/system/test_seq_copy_sys.py index 655625523..31d9c00cb 100644 --- a/test/system/test_seq_copy_sys.py +++ b/test/system/test_seq_copy_sys.py @@ -27,7 +27,7 @@ @pytest.mark.parametrize("name, train_params, translate_params, perplexity_thresh, bleu_thresh", [ - ("Copy:lstm", + ("Copy:lstm:lstm", "--encoder rnn --num-layers 1 --rnn-cell-type lstm --rnn-num-hidden 64 --num-embed 32 --rnn-attention-type mlp" " --rnn-attention-num-hidden 32 --batch-size 16 
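
(Illustration only: the flag travels as a plain string so it can be spliced into the CLI templates above. The template below is _EVAL_PARAMS_COMMON from this file; the file names are made up.)

quiet = True
quiet_arg = "--quiet" if quiet else ""
template = "--hypotheses {hypotheses} --references {references} --metrics {metrics} {quiet}"
cmd = template.format(hypotheses="hyp.txt", references="ref.txt",
                      metrics="bleu chrf", quiet=quiet_arg)
assert cmd.endswith("--quiet")
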
--loss cross-entropy --optimized-metric perplexity" " --checkpoint-frequency 1000 --optimizer adam --initial-learning-rate 0.001" @@ -80,7 +80,7 @@ "--beam-size 1", 1.01, 0.999), - ("Copy:cnn", + ("Copy:cnn:cnn", "--encoder cnn --decoder cnn " " --batch-size 16 --num-layers 3 --max-updates 3000" " --cnn-num-hidden 32 --cnn-positional-embedding-type fixed" @@ -154,7 +154,7 @@ def test_seq_copy(name, train_params, translate_params, perplexity_thresh, bleu_ " --transformer-feed-forward-num-hidden 64" " --checkpoint-frequency 1000 --optimizer adam --initial-learning-rate 0.001", "--beam-size 1", - 1.01, + 1.02, 0.99), ("Sort:cnn", "--encoder cnn --decoder cnn " @@ -162,8 +162,8 @@ def test_seq_copy(name, train_params, translate_params, perplexity_thresh, bleu_ " --cnn-num-hidden 32 --cnn-positional-embedding-type fixed" " --checkpoint-frequency 1000 --optimizer adam --initial-learning-rate 0.001", "--beam-size 1", - 1.10, - 0.93) + 1.05, + 0.97) ]) def test_seq_sort(name, train_params, translate_params, perplexity_thresh, bleu_thresh): """Task: sort short sequences of digits""" diff --git a/test/unit/test_arguments.py b/test/unit/test_arguments.py index 9f5457aa1..09ba6e2e5 100644 --- a/test/unit/test_arguments.py +++ b/test/unit/test_arguments.py @@ -20,6 +20,7 @@ from itertools import zip_longest + @pytest.mark.parametrize("test_params, expected_params", [ # mandatory parameters ('--source test_src --target test_tgt ' @@ -28,7 +29,7 @@ dict(source='test_src', target='test_tgt', limit=None, validation_source='test_validation_src', validation_target='test_validation_tgt', output='test_output', overwrite_output=False, - source_vocab=None, target_vocab=None, use_tensorboard=False, quiet=False, + source_vocab=None, target_vocab=None, use_tensorboard=False, monitor_pattern=None, monitor_stat_func='mx_default')), # all parameters @@ -36,27 +37,34 @@ '--validation-source test_validation_src --validation-target test_validation_tgt ' '--output test_output ' '--source-vocab test_src_vocab --target-vocab test_tgt_vocab ' - '--use-tensorboard --overwrite-output --quiet', + '--use-tensorboard --overwrite-output', dict(source='test_src', target='test_tgt', limit=10, validation_source='test_validation_src', validation_target='test_validation_tgt', output='test_output', overwrite_output=True, - source_vocab='test_src_vocab', target_vocab='test_tgt_vocab', use_tensorboard=True, quiet=True, + source_vocab='test_src_vocab', target_vocab='test_tgt_vocab', use_tensorboard=True, monitor_pattern=None, monitor_stat_func='mx_default')), # short parameters ('-s test_src -t test_tgt ' '-vs test_validation_src -vt test_validation_tgt ' - '-o test_output -q', + '-o test_output', dict(source='test_src', target='test_tgt', limit=None, validation_source='test_validation_src', validation_target='test_validation_tgt', output='test_output', overwrite_output=False, - source_vocab=None, target_vocab=None, use_tensorboard=False, quiet=True, + source_vocab=None, target_vocab=None, use_tensorboard=False, monitor_pattern=None, monitor_stat_func='mx_default')) ]) def test_io_args(test_params, expected_params): _test_args(test_params, expected_params, arguments.add_io_args) +@pytest.mark.parametrize("test_params, expected_params", [ + ('', dict(quiet=False)), +]) +def test_logging_args(test_params, expected_params): + _test_args(test_params, expected_params, arguments.add_logging_args) + + @pytest.mark.parametrize("test_params, expected_params", [ ('', dict(device_ids=[-1], use_cpu=False, disable_device_locking=False, 
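
(Why the ids gained a second component: pytest's -k option matches substrings of the generated test ids, so "Copy:<encoder>:<decoder>" lets the per-commit Travis jobs above select exactly one model family. A minimal sketch of that mechanism, independent of this suite:)

import pytest

@pytest.mark.parametrize("name", ["Copy:lstm:lstm", "Copy:cnn:cnn"])
def test_id_demo(name):
    assert name.startswith("Copy:")

# "pytest -k Copy:lstm:lstm" collects only the first case, because its
# generated id, test_id_demo[Copy:lstm:lstm], contains that substring.
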
lock_dir='/tmp')), ('--device-ids 1 2 3 --use-cpu --disable-device-locking --lock-dir test_dir',
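
(For orientation, a sketch of what the _test_args helper these cases feed into boils down to; the real helper lives in this file and its exact comparison, e.g. via zip_longest, may differ.)

import argparse

def _test_args_sketch(test_params, expected_params, add_args_fn):
    # Build a fresh parser, attach only the argument group under test,
    # then compare the parsed namespace against the expected dict.
    parser = argparse.ArgumentParser()
    add_args_fn(parser)
    parsed = vars(parser.parse_args(test_params.split()))
    assert parsed == expected_params
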