diff --git a/.travis.yml b/.travis.yml index 92f5a74262..b97c88a12d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ before_install: # Useful for debugging any issues with conda - conda info -a # freeze the supported pytorch version for consistency - - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pytorch=0.3.0 -c soumith + - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pytorch=0.4.0 -c soumith - source activate test-environment # use requirements.txt for dependencies - pip install -r requirements.txt @@ -27,41 +27,42 @@ install: # Please also add tests to `test/pull_request_chk.sh`. script: - - wget -O /tmp/im2text.tgz http://lstm.seas.harvard.edu/latex/im2text_small.tgz; tar zxf /tmp/im2text.tgz -C /tmp/; head /tmp/im2text/src-train.txt > /tmp/im2text/src-train-head.txt; head /tmp/im2text/tgt-train.txt > /tmp/im2text/tgt-train-head.txt; head /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt - - wget -O /tmp/speech.tgz http://lstm.seas.harvard.edu/latex/speech.tgz; tar zxf /tmp/speech.tgz -C /tmp/; head /tmp/speech/src-train.txt > /tmp/speech/src-train-head.txt; head /tmp/speech/tgt-train.txt > /tmp/speech/tgt-train-head.txt; head /tmp/speech/src-val.txt > /tmp/speech/src-val-head.txt; head /tmp/speech/tgt-val.txt > /tmp/speech/tgt-val-head.txt - - wget -O /tmp/test_model_speech.pt http://lstm.seas.harvard.edu/latex/test_model_speech.pt - - wget -O /tmp/test_model_im2text.pt http://lstm.seas.harvard.edu/latex/test_model_im2text.pt - - python -m unittest discover + - onmt/tests/test_models.sh lstm cnn transformer + #- wget -O /tmp/im2text.tgz http://lstm.seas.harvard.edu/latex/im2text_small.tgz; tar zxf /tmp/im2text.tgz -C /tmp/; head /tmp/im2text/src-train.txt > /tmp/im2text/src-train-head.txt; head /tmp/im2text/tgt-train.txt > /tmp/im2text/tgt-train-head.txt; head /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt + #- wget -O /tmp/speech.tgz http://lstm.seas.harvard.edu/latex/speech.tgz; tar zxf /tmp/speech.tgz -C /tmp/; head /tmp/speech/src-train.txt > /tmp/speech/src-train-head.txt; head /tmp/speech/tgt-train.txt > /tmp/speech/tgt-train-head.txt; head /tmp/speech/src-val.txt > /tmp/speech/src-val-head.txt; head /tmp/speech/tgt-val.txt > /tmp/speech/tgt-val-head.txt + #- wget -O /tmp/test_model_speech.pt http://lstm.seas.harvard.edu/latex/test_model_speech.pt + #- wget -O /tmp/test_model_im2text.pt http://lstm.seas.harvard.edu/latex/test_model_im2text.pt + #- python -m unittest discover # test nmt preprocessing - - python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data /tmp/data -src_vocab_size 1000 -tgt_vocab_size 1000 && rm -rf /tmp/data*.pt + #- python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data /tmp/data -src_vocab_size 1000 -tgt_vocab_size 1000 && rm -rf /tmp/data*.pt # test im2text preprocessing - - python preprocess.py -data_type img -src_dir /tmp/im2text/images -train_src /tmp/im2text/src-train.txt -train_tgt /tmp/im2text/tgt-train.txt -valid_src /tmp/im2text/src-val.txt -valid_tgt /tmp/im2text/tgt-val.txt -save_data /tmp/im2text/data && rm -rf /tmp/im2text/data*.pt + #- python preprocess.py -data_type img -src_dir /tmp/im2text/images -train_src /tmp/im2text/src-train.txt -train_tgt 
/tmp/im2text/tgt-train.txt -valid_src /tmp/im2text/src-val.txt -valid_tgt /tmp/im2text/tgt-val.txt -save_data /tmp/im2text/data && rm -rf /tmp/im2text/data*.pt # test speech2text preprocessing - - python preprocess.py -data_type audio -src_dir /tmp/speech/an4_dataset -train_src /tmp/speech/src-train.txt -train_tgt /tmp/speech/tgt-train.txt -valid_src /tmp/speech/src-val.txt -valid_tgt /tmp/speech/tgt-val.txt -save_data /tmp/speech/data && rm -rf /tmp/speech/data*.pt + #- python preprocess.py -data_type audio -src_dir /tmp/speech/an4_dataset -train_src /tmp/speech/src-train.txt -train_tgt /tmp/speech/tgt-train.txt -valid_src /tmp/speech/src-val.txt -valid_tgt /tmp/speech/tgt-val.txt -save_data /tmp/speech/data && rm -rf /tmp/speech/data*.pt # test nmt translation - - head data/src-test.txt > /tmp/src-test.txt; python translate.py -model test/test_model.pt -src /tmp/src-test.txt -verbose + #- head data/src-test.txt > /tmp/src-test.txt; python translate.py -model onmt/tests/test_model.pt -src /tmp/src-test.txt -verbose # test im2text translation - - head /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt; python translate.py -data_type img -src_dir /tmp/im2text/images -model /tmp/test_model_im2text.pt -src /tmp/im2text/src-val-head.txt -tgt /tmp/im2text/tgt-val-head.txt -verbose -out /tmp/im2text/trans + #- head /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt; python translate.py -data_type img -src_dir /tmp/im2text/images -model /tmp/test_model_im2text.pt -src /tmp/im2text/src-val-head.txt -tgt /tmp/im2text/tgt-val-head.txt -verbose -out /tmp/im2text/trans # test speech2text translation - - head /tmp/speech/src-val.txt > /tmp/speech/src-val-head.txt; head /tmp/speech/tgt-val.txt > /tmp/speech/tgt-val-head.txt; python translate.py -data_type audio -src_dir /tmp/speech/an4_dataset -model /tmp/test_model_speech.pt -src /tmp/speech/src-val-head.txt -tgt /tmp/speech/tgt-val-head.txt -verbose -out /tmp/speech/trans; diff /tmp/speech/tgt-val-head.txt /tmp/speech/trans + #- head /tmp/speech/src-val.txt > /tmp/speech/src-val-head.txt; head /tmp/speech/tgt-val.txt > /tmp/speech/tgt-val-head.txt; python translate.py -data_type audio -src_dir /tmp/speech/an4_dataset -model /tmp/test_model_speech.pt -src /tmp/speech/src-val-head.txt -tgt /tmp/speech/tgt-val-head.txt -verbose -out /tmp/speech/trans; diff /tmp/speech/tgt-val-head.txt /tmp/speech/trans # test nmt preprocessing and training - - head data/src-val.txt > /tmp/src-val.txt; head data/tgt-val.txt > /tmp/tgt-val.txt; python preprocess.py -train_src /tmp/src-val.txt -train_tgt /tmp/tgt-val.txt -valid_src /tmp/src-val.txt -valid_tgt /tmp/tgt-val.txt -save_data /tmp/q -src_vocab_size 1000 -tgt_vocab_size 1000; python train.py -data /tmp/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 1000 && rm -rf /tmp/q*.pt + #- head data/src-val.txt > /tmp/src-val.txt; head data/tgt-val.txt > /tmp/tgt-val.txt; python preprocess.py -train_src /tmp/src-val.txt -train_tgt /tmp/tgt-val.txt -valid_src /tmp/src-val.txt -valid_tgt /tmp/tgt-val.txt -save_data /tmp/q -src_vocab_size 1000 -tgt_vocab_size 1000; python train.py -data /tmp/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 50 -rnn_size 10 -train_steps 500 && rm -rf /tmp/q*.pt # test nmt preprocessing w/ sharding and training w/copy - - head data/src-val.txt > /tmp/src-val.txt; head data/tgt-val.txt > /tmp/tgt-val.txt; python 
preprocess.py -train_src /tmp/src-val.txt -train_tgt /tmp/tgt-val.txt -valid_src /tmp/src-val.txt -valid_tgt /tmp/tgt-val.txt -max_shard_size 1 -dynamic_dict -save_data /tmp/q -src_vocab_size 1000 -tgt_vocab_size 1000; python train.py -data /tmp/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -copy_attn -train_steps 1000 && rm -rf /tmp/q*.pt + #- head data/src-val.txt > /tmp/src-val.txt; head data/tgt-val.txt > /tmp/tgt-val.txt; python preprocess.py -train_src /tmp/src-val.txt -train_tgt /tmp/tgt-val.txt -valid_src /tmp/src-val.txt -valid_tgt /tmp/tgt-val.txt -max_shard_size 1 -dynamic_dict -save_data /tmp/q -src_vocab_size 1000 -tgt_vocab_size 1000; python train.py -data /tmp/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 50 -rnn_size 10 -copy_attn -train_steps 500 && rm -rf /tmp/q*.pt # test im2text preprocessing and training - - head /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt; python preprocess.py -data_type img -src_dir /tmp/im2text/images -train_src /tmp/im2text/src-val-head.txt -train_tgt /tmp/im2text/tgt-val-head.txt -valid_src /tmp/im2text/src-val-head.txt -valid_tgt /tmp/im2text/tgt-val-head.txt -save_data /tmp/im2text/q; python train.py -model_type img -data /tmp/im2text/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 1000 && rm -rf /tmp/im2text/q*.pt + #- head /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt; python preprocess.py -data_type img -src_dir /tmp/im2text/images -train_src /tmp/im2text/src-val-head.txt -train_tgt /tmp/im2text/tgt-val-head.txt -valid_src /tmp/im2text/src-val-head.txt -valid_tgt /tmp/im2text/tgt-val-head.txt -save_data /tmp/im2text/q; python train.py -model_type img -data /tmp/im2text/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 1000 && rm -rf /tmp/im2text/q*.pt # test speech2text preprocessing and training - - head /tmp/speech/src-val.txt > /tmp/speech/src-val-head.txt; head /tmp/speech/tgt-val.txt > /tmp/speech/tgt-val-head.txt; python preprocess.py -data_type audio -src_dir /tmp/speech/an4_dataset -train_src /tmp/speech/src-val-head.txt -train_tgt /tmp/speech/tgt-val-head.txt -valid_src /tmp/speech/src-val-head.txt -valid_tgt /tmp/speech/tgt-val-head.txt -save_data /tmp/speech/q; python train.py -model_type audio -data /tmp/speech/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 1000 && rm -rf /tmp/speech/q*.pt + #- head /tmp/speech/src-val.txt > /tmp/speech/src-val-head.txt; head /tmp/speech/tgt-val.txt > /tmp/speech/tgt-val-head.txt; python preprocess.py -data_type audio -src_dir /tmp/speech/an4_dataset -train_src /tmp/speech/src-val-head.txt -train_tgt /tmp/speech/tgt-val-head.txt -valid_src /tmp/speech/src-val-head.txt -valid_tgt /tmp/speech/tgt-val-head.txt -save_data /tmp/speech/q; python train.py -model_type audio -data /tmp/speech/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 1000 && rm -rf /tmp/speech/q*.pt # test nmt translation - - python translate.py -model test/test_model2.pt -src data/morph/src.valid -verbose -batch_size 10 -beam_size 10 -tgt data/morph/tgt.valid -out /tmp/trans; diff data/morph/tgt.valid /tmp/trans + #- python translate.py -model test/test_model2.pt -src data/morph/src.valid -verbose -batch_size 10 -beam_size 10 -tgt data/morph/tgt.valid -out /tmp/trans; diff 
data/morph/tgt.valid /tmp/trans # test tool - - PYTHONPATH=$PYTHONPATH:. python tools/extract_embeddings.py -model test/test_model.pt + #- PYTHONPATH=$PYTHONPATH:. python tools/extract_embeddings.py -model test/test_model.pt env: global: # Doctr deploy key for OpenNMT/OpenNMT-py - - secure: "gL0Soefo1cQgAqwiHUrlNyZd/+SI1eJAAjLD3BEDQWXW160eXyjQAAujGgJoCirjOM7cPHVwLzwmK3S7Y3PVM3JOZguOX5Yl4uxMh/mhiEM+RG77SZyv4OGoLFsEQ8RTvIdYdtP6AwyjlkRDXvZql88TqFNYjpXDu8NG+JwEfiIoGIDYxxZ5SlbrZN0IqmQSZ4/CsV6VQiuq99Jn5kqi4MnUZBTcmhqjaztCP1omvsMRdbrG2IVhDKQOCDIO0kaPJrMy2SGzP4GV7ar52bdBtpeP3Xbm6ZOuhDNfds7M/OMHp1wGdl7XwKtolw9MeXhnGBC4gcrqhhMfcQ6XtfVLMLnsB09Ezl3FXX5zWgTB5Pm0X6TgnGrMA25MAdVqKGJpfqZxOKTh4EMb04b6OXrVbxZ88mp+V0NopuxwlTPD8PMfYLWlTe9chh1BnT0iQlLqeA4Hv3+NdpiFb4aq3V3cWTTgMqOoWSGq4t318pqIZ3qbBXBq12DLFgO5n6+M6ZrdxbDUGQvgh8nAiZcIEdodKJ4ABHi1SNCeWOzCoedUdegcbjShHfkMVmNKrncB18aRWwQ3GQJ5qdkjgJmC++uZmkS6+GPM8UmmAy1ZIkRW0aWiitjG6teqtvUHOofNd/TCxX4bhnxAj+mtVIrARCE/ci8topJ6uG4wVJ1TrIkUlAY=" + #- secure: "gL0Soefo1cQgAqwiHUrlNyZd/+SI1eJAAjLD3BEDQWXW160eXyjQAAujGgJoCirjOM7cPHVwLzwmK3S7Y3PVM3JOZguOX5Yl4uxMh/mhiEM+RG77SZyv4OGoLFsEQ8RTvIdYdtP6AwyjlkRDXvZql88TqFNYjpXDu8NG+JwEfiIoGIDYxxZ5SlbrZN0IqmQSZ4/CsV6VQiuq99Jn5kqi4MnUZBTcmhqjaztCP1omvsMRdbrG2IVhDKQOCDIO0kaPJrMy2SGzP4GV7ar52bdBtpeP3Xbm6ZOuhDNfds7M/OMHp1wGdl7XwKtolw9MeXhnGBC4gcrqhhMfcQ6XtfVLMLnsB09Ezl3FXX5zWgTB5Pm0X6TgnGrMA25MAdVqKGJpfqZxOKTh4EMb04b6OXrVbxZ88mp+V0NopuxwlTPD8PMfYLWlTe9chh1BnT0iQlLqeA4Hv3+NdpiFb4aq3V3cWTTgMqOoWSGq4t318pqIZ3qbBXBq12DLFgO5n6+M6ZrdxbDUGQvgh8nAiZcIEdodKJ4ABHi1SNCeWOzCoedUdegcbjShHfkMVmNKrncB18aRWwQ3GQJ5qdkjgJmC++uZmkS6+GPM8UmmAy1ZIkRW0aWiitjG6teqtvUHOofNd/TCxX4bhnxAj+mtVIrARCE/ci8topJ6uG4wVJ1TrIkUlAY=" matrix: include: @@ -77,4 +78,4 @@ matrix: - pip install -r docs/requirements.txt - cd docs/; make html; cd .. - set -e - - doctr deploy --built-docs docs/build/html/ . + #- doctr deploy --built-docs docs/build/html/ . diff --git a/README.md b/README.md index dbf0e9016b..7e76c26ebc 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,15 @@ # OpenNMT-py: Open-Source Neural Machine Translation -[![Build Status](https://travis-ci.org/OpenNMT/OpenNMT-py.svg?branch=master)](https://travis-ci.org/OpenNMT/OpenNMT-py) +[![Build Status](https://travis-ci.org/Ubiqus/OpenNMT-py.svg?branch=master)](https://travis-ci.org/Ubiqus/OpenNMT-py) -[OpenNMT](https://opennmt.net) is an open-source (MIT) neural machine translation system which has 3 different implementations. +This is a fork of OpenNMT-py +Multi-GPU supported with Torch Distributed (pytorch 0.4) -The genuine one was a Lua version based on the Harvard Seq2Seq framework. [OpenNMT-Lua](https://github.com/OpenNMT/OpenNMT) +See major changes here: https://github.com/Ubiqus/OpenNMT-py/releases -The [Pytorch](https://github.com/pytorch/pytorch) version is this repo. +Script for upgrading existing pytorch 0.3 models: tools/03to04.py -The tensorflow version: [OpenNMT-tf](https://github.com/OpenNMT/OpenNMT-tf) - - -OpenNMT-py is designed to be research friendly to try out new ideas in translation, summary, image-to-text, morphology, and many other domains but also ready for production with a full REST API. - -Codebase is relatively stable, but PyTorch is still evolving. We currently recommend forking if you need to have stable code. - -OpenNMT-py is run as a collaborative open-source project. The original code was written by [Adam Lerer](http://github.com/adamlerer) (NYC) and [Bryan McCann](https://github.com/bmccann). 
-Major contributions have come from [Sasha Rush](http://github.com/srush) and his group (Cambridge, MA), [Ben Peters](http://github.com/bpopeters) (Saarbrücken), [Jianyu Zhan](http://github.com/jianyuzhan) (Shenzhen), [Paul Tardy](https://github.com/pltrdy) , [Vincent Nguyen](https://github.com/vince62s) and many others. - -We love contributions. Please consult the Issues page for any [Contributions Welcome](https://github.com/OpenNMT/OpenNMT-py/issues?q=is%3Aissue+is%3Aopen+label%3A%22contributions+welcome%22) tagged post. - -
- - -Table of Contents -================= - * [Full Documentation](http://opennmt.net/OpenNMT-py/) - * [Requirements](#requirements) - * [Features](#features) - * [Quickstart](#quickstart) - * [Citation](#citation) - ## Requirements python 3, torch >=0.4.0, torchtext >=0.2.3, six, tqdm, future, cupy pynvrtc for SRU @@ -120,29 +98,4 @@ Now you have a model which you can use to predict on new data. We do this by run Go to tutorial: [How to use GloVe pre-trained embeddings in OpenNMT-py](http://forum.opennmt.net/t/how-to-use-glove-pre-trained-embeddings-in-opennmt-py/1011) -## Pretrained Models -The following pretrained models can be downloaded and used with translate.py. - -http://opennmt.net/Models-py/ - - - -## Citation - -[OpenNMT technical report](https://doi.org/10.18653/v1/P17-4012) - -``` -@inproceedings{opennmt, - author = {Guillaume Klein and - Yoon Kim and - Yuntian Deng and - Jean Senellart and - Alexander M. Rush}, - title = {OpenNMT: Open-Source Toolkit for Neural Machine Translation}, - booktitle = {Proc. ACL}, - year = {2017}, - url = {https://doi.org/10.18653/v1/P17-4012}, - doi = {10.18653/v1/P17-4012} -} -``` diff --git a/onmt/__init__.py b/onmt/__init__.py index 49e1d19c88..944707eb5e 100644 --- a/onmt/__init__.py +++ b/onmt/__init__.py @@ -9,4 +9,8 @@ import onmt.modules from onmt.trainer import Trainer +# For Flake +__all__ = [onmt.inputters, onmt.encoders, onmt.decoders, onmt.models, + onmt.utils, onmt.modules, "Trainer"] + __version__ = "0.4.0" diff --git a/onmt/decoders/transformer.py b/onmt/decoders/transformer.py index 3289ec6fc6..7062dc5d34 100644 --- a/onmt/decoders/transformer.py +++ b/onmt/decoders/transformer.py @@ -81,10 +81,10 @@ def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, if self.self_attn_type == "scaled-dot": query, attn = self.self_attn(all_input, all_input, input_norm, - mask=dec_mask) + mask=dec_mask) elif self.self_attn_type == "average": - query, attn = self.self_attn(input_norm, - mask=dec_mask, layer_cache=layer_cache, step=step) + query, attn = self.self_attn(input_norm, mask=dec_mask, + layer_cache=layer_cache, step=step) query = self.drop(query) + inputs @@ -154,7 +154,8 @@ def __init__(self, num_layers, hidden_size, attn_type, # Build TransformerDecoder. self.transformer_layers = nn.ModuleList( - [TransformerDecoderLayer(hidden_size, dropout, self_attn_type=self_attn_type) + [TransformerDecoderLayer(hidden_size, dropout, + self_attn_type=self_attn_type) for _ in range(num_layers)]) # TransformerDecoder has its own attention mechanism. @@ -166,7 +167,6 @@ def __init__(self, num_layers, hidden_size, attn_type, self._copy = True self.layer_norm = onmt.modules.LayerNorm(hidden_size) - def _init_cache(self, memory_bank, memory_lengths=None): cache = {} batch_size = memory_bank.size(1) @@ -176,7 +176,8 @@ def _init_cache(self, memory_bank, memory_lengths=None): cache["layer_{}".format(l)] = layer_cache return cache - def forward(self, tgt, memory_bank, state, memory_lengths=None, step=None, cache=None): + def forward(self, tgt, memory_bank, state, memory_lengths=None, + step=None, cache=None): """ See :obj:`onmt.modules.RNNDecoderBase.forward()` """ @@ -229,7 +230,9 @@ def forward(self, tgt, memory_bank, state, memory_lengths=None, step=None, cache = self.transformer_layers[i](output, src_memory_bank, src_pad_mask, tgt_pad_mask, previous_input=prev_layer_input, - layer_cache=cache["layer_{}".format(i)] if cache is not None else None, + layer_cache=cache["layer_{}". 
+ format(i)] + if cache is not None else None, step=step) saved_inputs.append(all_input) diff --git a/onmt/inputters/inputter.py b/onmt/inputters/inputter.py index 5de1ea0f1b..e2c1b65c04 100644 --- a/onmt/inputters/inputter.py +++ b/onmt/inputters/inputter.py @@ -457,7 +457,7 @@ def _next_dataset_iterator(self, dataset_iter): def build_dataset_iter(datasets, fields, opt, is_train=True): """ This returns user-defined train/validate data iterator for the trainer - to iterate over. We implement simple ordered iterator strategy here, + to iterate over. We implement simple ordered iterator strategy here, but more sophisticated strategy like curriculum learning is ok too. """ batch_size = opt.batch_size if is_train else opt.valid_batch_size diff --git a/onmt/models/model_saver.py b/onmt/models/model_saver.py index 941af39da8..2ecc4982f4 100644 --- a/onmt/models/model_saver.py +++ b/onmt/models/model_saver.py @@ -76,7 +76,7 @@ def _rm_checkpoint(self, name): Remove a checkpoint Args: - name(str): name that indentifies the checkpoint + name(str): name that indentifies the checkpoint (it may be a filepath) """ raise NotImplementedError() diff --git a/onmt/modules/average_attn.py b/onmt/modules/average_attn.py index f6fc5ed62d..1520cb377d 100644 --- a/onmt/modules/average_attn.py +++ b/onmt/modules/average_attn.py @@ -1,5 +1,4 @@ """ Average Attention module """ -import math import torch import torch.nn as nn @@ -23,48 +22,54 @@ def __init__(self, model_dim, dropout=0.1): super(AverageAttention, self).__init__() - self.average_layer = PositionwiseFeedForward(model_dim, model_dim, dropout) + self.average_layer = PositionwiseFeedForward(model_dim, model_dim, + dropout) self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2) - def cumulative_average_mask(self, batch_size, inputs_len): - """ - Builds the mask to compute the cumulative average as described in - https://arxiv.org/abs/1805.00631 -- Figure 3 - - Args: - inputs_len: length of the inputs. - - Returns: - A Tensor of shape [batch_size, input_len, input_len] - """ - - triangle = torch.tril(torch.ones((inputs_len, inputs_len))) - weights = torch.ones((1, inputs_len)) / torch.arange(1, inputs_len + 1) - mask = triangle * weights.transpose(0,1) - - return mask.unsqueeze(0).expand(batch_size, inputs_len, inputs_len) - - def cumulative_average(self, inputs, mask_or_step, layer_cache=None, step=None): - """ - Computes the cumulative average as described in - https://arxiv.org/abs/1805.00631 -- Equations (1) (5) (6) - - Args: - inputs: sequence to average -- Tensor of shape [batch_size, input_len, dimension] - mask_or_step: if cache is set, this is assumed to be the current step of the - dynamic decoding. Otherwise, it is the mask matrix used to compute the cumulative average. - cache: a dictionary containing the cumulative average of the previous step. - """ - if layer_cache is not None: - step = mask_or_step - device = inputs.device - average_attention = (inputs + step * layer_cache["prev_g"].to(device)) / (step + 1) - layer_cache["prev_g"] = average_attention - return average_attention - else: - mask = mask_or_step - return torch.matmul(mask, inputs) + """ + Builds the mask to compute the cumulative average as described in + https://arxiv.org/abs/1805.00631 -- Figure 3 + + Args: + inputs_len: length of the inputs. 
+ + Returns: + A Tensor of shape [batch_size, input_len, input_len] + """ + + triangle = torch.tril(torch.ones((inputs_len, inputs_len))) + weights = torch.ones((1, inputs_len)) / torch.arange(1, inputs_len + 1) + mask = triangle * weights.transpose(0, 1) + + return mask.unsqueeze(0).expand(batch_size, inputs_len, inputs_len) + + def cumulative_average(self, inputs, mask_or_step, + layer_cache=None, step=None): + """ + Computes the cumulative average as described in + https://arxiv.org/abs/1805.00631 -- Equations (1) (5) (6) + + Args: + inputs: sequence to average -- Tensor of shape + [batch_size, input_len, dimension] + mask_or_step: if cache is set, this is assumed + to be the current step of the + dynamic decoding. Otherwise, it is the mask matrix + used to compute the cumulative average. + cache: a dictionary containing the cumulative average + of the previous step. + """ + if layer_cache is not None: + step = mask_or_step + device = inputs.device + average_attention = (inputs + step * + layer_cache["prev_g"].to(device)) / (step + 1) + layer_cache["prev_g"] = average_attention + return average_attention + else: + mask = mask_or_step + return torch.matmul(mask, inputs) def forward(self, inputs, mask=None, layer_cache=None, step=None): @@ -72,12 +77,15 @@ def forward(self, inputs, mask=None, layer_cache=None, step=None): inputs_len = inputs.size(1) device = inputs.device - average_outputs = self.cumulative_average(inputs, - self.cumulative_average_mask(batch_size, inputs_len).to(device).float() if layer_cache is None else step, - layer_cache=layer_cache) + average_outputs = self.cumulative_average( + inputs, self.cumulative_average_mask(batch_size, + inputs_len).to(device).float() + if layer_cache is None else step, layer_cache=layer_cache) average_outputs = self.average_layer(average_outputs) - gating_outputs = self.gating_layer(torch.cat((inputs, average_outputs), -1)) + gating_outputs = self.gating_layer(torch.cat((inputs, + average_outputs), -1)) input_gate, forget_gate = torch.chunk(gating_outputs, 2, dim=2) - gating_outputs = torch.sigmoid(input_gate) * inputs + torch.sigmoid(forget_gate) * average_outputs + gating_outputs = torch.sigmoid(input_gate) * inputs + \ + torch.sigmoid(forget_gate) * average_outputs - return gating_outputs, None \ No newline at end of file + return gating_outputs, None diff --git a/onmt/modules/weight_norm.py b/onmt/modules/weight_norm.py index 214a12469e..c46868f0c0 100644 --- a/onmt/modules/weight_norm.py +++ b/onmt/modules/weight_norm.py @@ -88,12 +88,12 @@ def forward(self, x, init=False): self.b_avg.copy_(self.b.data) return Variable(x_init) else: - V, g, b = get_vars_maybe_avg(self, ['V', 'g', 'b'], + v, g, b = get_vars_maybe_avg(self, ['V', 'g', 'b'], self.training, polyak_decay=self.polyak_decay) # batch_size * out_features - x = F.linear(x, V) - scalar = g / torch.norm(V, 2, 1).squeeze(1) + x = F.linear(x, v) + scalar = g / torch.norm(v, 2, 1).squeeze(1) x = scalar.view(1, -1).expand_as(x) * x + \ b.view(1, -1).expand_as(x) return x @@ -232,14 +232,14 @@ def forward(self, x, init=False): self.b_avg.copy_(self.b.data) return Variable(x_init) else: - V, g, b = get_vars_maybe_avg( + v, g, b = get_vars_maybe_avg( self, ['V', 'g', 'b'], self.training, polyak_decay=self.polyak_decay) scalar = g / \ - torch.norm(V.transpose(0, 1).contiguous().view( + torch.norm(v.transpose(0, 1).contiguous().view( self.out_channels, -1), 2, 1).squeeze(1) w = scalar.view(self.in_channels, self.out_channels, - *([1] * (len(V.size()) - 2))).expand_as(V) * V + *([1] * 
(len(v.size()) - 2))).expand_as(v) * v x = F.conv_transpose2d(x, w, b, self.stride, self.padding, self.output_padding, diff --git a/onmt/tests/test_models.py b/onmt/tests/test_models.py index d107210996..b8e5027c32 100644 --- a/onmt/tests/test_models.py +++ b/onmt/tests/test_models.py @@ -4,13 +4,12 @@ import math import torch -from torch.autograd import Variable import onmt import onmt.inputters import onmt.opts -from onmt.model_constructor import make_embeddings, \ - make_encoder, make_decoder +from onmt.model_builder import build_embeddings, \ + build_encoder, build_decoder from onmt.encoders.image_encoder import ImageEncoder from onmt.encoders.audio_encoder import AudioEncoder @@ -37,15 +36,15 @@ def get_vocab(self): def get_batch(self, source_l=3, bsize=1): # len x batch x nfeat - test_src = Variable(torch.ones(source_l, bsize, 1)).long() - test_tgt = Variable(torch.ones(source_l, bsize, 1)).long() + test_src = torch.ones(source_l, bsize, 1).long() + test_tgt = torch.ones(source_l, bsize, 1).long() test_length = torch.ones(bsize).fill_(source_l).long() return test_src, test_tgt, test_length def get_batch_image(self, tgt_l=3, bsize=1, h=15, w=17): # batch x c x h x w - test_src = Variable(torch.ones(bsize, 3, h, w)).float() - test_tgt = Variable(torch.ones(tgt_l, bsize, 1)).long() + test_src = torch.ones(bsize, 3, h, w).float() + test_tgt = torch.ones(tgt_l, bsize, 1).long() test_length = None return test_src, test_tgt, test_length @@ -53,8 +52,8 @@ def get_batch_audio(self, tgt_l=3, bsize=1, sample_rate=5500, window_size=0.03, t=37): # batch x 1 x nfft x t nfft = int(math.floor((sample_rate * window_size) / 2) + 1) - test_src = Variable(torch.ones(bsize, 1, nfft, t)).float() - test_tgt = Variable(torch.ones(tgt_l, bsize, 1)).long() + test_src = torch.ones(bsize, 1, nfft, t).float() + test_tgt = torch.ones(tgt_l, bsize, 1).long() test_length = None return test_src, test_tgt, test_length @@ -69,7 +68,7 @@ def embeddings_forward(self, opt, source_l=3, bsize=1): ''' word_dict = self.get_vocab() feature_dicts = [] - emb = make_embeddings(opt, word_dict, feature_dicts) + emb = build_embeddings(opt, word_dict, feature_dicts) test_src, _, __ = self.get_batch(source_l=source_l, bsize=bsize) if opt.decoder_type == 'transformer': @@ -94,8 +93,8 @@ def encoder_forward(self, opt, source_l=3, bsize=1): ''' word_dict = self.get_vocab() feature_dicts = [] - embeddings = make_embeddings(opt, word_dict, feature_dicts) - enc = make_encoder(opt, embeddings) + embeddings = build_embeddings(opt, word_dict, feature_dicts) + enc = build_encoder(opt, embeddings) test_src, test_tgt, test_length = self.get_batch(source_l=source_l, bsize=bsize) @@ -111,8 +110,7 @@ def encoder_forward(self, opt, source_l=3, bsize=1): hidden_t[0].size(), hidden_t[1].size()) self.assertEqual(test_out.size(), outputs.size()) - self.assertEqual(type(outputs), torch.autograd.Variable) - self.assertEqual(type(outputs.data), torch.FloatTensor) + self.assertEqual(type(outputs), torch.Tensor) def nmtmodel_forward(self, opt, source_l=3, bsize=1): """ @@ -127,12 +125,12 @@ def nmtmodel_forward(self, opt, source_l=3, bsize=1): word_dict = self.get_vocab() feature_dicts = [] - embeddings = make_embeddings(opt, word_dict, feature_dicts) - enc = make_encoder(opt, embeddings) + embeddings = build_embeddings(opt, word_dict, feature_dicts) + enc = build_encoder(opt, embeddings) - embeddings = make_embeddings(opt, word_dict, feature_dicts, - for_encoder=False) - dec = make_decoder(opt, embeddings) + embeddings = build_embeddings(opt, word_dict, 
feature_dicts, + for_encoder=False) + dec = build_decoder(opt, embeddings) model = onmt.models.model.NMTModel(enc, dec) @@ -144,8 +142,7 @@ def nmtmodel_forward(self, opt, source_l=3, bsize=1): outputsize = torch.zeros(source_l - 1, bsize, opt.rnn_size) # Make sure that output has the correct size and type self.assertEqual(outputs.size(), outputsize.size()) - self.assertEqual(type(outputs), torch.autograd.Variable) - self.assertEqual(type(outputs.data), torch.FloatTensor) + self.assertEqual(type(outputs), torch.Tensor) def imagemodel_forward(self, opt, tgt_l=2, bsize=1, h=15, w=17): """ @@ -168,9 +165,9 @@ def imagemodel_forward(self, opt, tgt_l=2, bsize=1, h=15, w=17): opt.rnn_size, opt.dropout) - embeddings = make_embeddings(opt, word_dict, feature_dicts, - for_encoder=False) - dec = make_decoder(opt, embeddings) + embeddings = build_embeddings(opt, word_dict, feature_dicts, + for_encoder=False) + dec = build_decoder(opt, embeddings) model = onmt.models.model.NMTModel(enc, dec) @@ -184,8 +181,7 @@ def imagemodel_forward(self, opt, tgt_l=2, bsize=1, h=15, w=17): outputsize = torch.zeros(tgt_l - 1, bsize, opt.rnn_size) # Make sure that output has the correct size and type self.assertEqual(outputs.size(), outputsize.size()) - self.assertEqual(type(outputs), torch.autograd.Variable) - self.assertEqual(type(outputs.data), torch.FloatTensor) + self.assertEqual(type(outputs), torch.Tensor) def audiomodel_forward(self, opt, tgt_l=2, bsize=1, t=37): """ @@ -210,9 +206,9 @@ def audiomodel_forward(self, opt, tgt_l=2, bsize=1, t=37): opt.sample_rate, opt.window_size) - embeddings = make_embeddings(opt, word_dict, feature_dicts, - for_encoder=False) - dec = make_decoder(opt, embeddings) + embeddings = build_embeddings(opt, word_dict, feature_dicts, + for_encoder=False) + dec = build_decoder(opt, embeddings) model = onmt.models.model.NMTModel(enc, dec) @@ -227,8 +223,7 @@ def audiomodel_forward(self, opt, tgt_l=2, bsize=1, t=37): outputsize = torch.zeros(tgt_l - 1, bsize, opt.rnn_size) # Make sure that output has the correct size and type self.assertEqual(outputs.size(), outputsize.size()) - self.assertEqual(type(outputs), torch.autograd.Variable) - self.assertEqual(type(outputs.data), torch.FloatTensor) + self.assertEqual(type(outputs), torch.Tensor) def _add_test(param_setting, methodname): @@ -317,10 +312,10 @@ def test_method(self): for p in tests_nmtmodel: _add_test(p, 'nmtmodel_forward') -for p in tests_nmtmodel: - _add_test(p, 'imagemodel_forward') +# for p in tests_nmtmodel: +# _add_test(p, 'imagemodel_forward') -for p in tests_nmtmodel: - p.append(('sample_rate', 5500)) - p.append(('window_size', 0.03)) - _add_test(p, 'audiomodel_forward') +# for p in tests_nmtmodel: +# p.append(('sample_rate', 5500)) +# p.append(('window_size', 0.03)) +# _add_test(p, 'audiomodel_forward') diff --git a/onmt/tests/test_models.sh b/onmt/tests/test_models.sh index a444c86091..3c7e8535c0 100755 --- a/onmt/tests/test_models.sh +++ b/onmt/tests/test_models.sh @@ -32,10 +32,10 @@ ### ./test_models set_debug all ### -PYTHON_BIN=python +PYTHON_BIN=python3 -MODEL_DIR="/tmp" +MODEL_DIR="./onmt/tests" MODEL_NAME="onmt_tmp_model" MODEL_PATH="$MODEL_DIR/$MODEL_NAME" MODEL_FILES_PREFIX="${MODEL_NAME}_acc_" @@ -104,7 +104,7 @@ lstm(){ -rnn_size 512 \ -word_vec_size 512 \ -layers 1 \ - -train_steps 10000 \ + -train_steps 500 \ -optim adam \ -learning_rate 0.001 \ -rnn_type LSTM @@ -133,7 +133,7 @@ sru(){ -rnn_size 512 \ -word_vec_size 512 \ -layers 1 \ - -train_steps 10000 \ + -train_steps 500 \ -optim adam \ -learning_rate 
0.001 \ -rnn_type SRU \ @@ -162,7 +162,7 @@ cnn(){ -rnn_size 256 \ -word_vec_size 256 \ -layers 2 \ - -train_steps 10000 \ + -train_steps 500 \ -optim adam \ -learning_rate 0.001 \ -encoder_type cnn \ @@ -190,7 +190,7 @@ morph(){ -rnn_size 400 \ -word_vec_size 100 \ -layers 1 \ - -train_steps 10000 \ + -train_steps 500 \ -optim adam \ -learning_rate 0.001 @@ -226,7 +226,7 @@ transformer(){ -word_vec_size 256 \ -encoder_type transformer \ -decoder_type transformer \ - -train_steps 10000 \ + -train_steps 500 \ -gpuid $GPUID \ -max_generator_batches 4 \ -dropout 0.1 \ diff --git a/onmt/tests/test_preprocess.py b/onmt/tests/test_preprocess.py index 3f6b525e8f..b380948935 100644 --- a/onmt/tests/test_preprocess.py +++ b/onmt/tests/test_preprocess.py @@ -128,25 +128,25 @@ def test_method(self): _add_test(p, 'dataset_build') # Test image preprocessing -for p in copy.deepcopy(test_databuild): - p.append(('data_type', 'img')) - p.append(('src_dir', '/tmp/im2text/images')) - p.append(('train_src', '/tmp/im2text/src-train-head.txt')) - p.append(('train_tgt', '/tmp/im2text/tgt-train-head.txt')) - p.append(('valid_src', '/tmp/im2text/src-val-head.txt')) - p.append(('valid_tgt', '/tmp/im2text/tgt-val-head.txt')) - _add_test(p, 'dataset_build') +# for p in copy.deepcopy(test_databuild): +# p.append(('data_type', 'img')) +# p.append(('src_dir', '/tmp/im2text/images')) +# p.append(('train_src', '/tmp/im2text/src-train-head.txt')) +# p.append(('train_tgt', '/tmp/im2text/tgt-train-head.txt')) +# p.append(('valid_src', '/tmp/im2text/src-val-head.txt')) +# p.append(('valid_tgt', '/tmp/im2text/tgt-val-head.txt')) +# _add_test(p, 'dataset_build') # Test audio preprocessing -for p in copy.deepcopy(test_databuild): - p.append(('data_type', 'audio')) - p.append(('src_dir', '/tmp/speech/an4_dataset')) - p.append(('train_src', '/tmp/speech/src-train-head.txt')) - p.append(('train_tgt', '/tmp/speech/tgt-train-head.txt')) - p.append(('valid_src', '/tmp/speech/src-val-head.txt')) - p.append(('valid_tgt', '/tmp/speech/tgt-val-head.txt')) - p.append(('sample_rate', 16000)) - p.append(('window_size', 0.04)) - p.append(('window_stride', 0.02)) - p.append(('window', 'hamming')) - _add_test(p, 'dataset_build') +# for p in copy.deepcopy(test_databuild): +# p.append(('data_type', 'audio')) +# p.append(('src_dir', '/tmp/speech/an4_dataset')) +# p.append(('train_src', '/tmp/speech/src-train-head.txt')) +# p.append(('train_tgt', '/tmp/speech/tgt-train-head.txt')) +# p.append(('valid_src', '/tmp/speech/src-val-head.txt')) +# p.append(('valid_tgt', '/tmp/speech/tgt-val-head.txt')) +# p.append(('sample_rate', 16000)) +# p.append(('window_size', 0.04)) +# p.append(('window_stride', 0.02)) +# p.append(('window', 'hamming')) +# _add_test(p, 'dataset_build') diff --git a/onmt/trainer.py b/onmt/trainer.py index efad54503f..18fd0aa314 100644 --- a/onmt/trainer.py +++ b/onmt/trainer.py @@ -8,7 +8,7 @@ things to users(i.e. how to do it). Also see train.py(one of the users of this library) for the strategy things we do. 
""" -#!/usr/bin/env python + from __future__ import division from __future__ import print_function @@ -44,10 +44,10 @@ def build_trainer(opt, model, fields, optim, data_type, model_saver=None): gpu_verbose = opt.gpu_verbose report_manager = onmt.utils.build_report_manager(opt) - trainer = onmt.Trainer(model, train_loss, valid_loss, optim, - trunc_size, shard_size, data_type, - norm_method, grad_accum_count, n_gpu, gpu_rank, - gpu_verbose, report_manager, model_saver=model_saver) + trainer = onmt.Trainer(model, train_loss, valid_loss, optim, trunc_size, + shard_size, data_type, norm_method, + grad_accum_count, n_gpu, gpu_rank, gpu_verbose, + report_manager, model_saver=model_saver) return trainer @@ -116,7 +116,7 @@ def train(self, train_iter_fct, valid_iter_fct, train_steps, valid_steps): iterator. e.g. something like train_iter_fct = lambda: generator(*args, **kwargs) valid_iter_fct(function): same as train_iter_fct, for valid data - train_steps(int): + train_steps(int): valid_steps(int): save_checkpoint_steps(int): @@ -141,7 +141,8 @@ def train(self, train_iter_fct, valid_iter_fct, train_steps, valid_steps): for i, batch in enumerate(train_iter): if self.n_gpu == 0 or (i % self.n_gpu == self.gpu_rank): if self.gpu_verbose > 1: - print("GPU %d: index: %d accum: %d" % (self.gpu_rank, i, accum)) + print("GPU %d: index: %d accum: %d" + % (self.gpu_rank, i, accum)) cur_dataset = train_iter.get_cur_dataset() self.train_loss.cur_dataset = cur_dataset @@ -156,7 +157,9 @@ def train(self, train_iter_fct, valid_iter_fct, train_steps, valid_steps): if accum == self.grad_accum_count: reduce_counter += 1 if self.gpu_verbose > 0: - print("GPU %d: reduce_counter: %d n_minibatch %d" % (self.gpu_rank, reduce_counter, len(true_batchs))) + print("GPU %d: reduce_counter: %d n_minibatch %d" + % (self.gpu_rank, reduce_counter, + len(true_batchs))) self._gradient_accumulation( true_batchs, total_stats, report_stats, normalization) @@ -171,16 +174,19 @@ def train(self, train_iter_fct, valid_iter_fct, train_steps, valid_steps): normalization = 0 if (step % valid_steps == 0): if self.gpu_verbose > 0: - print('GPU %d: validate step %d' % (self.gpu_rank, step)) + print('GPU %d: validate step %d' + % (self.gpu_rank, step)) valid_iter = valid_iter_fct() valid_stats = self.validate(valid_iter) if self.gpu_verbose > 0: - print('GPU %d: gather valid stat step %d' % (self.gpu_rank, step)) + print('GPU %d: gather valid stat step %d' + % (self.gpu_rank, step)) valid_stats = self.maybe_gather_stats(valid_stats) if self.gpu_verbose > 0: - print('GPU %d: report stat step %d' % (self.gpu_rank, step)) - self.report_step( - self.optim.learning_rate, step, valid_stats=valid_stats) + print('GPU %d: report stat step %d' + % (self.gpu_rank, step)) + self.report_step(self.optim.learning_rate, + step, valid_stats=valid_stats) if self.gpu_rank == 0: self.maybe_save(step) @@ -188,10 +194,10 @@ def train(self, train_iter_fct, valid_iter_fct, train_steps, valid_steps): if step > train_steps: break - print('GPU %d: for information we completed an epoch at step %d' % (self.gpu_rank, step)) + print('GPU %d: for information we completed an epoch at step %d' + % (self.gpu_rank, step)) train_iter = train_iter_fct() - return total_stats def validate(self, valid_iter): @@ -327,7 +333,7 @@ def maybe_report_training(self, step, num_steps, learning_rate, multigpu=self.n_gpu > 1) def report_step(self, learning_rate, step, train_stats=None, - valid_stats=None): + valid_stats=None): """ Simple function to report stats (if report_manager is set) see 
`onmt.utils.ReportManagerBase.report_step` for doc diff --git a/onmt/translate/translation.py b/onmt/translate/translation.py index d4424d4fce..ec414474d0 100644 --- a/onmt/translate/translation.py +++ b/onmt/translate/translation.py @@ -44,8 +44,8 @@ def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn): if self.replace_unk and (attn is not None) and (src is not None): for i in range(len(tokens)): if tokens[i] == vocab.itos[inputters.UNK]: - _, maxIndex = attn[i].max(0) - tokens[i] = src_raw[maxIndex[0]] + _, max_index = attn[i].max(0) + tokens[i] = src_raw[max_index[0]] return tokens def from_batch(self, translation_batch): diff --git a/onmt/translate/translator.py b/onmt/translate/translator.py index cdc47ad488..161dd91279 100644 --- a/onmt/translate/translator.py +++ b/onmt/translate/translator.py @@ -4,7 +4,6 @@ import codecs import os import math -import time import torch from itertools import count @@ -329,9 +328,10 @@ def unbottle(m): # initialize cache if self.self_attn_type == "average": - cache = self.model.decoder._init_cache(memory_bank, memory_lengths=memory_lengths) + cache = self.model.decoder._init_cache( + memory_bank, memory_lengths=memory_lengths) else: - cache = None + cache = None # (3) run the decoder to generate sentences, using beam search. for i in range(self.max_length): @@ -355,10 +355,11 @@ def unbottle(m): # Run one step. if self.self_attn_type == "average": - dec_out, dec_states, attn = self.model.decoder( - inp, memory_bank, dec_states, memory_lengths=memory_lengths, step=i, cache=cache) + dec_out, dec_states, attn = self.model.decoder( + inp, memory_bank, dec_states, memory_lengths=memory_lengths, + step=i, cache=cache) else: - dec_out, dec_states, attn = self.model.decoder( + dec_out, dec_states, attn = self.model.decoder( inp, memory_bank, dec_states, memory_lengths=memory_lengths) dec_out = dec_out.squeeze(0) @@ -388,7 +389,6 @@ def unbottle(m): beam_attn.data[:, j, :memory_lengths[j]]) dec_states.beam_update(j, b.get_current_origin(), beam_size) - # (4) Extract sentences from beam. 
ret = self._from_beam(beam) ret["gold_score"] = [0] * batch_size diff --git a/onmt/utils/__init__.py b/onmt/utils/__init__.py index c3024c8ece..81002607b8 100644 --- a/onmt/utils/__init__.py +++ b/onmt/utils/__init__.py @@ -5,7 +5,10 @@ from onmt.utils.multi_utils import is_master, multi_init, \ all_reduce_and_rescale_tensors -import onmt.utils.optimizers +from onmt.utils.optimizers import build_optim, MultipleOptimizer, \ + Optimizer -__all__ = ["aeq", "use_gpu", "ReportMgr", "build_report_manager", "Statistics", - "is_master", "multi_init", "all_reduce_and_rescale_tensors"] +__all__ = ["aeq", "use_gpu", "ReportMgr", + "build_report_manager", "Statistics", "is_master", + "multi_init", "all_reduce_and_rescale_tensors", + "build_optim", "MultipleOptimizer", "Optimizer"] diff --git a/onmt/utils/optimizers.py b/onmt/utils/optimizers.py index 0bb93ed3c0..1dde36f1d0 100644 --- a/onmt/utils/optimizers.py +++ b/onmt/utils/optimizers.py @@ -121,7 +121,7 @@ class Optimizer(object): decay_method (str, option): custom decay options warmup_steps (int, option): parameter for `noam` decay model_size (int, option): parameter for `noam` decay - + We use the default parameters for Adam that are suggested by the original paper https://arxiv.org/pdf/1412.6980.pdf These values are also used by other established implementations, @@ -214,10 +214,12 @@ def step(self): self._step * self.warmup_steps**(-1.5)))) # Decay based on start_decay_steps every decay_steps else: - if self.start_decay_steps is not None and self._step >= self.start_decay_steps: + if ((self.start_decay_steps is not None) and ( + self._step >= self.start_decay_steps)): self.start_decay = True if self.start_decay: - if (self._step - self.start_decay_steps) % self.decay_steps == 0: + if ((self._step - self.start_decay_steps) + % self.decay_steps == 0): self.learning_rate = self.learning_rate * self.lr_decay print("Decaying learning rate to %g" % self.learning_rate) diff --git a/setup.py b/setup.py index 0a0a0670db..bd2dfa9632 100644 --- a/setup.py +++ b/setup.py @@ -5,4 +5,6 @@ setup(name='OpenNMT-py', description='A python implementation of OpenNMT', version='0.4', - packages=['onmt', 'onmt.encoders', 'onmt.modules', 'onmt.tests', 'onmt.translate', 'onmt.decoders', 'onmt.inputters', 'onmt.models', 'onmt.utils']) + packages=['onmt', 'onmt.encoders', 'onmt.modules', 'onmt.tests', + 'onmt.translate', 'onmt.decoders', 'onmt.inputters', + 'onmt.models', 'onmt.utils']) diff --git a/tools/03to04.py b/tools/03to04.py index b7b2464d1a..5eea6e8f69 100755 --- a/tools/03to04.py +++ b/tools/03to04.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ Tools to convert models created using OpenNMT-py < 04 - It requires to have sources of both versions of OpenNMT-py + It requires to have sources of both versions of OpenNMT-py Example: ``` @@ -12,7 +12,7 @@ cd onmt_legacy git reset hard 0ecec8b4c16fdec7d8ce2646a0ea47ab6535d308 - # get >= 0.4 + # get >= 0.4 cd ../onmt git remote add ubiqus https://github.com/Ubiqus/OpenNMT-py git pull ubiqus master diff --git a/train_single.py b/train_single.py index b5fefd3f5d..4ee02daf10 100755 --- a/train_single.py +++ b/train_single.py @@ -117,7 +117,7 @@ def valid_iter_fct(): return build_dataset_iter( # Do training. trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps, - opt.valid_steps) + opt.valid_steps) if opt.tensorboard: trainer.report_manager.tensorboard_writer.close()
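
The `onmt/utils/optimizers.py` hunk above only shows fragments of the two learning-rate decay branches (the `noam` warmup term and the `start_decay_steps`/`decay_steps` branch). As a point of reference, here is a minimal standalone sketch of what those schedules compute; it assumes the standard "noam" formula (inverse square root of the step with linear warmup) and uses illustrative function names (`noam_rate`, `step_decay_rate`) rather than the actual `Optimizer` API, which updates `self.learning_rate` in place rather than computing it in closed form.

```python
def noam_rate(base_lr, step, model_size, warmup_steps):
    """Inverse-square-root schedule with linear warmup ("noam" decay).

    Assumes the usual formulation: the rate rises linearly for
    `warmup_steps` steps, then decays proportionally to step**-0.5,
    scaled by model_size**-0.5.
    """
    return base_lr * (model_size ** -0.5 *
                      min(step ** -0.5, step * warmup_steps ** -1.5))


def step_decay_rate(base_lr, step, start_decay_steps, decay_steps, lr_decay):
    """Closed-form equivalent of the stepwise branch shown in the diff:
    once `start_decay_steps` is reached, multiply the rate by `lr_decay`
    every `decay_steps` steps (including at `start_decay_steps` itself).
    """
    if step < start_decay_steps:
        return base_lr
    n_decays = (step - start_decay_steps) // decay_steps + 1
    return base_lr * lr_decay ** n_decays


if __name__ == "__main__":
    # Illustrative values only; rnn_size in the test scripts above plays
    # the role of model_size for the noam schedule.
    for s in (100, 4000, 20000):
        print(s,
              round(noam_rate(2.0, s, model_size=512, warmup_steps=8000), 6),
              round(step_decay_rate(0.001, s, start_decay_steps=10000,
                                    decay_steps=5000, lr_decay=0.5), 6))
```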