put back proper flake8 checks (OpenNMT#2106)
francoishernandez authored Sep 22, 2021
1 parent c8081af commit 4ccf582
Showing 15 changed files with 139 additions and 68 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/push.yml
@@ -26,10 +26,7 @@ jobs:
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --statistics
flake8 .
- name: Unit tests
run: |
python -m unittest discover
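The net effect of this workflow change: the old two-pass invocation failed the build only on fatal problems (syntax errors and undefined names, the E9/F63/F7/F82 selection) and downgraded everything else to warnings via `--exit-zero`, while a bare `flake8 .` makes every enabled check fatal. A minimal sketch of the difference, assuming flake8's default 79-character limit (the repository's own flake8 configuration, if present, takes precedence; `some_function` is an illustrative stand-in):

```python
def some_function(first, second, third):
    """Stand-in helper so the compliant rewrite below actually runs."""
    return (first, second, third)


# F821 (undefined name) was fatal even under the old --select list:
#     result = undefined_helper(1, 2)

# E501 (line too long) was merely advisory under --exit-zero; a bare
# `flake8 .` now fails the build on it:
#     value = some_function("argument one", "argument two", "argument three x")

# The compliant rewrite wraps the call in parentheses instead:
value = some_function(
    "argument one",
    "argument two",
    "argument three",
)
```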
2 changes: 1 addition & 1 deletion onmt/bin/build_vocab.py
@@ -54,7 +54,7 @@ def save_counter(counter, save_path):
else:
save_counter(src_counter, opts.src_vocab)
save_counter(tgt_counter, opts.tgt_vocab)

for k, v in src_feats_counter.items():
save_counter(v, opts.src_feats_vocab[k])

6 changes: 5 additions & 1 deletion onmt/constants.py
@@ -22,7 +22,11 @@ class CorpusName(object):
class SubwordMarker(object):
SPACER = '▁'
JOINER = '■'
CASE_MARKUP = ["⦅mrk_case_modifier_C⦆", "⦅mrk_begin_case_region_U⦆", "⦅mrk_end_case_region_U⦆"]
CASE_MARKUP = [
"⦅mrk_case_modifier_C⦆",
"⦅mrk_begin_case_region_U⦆",
"⦅mrk_end_case_region_U⦆"
]


class ModelTask(object):
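The constants.py hunk is the simplest instance of the commit's recurring E501 fix: a long list literal rewritten one element per line. A quick sanity check that the two spellings build the same object (the one-line form kept only as a suppressed reference):

```python
CASE_MARKUP_ONE_LINE = ["⦅mrk_case_modifier_C⦆", "⦅mrk_begin_case_region_U⦆", "⦅mrk_end_case_region_U⦆"]  # noqa: E501
CASE_MARKUP_WRAPPED = [
    "⦅mrk_case_modifier_C⦆",
    "⦅mrk_begin_case_region_U⦆",
    "⦅mrk_end_case_region_U⦆"
]
assert CASE_MARKUP_ONE_LINE == CASE_MARKUP_WRAPPED
```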
23 changes: 14 additions & 9 deletions onmt/inputters/corpus.py
@@ -11,7 +11,6 @@
from contextlib import contextmanager

import multiprocessing as mp
from collections import defaultdict


@contextmanager
@@ -143,7 +142,8 @@ def load(self, offset=0, stride=1):
with exfile_open(self.src, mode='rb') as fs,\
exfile_open(self.tgt, mode='rb') as ft,\
exfile_open(self.align, mode='rb') as fa:
for i, (sline, tline, align, *features) in enumerate(zip(fs, ft, fa, *features_files)):
for i, (sline, tline, align, *features) in \
enumerate(zip(fs, ft, fa, *features_files)):
if (i % stride) == offset:
sline = sline.decode('utf-8')
tline = tline.decode('utf-8')
@@ -156,7 +156,8 @@ def load(self, offset=0, stride=1):
if features:
example["src_feats"] = dict()
for j, feat in enumerate(features):
example["src_feats"][features_names[j]] = feat.decode("utf-8")
example["src_feats"][features_names[j]] = \
feat.decode("utf-8")
yield example
for f in features_files:
f.close()
@@ -223,7 +224,8 @@ def _tokenize(self, stream):
example['align'] = example['align'].strip('\n').split()
if 'src_feats' in example:
for k in example['src_feats'].keys():
example['src_feats'][k] = example['src_feats'][k].strip('\n').split()
example['src_feats'][k] = \
example['src_feats'][k].strip('\n').split()
yield example

def _transform(self, stream):
@@ -317,7 +319,7 @@ def build_sub_vocab(corpora, transforms, opts, n_sample, stride, offset):
"""Build vocab on (strided) subpart of the data."""
sub_counter_src = Counter()
sub_counter_tgt = Counter()
sub_counter_src_feats = defaultdict(Counter)
sub_counter_src_feats = defaultdict(Counter)
datasets_iterables = build_corpora_iters(
corpora, transforms, opts.data,
skip_empty_level=opts.skip_empty_level,
@@ -329,10 +331,12 @@ def build_sub_vocab(corpora, transforms, opts, n_sample, stride, offset):
if opts.dump_samples:
build_sub_vocab.queues[c_name][offset].put("blank")
continue
src_line, tgt_line = maybe_example['src']['src'], maybe_example['tgt']['tgt']
src_line, tgt_line = (maybe_example['src']['src'],
maybe_example['tgt']['tgt'])
for feat_name, feat_line in maybe_example["src"].items():
if feat_name != "src":
sub_counter_src_feats[feat_name].update(feat_line.split(' '))
sub_counter_src_feats[feat_name].update(
feat_line.split(' '))
sub_counter_src.update(src_line.split(' '))
sub_counter_tgt.update(tgt_line.split(' '))
if opts.dump_samples:
@@ -368,7 +372,7 @@ def build_vocab(opts, transforms, n_sample=3):
corpora = get_corpora(opts, is_train=True)
counter_src = Counter()
counter_tgt = Counter()
counter_src_feats = defaultdict(Counter)
counter_src_feats = defaultdict(Counter)
from functools import partial
queues = {c_name: [mp.Queue(opts.vocab_sample_queue_size)
for i in range(opts.num_threads)]
@@ -424,7 +428,8 @@ def save_transformed_sample(opts, transforms, n_sample=3):
maybe_example = DatasetAdapter._process(item, is_train=True)
if maybe_example is None:
continue
src_line, tgt_line = maybe_example['src']['src'], maybe_example['tgt']['tgt']
src_line, tgt_line = (maybe_example['src']['src'],
maybe_example['tgt']['tgt'])
f_src.write(src_line + '\n')
f_tgt.write(tgt_line + '\n')
if n_sample > 0 and i >= n_sample:
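The corpus.py hunks fix E501 with two different wrapping styles, both shown here in miniature with stand-in data. PEP 8 generally prefers the parenthesized form; the backslash form mirrors what the diff uses where the statement already sits inside a `with` block:

```python
pairs = [("source line", "target line", "0-0")]

# Backslash continuation, as in the load() loop above:
for i, (sline, tline, align) in \
        enumerate(pairs):
    print(i, sline, tline, align)

# Implicit continuation inside parentheses, as in build_sub_vocab()
# and save_transformed_sample():
src_line, tgt_line = ("source tokens here",
                      "target tokens here")
```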
3 changes: 2 additions & 1 deletion onmt/inputters/dataset_base.py
@@ -116,7 +116,8 @@ def __init__(self, fields, readers, data, sort_key, filter_pred=None):
self.sort_key = sort_key
can_copy = 'src_map' in fields and 'alignment' in fields

read_iters = [r.read(dat, name, feats) for r, (name, dat, feats) in zip(readers, data)]
read_iters = [r.read(dat, name, feats)
for r, (name, dat, feats) in zip(readers, data)]

# self.src_vocabs is used in collapse_copy_scores and Translator.py
self.src_vocabs = []
2 changes: 1 addition & 1 deletion onmt/inputters/text_dataset.py
@@ -190,7 +190,7 @@ def text_fields(**kwargs):
truncate = kwargs.get("truncate", None)
fields_ = []

feat_delim = None #u"│" if n_feats > 0 else None
feat_delim = None # u"│" if n_feats > 0 else None

# Base field
tokenize = partial(
6 changes: 4 additions & 2 deletions onmt/opts.py
@@ -133,7 +133,9 @@ def _add_dynamic_fields_opts(parser, build_vocab_only=False):
help="Share source and target vocabulary.")

group.add("-src_feats_vocab", "--src_feats_vocab",
help=("List of paths to save" if build_vocab_only else "List of paths to")
help=("List of paths to save"
if build_vocab_only
else "List of paths to")
+ " src features vocabulary files. "
"Files format: one <word> or <word>\t<count> per line.")

@@ -762,7 +764,7 @@ def translate_opts(parser):
"sequence)")
group.add("-src_feats", "--src_feats", required=False,
help="Source sequence features (dict format). "
"Ex: {'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}")
"Ex: {'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}") # noqa: E501
group.add('--tgt', '-tgt',
help='True target sequence (optional)')
group.add('--tgt_prefix', '-tgt_prefix', action='store_true',
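Where wrapping would make a line read worse (here, a literal example dict embedded in a help string), the commit reaches for a targeted suppression instead of a rewrite. A sketch of the pattern, with an illustrative constant name:

```python
# Suppress only E501, and only on this line; all other checks still apply.
FEATS_HELP = "Ex: {'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}"  # noqa: E501
```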
26 changes: 19 additions & 7 deletions onmt/tests/test_subword_marker.py
@@ -2,7 +2,7 @@

from onmt.transforms.bart import word_start_finder
from onmt.utils.alignment import subword_map_by_joiner, subword_map_by_spacer
from onmt.constants import DefaultTokens, SubwordMarker
from onmt.constants import SubwordMarker


class TestWordStartFinder(unittest.TestCase):
@@ -38,25 +38,37 @@ class TestSubwordGroup(unittest.TestCase):
def test_subword_group_joiner(self):
data_in = ['however', '■,', 'according', 'to', 'the', 'logs', '■,', 'she', 'is', 'hard', '■-■', 'working', '■.'] # noqa: E501
true_out = [0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 7, 7, 7]
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
out = subword_map_by_joiner(
data_in,
marker=SubwordMarker.JOINER,
case_markup=SubwordMarker.CASE_MARKUP)
self.assertEqual(out, true_out)

def test_subword_group_joiner_with_case_markup(self):
data_in = ['⦅mrk_case_modifier_C⦆', 'however', '■,', 'according', 'to', 'the', 'logs', '■,', '⦅mrk_begin_case_region_U⦆', 'she', 'is', 'hard', '■-■', 'working', '■.', '⦅mrk_end_case_region_U⦆'] # noqa: E501
true_out = [0, 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7]
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
out = subword_map_by_joiner(
data_in,
marker=SubwordMarker.JOINER,
case_markup=SubwordMarker.CASE_MARKUP)
self.assertEqual(out, true_out)

def test_subword_group_joiner_with_new_joiner(self):
data_in = ['⦅mrk_case_modifier_C⦆', 'however', '■', ',', 'according', 'to', 'the', 'logs', '■', ',', '⦅mrk_begin_case_region_U⦆', 'she', 'is', 'hard', '■', '-', '■', 'working', '■', '.', '⦅mrk_end_case_region_U⦆'] # noqa: E501
true_out = [0, 0, 0, 0, 1, 2, 3, 4, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7]
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
true_out = [0, 0, 0, 0, 1, 2, 3, 4, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7] # noqa: E501
out = subword_map_by_joiner(
data_in,
marker=SubwordMarker.JOINER,
case_markup=SubwordMarker.CASE_MARKUP)
self.assertEqual(out, true_out)

def test_subword_group_naive(self):
data_in = ['however', ',', 'according', 'to', 'the', 'logs', ',', 'she', 'is', 'hard', '-', 'working', '.'] # noqa: E501
true_out = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
out = subword_map_by_joiner(
data_in,
marker=SubwordMarker.JOINER,
case_markup=SubwordMarker.CASE_MARKUP)
self.assertEqual(out, true_out)

def test_subword_group_spacer(self):
@@ -77,7 +89,7 @@ def test_subword_group_spacer_with_case_markup(self):

def test_subword_group_spacer_with_spacer_new(self):
data_in = ['⦅mrk_case_modifier_C⦆', '▁', 'however', ',', '▁', 'according', '▁', 'to', '▁', 'the', '▁', 'logs', ',', '▁', '⦅mrk_begin_case_region_U⦆', '▁', 'she', '▁', 'is', '▁', 'hard', '-', 'working', '.', '▁', '⦅mrk_end_case_region_U⦆'] # noqa: E501
true_out = [0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7]
true_out = [0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7] # noqa: E501
out = subword_map_by_spacer(data_in)
self.assertEqual(out, true_out)

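For readers landing on this test file cold: `subword_map_by_joiner` maps each subword token to the index of the word it belongs to, so word-level annotations can later be replicated per subword. A usage sketch whose expected output is inferred from the tests above rather than from separate documentation:

```python
from onmt.constants import SubwordMarker
from onmt.utils.alignment import subword_map_by_joiner

tokens = ['however', '■,', 'she', 'is', 'hard', '■-■', 'working', '■.']
mapping = subword_map_by_joiner(
    tokens,
    marker=SubwordMarker.JOINER,
    case_markup=SubwordMarker.CASE_MARKUP)
# '■,' attaches to 'however'; 'hard ■-■ working ■.' is a single word.
assert mapping == [0, 0, 1, 2, 3, 3, 3, 3]
```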
19 changes: 15 additions & 4 deletions onmt/tests/test_text_dataset.py
@@ -80,7 +80,13 @@ def test_preprocess_shape(self):
init_case = self.initialize_case(init_case, params)
mf = TextMultiField(**init_case)

sample_str = {"base_field": "dummy input here .", "a": "A A B D", "r": "C C C C", "b": "D F E D", "zbase_field": "another dummy input ."}
sample_str = {
"base_field": "dummy input here .",
"a": "A A B D",
"r": "C C C C",
"b": "D F E D",
"zbase_field": "another dummy input ."
}
proc = mf.preprocess(sample_str)
self.assertEqual(len(proc), len(init_case["feats_fields"]) + 1)

@@ -175,7 +181,9 @@ def tearDownClass(cls):
def test_read(self):
rdr = TextDataReader()
for i, ex in enumerate(rdr.read(self.FILE_NAME, "src")):
self.assertEqual(ex["src"], {"src": self.STRINGS[i].decode("utf-8")})
self.assertEqual(
ex["src"], {"src": self.STRINGS[i].decode("utf-8")})


class TestTextDataReaderWithFeatures(unittest.TestCase):
def test_read(self):
@@ -191,7 +199,10 @@ def test_read(self):
"A A D D E E".encode("utf-8")
]
}

rdr = TextDataReader()
for i, ex in enumerate(rdr.read(strings, "src", features)):
self.assertEqual(ex["src"], {"src": strings[i].decode("utf-8"), "feat_0": features["feat_0"][i].decode("utf-8")})
self.assertEqual(
ex["src"],
{"src": strings[i].decode("utf-8"),
"feat_0": features["feat_0"][i].decode("utf-8")})
33 changes: 24 additions & 9 deletions onmt/tests/test_transform.py
@@ -55,7 +55,6 @@ def test_transform_specials(self):
specials_expected = {"src": {"⦅_pf_src⦆"}, "tgt": {"⦅_pf_tgt⦆"}}
self.assertEqual(specials, specials_expected)


def test_transform_pipe(self):
# 1. Init first transform in the pipe
prefix_cls = get_transforms_cls(["prefix"])["prefix"]
@@ -510,24 +509,40 @@ def test_span_infilling(self):
# print(f"Text Span Infilling: {infillied} / {tokens}")
# print(n_words, n_masked)


class TestFeaturesTransform(unittest.TestCase):
def test_inferfeats(self):
inferfeats_cls = get_transforms_cls(["inferfeats"])["inferfeats"]
opt = Namespace(reversible_tokenization="joiner")
inferfeats_transform = inferfeats_cls(opt)

ex_in = {
"src": ['however', '■,', 'according', 'to', 'the', 'logs', '■,', 'she', 'is', 'hard', '■-■', 'working', '■.'],
"tgt": ['however', '■,', 'according', 'to', 'the', 'logs', '■,', 'she', 'is', 'hard', '■-■', 'working', '■.']
"src": ['however', '■,', 'according', 'to', 'the', 'logs',
'■,', 'she', 'is', 'hard', '■-■', 'working', '■.'],
"tgt": ['however', '■,', 'according', 'to', 'the', 'logs',
'■,', 'she', 'is', 'hard', '■-■', 'working', '■.']
}
ex_out = inferfeats_transform.apply(ex_in)
self.assertIs(ex_out, ex_in)

ex_in["src_feats"] = {"feat_0": ["A", "A", "A", "A", "B", "A", "A", "C"]}
ex_in["src_feats"] = {
"feat_0": ["A", "A", "A", "A", "B", "A", "A", "C"]
}
ex_out = inferfeats_transform.apply(ex_in)
self.assertEqual(ex_out["src_feats"]["feat_0"], ["A", "<null>", "A", "A", "A", "B", "<null>", "A", "A", "C", "<null>", "C", "<null>"])

ex_in["src"] = ['⦅mrk_case_modifier_C⦆', 'however', '■,', 'according', 'to', 'the', 'logs', '■,', '⦅mrk_begin_case_region_U⦆', 'she', 'is', 'hard', '■-■', 'working', '■.', '⦅mrk_end_case_region_U⦆']
ex_in["src_feats"] = {"feat_0": ["A", "A", "A", "A", "B", "A", "A", "C"]}
self.assertEqual(
ex_out["src_feats"]["feat_0"],
["A", "<null>", "A", "A", "A", "B", "<null>", "A",
"A", "C", "<null>", "C", "<null>"])

ex_in["src"] = ['⦅mrk_case_modifier_C⦆', 'however', '■,',
'according', 'to', 'the', 'logs', '■,',
'⦅mrk_begin_case_region_U⦆', 'she', 'is', 'hard',
'■-■', 'working', '■.', '⦅mrk_end_case_region_U⦆']
ex_in["src_feats"] = {
"feat_0": ["A", "A", "A", "A", "B", "A", "A", "C"]
}
ex_out = inferfeats_transform.apply(ex_in)
self.assertEqual(ex_out["src_feats"]["feat_0"], ["<null>", "A", "<null>", "A", "A", "A", "B", "<null>", "<null>", "A", "A", "C", "<null>", "C", "<null>", "<null>"])
self.assertEqual(
ex_out["src_feats"]["feat_0"],
["<null>", "A", "<null>", "A", "A", "A", "B", "<null>", "<null>",
"A", "A", "C", "<null>", "C", "<null>", "<null>"])
4 changes: 2 additions & 2 deletions onmt/tests/test_translator.py
@@ -4,7 +4,7 @@


class TestGeneratorLM(unittest.TestCase):
def test_split_src_to_prevent_padding_target_prefix_is_none_when_equal_size(
def test_split_src_to_prevent_padding_target_prefix_is_none_when_equal_size( # noqa: E501
self,
):
src = torch.randint(0, 10, (5, 6))
@@ -16,7 +16,7 @@ def test_split_src_to_prevent_padding_target_prefix_is_none_when_equal_size(
) = GeneratorLM.split_src_to_prevent_padding(src, src_lengths)
self.assertIsNone(target_prefix)

def test_split_src_to_prevent_padding_target_prefix_is_ok_when_different_size(
def test_split_src_to_prevent_padding_target_prefix_is_ok_when_different_size( # noqa: E501
self,
):
default_length = 5
21 changes: 13 additions & 8 deletions onmt/transforms/features.py
@@ -1,7 +1,7 @@
from onmt.utils.logging import logger
from onmt.transforms import register_transform
from .transform import Transform, ObservableStats
from onmt.constants import DefaultTokens, SubwordMarker
from .transform import Transform
from onmt.constants import SubwordMarker
from onmt.utils.alignment import subword_map_by_joiner, subword_map_by_spacer
import re
from collections import defaultdict
@@ -30,7 +30,9 @@ def apply(self, example, is_train=False, stats=None, **kwargs):

for feat_name, feat_values in example['src_feats'].items():
if len(example['src']) != len(feat_values):
logger.warning(f"Skipping example due to mismatch between source and feature {feat_name}")
logger.warning(
f"Skipping example due to mismatch "
f"between source and feature {feat_name}")
return None
return example

@@ -49,8 +51,10 @@ def __init__(self, opts):
def add_options(cls, parser):
"""Available options related to this Transform."""
group = parser.add_argument_group("Transform/InferFeats")
group.add("--reversible_tokenization", "-reversible_tokenization", default="joiner",
choices=["joiner", "spacer"], help="Type of reversible tokenization applied on the tokenizer.")
group.add("--reversible_tokenization", "-reversible_tokenization",
default="joiner", choices=["joiner", "spacer"],
help="Type of reversible tokenization "
"applied on the tokenizer.")

def _parse_opts(self):
super()._parse_opts()
@@ -64,7 +68,7 @@ def apply(self, example, is_train=False, stats=None, **kwargs):

if self.reversible_tokenization == "joiner":
word_to_subword_mapping = subword_map_by_joiner(example["src"])
else: #Spacer
else: # Spacer
word_to_subword_mapping = subword_map_by_spacer(example["src"])

inferred_feats = defaultdict(list)
@@ -73,7 +77,8 @@ def apply(self, example, is_train=False, stats=None, **kwargs):
# If case markup placeholder
if subword in SubwordMarker.CASE_MARKUP:
inferred_feat = "<null>"
# Punctuation only (assumes joiner is also some punctuation token)
# Punctuation only
# (assumes joiner is also some punctuation token)
elif not re.sub(r'(\W)+', '', subword).strip():
inferred_feat = "<null>"
else:
@@ -87,4 +92,4 @@ def apply(self, example, is_train=False, stats=None, **kwargs):
return example

def _repr_args(self):
return ''
return ''
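The punctuation-only check in `apply` above, isolated for clarity: a subword left empty after stripping non-word characters (the joiner '■' included) receives the "<null>" feature. Case markup is tested separately first because those placeholders contain word characters and would slip past this regex:

```python
import re

for subword in ['■-■', '■.', 'working', '⦅mrk_case_modifier_C⦆']:
    is_punct_only = not re.sub(r'(\W)+', '', subword).strip()
    print(subword, is_punct_only)
# ■-■ True, ■. True, working False, ⦅mrk_case_modifier_C⦆ False
```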
[Diffs for the remaining changed files were not loaded in this capture.]