put back proper flake8 checks (OpenNMT#2106)
francoishernandez authored Sep 22, 2021
1 parent c8081af commit 4ccf582
Showing 15 changed files with 139 additions and 68 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/push.yml
@@ -26,10 +26,7 @@ jobs:
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --statistics
flake8 .
- name: Unit tests
run: |
python -m unittest discover
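The net effect of this workflow change: the old two-pass invocation failed the build only on fatal problems (syntax errors and undefined names, the E9/F63/F7/F82 selection) and downgraded everything else to warnings via `--exit-zero`, while a bare `flake8 .` makes every enabled check fatal. A minimal sketch of the difference, assuming flake8's default 79-character limit (the repository's own flake8 configuration, if present, takes precedence; `some_function` is an illustrative stand-in):

```python
def some_function(first, second, third):
    """Stand-in helper so the compliant rewrite below actually runs."""
    return (first, second, third)


# F821 (undefined name) was fatal even under the old --select list:
#     result = undefined_helper(1, 2)

# E501 (line too long) was merely advisory under --exit-zero; a bare
# `flake8 .` now fails the build on it:
#     value = some_function("argument one", "argument two", "argument three x")

# The compliant rewrite wraps the call in parentheses instead:
value = some_function(
    "argument one",
    "argument two",
    "argument three",
)
```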
2 changes: 1 addition & 1 deletion onmt/bin/build_vocab.py
@@ -54,7 +54,7 @@ def save_counter(counter, save_path):
else:
save_counter(src_counter, opts.src_vocab)
save_counter(tgt_counter, opts.tgt_vocab)

for k, v in src_feats_counter.items():
save_counter(v, opts.src_feats_vocab[k])

6 changes: 5 additions & 1 deletion onmt/constants.py
@@ -22,7 +22,11 @@ class CorpusName(object):
class SubwordMarker(object):
SPACER = '▁'
JOINER = '■'
CASE_MARKUP = ["⦅mrk_case_modifier_C⦆", "⦅mrk_begin_case_region_U⦆", "⦅mrk_end_case_region_U⦆"]
CASE_MARKUP = [
"⦅mrk_case_modifier_C⦆",
"⦅mrk_begin_case_region_U⦆",
"⦅mrk_end_case_region_U⦆"
]


class ModelTask(object):
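The constants.py hunk is the simplest instance of the commit's recurring E501 fix: a long list literal rewritten one element per line. A quick sanity check that the two spellings build the same object (the one-line form kept only as a suppressed reference):

```python
CASE_MARKUP_ONE_LINE = ["⦅mrk_case_modifier_C⦆", "⦅mrk_begin_case_region_U⦆", "⦅mrk_end_case_region_U⦆"]  # noqa: E501
CASE_MARKUP_WRAPPED = [
    "⦅mrk_case_modifier_C⦆",
    "⦅mrk_begin_case_region_U⦆",
    "⦅mrk_end_case_region_U⦆"
]
assert CASE_MARKUP_ONE_LINE == CASE_MARKUP_WRAPPED
```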
23 changes: 14 additions & 9 deletions onmt/inputters/corpus.py
@@ -11,7 +11,6 @@
from contextlib import contextmanager

import multiprocessing as mp
from collections import defaultdict


@contextmanager
@@ -143,7 +142,8 @@ def load(self, offset=0, stride=1):
with exfile_open(self.src, mode='rb') as fs,\
exfile_open(self.tgt, mode='rb') as ft,\
exfile_open(self.align, mode='rb') as fa:
for i, (sline, tline, align, *features) in enumerate(zip(fs, ft, fa, *features_files)):
for i, (sline, tline, align, *features) in \
enumerate(zip(fs, ft, fa, *features_files)):
if (i % stride) == offset:
sline = sline.decode('utf-8')
tline = tline.decode('utf-8')
@@ -156,7 +156,8 @@ def load(self, offset=0, stride=1):
if features:
example["src_feats"] = dict()
for j, feat in enumerate(features):
example["src_feats"][features_names[j]] = feat.decode("utf-8")
example["src_feats"][features_names[j]] = \
feat.decode("utf-8")
yield example
for f in features_files:
f.close()
@@ -223,7 +224,8 @@ def _tokenize(self, stream):
example['align'] = example['align'].strip('\n').split()
if 'src_feats' in example:
for k in example['src_feats'].keys():
example['src_feats'][k] = example['src_feats'][k].strip('\n').split()
example['src_feats'][k] = \
example['src_feats'][k].strip('\n').split()
yield example

def _transform(self, stream):
@@ -317,7 +319,7 @@ def build_sub_vocab(corpora, transforms, opts, n_sample, stride, offset):
"""Build vocab on (strided) subpart of the data."""
sub_counter_src = Counter()
sub_counter_tgt = Counter()
sub_counter_src_feats = defaultdict(Counter)
sub_counter_src_feats = defaultdict(Counter)
datasets_iterables = build_corpora_iters(
corpora, transforms, opts.data,
skip_empty_level=opts.skip_empty_level,
@@ -329,10 +331,12 @@ def build_sub_vocab(corpora, transforms, opts, n_sample, stride, offset):
if opts.dump_samples:
build_sub_vocab.queues[c_name][offset].put("blank")
continue
src_line, tgt_line = maybe_example['src']['src'], maybe_example['tgt']['tgt']
src_line, tgt_line = (maybe_example['src']['src'],
maybe_example['tgt']['tgt'])
for feat_name, feat_line in maybe_example["src"].items():
if feat_name != "src":
sub_counter_src_feats[feat_name].update(feat_line.split(' '))
sub_counter_src_feats[feat_name].update(
feat_line.split(' '))
sub_counter_src.update(src_line.split(' '))
sub_counter_tgt.update(tgt_line.split(' '))
if opts.dump_samples:
@@ -368,7 +372,7 @@ def build_vocab(opts, transforms, n_sample=3):
corpora = get_corpora(opts, is_train=True)
counter_src = Counter()
counter_tgt = Counter()
counter_src_feats = defaultdict(Counter)
counter_src_feats = defaultdict(Counter)
from functools import partial
queues = {c_name: [mp.Queue(opts.vocab_sample_queue_size)
for i in range(opts.num_threads)]
@@ -424,7 +428,8 @@ def save_transformed_sample(opts, transforms, n_sample=3):
maybe_example = DatasetAdapter._process(item, is_train=True)
if maybe_example is None:
continue
src_line, tgt_line = maybe_example['src']['src'], maybe_example['tgt']['tgt']
src_line, tgt_line = (maybe_example['src']['src'],
maybe_example['tgt']['tgt'])
f_src.write(src_line + '\n')
f_tgt.write(tgt_line + '\n')
if n_sample > 0 and i >= n_sample:
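The corpus.py hunks fix E501 with two different wrapping styles, both shown here in miniature with stand-in data. PEP 8 generally prefers the parenthesized form; the backslash form mirrors what the diff uses where the statement already sits inside a `with` block:

```python
pairs = [("source line", "target line", "0-0")]

# Backslash continuation, as in the load() loop above:
for i, (sline, tline, align) in \
        enumerate(pairs):
    print(i, sline, tline, align)

# Implicit continuation inside parentheses, as in build_sub_vocab()
# and save_transformed_sample():
src_line, tgt_line = ("source tokens here",
                      "target tokens here")
```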
3 changes: 2 additions & 1 deletion onmt/inputters/dataset_base.py
@@ -116,7 +116,8 @@ def __init__(self, fields, readers, data, sort_key, filter_pred=None):
self.sort_key = sort_key
can_copy = 'src_map' in fields and 'alignment' in fields

read_iters = [r.read(dat, name, feats) for r, (name, dat, feats) in zip(readers, data)]
read_iters = [r.read(dat, name, feats)
for r, (name, dat, feats) in zip(readers, data)]

# self.src_vocabs is used in collapse_copy_scores and Translator.py
self.src_vocabs = []
2 changes: 1 addition & 1 deletion onmt/inputters/text_dataset.py
@@ -190,7 +190,7 @@ def text_fields(**kwargs):
truncate = kwargs.get("truncate", None)
fields_ = []

feat_delim = None #u"│" if n_feats > 0 else None
feat_delim = None # u"│" if n_feats > 0 else None

# Base field
tokenize = partial(
6 changes: 4 additions & 2 deletions onmt/opts.py
@@ -133,7 +133,9 @@ def _add_dynamic_fields_opts(parser, build_vocab_only=False):
help="Share source and target vocabulary.")

group.add("-src_feats_vocab", "--src_feats_vocab",
help=("List of paths to save" if build_vocab_only else "List of paths to")
help=("List of paths to save"
if build_vocab_only
else "List of paths to")
+ " src features vocabulary files. "
"Files format: one <word> or <word>\t<count> per line.")

@@ -762,7 +764,7 @@ def translate_opts(parser):
"sequence)")
group.add("-src_feats", "--src_feats", required=False,
help="Source sequence features (dict format). "
"Ex: {'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}")
"Ex: {'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}") # noqa: E501
group.add('--tgt', '-tgt',
help='True target sequence (optional)')
group.add('--tgt_prefix', '-tgt_prefix', action='store_true',
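Where wrapping would make a line read worse (here, a literal example dict embedded in a help string), the commit reaches for a targeted suppression instead of a rewrite. A sketch of the pattern, with an illustrative constant name:

```python
# Suppress only E501, and only on this line; all other checks still apply.
FEATS_HELP = "Ex: {'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}"  # noqa: E501
```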
26 changes: 19 additions & 7 deletions onmt/tests/test_subword_marker.py
@@ -2,7 +2,7 @@

from onmt.transforms.bart import word_start_finder
from onmt.utils.alignment import subword_map_by_joiner, subword_map_by_spacer
from onmt.constants import DefaultTokens, SubwordMarker
from onmt.constants import SubwordMarker


class TestWordStartFinder(unittest.TestCase):
@@ -38,25 +38,37 @@ class TestSubwordGroup(unittest.TestCase):
def test_subword_group_joiner(self):
data_in = ['however', '■,', 'according', 'to', 'the', 'logs', '■,', 'she', 'is', 'hard', '■-■', 'working', '■.'] # noqa: E501
true_out = [0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 7, 7, 7]
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
out = subword_map_by_joiner(
data_in,
marker=SubwordMarker.JOINER,
case_markup=SubwordMarker.CASE_MARKUP)
self.assertEqual(out, true_out)

def test_subword_group_joiner_with_case_markup(self):
data_in = ['⦅mrk_case_modifier_C⦆', 'however', '■,', 'according', 'to', 'the', 'logs', '■,', '⦅mrk_begin_case_region_U⦆', 'she', 'is', 'hard', '■-■', 'working', '■.', '⦅mrk_end_case_region_U⦆'] # noqa: E501
true_out = [0, 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7]
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
out = subword_map_by_joiner(
data_in,
marker=SubwordMarker.JOINER,
case_markup=SubwordMarker.CASE_MARKUP)
self.assertEqual(out, true_out)

def test_subword_group_joiner_with_new_joiner(self):
data_in = ['⦅mrk_case_modifier_C⦆', 'however', '■', ',', 'according', 'to', 'the', 'logs', '■', ',', '⦅mrk_begin_case_region_U⦆', 'she', 'is', 'hard', '■', '-', '■', 'working', '■', '.', '⦅mrk_end_case_region_U⦆'] # noqa: E501
true_out = [0, 0, 0, 0, 1, 2, 3, 4, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7]
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
true_out = [0, 0, 0, 0, 1, 2, 3, 4, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7] # noqa: E501
out = subword_map_by_joiner(
data_in,
marker=SubwordMarker.JOINER,
case_markup=SubwordMarker.CASE_MARKUP)
self.assertEqual(out, true_out)

def test_subword_group_naive(self):
data_in = ['however', ',', 'according', 'to', 'the', 'logs', ',', 'she', 'is', 'hard', '-', 'working', '.'] # noqa: E501
true_out = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
out = subword_map_by_joiner(
data_in,
marker=SubwordMarker.JOINER,
case_markup=SubwordMarker.CASE_MARKUP)
self.assertEqual(out, true_out)

def test_subword_group_spacer(self):
@@ -77,7 +89,7 @@ def test_subword_group_spacer_with_case_markup(self):

def test_subword_group_spacer_with_spacer_new(self):
data_in = ['⦅mrk_case_modifier_C⦆', '▁', 'however', ',', '▁', 'according', '▁', 'to', '▁', 'the', '▁', 'logs', ',', '▁', '⦅mrk_begin_case_region_U⦆', '▁', 'she', '▁', 'is', '▁', 'hard', '-', 'working', '.', '▁', '⦅mrk_end_case_region_U⦆'] # noqa: E501
true_out = [0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7]
true_out = [0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7] # noqa: E501
out = subword_map_by_spacer(data_in)
self.assertEqual(out, true_out)

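For readers landing on this test file cold: `subword_map_by_joiner` maps each subword token to the index of the word it belongs to, so word-level annotations can later be replicated per subword. A usage sketch whose expected output is inferred from the tests above rather than from separate documentation:

```python
from onmt.constants import SubwordMarker
from onmt.utils.alignment import subword_map_by_joiner

tokens = ['however', '■,', 'she', 'is', 'hard', '■-■', 'working', '■.']
mapping = subword_map_by_joiner(
    tokens,
    marker=SubwordMarker.JOINER,
    case_markup=SubwordMarker.CASE_MARKUP)
# '■,' attaches to 'however'; 'hard ■-■ working ■.' is a single word.
assert mapping == [0, 0, 1, 2, 3, 3, 3, 3]
```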
19 changes: 15 additions & 4 deletions onmt/tests/test_text_dataset.py
@@ -80,7 +80,13 @@ def test_preprocess_shape(self):
init_case = self.initialize_case(init_case, params)
mf = TextMultiField(**init_case)

sample_str = {"base_field": "dummy input here .", "a": "A A B D", "r": "C C C C", "b": "D F E D", "zbase_field": "another dummy input ."}
sample_str = {
"base_field": "dummy input here .",
"a": "A A B D",
"r": "C C C C",
"b": "D F E D",
"zbase_field": "another dummy input ."
}
proc = mf.preprocess(sample_str)
self.assertEqual(len(proc), len(init_case["feats_fields"]) + 1)

@@ -175,7 +181,9 @@ def tearDownClass(cls):
def test_read(self):
rdr = TextDataReader()
for i, ex in enumerate(rdr.read(self.FILE_NAME, "src")):
self.assertEqual(ex["src"], {"src": self.STRINGS[i].decode("utf-8")})
self.assertEqual(
ex["src"], {"src": self.STRINGS[i].decode("utf-8")})


class TestTextDataReaderWithFeatures(unittest.TestCase):
def test_read(self):
@@ -191,7 +199,10 @@ def test_read(self):
"A A D D E E".encode("utf-8")
]
}

rdr = TextDataReader()
for i, ex in enumerate(rdr.read(strings, "src", features)):
self.assertEqual(ex["src"], {"src": strings[i].decode("utf-8"), "feat_0": features["feat_0"][i].decode("utf-8")})
self.assertEqual(
ex["src"],
{"src": strings[i].decode("utf-8"),
"feat_0": features["feat_0"][i].decode("utf-8")})
33 changes: 24 additions & 9 deletions onmt/tests/test_transform.py
@@ -55,7 +55,6 @@ def test_transform_specials(self):
specials_expected = {"src": {"⦅_pf_src⦆"}, "tgt": {"⦅_pf_tgt⦆"}}
self.assertEqual(specials, specials_expected)


def test_transform_pipe(self):
# 1. Init first transform in the pipe
prefix_cls = get_transforms_cls(["prefix"])["prefix"]
@@ -510,24 +509,40 @@ def test_span_infilling(self):
# print(f"Text Span Infilling: {infillied} / {tokens}")
# print(n_words, n_masked)


class TestFeaturesTransform(unittest.TestCase):
def test_inferfeats(self):
inferfeats_cls = get_transforms_cls(["inferfeats"])["inferfeats"]
opt = Namespace(reversible_tokenization="joiner")
inferfeats_transform = inferfeats_cls(opt)

ex_in = {
"src": ['however', '■,', 'according', 'to', 'the', 'logs', '■,', 'she', 'is', 'hard', '■-■', 'working', '■.'],
"tgt": ['however', '■,', 'according', 'to', 'the', 'logs', '■,', 'she', 'is', 'hard', '■-■', 'working', '■.']
"src": ['however', '■,', 'according', 'to', 'the', 'logs',
'■,', 'she', 'is', 'hard', '■-■', 'working', '■.'],
"tgt": ['however', '■,', 'according', 'to', 'the', 'logs',
'■,', 'she', 'is', 'hard', '■-■', 'working', '■.']
}
ex_out = inferfeats_transform.apply(ex_in)
self.assertIs(ex_out, ex_in)

ex_in["src_feats"] = {"feat_0": ["A", "A", "A", "A", "B", "A", "A", "C"]}
ex_in["src_feats"] = {
"feat_0": ["A", "A", "A", "A", "B", "A", "A", "C"]
}
ex_out = inferfeats_transform.apply(ex_in)
self.assertEqual(ex_out["src_feats"]["feat_0"], ["A", "<null>", "A", "A", "A", "B", "<null>", "A", "A", "C", "<null>", "C", "<null>"])

ex_in["src"] = ['⦅mrk_case_modifier_C⦆', 'however', '■,', 'according', 'to', 'the', 'logs', '■,', '⦅mrk_begin_case_region_U⦆', 'she', 'is', 'hard', '■-■', 'working', '■.', '⦅mrk_end_case_region_U⦆']
ex_in["src_feats"] = {"feat_0": ["A", "A", "A", "A", "B", "A", "A", "C"]}
self.assertEqual(
ex_out["src_feats"]["feat_0"],
["A", "<null>", "A", "A", "A", "B", "<null>", "A",
"A", "C", "<null>", "C", "<null>"])

ex_in["src"] = ['⦅mrk_case_modifier_C⦆', 'however', '■,',
'according', 'to', 'the', 'logs', '■,',
'⦅mrk_begin_case_region_U⦆', 'she', 'is', 'hard',
'■-■', 'working', '■.', '⦅mrk_end_case_region_U⦆']
ex_in["src_feats"] = {
"feat_0": ["A", "A", "A", "A", "B", "A", "A", "C"]
}
ex_out = inferfeats_transform.apply(ex_in)
self.assertEqual(ex_out["src_feats"]["feat_0"], ["<null>", "A", "<null>", "A", "A", "A", "B", "<null>", "<null>", "A", "A", "C", "<null>", "C", "<null>", "<null>"])
self.assertEqual(
ex_out["src_feats"]["feat_0"],
["<null>", "A", "<null>", "A", "A", "A", "B", "<null>", "<null>",
"A", "A", "C", "<null>", "C", "<null>", "<null>"])
4 changes: 2 additions & 2 deletions onmt/tests/test_translator.py
@@ -4,7 +4,7 @@


class TestGeneratorLM(unittest.TestCase):
def test_split_src_to_prevent_padding_target_prefix_is_none_when_equal_size(
def test_split_src_to_prevent_padding_target_prefix_is_none_when_equal_size( # noqa: E501
self,
):
src = torch.randint(0, 10, (5, 6))
@@ -16,7 +16,7 @@ def test_split_src_to_prevent_padding_target_prefix_is_none_when_equal_size(
) = GeneratorLM.split_src_to_prevent_padding(src, src_lengths)
self.assertIsNone(target_prefix)

def test_split_src_to_prevent_padding_target_prefix_is_ok_when_different_size(
def test_split_src_to_prevent_padding_target_prefix_is_ok_when_different_size( # noqa: E501
self,
):
default_length = 5
21 changes: 13 additions & 8 deletions onmt/transforms/features.py
@@ -1,7 +1,7 @@
from onmt.utils.logging import logger
from onmt.transforms import register_transform
from .transform import Transform, ObservableStats
from onmt.constants import DefaultTokens, SubwordMarker
from .transform import Transform
from onmt.constants import SubwordMarker
from onmt.utils.alignment import subword_map_by_joiner, subword_map_by_spacer
import re
from collections import defaultdict
@@ -30,7 +30,9 @@ def apply(self, example, is_train=False, stats=None, **kwargs):

for feat_name, feat_values in example['src_feats'].items():
if len(example['src']) != len(feat_values):
logger.warning(f"Skipping example due to mismatch between source and feature {feat_name}")
logger.warning(
f"Skipping example due to mismatch "
f"between source and feature {feat_name}")
return None
return example

@@ -49,8 +51,10 @@ def __init__(self, opts):
def add_options(cls, parser):
"""Available options related to this Transform."""
group = parser.add_argument_group("Transform/InferFeats")
group.add("--reversible_tokenization", "-reversible_tokenization", default="joiner",
choices=["joiner", "spacer"], help="Type of reversible tokenization applied on the tokenizer.")
group.add("--reversible_tokenization", "-reversible_tokenization",
default="joiner", choices=["joiner", "spacer"],
help="Type of reversible tokenization "
"applied on the tokenizer.")

def _parse_opts(self):
super()._parse_opts()
@@ -64,7 +68,7 @@ def apply(self, example, is_train=False, stats=None, **kwargs):

if self.reversible_tokenization == "joiner":
word_to_subword_mapping = subword_map_by_joiner(example["src"])
else: #Spacer
else: # Spacer
word_to_subword_mapping = subword_map_by_spacer(example["src"])

inferred_feats = defaultdict(list)
@@ -73,7 +77,8 @@ def apply(self, example, is_train=False, stats=None, **kwargs):
# If case markup placeholder
if subword in SubwordMarker.CASE_MARKUP:
inferred_feat = "<null>"
# Punctuation only (assumes joiner is also some punctuation token)
# Punctuation only
# (assumes joiner is also some punctuation token)
elif not re.sub(r'(\W)+', '', subword).strip():
inferred_feat = "<null>"
else:
@@ -87,4 +92,4 @@ def apply(self, example, is_train=False, stats=None, **kwargs):
return example

def _repr_args(self):
return ''
return ''
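The punctuation-only check in `apply` above, isolated for clarity: a subword left empty after stripping non-word characters (the joiner '■' included) receives the "<null>" feature. Case markup is tested separately first because those placeholders contain word characters and would slip past this regex:

```python
import re

for subword in ['■-■', '■.', 'working', '⦅mrk_case_modifier_C⦆']:
    is_punct_only = not re.sub(r'(\W)+', '', subword).strip()
    print(subword, is_punct_only)
# ■-■ True, ■. True, working False, ⦅mrk_case_modifier_C⦆ False
```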
[Diffs for the remaining changed files were not loaded in this capture.]