Skip to content

Commit

Permalink
add src or tgt min frequency to counter value (OpenNMT#1414)
Browse files Browse the repository at this point in the history
  • Loading branch information
francoishernandez authored and vince62s committed Apr 29, 2019
1 parent 624a0b3 commit 531fb39
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions onmt/inputters/inputter.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,15 +285,15 @@ def _build_field_vocab(field, counter, size_multiple=1, **kwargs):
_pad_vocab_to_multiple(field.vocab, size_multiple)


# Pre-change signature (the line this commit removes):
def _load_vocab(vocab_path, name, counters):
# Post-change signature (the line this commit adds): takes an extra
# `min_freq` argument that is added to every counter value below.
def _load_vocab(vocab_path, name, counters, min_freq):
# Reads the vocab for side `name` via _read_vocab_file, logs its size,
# seeds counters[name] with one descending value per token, and returns
# the pair (vocab, vocab_size).
# NOTE: `counters` is modified in place.
vocab = _read_vocab_file(vocab_path, name)
vocab_size = len(vocab)
logger.info('Loaded %s vocab has %d tokens.' % (name, vocab_size))
for i, token in enumerate(vocab):
# keep the order of tokens specified in the vocab file by
# adding them to the counter with decreasing counting values
# Pre-change assignment (removed): values run from vocab_size (first
# token) down to 1 (last token).
counters[name][token] = vocab_size - i
# Post-change assignment (added): same descending order, shifted up by
# min_freq, so every value is strictly greater than min_freq.
# NOTE(review): presumably this keeps file-listed tokens from being
# dropped by a later minimum-frequency cutoff -- confirm against the
# code that consumes `counters`.
counters[name][token] = vocab_size - i + min_freq
return vocab, vocab_size


Expand Down Expand Up @@ -351,13 +351,15 @@ def build_vocab(train_dataset_files, fields, data_type, share_vocab,
# Load vocabulary
if src_vocab_path:
src_vocab, src_vocab_size = _load_vocab(
src_vocab_path, "src", counters)
src_vocab_path, "src", counters,
src_words_min_frequency)
else:
src_vocab = None

if tgt_vocab_path:
tgt_vocab, tgt_vocab_size = _load_vocab(
tgt_vocab_path, "tgt", counters)
tgt_vocab_path, "tgt", counters,
tgt_words_min_frequency)
else:
tgt_vocab = None

Expand Down

0 comments on commit 531fb39

Please sign in to comment.