Skip to content

Commit

Permalink
Add some extra flags to TokenTextEncoder for increased flexibility.
Browse files: browse the repository at this point in the history
PiperOrigin-RevId: 258428299
  • Loading branch information
TensorFlow Datasets Team authored and copybara-github committed Jul 16, 2019
1 parent b9c4488 commit 3bc311e
Showing 1 changed file with 13 additions and 3 deletions.
16 changes: 13 additions & 3 deletions tensorflow_datasets/core/features/text/text_encoder.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -230,7 +230,9 @@ def __init__(self,
oov_buckets=1,
oov_token="UNK",
lowercase=False,
tokenizer=None):
tokenizer=None,
strip_vocab=True,
decode_token_separator=" "):
"""Constructs a TokenTextEncoder.
To load from a file saved with `TokenTextEncoder.save_to_file`, use
Expand All @@ -244,8 +246,14 @@ def __init__(self,
lowercase: `bool`, whether to make all text and tokens lowercase.
tokenizer: `Tokenizer`, responsible for converting incoming text into a
list of tokens.
strip_vocab: `bool`, whether to strip whitespace from the beginning and
end of elements of `vocab_list`.
decode_token_separator: `str`, the string used to separate tokens when
decoding.
"""
self._vocab_list = [tf.compat.as_text(el).strip() for el in vocab_list]
self._vocab_list = [tf.compat.as_text(el) for el in vocab_list]
if strip_vocab:
self._vocab_list = [el.strip() for el in self._vocab_list]
self._lowercase = lowercase
if self._lowercase:
self._vocab_list = [t.lower() for t in self._vocab_list]
Expand All @@ -261,6 +269,8 @@ def __init__(self,
self._tokenizer = (tokenizer or Tokenizer(reserved_tokens=reserved_tokens))
self._user_defined_tokenizer = tokenizer

self._decode_token_separator = decode_token_separator

def encode(self, s):
s = tf.compat.as_text(s)
if self.lowercase:
Expand All @@ -286,7 +296,7 @@ def decode(self, ids):
tokens.append(self._vocab_list[int_id])
else:
tokens.append(self._oov_token)
return " ".join(tokens)
return self._decode_token_separator.join(tokens)

@property
def vocab_size(self):
Expand Down

0 comments on commit 3bc311e

Please sign in to comment.