Fix bug involving temporary filepaths when loading a Tokenizer of a TokenTextEncoder.

Because the data directory is marked as "incomplete" while the dataset is being built and is renamed afterwards, absolute filepaths written during the build are wrong once the rename happens. This change eliminates the absolute filepaths and instead assumes a constant relative structure between the metadata files.

PiperOrigin-RevId: 258279826
bdnoxt · Jul 16, 2019 · 30cdeb2 · 30cdeb2
1 parent a5d0580
commit 30cdeb2
Showing 1 changed file with 4 additions and 4 deletions.
diff --git a/tensorflow_datasets/core/features/text/text_encoder.py b/tensorflow_datasets/core/features/text/text_encoder.py
@@ -330,16 +330,16 @@ def save_to_file(self, filename_prefix):
     }
     if self._user_defined_tokenizer is not None:
       self._tokenizer.save_to_file(filename)
-      kwargs["tokenizer_file_prefix"] = filename
+      kwargs["has_tokenizer"] = True
     self._write_lines_to_file(filename, self._vocab_list, kwargs)
 
   @classmethod
   def load_from_file(cls, filename_prefix):
     filename = cls._filename(filename_prefix)
     vocab_lines, kwargs = cls._read_lines_from_file(filename)
-    tokenizer_file = kwargs.pop("tokenizer_file_prefix", None)
-    if tokenizer_file:
-      kwargs["tokenizer"] = Tokenizer.load_from_file(tokenizer_file)
+    has_tokenizer = kwargs.pop("has_tokenizer", False)
+    if has_tokenizer:
+      kwargs["tokenizer"] = Tokenizer.load_from_file(filename)
     return cls(vocab_list=vocab_lines, **kwargs)