This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 49e7cf5

stefan-falk authored and afrozenator committed
Exposing batch_shuffle_size as hparam (#1231)
* Pass data_dir to feature_encoders
* Fixing error passing wrong data_dir
* Exposing batch_shuffle_size as hparam
* Checking d_input since d_input may be None
1 parent 57db0b7 commit 49e7cf5
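
The net effect of the change set: batch_shuffle_size moves from an input_fn argument into the default hyperparameter set, so it can be configured like any other hparam. A minimal usage sketch (not part of this commit), assuming tensor2tensor is installed:

from tensor2tensor.layers import common_hparams

hparams = common_hparams.basic_params1()
# batch_shuffle_size defaults to 512 after this change; a falsy value such as 0
# skips the batch-level shuffle in Problem.input_fn.
hparams.batch_shuffle_size = 1024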

File tree

3 files changed, +8 -8 lines changed


tensor2tensor/data_generators/problem.py

Lines changed: 4 additions & 6 deletions
@@ -802,8 +802,7 @@ def input_fn(self,
               config=None,
               force_repeat=False,
               prevent_repeat=False,
-              dataset_kwargs=None,
-              batch_shuffle_size=512):
    """Builds input pipeline for problem.

    Args:
@@ -818,8 +817,6 @@ def input_fn(self,
        Overrides force_repeat.
      dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
        method when called
-      batch_shuffle_size: int, the size of the buffer to shuffle batches.
-        if none, the batches will not be shuffled.

    Returns:
      (features_dict<str name, Tensor feature>, Tensor targets)
@@ -969,8 +966,9 @@ def define_shapes(example):
    # buffer size for record shuffling is smaller than the batch size. In such
    # cases, adding batch shuffling ensures that the data is in random order
    # during training
-    if is_training and batch_shuffle_size:
-      dataset = dataset.shuffle(batch_shuffle_size)
+    if hasattr(hparams, 'batch_shuffle_size'):
+      if is_training and hparams.batch_shuffle_size:
+        dataset = dataset.shuffle(hparams.batch_shuffle_size)

    def prepare_for_output(example):
      if not config or not config.use_tpu:
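
For reference, a self-contained sketch of the guarded shuffle added above, on a toy tf.data pipeline (assumes TensorFlow and tensor2tensor are installed); the hasattr check keeps older hparams objects that lack batch_shuffle_size working:

import tensorflow as tf
from tensor2tensor.layers import common_hparams

hparams = common_hparams.basic_params1()
is_training = True

dataset = tf.data.Dataset.range(100).batch(8)
if hasattr(hparams, "batch_shuffle_size"):
  if is_training and hparams.batch_shuffle_size:
    # Shuffle whole batches, which helps when the record-level shuffle buffer
    # is smaller than the batch size (see the comment in input_fn above).
    dataset = dataset.shuffle(hparams.batch_shuffle_size)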

tensor2tensor/layers/common_hparams.py

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ def basic_params1():
      # of tokens per batch per GPU or per TPU core. Otherwise, this is
      # the number of examples per GPU or per TPU core.
      batch_size=4096,
+      batch_shuffle_size=512,
      # If True, then if the features are of variable length, the batch_size is
      # used as the actual batch size (and not tokens per batch).
      use_fixed_batch_size=False,
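
Because the default now lives in basic_params1, derived hparams sets inherit it and can override it. A hypothetical example (the set name below is illustrative, not from the repo):

from tensor2tensor.layers import common_hparams
from tensor2tensor.utils import registry

@registry.register_hparams
def basic_params1_no_batch_shuffle():
  """Hypothetical hparams set that disables batch-level shuffling."""
  hparams = common_hparams.basic_params1()
  hparams.batch_shuffle_size = 0  # falsy, so input_fn skips dataset.shuffle
  return hparams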

tensor2tensor/utils/decoding.py

Lines changed: 3 additions & 2 deletions
@@ -319,8 +319,9 @@ def decode_once(estimator,
    if decode_to_file:
      for i, (d_input, d_output, d_target) in enumerate(decoded_outputs):
        # Skip if all padding
-        if re.match("^({})+$".format(text_encoder.PAD), d_input):
-          continue
+        if d_input:
+          if re.match("^({})+$".format(text_encoder.PAD), d_input):
+            continue
        beam_score_str = ""
        if decode_hp.write_beam_scores:
          beam_score_str = "\t%.2f" % decoded_scores[i]
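
To see the decoding guard in isolation: the commit message notes that d_input may be None, and the new outer check only applies the all-padding skip to non-empty strings. A standalone sketch, assuming text_encoder.PAD is the padding token string:

import re
from tensor2tensor.data_generators import text_encoder

for d_input in [None, text_encoder.PAD * 3, "some real input"]:
  if d_input:
    if re.match("^({})+$".format(text_encoder.PAD), d_input):
      continue  # input is nothing but padding, skip writing this example
  print("would be written to the decode file:", d_input)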
