Explicitly import estimator from tensorflow as a separate import instead of accessing it via tf.estimator and depend on the tensorflow estimator target.

PiperOrigin-RevId: 436808246
tensorflow · Mar 23, 2022 · 316c9ce · 316c9ce
1 parent a8e50c0
commit 316c9ce
Show file tree

Hide file tree

Showing 17 changed files with 60 additions and 43 deletions.
diff --git a/tensor2tensor/bin/t2t_attack.py b/tensor2tensor/bin/t2t_attack.py
@@ -49,6 +49,7 @@
 from tensor2tensor.utils import usr_dir
 
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 
 flags = tf.flags
 FLAGS = flags.FLAGS
@@ -134,7 +135,7 @@ def create_surrogate_run_config(hp):
 def prepare_data(problem, hparams, params, config):
   """Construct input pipeline."""
   input_fn = problem.make_estimator_input_fn(
-      tf.estimator.ModeKeys.EVAL, hparams, force_repeat=True)
+      tf_estimator.ModeKeys.EVAL, hparams, force_repeat=True)
   dataset = input_fn(params, config)
   features, _ = dataset.make_one_shot_iterator().get_next()
   inputs, labels = features["targets"], features["inputs"]

diff --git a/tensor2tensor/bin/t2t_decoder.py b/tensor2tensor/bin/t2t_decoder.py
@@ -42,6 +42,7 @@
 from tensor2tensor.utils import usr_dir
 
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 
 flags = tf.flags
 FLAGS = flags.FLAGS
@@ -129,7 +130,7 @@ def score_file(filename):
     features = {"targets": batch_targets}
 
   # Prepare the model and the graph when model runs on features.
-  model = registry.model(FLAGS.model)(hparams, tf.estimator.ModeKeys.EVAL)
+  model = registry.model(FLAGS.model)(hparams, tf_estimator.ModeKeys.EVAL)
   _, losses = model(features)
   saver = tf.train.Saver()
 

diff --git a/tensor2tensor/bin/t2t_eval.py b/tensor2tensor/bin/t2t_eval.py
@@ -24,6 +24,7 @@
 from tensor2tensor.utils import trainer_lib
 from tensor2tensor.utils import usr_dir
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 
 flags = tf.flags
 FLAGS = flags.FLAGS
@@ -42,7 +43,7 @@ def main(_):
   dataset_split = "test" if FLAGS.eval_use_test_set else None
   dataset_kwargs = {"dataset_split": dataset_split}
   eval_input_fn = hparams.problem.make_estimator_input_fn(
-      tf.estimator.ModeKeys.EVAL, hparams, dataset_kwargs=dataset_kwargs)
+      tf_estimator.ModeKeys.EVAL, hparams, dataset_kwargs=dataset_kwargs)
   config = t2t_trainer.create_run_config(hparams)
 
   # summary-hook in tf.estimator.EstimatorSpec requires

diff --git a/tensor2tensor/bin/t2t_prune.py b/tensor2tensor/bin/t2t_prune.py
@@ -40,6 +40,7 @@
 from tensor2tensor.utils import usr_dir
 
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 
 flags = tf.flags
 FLAGS = flags.FLAGS
@@ -79,7 +80,7 @@ def main(argv):
 
   # add "_rev" as a hack to avoid image standardization
   problem = registry.problem(FLAGS.problem)
-  input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.EVAL,
+  input_fn = problem.make_estimator_input_fn(tf_estimator.ModeKeys.EVAL,
                                              hparams)
   dataset = input_fn(params, config).repeat()
   features, labels = dataset.make_one_shot_iterator().get_next()
@@ -91,7 +92,7 @@ def main(argv):
   spec = model_fn(
       features,
       labels,
-      tf.estimator.ModeKeys.EVAL,
+      tf_estimator.ModeKeys.EVAL,
       params=hparams,
       config=config)
 

diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py
@@ -35,6 +35,7 @@
 from tensor2tensor.utils import trainer_lib
 from tensor2tensor.utils import usr_dir
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 
 
 flags = tf.flags
@@ -243,7 +244,7 @@ def create_run_config(hp, output_dir=None):
         "num_cores_per_replica":
             1,
         "per_host_input_for_training":
-            tf.estimator.tpu.InputPipelineConfig.BROADCAST,
+            tf_estimator.tpu.InputPipelineConfig.BROADCAST,
     }
 
   # the various custom getters we have written do not play well together yet.

diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py
@@ -27,6 +27,7 @@
 from tensor2tensor.utils import expert_utils
 
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 
 
 class AttentionType(object):
@@ -460,7 +461,7 @@ def ffn_layer(x, hparams, losses=None):
       y = tf.reshape(y, x_shape)
     elif hparams.ffn_layer == "local_moe_tpu":
       overhead = (hparams.moe_overhead_train
-                  if hparams.mode == tf.estimator.ModeKeys.TRAIN
+                  if hparams.mode == tf_estimator.ModeKeys.TRAIN
                   else hparams.moe_overhead_eval)
       x, x_shape, is_4d = maybe_reshape_4d_to_3d(x)
       y, loss = expert_utils.local_moe_tpu(
@@ -531,7 +532,7 @@ def postprocess_image(x, rows, cols, hparams):
                               use_bias=True,
                               activation=None,
                               name="output_conv")
-  if (hparams.mode == tf.estimator.ModeKeys.PREDICT and
+  if (hparams.mode == tf_estimator.ModeKeys.PREDICT and
       hparams.block_raster_scan):
     y = targets
     yshape = common_layers.shape_list(y)
@@ -577,7 +578,7 @@ def prepare_decoder(targets, hparams):
 
   # during training, images are [batch, IMG_LEN, IMG_LEN, 3].
   # At inference, they are [batch, curr_infer_length, 1, 1]
-  if hparams.mode == tf.estimator.ModeKeys.PREDICT:
+  if hparams.mode == tf_estimator.ModeKeys.PREDICT:
     curr_infer_length = targets_shape[1]
     if hparams.block_raster_scan:
       assert hparams.img_len*channels % hparams.query_shape[1] == 0
@@ -659,7 +660,7 @@ def create_output(decoder_output, rows, cols, targets, hparams):
   batch = common_layers.shape_list(decoded_image)[0]
   depth = common_layers.shape_list(decoded_image)[-1]
   likelihood = getattr(hparams, "likelihood", DistributionType.CAT)
-  if hparams.mode == tf.estimator.ModeKeys.PREDICT:
+  if hparams.mode == tf_estimator.ModeKeys.PREDICT:
     y = tf.reshape(decoded_image, [batch, -1, 1, 1, depth])
     output = y[:, :rows, :, :, :]
   elif likelihood == DistributionType.CAT:

diff --git a/tensor2tensor/layers/common_image_attention_test.py b/tensor2tensor/layers/common_image_attention_test.py
@@ -25,6 +25,7 @@
 from tensor2tensor.utils import hparam
 
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 
 
 class CommonImageAttentionTest(parameterized.TestCase, tf.test.TestCase):
@@ -40,7 +41,7 @@ def testPostProcessImageTrainMode(self, likelihood, num_mixtures, depth):
     hparams = hparam.HParams(
         hidden_size=2,
         likelihood=likelihood,
-        mode=tf.estimator.ModeKeys.TRAIN,
+        mode=tf_estimator.ModeKeys.TRAIN,
         num_mixtures=num_mixtures,
     )
     inputs = tf.random_uniform([batch, rows, cols, hparams.hidden_size],
@@ -63,7 +64,7 @@ def testPostProcessImageInferMode(self, likelihood, num_mixtures, depth):
         block_raster_scan=True,
         hidden_size=2,
         likelihood=likelihood,
-        mode=tf.estimator.ModeKeys.PREDICT,
+        mode=tf_estimator.ModeKeys.PREDICT,
         num_mixtures=num_mixtures,
         query_shape=[block_length, block_width],
     )
@@ -95,7 +96,7 @@ def testCreateOutputTrainMode(self, likelihood, num_mixtures, depth):
         hidden_size=2,
         likelihood=likelihood,
         num_channels=channels,
-        mode=tf.estimator.ModeKeys.TRAIN,
+        mode=tf_estimator.ModeKeys.TRAIN,
         num_mixtures=num_mixtures,
     )
     decoder_output = tf.random_normal([batch, rows, cols, hparams.hidden_size])

diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py
@@ -25,6 +25,7 @@
 from tensor2tensor.layers import common_layers
 
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 import tensorflow_probability as tfp
 
 from tensorflow.python.training import moving_averages  # pylint: disable=g-direct-tensorflow-import
@@ -472,7 +473,7 @@ def gumbel_softmax(x,
     d_dev = -tf.reduce_mean(d_variance)
     ret = s
 
-    if mode != tf.estimator.ModeKeys.TRAIN:
+    if mode != tf_estimator.ModeKeys.TRAIN:
       ret = tf.reshape(maxvhot, common_layers.shape_list(s))  # Just hot @eval.
     return m, ret, d_dev * 5.0 + tf.reduce_mean(kl) * 0.002
 
@@ -754,7 +755,7 @@ def discrete_bottleneck(inputs,
       y_clean = common_layers.saturating_sigmoid(outputs_discrete)
       if summary:
         tf.summary.histogram("y_clean", tf.reshape(y_clean, [-1]))
-      if noise_dev > 0 and mode == tf.estimator.ModeKeys.TRAIN:
+      if noise_dev > 0 and mode == tf_estimator.ModeKeys.TRAIN:
         noise = tf.truncated_normal(
             common_layers.shape_list(outputs_discrete),
             mean=0.0,
@@ -766,7 +767,7 @@ def discrete_bottleneck(inputs,
       y_discrete = tf.stop_gradient(d) + y - tf.stop_gradient(y)
       pd = common_layers.inverse_exp_decay(startup_steps * 2)
       pd *= discrete_mix
-      pd = pd if mode == tf.estimator.ModeKeys.TRAIN else 1.0
+      pd = pd if mode == tf_estimator.ModeKeys.TRAIN else 1.0
       c = tf.where(
           tf.less(tf.random_uniform([common_layers.shape_list(y)[0]]), pd),
           y_discrete, y)
@@ -1379,17 +1380,17 @@ def tanh_discrete_bottleneck(x, bottleneck_bits, bottleneck_noise,
   """Simple discretization through tanh, flip bottleneck_noise many bits."""
   x = tf.layers.dense(x, bottleneck_bits, name="tanh_discrete_bottleneck")
   d0 = tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x))) - 1.0
-  if mode == tf.estimator.ModeKeys.TRAIN:
+  if mode == tf_estimator.ModeKeys.TRAIN:
     x += tf.truncated_normal(
         common_layers.shape_list(x), mean=0.0, stddev=0.2)
   x = tf.tanh(x)
   d = x + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x)) - 1.0 - x)
-  if mode == tf.estimator.ModeKeys.TRAIN:
+  if mode == tf_estimator.ModeKeys.TRAIN:
     noise = tf.random_uniform(common_layers.shape_list(x))
     noise = 2.0 * tf.to_float(tf.less(bottleneck_noise, noise)) - 1.0
     d *= noise
   d = common_layers.mix(d, x, discretize_warmup_steps,
-                        mode == tf.estimator.ModeKeys.TRAIN)
+                        mode == tf_estimator.ModeKeys.TRAIN)
   return d, d0
 
 
@@ -1410,21 +1411,21 @@ def isemhash_bottleneck(x,
   with tf.variable_scope("isemhash_bottleneck"):
     x = tf.layers.dense(x, bottleneck_bits, name="dense")
     y = common_layers.saturating_sigmoid(x)
-    if isemhash_noise_dev > 0 and mode == tf.estimator.ModeKeys.TRAIN:
+    if isemhash_noise_dev > 0 and mode == tf_estimator.ModeKeys.TRAIN:
       noise = tf.truncated_normal(
           common_layers.shape_list(x), mean=0.0, stddev=isemhash_noise_dev)
       y = common_layers.saturating_sigmoid(x + noise)
     d = tf.to_float(tf.less(0.5, y)) + y - tf.stop_gradient(y)
     d = 2.0 * d - 1.0  # Move from [0, 1] to [-1, 1].
-    if mode == tf.estimator.ModeKeys.TRAIN:  # Flip some bits.
+    if mode == tf_estimator.ModeKeys.TRAIN:  # Flip some bits.
       noise = tf.random_uniform(common_layers.shape_list(x))
       noise = 2.0 * tf.to_float(tf.less(bottleneck_noise, noise)) - 1.0
       d *= noise
       d = common_layers.mix(
           d,
           2.0 * y - 1.0,
           discretize_warmup_steps,
-          mode == tf.estimator.ModeKeys.TRAIN,
+          mode == tf_estimator.ModeKeys.TRAIN,
           max_prob=isemhash_mix_prob)
     return d, 0.0
 

diff --git a/tensor2tensor/layers/latent_layers.py b/tensor2tensor/layers/latent_layers.py
@@ -27,6 +27,7 @@
 from tensor2tensor.utils import beam_search
 
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 import tensorflow_probability as tfp
 
 DO_SUMMARIES = True
@@ -556,7 +557,7 @@ def latent_prediction_model(inputs,
     latents_pred_loss: Tensor of shape [batch, length_q].
   """
   with tf.variable_scope(name, default_name="latent_prediction"):
-    if hparams.mode != tf.estimator.ModeKeys.PREDICT:
+    if hparams.mode != tf_estimator.ModeKeys.PREDICT:
       latents_pred = transformer_latent_decoder(tf.stop_gradient(latents_dense),
                                                 inputs,
                                                 ed_attention_bias,
@@ -617,10 +618,10 @@ def transformer_autoencoder(inputs,
   losses = {"extra": 0.,
             "extra_loss": 0.,
             "latent_pred": 0.}
-  if hparams.mode != tf.estimator.ModeKeys.PREDICT:
+  if hparams.mode != tf_estimator.ModeKeys.PREDICT:
     targets_compressed = compress_fn(targets, hparams, name="compress")
 
-    if hparams.mode == tf.estimator.ModeKeys.TRAIN:
+    if hparams.mode == tf_estimator.ModeKeys.TRAIN:
       scale = common_layers.inverse_exp_decay(hparams.startup_steps)
     else:
       scale = 1.0
@@ -681,7 +682,7 @@ def transformer_autoencoder(inputs,
       [-1, hparams.img_len, hparams.img_len, hparams.hidden_size])
 
   if hparams.use_gold_targets:
-    if hparams.mode == tf.estimator.ModeKeys.PREDICT:
+    if hparams.mode == tf_estimator.ModeKeys.PREDICT:
       masking = predict_mask
     else:
       masking = common_layers.inverse_exp_decay(hparams.mask_startup_steps)

diff --git a/tensor2tensor/layers/latent_layers_test.py b/tensor2tensor/layers/latent_layers_test.py
@@ -28,6 +28,7 @@
 from tensor2tensor.utils import test_utils
 
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 tf.enable_eager_execution()
 
 
@@ -110,7 +111,7 @@ def testComputeBitsAndNats(self):
   @test_utils.run_in_graph_and_eager_modes()
   def testTransformerAutoencoder(self):
     hparams = imagetransformer_latent_tiny()
-    hparams.mode = tf.estimator.ModeKeys.TRAIN
+    hparams.mode = tf_estimator.ModeKeys.TRAIN
     block_dim = int(hparams.hidden_size // hparams.num_blocks)
     block_v_size = 2**(hparams.bottleneck_bits /
                        (hparams.num_residuals * hparams.num_blocks))

diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py
@@ -33,6 +33,7 @@
 from tensor2tensor.layers import discretization
 
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 import tensorflow_probability as tfp
 
 
@@ -309,7 +310,7 @@ def _image_channel_compress_bottom(inputs, model_hparams, name="bottom"):
   with tf.variable_scope(name):
     inputs = tf.to_float(inputs)
     hp = model_hparams
-    if hp.mode != tf.estimator.ModeKeys.PREDICT:
+    if hp.mode != tf_estimator.ModeKeys.PREDICT:
       tf.summary.image(
           "inputs",
           common_layers.tpu_safe_image_summary(inputs),
@@ -600,7 +601,7 @@ def video_pixel_noise_bottom(x, model_hparams, vocab_size):
   """Bottom transformation for video."""
   input_noise = getattr(model_hparams, "video_modality_input_noise", 0.25)
   inputs = x
-  if model_hparams.mode == tf.estimator.ModeKeys.TRAIN:
+  if model_hparams.mode == tf_estimator.ModeKeys.TRAIN:
     background = tfp.stats.percentile(inputs, 50., axis=[0, 1, 2, 3])
     input_shape = common_layers.shape_list(inputs)
     input_size = tf.reduce_prod(input_shape[:-1])
@@ -1126,7 +1127,7 @@ def symbol_top(body_output, targets, model_hparams, vocab_size):
     body_output_shape = common_layers.shape_list(body_output)
     var = get_weights(model_hparams, vocab_size, body_output_shape[-1])
     if (model_hparams.factored_logits and
-        model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
+        model_hparams.mode == tf_estimator.ModeKeys.TRAIN):
       # insert channels dimension
       body_output = tf.expand_dims(body_output, 3)
       return common_layers.FactoredTensor(body_output, var)

diff --git a/tensor2tensor/layers/modalities_test.py b/tensor2tensor/layers/modalities_test.py
@@ -26,6 +26,7 @@
 from tensor2tensor.utils import test_utils
 
 import tensorflow.compat.v1 as tf
+from tensorflow.compat.v1 import estimator as tf_estimator
 tf.enable_eager_execution()
 
 
@@ -60,7 +61,7 @@ def testSymbolModalityInputs(self):
     hidden_size = 9
     model_hparams = common_hparams.basic_params1()
     model_hparams.hidden_size = hidden_size
-    model_hparams.mode = tf.estimator.ModeKeys.TRAIN
+    model_hparams.mode = tf_estimator.ModeKeys.TRAIN
     x = np.random.randint(
         vocab_size, size=(batch_size, length, 1, 1))
     data_parallelism = expert_utils.Parallelism(
@@ -86,7 +87,7 @@ def testSymbolModalityTargets(self):
     vocab_size = 11
     model_hparams = common_hparams.basic_params1()
     model_hparams.hidden_size = hidden_size
-    model_hparams.mode = tf.estimator.ModeKeys.TRAIN
+    model_hparams.mode = tf_estimator.ModeKeys.TRAIN
     body_output = np.random.randint(
         100, size=(batch_size, length, height, hidden_size))
     targets = np.random.randint(
@@ -127,7 +128,7 @@ def testSymbolModalityTargetsFactored(self):
     model_hparams = common_hparams.basic_params1()
     model_hparams.factored_logits = True
     model_hparams.hidden_size = hidden_size
-    model_hparams.mode = tf.estimator.ModeKeys.TRAIN
+    model_hparams.mode = tf_estimator.ModeKeys.TRAIN
     body_output = np.random.randint(
         100, size=(batch_size, length, height, hidden_size))
     targets = np.random.randint(