23 | 23 | K = tf.keras.backend
24 | 24 |
25 | 25 |
26 |    | -class LazyAdam(tf.keras.optimizers.Adam):
27 |    | -  """Variant of the Adam optimizer that handles sparse updates more efficiently.
28 |    | -
29 |    | -  The original Adam algorithm maintains two moving-average accumulators for
30 |    | -  each trainable variable; the accumulators are updated at every step.
31 |    | -  This class provides lazier handling of gradient updates for sparse
32 |    | -  variables. It only updates moving-average accumulators for sparse variable
33 |    | -  indices that appear in the current batch, rather than updating the
34 |    | -  accumulators for all indices. Compared with the original Adam optimizer,
35 |    | -  it can provide large improvements in model training throughput for some
36 |    | -  applications. However, it provides slightly different semantics than the
37 |    | -  original Adam algorithm, and may lead to different empirical results.
38 |    | -  Note, amsgrad is currently not supported and the argument can only be
39 |    | -  False.
40 |    | -
41 |    | -  This class is borrowed from:
42 |    | -  https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lazy_adam.py
43 |    | -  """
44 |    | -
45 |    | -  def _resource_apply_sparse(self, grad, var, indices):
46 |    | -    """Applies grad for one step."""
47 |    | -    var_dtype = var.dtype.base_dtype
48 |    | -    lr_t = self._decayed_lr(var_dtype)
49 |    | -    beta_1_t = self._get_hyper('beta_1', var_dtype)
50 |    | -    beta_2_t = self._get_hyper('beta_2', var_dtype)
51 |    | -    local_step = tf.cast(self.iterations + 1, var_dtype)
52 |    | -    beta_1_power = tf.math.pow(beta_1_t, local_step)
53 |    | -    beta_2_power = tf.math.pow(beta_2_t, local_step)
54 |    | -    epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype)
55 |    | -    lr = (lr_t * tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power))
56 |    | -
57 |    | -    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
58 |    | -    m = self.get_slot(var, 'm')
59 |    | -    m_t_slice = beta_1_t * tf.gather(m, indices) + (1 - beta_1_t) * grad
60 |    | -
61 |    | -    m_update_kwargs = {
62 |    | -        'resource': m.handle,
63 |    | -        'indices': indices,
64 |    | -        'updates': m_t_slice
65 |    | -    }
66 |    | -    m_update_op = tf.raw_ops.ResourceScatterUpdate(**m_update_kwargs)
67 |    | -
68 |    | -    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
69 |    | -    v = self.get_slot(var, 'v')
70 |    | -    v_t_slice = (beta_2_t * tf.gather(v, indices) +
71 |    | -                 (1 - beta_2_t) * tf.math.square(grad))
72 |    | -
73 |    | -    v_update_kwargs = {
74 |    | -        'resource': v.handle,
75 |    | -        'indices': indices,
76 |    | -        'updates': v_t_slice
77 |    | -    }
78 |    | -    v_update_op = tf.raw_ops.ResourceScatterUpdate(**v_update_kwargs)
79 |    | -
80 |    | -    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
81 |    | -    var_slice = lr * m_t_slice / (tf.math.sqrt(v_t_slice) + epsilon_t)
82 |    | -
83 |    | -    var_update_kwargs = {
84 |    | -        'resource': var.handle,
85 |    | -        'indices': indices,
86 |    | -        'updates': var_slice
87 |    | -    }
88 |    | -    var_update_op = tf.raw_ops.ResourceScatterSub(**var_update_kwargs)
89 |    | -
90 |    | -    return tf.group(*[var_update_op, m_update_op, v_update_op])
91 |    | -
92 |    | -
93 | 26 | class LearningRateFn(object):
94 | 27 |   """Creates learning rate function."""
95 | 28 |
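The deleted class was a copy of the TensorFlow Addons implementation (its docstring links to it), so code that still needs lazy sparse Adam updates can presumably migrate to the upstream optimizer. A minimal sketch, assuming the `tensorflow-addons` package is installed; the toy embedding model and random data below are illustrative only and not part of this commit:

```python
import tensorflow as tf
import tensorflow_addons as tfa  # assumption: tensorflow-addons is available

# Toy embedding model: each batch only touches a few embedding rows, which is
# exactly the sparse-gradient case the lazy optimizer is meant to speed up.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=1000, output_dim=16),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1),
])

# tfa.optimizers.LazyAdam updates the 'm'/'v' slots only for the indices seen
# in the current batch, mirroring the behavior of the removed class.
model.compile(optimizer=tfa.optimizers.LazyAdam(learning_rate=1e-3), loss='mse')

x = tf.random.uniform((32, 8), maxval=1000, dtype=tf.int32)  # fake token ids
y = tf.random.uniform((32, 1))                               # fake targets
model.fit(x, y, epochs=1, verbose=0)
```

As the removed docstring notes, the lazy variant trades exact Adam semantics for throughput, so results can differ slightly from the stock Adam optimizer.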