[NCF] Add run_eagerly for ctl. (tensorflow#7229)

tfboyd · nnigania · commit 62184a96e740 · 2019-08-09T10:13:29.000-07:00
* Add run_eagerly for ctl.

* Fix test name and do not set "default".
diff --git a/official/recommendation/ncf_keras_benchmark.py b/official/recommendation/ncf_keras_benchmark.py
@@ -181,6 +181,13 @@ def benchmark_1_gpu_ctl_early_stop(self):
     FLAGS.early_stopping = True
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_ctl_run_eagerly_early_stop(self):
+    self._setup()
+    FLAGS.keras_use_ctl = True  # Use the custom training loop (CTL) path.
+    FLAGS.early_stopping = True
+    FLAGS.run_eagerly = True  # Steps run without tf.function wrapping.
+    self._run_and_report_benchmark()
+
   def benchmark_xla_1_gpu_ctl_early_stop(self):
     self._setup()
     FLAGS.keras_use_ctl = True
@@ -203,7 +210,7 @@ def benchmark_2_gpus_ctl_early_stop(self):
     self._run_and_report_benchmark()
 
 #############################################
-# Tests below with mlperf in the test name are of two types
+# Tests below with mlperf in the test name are of two types:
 #  1) 1 GPU tests are based on MLPerf 0.5 and the TensorFlow pulled submission.
 #  2) 8 GPU tests are based on MLPerf 0.5 and use NVIDIA's hyper parameters.
 #
@@ -254,6 +261,14 @@ def benchmark_1_gpu_ctl_mlperf_like(self):
     FLAGS.train_epochs = 7
     self._run_and_report_benchmark_mlperf_like()
 
+  def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
+    """1 GPU using CTL with eager and distribution strategy."""
+    self._setup()
+    FLAGS.keras_use_ctl = True
+    FLAGS.run_eagerly = True  # Steps run without tf.function wrapping.
+    FLAGS.train_epochs = 7
+    self._run_and_report_benchmark_mlperf_like()  # Match other mlperf tests.
+
   def benchmark_xla_1_gpu_ctl_mlperf_like(self):
     """1 GPU using CTL with XLA."""
     self._setup()
diff --git a/official/recommendation/ncf_keras_main.py b/official/recommendation/ncf_keras_main.py
@@ -285,7 +285,6 @@ def run_ncf(_):
     train_input_iterator = strategy.make_dataset_iterator(train_input_dataset)
     eval_input_iterator = strategy.make_dataset_iterator(eval_input_dataset)
 
-    @tf.function
     def train_step():
       """Called once per step to train the model."""
       def step_fn(features):
@@ -310,7 +309,6 @@ def step_fn(features):
           tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
       return mean_loss
 
-    @tf.function
     def eval_step():
       """Called once per eval step to compute eval metrics."""
       def step_fn(features):
@@ -330,6 +328,10 @@ def step_fn(features):
           tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
       return hr_sum, hr_count
 
+    if not FLAGS.run_eagerly:
+      train_step = tf.function(train_step)
+      eval_step = tf.function(eval_step)
+
     time_callback.on_train_begin()
     for epoch in range(FLAGS.train_epochs):
       for cb in callbacks: