Improve the way profiling is performed

pribalta · pribalta · commit 6ca963788565 · 2020-01-10T11:11:08.000+01:00
Signed-off-by: Pablo Ribalta <pribalta@nvidia.com>
diff --git a/TensorFlow/Segmentation/VNet/Dockerfile b/TensorFlow/Segmentation/VNet/Dockerfile
@@ -4,7 +4,6 @@ ADD . /workspace/vnet
 WORKDIR /workspace/vnet
 
 RUN pip install --upgrade pip
-RUN pip install --user git+https://github.com/NVIDIA/dllogger
 RUN pip install --disable-pip-version-check -r requirements.txt
 
 
diff --git a/TensorFlow/Segmentation/VNet/hooks/profiling_hook.py b/TensorFlow/Segmentation/VNet/hooks/profiling_hook.py
@@ -36,6 +36,6 @@ def before_run(self, run_context):
 
     def end(self, session):
         deltas = [self._timestamps[i + 1] - self._timestamps[i] for i in range(len(self._timestamps) - 1)]
-        self._logger.log(step=self._step, data={
-            'average_throughput_' + 'train' if self._training else 'test': self._global_batch_size / np.mean(deltas)})
+        self._logger.log(step=(), data={
+            'average_throughput_train' if self._training else 'average_throughput_test': self._global_batch_size / np.mean(deltas)})
         self._logger.flush()
diff --git a/TensorFlow/Segmentation/VNet/hooks/train_hook.py b/TensorFlow/Segmentation/VNet/hooks/train_hook.py
@@ -36,7 +36,7 @@ def after_run(self,
                   run_context,
                   run_values):
         if self._step % self._log_every == 0:
-            self._logger.log(step=self._step, data={'total_loss': run_values.results[0]})
+            self._logger.log(step=(self._step,), data={'total_loss': str(run_values.results[0])})
         self._step += 1
 
     def end(self, session):
diff --git a/TensorFlow/Segmentation/VNet/main.py b/TensorFlow/Segmentation/VNet/main.py
@@ -87,7 +87,7 @@ def main(_):
 
     run_config = tf.estimator.RunConfig(
         save_summary_steps=None,
-        save_checkpoints_steps=dataset.train_steps * FLAGS.train_epochs,
+        save_checkpoints_steps=None if FLAGS.benchmark else dataset.train_steps * FLAGS.train_epochs,
         save_checkpoints_secs=None,
         tf_random_seed=None,
         session_config=config,
@@ -112,22 +112,32 @@ def main(_):
             if hvd.rank() == 0:
                 train_hooks += [TrainHook(FLAGS.log_every, DLLogger)]
 
+        DLLogger.log(step=tuple(), data={"training": "START"})
+
         estimator.train(
             input_fn=lambda: dataset.train_fn(FLAGS.augment),
             steps=steps,
             hooks=train_hooks)
 
+        DLLogger.log(step=tuple(), data={"training": "FINISHED"})
+
     if 'evaluate' in FLAGS.exec_mode:
         if hvd.rank() == 0:
             if FLAGS.train_split >= 1.0:
                 raise ValueError("Missing argument: --train_split < 1.0")
+
+            DLLogger.log(step=tuple(), data={"evaluating": "START"})
+
             result = estimator.evaluate(
                 input_fn=dataset.eval_fn,
                 steps=dataset.eval_steps,
                 hooks=[])
-            DLLogger.log(step=tuple(), data={'background_dice': result['background dice']})
-            DLLogger.log(step=tuple(), data={'anterior_dice': result['Anterior dice']})
-            DLLogger.log(step=tuple(), data={'posterior_dice': result['Posterior dice']})
+
+            DLLogger.log(step=tuple(), data={"evaluating": "FINISH"})
+
+            DLLogger.log(step=tuple(), data={'background_dice': str(result['background dice'])})
+            DLLogger.log(step=tuple(), data={'anterior_dice': str(result['Anterior dice'])})
+            DLLogger.log(step=tuple(), data={'posterior_dice': str(result['Posterior dice'])})
 
     if 'predict' in FLAGS.exec_mode:
         count = 1
diff --git a/TensorFlow/Segmentation/VNet/requirements.txt b/TensorFlow/Segmentation/VNet/requirements.txt
@@ -2,3 +2,4 @@ SimpleITK==1.1.0
 requests
 googledrivedownloader
 tf2onnx
+git+https://github.com/NVIDIA/dllogger#egg=dllogger
diff --git a/TensorFlow/Segmentation/VNet/utils/model_fn.py b/TensorFlow/Segmentation/VNet/utils/model_fn.py
@@ -122,6 +122,7 @@ def vnet_v2(features, labels, mode, params):
                         loss_scale='dynamic'
                     )
 
+
                 train_op = optimizer.minimize(total_loss, global_step=global_step)
 
         eval_metric_ops = None

Original file line number	Diff line number	Diff line change
`@@ -122,6 +122,7 @@ def vnet_v2(features, labels, mode, params):`
`122`	`122`	`loss_scale='dynamic'`
`123`	`123`	`)`
`124`	`124`
	`125`	`+`
`125`	`126`	`train_op = optimizer.minimize(total_loss, global_step=global_step)`
`126`	`127`
`127`	`128`	`eval_metric_ops = None`