Completely sort out order of initial Keras and/or TensorFlow init msgs

felker · felker · commit c5dfa9d8d5a1 · 2019-11-06T15:40:58.000-05:00
Don't forget about stderr!
diff --git a/plasma/models/builder.py b/plasma/models/builder.py
@@ -1,4 +1,8 @@
-from __future__ import division
+from __future__ import division, print_function
+import plasma.global_vars as g
+# KGF: the first time Keras is ever imported via mpi_learn.py -> mpi_runner.py
+import keras.backend as K
+# KGF: see the synchronization below -- output is launched here
 from keras.models import Sequential, Model
 from keras.layers import Input
 from keras.layers.core import (
@@ -14,8 +18,6 @@
 from keras.callbacks import Callback
 from keras.regularizers import l2  # l1, l1_l2
 
-import keras.backend as K
-
 import re
 import os
 import sys
@@ -24,6 +26,23 @@
 from plasma.utils.downloading import makedirs_process_safe
 from plasma.utils.hashing import general_object_hash
 
+# Synchronize 2x stderr msg from TensorFlow initialization via Keras backend
+# "Successfully opened dynamic library... libcudart" "Using TensorFlow backend."
+if g.comm is not None:
+    g.flush_all_inorder()
+# if g.comm is not None:
+#     g.comm.Barrier()
+# if g.task_index == 0:
+#     sys.stdout.flush()
+#     sys.stderr.flush()
+# if g.comm is not None:
+#     g.comm.Barrier()
+# TODO(KGF): need to create wrapper .py file (or place in some __init__.py)
+# that detects, for an arbitrary import, if tensorflow has been initialized
+# either directly from "import tensorflow ..." and/or via backend of
+# "from keras.layers ..."
+# OR if this is the first time. See below "first_time" variable.
+
 
 class LossHistory(Callback):
     def on_train_begin(self, logs=None):
@@ -262,6 +281,8 @@ def slicer_output_shape(input_shape, indices):
             x_out = Dense(1, activation=output_activation)(x_in)
         model = Model(inputs=x_input, outputs=x_out)
         # bug with tensorflow/Keras
+        # TODO(KGF): what is this bug? this is the only direct "tensorflow"
+        # import outside of mpi_runner.py and runner.py
         if (conf['model']['backend'] == 'tf'
                 or conf['model']['backend'] == 'tensorflow'):
             first_time = "tensorflow" not in sys.modules
diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
@@ -4,9 +4,12 @@
 from plasma.utils.performance import PerformanceAnalyzer
 from plasma.utils.processing import concatenate_sublists
 from plasma.utils.evaluation import get_loss_from_list
+# KGF: this is the first module that imports Keras:
 from plasma.models import builder
 from plasma.models.loader import ProcessGenerator
 from plasma.utils.state_reset import reset_states
+# KGF: plasma.conf calls print_unique() for "Selected signals". Ensure that
+# Keras "Using TensorFlow backend" stderr messages do not interfere in stdout
 from plasma.conf import conf
 from mpi4py import MPI
 from pkg_resources import parse_version, get_distribution
@@ -39,6 +42,7 @@
 import socket
 sys.setrecursionlimit(10000)
 
+# TODO(KGF): remove the next 3 lines?
 # import keras sequentially because it otherwise reads from ~/.keras/keras.json
 # with too many threads:
 # from mpi_launch_tensorflow import get_mpi_task_index
@@ -49,6 +53,8 @@
 # g.init_MPI()
 # TODO(KGF): set "mpi_initialized" global bool flag?
 
+g.flush_all_inorder()   # see above about conf_parser.py stdout writes
+
 # initialization code for mpi_runner.py module:
 if g.backend == 'tf' or g.backend == 'tensorflow':
     if g.NUM_GPUS > 1:
@@ -67,11 +73,20 @@
         import tensorflow.compat.v1 as tf
     else:
         import tensorflow as tf
+    # TODO(KGF): above, builder.py (bug workaround), mpi_launch_tensorflow.py,
+    # and runner.py are the only files that import tensorflow directly
+
     from keras.backend.tensorflow_backend import set_session
+    # KGF: next 3 lines dump many TensorFlow diagnostics to stderr.
+    # All MPI ranks first "Successfully opened dynamic library libcuda"
+    # then, one by one: ID GPU, libcudart, libcublas, libcufft, ...
+    # Finally, "Device interconnect StreamExecutor with strength 1 edge matrix"
     gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95,
                                 allow_growth=True)
     config = tf.ConfigProto(gpu_options=gpu_options)
     set_session(tf.Session(config=config))
+    g.flush_all_inorder()
+    g.comm.Barrier()
 else:
     os.environ['KERAS_BACKEND'] = 'theano'
     base_compile_dir = '{}/tmp/{}-{}'.format(
@@ -90,8 +105,9 @@
         from keras.utils.generic_utils import Progbar
         import keras.callbacks as cbks
 
-
 g.pprint_unique(conf)
+g.flush_all_inorder()
+g.comm.Barrier()
 
 
 class MPIOptimizer(object):
diff --git a/plasma/models/targets.py b/plasma/models/targets.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+import plasma.global_vars as g
 import numpy as np
 import abc
 
@@ -7,9 +9,14 @@
     )
 import keras.backend as K
 from keras.losses import hinge
-# Requirement: larger value must mean disruption more likely.
+
+# synchronize output from TensorFlow initialization via Keras backend
+if g.comm is not None:
+    g.flush_all_inorder()
+    g.comm.Barrier()
 
 
+# Requirement: larger value must mean disruption more likely.
 class Target(object):
     activation = 'linear'
     loss = 'mse'