|
4 | 4 | from plasma.utils.performance import PerformanceAnalyzer
|
5 | 5 | from plasma.utils.processing import concatenate_sublists
|
6 | 6 | from plasma.utils.evaluation import get_loss_from_list
|
| 7 | +# KGF: this is the first module that imports Keras: |
7 | 8 | from plasma.models import builder
|
8 | 9 | from plasma.models.loader import ProcessGenerator
|
9 | 10 | from plasma.utils.state_reset import reset_states
|
| 11 | +# KGF: plasma.conf calls print_unique() for "Selected signals". Ensure that |
| 12 | +# Keras "Using TensorFlow backend" stderr messages do not interfere with stdout |
10 | 13 | from plasma.conf import conf
|
11 | 14 | from mpi4py import MPI
|
12 | 15 | from pkg_resources import parse_version, get_distribution
|
|
39 | 42 | import socket
|
40 | 43 | sys.setrecursionlimit(10000)
|
41 | 44 |
|
| 45 | +# TODO(KGF): remove the next 3 lines? |
42 | 46 | # import keras sequentially because it otherwise reads from ~/.keras/keras.json
|
43 | 47 | # with too many threads:
|
44 | 48 | # from mpi_launch_tensorflow import get_mpi_task_index
|
|
49 | 53 | # g.init_MPI()
|
50 | 54 | # TODO(KGF): set "mpi_initialized" global bool flag?
|
51 | 55 |
|
| 56 | +g.flush_all_inorder() # see above about conf_parser.py stdout writes |
| 57 | + |
52 | 58 | # initialization code for mpi_runner.py module:
|
53 | 59 | if g.backend == 'tf' or g.backend == 'tensorflow':
|
54 | 60 | if g.NUM_GPUS > 1:
|
|
67 | 73 | import tensorflow.compat.v1 as tf
|
68 | 74 | else:
|
69 | 75 | import tensorflow as tf
|
| 76 | + # TODO(KGF): above, builder.py (bug workaround), mpi_launch_tensorflow.py, |
| 77 | + # and runner.py are the only files that import tensorflow directly |
| 78 | + |
70 | 79 | from keras.backend.tensorflow_backend import set_session
|
| 80 | + # KGF: next 3 statements dump many TensorFlow diagnostics to stderr. |
| 81 | + # All MPI ranks first "Successfully opened dynamic library libcuda" |
| 82 | + # then, one by one: ID GPU, libcudart, libcublas, libcufft, ... |
| 83 | + # Finally, "Device interconnect StreamExecutor with strength 1 edge matrix" |
71 | 84 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95,
|
72 | 85 | allow_growth=True)
|
73 | 86 | config = tf.ConfigProto(gpu_options=gpu_options)
|
74 | 87 | set_session(tf.Session(config=config))
|
| 88 | + g.flush_all_inorder() |
| 89 | + g.comm.Barrier() |
75 | 90 | else:
|
76 | 91 | os.environ['KERAS_BACKEND'] = 'theano'
|
77 | 92 | base_compile_dir = '{}/tmp/{}-{}'.format(
|
|
90 | 105 | from keras.utils.generic_utils import Progbar
|
91 | 106 | import keras.callbacks as cbks
|
92 | 107 |
|
93 |
| - |
94 | 108 | g.pprint_unique(conf)
|
| 109 | +g.flush_all_inorder() |
| 110 | +g.comm.Barrier() |
95 | 111 |
|
96 | 112 |
|
97 | 113 | class MPIOptimizer(object):
|
|
0 commit comments