Merge pull request NVIDIA#737 from NVIDIA/gh/release

nv-kkudrynski · web-flow · commit 308925ecaa03 · 2020-11-03T14:59:05.000+01:00
[ConvNets/TF] Performance fix
diff --git a/TensorFlow/Classification/ConvNets/main.py b/TensorFlow/Classification/ConvNets/main.py
@@ -99,7 +99,7 @@
             symmetric=FLAGS.symmetric,
             quant_delay = FLAGS.quant_delay,
             use_qdq = FLAGS.use_qdq,
-            finetune_checkpoint = FLAGS.finetune_checkpoint,
+            finetune_checkpoint=FLAGS.finetune_checkpoint,
         )
 
     if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:
diff --git a/TensorFlow/Classification/ConvNets/model/resnet.py b/TensorFlow/Classification/ConvNets/model/resnet.py
@@ -188,6 +188,9 @@ def __call__(self, features, labels, mode, params):
                 use_final_conv=params['use_final_conv']
             )
             
+            if mode!=tf.estimator.ModeKeys.PREDICT:
+                logits = tf.squeeze(logits)
+
             if mode!=tf.estimator.ModeKeys.PREDICT:
                 logits = tf.squeeze(logits)
 
@@ -201,7 +204,7 @@ def __call__(self, features, labels, mode, params):
             tf.identity(logits, name="logits_ref")
             tf.identity(probs, name="probs_ref")
             tf.identity(y_preds, name="y_preds_ref")
-            
+
             if mode == tf.estimator.ModeKeys.TRAIN and params['quantize']:
                 dllogger.log(data={"QUANTIZATION AWARE TRAINING ENABLED": True}, step=tuple())
                 if params['symmetric']:
@@ -219,7 +222,7 @@ def __call__(self, features, labels, mode, params):
                     train_var_dict[var.op.name] = var
                 dllogger.log(data={"Restoring variables from checkpoint": params['finetune_checkpoint']}, step=tuple())
                 tf.train.init_from_checkpoint(params['finetune_checkpoint'], train_var_dict)
-                
+
         if mode == tf.estimator.ModeKeys.PREDICT:
 
             predictions = {'classes': y_preds, 'probabilities': probs}
@@ -458,7 +461,7 @@ def build_model(self, inputs, training=True, reuse=False, use_final_conv=False):
 
                 if logits.dtype != tf.float32:
                     logits = tf.cast(logits, tf.float32)
-                    
+
                 axis = 3 if self.model_hparams.compute_format=="NHWC" and use_final_conv else 1
                 probs = layers.softmax(logits, name="softmax", axis=axis)
 
diff --git a/TensorFlow/Classification/ConvNets/resnet50v1.5/README.md b/TensorFlow/Classification/ConvNets/resnet50v1.5/README.md
@@ -20,6 +20,8 @@ This repository provides a script and recipe to train the ResNet-50 v1.5 model t
     * [Parameters](#parameters)
         * [The `main.py` script](#the-mainpy-script)
     * [Quantization Aware training](#quantization-aware-training)
+        * [Post process checkpoint](#post-process-checkpoint)
+        * [Exporting Frozen graphs](#exporting-frozen-graphs)
     * [Inference process](#inference-process)
 * [Performance](#performance)
     * [Benchmarking](#benchmarking)
@@ -200,7 +202,16 @@ cd DeepLearningExamples/TensorFlow/Classification/ConvNets
 2. Download and preprocess the dataset.
 The ResNet50 v1.5 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
 
-To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
+* [Download the images](http://image-net.org/download-images)
+* Extract the training and validation data:
+```bash
+mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
+tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
+find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
+cd ..
+mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
+```
+* Preprocess the dataset into TFRecord format using the [build_imagenet_data.py script](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/build_imagenet_data.py). Additional metadata from the [authors' repository](https://github.com/tensorflow/models/tree/archive/research/inception/inception/data) might be required.
 
 3. Build the ResNet-50 v1.5 TensorFlow NGC container.
 ```bash
@@ -400,7 +411,7 @@ operations for `tf.contrib.quantize.experimental_create_training_graph` has been
      * `--output` : Name of the new checkpoint file which has the FC layer weights reshaped into 1x1 conv layer weights.
      * `--dense_layer` : Name of the FC layer
 
-### Exporting Frozen graphs
+#### Exporting Frozen graphs
 To export frozen graphs (which can be used for inference with <a href="https://developer.nvidia.com/tensorrt">TensorRT</a>), use:
 
 `python export_frozen_graph.py --checkpoint <path_to_checkpoint> --quantize --use_final_conv --use_qdq --symmetric --input_format NCHW --compute_format NCHW --output_file=<output_file_name>`
@@ -452,7 +463,7 @@ To benchmark the training performance on a specific batch size, run:
 Each of these scripts runs 200 warm-up iterations and measures the first epoch.
 
 To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
-with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
+with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
 Suggested batch sizes for training are 256 for mixed precision training and 128 for single precision training per single V100 16 GB.
 
 #### Inference performance benchmark
@@ -468,8 +479,8 @@ To benchmark the inference performance on a specific batch size, run:
 `python ./main.py --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
 
 By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
-To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
-If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
+To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. 
+If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
 
 The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnet50v1.5`, by simply running:
 `bash ./resnet50v1.5/inference_benchmark.sh <data dir> <data idx dir>`
@@ -518,8 +529,8 @@ on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per secon
 
 | GPUs | Batch Size / GPU | Throughput - TF32 + XLA | Throughput - mixed precision + XLA | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 + XLA | Weak scaling - mixed precision + XLA |
 |----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
-| 1  | 256 | 808 img/s  | 1770 img/s    | 2.20x           | 1.00x     | 1.00x             |
-| 8  | 256 | 6300 img/s | 16400 img/s   | 2.60x           | 7.79x     | 9.26x             |
+| 1  | 256 | 909 img/s  | 2375 img/s    | 2.60x           | 1.00x     | 1.00x             |
+| 8  | 256 | 7000 img/s | 17400 img/s   | 2.48x           | 7.70x     | 7.32x             |
 
 ##### Training performance: NVIDIA DGX-1 (8x V100 16G)
 Our results were obtained by running the `resnet50v1.5/training/training_perf.sh` benchmark script in the 
diff --git a/TensorFlow/Classification/ConvNets/resnext101-32x4d/README.md b/TensorFlow/Classification/ConvNets/resnext101-32x4d/README.md
@@ -209,7 +209,16 @@ cd DeepLearningExamples/TensorFlow/Classification/ConvNets
 2. Download and preprocess the dataset.
 The ResNext101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
 
-To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
+* [Download the images](http://image-net.org/download-images)
+* Extract the training and validation data:
+```bash
+mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
+tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
+find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
+cd ..
+mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
+```
+* Preprocess the dataset into TFRecord format using the [build_imagenet_data.py script](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/build_imagenet_data.py). Additional metadata from the [authors' repository](https://github.com/tensorflow/models/tree/archive/research/inception/inception/data) might be required.
 
 3. Build the ResNext101-32x4d TensorFlow NGC container.
 ```bash
@@ -420,7 +429,7 @@ To benchmark the training performance on a specific batch size, run:
 Each of these scripts runs 200 warm-up iterations and measures the first epoch.
 
 To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
-with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
+with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
 Suggested batch sizes for training are 128 for mixed precision training and 64 for single precision training per single V100 16 GB.
 
 
@@ -438,7 +447,7 @@ To benchmark the inference performance on a specific batch size, run:
 
 By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
 To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
-If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
+If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
 
 The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnext101-32x4d`, by simply running:
 `bash ./resnext101-32x4d/inference_benchmark.sh <data dir> <data idx dir>`
@@ -487,8 +496,8 @@ on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per secon
 
 | GPUs | Batch Size / GPU | Throughput - TF32 + XLA | Throughput - mixed precision + XLA | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 + XLA| Weak scaling - mixed precision + XLA |
 |----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
-| 1  | 128 (TF) / 256 (AMP) | 340 img/s  | 905 img/s    | 2.66x           | 1.00x     | 1.00x             |
-| 8  | 128 (TF) / 256 (AMP) | 2630 img/s | 8000 img/s   | 3.05x           | 7.73x     | 8.84x             |
+| 1  | 128 (TF) / 256 (AMP) | 371 img/s  | 1132 img/s    | 3.05x           | 1.00x     | 1.00x             |
+| 8  | 128 (TF) / 256 (AMP) | 2854 img/s | 8500 img/s   | 2.98x           | 7.69x     | 7.51x             |
 
 
 ##### Training performance: NVIDIA DGX-1 (8x V100 16G)
diff --git a/TensorFlow/Classification/ConvNets/runtime/runner.py b/TensorFlow/Classification/ConvNets/runtime/runner.py
@@ -95,7 +95,7 @@ def __init__(
         #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
         os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
-        os.environ['TF_GPU_THREAD_COUNT'] = '1' if not hvd_utils.is_using_hvd() else str(hvd.size())
+        os.environ['TF_GPU_THREAD_COUNT'] = '2'
 
         os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
 
@@ -246,11 +246,7 @@ def _get_session_config(mode, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
 
         if mode == 'train':
             config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
-
-            if hvd_utils.is_using_hvd():
-                config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // hvd.size()) - 2)
-            else:
-                config.inter_op_parallelism_threads = 4
+            config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // max(hvd.size(), 8) - 2))
 
         return config
 
@@ -407,15 +403,16 @@ def train(
 
             if is_benchmark:
                 self.training_logging_hook = hooks.BenchmarkLoggingHook(
-                    global_batch_size=global_batch_size, warmup_steps=warmup_steps
+                    global_batch_size=global_batch_size, warmup_steps=warmup_steps, logging_steps=log_every_n_steps
                 )
             else:
                 self.training_logging_hook = hooks.TrainingLoggingHook(
                     global_batch_size=global_batch_size,
                     num_steps=num_steps,
                     num_samples=num_samples,
                     num_epochs=num_epochs,
-                    steps_per_epoch=steps_per_epoch
+                    steps_per_epoch=steps_per_epoch,
+                    logging_steps=log_every_n_steps
                 )
             training_hooks.append(self.training_logging_hook)
 
@@ -446,10 +443,10 @@ def train(
             'symmetric': symmetric,
             'quant_delay': quant_delay
         }
-        
+
         if finetune_checkpoint:
-           estimator_params['finetune_checkpoint']=finetune_checkpoint
-        
+            estimator_params['finetune_checkpoint'] = finetune_checkpoint
+
         image_classifier = self._get_estimator(
             mode='train',
             run_params=estimator_params,
@@ -589,7 +586,9 @@ def evaluate(
         eval_hooks = []
 
         if hvd.rank() == 0:
-            self.eval_logging_hook = hooks.BenchmarkLoggingHook(global_batch_size=batch_size, warmup_steps=warmup_steps)
+            self.eval_logging_hook = hooks.BenchmarkLoggingHook(
+                global_batch_size=batch_size, warmup_steps=warmup_steps, logging_steps=log_every_n_steps
+            )
             eval_hooks.append(self.eval_logging_hook)
 
             print('Starting Model Evaluation...')
diff --git a/TensorFlow/Classification/ConvNets/se-resnext101-32x4d/README.md b/TensorFlow/Classification/ConvNets/se-resnext101-32x4d/README.md
@@ -204,7 +204,16 @@ cd DeepLearningExamples/TensorFlow/Classification/ConvNets
 2. Download and preprocess the dataset.
 The SE-ResNext101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
 
-To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
+* [Download the images](http://image-net.org/download-images)
+* Extract the training and validation data:
+```bash
+mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
+tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
+find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
+cd ..
+mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
+```
+* Preprocess the dataset into TFRecord format using the [build_imagenet_data.py script](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/build_imagenet_data.py). Additional metadata from the [authors' repository](https://github.com/tensorflow/models/tree/archive/research/inception/inception/data) might be required.
 
 3. Build the SE-ResNext101-32x4d TensorFlow NGC container.
 ```bash
@@ -415,7 +424,7 @@ To benchmark the training performance on a specific batch size, run:
 Each of these scripts runs 200 warm-up iterations and measures the first epoch.
 
 To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
-with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
+with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
 Suggested batch sizes for training are 96 for mixed precision training and 64 for single precision training per single V100 16 GB.
 
 
@@ -433,7 +442,7 @@ To benchmark the inference performance on a specific batch size, run:
 
 By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
 To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
-If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
+If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
 
 The benchmark can be automated with the `inference_benchmark.sh` script provided in `se-resnext101-32x4d`, by simply running:
 `bash ./se-resnext101-32x4d/inference_benchmark.sh <data dir> <data idx dir>`
@@ -482,8 +491,8 @@ on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per secon
 
 | GPUs | Batch Size / GPU | Throughput - TF32 + XLA | Throughput - mixed precision + XLA | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 + XLA | Weak scaling - mixed precision + XLA |
 |----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
-| 1  | 128 (TF) / 256 (AMP) | 313 img/s  | 895 img/s    | 2.86x           | 1.00x     | 1.00x             |
-| 8  | 128 (TF) / 256 (AMP) | 2400 img/s | 6930 img/s   | 2.88x           | 7.66x     | 7.74x             |
+| 1  | 128 (TF) / 256 (AMP) | 342 img/s  | 975 img/s    | 2.86x           | 1.00x     | 1.00x             |
+| 8  | 128 (TF) / 256 (AMP) | 2610 img/s | 7230 img/s   | 2.77x           | 7.63x     | 7.41x             |
 
 ##### Training performance: NVIDIA DGX-1 (8x V100 16G)
 Our results were obtained by running the `se-resnext101-32x4d/training/training_perf.sh` benchmark script in the 
diff --git a/TensorFlow/Classification/ConvNets/utils/bind_dgx_a100.sh b/TensorFlow/Classification/ConvNets/utils/bind_dgx_a100.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+if [[ -v SLURM_LOCALID ]]; then
+    echo "Bind using slurm localid"
+    LOCAL_ID=$SLURM_LOCALID
+elif [[ -v OMPI_COMM_WORLD_LOCAL_RANK ]]; then
+    echo "Bind using OpenMPI env"
+    LOCAL_ID=$OMPI_COMM_WORLD_LOCAL_RANK
+else
+    echo "Bind to first node"
+    LOCAL_ID=0
+fi
+
+case $LOCAL_ID in
+    0|1) exec numactl --cpunodebind=3 --membind=3 $@;;
+    2|3) exec numactl --cpunodebind=1 --membind=1 $@;;
+    4|5) exec numactl --cpunodebind=7 --membind=7 $@;;
+    6|7) exec numactl --cpunodebind=5 --membind=5 $@;;
+    *) echo "unknown binding"; exec $@;;
+esac
diff --git a/TensorFlow/Classification/ConvNets/utils/cmdline_helper.py b/TensorFlow/Classification/ConvNets/utils/cmdline_helper.py
@@ -173,6 +173,51 @@ def parse_cmdline(available_arch):
         help="Quantize weights and activations during training using symmetric quantization."
     )
 
+    p.add_argument(
+        '--finetune_checkpoint',
+        required=False,
+        default=None,
+        type=str,
+        help="Path to pre-trained checkpoint which will be used for fine-tuning"
+    )
+    
+    _add_bool_argument(
+        parser=p, name="use_final_conv", default=False, required=False, help="Use cosine learning rate schedule."
+    )
+
+    p.add_argument(
+        '--quant_delay',
+        type=int,
+        default=0,
+        required=False,
+        help="Number of steps to be run before quantization starts to happen"
+    )
+
+    _add_bool_argument(
+        parser=p,
+        name="quantize",
+        default=False,
+        required=False,
+        help="Quantize weights and activations during training. (Defaults to Assymmetric quantization)"
+    )
+
+    _add_bool_argument(
+        parser=p,
+        name="use_qdq",
+        default=False,
+        required=False,
+        help="Use QDQV3 op instead of FakeQuantWithMinMaxVars op for quantization. QDQv3 does only scaling"
+    )
+
+    _add_bool_argument(
+        parser=p,
+        name="symmetric",
+        default=False,
+        required=False,
+        help="Quantize weights and activations during training using symmetric quantization."
+    )
+
+
     p.add_argument(
         '--log_filename',
         type=str,
@@ -183,7 +228,7 @@ def parse_cmdline(available_arch):
 
     p.add_argument(
         '--display_every',
-        default=10,
+        default=1,
         type=int,
         required=False,
         help="""How often (in batches) to print out running information."""
diff --git a/TensorFlow/Classification/ConvNets/utils/hooks/benchmark_hooks.py b/TensorFlow/Classification/ConvNets/utils/hooks/benchmark_hooks.py
diff --git a/TensorFlow/Classification/ConvNets/utils/hooks/training_hooks.py b/TensorFlow/Classification/ConvNets/utils/hooks/training_hooks.py

Original file line number	Diff line number	Diff line change
`@@ -99,7 +99,7 @@`
`99`	`99`	`symmetric=FLAGS.symmetric,`
`100`	`100`	`quant_delay = FLAGS.quant_delay,`
`101`	`101`	`use_qdq = FLAGS.use_qdq,`
`102`		`- finetune_checkpoint = FLAGS.finetune_checkpoint,`
	`102`	`+ finetune_checkpoint=FLAGS.finetune_checkpoint,`
`103`	`103`	`)`
`104`	`104`
`105`	`105`	`if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:`