Skip to content

Commit 308925e

Browse files
Merge pull request NVIDIA#737 from NVIDIA/gh/release
[ConvNets/TF] Performance fix
2 parents 6f20c08 + b2e7f4a commit 308925e

File tree

10 files changed

+141
-41
lines changed

10 files changed

+141
-41
lines changed

TensorFlow/Classification/ConvNets/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@
9999
symmetric=FLAGS.symmetric,
100100
quant_delay = FLAGS.quant_delay,
101101
use_qdq = FLAGS.use_qdq,
102-
finetune_checkpoint = FLAGS.finetune_checkpoint,
102+
finetune_checkpoint=FLAGS.finetune_checkpoint,
103103
)
104104

105105
if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:

TensorFlow/Classification/ConvNets/model/resnet.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,9 @@ def __call__(self, features, labels, mode, params):
188188
use_final_conv=params['use_final_conv']
189189
)
190190

191+
if mode!=tf.estimator.ModeKeys.PREDICT:
192+
logits = tf.squeeze(logits)
193+
191194
if mode!=tf.estimator.ModeKeys.PREDICT:
192195
logits = tf.squeeze(logits)
193196

@@ -201,7 +204,7 @@ def __call__(self, features, labels, mode, params):
201204
tf.identity(logits, name="logits_ref")
202205
tf.identity(probs, name="probs_ref")
203206
tf.identity(y_preds, name="y_preds_ref")
204-
207+
205208
if mode == tf.estimator.ModeKeys.TRAIN and params['quantize']:
206209
dllogger.log(data={"QUANTIZATION AWARE TRAINING ENABLED": True}, step=tuple())
207210
if params['symmetric']:
@@ -219,7 +222,7 @@ def __call__(self, features, labels, mode, params):
219222
train_var_dict[var.op.name] = var
220223
dllogger.log(data={"Restoring variables from checkpoint": params['finetune_checkpoint']}, step=tuple())
221224
tf.train.init_from_checkpoint(params['finetune_checkpoint'], train_var_dict)
222-
225+
223226
if mode == tf.estimator.ModeKeys.PREDICT:
224227

225228
predictions = {'classes': y_preds, 'probabilities': probs}
@@ -458,7 +461,7 @@ def build_model(self, inputs, training=True, reuse=False, use_final_conv=False):
458461

459462
if logits.dtype != tf.float32:
460463
logits = tf.cast(logits, tf.float32)
461-
464+
462465
axis = 3 if self.model_hparams.compute_format=="NHWC" and use_final_conv else 1
463466
probs = layers.softmax(logits, name="softmax", axis=axis)
464467

TensorFlow/Classification/ConvNets/resnet50v1.5/README.md

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ This repository provides a script and recipe to train the ResNet-50 v1.5 model t
2020
* [Parameters](#parameters)
2121
* [The `main.py` script](#the-mainpy-script)
2222
* [Quantization Aware training](#quantization-aware-training)
23+
* [Post process checkpoint](#post-process-checkpoint)
24+
* [Exporting Frozen graphs](#exporting-frozen-graphs)
2325
* [Inference process](#inference-process)
2426
* [Performance](#performance)
2527
* [Benchmarking](#benchmarking)
@@ -200,7 +202,16 @@ cd DeepLearningExamples/TensorFlow/Classification/ConvNets
200202
2. Download and preprocess the dataset.
201203
The ResNet50 v1.5 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
202204

203-
To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
205+
* [Download the images](http://image-net.org/download-images)
206+
* Extract the training and validation data:
207+
```bash
208+
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
209+
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
210+
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
211+
cd ..
212+
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
213+
```
214+
* Preprocess the dataset to TFRecord form using the [script](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/build_imagenet_data.py). Additional metadata from the [authors' repository](https://github.com/tensorflow/models/tree/archive/research/inception/inception/data) might be required.
204215

205216
3. Build the ResNet-50 v1.5 TensorFlow NGC container.
206217
```bash
@@ -400,7 +411,7 @@ operations for `tf.contrib.quantize.experimental_create_training_graph` has been
400411
* `--output` : Name of the new checkpoint file which has the FC layer weights reshaped into 1x1 conv layer weights.
401412
* `--dense_layer` : Name of the FC layer
402413

403-
### Exporting Frozen graphs
414+
#### Exporting Frozen graphs
404415
To export frozen graphs (which can be used for inference with <a href="https://developer.nvidia.com/tensorrt">TensorRT</a>), use:
405416

406417
`python export_frozen_graph.py --checkpoint <path_to_checkpoint> --quantize --use_final_conv --use_qdq --symmetric --input_format NCHW --compute_format NCHW --output_file=<output_file_name>`
@@ -452,7 +463,7 @@ To benchmark the training performance on a specific batch size, run:
452463
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
453464

454465
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
455-
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
466+
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
456467
Suggested batch sizes for training are 256 for mixed precision training and 128 for single precision training per single V100 16 GB.
457468

458469
#### Inference performance benchmark
@@ -468,8 +479,8 @@ To benchmark the inference performance on a specific batch size, run:
468479
`python ./main.py --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
469480

470481
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
471-
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
472-
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
482+
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
483+
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
473484

474485
The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnet50v1.5`, by simply running:
475486
`bash ./resnet50v1.5/inference_benchmark.sh <data dir> <data idx dir>`
@@ -518,8 +529,8 @@ on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per secon
518529

519530
| GPUs | Batch Size / GPU | Throughput - TF32 + XLA | Throughput - mixed precision + XLA | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 + XLA | Weak scaling - mixed precision + XLA |
520531
|----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
521-
| 1 | 256 | 808 img/s | 1770 img/s | 2.20x | 1.00x | 1.00x |
522-
| 8 | 256 | 6300 img/s | 16400 img/s | 2.60x | 7.79x | 9.26x |
532+
| 1 | 256 | 909 img/s | 2375 img/s | 2.60x | 1.00x | 1.00x |
533+
| 8 | 256 | 7000 img/s | 17400 img/s | 2.48x | 7.70x | 7.32x |
523534

524535
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
525536
Our results were obtained by running the `resnet50v1.5/training/training_perf.sh` benchmark script in the

TensorFlow/Classification/ConvNets/resnext101-32x4d/README.md

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,16 @@ cd DeepLearningExamples/TensorFlow/Classification/ConvNets
209209
2. Download and preprocess the dataset.
210210
The ResNext101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
211211

212-
To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
212+
* [Download the images](http://image-net.org/download-images)
213+
* Extract the training and validation data:
214+
```bash
215+
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
216+
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
217+
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
218+
cd ..
219+
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
220+
```
221+
* Preprocess the dataset to TFRecord form using the [script](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/build_imagenet_data.py). Additional metadata from the [authors' repository](https://github.com/tensorflow/models/tree/archive/research/inception/inception/data) might be required.
213222

214223
3. Build the ResNext101-32x4d TensorFlow NGC container.
215224
```bash
@@ -420,7 +429,7 @@ To benchmark the training performance on a specific batch size, run:
420429
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
421430

422431
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
423-
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
432+
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
424433
Suggested batch sizes for training are 128 for mixed precision training and 64 for single precision training per single V100 16 GB.
425434

426435

@@ -438,7 +447,7 @@ To benchmark the inference performance on a specific batch size, run:
438447

439448
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
440449
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
441-
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
450+
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
442451

443452
The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnext101-32x4d`, by simply running:
444453
`bash ./resnext101-32x4d/inference_benchmark.sh <data dir> <data idx dir>`
@@ -487,8 +496,8 @@ on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per secon
487496

488497
| GPUs | Batch Size / GPU | Throughput - TF32 + XLA | Throughput - mixed precision + XLA | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 + XLA| Weak scaling - mixed precision + XLA |
489498
|----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
490-
| 1 | 128 (TF) / 256 (AMP) | 340 img/s | 905 img/s | 2.66x | 1.00x | 1.00x |
491-
| 8 | 128 (TF) / 256 (AMP) | 2630 img/s | 8000 img/s | 3.05x | 7.73x | 8.84x |
499+
| 1 | 128 (TF) / 256 (AMP) | 371 img/s | 1132 img/s | 3.05x | 1.00x | 1.00x |
500+
| 8 | 128 (TF) / 256 (AMP) | 2854 img/s | 8500 img/s | 2.98x | 7.69x | 7.51x |
492501

493502

494503
##### Training performance: NVIDIA DGX-1 (8x V100 16G)

TensorFlow/Classification/ConvNets/runtime/runner.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def __init__(
9595
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
9696

9797
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
98-
os.environ['TF_GPU_THREAD_COUNT'] = '1' if not hvd_utils.is_using_hvd() else str(hvd.size())
98+
os.environ['TF_GPU_THREAD_COUNT'] = '2'
9999

100100
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
101101

@@ -246,11 +246,7 @@ def _get_session_config(mode, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
246246

247247
if mode == 'train':
248248
config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
249-
250-
if hvd_utils.is_using_hvd():
251-
config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // hvd.size()) - 2)
252-
else:
253-
config.inter_op_parallelism_threads = 4
249+
config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // max(hvd.size(), 8) - 2))
254250

255251
return config
256252

@@ -407,15 +403,16 @@ def train(
407403

408404
if is_benchmark:
409405
self.training_logging_hook = hooks.BenchmarkLoggingHook(
410-
global_batch_size=global_batch_size, warmup_steps=warmup_steps
406+
global_batch_size=global_batch_size, warmup_steps=warmup_steps, logging_steps=log_every_n_steps
411407
)
412408
else:
413409
self.training_logging_hook = hooks.TrainingLoggingHook(
414410
global_batch_size=global_batch_size,
415411
num_steps=num_steps,
416412
num_samples=num_samples,
417413
num_epochs=num_epochs,
418-
steps_per_epoch=steps_per_epoch
414+
steps_per_epoch=steps_per_epoch,
415+
logging_steps=log_every_n_steps
419416
)
420417
training_hooks.append(self.training_logging_hook)
421418

@@ -446,10 +443,10 @@ def train(
446443
'symmetric': symmetric,
447444
'quant_delay': quant_delay
448445
}
449-
446+
450447
if finetune_checkpoint:
451-
estimator_params['finetune_checkpoint']=finetune_checkpoint
452-
448+
estimator_params['finetune_checkpoint'] = finetune_checkpoint
449+
453450
image_classifier = self._get_estimator(
454451
mode='train',
455452
run_params=estimator_params,
@@ -589,7 +586,9 @@ def evaluate(
589586
eval_hooks = []
590587

591588
if hvd.rank() == 0:
592-
self.eval_logging_hook = hooks.BenchmarkLoggingHook(global_batch_size=batch_size, warmup_steps=warmup_steps)
589+
self.eval_logging_hook = hooks.BenchmarkLoggingHook(
590+
global_batch_size=batch_size, warmup_steps=warmup_steps, logging_steps=log_every_n_steps
591+
)
593592
eval_hooks.append(self.eval_logging_hook)
594593

595594
print('Starting Model Evaluation...')

TensorFlow/Classification/ConvNets/se-resnext101-32x4d/README.md

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,16 @@ cd DeepLearningExamples/TensorFlow/Classification/ConvNets
204204
2. Download and preprocess the dataset.
205205
The SE-ResNext101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
206206

207-
To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
207+
* [Download the images](http://image-net.org/download-images)
208+
* Extract the training and validation data:
209+
```bash
210+
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
211+
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
212+
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
213+
cd ..
214+
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
215+
```
216+
* Preprocess the dataset to TFRecord form using the [script](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/build_imagenet_data.py). Additional metadata from the [authors' repository](https://github.com/tensorflow/models/tree/archive/research/inception/inception/data) might be required.
208217

209218
3. Build the SE-ResNext101-32x4d TensorFlow NGC container.
210219
```bash
@@ -415,7 +424,7 @@ To benchmark the training performance on a specific batch size, run:
415424
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
416425

417426
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
418-
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
427+
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
419428
Suggested batch sizes for training are 96 for mixed precision training and 64 for single precision training per single V100 16 GB.
420429

421430

@@ -433,7 +442,7 @@ To benchmark the inference performance on a specific batch size, run:
433442

434443
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
435444
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
436-
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
445+
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
437446

438447
The benchmark can be automated with the `inference_benchmark.sh` script provided in `se-resnext101-32x4d`, by simply running:
439448
`bash ./se-resnext101-32x4d/inference_benchmark.sh <data dir> <data idx dir>`
@@ -482,8 +491,8 @@ on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per secon
482491

483492
| GPUs | Batch Size / GPU | Throughput - TF32 + XLA | Throughput - mixed precision + XLA | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 + XLA | Weak scaling - mixed precision + XLA |
484493
|----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
485-
| 1 | 128 (TF) / 256 (AMP) | 313 img/s | 895 img/s | 2.86x | 1.00x | 1.00x |
486-
| 8 | 128 (TF) / 256 (AMP) | 2400 img/s | 6930 img/s | 2.88x | 7.66x | 7.74x |
494+
| 1 | 128 (TF) / 256 (AMP) | 342 img/s | 975 img/s | 2.86x | 1.00x | 1.00x |
495+
| 8 | 128 (TF) / 256 (AMP) | 2610 img/s | 7230 img/s | 2.77x | 7.63x | 7.41x |
487496

488497
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
489498
Our results were obtained by running the `se-resnext101-32x4d/training/training_perf.sh` benchmark script in the
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash
# Launcher wrapper: picks a NUMA node from the process's local rank and then
# replaces this shell with the wrapped command under `numactl`.
#
# Usage: <this script> <command> [args...]
#
# NOTE(review): the rank -> NUMA node table below is hard-coded; presumably it
# matches one specific 8-GPU machine topology -- confirm before reusing on
# other hardware.

# Resolve the local rank from whichever launcher set it (Slurm first, then
# OpenMPI); fall back to rank 0 when neither variable is set.
if [[ -v SLURM_LOCALID ]]; then
    echo "Bind using slurm localid"
    LOCAL_ID=$SLURM_LOCALID
elif [[ -v OMPI_COMM_WORLD_LOCAL_RANK ]]; then
    echo "Bind using OpenMPI env"
    LOCAL_ID=$OMPI_COMM_WORLD_LOCAL_RANK
else
    echo "Bind to first node"
    LOCAL_ID=0
fi

# Each pair of local ranks shares one NUMA node for both CPU and memory.
# "$@" is quoted so that arguments containing whitespace or glob characters
# are forwarded to the wrapped command unchanged (an unquoted $@ would be
# re-split and glob-expanded by the shell).
case "$LOCAL_ID" in
    0|1) exec numactl --cpunodebind=3 --membind=3 "$@";;
    2|3) exec numactl --cpunodebind=1 --membind=1 "$@";;
    4|5) exec numactl --cpunodebind=7 --membind=7 "$@";;
    6|7) exec numactl --cpunodebind=5 --membind=5 "$@";;
    # Unrecognized rank: run the command without any NUMA binding.
    *) echo "unknown binding"; exec "$@";;
esac

TensorFlow/Classification/ConvNets/utils/cmdline_helper.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,51 @@ def parse_cmdline(available_arch):
173173
help="Quantize weights and activations during training using symmetric quantization."
174174
)
175175

176+
p.add_argument(
177+
'--finetune_checkpoint',
178+
required=False,
179+
default=None,
180+
type=str,
181+
help="Path to pre-trained checkpoint which will be used for fine-tuning"
182+
)
183+
184+
_add_bool_argument(
185+
parser=p, name="use_final_conv", default=False, required=False, help="Use cosine learning rate schedule."
186+
)
187+
188+
p.add_argument(
189+
'--quant_delay',
190+
type=int,
191+
default=0,
192+
required=False,
193+
help="Number of steps to be run before quantization starts to happen"
194+
)
195+
196+
_add_bool_argument(
197+
parser=p,
198+
name="quantize",
199+
default=False,
200+
required=False,
201+
help="Quantize weights and activations during training. (Defaults to Assymmetric quantization)"
202+
)
203+
204+
_add_bool_argument(
205+
parser=p,
206+
name="use_qdq",
207+
default=False,
208+
required=False,
209+
help="Use QDQV3 op instead of FakeQuantWithMinMaxVars op for quantization. QDQv3 does only scaling"
210+
)
211+
212+
_add_bool_argument(
213+
parser=p,
214+
name="symmetric",
215+
default=False,
216+
required=False,
217+
help="Quantize weights and activations during training using symmetric quantization."
218+
)
219+
220+
176221
p.add_argument(
177222
'--log_filename',
178223
type=str,
@@ -183,7 +228,7 @@ def parse_cmdline(available_arch):
183228

184229
p.add_argument(
185230
'--display_every',
186-
default=10,
231+
default=1,
187232
type=int,
188233
required=False,
189234
help="""How often (in batches) to print out running information."""

0 commit comments

Comments
 (0)