This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[MXNET-139] Tutorial for mixed precision training with float16 #10391

Merged: 49 commits, Jun 27, 2018
Changes from 1 commit

Commits (49)
2599e6e
dtype for data, working fp16
rahul003 Mar 27, 2018
f874635
test dtype fp16 gluon
rahul003 Mar 28, 2018
b5173e2
add gluon fine tuning code
rahul003 Mar 28, 2018
a80065e
data iter caltech
rahul003 Mar 28, 2018
2c3f344
caltech iter
rahul003 Mar 28, 2018
a5025f2
working finetuning for fp16, but is it using pretrained params
rahul003 Mar 28, 2018
e27c8b8
benchmark fp16
rahul003 Mar 28, 2018
d9056f0
add wip tutorials
rahul003 Mar 28, 2018
d1d58bb
Merge branch 'docs-fp16-new' of https://github.com/rahul003/mxnet int…
rahul003 Mar 28, 2018
e75cdcf
working notebook fp16
rahul003 Mar 29, 2018
910fa32
update master
rahul003 Apr 1, 2018
ae56ace
changes to symbolic examples
rahul003 Apr 1, 2018
fb63684
changes to symbolic examples
rahul003 Apr 1, 2018
9123231
Merge branch 'docs-fp16-new' of https://github.com/rahul003/mxnet int…
rahul003 Apr 1, 2018
09516aa
add fp16 notebook
rahul003 Apr 4, 2018
d4cab73
remove extra files
rahul003 Apr 4, 2018
ae12a4f
remove output of notebook
rahul003 Apr 4, 2018
811607b
update md file
rahul003 Apr 4, 2018
d2b3ff6
remove from faq
rahul003 Apr 4, 2018
17b0e49
dtype for data, working fp16
rahul003 Mar 27, 2018
f2778fd
test dtype fp16 gluon
rahul003 Mar 28, 2018
f6d5243
add gluon fine tuning code
rahul003 Mar 28, 2018
cb7180b
data iter caltech
rahul003 Mar 28, 2018
40893f2
caltech iter
rahul003 Mar 28, 2018
56d2d66
working finetuning for fp16, but is it using pretrained params
rahul003 Mar 28, 2018
af13bb2
benchmark fp16
rahul003 Mar 28, 2018
4b007de
add wip tutorials
rahul003 Mar 28, 2018
a1283e3
working notebook fp16
rahul003 Mar 29, 2018
dde212b
changes to symbolic examples
rahul003 Apr 1, 2018
6de678e
changes to symbolic examples
rahul003 Apr 1, 2018
632862e
add fp16 notebook
rahul003 Apr 4, 2018
a986efd
remove extra files
rahul003 Apr 4, 2018
99a5ab1
remove output of notebook
rahul003 Apr 4, 2018
60e6fe1
update md file
rahul003 Apr 4, 2018
bdaf3c0
remove from faq
rahul003 Apr 4, 2018
2cc3579
WIP address feedback
rahul003 May 25, 2018
0f854e8
gluon example
rahul003 Jun 6, 2018
ceae9af
add top5 back
Jun 8, 2018
1d985b3
clean up gluon example
rahul003 Jun 8, 2018
602e51b
address feedback
rahul003 Jun 8, 2018
fe7b48a
update tutorial
rahul003 Jun 8, 2018
653bb2c
address comments
rahul003 Jun 8, 2018
ea2c5b7
move tutorial to faq
rahul003 Jun 8, 2018
1cca0ce
Add training curves
rahul003 Jun 13, 2018
6d6c6bd
formatting
rahul003 Jun 13, 2018
a7852b4
update image
rahul003 Jun 13, 2018
458bccc
Merge branch 'master' into docs-fp16-new
rahul003 Jun 18, 2018
eb37906
trigger ci
rahul003 Jun 19, 2018
923ed74
Merge branch 'docs-fp16-new' of https://github.com/rahul003/mxnet int…
rahul003 Jun 19, 2018
gluon example
rahul003 committed Jun 6, 2018
commit 0f854e80a35ae81192763ead52181666856da660
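This commit reworks the Gluon image-classification example around float16. For orientation, here is a minimal sketch of the mixed precision pattern the example follows; the model, shapes, and hyperparameters are illustrative placeholders, not values taken from the diff. The key steps: cast the network and the input batch to float16, and pass multi_precision to SGD so updates are applied against float32 master weights.

```python
import mxnet as mx
from mxnet import autograd, gluon
from mxnet.gluon.model_zoo import vision

ctx = mx.gpu(0)                        # float16 compute needs a capable GPU
net = vision.resnet50_v1(classes=1000)
net.initialize(mx.init.Xavier(magnitude=2), ctx=ctx)
net.cast('float16')                    # cast all parameters to float16

# multi_precision keeps float32 master weights so tiny updates survive rounding
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.1, 'momentum': 0.9,
                         'multi_precision': True})
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

# one training step on a dummy float16 batch
data = mx.nd.random.uniform(shape=(32, 3, 224, 224), ctx=ctx).astype('float16')
label = mx.nd.zeros((32,), ctx=ctx).astype('float16')
with autograd.record():
    loss = loss_fn(net(data), label)   # forward and loss computed in float16
loss.backward()
trainer.step(data.shape[0])
```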
85 changes: 83 additions & 2 deletions example/gluon/data.py
@@ -71,7 +71,7 @@ def val_transform(image, label):
         return mx.nd.cast(image, dtype), label
     return train_transform, val_transform
 
-def get_imagenet_iterator(root, batch_size, num_workers, data_shape=224, dtype='float32'):
+def get_imagenet_dataloader_iterator(root, batch_size, num_workers, data_shape=224, dtype='float32'):
     """Dataset loader with preprocessing."""
     train_dir = os.path.join(root, 'train')
     train_transform, val_transform = get_imagenet_transforms(data_shape, dtype)
@@ -88,7 +88,88 @@ def get_imagenet_iterator(root, batch_size, num_workers, data_shape=224, dtype='float32'):
     val_data = DataLoader(val_dataset, batch_size, last_batch='keep', num_workers=num_workers)
     return DataLoaderIter(train_data, dtype), DataLoaderIter(val_data, dtype)
 
-def get_caltech101_iterator()
+def get_imagenet_iterator(kv, batch_size, opt, image_shape=(3, 224, 224)):
+    rank, nworker = kv.rank, kv.num_workers
+    rgb_mean = [float(i) for i in opt.rgb_mean.split(',')]
+    train = mx.io.ImageRecordIter(
+        path_imgrec = opt.data_train,
+        path_imgidx = opt.data_train_idx,
+        label_width = 1,
+        mean_r = rgb_mean[0],
+        mean_g = rgb_mean[1],
+        mean_b = rgb_mean[2],
+        data_name = 'data',
+        label_name = 'softmax_label',
+        data_shape = image_shape,
+        batch_size = batch_size,
+        rand_crop = opt.random_crop,
+        max_random_scale = opt.max_random_scale,
+        pad = opt.pad_size,
+        fill_value = 127,
+        min_random_scale = opt.min_random_scale,
+        max_aspect_ratio = opt.max_random_aspect_ratio,
+        random_h = opt.max_random_h,
+        random_s = opt.max_random_s,
+        random_l = opt.max_random_l,
+        max_rotate_angle = opt.max_random_rotate_angle,
+        max_shear_ratio = opt.max_random_shear_ratio,
+        rand_mirror = opt.random_mirror,
+        preprocess_threads = opt.data_nthreads,
+        shuffle = True,
+        num_parts = nworker,
+        part_index = rank)
+    val = mx.io.ImageRecordIter(
+        path_imgrec = opt.data_val,
+        path_imgidx = opt.data_val_idx,
+        label_width = 1,
+        mean_r = rgb_mean[0],
+        mean_g = rgb_mean[1],
+        mean_b = rgb_mean[2],
+        data_name = 'data',
+        label_name = 'softmax_label',
+        batch_size = batch_size,
+        data_shape = image_shape,
+        preprocess_threads = opt.data_nthreads,
+        rand_crop = False,
+        rand_mirror = False,
+        num_parts = nworker,
+        part_index = rank)
+    return (train, val)
+
+def get_caltech101_data():
+    url = "https://s3.us-east-2.amazonaws.com/mxnet-public/101_ObjectCategories.tar.gz"
+    dataset_name = "101_ObjectCategories"
+    data_folder = "data"  # all the directory checks below use this folder
+    if not os.path.isdir(data_folder):
+        os.makedirs(data_folder)
+    tar_path = mx.gluon.utils.download(url, path=data_folder)
+    if (not os.path.isdir(os.path.join(data_folder, "101_ObjectCategories")) or
+            not os.path.isdir(os.path.join(data_folder, "101_ObjectCategories_test"))):
+        tar = tarfile.open(tar_path, "r:gz")
+        tar.extractall(data_folder)
+        tar.close()
+        print('Data extracted')
+    training_path = os.path.join(data_folder, dataset_name)
+    testing_path = os.path.join(data_folder, "{}_test".format(dataset_name))
+    return training_path, testing_path
+
+def get_caltech101_iterator(batch_size, num_workers, dtype):
+    def transform(image, label):
+        # resize the shorter edge to 224; the longer edge will be greater or equal to 224
+        resized = mx.image.resize_short(image, 224)
+        # center-crop an area of size (224, 224)
+        cropped, crop_info = mx.image.center_crop(resized, 224)
+        # transpose the channels to (3, 224, 224) and cast to the requested dtype
+        transposed = nd.transpose(cropped, (2, 0, 1))
+        image = mx.nd.cast(transposed, dtype)
+        return image, label
+
+    training_path, testing_path = get_caltech101_data()
+    dataset_train = ImageFolderDataset(root=training_path, transform=transform)
+    dataset_test = ImageFolderDataset(root=testing_path, transform=transform)
+
+    train_data = gluon.data.DataLoader(dataset_train, batch_size, shuffle=True, num_workers=num_workers)
+    test_data = gluon.data.DataLoader(dataset_test, batch_size, shuffle=False, num_workers=num_workers)
+    return DataLoaderIter(train_data), DataLoaderIter(test_data)
 
 class DummyIter(mx.io.DataIter):
     def __init__(self, batch_size, data_shape, batches = 100):
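For context, a small usage sketch of the Caltech-101 helpers added above. The batch size, worker count, and dtype here are illustrative, and the dataset is downloaded on first use:

```python
# illustrative values, not taken from the diff
batch_size, num_workers, dtype = 64, 4, 'float16'
train_iter, test_iter = get_caltech101_iterator(batch_size, num_workers, dtype)

batch = next(iter(train_iter))   # DataLoaderIter yields mx.io.DataBatch objects
print(batch.data[0].shape)       # (64, 3, 224, 224), in float16
```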
122 changes: 90 additions & 32 deletions example/gluon/image_classification.py
@@ -46,14 +46,59 @@
 
 # CLI
 parser = argparse.ArgumentParser(description='Train a model for image classification.')
-parser.add_argument('--dataset', type=str, default='cifar10',
+
+data = parser.add_argument_group('Data', 'the input for training')
+data.add_argument('--dataset', type=str, default='cifar10',
                     help='dataset to use. options are mnist, cifar10, caltech101, imagenet and dummy.')
-parser.add_argument('--data-dir', type=str, default='',
-                    help='training directory of imagenet images, contains train/val subdirs.')
+data.add_argument('--use-dataloader', action='store_true',
+                  help='Use the more flexible Gluon DataLoader for the imagenet dataset. '
+                       'Requires --data-dir to be passed in that case. '
+                       'If this is not passed, the faster RecordIO pipeline is tried instead, '
+                       'which uses the arguments --data-train, --data-val, --data-train-idx and --data-val-idx.')
+data.add_argument('--data-dir', type=str, default='',
+                  help='training directory of imagenet images, contains train/val subdirs.')
+data.add_argument('--num-worker', '-j', dest='num_workers', default=4, type=int,
+                  help='number of workers for dataloader')
+data.add_argument('--data-train', type=str, default='',
+                  help='Path to training rec file for imagenet')
+data.add_argument('--data-train-idx', type=str, default='',
+                  help='Path to idx file for training rec file for imagenet')
+data.add_argument('--data-val', type=str, default='',
+                  help='Path to validation set rec file for imagenet')
+data.add_argument('--data-val-idx', type=str, default='',
+                  help='Path to idx file for validation rec file for imagenet')
+data.add_argument('--rgb-mean', type=str, default='123.68,116.779,103.939',
+                  help='a tuple of size 3 for the mean rgb')
+data.add_argument('--data-nthreads', type=int, default=4,
+                  help='number of threads for data decoding')
+data.add_argument('--pad-size', type=int, default=0,
+                  help='padding of the input image')
+
+aug = parser.add_argument_group(
+    'Image augmentations', 'implemented in src/io/image_aug_default.cc')
+aug.add_argument('--random-crop', type=int, default=1,
+                 help='whether to randomly crop the image')
+aug.add_argument('--random-mirror', type=int, default=1,
+                 help='whether to randomly flip the image horizontally')
+aug.add_argument('--max-random-h', type=int, default=0,
+                 help='max change of hue, whose range is [0, 180]')
+aug.add_argument('--max-random-s', type=int, default=0,
+                 help='max change of saturation, whose range is [0, 255]')
+aug.add_argument('--max-random-l', type=int, default=0,
+                 help='max change of intensity, whose range is [0, 255]')
+aug.add_argument('--max-random-aspect-ratio', type=float, default=0,
+                 help='max change of aspect ratio, whose range is [0, 1]')
+aug.add_argument('--max-random-rotate-angle', type=int, default=0,
+                 help='max angle to rotate, whose range is [0, 360]')
+aug.add_argument('--max-random-shear-ratio', type=float, default=0,
+                 help='max ratio to shear, whose range is [0, 1]')
+aug.add_argument('--max-random-scale', type=float, default=1,
+                 help='max ratio to scale')
+aug.add_argument('--min-random-scale', type=float, default=1,
+                 help='min ratio to scale, should be >= img_size/input_shape; otherwise use --pad-size')
 
 parser.add_argument('--batch-size', type=int, default=32,
                     help='training batch size per device (CPU/GPU).')
-parser.add_argument('--num-worker', '-j', dest='num_workers', default=4, type=int,
-                    help='number of workers of dataloader.')
 parser.add_argument('--gpus', type=str, default='',
                     help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.')
 parser.add_argument('--epochs', type=int, default=120,
@@ -90,6 +135,8 @@
                     help='data type, float32 or float16 if applicable')
 parser.add_argument('--save-frequency', default=10, type=int,
                     help='epoch frequency to save model, best model will always be saved')
+parser.add_argument('--top-k', default=0, type=int,
+                    help='specify if top-k accuracy is to be tracked')
 parser.add_argument('--kvstore', type=str, default='device',
                     help='kvstore to use for trainer/module.')
 parser.add_argument('--log-interval', type=int, default=50,
@@ -104,13 +151,16 @@
 logger.info('Starting new image-classification task: %s', opt)
 mx.random.seed(opt.seed)
 model_name = opt.model
-dataset_classes = {'mnist': 10, 'cifar10': 10, 'imagenet': 1000, 'dummy': 1000}
+dataset_classes = {'mnist': 10, 'cifar10': 10, 'caltech101': 101, 'imagenet': 1000, 'dummy': 1000}
 batch_size, dataset, classes = opt.batch_size, opt.dataset, dataset_classes[opt.dataset]
 context = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()]
 num_gpus = len(context)
 batch_size *= max(1, num_gpus)
 lr_steps = [int(x) for x in opt.lr_steps.split(',') if x.strip()]
-metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5)])
+metric = CompositeEvalMetric([Accuracy()])
+if opt.top_k:
+    metric.add(TopKAccuracy(opt.top_k))
+kv = mx.kv.create(opt.kvstore)
 
 def get_model(model, ctx, opt):
     """Model initialization."""
@@ -133,34 +183,39 @@ def get_model(model, ctx, opt):
 
 net = get_model(opt.model, context, opt)
 
-def get_data_iters(dataset, batch_size, num_workers=1, rank=0):
+def get_data_iters(dataset, batch_size, opt):
     """get dataset iterators"""
     if dataset == 'mnist':
         train_data, val_data = get_mnist_iterator(batch_size, (1, 28, 28),
-                                                  num_parts=num_workers, part_index=rank)
+                                                  num_parts=kv.num_workers, part_index=kv.rank)
     elif dataset == 'cifar10':
         train_data, val_data = get_cifar10_iterator(batch_size, (3, 32, 32),
-                                                    num_parts=num_workers, part_index=rank)
+                                                    num_parts=kv.num_workers, part_index=kv.rank)
     elif dataset == 'imagenet':
-        if not opt.data_dir:
-            raise ValueError('Dir containing raw images in train/val is required for imagenet, plz specify "--data-dir"')
-        if model_name == 'inceptionv3':
-            train_data, val_data = get_imagenet_iterator(opt.data_dir, batch_size, opt.num_workers, 299, opt.dtype)
+        shape_dim = 299 if model_name == 'inceptionv3' else 224
+        if opt.use_dataloader:
+            if not opt.data_dir:
+                raise ValueError('Dir containing raw images in train/val is required for imagenet. '
+                                 'Please specify "--data-dir"')
+
+            train_data, val_data = get_imagenet_dataloader_iterator(opt.data_dir, batch_size,
+                                                                    opt.num_workers, shape_dim, opt.dtype)
         else:
-            train_data, val_data = get_imagenet_iterator(opt.data_dir, batch_size, opt.num_workers, 224, opt.dtype)
+            train_data, val_data = get_imagenet_iterator(kv, batch_size, opt,
+                                                         image_shape=(3, shape_dim, shape_dim))
+    elif dataset == 'caltech101':
+        train_data, val_data = get_caltech101_iterator(batch_size, opt.num_workers, opt.dtype)
     elif dataset == 'dummy':
-        if model_name == 'inceptionv3':
-            train_data, val_data = dummy_iterator(batch_size, (3, 299, 299))
-        else:
-            train_data, val_data = dummy_iterator(batch_size, (3, 224, 224))
+        shape_dim = 299 if model_name == 'inceptionv3' else 224
+        train_data, val_data = dummy_iterator(batch_size, (3, shape_dim, shape_dim))
     return train_data, val_data
 
 def test(ctx, val_data):
     metric.reset()
     val_data.reset()
     for batch in val_data:
-        data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
-        label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
+        data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0)
+        label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0)
         outputs = []
         for x in data:
             outputs.append(net(x))
@@ -187,16 +242,17 @@ def save_checkpoint(epoch, top1, best_acc):
 def train(opt, ctx):
     if isinstance(ctx, mx.Context):
         ctx = [ctx]
-    kv = mx.kv.create(opt.kvstore)
 
-    train_data, val_data = get_data_iters(dataset, batch_size, kv.num_workers, kv.rank)
     net.collect_params().reset_ctx(ctx)
     trainer = gluon.Trainer(net.collect_params(), 'sgd',
-                            {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum,
-                             'multi_precision': True},
-                            kvstore = kv)
+                            optimizer_params={'learning_rate': opt.lr,
+                                              'wd': opt.wd,
+                                              'momentum': opt.momentum,
+                                              'multi_precision': True},
+                            kvstore=kv)
     loss = gluon.loss.SoftmaxCrossEntropyLoss()
 
+
     total_time = 0
     num_epochs = 0
     best_acc = [0]
@@ -256,17 +312,19 @@ def main():
         out = net(data)
         softmax = mx.sym.SoftmaxOutput(out, name='softmax')
         mod = mx.mod.Module(softmax, context=context)
-        kv = mx.kv.create(opt.kvstore)
-        train_data, val_data = get_data_iters(dataset, batch_size, kv.num_workers, kv.rank)
+        train_data, val_data = get_data_iters(dataset, batch_size, opt)
         mod.fit(train_data,
-                eval_data = val_data,
+                eval_data=val_data,
                 num_epoch=opt.epochs,
                 kvstore=kv,
                 batch_end_callback = mx.callback.Speedometer(batch_size, max(1, opt.log_interval)),
                 epoch_end_callback = mx.callback.do_checkpoint('image-classifier-%s'% opt.model),
-                optimizer = 'sgd',
-                optimizer_params = {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum, 'multi_precision': True},
-                initializer = mx.init.Xavier(magnitude=2))
+                optimizer='sgd',
+                optimizer_params={'learning_rate': opt.lr,
+                                  'wd': opt.wd,
+                                  'momentum': opt.momentum,
+                                  'multi_precision': True},
+                initializer=mx.init.Xavier(magnitude=2))
         mod.save_params('image-classifier-%s-%d-final.params'%(opt.model, opt.epochs))
     else:
         if opt.mode == 'hybrid':
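Both the Gluon Trainer and the Module fit call above pass multi_precision=True to SGD. Conceptually, the optimizer keeps a float32 master copy of every float16 weight, performs the update in float32, and writes a float16 copy back for the next forward pass. A simplified numpy sketch of one such step, not MXNet's actual kernel, with weight decay and gradient rescaling omitted:

```python
import numpy as np

def sgd_fp16_step(weight16, master32, grad16, mom32, lr=0.1, momentum=0.9):
    """One multi-precision SGD step: math in float32, storage in float16."""
    grad = grad16.astype(np.float32)            # upcast the gradient before updating
    mom32[:] = momentum * mom32 - lr * grad     # momentum state kept in float32
    master32 += mom32                           # master weights never leave float32
    weight16[:] = master32.astype(np.float16)   # fp16 copy used for compute

w16 = np.ones(4, dtype=np.float16)
w32 = w16.astype(np.float32)
mom = np.zeros_like(w32)
sgd_fp16_step(w16, w32, np.full(4, 1e-4, dtype=np.float16), mom)
```

Without the master copy, an update as small as 1e-5 applied to a weight of 1.0 rounds to nothing in float16, whose spacing near 1.0 is about 1e-3, which is why plain float16 SGD can stall.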