Providing callback for cross validation
eldakms committed Feb 28, 2017
1 parent 5868673 commit 2a1cd77
Showing 14 changed files with 831 additions and 480 deletions.
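
The same pattern repeats in every file below: the keyword-heavy cntk.training_session(...) constructor is replaced by a leaner call that groups checkpointing and cross-validation options into dedicated CheckpointConfig and CrossValidationConfig objects, and progress writers move from the session onto the Trainer itself. A minimal sketch of the new shape, assembled from the call sites in this commit; trainer, train_source, test_source, input_map, minibatch_size, epoch_size and checkpoint_path stand in for the per-example values:

from cntk.training_session import training_session, CheckpointConfig, CrossValidationConfig

training_session(
    trainer=trainer,                    # progress writers are attached to the Trainer now
    mb_source=train_source,             # training minibatch source
    mb_size=minibatch_size,             # a plain int; the VGG16 hunk passes a schedule instead
    var_to_stream=input_map,            # maps model inputs to reader streams
    progress_frequency=epoch_size,      # report once per epoch
    checkpoint_config=CheckpointConfig(frequency=epoch_size,
                                       filename=checkpoint_path,
                                       restore=True),
    cv_config=CrossValidationConfig(source=test_source, mb_size=minibatch_size)
).train()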
@@ -13,6 +13,7 @@
 import _cntk_py
 
 from cntk.utils import *
+from cntk.training_session import *
 from cntk.ops import *
 from cntk.distributed import data_parallel_distributed_learner, Communicator
 from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
@@ -140,7 +141,7 @@ def create_alexnet():
     }
 
 # Create trainer
-def create_trainer(network, epoch_size, num_quantization_bits):
+def create_trainer(network, epoch_size, num_quantization_bits, printer):
     # Set learning parameters
     lr_per_mb = [0.01]*25 + [0.001]*25 + [0.0001]*25 + [0.00001]*25 + [0.000001]
     lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
@@ -156,34 +157,26 @@ def create_trainer(network, epoch_size, num_quantization_bits):
                                                                          distributed_after=0)
 
     # Create trainer
-    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner)
+    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, printer)
 
 # Train and test
-def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):
+def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore):
 
     # define mapping from input streams to network inputs
     input_map = {
         network['feature']: train_source.streams.features,
         network['label']: train_source.streams.labels
     }
 
-    training_session = cntk.training_session(
-        training_minibatch_source = train_source,
-        trainer = trainer,
-        model_inputs_to_mb_source_mapping = input_map,
-        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
-        progress_printer = progress_printer,
-        # checkpoint_frequency = epoch_size,
-        checkpoint_filename = os.path.join(model_path, model_name),
-        # save_all_checkpoints = True,
-        progress_frequency = epoch_size,
-        cv_source = test_source,
-        cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
-        # cv_frequency = epoch_size,
-        restore = restore)
-
-    # Train all minibatches
-    training_session.train()
+    # Train all minibatches
+    training_session(
+        trainer=trainer, mb_source = train_source,
+        var_to_stream = input_map,
+        mb_size = minibatch_size,
+        progress_frequency=epoch_size,
+        checkpoint_config = CheckpointConfig(filename=os.path.join(model_path, model_name), restore=restore),
+        cv_config = CrossValidationConfig(source=test_source, mb_size=minibatch_size)
+    ).train()
 
 # Train and evaluate the network.
 def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
@@ -199,10 +192,10 @@ def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
                                        num_epochs=max_epochs)
 
     network = create_alexnet()
-    trainer = create_trainer(network, epoch_size, num_quantization_bits)
+    trainer = create_trainer(network, epoch_size, num_quantization_bits, progress_printer)
     train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
     test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
-    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)
+    train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore)
 
 
 if __name__=='__main__':
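
The companion change, visible in the hunks above and repeated in every file of this commit: the progress printer is no longer a training_session argument but the fourth argument of the Trainer constructor. A sketch of the new wiring modeled on the AlexNet call sites, where network, parameter_learner and max_epochs are assumed to be set up as in the surrounding code; the ConvNet file below passes a list, [progress_printer, tensorboard_writer], in the same position, so the argument evidently accepts one writer or several:

import cntk
from cntk.utils import ProgressPrinter

progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
trainer = cntk.Trainer(network['output'], (network['ce'], network['pe']),
                       parameter_learner, progress_printer)  # writers attach here now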
@@ -12,6 +12,7 @@
 import cntk
 import _cntk_py
 import cntk.io.transforms as xforms
+from cntk.training_session import *
 
 # default Paths relative to current python file.
 abs_path = os.path.dirname(os.path.abspath(__file__))
@@ -90,7 +91,7 @@ def create_conv_network():
 
 
 # Create trainer
-def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up):
+def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers):
     # Set learning parameters
     lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625]
     lr_schedule = cntk.learning_rate_schedule(lr_per_sample, unit=cntk.learner.UnitType.sample, epoch_size=epoch_size)
@@ -112,37 +113,31 @@ def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up):
         parameter_learner = cntk.distributed.data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)
 
     # Create trainer
-    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner)
+    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_writers)
 
 # Train and test
-def train_and_test(network, trainer, train_source, test_source, progress_writers, minibatch_size, epoch_size, restore, profiling=False):
+def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore, profiling=False):
 
     # define mapping from input streams to network inputs
     input_map = {
         network['feature']: train_source.streams.features,
         network['label']: train_source.streams.labels
     }
 
-    training_session = cntk.training_session(
-        training_minibatch_source = train_source,
-        trainer = trainer,
-        model_inputs_to_mb_source_mapping = input_map,
-        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
-        progress_printer = progress_writers,
-        checkpoint_frequency = epoch_size,
-        checkpoint_filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
-        # save_all_checkpoints = False,
-        progress_frequency=epoch_size,
-        cv_source = test_source,
-        cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
-        # cv_frequency = epoch_size,
-        restore=restore)
-
     # Train all minibatches
     if profiling:
         cntk.start_profiler(sync_gpu=True)
 
-    training_session.train()
+    training_session(
+        trainer=trainer, mb_source = train_source,
+        var_to_stream = input_map,
+        mb_size = minibatch_size,
+        progress_frequency=epoch_size,
+        checkpoint_config = CheckpointConfig(frequency = epoch_size,
+                                             filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
+                                             restore = restore),
+        cv_config = CrossValidationConfig(source = test_source, mb_size=minibatch_size)
+    ).train()
 
     if profiling:
         cntk.stop_profiler()
@@ -169,10 +164,10 @@ def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64,
                                                         rank=cntk.distributed.Communicator.rank(),
                                                         model=network['output'])
 
-    trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up)
+    trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, [progress_printer, tensorboard_writer])
     train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
     test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
-    train_and_test(network, trainer, train_source, test_source, [progress_printer, tensorboard_writer], minibatch_size,
+    train_and_test(network, trainer, train_source, test_source, minibatch_size,
                    epoch_size, restore, profiling)
26 changes: 13 additions & 13 deletions Examples/Image/Classification/MLP/Python/SimpleMNIST.py
@@ -7,12 +7,13 @@
 import numpy as np
 import sys
 import os
-from cntk import Trainer, training_session, minibatch_size_schedule
+from cntk import Trainer, minibatch_size_schedule
 from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
 from cntk.device import cpu, set_default_device
 from cntk.learner import sgd, learning_rate_schedule, UnitType
 from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, relu, element_times, constant
 from cntk.utils import ProgressPrinter
+from cntk.training_session import *
 
 abs_path = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "common"))
@@ -65,8 +66,6 @@ def simple_mnist():
     }
 
     lr_per_minibatch=learning_rate_schedule(0.2, UnitType.minibatch)
-    # Instantiate the trainer object to drive the model training
-    trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch))
 
     # Get minibatches of images to train with and perform model training
     minibatch_size = 64
@@ -79,16 +78,17 @@ def simple_mnist():
         tag='Training',
         num_epochs=num_sweeps_to_train_with)
 
-    session = training_session(
-        training_minibatch_source = reader_train,
-        trainer = trainer,
-        mb_size_schedule = minibatch_size_schedule(minibatch_size),
-        progress_printer = progress_printer,
-        model_inputs_to_mb_source_mapping = input_map,
-        progress_frequency = num_samples_per_sweep,
-        max_training_samples = num_samples_per_sweep * num_sweeps_to_train_with)
-
-    session.train()
+    # Instantiate the trainer object to drive the model training
+    trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch), progress_printer)
+
+    training_session(
+        trainer=trainer,
+        mb_source = reader_train,
+        mb_size = minibatch_size,
+        var_to_stream = input_map,
+        max_samples = num_samples_per_sweep * num_sweeps_to_train_with,
+        progress_frequency=num_samples_per_sweep
+    ).train()
 
     # Load test data
     path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
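
SimpleMNIST shows the minimal form of the new session: no checkpoint or cross-validation config at all, with max_samples bounding the run instead of the old max_training_samples. Note that the commit title promises a callback for cross validation, yet none of the hunks rendered on this page passes one; presumably CrossValidationConfig also accepts a callback in one of the 14 changed files. A hypothetical sketch, assuming a callback parameter that receives the evaluation index and the averaged error and returns whether training should continue:

def cv_callback(index, average_error, num_samples, num_minibatches):
    # hypothetical signature: log the cross-validation error and keep training
    print("CV [%d]: error %.3f over %d samples" % (index, average_error, num_samples))
    return True  # returning False would presumably stop training early

cv_config = CrossValidationConfig(source=test_source,    # source and mb_size as in the hunks above
                                  mb_size=minibatch_size,
                                  callback=cv_callback)  # 'callback' is assumed, not shown on this page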
@@ -18,6 +18,7 @@
 from _cntk_py import set_computation_network_trace_level
 from cntk.device import set_default_device, gpu
 from cntk.distributed import data_parallel_distributed_learner, block_momentum_distributed_learner, Communicator
+from cntk.training_session import *
 
 from resnet_models import *
@@ -66,7 +67,7 @@ def create_resnet_network(network_name):
 
 
 # Create trainer
-def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up):
+def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer):
     if network['name'] == 'resnet20':
         lr_per_mb = [1.0]*80+[0.1]*40+[0.01]
     elif network['name'] == 'resnet110':
@@ -94,34 +95,28 @@ def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up):
     else:
         learner = data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)
 
-    return Trainer(network['output'], (network['ce'], network['pe']), learner)
+    return Trainer(network['output'], (network['ce'], network['pe']), learner, progress_printer)
 
 # Train and test
-def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, profiling=False):
+def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, profiling=False):
 
     # define mapping from input streams to network inputs
     input_map = {
         network['feature']: train_source.streams.features,
         network['label']: train_source.streams.labels
     }
 
-    training_session = cntk.training_session(
-        training_minibatch_source = train_source,
-        trainer = trainer,
-        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
-        progress_printer = progress_printer,
-        model_inputs_to_mb_source_mapping = input_map,
-        checkpoint_frequency = epoch_size,
-        checkpoint_filename="ResNet_CIFAR10_DataAug",
-        progress_frequency=epoch_size,
-        cv_source=test_source,
-        cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
-        restore=False)
-
     if profiling:
         start_profiler(sync_gpu=True)
 
-    training_session.train()
+    training_session(
+        trainer=trainer, mb_source = train_source,
+        mb_size = minibatch_size,
+        var_to_stream = input_map,
+        checkpoint_config = CheckpointConfig(frequency=epoch_size, filename="ResNet_CIFAR10_DataAug", restore=False),
+        progress_frequency=epoch_size,
+        cv_config = CrossValidationConfig(source=test_source, mb_size=16)
+    ).train()
 
     if profiling:
         stop_profiler()
@@ -146,10 +141,10 @@ def resnet_cifar10(train_data, test_data, mean_data, network_name, epoch_size, num_quantization_bits,
                               num_epochs=max_epochs)
 
     network = create_resnet_network(network_name)
-    trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up)
+    trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer)
     train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
     test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
-    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, profiling)
+    train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, profiling)
 
 
 if __name__=='__main__':
@@ -18,6 +18,7 @@
 from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
 from cntk.layers import Placeholder, Block, Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options, Sequential, For
 from cntk.initializer import normal
+from cntk.training_session import *
 
 # default Paths relative to current python file.
 abs_path = os.path.dirname(os.path.abspath(__file__))
@@ -131,7 +132,7 @@ def create_vgg16():
     }
 
 # Create trainer
-def create_trainer(network, epoch_size, num_quantization_bits):
+def create_trainer(network, epoch_size, num_quantization_bits, progress_printer):
     # Set learning parameters
     lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
     lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
@@ -147,34 +148,28 @@ def create_trainer(network, epoch_size, num_quantization_bits):
                                                                          distributed_after=0)
 
     # Create trainer
-    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner)
+    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_printer)
 
 # Train and test
-def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):
+def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore):
 
     # define mapping from input streams to network inputs
     input_map = {
         network['feature']: train_source.streams.features,
         network['label']: train_source.streams.labels
     }
 
-    training_session = cntk.training_session(
-        training_minibatch_source = train_source,
-        trainer = trainer,
-        model_inputs_to_mb_source_mapping = input_map,
-        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
-        progress_printer = progress_printer,
-        # checkpoint_frequency = epoch_size,
-        checkpoint_filename = os.path.join(model_path, model_name),
-        # save_all_checkpoints = True,
-        progress_frequency = epoch_size,
-        cv_source = test_source,
-        cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
-        # cv_frequency = epoch_size,
-        restore = restore)
+    mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size)
 
     # Train all minibatches
-    training_session.train()
+    training_session(
+        trainer=trainer, mb_source = train_source,
+        var_to_stream = input_map,
+        mb_size_schedule = mb_size_schedule,
+        progress_frequency=epoch_size,
+        checkpoint_config = CheckpointConfig(filename = os.path.join(model_path, model_name), restore=restore),
+        cv_config = CrossValidationConfig(source=test_source, schedule=mb_size_schedule)
+    ).train()
 
 # Train and evaluate the network.
 def vgg16_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=128, epoch_size = 1281167, max_epochs=80,
@@ -190,10 +185,10 @@ def vgg16_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=128, epoch_size = 1281167, max_epochs=80,
                                        num_epochs=max_epochs)
 
     network = create_vgg16()
-    trainer = create_trainer(network, epoch_size, num_quantization_bits)
+    trainer = create_trainer(network, epoch_size, num_quantization_bits, progress_printer)
     train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
     test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
-    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)
+    train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore)
 
 
 if __name__=='__main__':
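
Of the call sites above, only VGG16 passes an explicit minibatch-size schedule (mb_size_schedule= on the session, schedule= on the CrossValidationConfig); the others pass a plain integer through mb_size=. Both spellings appear in this commit, so presumably either is accepted. A sketch of the schedule-based form, with trainer, train_source, test_source, input_map, epoch_size, model_path and model_name assumed to be built as in the VGG16 hunks:

import os
import cntk
from cntk.training_session import training_session, CheckpointConfig, CrossValidationConfig

mb_schedule = cntk.minibatch_size_schedule(128)  # constant 128-sample minibatches

training_session(
    trainer=trainer,
    mb_source=train_source,
    var_to_stream=input_map,
    mb_size_schedule=mb_schedule,                # a schedule instead of a plain int
    progress_frequency=epoch_size,
    checkpoint_config=CheckpointConfig(filename=os.path.join(model_path, model_name)),
    cv_config=CrossValidationConfig(source=test_source, schedule=mb_schedule)
).train()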