From f1d6fc9ed6c2c6fed1c8adc0095d4c40e31c6e50 Mon Sep 17 00:00:00 2001 From: Yuqing Tang Date: Mon, 13 Nov 2017 15:30:59 -0800 Subject: [PATCH] Updated examples and tests with new Learning APIs. --- .../LogisticRegression_FunctionalAPI.py | 2 +- .../1stSteps/LogisticRegression_GraphAPI.py | 2 +- Examples/1stSteps/MNIST_Complex_Training.py | 12 +++---- .../Python/ConvNetLRN_CIFAR10_DataAug.py | 6 ++-- .../ConvNet/Python/ConvNet_CIFAR10_DataAug.py | 10 +++--- .../ConvNet_CIFAR10_DataAug_Distributed.py | 6 ++-- .../ConvNet/Python/ConvNet_MNIST.py | 6 ++-- .../Python/BN_Inception_CIFAR10.py | 4 +-- .../BN_Inception_CIFAR10_Distributed.py | 4 +-- .../Python/BN_Inception_ImageNet.py | 4 +-- .../BN_Inception_ImageNet_Distributed.py | 4 +-- .../Python/InceptionV3_ImageNet.py | 2 +- .../InceptionV3_ImageNet_Distributed.py | 2 +- .../Classification/MLP/Python/SimpleMNIST.py | 4 +-- .../ResNet/Python/TrainResNet_CIFAR10.py | 7 ++-- .../Python/TrainResNet_CIFAR10_Distributed.py | 9 +++-- .../VGG/Python/VGG16_ImageNet_Distributed.py | 2 +- .../VGG/Python/VGG19_ImageNet_Distributed.py | 2 +- .../FastRCNN/BrainScript/A2_RunWithPyModel.py | 8 ++--- .../FastRCNN/BrainScript/PARAMETERS.py | 6 ++-- .../Detection/FastRCNN/FastRCNN_train.py | 8 ++--- .../Detection/FasterRCNN/FasterRCNN_train.py | 8 ++--- .../GettingStarted/07_Deconvolution_PY.py | 4 +-- .../TransferLearning/TransferLearning.py | 6 ++-- .../ATIS/Python/LanguageUnderstanding.py | 4 +-- .../DeepQNeuralNetwork.py | 4 +-- .../Python/SequenceClassification.py | 2 +- .../CMUDict/Python/Sequence2Sequence.py | 13 +++---- .../Python/Sequence2Sequence_Distributed.py | 6 ++-- .../Python/HTK_LSTM_Truncated_Distributed.py | 4 +-- Examples/Text/CharacterLM/char_rnn.py | 8 ++--- Examples/Text/LightRNN/LightRNN/train.py | 6 ++-- .../Text/WordLMWithSampledSoftmax/word_rnn.py | 8 ++--- .../GettingStarted/Python/Conv3D_UCF11.py | 6 ++-- .../Examples/htk_deserializer_test.py | 4 +-- .../multiple_distributed_learners_test.py | 8 ++--- .../CNTKv2Python/Examples/word_rnn_test.py | 2 +- .../train_models_for_evaluation.py | 4 +-- .../contrib/deeprl/agent/policy_gradient.py | 8 ++--- .../cntk/contrib/deeprl/agent/qlearning.py | 8 ++--- .../python/cntk/debugging/tests/debug_test.py | 4 +-- .../cntk/debugging/tests/userlearner_test.py | 4 +-- bindings/python/cntk/io/tests/io_tests.py | 6 ++-- bindings/python/cntk/learners/__init__.py | 2 +- .../tests/distributed_learner_test.py | 2 +- .../cntk/learners/tests/learner_test.py | 36 +++++++++---------- bindings/python/cntk/ops/functions.py | 2 +- bindings/python/cntk/ops/tests/sparse_test.py | 4 +-- .../ops/tests/userfunction_complex_test.py | 2 +- bindings/python/cntk/tests/function_test.py | 2 +- bindings/python/cntk/tests/persist_test.py | 2 +- .../cntk/train/tests/training_session_test.py | 4 +-- bindings/python/doc/simplenet.py | 4 +-- bindings/python/doc/simplernn.py | 4 +-- 54 files changed, 149 insertions(+), 152 deletions(-) diff --git a/Examples/1stSteps/LogisticRegression_FunctionalAPI.py b/Examples/1stSteps/LogisticRegression_FunctionalAPI.py index c5808b956c97..1f09c3b591d1 100644 --- a/Examples/1stSteps/LogisticRegression_FunctionalAPI.py +++ b/Examples/1stSteps/LogisticRegression_FunctionalAPI.py @@ -54,7 +54,7 @@ def criterion(data, label_one_hot): # Learner object. The learner implements the update algorithm, in this case plain SGD. 
learning_rate = 0.1
-learner = cntk.sgd(model.parameters, cntk.learning_rate_schedule(learning_rate, cntk.UnitType.minibatch))
+learner = cntk.sgd(model.parameters, cntk.learning_parameter_schedule(learning_rate))
# Trainer configuration parameters.
progress_writer = cntk.logging.ProgressPrinter(50) # helper for logging progress; log every 50 minibatches
diff --git a/Examples/1stSteps/LogisticRegression_GraphAPI.py b/Examples/1stSteps/LogisticRegression_GraphAPI.py
index 214e1edad54f..9cf7dfda7b12 100644
--- a/Examples/1stSteps/LogisticRegression_GraphAPI.py
+++ b/Examples/1stSteps/LogisticRegression_GraphAPI.py
@@ -52,7 +52,7 @@ def generate_synthetic_data(N):
# Learner object. The learner implements the update algorithm, in this case plain SGD.
learning_rate = 0.1
-learner = cntk.sgd(model.parameters, cntk.learning_rate_schedule(learning_rate, cntk.UnitType.minibatch))
+learner = cntk.sgd(model.parameters, cntk.learning_parameter_schedule(learning_rate))
# Trainer.
minibatch_size = 32
diff --git a/Examples/1stSteps/MNIST_Complex_Training.py b/Examples/1stSteps/MNIST_Complex_Training.py
index 9f4f2034e689..5559814685f0 100644
--- a/Examples/1stSteps/MNIST_Complex_Training.py
+++ b/Examples/1stSteps/MNIST_Complex_Training.py
@@ -81,14 +81,14 @@ def criterion(data, label_one_hot):
# Learner object. The learner implements the update algorithm, in this case momentum SGD.
# Because this script supports data-parallel training, the learning rate is specified
-# "per sample" (UnitType.sample), the value is already pre-divided by the minibatch size.
+# "per sample", the value is already pre-divided by the minibatch size.
# This allows data-parallel training to slice the data into subsets and also to increase
# the minibatch size where possible, while maintaining the same contribution per sample gradient.
epoch_size = len(X_train)
lr_per_sample = 0.001
-lr_schedule = C.learning_rate_schedule(lr_per_sample, C.learners.UnitType.sample)
-mm_time_constant = [0]*5 + [1024] # 5 epochs without momentum, then switch it on
-mm_schedule = C.learners.momentum_as_time_constant_schedule(mm_time_constant, epoch_size)
+lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample)
+mm_per_sample = [0]*5 + [0.9990239141819757] # 5 epochs without momentum, then switch it on
+mm_schedule = C.learners.momentum_schedule_per_sample(mm_per_sample, epoch_size=epoch_size)
# Instantiate the trainer object to drive the model training.
learner = C.learners.momentum_sgd(model.parameters, lr_schedule, mm_schedule)
@@ -114,7 +114,7 @@ def criterion(data, label_one_hot):
def adjust_lr_callback(index, average_error, cv_num_samples, cv_num_minibatches):
global prev_metric
if (prev_metric - average_error) / prev_metric < 0.05: # relative gain must reduce metric by at least 5% rel
- learner.reset_learning_rate(C.learning_rate_schedule(learner.learning_rate() / 2, C.learners.UnitType.sample))
+ learner.reset_learning_rate(C.learning_parameter_schedule_per_sample(learner.learning_rate() / 2))
if learner.learning_rate() < lr_per_sample / (2**7-0.1): # we are done after the 6-th LR cut
print("Learning rate {} too small. Training complete.".format(learner.learning_rate()))
return False # means we are done
@@ -137,7 +137,7 @@ def adjust_lr_callback(index, average_error, cv_num_samples, cv_num_minibatches)
# For distributed training, we must maximize the minibatch size, as to minimize
# communication cost and GPU underutilization. Hence, we use a "schedule"
# that increases the minibatch size after a few epochs.
By specifying the learning rate -# as UnitType.sample, the contribution per sample maintains the same scale without +# as per sample, the contribution per sample maintains the same scale without # having to fix up the learning rate. # For this MNIST model, larger minibatch sizes make it faster, because the # model is too small to utilize a full GPU. Hence data-parallel training cannot diff --git a/Examples/Image/Classification/ConvNet/Python/ConvNetLRN_CIFAR10_DataAug.py b/Examples/Image/Classification/ConvNet/Python/ConvNetLRN_CIFAR10_DataAug.py index aadb0c246ccf..c23c9dca20bc 100644 --- a/Examples/Image/Classification/ConvNet/Python/ConvNetLRN_CIFAR10_DataAug.py +++ b/Examples/Image/Classification/ConvNet/Python/ConvNetLRN_CIFAR10_DataAug.py @@ -100,9 +100,9 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_ # Set learning parameters lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625] - lr_schedule = C.learning_rate_schedule(lr_per_sample, unit=C.learners.UnitType.sample, epoch_size=epoch_size) - mm_time_constant = [0]*20 + [600]*20 + [1200] - mm_schedule = C.learners.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size) + lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=epoch_size) + mms = [0]*20 + [0.9983347214509387]*20 + [0.9991670137924583] + mm_schedule = C.learners.momentum_schedule_per_sample(mms, epoch_size=epoch_size) l2_reg_weight = 0.002 # trainer object diff --git a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py index e9515ed73d23..235608dfb34f 100644 --- a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py +++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py @@ -16,7 +16,7 @@ from cntk.layers.typing import * from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT from cntk import Trainer, use_default_device -from cntk.learners import momentum_sgd, learning_rate_schedule, UnitType, momentum_as_time_constant_schedule, learning_parameter_schedule +from cntk.learners import momentum_sgd, momentum_schedule, momentum_schedule_per_sample, learning_parameter_schedule, learning_parameter_schedule_per_sample from cntk import cross_entropy_with_softmax, classification_error, relu from cntk.ops import Function from cntk.debugging import set_computation_network_trace_level @@ -109,8 +109,8 @@ def train_model(reader, model, criterion, epoch_size=50000, max_epochs=80): # learning parameters learner = momentum_sgd(model.parameters, - lr = learning_parameter_schedule([0.0015625]*20+[0.00046875]*20+[0.00015625]*20+[0.000046875]*10+[0.000015625], minibatch_size=1, epoch_size=epoch_size), - momentum = momentum_as_time_constant_schedule([0]*20+[600]*20+[1200], epoch_size=epoch_size), + lr = learning_parameter_schedule_per_sample([0.0015625]*20+[0.00046875]*20+[0.00015625]*20+[0.000046875]*10+[0.000015625], epoch_size=epoch_size), + momentum = momentum_schedule_per_sample([0]*20+[0.9983347214509387]*20+[0.9991670137924583], epoch_size=epoch_size), l2_regularization_weight = 0.002) # trainer object @@ -147,8 +147,8 @@ def Evaluator(criterion): if metric: parameters |= set(metric.parameters) dummy_learner = momentum_sgd(tuple(parameters), - lr = learning_rate_schedule(1, UnitType.minibatch), - momentum = momentum_as_time_constant_schedule(0)) + lr = 
learning_parameter_schedule(1), + momentum = momentum_schedule(0)) return Trainer(None, (loss, metric), dummy_learner) def evaluate(reader, criterion, device=None, minibatch_size=16, max_samples=None): diff --git a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py index 9a01eebeeb1d..3779f3390bdc 100644 --- a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py +++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py @@ -86,9 +86,9 @@ def create_conv_network(): def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers): # Set learning parameters lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625] - lr_schedule = C.learning_rate_schedule(lr_per_sample, unit=C.learners.UnitType.sample, epoch_size=epoch_size) - mm_time_constant = [0]*20 + [600]*20 + [1200] - mm_schedule = C.learners.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size) + lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=epoch_size) + mms = [0]*20 + [0.9983347214509387]*20 + [0.9991670137924583] + mm_schedule = C.learners.momentum_schedule_per_sample(mms, epoch_size=epoch_size) l2_reg_weight = 0.002 # Create learner diff --git a/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py b/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py index 9a22b77ab025..0a31be1bd49e 100644 --- a/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py +++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py @@ -55,9 +55,9 @@ def convnet_mnist(debug_output=False, epoch_size=60000, minibatch_size=64, max_e # Set learning parameters lr_per_sample = [0.001]*10 + [0.0005]*10 + [0.0001] - lr_schedule = C.learning_rate_schedule(lr_per_sample, C.learners.UnitType.sample, epoch_size) - mm_time_constant = [0]*5 + [1024] - mm_schedule = C.learners.momentum_as_time_constant_schedule(mm_time_constant, epoch_size) + lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=epoch_size) + mms = [0]*5 + [0.9990239141819757] + mm_schedule = C.learners.momentum_schedule_per_sample(mms, epoch_size=epoch_size) # Instantiate the trainer object to drive the model training learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule) diff --git a/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_CIFAR10.py b/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_CIFAR10.py index 42e1989e8911..75230cd78825 100644 --- a/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_CIFAR10.py +++ b/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_CIFAR10.py @@ -17,7 +17,7 @@ import cntk.io.transforms as xforms from cntk.debugging import start_profiler, stop_profiler, enable_profiler from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP -from cntk.learners import learning_rate_schedule, momentum_schedule, momentum_sgd, UnitType +from cntk.learners import learning_parameter_schedule, momentum_schedule, momentum_sgd from cntk.logging import ProgressPrinter, log_number_of_parameters from cntk.losses import cross_entropy_with_softmax from cntk.metrics import classification_error @@ -114,7 +114,7 @@ def create_trainer(network, epoch_size, num_epochs, minibatch_size, 
progress_wri lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval) learning_rate *= learn_rate_decrease_factor - lr_schedule = learning_rate_schedule(lr_per_mb, unit=UnitType.minibatch, epoch_size=epoch_size) + lr_schedule = learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size) mm_schedule = momentum_schedule(0.9) l2_reg_weight = 0.0001 # CNTK L2 regularization is per sample, thus same as Caffe diff --git a/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_CIFAR10_Distributed.py b/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_CIFAR10_Distributed.py index 0516e011cdad..56e7c5861979 100644 --- a/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_CIFAR10_Distributed.py +++ b/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_CIFAR10_Distributed.py @@ -16,7 +16,7 @@ import cntk.io.transforms as xforms from cntk.debugging import start_profiler, stop_profiler -from cntk.learners import learning_rate_schedule, momentum_schedule, momentum_sgd, UnitType +from cntk.learners import learning_parameter_schedule, momentum_schedule, momentum_sgd from cntk.logging import ProgressPrinter, log_number_of_parameters from cntk.ops import input from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP @@ -51,7 +51,7 @@ def create_trainer(network, epoch_size, num_epochs, minibatch_size, num_quantiza lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval) learning_rate *= learn_rate_decrease_factor - lr_schedule = learning_rate_schedule(lr_per_mb, unit=UnitType.minibatch, epoch_size=epoch_size) + lr_schedule = learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size) mm_schedule = momentum_schedule(0.9) l2_reg_weight = 0.0001 # CNTK L2 regularization is per sample, thus same as Caffe diff --git a/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_ImageNet.py b/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_ImageNet.py index 425c7b099dd7..81fc9915b8d9 100644 --- a/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_ImageNet.py +++ b/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_ImageNet.py @@ -17,7 +17,7 @@ import cntk.io.transforms as xforms from cntk.debugging import start_profiler, stop_profiler, enable_profiler from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP -from cntk.learners import learning_rate_schedule, momentum_schedule, momentum_sgd, UnitType +from cntk.learners import learning_parameter_schedule, momentum_schedule, momentum_sgd from cntk.logging import ProgressPrinter, log_number_of_parameters from cntk.losses import cross_entropy_with_softmax from cntk.metrics import classification_error @@ -115,7 +115,7 @@ def create_trainer(network, epoch_size, num_epochs, minibatch_size): lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval) learning_rate *= learn_rate_decrease_factor - lr_schedule = learning_rate_schedule(lr_per_mb, unit=UnitType.minibatch, epoch_size=epoch_size) + lr_schedule = learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size) mm_schedule = momentum_schedule(0.9) l2_reg_weight = 0.0001 # CNTK L2 regularization is per sample, thus same as Caffe diff --git a/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_ImageNet_Distributed.py b/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_ImageNet_Distributed.py index 
4ceec63e4ba7..7bf0902bbf09 100644 --- a/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_ImageNet_Distributed.py +++ b/Examples/Image/Classification/GoogLeNet/BN-Inception/Python/BN_Inception_ImageNet_Distributed.py @@ -16,7 +16,7 @@ import cntk.io.transforms as xforms from cntk.debugging import start_profiler, stop_profiler -from cntk.learners import learning_rate_schedule, momentum_schedule, momentum_sgd, UnitType +from cntk.learners import learning_parameter_schedule, momentum_schedule, momentum_sgd from cntk.logging import ProgressPrinter, log_number_of_parameters from cntk.train.distributed import data_parallel_distributed_learner, Communicator from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP @@ -51,7 +51,7 @@ def create_trainer(network, epoch_size, num_epochs, minibatch_size, num_quantiza lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval) learning_rate *= learn_rate_decrease_factor - lr_schedule = learning_rate_schedule(lr_per_mb, unit=UnitType.minibatch, epoch_size=epoch_size) + lr_schedule = learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size) mm_schedule = momentum_schedule(0.9) l2_reg_weight = 0.0001 # CNTK L2 regularization is per sample, thus same as Caffe diff --git a/Examples/Image/Classification/GoogLeNet/InceptionV3/Python/InceptionV3_ImageNet.py b/Examples/Image/Classification/GoogLeNet/InceptionV3/Python/InceptionV3_ImageNet.py index f83f7432b28b..08a84f9882ee 100644 --- a/Examples/Image/Classification/GoogLeNet/InceptionV3/Python/InceptionV3_ImageNet.py +++ b/Examples/Image/Classification/GoogLeNet/InceptionV3/Python/InceptionV3_ImageNet.py @@ -103,7 +103,7 @@ def create_trainer(network, epoch_size, num_epochs, minibatch_size): lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval) learning_rate *= learn_rate_decrease_factor - lr_schedule = C.learners.learning_rate_schedule(lr_per_mb, unit=C.learners.UnitType.minibatch, epoch_size=epoch_size) + lr_schedule = C.learners.learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size) mm_schedule = C.learners.momentum_schedule(0.9) l2_reg_weight = 0.0001 # CNTK L2 regularization is per sample, thus same as Caffe diff --git a/Examples/Image/Classification/GoogLeNet/InceptionV3/Python/InceptionV3_ImageNet_Distributed.py b/Examples/Image/Classification/GoogLeNet/InceptionV3/Python/InceptionV3_ImageNet_Distributed.py index 31fdcb9d87c8..7a0861cb3f66 100644 --- a/Examples/Image/Classification/GoogLeNet/InceptionV3/Python/InceptionV3_ImageNet_Distributed.py +++ b/Examples/Image/Classification/GoogLeNet/InceptionV3/Python/InceptionV3_ImageNet_Distributed.py @@ -39,7 +39,7 @@ def create_trainer(network, epoch_size, num_epochs, minibatch_size, num_quantiza lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval) learning_rate *= learn_rate_decrease_factor - lr_schedule = C.learners.learning_rate_schedule(lr_per_mb, unit=C.learners.UnitType.minibatch, epoch_size=epoch_size) + lr_schedule = C.learners.learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size) mm_schedule = C.learners.momentum_schedule(0.9) l2_reg_weight = 0.0001 # CNTK L2 regularization is per sample, thus same as Caffe diff --git a/Examples/Image/Classification/MLP/Python/SimpleMNIST.py b/Examples/Image/Classification/MLP/Python/SimpleMNIST.py index e9685b4395a9..10a3634613cf 100644 --- a/Examples/Image/Classification/MLP/Python/SimpleMNIST.py +++ b/Examples/Image/Classification/MLP/Python/SimpleMNIST.py @@ -12,7 +12,7 @@ from cntk.train import Trainer, 
minibatch_size_schedule from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT from cntk.device import cpu, try_set_default_device -from cntk.learners import adadelta, learning_rate_schedule, UnitType +from cntk.learners import adadelta, learning_parameter_schedule_per_sample from cntk.ops import relu, element_times, constant from cntk.layers import Dense, Sequential, For from cntk.losses import cross_entropy_with_softmax @@ -85,7 +85,7 @@ def simple_mnist(tensorboard_logdir=None): progress_writers.append(TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z)) # Instantiate the trainer object to drive the model training - lr = learning_rate_schedule(1, UnitType.sample) + lr = learning_parameter_schedule_per_sample(1) trainer = Trainer(z, (ce, pe), adadelta(z.parameters, lr), progress_writers) training_session( diff --git a/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10.py b/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10.py index bf6409172e9e..c6b7f735a168 100644 --- a/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10.py +++ b/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10.py @@ -13,7 +13,7 @@ from cntk import cross_entropy_with_softmax, classification_error, reduce_mean from cntk import Trainer, cntk_py from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs -from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType +from cntk.learners import momentum_sgd, learning_parameter_schedule_per_sample, momentum_schedule from cntk.debugging import * from cntk.logging import * from resnet_models import * @@ -80,13 +80,12 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_ # shared training parameters minibatch_size = 128 - momentum_time_constant = -minibatch_size/np.log(0.9) l2_reg_weight = 0.0001 # Set learning parameters lr_per_sample = [lr/minibatch_size for lr in lr_per_mb] - lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample) - mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant) + lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=epoch_size) + mm_schedule = momentum_schedule(0.9, minibatch_size) # progress writers progress_writers = [ProgressPrinter(tag='Training', log_to_file=log_dir, num_epochs=max_epochs, gen_heartbeat=gen_heartbeat)] diff --git a/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py b/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py index 4d9e162141a5..f0d325affbba 100644 --- a/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py +++ b/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py @@ -10,9 +10,10 @@ import cntk as C import numpy as np +import cntk as C from cntk import input, cross_entropy_with_softmax, classification_error, Trainer, cntk_py from cntk import data_parallel_distributed_learner, block_momentum_distributed_learner, Communicator -from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType +from cntk.learners import momentum_sgd, learning_parameter_schedule, momentum_schedule from cntk.device import try_set_default_device, gpu from cntk.train.training_session import * from cntk.debugging import * @@ -71,15 +72,13 @@ def create_trainer(network, minibatch_size, epoch_size, 
num_quantization_bits, b else: return RuntimeError("Unknown model name!") - momentum_time_constant = -minibatch_size/np.log(0.9) l2_reg_weight = 0.0001 # Set learning parameters minibatch_size = 128 lr_per_sample = [lr/minibatch_size for lr in lr_per_mb] - lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample) - mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant) - + lr_schedule = learning_parameter_schedule(lr_per_mb, minibatch_size = minibatch_size, epoch_size=epoch_size) + mm_schedule = momentum_schedule(0.9, minibatch_size = minibatch_size) # learner object if block_size != None and num_quantization_bits != 32: raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.") diff --git a/Examples/Image/Classification/VGG/Python/VGG16_ImageNet_Distributed.py b/Examples/Image/Classification/VGG/Python/VGG16_ImageNet_Distributed.py index 449236103cfb..ac6b8ac6e43c 100644 --- a/Examples/Image/Classification/VGG/Python/VGG16_ImageNet_Distributed.py +++ b/Examples/Image/Classification/VGG/Python/VGG16_ImageNet_Distributed.py @@ -136,7 +136,7 @@ def create_vgg16(): def create_trainer(network, epoch_size, num_quantization_bits, progress_printer): # Set learning parameters lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001] - lr_schedule = C.learning_rate_schedule(lr_per_mb, unit=C.learners.UnitType.minibatch, epoch_size=epoch_size) + lr_schedule = C.learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size) mm_schedule = C.learners.momentum_schedule(0.9) l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe diff --git a/Examples/Image/Classification/VGG/Python/VGG19_ImageNet_Distributed.py b/Examples/Image/Classification/VGG/Python/VGG19_ImageNet_Distributed.py index 7462632f45f7..3d9fe61d5334 100644 --- a/Examples/Image/Classification/VGG/Python/VGG19_ImageNet_Distributed.py +++ b/Examples/Image/Classification/VGG/Python/VGG19_ImageNet_Distributed.py @@ -136,7 +136,7 @@ def create_vgg19(): def create_trainer(network, epoch_size, num_quantization_bits, progress_printer): # Set learning parameters lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001] - lr_schedule = C.learning_rate_schedule(lr_per_mb, unit=C.learners.UnitType.minibatch, epoch_size=epoch_size) + lr_schedule = C.learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size) mm_schedule = C.learners.momentum_schedule(0.9) l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe diff --git a/Examples/Image/Detection/FastRCNN/BrainScript/A2_RunWithPyModel.py b/Examples/Image/Detection/FastRCNN/BrainScript/A2_RunWithPyModel.py index 76c416b3f172..7ee6f3711c9f 100644 --- a/Examples/Image/Detection/FastRCNN/BrainScript/A2_RunWithPyModel.py +++ b/Examples/Image/Detection/FastRCNN/BrainScript/A2_RunWithPyModel.py @@ -11,7 +11,7 @@ from cntk.io import MinibatchSource, ImageDeserializer, CTFDeserializer, StreamDefs, StreamDef from cntk.io.transforms import scale from cntk.layers import placeholder, Constant -from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule +from cntk.learners import momentum_sgd, learning_parameter_schedule_per_sample, momentum_schedule_per_sample from cntk.logging import log_number_of_parameters, ProgressPrinter from cntk.logging.graph import find_by_name, plot import PARAMETERS @@ -44,10 +44,10 @@ num_test_images = p.cntk_num_test_images mb_size = p.cntk_mb_size 
max_epochs = p.cntk_max_epochs -momentum_time_constant = p.cntk_momentum_time_constant distributed_flg = p.distributed_flg num_quantization_bits = p.num_quantization_bits warm_up = p.warm_up +momentum_per_sample = p.cntk_momentum_per_sample # model specific variables (only AlexNet for now) base_model = "AlexNet" @@ -154,8 +154,8 @@ def train_fast_rcnn(debug_output=False, model_path=model_file): # Set learning parameters l2_reg_weight = 0.0005 lr_per_sample = [0.00001] * 10 + [0.000001] * 5 + [0.0000001] - lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample) - mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant) + lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample) + mm_schedule = momentum_schedule_per_sample(momentum_per_sample) # Instantiate the trainer object as default learner = momentum_sgd(frcn_output.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight) diff --git a/Examples/Image/Detection/FastRCNN/BrainScript/PARAMETERS.py b/Examples/Image/Detection/FastRCNN/BrainScript/PARAMETERS.py index 1470f43abff8..9ba3f40488d4 100644 --- a/Examples/Image/Detection/FastRCNN/BrainScript/PARAMETERS.py +++ b/Examples/Image/Detection/FastRCNN/BrainScript/PARAMETERS.py @@ -54,7 +54,7 @@ def __init__(self, datasetName): self.cntk_num_test_images = -1 # set per data set below self.cntk_mb_size = -1 # set per data set below self.cntk_max_epochs = -1 # set per data set below - self.cntk_momentum_time_constant = -1 # set per data set below + self.cntk_momentum_per_sample = -1 # set per data set below # for Distributed learner self.distributed_flg = False # In case of distributed learning, set 'True' @@ -83,7 +83,7 @@ def __init__(self, datasetName): self.cntk_num_test_images = 5 self.cntk_mb_size = 5 self.cntk_max_epochs = 20 - self.cntk_momentum_time_constant = 10 + self.cntk_momentum_per_sample = 0.8187307530779818 # postprocessing self.nmsThreshold = 0.01 @@ -117,7 +117,7 @@ def __init__(self, datasetName): self.cntk_num_test_images = 4952 self.cntk_mb_size = 2 self.cntk_max_epochs = 17 - self.cntk_momentum_time_constant = 20 + self.cntk_momentum_per_sample = 0.951229424500714 self.pascalDataDir = os.path.join(self.rootDir, "..", "..", "DataSets", "Pascal") self.imgDir = self.pascalDataDir diff --git a/Examples/Image/Detection/FastRCNN/FastRCNN_train.py b/Examples/Image/Detection/FastRCNN/FastRCNN_train.py index 377874189874..d98209c56cce 100644 --- a/Examples/Image/Detection/FastRCNN/FastRCNN_train.py +++ b/Examples/Image/Detection/FastRCNN/FastRCNN_train.py @@ -10,12 +10,12 @@ import argparse import easydict # pip install easydict import cntk -from cntk import Trainer, UnitType, load_model, Axis, input_variable, parameter, times, combine, \ +from cntk import Trainer, load_model, Axis, input_variable, parameter, times, combine, \ softmax, roipooling, plus, element_times, CloneMethod, alias, Communicator, reduce_sum from cntk.core import Value from cntk.initializer import normal from cntk.layers import placeholder, Constant, Sequential -from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_schedule +from cntk.learners import momentum_sgd, learning_parameter_schedule_per_sample, momentum_schedule from cntk.logging import log_number_of_parameters, ProgressPrinter from cntk.logging.graph import find_by_name, plot from cntk.losses import cross_entropy_with_softmax @@ -295,11 +295,11 @@ def train_fast_rcnn(cfg): biases = [p for p in params if '.b' in p.name or 'b' == p.name] others = [p for p in params if 
not p in biases] bias_lr_mult = cfg["CNTK"].BIAS_LR_MULT - lr_schedule = learning_rate_schedule(lr_per_sample_scaled, unit=UnitType.sample) + lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample_scaled) learner = momentum_sgd(others, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight, unit_gain=False, use_mean_gradient=True) bias_lr_per_sample = [v * bias_lr_mult for v in cfg["CNTK"].LR_PER_SAMPLE] - bias_lr_schedule = learning_rate_schedule(bias_lr_per_sample, unit=UnitType.sample) + bias_lr_schedule = learning_parameter_schedule_per_sample(bias_lr_per_sample) bias_learner = momentum_sgd(biases, bias_lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight, unit_gain=False, use_mean_gradient=True) trainer = Trainer(None, (loss, pred_error), [learner, bias_learner]) diff --git a/Examples/Image/Detection/FasterRCNN/FasterRCNN_train.py b/Examples/Image/Detection/FasterRCNN/FasterRCNN_train.py index 0822826c01ae..ec321a43b6d3 100644 --- a/Examples/Image/Detection/FasterRCNN/FasterRCNN_train.py +++ b/Examples/Image/Detection/FasterRCNN/FasterRCNN_train.py @@ -10,13 +10,13 @@ import argparse import easydict # pip install easydict import cntk -from cntk import Trainer, UnitType, load_model, Axis, input_variable, parameter, times, combine, \ +from cntk import Trainer, load_model, Axis, input_variable, parameter, times, combine, \ softmax, roipooling, plus, element_times, CloneMethod, alias, Communicator, reduce_sum from cntk.core import Value from cntk.io import MinibatchData from cntk.initializer import normal from cntk.layers import placeholder, Constant, Sequential -from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_schedule +from cntk.learners import momentum_sgd, learning_parameter_schedule_per_sample, momentum_schedule from cntk.logging import log_number_of_parameters, ProgressPrinter from cntk.logging.graph import find_by_name, plot from cntk.losses import cross_entropy_with_softmax @@ -521,12 +521,12 @@ def train_model(image_input, roi_input, dims_input, loss, pred_error, print("bias_lr_mult: {}".format(bias_lr_mult)) # Instantiate the learners and the trainer object - lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample) + lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample) learner = momentum_sgd(others, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight, unit_gain=False, use_mean_gradient=True) bias_lr_per_sample = [v * bias_lr_mult for v in lr_per_sample] - bias_lr_schedule = learning_rate_schedule(bias_lr_per_sample, unit=UnitType.sample) + bias_lr_schedule = learning_parameter_schedule_per_sample(bias_lr_per_sample) bias_learner = momentum_sgd(biases, bias_lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight, unit_gain=False, use_mean_gradient=True) trainer = Trainer(None, (loss, pred_error), [learner, bias_learner]) diff --git a/Examples/Image/GettingStarted/07_Deconvolution_PY.py b/Examples/Image/GettingStarted/07_Deconvolution_PY.py index 5c18865a511d..b4e4b405d0ce 100644 --- a/Examples/Image/GettingStarted/07_Deconvolution_PY.py +++ b/Examples/Image/GettingStarted/07_Deconvolution_PY.py @@ -57,8 +57,8 @@ def deconv_mnist(max_epochs=3): minibatch_size = 64 # Set learning parameters - lr_schedule = C.learning_rate_schedule([0.00015], C.learners.UnitType.sample, epoch_size) - mm_schedule = C.learners.momentum_as_time_constant_schedule([600], epoch_size) + lr_schedule = C.learning_parameter_schedule_per_sample([0.00015], epoch_size=epoch_size) + mm_schedule = 
C.learners.momentum_schedule_per_sample([0.9983347214509387], epoch_size=epoch_size) # Instantiate the trainer object to drive the model training learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule, unit_gain=True) diff --git a/Examples/Image/TransferLearning/TransferLearning.py b/Examples/Image/TransferLearning/TransferLearning.py index e96b636b1c37..3b807cfab217 100644 --- a/Examples/Image/TransferLearning/TransferLearning.py +++ b/Examples/Image/TransferLearning/TransferLearning.py @@ -11,12 +11,12 @@ from PIL import Image from cntk.device import try_set_default_device, gpu from cntk import load_model, placeholder, Constant -from cntk import Trainer, UnitType +from cntk import Trainer from cntk.logging.graph import find_by_name, get_node_outputs from cntk.io import MinibatchSource, ImageDeserializer, StreamDefs, StreamDef import cntk.io.transforms as xforms from cntk.layers import Dense -from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_schedule +from cntk.learners import momentum_sgd, learning_parameter_schedule, momentum_schedule from cntk.ops import combine, softmax from cntk.ops.functions import CloneMethod from cntk.losses import cross_entropy_with_softmax @@ -114,7 +114,7 @@ def train_model(base_model_file, feature_node_name, last_hidden_node_name, pe = classification_error(tl_model, label_input) # Instantiate the trainer object - lr_schedule = learning_rate_schedule(lr_per_mb, unit=UnitType.minibatch) + lr_schedule = learning_parameter_schedule(lr_per_mb) mm_schedule = momentum_schedule(momentum_per_mb) learner = momentum_sgd(tl_model.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight) progress_printer = ProgressPrinter(tag='Training', num_epochs=num_epochs) diff --git a/Examples/LanguageUnderstanding/ATIS/Python/LanguageUnderstanding.py b/Examples/LanguageUnderstanding/ATIS/Python/LanguageUnderstanding.py index 6f37ee5f94df..a06f3a1df113 100644 --- a/Examples/LanguageUnderstanding/ATIS/Python/LanguageUnderstanding.py +++ b/Examples/LanguageUnderstanding/ATIS/Python/LanguageUnderstanding.py @@ -138,8 +138,8 @@ def train(reader, model, max_epochs): # SGD parameters learner = cntk.learners.fsadagrad(criterion.parameters, - lr = cntk.learners.learning_rate_schedule([0.003]*2+[0.0015]*12+[0.0003], cntk.learners.UnitType.sample, epoch_size), - momentum = cntk.learners.momentum_as_time_constant_schedule(minibatch_size / -math.log(0.9)), + lr = cntk.learners.learning_parameter_schedule_per_sample([0.003]*2+[0.0015]*12+[0.0003], epoch_size=epoch_size), + momentum = cntk.learners.momentum_schedule(0.9, minibatch_size), gradient_clipping_threshold_per_sample = 15, gradient_clipping_with_truncation = True) diff --git a/Examples/ReinforcementLearning/DeepQNeuralNetwork.py b/Examples/ReinforcementLearning/DeepQNeuralNetwork.py index d6324dfd2854..0b348c2b3e93 100644 --- a/Examples/ReinforcementLearning/DeepQNeuralNetwork.py +++ b/Examples/ReinforcementLearning/DeepQNeuralNetwork.py @@ -12,7 +12,7 @@ from cntk.initializer import he_uniform from cntk.layers import Sequential, Convolution2D, Dense, default_options from cntk.layers.typing import Signature, Tensor -from cntk.learners import adam, learning_rate_schedule, momentum_schedule, UnitType +from cntk.learners import adam, learning_parameter_schedule, momentum_schedule from cntk.logging import TensorBoardProgressWriter from cntk.ops import abs, argmax, element_select, less, relu, reduce_max, reduce_sum, square from cntk.ops.functions import CloneMethod, Function @@ -317,7 
+317,7 @@ def criterion(pre_states, actions, post_states, rewards, terminals): return huber_loss(q_targets, q_acted, 1.0) # Adam based SGD - lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch) + lr_schedule = learning_parameter_schedule(learning_rate) m_schedule = momentum_schedule(momentum) vm_schedule = momentum_schedule(0.999) l_sgd = adam(self._action_value_net.parameters, lr_schedule, diff --git a/Examples/SequenceClassification/SimpleExample/Python/SequenceClassification.py b/Examples/SequenceClassification/SimpleExample/Python/SequenceClassification.py index 6825dd86a2f8..51389152eb92 100644 --- a/Examples/SequenceClassification/SimpleExample/Python/SequenceClassification.py +++ b/Examples/SequenceClassification/SimpleExample/Python/SequenceClassification.py @@ -53,7 +53,7 @@ def train_sequence_classifier(): label : reader.streams.labels } - lr_per_sample = C.learning_rate_schedule(0.1, C.UnitType.sample) + lr_per_sample = C.learning_parameter_schedule_per_sample(0.1) # Instantiate the trainer object to drive the model training progress_printer = C.logging.ProgressPrinter(0) diff --git a/Examples/SequenceToSequence/CMUDict/Python/Sequence2Sequence.py b/Examples/SequenceToSequence/CMUDict/Python/Sequence2Sequence.py index bce1e8d2f53b..b3e84ab5cf7f 100644 --- a/Examples/SequenceToSequence/CMUDict/Python/Sequence2Sequence.py +++ b/Examples/SequenceToSequence/CMUDict/Python/Sequence2Sequence.py @@ -9,7 +9,8 @@ import os from cntk import Trainer, Axis from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT -from cntk.learners import momentum_sgd, fsadagrad, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType +from cntk.learners import momentum_sgd, fsadagrad, momentum_schedule_per_sample, \ + learning_parameter_schedule, learning_parameter_schedule_per_sample from cntk import input, cross_entropy_with_softmax, classification_error, sequence, \ element_select, alias, hardmax, placeholder, combine, parameter, times, plus from cntk.ops.functions import CloneMethod, load_model, Function @@ -218,8 +219,8 @@ def train(train_reader, valid_reader, vocab, i2w, s2smodel, max_epochs, epoch_si minibatch_size = 72 lr = 0.001 if use_attention else 0.005 # TODO: can we use the same value for both? 
learner = fsadagrad(model_train.parameters, - lr = learning_rate_schedule([lr]*2+[lr/2]*3+[lr/4], UnitType.sample, epoch_size), - momentum = momentum_as_time_constant_schedule(1100), + lr = learning_parameter_schedule_per_sample([lr]*2+[lr/2]*3+[lr/4], epoch_size=epoch_size), + momentum = momentum_schedule_per_sample(0.9990913221888589), gradient_clipping_threshold_per_sample=2.3, gradient_clipping_with_truncation=True) trainer = Trainer(None, criterion, learner) @@ -314,7 +315,7 @@ def evaluate_decoding(reader, s2smodel, i2w): # TODO: replace by a proper such class once available def Evaluator(model, criterion): from cntk import Trainer - from cntk.learners import momentum_sgd, learning_rate_schedule, UnitType, momentum_as_time_constant_schedule + from cntk.learners import momentum_sgd, momentum_schedule_per_sample loss, metric = Trainer._get_loss_metric(criterion) parameters = set(loss.parameters) if model: @@ -322,8 +323,8 @@ def Evaluator(model, criterion): if metric: parameters |= set(metric.parameters) dummy_learner = momentum_sgd(tuple(parameters), - lr = learning_rate_schedule(1, UnitType.minibatch), - momentum = momentum_as_time_constant_schedule(0)) + lr = learning_parameter_schedule(1), + momentum = momentum_schedule_per_sample(0)) return Trainer(model, (loss, metric), dummy_learner) # This computes the metric on the test set. diff --git a/Examples/SequenceToSequence/CMUDict/Python/Sequence2Sequence_Distributed.py b/Examples/SequenceToSequence/CMUDict/Python/Sequence2Sequence_Distributed.py index cbd2f19df364..51abe8426740 100644 --- a/Examples/SequenceToSequence/CMUDict/Python/Sequence2Sequence_Distributed.py +++ b/Examples/SequenceToSequence/CMUDict/Python/Sequence2Sequence_Distributed.py @@ -15,7 +15,7 @@ from cntk import Trainer from cntk.train.distributed import Communicator, data_parallel_distributed_learner, block_momentum_distributed_learner from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP -from cntk.learners import fsadagrad, learning_rate_schedule, UnitType, momentum_as_time_constant_schedule +from cntk.learners import fsadagrad, learning_parameter_schedule_per_sample, momentum_schedule, momentum_schedule_per_sample from cntk.train.training_session import * from cntk.logging import * @@ -50,8 +50,8 @@ def train_and_test(s2smodel, train_reader, test_reader, block_size, num_quantiza lr = 0.001 if use_attention else 0.005 # TODO: can we use the same value for both? 
local_learner = fsadagrad(model_train.parameters, - lr = learning_rate_schedule([lr]*2+[lr/2]*3+[lr/4], UnitType.sample, epoch_size), - momentum = momentum_as_time_constant_schedule(1100), + lr = learning_parameter_schedule_per_sample([lr]*2+[lr/2]*3+[lr/4], epoch_size=epoch_size), + momentum = momentum_schedule_per_sample(0.9990913221888589), gradient_clipping_threshold_per_sample=2.3, gradient_clipping_with_truncation=True) diff --git a/Examples/Speech/AN4/Python/HTK_LSTM_Truncated_Distributed.py b/Examples/Speech/AN4/Python/HTK_LSTM_Truncated_Distributed.py index f802e3625bdf..c029bb6742e4 100644 --- a/Examples/Speech/AN4/Python/HTK_LSTM_Truncated_Distributed.py +++ b/Examples/Speech/AN4/Python/HTK_LSTM_Truncated_Distributed.py @@ -72,8 +72,8 @@ def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_ lr = [0.001] local_learner = fsadagrad(network['output'].parameters, - lr=learning_rate_schedule(lr, UnitType.sample, epoch_size), - momentum=momentum_as_time_constant_schedule(1000), + lr=learning_parameter_schedule_per_sample(lr, epoch_size=epoch_size), + momentum=momentum_schedule_per_sample(0.9990913221888589), gradient_clipping_threshold_per_sample=15, gradient_clipping_with_truncation=True) if block_size != None: diff --git a/Examples/Text/CharacterLM/char_rnn.py b/Examples/Text/CharacterLM/char_rnn.py index 319d93a1bc62..f3b896a210d9 100644 --- a/Examples/Text/CharacterLM/char_rnn.py +++ b/Examples/Text/CharacterLM/char_rnn.py @@ -9,7 +9,7 @@ import os import sys from cntk import Trainer, Axis -from cntk.learners import momentum_sgd, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType +from cntk.learners import momentum_sgd, momentum_schedule_per_sample, learning_parameter_schedule_per_sample from cntk.ops import sequence from cntk.losses import cross_entropy_with_softmax from cntk.metrics import classification_error @@ -157,11 +157,11 @@ def train_lm(training_file, epochs, max_num_minibatches): errs = classification_error(z, label_sequence) # Instantiate the trainer object to drive the model training - lr_per_sample = learning_rate_schedule(0.001, UnitType.sample) - momentum_time_constant = momentum_as_time_constant_schedule(1100) + lr_per_sample = learning_parameter_schedule_per_sample(0.001) + momentum_schedule = momentum_schedule_per_sample(0.9990913221888589) clipping_threshold_per_sample = 5.0 gradient_clipping_with_truncation = True - learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, + learner = momentum_sgd(z.parameters, lr_per_sample, momentum_schedule, gradient_clipping_threshold_per_sample=clipping_threshold_per_sample, gradient_clipping_with_truncation=gradient_clipping_with_truncation) progress_printer = ProgressPrinter(freq=100, tag='Training') diff --git a/Examples/Text/LightRNN/LightRNN/train.py b/Examples/Text/LightRNN/LightRNN/train.py index ad18c7dafd03..68b6e1ca751a 100644 --- a/Examples/Text/LightRNN/LightRNN/train.py +++ b/Examples/Text/LightRNN/LightRNN/train.py @@ -190,12 +190,12 @@ def create_criterion(network): # return: learners: [sgd, adam, adagrad] def create_learner(model): '''Create the optimized method''' - lr_per_minibatch = C.learning_rate_schedule(opt.lr, C.UnitType.minibatch) - momentum_time_constant = C.momentum_as_time_constant_schedule(1100) + lr_per_minibatch = C.learning_parameter_schedule(opt.lr) + momentum_schedule = C.momentum_schedule_per_sample(0.9990913221888589) if opt.optim == 'sgd': return C.sgd(model.parameters, lr=lr_per_minibatch) elif opt.optim == 'adam': - return 
C.adam(model.parameters, lr=lr_per_minibatch, momentum=momentum_time_constant) + return C.adam(model.parameters, lr=lr_per_minibatch, momentum=momentum_schedule) elif opt.optim == 'adagrad': return C.adagrad(model.parameters, lr=lr_per_minibatch) else: diff --git a/Examples/Text/WordLMWithSampledSoftmax/word_rnn.py b/Examples/Text/WordLMWithSampledSoftmax/word_rnn.py index 741c11957dd7..84b9ab3bceb5 100644 --- a/Examples/Text/WordLMWithSampledSoftmax/word_rnn.py +++ b/Examples/Text/WordLMWithSampledSoftmax/word_rnn.py @@ -10,7 +10,7 @@ import timeit from cntk import Axis from cntk.train import Trainer -from cntk.learners import momentum_sgd, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType +from cntk.learners import momentum_sgd from cntk.ops import sequence from cntk.losses import cross_entropy_with_softmax from cntk.metrics import classification_error @@ -34,7 +34,7 @@ sequences_per_batch = 10 alpha = 0.75 learning_rate = 0.002 -momentum_as_time_constant = 10000 +momentum_per_sample = 0.9999000049998333 clipping_threshold_per_sample = 5.0 token_to_id_path = './ptb/token2id.txt' validation_file_path = './ptb/valid.txt' @@ -196,8 +196,8 @@ def train_lm(testing=False): num_trained_samples_since_last_report = 0 # Instantiate the trainer object to drive the model training - lr_schedule = learning_rate_schedule(learning_rate, UnitType.sample) - momentum_schedule = momentum_as_time_constant_schedule(momentum_as_time_constant) + lr_schedule = C.learning_parameter_schedule_per_sample(learning_rate) + momentum_schedule = C.momentum_schedule_per_sample(momentum_per_sample) gradient_clipping_with_truncation = True learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule, gradient_clipping_threshold_per_sample=clipping_threshold_per_sample, diff --git a/Examples/Video/GettingStarted/Python/Conv3D_UCF11.py b/Examples/Video/GettingStarted/Python/Conv3D_UCF11.py index 960d1a2095e1..f8de4f5683df 100644 --- a/Examples/Video/GettingStarted/Python/Conv3D_UCF11.py +++ b/Examples/Video/GettingStarted/Python/Conv3D_UCF11.py @@ -192,9 +192,9 @@ def conv3d_ucf11(train_reader, test_reader, max_epochs=30): # Set learning parameters lr_per_sample = [0.01]*10+[0.001]*10+[0.0001] - lr_schedule = C.learning_rate_schedule(lr_per_sample, epoch_size=train_epoch_size, unit=C.UnitType.sample) - momentum_time_constant = 4096 - mm_schedule = C.momentum_as_time_constant_schedule([momentum_time_constant]) + lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=train_epoch_size) + momentum_per_sample = 0.9997558891748972 + mm_schedule = C.momentum_schedule_per_sample([momentum_per_sample]) # Instantiate the trainer object to drive the model training learner = C.momentum_sgd(z.parameters, lr_schedule, mm_schedule, True) diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py b/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py index a217581cbf68..ec3c88876692 100644 --- a/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py @@ -39,8 +39,8 @@ def test_htk_deserializers(): errs = C.classification_error (z, labels) learner = C.fsadagrad(z.parameters, - lr=C.learning_rate_schedule(lr, C.UnitType.sample, epoch_size), - momentum=C.momentum_as_time_constant_schedule(1000), + lr=C.learning_parameter_schedule_per_sample(lr, epoch_size=epoch_size), + momentum=C.momentum_schedule_per_sample(0.9990913221888589), gradient_clipping_threshold_per_sample=15, 
gradient_clipping_with_truncation=True) progress_printer = C.logging.ProgressPrinter(freq=0) trainer = C.Trainer(z, (ce, errs), learner, progress_printer) diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/multiple_distributed_learners_test.py b/Tests/EndToEndTests/CNTKv2Python/Examples/multiple_distributed_learners_test.py index c75d73b87b68..f3e3b504cdd5 100644 --- a/Tests/EndToEndTests/CNTKv2Python/Examples/multiple_distributed_learners_test.py +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/multiple_distributed_learners_test.py @@ -40,11 +40,11 @@ def test_sample_count_with_several_distributed_learners(): z = plus(n, p2, name='z') ce = squared_error(z, labels) - momentum_time_constant = C.momentum_as_time_constant_schedule(1100) - lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample) + momentum_schedule = C.momentum_schedule_per_sample(0.9990913221888589) + lr_per_sample = C.learning_parameter_schedule_per_sample(0.007) dist_learners = [ - C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p1], lr_per_sample, momentum_time_constant, True)), - C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p2], lr_per_sample, momentum_time_constant, True)) + C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p1], lr_per_sample, momentum_schedule, True)), + C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p2], lr_per_sample, momentum_schedule, True)) ] trainer = C.Trainer(z, ce, dist_learners) diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/word_rnn_test.py b/Tests/EndToEndTests/CNTKv2Python/Examples/word_rnn_test.py index beaf3dd6fe58..867286579388 100644 --- a/Tests/EndToEndTests/CNTKv2Python/Examples/word_rnn_test.py +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/word_rnn_test.py @@ -75,7 +75,7 @@ def test_word_rnn(device_id): W.sequences_per_batch = 2 W.alpha = 0.75 W.learning_rate = 0.02 - W.momentum_as_time_constant = 5 + W.momentum_per_sample = 0.8187307530779818 W.clipping_threshold_per_sample = 5.0 W.segment_sepparator = '' W.num_samples_between_progress_report = 2 diff --git a/Tests/EndToEndTests/EvalClientTests/CNTKLibraryCSEvalExamplesTest/train_models_for_evaluation.py b/Tests/EndToEndTests/EvalClientTests/CNTKLibraryCSEvalExamplesTest/train_models_for_evaluation.py index ecaba8bc350b..cdeec189dd38 100644 --- a/Tests/EndToEndTests/EvalClientTests/CNTKLibraryCSEvalExamplesTest/train_models_for_evaluation.py +++ b/Tests/EndToEndTests/EvalClientTests/CNTKLibraryCSEvalExamplesTest/train_models_for_evaluation.py @@ -77,8 +77,8 @@ def LanguageUnderstanding_train(reader, model, max_epochs): minibatch_size = 70 learner = fsadagrad(criterion.parameters, - lr = learning_rate_schedule([0.003]*2+[0.0015]*12+[0.0003], UnitType.sample, epoch_size), - momentum = momentum_as_time_constant_schedule(minibatch_size / -math.log(0.9)), + lr = learning_parameter_schedule_per_sample([0.003]*2+[0.0015]*12+[0.0003], epoch_size=epoch_size), + momentum = momentum_schedule(0.9, minibatch_size), gradient_clipping_threshold_per_sample = 15, gradient_clipping_with_truncation = True) diff --git a/bindings/python/cntk/contrib/deeprl/agent/policy_gradient.py b/bindings/python/cntk/contrib/deeprl/agent/policy_gradient.py index 70543c3a537c..65e22b566068 100644 --- a/bindings/python/cntk/contrib/deeprl/agent/policy_gradient.py +++ b/bindings/python/cntk/contrib/deeprl/agent/policy_gradient.py @@ -235,9 +235,8 @@ def _set_up_policy_network_and_value_network(self): (combined_loss, None), C.learners.adam( combined_networks.parameters, - 
C.learners.learning_rate_schedule( - self._parameters.initial_eta, - C.learners.UnitType.sample), + C.learners.learning_parameter_schedule_per_sample( + self._parameters.initial_eta), momentum=C.learners.momentum_schedule(self._parameters.momentum), variance_momentum=C.learners.momentum_schedule(0.999), minibatch_size=C.learners.IGNORE)) @@ -255,8 +254,7 @@ def _adjust_learning_rate(self): (self._parameters.initial_eta - self._parameters.eta_minimum) * (1 - float(self.step_count)/self._parameters.eta_decay_step_count)) self._trainer.parameter_learners[0].reset_learning_rate( - C.learners.learning_rate_schedule( - eta, C.learners.UnitType.sample)) + C.learners.learning_parameter_schedule_per_sample(eta)) def _choose_action(self, state): """ diff --git a/bindings/python/cntk/contrib/deeprl/agent/qlearning.py b/bindings/python/cntk/contrib/deeprl/agent/qlearning.py index 606897394302..c07a00c84888 100644 --- a/bindings/python/cntk/contrib/deeprl/agent/qlearning.py +++ b/bindings/python/cntk/contrib/deeprl/agent/qlearning.py @@ -105,8 +105,8 @@ def __init__(self, config_filename, o_space, a_space): # TODO: allow user to specify learner through config file. opt = C.learners.adam( self._q.parameters, - C.learners.learning_rate_schedule( - self._parameters.initial_eta, C.learners.UnitType.sample), + C.learners.learning_parameter_schedule_per_sample( + self._parameters.initial_eta), use_mean_gradient=True, momentum=C.learners.momentum_schedule(self._parameters.momentum), variance_momentum=C.learners.momentum_schedule(0.999), @@ -226,8 +226,8 @@ def _adjust_learning_rate(self): (1 - float(self.step_count)/self._parameters.eta_decay_step_count)) self._trainer.parameter_learners[0].reset_learning_rate( - C.learners.learning_rate_schedule( - eta, C.learners.UnitType.sample)) + C.learners.learning_parameter_schedule_per_sample( + eta)) def _adjust_exploration_rate(self): self._epsilon = self._parameters.epsilon_minimum + max( diff --git a/bindings/python/cntk/debugging/tests/debug_test.py b/bindings/python/cntk/debugging/tests/debug_test.py index d6ac4c3e76d7..2c2f39c04e9a 100644 --- a/bindings/python/cntk/debugging/tests/debug_test.py +++ b/bindings/python/cntk/debugging/tests/debug_test.py @@ -5,7 +5,7 @@ import numpy as np import cntk as C -from cntk import sgd, Trainer, learning_rate_schedule, parameter, \ +from cntk import sgd, Trainer, learning_parameter_schedule, parameter, \ times, cross_entropy_with_softmax, \ classification_error, UnitType, combine from cntk.debugging.debug import debug_model, _DebugNode @@ -32,7 +32,7 @@ def _train_backcompatible_test(z, loss, eval_error, input_dim = 2 - lr_schedule = learning_rate_schedule(0.5, UnitType.minibatch) + lr_schedule = learning_parameter_schedule(0.5) learner = sgd(z.parameters, lr_schedule) trainer = Trainer(z, (loss, eval_error), [learner]) diff --git a/bindings/python/cntk/debugging/tests/userlearner_test.py b/bindings/python/cntk/debugging/tests/userlearner_test.py index 903d11ce86ca..fd654f998a04 100644 --- a/bindings/python/cntk/debugging/tests/userlearner_test.py +++ b/bindings/python/cntk/debugging/tests/userlearner_test.py @@ -5,7 +5,7 @@ import cntk as C from cntk import Axis, NDArrayView from cntk.logging import ProgressPrinter -from cntk.learners import UserLearner, sgd, learning_rate_schedule, UnitType +from cntk.learners import UserLearner, sgd, learning_parameter_schedule from cntk.layers import Dense, Sequential import pytest @@ -79,7 +79,7 @@ def update(self, gradient_values, training_sample_count, sweep_end): 
diff --git a/bindings/python/cntk/debugging/tests/userlearner_test.py b/bindings/python/cntk/debugging/tests/userlearner_test.py
index 903d11ce86ca..fd654f998a04 100644
--- a/bindings/python/cntk/debugging/tests/userlearner_test.py
+++ b/bindings/python/cntk/debugging/tests/userlearner_test.py
@@ -5,7 +5,7 @@
 import cntk as C
 from cntk import Axis, NDArrayView
 from cntk.logging import ProgressPrinter
-from cntk.learners import UserLearner, sgd, learning_rate_schedule, UnitType
+from cntk.learners import UserLearner, sgd, learning_parameter_schedule
 from cntk.layers import Dense, Sequential
 import pytest
@@ -79,7 +79,7 @@ def update(self, gradient_values, training_sample_count, sweep_end):

 ADDITIONAL_ARGUMENTS = [
     #(additional learning rate arguments (args), additional learner arguments (kwargs))
-    (C.learning_rate_schedule, [UnitType.minibatch], {'minibatch_size': 0}), #for backward compatible test
+    (C.learning_rate_schedule, [C.learners.UnitType.minibatch], {'minibatch_size': 0}), #for backward compatible test
     (C.learning_parameter_schedule, [25], {'minibatch_size': 25}), # test new API; 25 is the actually minibatch size
     (C.learning_parameter_schedule, [], {'minibatch_size': 0}), # test new API
     ]
diff --git a/bindings/python/cntk/io/tests/io_tests.py b/bindings/python/cntk/io/tests/io_tests.py
index 8baa70d2a926..1eec22da4947 100644
--- a/bindings/python/cntk/io/tests/io_tests.py
+++ b/bindings/python/cntk/io/tests/io_tests.py
@@ -896,8 +896,8 @@ def test_usermbsource_training(tmpdir, with_checkpoint_impl):
     mbs_cv = MBS_CV_CLASS(input_dim, num_output_classes)

     from cntk import sequence, parameter, plus, cross_entropy_with_softmax, \
-            classification_error, learning_rate_schedule, sgd, Trainer, \
-            training_session, times, UnitType
+            classification_error, learning_parameter_schedule_per_sample, sgd, Trainer, \
+            training_session, times

     feature = sequence.input_variable(shape=(input_dim,))
     label = C.input_variable(shape=(num_output_classes,))
@@ -908,7 +908,7 @@ def test_usermbsource_training(tmpdir, with_checkpoint_impl):
     #having a large learning rate to prevent the model from converging earlier where not all the intended samples are fed
     #note that training session can end earlier if there is no updates
-    lr_per_sample = learning_rate_schedule(0.3, UnitType.sample)
+    lr_per_sample = learning_parameter_schedule_per_sample(0.3)
     learner = sgd(z.parameters, lr_per_sample)
     trainer = Trainer(z, (ce, errs), [learner])
     input_map = {
diff --git a/bindings/python/cntk/learners/__init__.py b/bindings/python/cntk/learners/__init__.py
index 33fc75922941..2b34a31ae2a6 100644
--- a/bindings/python/cntk/learners/__init__.py
+++ b/bindings/python/cntk/learners/__init__.py
@@ -800,7 +800,7 @@ def nesterov(parameters, lr, momentum, unit_gain=default_unit_gain_value(),
     return opt

 @typemap
-def adadelta(parameters, lr=learning_rate_schedule(1, UnitType.sample), rho=0.95, epsilon=1e-8,
+def adadelta(parameters, lr=learning_parameter_schedule_per_sample(1), rho=0.95, epsilon=1e-8,
              l1_regularization_weight=0.0, l2_regularization_weight=0.0,
              gaussian_noise_injection_std_dev=0.0, gradient_clipping_threshold_per_sample=np.inf,
              gradient_clipping_with_truncation=True, use_mean_gradient=None,
diff --git a/bindings/python/cntk/learners/tests/distributed_learner_test.py b/bindings/python/cntk/learners/tests/distributed_learner_test.py
index 078370b0f1ab..1e775bf49638 100644
--- a/bindings/python/cntk/learners/tests/distributed_learner_test.py
+++ b/bindings/python/cntk/learners/tests/distributed_learner_test.py
@@ -51,7 +51,7 @@ def create_trainer(self, mode, config):
         self.trainer = C.Trainer(self.z, (self.z, None), learner, []) if learner else None

     def create_distributed_learner(self, mode, config):
-        local_learner = C.sgd(self.z.parameters, C.learning_rate_schedule(0.01, unit=C.learners.UnitType.sample))
+        local_learner = C.sgd(self.z.parameters, C.learning_parameter_schedule_per_sample(0.01))
         try:
             if mode == 'data_parallel':
                 if config is None:
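Note: the list-valued schedules used in the tests below keep their meaning under the new API; a list plus an epoch_size gives a rate that moves to the next list entry after each epoch_size samples, as I read the new constructors. For illustration (plain Python, not part of the diff):

    import cntk as C

    # 0.3 for the first 100 samples, then 0.2, then 0.1, then 0.0
    lr = C.learning_parameter_schedule_per_sample([0.3, 0.2, 0.1, 0.0], epoch_size=100)
    mm = C.momentum_schedule([0.6, 0.7, 0.8], epoch_size=100)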
diff --git a/bindings/python/cntk/learners/tests/learner_test.py b/bindings/python/cntk/learners/tests/learner_test.py
index d0f2f9810b99..0b1d7f1d0c7f 100644
--- a/bindings/python/cntk/learners/tests/learner_test.py
+++ b/bindings/python/cntk/learners/tests/learner_test.py
@@ -52,13 +52,13 @@
 LEARNER_LAMBDAS = [
     lambda params: C.adadelta(params),
-    lambda params: C.adagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch)),
-    lambda params: C.adam(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
-    lambda params: C.fsadagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
-    lambda params: C.nesterov(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
-    lambda params: C.rmsprop(params, lr=learning_rate_schedule(1, UnitType.minibatch), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8),
-    lambda params: C.sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch)),
-    lambda params: C.momentum_sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9))]
+    lambda params: C.adagrad(params, lr=learning_parameter_schedule(1)),
+    lambda params: C.adam(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)),
+    lambda params: C.fsadagrad(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)),
+    lambda params: C.nesterov(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)),
+    lambda params: C.rmsprop(params, lr=learning_parameter_schedule(1), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8),
+    lambda params: C.sgd(params, lr=learning_parameter_schedule(1)),
+    lambda params: C.momentum_sgd(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9))]

 @pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS_LEGACY)
 def test_learning_rate_schedule(params, expectation, minibatch_size):
@@ -466,7 +466,7 @@ def test_noise_injection_with_checkpointing():
     w2 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))
     w3 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))

-    lr=learning_rate_schedule(0.5, UnitType.sample)
+    lr=C.learning_parameter_schedule_per_sample(0.5)
     m=C.momentum_schedule(0.99)

     learner1 = C.momentum_sgd([w1], lr, m, gaussian_noise_injection_std_dev=0.5)
@@ -515,8 +515,8 @@ def test_learner_logging():
     lr_values = [0.3, 0.2, 0.1, 0]
     m_values = [0.6, 0.7, 0.8]
     learner = C.momentum_sgd(z.parameters,
-                  learning_rate_schedule(lr_values, UnitType.sample, 1),
-                  C.momentum_schedule(m_values, 1))
+                  C.learning_parameter_schedule_per_sample(lr_values, epoch_size=1),
+                  C.momentum_schedule(m_values, epoch_size=1))
     trainer = Trainer(z, (ce, errs), [learner], writer)

     for i in range(10):
@@ -572,7 +572,7 @@ def test_sweep_based_schedule(tmpdir, device_id):
     ce = cross_entropy_with_softmax(z, labels)
     errs = classification_error(z, labels)

-    lr_per_sample = learning_rate_schedule([0.3, 0.2, 0.1, 0.0], UnitType.sample)
+    lr_per_sample = C.learning_parameter_schedule_per_sample([0.3, 0.2, 0.1, 0.0])
     learner = sgd(z.parameters, lr_per_sample)
     trainer = Trainer(z, (ce, errs), [learner])
@@ -617,7 +617,7 @@ def generate_random_data(sample_size, feature_dim, num_classes):

 def test_learner_empy_parameters_list():
-    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
+    lr_per_sample = C.learning_parameter_schedule_per_sample(0.1)
     with pytest.raises(ValueError):
         learner = C.sgd([], lr_per_sample)
@@ -673,14 +673,14 @@ def test_sgd_with_noise():
     # in some layers. This tests that cuRand library will not
     # complain about generating an odd number of random values
     np.random.seed(98052)
-    learner = lambda params: sgd(params, lr=learning_rate_schedule(0.125, UnitType.minibatch), gaussian_noise_injection_std_dev=0.01)
+    learner = lambda params: sgd(params, lr=C.learning_parameter_schedule(0.125), gaussian_noise_injection_std_dev=0.01)
     ffnet(learner)
     # We just verify that we did not crash
     assert(True)

 def test_universal():
     np.random.seed(98052)
-    builtin_sgd = lambda params: sgd(params, lr=learning_rate_schedule(0.125, UnitType.minibatch))
+    builtin_sgd = lambda params: sgd(params, lr=C.learning_parameter_schedule(0.125))
     builtin_last_avg_error, builtin_avg_error, _ = ffnet(builtin_sgd)
     np.random.seed(98052)
     my_sgd = lambda ps, gs: C.combine([C.assign(p, p - 0.125/25 * g) for p, g in zip(ps, gs)])
@@ -735,10 +735,10 @@ def test_restore_from_checkpoint(tmpdir, learner):
 # this should be replaced with LEARNER_LAMBDAS
 SPARSE_AND_DENSE_LEARNER_LAMBDAS = [
     (lambda params: C.adadelta(params), False),
-    (lambda params: C.adam(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)), True),
-    (lambda params: C.fsadagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)), True),
-    (lambda params: C.rmsprop(params, lr=learning_rate_schedule(1, UnitType.minibatch), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8), True),
-    (lambda params: C.sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch)), False)]
+    (lambda params: C.adam(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)), True),
+    (lambda params: C.fsadagrad(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)), True),
+    (lambda params: C.rmsprop(params, lr=learning_parameter_schedule(1), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8), True),
+    (lambda params: C.sgd(params, lr=learning_parameter_schedule(1)), False)]

 @pytest.mark.parametrize("learner, gpu_only", SPARSE_AND_DENSE_LEARNER_LAMBDAS)
 @pytest.mark.parametrize("checkpoint", [True, False])
diff --git a/bindings/python/cntk/ops/functions.py b/bindings/python/cntk/ops/functions.py
index fa8e22899106..e864943a6c8c 100644
--- a/bindings/python/cntk/ops/functions.py
+++ b/bindings/python/cntk/ops/functions.py
@@ -1358,7 +1358,7 @@ def train(self, minibatch_source,
         ...     def criterion(data, label_one_hot):
         ...         z = model(data)  # apply model. Computes a non-normalized log probability for every output class.
         ...         return cntk.cross_entropy_with_softmax(z, label_one_hot)
-        >>> learner = cntk.sgd(model.parameters, cntk.learning_rate_schedule(0.1, cntk.UnitType.minibatch))
+        >>> learner = cntk.sgd(model.parameters, 0.1)
         >>> progress = criterion.train((X, Y), minibatch_size=25, max_epochs=2, epoch_size=125, parameter_learners=[learner])
         >>> print("%.2f" % progress.epoch_summaries[-1].loss)  # get the final epoch's loss value
         0.68
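Note: the docstring change above relies on learners now accepting a plain number as the learning rate (the doctest passes 0.1 directly instead of building a schedule first). A throw-away illustration (not part of the diff; p is made up here):

    import cntk as C

    p = C.parameter(shape=(1,), init=0.0)
    learner = C.sgd([p], 0.1)                              # plain float accepted directly
    explicit = C.sgd([p], C.learning_parameter_schedule(0.1))  # the explicit schedule form used elsewhere in this patch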
diff --git a/bindings/python/cntk/ops/tests/sparse_test.py b/bindings/python/cntk/ops/tests/sparse_test.py
index 18be6ce5b7b1..de169e5707e5 100644
--- a/bindings/python/cntk/ops/tests/sparse_test.py
+++ b/bindings/python/cntk/ops/tests/sparse_test.py
@@ -88,7 +88,7 @@ def create_trainer(use_sparse, device):
         l = C.sequence.input_variable(shape=label_shape, is_sparse=use_sparse, name='label')
         loss = cross_entropy_with_softmax(z, l, axis=-1)

-        trainer = C.Trainer(z, (loss, None), C.sgd(z.parameters, lr=C.learning_rate_schedule(0.7, C.UnitType.sample)))
+        trainer = C.Trainer(z, (loss, None), C.sgd(z.parameters, lr=C.learning_parameter_schedule_per_sample(0.7)))
         return (a, l, w, trainer)

     # Run with sparse inputs
@@ -146,7 +146,7 @@ def create_trainer(use_sparse, device):
         l = C.sequence.input_variable(shape=label_shape, is_sparse=use_sparse, name='label')
         loss = cross_entropy_with_softmax(z, l, axis=-1)

-        trainer = C.Trainer(z, (loss, None), C.sgd(z.parameters, lr=C.learning_rate_schedule(0.7, C.UnitType.sample)))
+        trainer = C.Trainer(z, (loss, None), C.sgd(z.parameters, lr=C.learning_parameter_schedule_per_sample(0.7)))
         return (a, l, w_i, w_h, trainer)

     # Run with sparse inputs
diff --git a/bindings/python/cntk/ops/tests/userfunction_complex_test.py b/bindings/python/cntk/ops/tests/userfunction_complex_test.py
index a3509ece45cf..624ca486bdaf 100644
--- a/bindings/python/cntk/ops/tests/userfunction_complex_test.py
+++ b/bindings/python/cntk/ops/tests/userfunction_complex_test.py
@@ -129,7 +129,7 @@ def mem_leak_check(nonlinearity, num_hidden_layers, device_id,
     np.random.seed(0)

     learning_rate = 0.5
-    lr_schedule = C.learning_rate_schedule(learning_rate)
+    lr_schedule = C.learning_parameter_schedule(learning_rate)

     hidden_layers_dim = 50
diff --git a/bindings/python/cntk/tests/function_test.py b/bindings/python/cntk/tests/function_test.py
index c8b41e86e32f..257c8cc49874 100644
--- a/bindings/python/cntk/tests/function_test.py
+++ b/bindings/python/cntk/tests/function_test.py
@@ -79,7 +79,7 @@ def test_clone_freeze():
     c_copies = [q.value for q in c_clone.constants]

     # update z
-    trainer = C.Trainer(z, C.squared_error(z, label), C.sgd(z.parameters, C.learning_rate_schedule(1.0, C.UnitType.minibatch)))
+    trainer = C.Trainer(z, C.squared_error(z, label), C.sgd(z.parameters, C.learning_parameter_schedule(1.0)))
     x = np.random.randn(16,3).astype('f')
     y = np.random.randn(16,5).astype('f')
     trainer.train_minibatch({features: x, label: y})
diff --git a/bindings/python/cntk/tests/persist_test.py b/bindings/python/cntk/tests/persist_test.py
index cf37344aeb32..9d5097fbc1c5 100644
--- a/bindings/python/cntk/tests/persist_test.py
+++ b/bindings/python/cntk/tests/persist_test.py
@@ -192,7 +192,7 @@ def _checkall(f, v):
     x = C.input_variable(10)
     f = C.layers.BatchNormalization()(x)

-    trainer = C.Trainer(f, C.reduce_sum(f), C.sgd(f.parameters, C.learning_rate_schedule(0.1, 'sample')))
+    trainer = C.Trainer(f, C.reduce_sum(f), C.sgd(f.parameters, C.learning_parameter_schedule_per_sample(0.1)))

     model_filename = str(tmpdir / 'function.out')
     checkpoint_filename = str(tmpdir / 'checkpoint.out')
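Note: the two constructors seen above and below mirror the substitutions made throughout this patch; learning_parameter_schedule replaces the old UnitType.minibatch form and learning_parameter_schedule_per_sample replaces the old UnitType.sample form. Side by side, for illustration only (not part of the diff):

    import cntk as C

    lr_per_minibatch = C.learning_parameter_schedule(1.0)           # replaces learning_rate_schedule(1.0, UnitType.minibatch)
    lr_per_sample = C.learning_parameter_schedule_per_sample(0.1)   # replaces learning_rate_schedule(0.1, UnitType.sample)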
diff --git a/bindings/python/cntk/train/tests/training_session_test.py b/bindings/python/cntk/train/tests/training_session_test.py
index 00c45e69c71e..3611f31aa42e 100644
--- a/bindings/python/cntk/train/tests/training_session_test.py
+++ b/bindings/python/cntk/train/tests/training_session_test.py
@@ -97,7 +97,7 @@ def mb_source(tmpdir, fileprefix, max_samples=FULL_DATA_SWEEP, ctf=ctf_data, str

 def create_sample_model(device, writer=None,
-                        lr_per_sample=C.learning_rate_schedule([0.3, 0.2, 0.1, 0.0], C.UnitType.sample)):
+                        lr_per_sample=C.learning_parameter_schedule_per_sample([0.3, 0.2, 0.1, 0.0])):
     in1 = sequence.input_variable(shape=(input_dim,))
     labels = sequence.input_variable(shape=(input_dim,))
     p = parameter(shape=(input_dim,), init=10, device=device)
@@ -428,7 +428,7 @@ def test_session_progress_print_on_sweep_unit(tmpdir, device_id):
     device = cntk_device(device_id)
     writer = MockProgressWriter()
     #set to a higher learning rate as we don't need to have converge but just to go through all the samples
-    t, feature, label = create_sample_model(device, writer, lr_per_sample=C.learning_rate_schedule(0.3, C.UnitType.sample))
+    t, feature, label = create_sample_model(device, writer, lr_per_sample=C.learning_parameter_schedule_per_sample(0.3))
     mbs = mb_source(tmpdir, "training",
                     #max_samples=INFINITELY_REPEAT,
                     max_sweeps = 4)
diff --git a/bindings/python/doc/simplenet.py b/bindings/python/doc/simplenet.py
index 77dac488c6bd..55741a516865 100644
--- a/bindings/python/doc/simplenet.py
+++ b/bindings/python/doc/simplenet.py
@@ -1,7 +1,7 @@
 from __future__ import print_function
 import numpy as np
 import cntk as C
-from cntk.learners import sgd, learning_rate_schedule, UnitType
+from cntk.learners import sgd
 from cntk.logging import ProgressPrinter
 from cntk.layers import Dense, Sequential
@@ -38,7 +38,7 @@ def ffnet():
     pe = C.classification_error(z, label)

     # Instantiate the trainer object to drive the model training
-    lr_per_minibatch = learning_rate_schedule(0.125, UnitType.minibatch)
+    lr_per_minibatch = C.learning_parameter_schedule(0.125)
     progress_printer = ProgressPrinter(0)
     trainer = C.Trainer(z, (ce, pe), [sgd(z.parameters, lr=lr_per_minibatch)], [progress_printer])
diff --git a/bindings/python/doc/simplernn.py b/bindings/python/doc/simplernn.py
index e24fa1c8f940..3f8be483497f 100644
--- a/bindings/python/doc/simplernn.py
+++ b/bindings/python/doc/simplernn.py
@@ -3,7 +3,7 @@
 from cntk import Trainer, Axis
 from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs,\
     INFINITELY_REPEAT
-from cntk.learners import sgd, learning_rate_schedule, UnitType
+from cntk.learners import sgd, learning_parameter_schedule_per_sample
 from cntk import input_variable, cross_entropy_with_softmax, \
     classification_error, sequence
 from cntk.logging import ProgressPrinter
@@ -58,7 +58,7 @@ def train_sequence_classifier():
         label: reader.streams.labels
     }
-    lr_per_sample = learning_rate_schedule(0.0005, UnitType.sample)
+    lr_per_sample = learning_parameter_schedule_per_sample(0.0005)
     # Instantiate the trainer object to drive the model training
     progress_printer = ProgressPrinter(0)
     trainer = Trainer(classifier_output, (ce, pe),