Merge pull request #115 from ldmt-muri/sgd
Sgd
Waleed Ammar committed Apr 7, 2015
2 parents d3ab541 + 76bd371 commit 29f28d5
Showing 11 changed files with 773 additions and 589 deletions.
7 changes: 0 additions & 7 deletions alignment/IbmModel1.cc
@@ -45,15 +45,13 @@ void IbmModel1::CoreConstructor(const string& bitextFilename,
PersistParams(initialModelFilename.str());

// create the initial grammar FST
cerr << "create grammar fst" << endl;
CreateGrammarFst();

}

void IbmModel1::Train() {

// create tgt fsts
cerr << "create tgt fsts" << endl;
vector< VectorFst <FstUtils::LogArc> > tgtFsts;
CreateTgtFsts(tgtFsts);

@@ -303,11 +301,6 @@ void IbmModel1::LearnParameters(vector< VectorFst< FstUtils::LogArc > >& tgtFsts
} else {
logLikelihood += fSentLogLikelihood;
}

// logging
if (++sentsCounter % 1000 == 0) {
cerr << sentsCounter << " sents processed. iterationLoglikelihood = " << logLikelihood << endl;
}
}

// normalize fractional counts such that \sum_t p(t|s) = 1 \forall s
66 changes: 49 additions & 17 deletions alignment/train-latentCrfAligner.cc
@@ -79,13 +79,16 @@ bool ParseParameters(int argc, char **argv, string &textFilename,
L1_STRENGTH = "l1-strength",
MAX_ITER_COUNT = "max-iter-count",
MIN_RELATIVE_DIFF = "min-relative-diff",
MAX_LBFGS_ITER_COUNT = "max-lbfgs-iter-count",
MAX_LAMBDA_UPDATES_EPOCH_COUNT = "max-lambda-updates-epoch-count",
//MAX_ADAGRAD_ITER_COUNT = "max-adagrad-iter-count",
MAX_EM_ITER_COUNT = "max-em-iter-count",
MAX_MODEL1_ITER_COUNT = "max-model1-iter-count",
NO_DIRECT_DEP_BTW_HIDDEN_LABELS = "no-direct-dep-btw-hidden-labels",
CACHE_FEATS = "cache-feats",
OPTIMIZER = "optimizer",
LAMBDA_OPTIMIZER = "lambda-optimizer",
LAMBDA_OPTIMIZER_LEARNING_RATE = "lambda-learning-rate",
LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY = "lambda-optimizer-learning-rate-decay-strategy",
LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_PARAMETER = "lambda-optimizer-learning-rate-decay-parameter",
MINIBATCH_SIZE = "minibatch-size",
LOGLINEAR_OPT_FIX_Z_GIVEN_X = "loglinear-opt-fix-z-given-x",
DIRICHLET_ALPHA = "dirichlet-alpha",
@@ -112,12 +115,15 @@ bool ParseParameters(int argc, char **argv, string &textFilename,
(L1_STRENGTH.c_str(), po::value<float>()->default_value(0.0), "(double) strength of an l1 regularizer")
(MAX_ITER_COUNT.c_str(), po::value<int>(&learningInfo.maxIterationsCount)->default_value(50), "(int) max number of coordinate descent iterations after which the model is assumed to have converged")
(MIN_RELATIVE_DIFF.c_str(), po::value<float>(&learningInfo.minLikelihoodRelativeDiff)->default_value(0.03), "(double) convergence threshold for the relative difference between the objective value in two consecutive coordinate descent iterations")
(MAX_LBFGS_ITER_COUNT.c_str(), po::value<int>(&learningInfo.optimizationMethod.subOptMethod->lbfgsParams.maxIterations)->default_value(6), "(int) quit LBFGS optimization after this many iterations")
(MAX_LAMBDA_UPDATES_EPOCH_COUNT.c_str(), po::value<int>(&learningInfo.optimizationMethod.subOptMethod->epochs)->default_value(1), "(int) maximum number of epochs (full passes over the training set) of lambda updates per block coordinate descent iteration")
//(MAX_ADAGRAD_ITER_COUNT.c_str(), po::value<int>(&learningInfo.optimizationMethod.subOptMethod->adagradParams.maxIterations)->default_value(4), "(int) quit Adagrad optimization after this many iterations")
(MAX_EM_ITER_COUNT.c_str(), po::value<unsigned int>(&learningInfo.emIterationsCount)->default_value(3), "(int) quit EM optimization after this many iterations")
(NO_DIRECT_DEP_BTW_HIDDEN_LABELS.c_str(), "(flag) consecutive labels are independent given observation sequence")
(CACHE_FEATS.c_str(), po::value<bool>(&learningInfo.cacheActiveFeatures)->default_value(false), "(flag) (set by default) maintains and uses a map from a factor to its active features to speed up training, at the expense of higher memory requirements.")
(OPTIMIZER.c_str(), po::value<string>(), "(string) optimization algorithm to use for updating loglinear parameters")
(LAMBDA_OPTIMIZER.c_str(), po::value<string>()->default_value("sgd"), "(string) optimization algorithm to use for optimizing the CRF parameters. Supported values are: 'lbfgs', 'sgd', 'adagrad'. L-BFGS is a popular quasi-Newton optimization algorithm, SGD is stochastic gradient descent, and ADAGRAD is the adaptive gradient algorithm described at http://www.magicbroom.info/Papers/DuchiHaSi10.pdf")
(LAMBDA_OPTIMIZER_LEARNING_RATE.c_str(), po::value<float>(&learningInfo.optimizationMethod.subOptMethod->learningRate)->default_value(1.0), "(float) If the optimizer used for CRF parameters uses a learning rate (e.g., stochastic gradient descent), specify the initial learning rate using this argument. Note that the learning rate decays in subsequent iterations of SGD.")
(LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str(), po::value<string>()->default_value("epoch-fixed"), "(string) Specify which strategy to use for diminishing the learning rate across iterations of stochastic gradient descent. Possible values are 'fixed', 'epoch-fixed', 'bottou', 'geometric'. 'fixed' means that the learning rate is the same for all iterations and equal to the specified initial learning rate. 'epoch-fixed' uses the same learning rate for all updates in an epoch: learning_rate = initial_learning_rate / epoch_index (the epoch index is one-based). 'bottou' uses the learning rate described in section 5.2 of Leon Bottou's article titled 'Stochastic Gradient Descent Tricks'; i.e., learning_rate = initial_learning_rate / (1 + initial_learning_rate * eta * iteration_index) where eta is the specified decay hyperparameter. 'geometric' uses learning_rate = initial_learning_rate / (1 + eta)^iteration_index.")
(LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_PARAMETER.c_str(), po::value<float>(&learningInfo.optimizationMethod.subOptMethod->learningRateDecayParameter)->default_value(0.001), "(float) some decay strategies for the learning rate in stochastic gradient descent use a decay parameter (e.g., 'bottou'). The higher this parameter is, the faster the learning rate decays. Must be greater than zero.")
(MINIBATCH_SIZE.c_str(), po::value<int>(&learningInfo.optimizationMethod.subOptMethod->miniBatchSize)->default_value(0), "(int) minibatch size for optimizing loglinear params. Defaults to zero which indicates batch training.")
(LOGLINEAR_OPT_FIX_Z_GIVEN_X.c_str(), po::value<bool>(&learningInfo.fixPosteriorExpectationsAccordingToPZGivenXWhileOptimizingLambdas)->default_value(false), "(flag) (clear by default) fix the feature expectations according to p(Z|X), which involves both multinomial and loglinear parameters. This speeds up the optimization of loglinear parameters and makes it convex, but it does not have a principled justification.")
(MAX_MODEL1_ITER_COUNT.c_str(), po::value<int>(&maxModel1IterCount)->default_value(15), "(int) (defaults to 15) number of model 1 iterations to use for initializing theta parameters")
@@ -143,11 +149,6 @@ bool ParseParameters(int argc, char **argv, string &textFilename,
return false;
}

if (vm.count(MAX_LBFGS_ITER_COUNT.c_str())) {
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.memoryBuffer =
vm[MAX_LBFGS_ITER_COUNT.c_str()].as<int>();
}

if (vm.count(FEAT.c_str()) == 0) {
cerr << "No features were specified. We will enable src-tgt word pair identities features by default." << endl;
learningInfo.featureTemplates.push_back(FeatureTemplate::SRC0_TGT0);
@@ -207,15 +208,45 @@ bool ParseParameters(int argc, char **argv, string &textFilename,
learningInfo.hiddenSequenceIsMarkovian = false;
}

if(vm.count(OPTIMIZER.c_str())) {
if(vm[OPTIMIZER.c_str()].as<string>() == "adagrad") {
if(vm.count(LAMBDA_OPTIMIZER.c_str())) {
if(vm[LAMBDA_OPTIMIZER.c_str()].as<string>() == "adagrad") {
learningInfo.optimizationMethod.subOptMethod->algorithm = OptAlgorithm::ADAGRAD;
} else if (vm[LAMBDA_OPTIMIZER.c_str()].as<string>() == "sgd") {
learningInfo.optimizationMethod.subOptMethod->algorithm = OptAlgorithm::SGD;
} else if (vm[LAMBDA_OPTIMIZER.c_str()].as<string>() == "lbfgs") {
learningInfo.optimizationMethod.subOptMethod->algorithm = OptAlgorithm::LBFGS;
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.maxIterations =
learningInfo.optimizationMethod.subOptMethod->epochs;
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.memoryBuffer =
learningInfo.optimizationMethod.subOptMethod->epochs;
} else {
cerr << "option --optimizer cannot take the value " << vm[OPTIMIZER.c_str()].as<string>() << endl;
cerr << "option --lambda-optimizer cannot take the value " << vm[LAMBDA_OPTIMIZER.c_str()].as<string>() << endl;
return false;
}
}

// If the minibatch size is not specified while using stochastic gradient descent,
// assume a minibatch size of 1.
if (learningInfo.optimizationMethod.subOptMethod->algorithm == OptAlgorithm::SGD &&
learningInfo.optimizationMethod.subOptMethod->miniBatchSize == 0) {
learningInfo.optimizationMethod.subOptMethod->miniBatchSize = 1;
}

if(vm.count(LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str())) {
if(vm[LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str()].as<string>() == "fixed") {
learningInfo.optimizationMethod.subOptMethod->learningRateDecayStrategy = DecayStrategy::FIXED;
} else if(vm[LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str()].as<string>() == "epoch-fixed") {
learningInfo.optimizationMethod.subOptMethod->learningRateDecayStrategy = DecayStrategy::EPOCH_FIXED;
} else if(vm[LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str()].as<string>() == "bottou") {
learningInfo.optimizationMethod.subOptMethod->learningRateDecayStrategy = DecayStrategy::BOTTOU;
} else if(vm[LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str()].as<string>() == "geometric") {
learningInfo.optimizationMethod.subOptMethod->learningRateDecayStrategy = DecayStrategy::GEOMETRIC;
} else {
cerr << "option --lambda-optimizer-learning-rate-decay-strategy cannot take the value " << vm[LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str()].as<string>() << endl;
return false;
}
}
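
As a minimal, standalone sketch (not the aligner's actual update code; the gradient callback and all concrete values are hypothetical), this is roughly how the settings parsed above -- epochs, minibatch size, initial learning rate, and a decay schedule -- fit together in minibatch SGD:

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// Hypothetical sketch only: a minibatch SGD loop driven by options like the ones
// parsed above. The callback stands in for the CRF minibatch gradient computation.
void MinibatchSgdSketch(
    std::vector<double>& lambdas, std::size_t sentCount, int epochs,
    std::size_t miniBatchSize, double initialLearningRate,
    const std::function<std::vector<double>(std::size_t from, std::size_t count)>& minibatchGradient) {
  for (int epoch = 1; epoch <= epochs; ++epoch) {
    // 'epoch-fixed' decay for illustration; the other strategies are sketched in
    // core/BasicTypes.h below.
    double learningRate = initialLearningRate / epoch;
    for (std::size_t from = 0; from < sentCount; from += miniBatchSize) {
      std::size_t count = std::min(miniBatchSize, sentCount - from);
      // gradient of the regularized objective over sentences [from, from + count)
      std::vector<double> gradient = minibatchGradient(from, count);
      for (std::size_t i = 0; i < lambdas.size(); ++i) {
        lambdas[i] -= learningRate * gradient[i];  // gradient step on the CRF parameters
      }
    }
  }
}
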

// logging
if(learningInfo.mpiWorld->rank() == 0) {
cerr << "program options are as follows:" << endl;
@@ -236,12 +267,12 @@ bool ParseParameters(int argc, char **argv, string &textFilename,
cerr << L1_STRENGTH << "=" << vm[L1_STRENGTH.c_str()].as<float>() << endl;
cerr << MAX_ITER_COUNT << "=" << learningInfo.maxIterationsCount << endl;
cerr << MIN_RELATIVE_DIFF << "=" << learningInfo.minLikelihoodRelativeDiff << endl;
cerr << MAX_LBFGS_ITER_COUNT << "=" << learningInfo.optimizationMethod.subOptMethod->lbfgsParams.maxIterations << endl;
cerr << MAX_LAMBDA_UPDATES_EPOCH_COUNT << "=" << learningInfo.optimizationMethod.subOptMethod->epochs << endl;
cerr << MAX_EM_ITER_COUNT << "=" << learningInfo.emIterationsCount << endl;
cerr << NO_DIRECT_DEP_BTW_HIDDEN_LABELS << "=" << !learningInfo.hiddenSequenceIsMarkovian << endl;
cerr << CACHE_FEATS << "=" << learningInfo.cacheActiveFeatures << endl;
if(vm.count(OPTIMIZER.c_str())) {
cerr << OPTIMIZER << "=" << vm[OPTIMIZER.c_str()].as<string>() << endl;
if(vm.count(LAMBDA_OPTIMIZER.c_str())) {
cerr << LAMBDA_OPTIMIZER << "=" << vm[LAMBDA_OPTIMIZER.c_str()].as<string>() << endl;
}
cerr << MINIBATCH_SIZE << "=" << learningInfo.optimizationMethod.subOptMethod->miniBatchSize << endl;
cerr << LOGLINEAR_OPT_FIX_Z_GIVEN_X << "=" << learningInfo.fixPosteriorExpectationsAccordingToPZGivenXWhileOptimizingLambdas << endl;
@@ -405,12 +436,13 @@ int main(int argc, char **argv) {
// block coordinate descent
learningInfo.optimizationMethod.algorithm = OptAlgorithm::BLOCK_COORD_DESCENT;
// lbfgs
learningInfo.optimizationMethod.subOptMethod = new OptMethod();
learningInfo.optimizationMethod.subOptMethod->algorithm = OptAlgorithm::LBFGS;
learningInfo.optimizationMethod.subOptMethod = new OptMethod();
learningInfo.optimizationMethod.subOptMethod->miniBatchSize = 0;
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.maxEvalsPerIteration = 4;
learningInfo.optimizationMethod.subOptMethod->moveAwayPenalty = 0.0;
learningInfo.retryLbfgsOnRoundingErrors = true;
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.maxIterations = 6;
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.memoryBuffer = 6;
// thetas
learningInfo.thetaOptMethod = new OptMethod();
learningInfo.thetaOptMethod->algorithm = OptAlgorithm::EXPECTATION_MAXIMIZATION;
38 changes: 31 additions & 7 deletions core/BasicTypes.h
@@ -56,11 +56,25 @@ namespace Regularizer {
}

namespace OptAlgorithm {
enum OptAlgorithm {GRADIENT_DESCENT, STOCHASTIC_GRADIENT_DESCENT,
enum OptAlgorithm {GRADIENT_DESCENT, SGD,
BLOCK_COORD_DESCENT, LBFGS, SIMULATED_ANNEALING, EXPECTATION_MAXIMIZATION,
ADAGRAD};
}

// Specify which strategy to use for diminishing the learning rate across
// iterations of stochastic gradient descent.
// FIXED: learning rate is the same for all iterations and equal to the
// specified value for the initial learning rate.
// EPOCH-FIXED: uses the same learning rate for all updates in the same epoch
// epoch_learning_rate = initial_learning_rate * 1.0 / epoch_index;
// (where the epoch index is one-based).
// BOTTOU: uses the learning rate described in section 5.2 of Leon Bottou's
// article titled 'Stochastic Gradient Descent Tricks'; i.e., learning_rate=
// initial_learning_rate / (1 + initial_learning_rate*eta*iteration_index);
// where eta is the specified decay hyperparameter.
// GEOMETRIC: learning_rate = initial_learning_rate / (1 + eta)^iteration_index;
enum class DecayStrategy {FIXED, EPOCH_FIXED, BOTTOU, GEOMETRIC};
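
A compilable sketch of these four schedules, assuming the DecayStrategy enum declared just above (an illustration, not the repository's implementation):

#include <cassert>
#include <cmath>

// Illustrative only: the learning rate for a given update under each decay strategy.
// epochIndex is one-based; iterationIndex counts individual updates; eta is the
// decay hyperparameter (--lambda-optimizer-learning-rate-decay-parameter).
inline double DecayedLearningRate(DecayStrategy strategy, double initialRate,
                                  double eta, unsigned epochIndex,
                                  unsigned iterationIndex) {
  switch (strategy) {
    case DecayStrategy::FIXED:
      return initialRate;
    case DecayStrategy::EPOCH_FIXED:
      assert(epochIndex >= 1);
      return initialRate / epochIndex;
    case DecayStrategy::BOTTOU:
      return initialRate / (1.0 + initialRate * eta * iterationIndex);
    case DecayStrategy::GEOMETRIC:
      return initialRate / std::pow(1.0 + eta, iterationIndex);
  }
  return initialRate;  // unreachable with a valid strategy
}
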

namespace DebugLevel {
enum DebugLevel {NONE=0, ESSENTIAL=1, CORPUS=2, MINI_BATCH=3, SENTENCE=4, TOKEN=5, REDICULOUS=6, TEMP = 4};
}
@@ -113,28 +127,38 @@ struct OptMethod {
LbfgsParams lbfgsParams;
// if algorithm = ADAGRAD, use these ADAGRAD hyper params
//AdagradParams adagradParams;
// some optimization algorithms require specifying a learning rate (e.g. gradient descent)
// some optimization algorithms require specifying a learning rate (e.g. gradient descent).
// when using stochastic gradient descent, the value specified by this variable is the initial
// learning rate which may or may not be diminished in subsequent iterations, depending on
// learningRateDiminishingStrategy.
float learningRate;
// stochastic = 0 means batch optimization
// stochastic = 1 means online optimization
bool stochastic;
// when stochastic = 1, specifies the minibatch size
// if using stochastic gradient descent, this variable determines the decay strategy for the learning rate.
DecayStrategy learningRateDecayStrategy;
// some decay strategies require a decay parameter
float learningRateDecayParameter;
// when using a stochastic optimization method, use this variable to specify the mini-batch size.
int miniBatchSize;
// regularization details
Regularizer::Regularizer regularizer;
float regularizationStrength;
// move-away from previous weights penalty
float moveAwayPenalty;
// maximum number of epochs (1 epoch = full pass on the training set) this
// algorithm is allowed to make to update lambdas in one iteration of block
// coordinate descent.
int epochs;

OptMethod() {
stochastic = false;
algorithm = OptAlgorithm::GRADIENT_DESCENT;
learningRate = 0.01;
learningRateDecayParameter = 1.0;
learningRateDecayStrategy = DecayStrategy::EPOCH_FIXED;
miniBatchSize = 1;
regularizer = Regularizer::NONE;
regularizationStrength = 1000;
subOptMethod = 0;
moveAwayPenalty = 1.0;
epochs = 1;
}
};
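
For reference, a hedged example of configuring the sub-optimizer for SGD directly in code, mirroring what the --lambda-optimizer=sgd path in train-latentCrfAligner.cc does above (the specific values are illustrative, not recommendations):

// Illustration only; values are arbitrary.
OptMethod* subOptMethod = new OptMethod();          // defaults: rate 0.01, epoch-fixed decay, minibatch 1
subOptMethod->algorithm = OptAlgorithm::SGD;
subOptMethod->learningRate = 1.0;                   // initial learning rate
subOptMethod->learningRateDecayStrategy = DecayStrategy::BOTTOU;
subOptMethod->learningRateDecayParameter = 0.001;   // eta in Bottou's schedule
subOptMethod->miniBatchSize = 10;                   // 0 would mean batch training
subOptMethod->epochs = 2;                           // lambda-update epochs per coordinate descent iteration
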

(diff of the remaining 8 changed files not shown)
