Merge pull request #115 from ldmt-muri/sgd
Sgd
Waleed Ammar committed Apr 7, 2015
2 parents d3ab541 + 76bd371 commit 29f28d5
Showing 11 changed files with 773 additions and 589 deletions.
7 changes: 0 additions & 7 deletions alignment/IbmModel1.cc
@@ -45,15 +45,13 @@ void IbmModel1::CoreConstructor(const string& bitextFilename,
PersistParams(initialModelFilename.str());

// create the initial grammar FST
cerr << "create grammar fst" << endl;
CreateGrammarFst();

}

void IbmModel1::Train() {

// create tgt fsts
cerr << "create tgt fsts" << endl;
vector< VectorFst <FstUtils::LogArc> > tgtFsts;
CreateTgtFsts(tgtFsts);

@@ -303,11 +301,6 @@ void IbmModel1::LearnParameters(vector< VectorFst< FstUtils::LogArc > >& tgtFsts
} else {
logLikelihood += fSentLogLikelihood;
}

// logging
if (++sentsCounter % 1000 == 0) {
cerr << sentsCounter << " sents processed. iterationLoglikelihood = " << logLikelihood << endl;
}
}

// normalize fractional counts such that \sum_t p(t|s) = 1 \forall s
66 changes: 49 additions & 17 deletions alignment/train-latentCrfAligner.cc
@@ -79,13 +79,16 @@ bool ParseParameters(int argc, char **argv, string &textFilename,
L1_STRENGTH = "l1-strength",
MAX_ITER_COUNT = "max-iter-count",
MIN_RELATIVE_DIFF = "min-relative-diff",
MAX_LBFGS_ITER_COUNT = "max-lbfgs-iter-count",
MAX_LAMBDA_UPDATES_EPOCH_COUNT = "max-lambda-updates-epoch-count",
//MAX_ADAGRAD_ITER_COUNT = "max-adagrad-iter-count",
MAX_EM_ITER_COUNT = "max-em-iter-count",
MAX_MODEL1_ITER_COUNT = "max-model1-iter-count",
NO_DIRECT_DEP_BTW_HIDDEN_LABELS = "no-direct-dep-btw-hidden-labels",
CACHE_FEATS = "cache-feats",
OPTIMIZER = "optimizer",
LAMBDA_OPTIMIZER = "lambda-optimizer",
LAMBDA_OPTIMIZER_LEARNING_RATE = "lambda-learning-rate",
LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY = "lambda-optimizer-learning-rate-decay-strategy",
LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_PARAMETER = "lambda-optimizer-learning-rate-decay-parameter",
MINIBATCH_SIZE = "minibatch-size",
LOGLINEAR_OPT_FIX_Z_GIVEN_X = "loglinear-opt-fix-z-given-x",
DIRICHLET_ALPHA = "dirichlet-alpha",
@@ -112,12 +115,15 @@ bool ParseParameters(int argc, char **argv, string &textFilename,
(L1_STRENGTH.c_str(), po::value<float>()->default_value(0.0), "(double) strength of an l1 regularizer")
(MAX_ITER_COUNT.c_str(), po::value<int>(&learningInfo.maxIterationsCount)->default_value(50), "(int) max number of coordinate descent iterations after which the model is assumed to have converged")
(MIN_RELATIVE_DIFF.c_str(), po::value<float>(&learningInfo.minLikelihoodRelativeDiff)->default_value(0.03), "(double) convergence threshold for the relative difference between the objective value in two consecutive coordinate descent iterations")
(MAX_LBFGS_ITER_COUNT.c_str(), po::value<int>(&learningInfo.optimizationMethod.subOptMethod->lbfgsParams.maxIterations)->default_value(6), "(int) quit LBFGS optimization after this many iterations")
(MAX_LAMBDA_UPDATES_EPOCH_COUNT.c_str(), po::value<int>(&learningInfo.optimizationMethod.subOptMethod->epochs)->default_value(1), "(int) maximum number of epochs (full passes over the training set) of lambda updates per block coordinate descent iteration")
//(MAX_ADAGRAD_ITER_COUNT.c_str(), po::value<int>(&learningInfo.optimizationMethod.subOptMethod->adagradParams.maxIterations)->default_value(4), "(int) quit Adagrad optimization after this many iterations")
(MAX_EM_ITER_COUNT.c_str(), po::value<unsigned int>(&learningInfo.emIterationsCount)->default_value(3), "(int) quit EM optimization after this many iterations")
(NO_DIRECT_DEP_BTW_HIDDEN_LABELS.c_str(), "(flag) consecutive labels are independent given observation sequence")
(CACHE_FEATS.c_str(), po::value<bool>(&learningInfo.cacheActiveFeatures)->default_value(false), "(flag) (set by default) maintains and uses a map from a factor to its active features to speed up training, at the expense of higher memory requirements.")
(OPTIMIZER.c_str(), po::value<string>(), "(string) optimization algorithm to use for updating loglinear parameters")
(LAMBDA_OPTIMIZER.c_str(), po::value<string>()->default_value("sgd"), "(string) optimization algorithm to use for optimizing the CRF parameters. Supported values are: 'lbfgs', 'sgd', 'adagrad'. L-BFGS is a popular quasi-Newton optimization algorithm, SGD is stochastic gradient descent, and ADAGRAD is the adaptive gradient algorithm described at http://www.magicbroom.info/Papers/DuchiHaSi10.pdf")
(LAMBDA_OPTIMIZER_LEARNING_RATE.c_str(), po::value<float>(&learningInfo.optimizationMethod.subOptMethod->learningRate)->default_value(1.0), "(float) If the optimizer used for CRF parameters uses a learning rate (e.g., stochastic gradient descent), specify the initial learning rate using this argument. Note that the learning rate decays in subsequent iterations of SGD.")
(LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str(), po::value<string>()->default_value("epoch-fixed"), "(string) Specify which strategy to use for diminishing the learning rate across iterations of stochastic gradient descent. Possible values are 'fixed', 'epoch-fixed', 'bottou', 'geometric'. 'fixed' means that the learning rate is the same for all iterations and equal to the specified initial learning rate. 'epoch-fixed' uses the same learning rate for all updates in an epoch: learning_rate = initial_learning_rate / epoch_index (the epoch index is one-based). 'bottou' uses the learning rate described in section 5.2 of Leon Bottou's article titled 'Stochastic Gradient Descent Tricks'; i.e., learning_rate = initial_learning_rate / (1 + initial_learning_rate * eta * iteration_index) where eta is the specified decay hyperparameter. 'geometric' uses learning_rate = initial_learning_rate / (1 + eta)^iteration_index.")
(LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_PARAMETER.c_str(), po::value<float>(&learningInfo.optimizationMethod.subOptMethod->learningRateDecayParameter)->default_value(0.001), "(float) some decay strategies for the learning rate in stochastic gradient descent use a decay parameter (e.g., 'bottou'). The higher this parameter is, the faster the learning rate decays. Must be greater than zero.")
(MINIBATCH_SIZE.c_str(), po::value<int>(&learningInfo.optimizationMethod.subOptMethod->miniBatchSize)->default_value(0), "(int) minibatch size for optimizing loglinear params. Defaults to zero which indicates batch training.")
(LOGLINEAR_OPT_FIX_Z_GIVEN_X.c_str(), po::value<bool>(&learningInfo.fixPosteriorExpectationsAccordingToPZGivenXWhileOptimizingLambdas)->default_value(false), "(flag) (clear by default) fix the feature expectations according to p(Z|X), which involves both multinomial and loglinear parameters. This speeds up the optimization of loglinear parameters and makes it convex, but it does not have a principled justification.")
(MAX_MODEL1_ITER_COUNT.c_str(), po::value<int>(&maxModel1IterCount)->default_value(15), "(int) (defaults to 15) number of model 1 iterations to use for initializing theta parameters")
@@ -143,11 +149,6 @@ bool ParseParameters(int argc, char **argv, string &textFilename,
return false;
}

if (vm.count(MAX_LBFGS_ITER_COUNT.c_str())) {
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.memoryBuffer =
vm[MAX_LBFGS_ITER_COUNT.c_str()].as<int>();
}

if (vm.count(FEAT.c_str()) == 0) {
cerr << "No features were specified. We will enable src-tgt word pair identities features by default." << endl;
learningInfo.featureTemplates.push_back(FeatureTemplate::SRC0_TGT0);
@@ -207,15 +208,45 @@ bool ParseParameters(int argc, char **argv, string &textFilename,
learningInfo.hiddenSequenceIsMarkovian = false;
}

if(vm.count(OPTIMIZER.c_str())) {
if(vm[OPTIMIZER.c_str()].as<string>() == "adagrad") {
if(vm.count(LAMBDA_OPTIMIZER.c_str())) {
if(vm[LAMBDA_OPTIMIZER.c_str()].as<string>() == "adagrad") {
learningInfo.optimizationMethod.subOptMethod->algorithm = OptAlgorithm::ADAGRAD;
} else if (vm[LAMBDA_OPTIMIZER.c_str()].as<string>() == "sgd") {
learningInfo.optimizationMethod.subOptMethod->algorithm = OptAlgorithm::SGD;
} else if (vm[LAMBDA_OPTIMIZER.c_str()].as<string>() == "lbfgs") {
learningInfo.optimizationMethod.subOptMethod->algorithm = OptAlgorithm::LBFGS;
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.maxIterations =
learningInfo.optimizationMethod.subOptMethod->epochs;
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.memoryBuffer =
learningInfo.optimizationMethod.subOptMethod->epochs;
} else {
cerr << "option --optimizer cannot take the value " << vm[OPTIMIZER.c_str()].as<string>() << endl;
cerr << "option --lambda-optimizer cannot take the value " << vm[LAMBDA_OPTIMIZER.c_str()].as<string>() << endl;
return false;
}
}

// If the minibatch size is not specified while using stochastic gradient descent,
// assume a minibatch size of 1.
if (learningInfo.optimizationMethod.subOptMethod->algorithm == OptAlgorithm::SGD &&
learningInfo.optimizationMethod.subOptMethod->miniBatchSize == 0) {
learningInfo.optimizationMethod.subOptMethod->miniBatchSize = 1;
}

if(vm.count(LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str())) {
if(vm[LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str()].as<string>() == "fixed") {
learningInfo.optimizationMethod.subOptMethod->learningRateDecayStrategy = DecayStrategy::FIXED;
} else if(vm[LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str()].as<string>() == "epoch-fixed") {
learningInfo.optimizationMethod.subOptMethod->learningRateDecayStrategy = DecayStrategy::EPOCH_FIXED;
} else if(vm[LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str()].as<string>() == "bottou") {
learningInfo.optimizationMethod.subOptMethod->learningRateDecayStrategy = DecayStrategy::BOTTOU;
} else if(vm[LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str()].as<string>() == "geometric") {
learningInfo.optimizationMethod.subOptMethod->learningRateDecayStrategy = DecayStrategy::GEOMETRIC;
} else {
cerr << "option --lambda-optimizer-learning-rate-decay-strategy cannot take the value " << vm[LAMBDA_OPTIMIZER_LEARNING_RATE_DECAY_STRATEGY.c_str()].as<string>() << endl;
return false;
}
}
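
As a minimal, standalone sketch (not the aligner's actual update code; the gradient callback and all concrete values are hypothetical), this is roughly how the settings parsed above -- epochs, minibatch size, initial learning rate, and a decay schedule -- fit together in minibatch SGD:

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// Hypothetical sketch only: a minibatch SGD loop driven by options like the ones
// parsed above. The callback stands in for the CRF minibatch gradient computation.
void MinibatchSgdSketch(
    std::vector<double>& lambdas, std::size_t sentCount, int epochs,
    std::size_t miniBatchSize, double initialLearningRate,
    const std::function<std::vector<double>(std::size_t from, std::size_t count)>& minibatchGradient) {
  for (int epoch = 1; epoch <= epochs; ++epoch) {
    // 'epoch-fixed' decay for illustration; the other strategies are sketched in
    // core/BasicTypes.h below.
    double learningRate = initialLearningRate / epoch;
    for (std::size_t from = 0; from < sentCount; from += miniBatchSize) {
      std::size_t count = std::min(miniBatchSize, sentCount - from);
      // gradient of the regularized objective over sentences [from, from + count)
      std::vector<double> gradient = minibatchGradient(from, count);
      for (std::size_t i = 0; i < lambdas.size(); ++i) {
        lambdas[i] -= learningRate * gradient[i];  // gradient step on the CRF parameters
      }
    }
  }
}
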

// logging
if(learningInfo.mpiWorld->rank() == 0) {
cerr << "program options are as follows:" << endl;
@@ -236,12 +267,12 @@ bool ParseParameters(int argc, char **argv, string &textFilename,
cerr << L1_STRENGTH << "=" << vm[L1_STRENGTH.c_str()].as<float>() << endl;
cerr << MAX_ITER_COUNT << "=" << learningInfo.maxIterationsCount << endl;
cerr << MIN_RELATIVE_DIFF << "=" << learningInfo.minLikelihoodRelativeDiff << endl;
cerr << MAX_LBFGS_ITER_COUNT << "=" << learningInfo.optimizationMethod.subOptMethod->lbfgsParams.maxIterations << endl;
cerr << MAX_LAMBDA_UPDATES_EPOCH_COUNT << "=" << learningInfo.optimizationMethod.subOptMethod->epochs << endl;
cerr << MAX_EM_ITER_COUNT << "=" << learningInfo.emIterationsCount << endl;
cerr << NO_DIRECT_DEP_BTW_HIDDEN_LABELS << "=" << !learningInfo.hiddenSequenceIsMarkovian << endl;
cerr << CACHE_FEATS << "=" << learningInfo.cacheActiveFeatures << endl;
if(vm.count(OPTIMIZER.c_str())) {
cerr << OPTIMIZER << "=" << vm[OPTIMIZER.c_str()].as<string>() << endl;
if(vm.count(LAMBDA_OPTIMIZER.c_str())) {
cerr << LAMBDA_OPTIMIZER << "=" << vm[LAMBDA_OPTIMIZER.c_str()].as<string>() << endl;
}
cerr << MINIBATCH_SIZE << "=" << learningInfo.optimizationMethod.subOptMethod->miniBatchSize << endl;
cerr << LOGLINEAR_OPT_FIX_Z_GIVEN_X << "=" << learningInfo.fixPosteriorExpectationsAccordingToPZGivenXWhileOptimizingLambdas << endl;
@@ -405,12 +436,13 @@ int main(int argc, char **argv) {
// block coordinate descent
learningInfo.optimizationMethod.algorithm = OptAlgorithm::BLOCK_COORD_DESCENT;
// lbfgs
learningInfo.optimizationMethod.subOptMethod = new OptMethod();
learningInfo.optimizationMethod.subOptMethod->algorithm = OptAlgorithm::LBFGS;
learningInfo.optimizationMethod.subOptMethod = new OptMethod();
learningInfo.optimizationMethod.subOptMethod->miniBatchSize = 0;
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.maxEvalsPerIteration = 4;
learningInfo.optimizationMethod.subOptMethod->moveAwayPenalty = 0.0;
learningInfo.retryLbfgsOnRoundingErrors = true;
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.maxIterations = 6;
learningInfo.optimizationMethod.subOptMethod->lbfgsParams.memoryBuffer = 6;
// thetas
learningInfo.thetaOptMethod = new OptMethod();
learningInfo.thetaOptMethod->algorithm = OptAlgorithm::EXPECTATION_MAXIMIZATION;
38 changes: 31 additions & 7 deletions core/BasicTypes.h
@@ -56,11 +56,25 @@ namespace Regularizer {
}

namespace OptAlgorithm {
enum OptAlgorithm {GRADIENT_DESCENT, STOCHASTIC_GRADIENT_DESCENT,
enum OptAlgorithm {GRADIENT_DESCENT, SGD,
BLOCK_COORD_DESCENT, LBFGS, SIMULATED_ANNEALING, EXPECTATION_MAXIMIZATION,
ADAGRAD};
}

// Specify which strategy to use for diminishing the learning rate across
// iterations of stochastic gradient descent.
// FIXED: learning rate is the same for all iterations and equal to the
// specified value for the initial learning rate.
// EPOCH-FIXED: uses the same learning rate for all updates in the same epoch
// epoch_learning_rate = initial_learning_rate * 1.0 / epoch_index;
// (where the epoch index is one-based).
// BOTTOU: uses the learning rate described in section 5.2 of Leon Bottou's
// article titled 'Stochastic Gradient Descent Tricks'; i.e., learning_rate=
// initial_learning_rate / (1 + initial_learning_rate*eta*iteration_index);
// where eta is the specified decay hyperparameter.
// GEOMETRIC: learning_rate = initial_learning_rate / (1 + eta)^iteration_index;
enum class DecayStrategy {FIXED, EPOCH_FIXED, BOTTOU, GEOMETRIC};
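
A compilable sketch of these four schedules, assuming the DecayStrategy enum declared just above (an illustration, not the repository's implementation):

#include <cassert>
#include <cmath>

// Illustrative only: the learning rate for a given update under each decay strategy.
// epochIndex is one-based; iterationIndex counts individual updates; eta is the
// decay hyperparameter (--lambda-optimizer-learning-rate-decay-parameter).
inline double DecayedLearningRate(DecayStrategy strategy, double initialRate,
                                  double eta, unsigned epochIndex,
                                  unsigned iterationIndex) {
  switch (strategy) {
    case DecayStrategy::FIXED:
      return initialRate;
    case DecayStrategy::EPOCH_FIXED:
      assert(epochIndex >= 1);
      return initialRate / epochIndex;
    case DecayStrategy::BOTTOU:
      return initialRate / (1.0 + initialRate * eta * iterationIndex);
    case DecayStrategy::GEOMETRIC:
      return initialRate / std::pow(1.0 + eta, iterationIndex);
  }
  return initialRate;  // unreachable with a valid strategy
}
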

namespace DebugLevel {
enum DebugLevel {NONE=0, ESSENTIAL=1, CORPUS=2, MINI_BATCH=3, SENTENCE=4, TOKEN=5, REDICULOUS=6, TEMP = 4};
}
@@ -113,28 +127,38 @@ struct OptMethod {
LbfgsParams lbfgsParams;
// if algorithm = ADAGRAD, use these ADAGRAD hyper params
//AdagradParams adagradParams;
// some optimization algorithms require specifying a learning rate (e.g. gradient descent)
// some optimization algorithms require specifying a learning rate (e.g. gradient descent).
// when using stochastic gradient descent, the value specified by this variable is the initial
// learning rate which may or may not be diminished in subsequent iterations, depending on
// learningRateDiminishingStrategy.
float learningRate;
// stochastic = 0 means batch optimization
// stochastic = 1 means online optimization
bool stochastic;
// when stochastic = 1, specifies the minibatch size
// if using stochastic gradient descent, this variable determines the decay strategy for the learning rate.
DecayStrategy learningRateDecayStrategy;
// some decay strategies require a decay parameter
float learningRateDecayParameter;
// when using a stochastic optimization method, use this variable to specify the mini-batch size.
int miniBatchSize;
// regularization details
Regularizer::Regularizer regularizer;
float regularizationStrength;
// move-away from previous weights penalty
float moveAwayPenalty;
// maximum number of epochs (1 epoch = full pass on the training set) this
// algorithm is allowed to make to update lambdas in one iteration of block
// coordinate descent.
int epochs;

OptMethod() {
stochastic = false;
algorithm = OptAlgorithm::GRADIENT_DESCENT;
learningRate = 0.01;
learningRateDecayParameter = 1.0;
learningRateDecayStrategy = DecayStrategy::EPOCH_FIXED;
miniBatchSize = 1;
regularizer = Regularizer::NONE;
regularizationStrength = 1000;
subOptMethod = 0;
moveAwayPenalty = 1.0;
epochs = 1;
}
};
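
For reference, a hedged example of configuring the sub-optimizer for SGD directly in code, mirroring what the --lambda-optimizer=sgd path in train-latentCrfAligner.cc does above (the specific values are illustrative, not recommendations):

// Illustration only; values are arbitrary.
OptMethod* subOptMethod = new OptMethod();          // defaults: rate 0.01, epoch-fixed decay, minibatch 1
subOptMethod->algorithm = OptAlgorithm::SGD;
subOptMethod->learningRate = 1.0;                   // initial learning rate
subOptMethod->learningRateDecayStrategy = DecayStrategy::BOTTOU;
subOptMethod->learningRateDecayParameter = 0.001;   // eta in Bottou's schedule
subOptMethod->miniBatchSize = 10;                   // 0 would mean batch training
subOptMethod->epochs = 2;                           // lambda-update epochs per coordinate descent iteration
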

(diff of the remaining 8 changed files not shown)
