1) use the last iteration of coordinate descent, 2) fix variational inference and dirichlet priors, 3) less logging.
Waleed Ammar committed Apr 17, 2015
1 parent 08f08b7 commit 022ab61
Showing 7 changed files with 199 additions and 238 deletions.
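The commit message says it fixes variational inference with Dirichlet priors; the actual change lives in core/LatentCrfModel.cc, whose diff is not rendered below. For context only, here is a minimal sketch (not the repository's code) of the standard mean-field update for multinomial parameters under a symmetric Dirichlet prior with concentration alpha: the plain normalized expected count is replaced by an exponentiated-digamma estimate. The function name VariationalUpdate and the flat std::map representation of counts are illustrative assumptions.

// sketch.cc -- illustration only, NOT from LatentCrfModel.cc.
// Given expected counts c(z,x) from the E-step, batch EM sets
//   theta(x|z) = c(z,x) / sum_x' c(z,x'),
// while the variational (Dirichlet-prior) update uses
//   theta(x|z) proportional to exp( psi(alpha + c(z,x)) - psi(sum_x' (alpha + c(z,x'))) ),
// where psi is the digamma function.
#include <boost/math/special_functions/digamma.hpp>
#include <map>
#include <cmath>

std::map<int, double> VariationalUpdate(const std::map<int, double>& counts,
                                        double alpha) {
  double total = 0.0;
  for (const auto& kv : counts) total += alpha + kv.second;
  std::map<int, double> theta;
  for (const auto& kv : counts) {
    theta[kv.first] =
        std::exp(boost::math::digamma(alpha + kv.second) -
                 boost::math::digamma(total));
  }
  return theta;
}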
2 changes: 0 additions & 2 deletions alignment/train-latentCrfAligner.cc
@@ -395,10 +395,8 @@ void endOfKIterationsCallbackFunction() {
LatentCrfAligner &aligner = *( (LatentCrfAligner*) model );

// fix learningInfo.test_size
- cerr << "firstKExamplesToLabel = " << aligner.learningInfo.firstKExamplesToLabel << endl;
if(aligner.learningInfo.firstKExamplesToLabel <= 0) {
aligner.learningInfo.firstKExamplesToLabel = aligner.examplesCount;
- cerr << "firstKExamplesToLabel = " << aligner.learningInfo.firstKExamplesToLabel << endl;
}

// find viterbi alignment for the top K examples of the training set (i.e. our test set)
335 changes: 103 additions & 232 deletions core/LatentCrfModel.cc

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions core/LatentCrfModel.h
@@ -187,6 +187,9 @@ class LatentCrfModel : public UnsupervisedSequenceTaggingModel {
void ReduceMleAndMarginals(MultinomialParams::ConditionalMultinomialParam<int64_t> &mleGivenOneLabel,
boost::unordered_map<int64_t, double> &mleMarginalsGivenOneLabel);

+ void AllReduceMleAndMarginals(MultinomialParams::ConditionalMultinomialParam<int64_t> &mleGivenOneLabel,
+ boost::unordered_map<int64_t, double> &mleMarginalsGivenOneLabel);

void BroadcastTheta(unsigned rankId);

// filenames
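The new AllReduceMleAndMarginals declaration pairs with the existing ReduceMleAndMarginals; the point of an all-reduce is that every MPI rank ends up with the summed expected counts, so no separate broadcast of the result is needed. Below is a minimal sketch of that collective, assuming Boost.MPI and counts flattened into a dense vector; the actual member operates on ConditionalMultinomialParam maps and its body is in the LatentCrfModel.cc diff, which is not rendered here.

// sketch.cc -- illustration only, not LatentCrfModel.cc. Every rank
// contributes its local expected counts and receives the element-wise
// sum over all ranks (unlike a plain reduce, where only one rank would
// hold the total and a broadcast would have to follow).
#include <boost/mpi.hpp>
#include <vector>
#include <functional>

void AllReduceCounts(const boost::mpi::communicator& world,
                     std::vector<double>& localCounts) {
  std::vector<double> globalCounts(localCounts.size(), 0.0);
  boost::mpi::all_reduce(world, localCounts.data(),
                         static_cast<int>(localCounts.size()),
                         globalCounts.data(), std::plus<double>());
  localCounts.swap(globalCounts);  // every rank now holds the global counts
}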
24 changes: 24 additions & 0 deletions ducttape-files/en-fi-exp000.tconf
@@ -0,0 +1,24 @@
import en-fi.tconf

global {
# Output directory:
ducttape_output="/usr3/home/wammar/mt-systems/en-fi-exp000"

# initial autoencoder params
init_theta_fwd=""
init_lambda_fwd=""
init_theta_bwd=""
init_lambda_bwd=""

# general
cores=16

# prefix; must be unique for concurrent runs
output_prefix="exp000"

}

plan Full {
#reach AutoencoderAlignT2S
reach Evaluate via (TuneOrTest: tune test) * (UseCustomLM: no) * (Aligner: giza) * (L2: point_o_one) * (DirichletAlpha: one_point_five) * (PrecomputedFeatures: dyer11) * (OptimizeLambdasFirst: yes) * (UseOtherAligners: yes) * (EmItercount: one) * (SymHeuristic: grow_diag_final_and ) * (MaxLambdaEpochCount: one) * (LambdaOptimizer: lbfgs sgd) * (ThetaOptimizer: em)
}
37 changes: 37 additions & 0 deletions ducttape-files/en-fi-paths.tconf
@@ -0,0 +1,37 @@
global {

train_corpus="/usr3/home/wammar/corpora/parallel/en-fi.short.en-fi"
tune_corpus="/usr3/home/wammar/corpora/parallel/en-fi.wmt-dev"
test_corpus="/usr3/home/wammar/corpora/parallel/en-fi.wmt-devtest"
src_brown_clusters="/usr1/home/wammar/parallel/english/news-commentary10.cz-en.en.tok.brown80"
tgt_brown_clusters="/usr3/home/wammar/brown-clusters/wmt_mono+parallel+dev+devtest-c100-p1.out/paths"

# only specify when you want to reuse a previously built LM
language_model=""
# only use these two parameters if you want to build a language model. if you have a language model already built, specify "language_model=" instead
lm_order=4
lm_data="/usr3/home/wammar/corpora/monolingual/finnish/wmt_mono+parallel+dev+devtest.tok"

# tool paths
cdec_dir="/home/wammar/cdec/"
multeval="/home/wammar/git/multeval/multeval.sh"
giza_bin="/opt/tools/mgizapp-0.7.2/bin"
moses_train_script="/home/wammar/git/mosesdecoder/scripts/training/train-model.perl"
mkcls_bin="/mal0/tools/mosesdecoder/bin/mkcls"
wammar_utils_dir="/home/wammar/wammar-utils"
alignment_with_openfst_dir="/home/wammar/online_em/alignment-with-openfst/"
kenlm_dir="/home/wammar/git/kenlm"

# aer
conv_pharaoh_script="/home/wammar/alignment-with-openfst/data/hansards/conv-pharaoh.pl"
aer_eval_script=""
gold_alignment=""

# other aligner outputs
fwd_giza_alignments=""
bwd_giza_alignments=""
sym_giza_alignments=""
fwd_fast_alignments=""
bwd_fast_alignments=""
sym_fast_alignments=""
}
23 changes: 23 additions & 0 deletions ducttape-files/en-fi.tconf
@@ -0,0 +1,23 @@
import en-fi-paths.tconf

global {

# UseCustomAlignment controls whether to retrain word alignment from the train_corpus, or just use
# alignments at 'alignment'
alignment="/dev/null"

# preprocess data
tokenize_corpus="false"
lowercase_corpus="false"
max_sentence_length=0

# language specific
src="en"
tgt="fi"
meteor_lang="english"

# aer
test_sents_count=0

}

13 changes: 9 additions & 4 deletions ducttape-files/hiero-local.tape
@@ -22,6 +22,7 @@ global {
intersect="intersect"
union="union")
lambda_optimizer=(LambdaOptimizer: sgd="sgd" lbfgs="lbfgs")
+ theta_optimizer=(ThetaOptimizer: em="em" online_em="online_em")
}

#import ../submitters.tape
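The new ThetaOptimizer branch point above selects between batch EM (em) and online EM (online_em) for the theta parameters, and the hunks below thread it through to a --theta-optimizer flag on the aligner command line. As a rough illustration of what the online variant typically does, here is a generic stepwise-EM interpolation of sufficient statistics in the style of Liang and Klein (2009); the function name, decay constant, and vector representation are assumptions, not code from this repository.

// sketch.cc -- illustration only. Batch EM recomputes theta from full-corpus
// expected counts once per iteration; stepwise online EM instead interpolates
// the running sufficient statistics with each mini-batch's statistics and
// re-normalizes theta after every update.
#include <vector>
#include <cmath>

void StepwiseEmUpdate(std::vector<double>& s,          // running statistics
                      const std::vector<double>& sHat, // mini-batch statistics
                      unsigned k,                      // mini-batches seen so far
                      double decay = 0.7) {
  double eta = std::pow(k + 2.0, -decay);  // step size, decay in (0.5, 1]
  for (size_t i = 0; i < s.size(); ++i)
    s[i] = (1.0 - eta) * s[i] + eta * sHat[i];
}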
@@ -239,6 +240,7 @@ task AutoencoderAlignS2T
:: tgt_brown_clusters=@
:: output_prefix=@
:: lambda_optimizer=@
+ :: theta_optimizer=@
< executable=$executable@BuildLatentCrfAligner
< wordpair_feats_file=$wordpair_feats_file@GenerateWordpairFeatsS2T
> alignment
@@ -257,6 +259,7 @@ task AutoencoderAlignS2T
--test-size $test_sents_count
--max-model1-iter-count $model1_itercount
--lambda-optimizer $lambda_optimizer
+ --theta-optimizer $theta_optimizer
--max-iter-count 10"

# --feat SRC0_TGT0
@@ -369,6 +372,7 @@ task AutoencoderAlignT2S
:: use_src_bigrams=@
:: src_brown_clusters=@
:: lambda_optimizer=@
+ :: theta_optimizer=@
< wordpair_feats_file=$wordpair_feats_file@GenerateWordpairFeatsT2S
< executable=$executable@BuildLatentCrfAligner
> alignment
@@ -377,7 +381,7 @@ task AutoencoderAlignT2S
{

model1_itercount="5"
- variational="true"
+ variational=""

# the latent-CRF word alignment mode
command="mpirun -np $procs $executable
@@ -388,7 +392,9 @@ task AutoencoderAlignT2S
--reverse true
--max-model1-iter-count $model1_itercount
--lambda-optimizer $lambda_optimizer
+ --theta-optimizer $theta_optimizer
--max-iter-count 10"

#--tgt-word-classes-filename $src_brown_clusters
# --feat SRC0_TGT0
# --feat SYNC_START --feat SYNC_END
@@ -579,7 +585,6 @@ task Tune
# :: .submitter=torque_normal .walltime="48:00:00" .cpus=32 .vmem=60g .q=normal
{


ref_count=$(head -n 1 $tune_set | grep -o '|||' | wc -l)
#cut -f 1 -d '|' $tune_set > ./tune_source
for i in `seq 1 $ref_count`; do
@@ -606,8 +611,8 @@ task Tune
echo "Glue 0.0" >> $initial_weights
echo "WordPenalty 0.0" >> $initial_weights

- $cdec_dir/training/mira/mira.py -j $cores --update-size 250 -o $mira_work --weights ./initial_weights \
-   --devset $tune_set --config $cdec_ini
+ $cdec_dir/training/mira/mira.py --jobs $cores --kbest-size 500 -o $mira_work --weights ./initial_weights \
+   -d $tune_set --config $cdec_ini --step-size 0.001

index=$(ls $mira_work | grep '^weights\.[0-9]*$' | sed 's/^weights\.//' | sort -n | tail -n 1)
ln -s $mira_work/weights.$index $optimized_weights
