Fine-tuning bug fix #51

Merged 8 commits on Jan 24, 2022
Changes from 3 commits
90 changes: 46 additions & 44 deletions Snakefile
@@ -33,6 +33,7 @@ mono_max_sent_trg = config['experiment']['mono-max-sentences-trg']
bicl_default_threshold = config['experiment']['bicleaner']['default-threshold']
bicl_dataset_thresholds = config['experiment']['bicleaner']['dataset-thresholds']
backward_pretrained = config['experiment']['backward-model']
vocab_pretrained = config['experiment']['vocab']

experiment_dir=f"{data_root_dir}/experiments/{src}-{trg}/{experiment}"

@@ -91,14 +92,16 @@ align_dir = f"{data_dir}/alignment"

# models
models_dir = f"{data_root_dir}/models/{src}-{trg}/{experiment}"
teacher_dir = f"{models_dir}/teacher"
teacher_all_dir = f"{models_dir}/teacher-all"
teacher_parallel_dir = f"{models_dir}/teacher-parallel"
Contributor:

From reading the source, I don't understand what teacher_parallel_dir should contain. What is a parallel teacher model?

Collaborator (Author):

teacher-all: the model trained on all available data.
teacher-parallel: an optional model that continues training (fine-tunes) teacher-all on the parallel data only; it is produced when the data was augmented with back-translations.

Would it be easier to understand if I renamed them to teacher and teacher-finetuned?

Contributor:


Yes that would be easier to understand, but this is a very minor point.
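
To make the distinction concrete, the selection logic this PR introduces reduces to the following sketch. It is a condensed, illustrative reading of the Snakefile hunks in this diff, not code from the patch itself; the concrete values are hypothetical.

# Minimal sketch of the teacher selection (hypothetical values; names mirror the diff below).
teacher_all_dir = "models/teacher-all"            # hypothetical example paths
teacher_parallel_dir = "models/teacher-parallel"
mono_trg_datasets = ["news-crawl_news.2020"]      # hypothetical: target-side mono data is configured

if mono_trg_datasets:
    # The corpus is augmented with back-translations: "teacher-all" is trained on
    # everything, then fine-tuned on the parallel data only ("teacher-parallel"),
    # and the fine-tuned model is the one used downstream.
    augment_corpus = True
    final_teacher_dir = teacher_parallel_dir
else:
    # No augmentation: "teacher-all" is already the final teacher.
    augment_corpus = False
    final_teacher_dir = teacher_all_dir

print(augment_corpus, final_teacher_dir)          # True models/teacher-parallel

Downstream rules (translate_corpus, translate_mono_src, evaluate) read final_teacher_dir, so they automatically pick up whichever teacher applies.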

student_dir = f"{models_dir}/student"
student_finetuned_dir = f"{models_dir}/student-finetuned"
speed_dir = f"{models_dir}/speed"
exported_dir = f"{models_dir}/exported"
best_model = f"model.npz.best-{config['experiment']['best-model']}.npz"
backward_dir = f'{models_dir}/backward'
spm_sample_size=config['experiment']['spm-sample-size']
vocab_path=vocab_pretrained or f"{models_dir}/vocab/vocab.spm"

#evaluation
eval_data_dir = f"{original}/eval"
@@ -119,7 +122,7 @@ results = [f'{exported_dir}/model.{src}{trg}.intgemm.alphas.bin.gz',
f'{exported_dir}/lex.50.50.{src}{trg}.s2t.bin.gz',
f'{exported_dir}/vocab.{src}{trg}.spm.gz',
f'{experiment_dir}/config.yml',
*expand(f'{eval_res_dir}/teacher{{ens}}/{{dataset}}.metrics',ens=ensemble, dataset=eval_datasets),
*expand(f'{eval_res_dir}/teacher-all{{ens}}/{{dataset}}.metrics',ens=ensemble, dataset=eval_datasets),
*expand(f'{eval_student_dir}/{{dataset}}.metrics', dataset=eval_datasets),
*expand(f'{eval_student_finetuned_dir}/{{dataset}}.metrics', dataset=eval_datasets),
*expand(f'{eval_speed_dir}/{{dataset}}.metrics', dataset=eval_datasets)
@@ -162,12 +165,11 @@ clean_corpus_trg = f'{clean_corpus_prefix}.{trg}.gz'
if mono_trg_datasets:
teacher_corpus = f'{augmented}/corpus'
augment_corpus = True
continue_teacher = True # continue training on parallel corpus
teacher_all_output = 'model.npz'
final_teacher_dir = teacher_parallel_dir
results.extend(expand(f'{eval_res_dir}/teacher-parallel{{ens}}/{{dataset}}.metrics',ens=ensemble, dataset=eval_datasets))
else:
augment_corpus = False
continue_teacher = False
teacher_all_output = best_model
final_teacher_dir = teacher_all_dir


### helper functions
@@ -381,16 +383,17 @@ rule merge_mono:

# augmentation and teacher training

rule train_vocab:
message: "Training spm vocab"
log: f"{log_dir}/train_vocab.log"
conda: "envs/base.yml"
threads: 2
input: bin=spm_trainer, corpus_src=clean_corpus_src, corpus_trg=clean_corpus_trg
output: f"{models_dir}/vocab/vocab.spm"
params: prefix_train=clean_corpus_prefix,prefix_test=f"{original}/devset"
shell: '''bash pipeline/train/spm-vocab.sh "{input.corpus_src}" "{input.corpus_trg}" "{output}" {spm_sample_size} \
>> {log} 2>&1'''
if not vocab_pretrained:
rule train_vocab:
message: "Training spm vocab"
log: f"{log_dir}/train_vocab.log"
conda: "envs/base.yml"
threads: 2
input: bin=spm_trainer, corpus_src=clean_corpus_src, corpus_trg=clean_corpus_trg
output: vocab_path
params: prefix_train=clean_corpus_prefix,prefix_test=f"{original}/devset"
shell: '''bash pipeline/train/spm-vocab.sh "{input.corpus_src}" "{input.corpus_trg}" "{output}" {spm_sample_size} \
>> {log} 2>&1'''

if do_train_backward:
rule train_backward:
@@ -402,7 +405,7 @@ if do_train_backward:
group: 'backward'
input:
rules.merge_devset.output, train_src=clean_corpus_src,train_trg=clean_corpus_trg,
bin=trainer, vocab=rules.train_vocab.output,
bin=trainer, vocab=vocab_path,
output: model=f'{backward_dir}/{best_model}'
params: prefix_train=clean_corpus_prefix,prefix_test=f"{original}/devset",
args=get_args("training-backward")
@@ -428,7 +431,7 @@ if augment_corpus:
resources: gpu=gpus_num
input:
bin=decoder, file=f'{translated}/mono_trg/file.{{part}}',
vocab=rules.train_vocab.output, model=f'{backward_dir}/{best_model}'
vocab=vocab_path, model=f'{backward_dir}/{best_model}'
output: f'{translated}/mono_trg/file.{{part}}.out'
params: args = get_args("decoding-backward")
shell: '''bash pipeline/translate/translate.sh "{input.file}" "{input.vocab}" {input.model} {params.args} \
@@ -467,35 +470,34 @@ rule teacher_all:
conda: "envs/base.yml"
threads: gpus_num*2
resources: gpu=gpus_num
group: 'teacher{ens}'
input:
rules.merge_devset.output, train_src=f'{teacher_corpus}.{src}.gz',train_trg=f'{teacher_corpus}.{trg}.gz',
bin=trainer, vocab=rules.train_vocab.output
output: model=f'{teacher_dir}{{ens}}/{teacher_all_output}'
params: prefix_train=teacher_corpus, prefix_test=f"{original}/devset", dir=directory(f'{teacher_dir}{{ens}}'),
bin=trainer, vocab=vocab_path
output: model=f'{teacher_all_dir}{{ens}}/{best_model}'
params: prefix_train=teacher_corpus, prefix_test=f"{original}/devset", dir=directory(f'{teacher_all_dir}{{ens}}'),
args=get_args("training-teacher-all")
shell: '''bash pipeline/train/train.sh \
teacher train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \
"{input.vocab}" {params.args} >> {log} 2>&1'''

if continue_teacher:
if augment_corpus:
rule teacher_parallel:
message: "Continue training teacher on parallel corpus"
log: f"{log_dir}/train_teacher_parallel{{ens}}.log"
conda: "envs/base.yml"
threads: gpus_num * 2
resources: gpu=gpus_num
group: 'teacher{ens}'
input:
rules.merge_devset.output, model = f'{teacher_dir}{{ens}}/model.npz',
rules.merge_devset.output, model=f'{teacher_all_dir}{{ens}}/{best_model}',
train_src=clean_corpus_src, train_trg=clean_corpus_trg,
bin=trainer, vocab=rules.train_vocab.output
output: model=protected(f'{teacher_dir}{{ens}}/{best_model}')
params: prefix_train=clean_corpus_prefix,prefix_test=f"{original}/devset",dir=directory(f'{teacher_dir}{{ens}}'),
bin=trainer, vocab=vocab_path
output: model=f'{teacher_parallel_dir}{{ens}}/{best_model}'
params: prefix_train=clean_corpus_prefix, prefix_test=f"{original}/devset",
dir=directory(f'{teacher_parallel_dir}{{ens}}'),
args=get_args("training-teacher-parallel")
shell: '''bash pipeline/train/train.sh \
teacher continue {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \
"{input.vocab}" {params.args} >> {log} 2>&1'''
teacher train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \
"{input.vocab}" --pretrained-model "{input.model}" {params.args} >> {log} 2>&1'''

### translation with teacher

@@ -520,8 +522,8 @@ rule translate_corpus:
input:
decoder,
file=f'{translated}/corpus/file.{{part}}',
vocab=rules.train_vocab.output,
teacher_models=expand(f"{teacher_dir}{{ens}}/{best_model}",ens=ensemble)
vocab=vocab_path,
teacher_models=expand(f"{final_teacher_dir}{{ens}}/{best_model}",ens=ensemble)
output: f'{translated}/corpus/file.{{part}}.nbest'
params: args=get_args('decoding-teacher')
shell: '''bash pipeline/translate/translate-nbest.sh \
@@ -569,8 +571,8 @@ rule translate_mono_src:
resources: gpu=gpus_num
input:
bin=decoder,
file=f'{translated}/mono_src/file.{{part}}',vocab=rules.train_vocab.output,
teacher_models=expand(f"{teacher_dir}{{ens}}/{best_model}",ens=ensemble)
file=f'{translated}/mono_src/file.{{part}}',vocab=vocab_path,
teacher_models=expand(f"{final_teacher_dir}{{ens}}/{best_model}",ens=ensemble)
output: f'{translated}/mono_src/file.{{part}}.out'
params: args=get_args('decoding-teacher')
shell: '''bash pipeline/translate/translate.sh "{input.file}" "{input.vocab}" {input.teacher_models} \
@@ -615,7 +617,7 @@ rule score:
resources: gpu=gpus_num
input:
scorer,
model=rules.train_backward.output.model, vocab=rules.train_vocab.output,
model=f'{backward_dir}/{best_model}', vocab=vocab_path,
src_corpus=rules.merge_translated.output.res_src, trg_corpus=rules.merge_translated.output.res_trg
output: f"{filtered}/scores.txt"
params: input_prefix=f'{merged}/corpus'
@@ -644,7 +646,7 @@ rule alignments:
input:
spm_encoder, spm_exporter,
src_corpus=rules.ce_filter.output.src_corpus,trg_corpus=rules.ce_filter.output.trg_corpus,
vocab=rules.train_vocab.output,
vocab=vocab_path,
fast_align=rules.fast_align.output.fast_align, atools=rules.fast_align.output.atools,
extract_lex=rules.extract_lex.output
output: alignment=f'{align_dir}/corpus.aln.gz',shortlist=f'{align_dir}/lex.s2t.pruned.gz'
@@ -663,7 +665,7 @@ rule student:
rules.merge_devset.output, trainer,
train_src=rules.ce_filter.output.src_corpus, train_trg=rules.ce_filter.output.trg_corpus,
alignments=rules.alignments.output.alignment,
vocab=rules.train_vocab.output
vocab=vocab_path
output: model=f'{student_dir}/{best_model}'
params: prefix_train=rules.ce_filter.params.output_prefix,prefix_test=f"{original}/devset",
args=get_args("training-student")
@@ -684,13 +686,13 @@ rule finetune_student:
rules.merge_devset.output, trainer,
train_src=rules.ce_filter.output.src_corpus, train_trg=rules.ce_filter.output.trg_corpus,
alignments=rules.alignments.output.alignment, student_model=rules.student.output.model,
vocab=rules.train_vocab.output
vocab=vocab_path
output: model=f'{student_finetuned_dir}/{best_model}'
params: prefix_train=rules.ce_filter.params.output_prefix,prefix_test=f"{original}/devset",
args=get_args("training-student-finetune")
shell: '''bash pipeline/train/train-student.sh \
"{input.alignments}" student finetune {src} {trg} "{params.prefix_train}" "{params.prefix_test}" \
"{student_finetuned_dir}" "{input.vocab}" {params.args} >> {log} 2>&1'''
"{student_finetuned_dir}" "{input.vocab}" --pretrained-model "{input.student_model}" {params.args} >> {log} 2>&1'''

rule quantize:
message: "Quantization"
@@ -700,7 +702,7 @@ rule quantize:
input:
bmt_decoder, bmt_converter,
shortlist=rules.alignments.output.shortlist, model=rules.finetune_student.output.model,
vocab=rules.train_vocab.output, devset=f"{original}/devset.{src}.gz"
vocab=vocab_path, devset=f"{original}/devset.{src}.gz"
output: model=f'{speed_dir}/model.intgemm.alphas.bin'
shell: '''bash pipeline/quantize/quantize.sh \
"{input.model}" "{input.vocab}" "{input.shortlist}" "{input.devset}" "{speed_dir}" >> {log} 2>&1'''
@@ -713,7 +715,7 @@ rule export:
threads: 1
input:
model=rules.quantize.output.model,shortlist=rules.alignments.output.shortlist,
vocab=rules.train_vocab.output,marian=bmt_converter
vocab=vocab_path,marian=bmt_converter
output:
model=f'{exported_dir}/model.{src}{trg}.intgemm.alphas.bin.gz',
shortlist=f'{exported_dir}/lex.50.50.{src}{trg}.s2t.bin.gz',
@@ -739,7 +741,7 @@ rule evaluate:
data=multiext(f'{eval_data_dir}/{{dataset}}',f".{src}.gz",f".{trg}.gz"),
models=lambda wildcards: f'{models_dir}/{wildcards.model}/{best_model}'
if wildcards.model != 'teacher-ensemble'
else [f'{teacher_dir}{ens}/{best_model}' for ens in ensemble]
else [f'{final_teacher_dir}{ens}/{best_model}' for ens in ensemble]
output:
report(f'{eval_res_dir}/{{model}}/{{dataset}}.metrics',
category='evaluation', subcategory='{model}', caption='reports/evaluation.rst')
@@ -750,7 +752,7 @@ rule evaluate:
trg_lng=lambda wildcards: trg if wildcards.model != 'backward' else src,
decoder_config=lambda wildcards: f'{models_dir}/{wildcards.model}/{best_model}.decoder.yml'
if wildcards.model != 'teacher-ensemble'
else f'{teacher_dir}0/{best_model}.decoder.yml'
else f'{final_teacher_dir}0/{best_model}.decoder.yml'
shell: '''bash pipeline/eval/eval-gpu.sh "{params.res_prefix}" "{params.dataset_prefix}" \
{params.src_lng} {params.trg_lng} "{params.decoder_config}" {input.models} >> {log} 2>&1'''

@@ -766,7 +768,7 @@ rule eval_quantized:
data=multiext(f'{eval_data_dir}/{{dataset}}',f".{src}.gz",f".{trg}.gz"),
model=rules.quantize.output.model,
shortlist=rules.alignments.output.shortlist,
vocab=rules.train_vocab.output
vocab=vocab_path
output:
report(f'{eval_speed_dir}/{{dataset}}.metrics', category='evaluation',
subcategory='quantized', caption='reports/evaluation.rst')
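The config changes below expose the new optional vocabulary setting. How the Snakefile consumes it reduces to this short, self-contained illustration; the path values here are hypothetical, and in the pipeline they come from the experiment config and models_dir.

# Minimal illustration of the pretrained-vocab fallback (hypothetical values).
models_dir = "data/models/en-ru/test"   # hypothetical
vocab_pretrained = ""                   # the new experiment 'vocab' option; "" means "train one"
vocab_path = vocab_pretrained or f"{models_dir}/vocab/vocab.spm"
print(vocab_path)                       # data/models/en-ru/test/vocab/vocab.spm

When 'vocab' is set to an existing SentencePiece model, vocab_path points at it, the train_vocab rule is not defined, and every downstream rule that previously depended on rules.train_vocab.output now depends on vocab_path instead.
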
2 changes: 2 additions & 0 deletions configs/config.prod.yml
@@ -24,6 +24,8 @@ experiment:
teacher-ensemble: 2
# path to a pretrained backward model (optional)
backward-model: ""
# path to a pretrained vocabulary (optional)
vocab: ""

# limits per downloaded dataset
mono-max-sentences-src: 100000000
1 change: 1 addition & 0 deletions configs/config.test.yml
@@ -18,6 +18,7 @@ experiment:

teacher-ensemble: 2
backward-model: ""
vocab: ""

mono-max-sentences-src: 100000
mono-max-sentences-trg: 200000
9 changes: 0 additions & 9 deletions pipeline/train/configs/training/teacher.continue.yml

This file was deleted.