Skip to content

Commit

Permalink
Add CJK one stage config (#1018)
Browse files Browse the repository at this point in the history
* Add CJK one stage config

* Add comment
  • Loading branch information
eu9ene authored Feb 5, 2025
1 parent e8dfd75 commit 3b4615e
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 0 deletions.
40 changes: 40 additions & 0 deletions pipeline/train/configs/opustrainer/teacher.one-stage.cjk.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This config includes one-stage training and CJK specific modifiers
# It's helpful when en -> CJK models stop training too early
#
# NOTE(review): {dataset0}, {dataset1}, {seed}, {src} and {trg} look like
# template placeholders that are filled in before this file is parsed as YAML
# (a bare `{name}` would otherwise be read as a flow mapping) - confirm against
# the loader. They are intentionally left exactly as written.

datasets:
  original: {dataset0}  # Original parallel corpus
  backtranslated: {dataset1}  # Back-translated data

# A single stage: there is no separate pre-training / fine-tuning split
stages:
  - train

# Train on a mix until early stopping
# (useful for clean back-translated data produced by a strong model)
train:
  - original 0.7
  - backtranslated 0.3
  - until original inf

# The default values of the modifiers are taken from the paper https://arxiv.org/pdf/2311.14838.pdf
# Please refer to docs/opus-trainer.md for further details
modifiers:
  ## Insert new sentences composed from Unicode noise
  - Noise: 0.0005
    min_word_length: 2  # Minimum word length for each word in the noisy sentence
    max_word_length: 5  # Maximum word length for each word in the noisy sentence
    max_words: 6  # Maximum number of words in each noisy sentence
  # generates inline noise (emojis etc.) matching positions in source and target sentences using alignments
  # no spm_vocab argument -> alignments will be removed from Marian input
  # we don't use alignments for teacher training
  # Tags modifier has to be the last one to remove the alignments
  - Tags: 0.005
    custom_detok_src: "icu:{src}"
    custom_detok_trg: "icu:{trg}"
    augment: 1
    tag: 0

# random seed should be different for different teacher models
seed: {seed}
# parallel sentences + token alignments
num_fields: 3
1 change: 1 addition & 0 deletions taskcluster/kinds/train-teacher/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ tasks:
resources:
- pipeline/train/configs/model/teacher.yml
- pipeline/train/configs/opustrainer/teacher.one-stage.yml
- pipeline/train/configs/opustrainer/teacher.one-stage.cjk.yml
- pipeline/train/configs/opustrainer/teacher.two-stage.yml
- pipeline/train/configs/opustrainer/teacher.two-stage.cjk.yml
- pipeline/train/configs/training/teacher.train.yml
Expand Down

0 comments on commit 3b4615e

Please sign in to comment.