From 0f6e64cf190b56b614022c93552110133f742c75 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Tue, 17 Aug 2021 13:20:34 -0700 Subject: [PATCH] Minor improvements (#20) - Flores dataset importer - custom dataset importer - ability to use a pre-trained backward model - save experiment config on start - stubs for dataset caching ( decided to sync implementation with workflow manager integration ) - use best bleu models instead of best ce-mean-words - fix linting warnings --- README.md | 5 +- config.sh | 13 ++-- pipeline/clean/ce-filter.sh | 4 +- pipeline/clean/clean-mono.sh | 2 +- pipeline/data/download-corpus.sh | 3 +- pipeline/data/download-eval.sh | 6 +- pipeline/data/download-mono.sh | 3 +- .../data/importers/corpus/custom-corpus.sh | 24 +++++++ pipeline/data/importers/corpus/flores.sh | 52 ++++++++++++++ pipeline/data/importers/mono/custom-mono.sh | 22 ++++++ pipeline/translate/translate-corpus.sh | 13 ++-- pipeline/translate/translate-mono.sh | 4 +- pipeline/utils/find-corpus.py | 4 +- pipeline/utils/merge-corpus.sh | 4 +- run.sh | 70 +++++++++++++------ 15 files changed, 182 insertions(+), 47 deletions(-) create mode 100644 pipeline/data/importers/corpus/custom-corpus.sh create mode 100644 pipeline/data/importers/corpus/flores.sh create mode 100644 pipeline/data/importers/mono/custom-mono.sh diff --git a/README.md b/README.md index e9e81f005..5a66f81cf 100644 --- a/README.md +++ b/README.md @@ -133,14 +133,17 @@ TRAIN_DATASETS="opus_OPUS-ParaCrawl/v7.1 mtdata_newstest2019_ruen" TEST_DATASETS="sacrebleu_wmt20 sacrebleu_wmt18" ``` -Data source | Prefix | Name example | Type | Comments +Data source | Prefix | Name examples | Type | Comments --- | --- | --- | ---| --- [MTData](https://github.com/thammegowda/mtdata) | mtdata | newstest2017_ruen | corpus | Supports many datasets. Run `mtdata list -l ru-en` to see datasets for a specific language pair. [OPUS](opus.nlpl.eu/) | opus | ParaCrawl/v7.1 | corpus | Many open source datasets. 
Go to the website, choose a language pair, check links under Moses column to see what names and version is used in a link. [SacreBLEU](https://github.com/mjpost/sacrebleu) | sacrebleu | wmt20 | corpus | Official evaluation datasets available in SacreBLEU tool. Recommended to use in `TEST_DATASETS`. Look up supported datasets and language pairs in `sacrebleu.dataset` python module. +[Flores](https://github.com/facebookresearch/flores) | flores | dev, devtest | corpus | Evaluation dataset from Facebook that supports 100 languages. +Custom parallel | custom-corpus | /tmp/test-corpus | corpus | Custom parallel dataset that is already downloaded to a local disk. The dataset name is an absolute path prefix without ".lang.gz" [Paracrawl](https://paracrawl.eu/) | paracrawl-mono | paracrawl8 | mono | Datasets that are crawled from the web. Only [mono datasets](https://paracrawl.eu/index.php/moredata) are used in this importer. Parallel corpus is available using opus importer. [News crawl](http://data.statmt.org/news-crawl) | news-crawl | news.2019 | mono | Some news monolingual datasets from [WMT21](https://www.statmt.org/wmt21/translation-task.html) [Common crawl](https://commoncrawl.org/) | commoncrawl | wmt16 | mono | Huge web crawl datasets. The links are posted on [WMT21](https://www.statmt.org/wmt21/translation-task.html) +Custom mono | custom-mono | /tmp/test-mono | mono | Custom monolingual dataset that is already downloaded to a local disk. The dataset name is an absolute path prefix without ".lang.gz" You can also use [find-corpus](pipeline/utils/find-corpus.py) tool to find all datasets for an importer and get them formatted to use in config. 
diff --git a/config.sh b/config.sh index 410f03939..65ce916d4 100644 --- a/config.sh +++ b/config.sh @@ -11,8 +11,10 @@ set -a WORKDIR=$(pwd) CUDA_DIR=/usr/local/cuda-11.2 -DATA_DIR=${DATA_DIR:-${WORKDIR}/data} -MODELS_DIR=${MODELS_DIR:-${WORKDIR}/models} +DATA_ROOT_DIR=${DATA_ROOT_DIR:-${WORKDIR}} +DATA_DIR=${DATA_ROOT_DIR}/data +MODELS_DIR=${DATA_ROOT_DIR}/models +EXPERIMENTS_DIR=${DATA_ROOT_DIR}/experiments MARIAN=${MARIAN:-${WORKDIR}/3rd_party/marian-dev/build} CLEAN_TOOLS=${WORKDIR}/pipeline/clean/tools BIN=${WORKDIR}/bin @@ -23,11 +25,14 @@ EXPERIMENT=test SRC=ru TRG=en +# path to a pretrained backward model (optional) +BACKWARD_MODEL="" + # parallel corpus TRAIN_DATASETS="opus_ada83/v1 opus_UN/v20090831 opus_GNOME/v1 opus_wikimedia/v20210402 opus_CCMatrix/v1 opus_Wikipedia/v1.0 opus_tico-19/v2020-10-28 opus_KDE4/v2 opus_OpenSubtitles/v2018 opus_MultiUN/v1 opus_GlobalVoices/v2018q4 opus_ELRC_2922/v1 opus_PHP/v1 opus_Tatoeba/v2021-03-10 opus_Tanzil/v1 opus_XLEnt/v1.1 opus_TildeMODEL/v2018 opus_Ubuntu/v14.10 opus_TED2013/v1.1 opus_infopankki/v1 opus_EUbookshop/v2 opus_ParaCrawl/v8 opus_Books/v1 opus_WMT-News/v2019 opus_bible-uedin/v1 opus_WikiMatrix/v1 opus_QED/v2.0a opus_CCAligned/v1 opus_TED2020/v1 opus_News-Commentary/v16 opus_UNPC/v1.0"\ " mtdata_cc_aligned mtdata_airbaltic mtdata_GlobalVoices_2018Q4 mtdata_UNv1_test mtdata_neulab_tedtalksv1_train mtdata_neulab_tedtalksv1_dev mtdata_wmt13_commoncrawl mtdata_czechtourism mtdata_paracrawl_bonus mtdata_worldbank mtdata_wiki_titles_v1 mtdata_WikiMatrix_v1 mtdata_wmt18_news_commentary_v13 mtdata_wiki_titles_v2 mtdata_news_commentary_v14 mtdata_UNv1_dev mtdata_neulab_tedtalksv1_test mtdata_JW300" -DEVTEST_DATASETS="mtdata_newstest2019_ruen mtdata_newstest2017_ruen mtdata_newstest2015_ruen mtdata_newstest2014_ruen" -TEST_DATASETS="sacrebleu_wmt20 sacrebleu_wmt18 sacrebleu_wmt16 sacrebleu_wmt13" +DEVTEST_DATASETS="flores_dev mtdata_newstest2019_ruen mtdata_newstest2017_ruen mtdata_newstest2015_ruen 
mtdata_newstest2014_ruen" +TEST_DATASETS="flores_devtest sacrebleu_wmt20 sacrebleu_wmt18 sacrebleu_wmt16 sacrebleu_wmt13" # monolingual datasets (ex. paracrawl-mono_paracrawl8, commoncrawl_wmt16, news-crawl_news.2020) # to be translated by the teacher model MONO_DATASETS_SRC="news-crawl_news.2020 news-crawl_news.2019 news-crawl_news.2018 news-crawl_news.2017 "\ diff --git a/pipeline/clean/ce-filter.sh b/pipeline/clean/ce-filter.sh index 27a3e8f62..f4ab22045 100644 --- a/pipeline/clean/ce-filter.sh +++ b/pipeline/clean/ce-filter.sh @@ -29,7 +29,7 @@ fi # Part of the data to be removed (0.05 is 5%) remove=0.05 -model="${model_dir}/model.npz.best-ce-mean-words.npz" +model="${model_dir}/model.npz.best-bleu-detok.npz" vocab="${model_dir}/vocab.spm" output_dir=$(dirname "${output_prefix}") dir="${output_dir}/scored" @@ -68,7 +68,7 @@ echo "### Sorting scores" if [ ! -s "${dir}/sorted.gz" ]; then buffer_size="$(echo "$(grep MemTotal /proc/meminfo | awk '{print $2}')"*0.9 | bc | cut -f1 -d.)" paste "${dir}/scores.nrm.txt" "${dir}/corpus.${SRC}" "${dir}/corpus.${TRG}" | - LC_ALL=C sort -n -k1,1 -S "${buffer_size}K" | + LC_ALL=C sort -n -k1,1 -S "${buffer_size}K" -T "${dir}" | pigz >"${dir}/sorted.gz" fi diff --git a/pipeline/clean/clean-mono.sh b/pipeline/clean/clean-mono.sh index 630930a24..f9dd2f052 100755 --- a/pipeline/clean/clean-mono.sh +++ b/pipeline/clean/clean-mono.sh @@ -35,7 +35,7 @@ test -s "${output}.${lang}.gz" || test -s "${output}.${lang}.nrm.gz" || echo "### Deduplication" test -s "${output}.${lang}.gz" || test -s "${output}.${lang}.nrm.uniq.gz" || pigz -dc "${output}.${lang}.nrm.gz" | - LC_ALL=C sort -S 10G | + LC_ALL=C sort -S 10G -T "$(dirname "${output}")" | uniq | pigz >"${output}.${lang}.nrm.uniq.gz" diff --git a/pipeline/data/download-corpus.sh b/pipeline/data/download-corpus.sh index d8c30fef5..d48105766 100644 --- a/pipeline/data/download-corpus.sh +++ b/pipeline/data/download-corpus.sh @@ -15,6 +15,7 @@ test -v SRC test -v TRG prefix=$1 +cache=$2
src_corpus="${prefix}.${SRC}.gz" trg_corpus="${prefix}.${TRG}.gz" @@ -25,7 +26,7 @@ mkdir -p "${dir}" if [ ! -e "${trg_corpus}" ]; then echo "### Downloading datasets" - for dataset in "${@:2}"; do + for dataset in "${@:3}"; do echo "### Downloading dataset ${dataset}" name=${dataset#*_} type=${dataset%%_*} diff --git a/pipeline/data/download-eval.sh b/pipeline/data/download-eval.sh index 73db2f26b..82b150810 100644 --- a/pipeline/data/download-eval.sh +++ b/pipeline/data/download-eval.sh @@ -15,11 +15,11 @@ test -v WORKDIR test -v TEST_DATASETS dir=$1 +cache=$2 - -for dataset in "${@:2}"; do +for dataset in "${@:3}"; do name="${dataset//[^A-Za-z0-9_- ]/_}" - bash "${WORKDIR}/pipeline/data/download-corpus.sh" "${dir}/${name}" "${dataset}" + bash "${WORKDIR}/pipeline/data/download-corpus.sh" "${dir}/${name}" "${cache}" "${dataset}" test -e "${dir}/${name}.${SRC}" || pigz -dk "${dir}/${name}.${SRC}.gz" test -e "${dir}/${name}.${TRG}" || pigz -dk "${dir}/${name}.${TRG}.gz" diff --git a/pipeline/data/download-mono.sh b/pipeline/data/download-mono.sh index 18c4fb5a6..05b83bbb7 100644 --- a/pipeline/data/download-mono.sh +++ b/pipeline/data/download-mono.sh @@ -14,6 +14,7 @@ echo "###### Downloading monolingual data" lang=$1 max_sent=$2 prefix=$3 +cache=$4 file_name="${prefix}.${lang}.gz" dir=$(dirname "${prefix}")/mono @@ -23,7 +24,7 @@ if [ ! 
-e "${file_name}" ]; then mkdir -p "${dir}" coef=0.1 - for dataset in "${@:4}"; do + for dataset in "${@:5}"; do echo "### Downloading dataset ${dataset}" source_prefix="${dir}/${dataset}.original.${lang}" gz_path="${dir}/${dataset}.${lang}.gz" diff --git a/pipeline/data/importers/corpus/custom-corpus.sh b/pipeline/data/importers/corpus/custom-corpus.sh new file mode 100644 index 000000000..4c4c05842 --- /dev/null +++ b/pipeline/data/importers/corpus/custom-corpus.sh @@ -0,0 +1,24 @@ +#!/bin/bash +## +# Use custom dataset that is already downloaded to a local disk +# Local path prefix without `.{lang}.gz` should be specified as a "dataset" parameter +# +# Usage: +# bash custom-corpus.sh source target dir dataset +# + +set -x +set -euo pipefail + +echo "###### Copying custom corpus" + +src=$1 +trg=$2 +dir=$3 +dataset=$4 + +cp "${dataset}.${src}.gz" "${dir}/" +cp "${dataset}.${trg}.gz" "${dir}/" + + +echo "###### Done: Copying custom corpus" \ No newline at end of file diff --git a/pipeline/data/importers/corpus/flores.sh b/pipeline/data/importers/corpus/flores.sh new file mode 100644 index 000000000..3dd68ab4f --- /dev/null +++ b/pipeline/data/importers/corpus/flores.sh @@ -0,0 +1,52 @@ +#!/bin/bash +## +# Downloads flores dataset +# Dataset type can be "dev" or "devtest" +# +# Usage: +# bash flores.sh source target dir dataset +# + +set -x +set -euo pipefail + +echo "###### Downloading flores corpus" + +src=$1 +trg=$2 +dir=$3 +dataset=$4 + +tmp="${dir}/flores" +mkdir -p "${tmp}" + +test -s "${tmp}/flores101_dataset.tar.gz" || + wget -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz" + +tar -xzf "${tmp}/flores101_dataset.tar.gz" -C "${tmp}" --no-same-owner + +source "${WORKDIR}/pipeline/setup/activate-python.sh" + +flores_code() { + code=$1 + + if [ "${code}" == "zh" ] || [ "${code}" == "zh-Hans" ]; then + flores_code="zho_simpl" + elif [ "${code}" == "zh-Hant" ]; then + flores_code="zho_trad" + else +
flores_code=$(python -c "from mtdata.iso import iso3_code; print(iso3_code('${code}', fail_error=True))") + fi + + echo "${flores_code}" +} + +src_flores=$(flores_code "${src}") +trg_flores=$(flores_code "${trg}") + +cp "${tmp}/flores101_dataset/${dataset}/${src_flores}.${dataset}" "${dir}/flores.${src}" +cp "${tmp}/flores101_dataset/${dataset}/${trg_flores}.${dataset}" "${dir}/flores.${trg}" + +rm -rf "${tmp}" + +echo "###### Done: Downloading flores corpus" diff --git a/pipeline/data/importers/mono/custom-mono.sh b/pipeline/data/importers/mono/custom-mono.sh new file mode 100644 index 000000000..ea7a92130 --- /dev/null +++ b/pipeline/data/importers/mono/custom-mono.sh @@ -0,0 +1,22 @@ +#!/bin/bash +## +# Use custom monolingual dataset that is already downloaded to a local disk +# Local path prefix without `.{lang}.gz` should be specified as a "dataset" parameter +# +# Usage: +# bash custom-mono.sh lang output_prefix dataset +# + +set -x +set -euo pipefail + +echo "###### Copying custom monolingual dataset" + +lang=$1 +output_prefix=$2 +dataset=$3 + +cp "${dataset}.${lang}.gz" "${output_prefix}.${lang}.gz" + + +echo "###### Done: Copying custom monolingual dataset" \ No newline at end of file diff --git a/pipeline/translate/translate-corpus.sh b/pipeline/translate/translate-corpus.sh index 1ac612bd6..3d60373aa 100755 --- a/pipeline/translate/translate-corpus.sh +++ b/pipeline/translate/translate-corpus.sh @@ -27,7 +27,7 @@ if [ -e "${output_path}" ]; then exit 0 fi -config="${model_dir}/model.npz.best-ce-mean-words.npz.decoder.yml" +config="${model_dir}/model.npz.best-bleu-detok.npz.decoder.yml" decoder_config="${WORKDIR}/pipeline/translate/decoder.yml" tmp_dir=$(dirname "${output_path}")/tmp mkdir -p "${tmp_dir}" @@ -44,7 +44,7 @@ test -s "${tmp_dir}/file.00.ref" || echo "### Translating source sentences with Marian" # This can be parallelized across several GPU machines.
-for name in $(ls "${tmp_dir}" | grep -E "^file\.[0-9]+$" | shuf); do +for name in $(find "${tmp_dir}" -regex '.*file\.[0-9]+' -printf "%f\n" | shuf); do prefix="${tmp_dir}/${name}" echo "### ${prefix}" test -e "${prefix}.nbest" || @@ -60,11 +60,10 @@ done echo "### Extracting the best translations from n-best lists w.r.t to the reference" # It is CPU-only, can be run after translation on a CPU machine. -test -s "${tmp_dir}/file.00.nbest.out" || - ls "${tmp_dir}" | grep -E "^file\.[0-9]+$" | shuf | - parallel --no-notice -k -j "$(nproc)" \ - "python ${WORKDIR}/pipeline/translate/bestbleu.py -i ${tmp_dir}/{}.nbest -r ${tmp_dir}/{}.ref -m bleu > ${tmp_dir}/{}.nbest.out" \ - 2>"${tmp_dir}/debug.txt" +find "${tmp_dir}" -regex '.*file\.[0-9]+' -printf "%f\n" | shuf | +parallel --no-notice -k -j "$(nproc)" \ + "test -e ${tmp_dir}/{}.nbest.out || python ${WORKDIR}/pipeline/translate/bestbleu.py -i ${tmp_dir}/{}.nbest -r ${tmp_dir}/{}.ref -m bleu > ${tmp_dir}/{}.nbest.out" \ + 2>"${tmp_dir}/debug.txt" echo "### Collecting translations" test -s "${output_path}" || cat "${tmp_dir}"/file.*.nbest.out | pigz >"${output_path}" diff --git a/pipeline/translate/translate-mono.sh b/pipeline/translate/translate-mono.sh index 7a836e83e..df635034a 100755 --- a/pipeline/translate/translate-mono.sh +++ b/pipeline/translate/translate-mono.sh @@ -21,7 +21,7 @@ if [ -e "${output_path}" ]; then exit 0 fi -config="${model_dir}/model.npz.best-ce-mean-words.npz.decoder.yml" +config="${model_dir}/model.npz.best-bleu-detok.npz.decoder.yml" decoder_config="${WORKDIR}/pipeline/translate/decoder.yml" tmp_dir=$(dirname "${output_path}")/tmp @@ -32,7 +32,7 @@ test -s "${tmp_dir}/file.00" || pigz -dc "${mono_path}" | split -d -l 2000000 - echo "### Translate source sentences with Marian" # This can be parallelized across several GPU machines. 
-for name in $(ls "${tmp_dir}" | grep -E "^file\.[0-9]+$" | shuf); do +for name in $(find "${tmp_dir}" -regex '.*file\.[0-9]+' -printf "%f\n" | shuf); do prefix="${tmp_dir}/${name}" echo "### ${prefix}" test -e "${prefix}.out" || diff --git a/pipeline/utils/find-corpus.py b/pipeline/utils/find-corpus.py index 49663012e..f4341b1e8 100644 --- a/pipeline/utils/find-corpus.py +++ b/pipeline/utils/find-corpus.py @@ -25,7 +25,7 @@ names = [] if type == 'opus': - exclude += ['OPUS100v'] + exclude += ['OPUS100v', 'WMT-News'] datasets = requests.get(f'https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest').json() names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets['corpora']] elif type == 'sacrebleu': @@ -35,7 +35,7 @@ elif type == 'mtdata': from mtdata.main import LangPair from mtdata.data import get_entries - exclude += ['opus', 'newstest'] + exclude += ['opus', 'newstest', 'UNv1'] entries = get_entries(LangPair(f'{source}-{target}'), None, None) names = [f'mtdata_{entry.name}' for entry in entries] else: diff --git a/pipeline/utils/merge-corpus.sh b/pipeline/utils/merge-corpus.sh index 9d0bee5c8..b55f40c06 100644 --- a/pipeline/utils/merge-corpus.sh +++ b/pipeline/utils/merge-corpus.sh @@ -20,8 +20,8 @@ res_trg=$6 mkdir -p "$(dirname "${res_src}")" mkdir -p "$(dirname "${res_trg}")" -test -s "${res_src}" || cat "${src1}" "${src2}" >"${res_src}" -test -s "${res_trg}" || cat "${trg1}" "${trg2}" >"${res_trg}" +test -s "${res_src}" || cat <(pigz -dc "${src1}") <(pigz -dc "${src2}") | pigz >"${res_src}" # no shuf: shuffling each side separately breaks src/trg line alignment +test -s "${res_trg}" || cat <(pigz -dc "${trg1}") <(pigz -dc "${trg2}") | pigz >"${res_trg}" src_len=$(pigz -dc "${res_src}" | wc -l) trg_len=$(pigz -dc "${res_trg}" | wc -l) diff --git a/run.sh b/run.sh index 978a6fc4e..cfbe690a9 100644 --- a/run.sh +++ b/run.sh @@ -13,9 +13,15 @@ set -euo pipefail # Directories structure # #├ data -#│ ├ cache TODO -#│ │ └ opus_wmt20.ru.gz -#│ │ └ sacrebleu_wmt20.en.gz +#│
├ cache +#│ │ ├ corpus +#│ │ │ └ opus +#│ │ │ ├ ada83_v1.en.gz +#│ │ │ └ ada83_v1.ru.gz +#│ │ └ mono +#│ │ └ news-crawl +#│ │ ├ news.2019.ru.gz +#│ │ └ news.2019.en.gz #│ └ ru-en #│ └ test #│ ├ original @@ -62,18 +68,21 @@ set -euo pipefail #│ │ ├ speed #│ │ └ exported #│ ├ en-ru -#│ │ └ test -#│ │ └ s2s +#│ └ test +#│ └ s2s +#│ +#├ experiments +#│ └ ru-en +#│ └ test +#│ └ config.sh echo "###### read config " source ./config.sh -echo "###### setup" -bash ./pipeline/setup/install-all.sh - echo "###### set common variables" # data -data_dir="${DATA_DIR}/${SRC}-${TRG}/${EXPERIMENT}" +data_dir="${DATA_ROOT_DIR}/data/${SRC}-${TRG}/${EXPERIMENT}" +cache_dir="${DATA_ROOT_DIR}/cache" original="${data_dir}/original" evaluation="${data_dir}/evaluation" clean="${data_dir}/clean" @@ -84,22 +93,35 @@ merged="${data_dir}/merged" filtered="${data_dir}/filtered" align_dir="${data_dir}/alignment" # models -models_dir="${MODELS_DIR}/${SRC}-${TRG}/${EXPERIMENT}" +models_dir="${DATA_ROOT_DIR}/models/${SRC}-${TRG}/${EXPERIMENT}" student_dir="${models_dir}/student" student_finetuned_dir="${models_dir}/student-finetuned" teacher_dir="${models_dir}/teacher" -s2s="${MODELS_DIR}/${TRG}-${SRC}/${EXPERIMENT}/s2s" speed="${models_dir}/speed" exported="${models_dir}/exported" +echo "###### save experiment " +experiment_dir="${EXPERIMENTS_DIR}/${SRC}-${TRG}/${EXPERIMENT}" +mkdir -p "${experiment_dir}" +cp ./config.sh "${experiment_dir}/config.sh" +cp -r ./pipeline/train/configs "${experiment_dir}/" + +echo "###### setup" +bash ./pipeline/setup/install-all.sh + echo "###### download data" -bash ./pipeline/data/download-corpus.sh "${original}/corpus" ${TRAIN_DATASETS} -bash ./pipeline/data/download-corpus.sh "${original}/devset" ${DEVTEST_DATASETS} -bash ./pipeline/data/download-eval.sh "${evaluation}" ${TEST_DATASETS} +# shellcheck disable=SC2086 +bash ./pipeline/data/download-corpus.sh "${original}/corpus" "${cache_dir}" ${TRAIN_DATASETS} +# shellcheck disable=SC2086 +bash 
./pipeline/data/download-corpus.sh "${original}/devset" "${cache_dir}" ${DEVTEST_DATASETS} +# shellcheck disable=SC2086 +bash ./pipeline/data/download-eval.sh "${evaluation}" "${cache_dir}" ${TEST_DATASETS} +# shellcheck disable=SC2086 test -n "${MONO_DATASETS_SRC}" && - bash ./pipeline/data/download-mono.sh "${SRC}" "${MONO_MAX_SENTENCES_SRC}" "${original}/mono" ${MONO_DATASETS_SRC} + bash ./pipeline/data/download-mono.sh "${SRC}" "${MONO_MAX_SENTENCES_SRC}" "${original}/mono" "${cache_dir}" ${MONO_DATASETS_SRC} +# shellcheck disable=SC2086 test -n "${MONO_DATASETS_TRG}" && - bash ./pipeline/data/download-mono.sh "${TRG}" "${MONO_MAX_SENTENCES_TRG}" "${original}/mono" ${MONO_DATASETS_TRG} + bash ./pipeline/data/download-mono.sh "${TRG}" "${MONO_MAX_SENTENCES_TRG}" "${original}/mono" "${cache_dir}" ${MONO_DATASETS_TRG} echo "###### clean data" bash ./pipeline/clean/clean-corpus.sh "${original}/corpus" "${clean}/corpus" @@ -109,13 +131,19 @@ test -e "${original}/mono.${SRC}.gz" && test -e "${original}/mono.${TRG}.gz" && bash ./pipeline/clean/clean-mono.sh "${TRG}" "${original}/mono" "${clean}/mono" -echo "###### train backward model" -bash ./pipeline/train/train-s2s.sh "${s2s}" "${biclean}/corpus" "${original}/devset" "${TRG}" "${SRC}" -bash ./pipeline/train/eval.sh "${s2s}" "${evaluation}" "${TRG}" "${SRC}" +if [ -n "${BACKWARD_MODEL}" ]; then + echo "###### use pretrained backward model" + backward="${BACKWARD_MODEL}" +else + echo "###### train backward model" + backward="${DATA_ROOT_DIR}/models/${TRG}-${SRC}/${EXPERIMENT}/s2s" + bash ./pipeline/train/train-s2s.sh "${backward}" "${biclean}/corpus" "${original}/devset" "${TRG}" "${SRC}" + bash ./pipeline/train/eval.sh "${backward}" "${evaluation}" "${TRG}" "${SRC}" +fi if [ -e "${clean}/mono.${TRG}.gz" ]; then echo "###### augment corpus with back translations" - bash ./pipeline/translate/translate-mono.sh "${clean}/mono.${TRG}.gz" "${s2s}" "${translated}/mono.${SRC}.gz" + bash 
./pipeline/translate/translate-mono.sh "${clean}/mono.${TRG}.gz" "${backward}" "${translated}/mono.${SRC}.gz" bash ./pipeline/utils/merge-corpus.sh \ "${translated}/mono.${SRC}.gz" \ "${biclean}/corpus.${SRC}.gz" \ @@ -152,7 +180,7 @@ bash ./pipeline/utils/merge-corpus.sh "${clean}/corpus.${SRC}.gz" \ "${merged}/corpus.${TRG}.gz" echo "###### cross entropy filtering" -bash ./pipeline/clean/ce-filter.sh "${s2s}" "${merged}/corpus" "${filtered}/corpus" +bash ./pipeline/clean/ce-filter.sh "${backward}" "${merged}/corpus" "${filtered}/corpus" echo "###### train word alignment and lexical shortlists" bash ./pipeline/alignment/generate-alignment-and-shortlist.sh "${filtered}/corpus" \