Minor improvements (mozilla#20)

- Flores dataset importer
- Custom dataset importer
- Ability to use a pre-trained backward model
- Save experiment config on start
- Stubs for dataset caching (decided to sync implementation with workflow manager integration)
- Use best BLEU models instead of best ce-mean-words
- Fix linting warnings
MaksymDel · Aug 17, 2021 · 0f6e64c · 0f6e64c
1 parent ec783cf
commit 0f6e64c
Show file tree

Hide file tree

Showing 15 changed files with 182 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -133,14 +133,17 @@ TRAIN_DATASETS="opus_OPUS-ParaCrawl/v7.1 mtdata_newstest2019_ruen"
 TEST_DATASETS="sacrebleu_wmt20 sacrebleu_wmt18"
 ```
 
-Data source | Prefix | Name example | Type | Comments
+Data source | Prefix | Name examples | Type | Comments
 --- | --- | --- | ---| ---
 [MTData](https://github.com/thammegowda/mtdata) | mtdata | newstest2017_ruen | corpus | Supports many datasets. Run `mtdata list -l ru-en` to see datasets for a specific language pair.
 [OPUS](opus.nlpl.eu/) | opus | ParaCrawl/v7.1 | corpus | Many open source datasets. Go to the website, choose a language pair, check links under Moses column to see what names and version is used in a link.
 [SacreBLEU](https://github.com/mjpost/sacrebleu) | sacrebleu | wmt20 | corpus | Official evaluation datasets available in SacreBLEU tool. Recommended to use in `TEST_DATASETS`. Look up supported datasets and language pairs in `sacrebleu.dataset` python module.
+[Flores](https://github.com/facebookresearch/flores) | flores | dev, devtest | corpus | FLORES-101 evaluation dataset from Facebook that covers 101 languages.
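+Custom parallel | custom-corpus | /tmp/test-corpus | corpus | Custom parallel dataset that is already downloaded to a local disk. The dataset name is an absolute path prefix without the `.<lang_code>.gz` suffix (for example, `/tmp/test-corpus` for the files `/tmp/test-corpus.ru.gz` and `/tmp/test-corpus.en.gz`).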
 [Paracrawl](https://paracrawl.eu/) | paracrawl-mono | paracrawl8 | mono | Datasets that are crawled from the web. Only [mono datasets](https://paracrawl.eu/index.php/moredata) are used in this importer. Parallel corpus is available using opus importer.
 [News crawl](http://data.statmt.org/news-crawl) | news-crawl | news.2019 | mono | Some news monolingual datasets from [WMT21](https://www.statmt.org/wmt21/translation-task.html)
 [Common crawl](https://commoncrawl.org/) | commoncrawl | wmt16 | mono | Huge web crawl datasets. The links are posted on [WMT21](https://www.statmt.org/wmt21/translation-task.html)
+Custom mono | custom-mono | /tmp/test-mono | mono | Custom monolingual dataset that is already downloaded to a local disk. The dataset name is an absolute path prefix without the `.<lang_code>.gz` suffix (for example, `/tmp/test-mono` for the file `/tmp/test-mono.ru.gz`).
 
 You can also use [find-corpus](pipeline/utils/find-corpus.py) tool to find all datasets for an importer and get them formatted to use in config.
 

diff --git a/config.sh b/config.sh
@@ -11,8 +11,10 @@ set -a
 
 WORKDIR=$(pwd)
 CUDA_DIR=/usr/local/cuda-11.2
-DATA_DIR=${DATA_DIR:-${WORKDIR}/data}
-MODELS_DIR=${MODELS_DIR:-${WORKDIR}/models}
+DATA_ROOT_DIR=${DATA_ROOT_DIR:-${WORKDIR}}
+DATA_DIR=${DATA_ROOT_DIR}/data
+MODELS_DIR=${DATA_ROOT_DIR}/models
+EXPERIMENTS_DIR=${DATA_ROOT_DIR}/experiments
 MARIAN=${MARIAN:-${WORKDIR}/3rd_party/marian-dev/build}
 CLEAN_TOOLS=${WORKDIR}/pipeline/clean/tools
 BIN=${WORKDIR}/bin
@@ -23,11 +25,14 @@ EXPERIMENT=test
 SRC=ru
 TRG=en
 
+# path to a pretrained backward model (optional)
+BACKWARD_MODEL=""
+
 # parallel corpus
 TRAIN_DATASETS="opus_ada83/v1 opus_UN/v20090831 opus_GNOME/v1 opus_wikimedia/v20210402 opus_CCMatrix/v1 opus_Wikipedia/v1.0 opus_tico-19/v2020-10-28 opus_KDE4/v2 opus_OpenSubtitles/v2018 opus_MultiUN/v1 opus_GlobalVoices/v2018q4 opus_ELRC_2922/v1 opus_PHP/v1 opus_Tatoeba/v2021-03-10 opus_Tanzil/v1 opus_XLEnt/v1.1 opus_TildeMODEL/v2018 opus_Ubuntu/v14.10 opus_TED2013/v1.1 opus_infopankki/v1 opus_EUbookshop/v2 opus_ParaCrawl/v8 opus_Books/v1 opus_WMT-News/v2019 opus_bible-uedin/v1 opus_WikiMatrix/v1 opus_QED/v2.0a opus_CCAligned/v1 opus_TED2020/v1 opus_News-Commentary/v16 opus_UNPC/v1.0"\
 " mtdata_cc_aligned mtdata_airbaltic mtdata_GlobalVoices_2018Q4 mtdata_UNv1_test mtdata_neulab_tedtalksv1_train mtdata_neulab_tedtalksv1_dev mtdata_wmt13_commoncrawl mtdata_czechtourism mtdata_paracrawl_bonus mtdata_worldbank mtdata_wiki_titles_v1 mtdata_WikiMatrix_v1 mtdata_wmt18_news_commentary_v13 mtdata_wiki_titles_v2 mtdata_news_commentary_v14 mtdata_UNv1_dev mtdata_neulab_tedtalksv1_test mtdata_JW300"
-DEVTEST_DATASETS="mtdata_newstest2019_ruen mtdata_newstest2017_ruen mtdata_newstest2015_ruen mtdata_newstest2014_ruen"
-TEST_DATASETS="sacrebleu_wmt20 sacrebleu_wmt18 sacrebleu_wmt16 sacrebleu_wmt13"
+DEVTEST_DATASETS="flores_dev mtdata_newstest2019_ruen mtdata_newstest2017_ruen mtdata_newstest2015_ruen mtdata_newstest2014_ruen"
+TEST_DATASETS="flores_devtest sacrebleu_wmt20 sacrebleu_wmt18 sacrebleu_wmt16 sacrebleu_wmt13"
 # monolingual datasets (ex. paracrawl-mono_paracrawl8, commoncrawl_wmt16, news-crawl_news.2020)
 # to be translated by the teacher model
 MONO_DATASETS_SRC="news-crawl_news.2020 news-crawl_news.2019 news-crawl_news.2018 news-crawl_news.2017 "\

diff --git a/pipeline/clean/ce-filter.sh b/pipeline/clean/ce-filter.sh
@@ -29,7 +29,7 @@ fi
 
 # Part of the data to be removed (0.05 is 5%)
 remove=0.05
-model="${model_dir}/model.npz.best-ce-mean-words.npz"
+model="${model_dir}/model.npz.best-bleu-detok.npz"
 vocab="${model_dir}/vocab.spm"
 output_dir=$(dirname "${output_prefix}")
 dir="${output_dir}/scored"
@@ -68,7 +68,7 @@ echo "### Sorting scores"
 if [ ! -s "${dir}/sorted.gz" ]; then
   buffer_size="$(echo "$(grep MemTotal /proc/meminfo | awk '{print $2}')"*0.9 | bc | cut -f1 -d.)"
   paste "${dir}/scores.nrm.txt" "${dir}/corpus.${SRC}" "${dir}/corpus.${TRG}" |
-  LC_ALL=C sort -n -k1,1 -S "${buffer_size}K" |
+  LC_ALL=C sort -n -k1,1 -S "${buffer_size}K" -T "${dir}" |
   pigz >"${dir}/sorted.gz"
 fi
 

diff --git a/pipeline/clean/clean-mono.sh b/pipeline/clean/clean-mono.sh
@@ -35,7 +35,7 @@ test -s "${output}.${lang}.gz" || test -s "${output}.${lang}.nrm.gz" ||
 echo "### Deduplication"
 test -s "${output}.${lang}.gz" || test -s "${output}.${lang}.nrm.uniq.gz" ||
   pigz -dc "${output}.${lang}.nrm.gz" |
-  LC_ALL=C sort -S 10G |
+  LC_ALL=C sort -S 10G -T "${output}" |
   uniq |
   pigz >"${output}.${lang}.nrm.uniq.gz"
 

diff --git a/pipeline/data/download-corpus.sh b/pipeline/data/download-corpus.sh
@@ -15,6 +15,7 @@ test -v SRC
 test -v TRG
 
 prefix=$1
+cache=$2
 
 src_corpus="${prefix}.${SRC}.gz"
 trg_corpus="${prefix}.${TRG}.gz"
@@ -25,7 +26,7 @@ mkdir -p "${dir}"
 if [ ! -e "${trg_corpus}" ]; then
   echo "### Downloading datasets"
 
-  for dataset in "${@:2}"; do
+  for dataset in "${@:3}"; do
     echo "### Downloading dataset ${dataset}"
     name=${dataset#*_}
     type=${dataset%%_*}

diff --git a/pipeline/data/download-eval.sh b/pipeline/data/download-eval.sh
@@ -15,11 +15,11 @@ test -v WORKDIR
 test -v TEST_DATASETS
 
 dir=$1
+cache=$2
 
-
-for dataset in "${@:2}"; do
+for dataset in "${@:3}"; do
   name="${dataset//[^A-Za-z0-9_- ]/_}"
-  bash "${WORKDIR}/pipeline/data/download-corpus.sh" "${dir}/${name}" "${dataset}"
+  bash "${WORKDIR}/pipeline/data/download-corpus.sh" "${dir}/${name}" "${cache}" "${dataset}"
 
   test -e "${dir}/${name}.${SRC}" || pigz -dk "${dir}/${name}.${SRC}.gz"
   test -e "${dir}/${name}.${TRG}" || pigz -dk "${dir}/${name}.${TRG}.gz"

diff --git a/pipeline/data/download-mono.sh b/pipeline/data/download-mono.sh
@@ -14,6 +14,7 @@ echo "###### Downloading monolingual data"
 lang=$1
 max_sent=$2
 prefix=$3
+cache=$4
 
 file_name="${prefix}.${lang}.gz"
 dir=$(dirname "${prefix}")/mono
@@ -23,7 +24,7 @@ if [ ! -e "${file_name}" ]; then
   mkdir -p "${dir}"
   coef=0.1
 
-  for dataset in "${@:4}"; do
+  for dataset in "${@:5}"; do
     echo "### Downloading dataset ${dataset}"
     source_prefix="${dir}/${dataset}.original.${lang}"
     gz_path="${dir}/${dataset}.${lang}.gz"

diff --git a/pipeline/data/importers/corpus/custom-corpus.sh b/pipeline/data/importers/corpus/custom-corpus.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+##
+# Use custom dataset that is already downloaded to a local disk
+# Local path prefix without `.<lang_code>.gz` should be specified as a "dataset" parameter
+#
+# Usage:
+#   bash custom-corpus.sh source target dir dataset
+#
+
+set -x
+set -euo pipefail
+
+echo "###### Copying custom corpus"
+
+src=$1
+trg=$2
+dir=$3
+dataset=$4
+
+cp "${dataset}.${src}.gz" "${dir}/"
+cp "${dataset}.${trg}.gz" "${dir}/"
+
+
+echo "###### Done: Copying custom corpus"
diff --git a/pipeline/data/importers/corpus/flores.sh b/pipeline/data/importers/corpus/flores.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+##
+# Downloads flores dataset
+# Dataset type can be "dev" or "devtest"
+#
+# Usage:
+#   bash flores.sh source target dir dataset
+#
+
+set -x
+set -euo pipefail
+
+echo "###### Downloading flores corpus"
+
+src=$1
+trg=$2
+dir=$3
+dataset=$4
+
+tmp="${dir}/flores"
+mkdir -p "${tmp}"
+
+test -s "${tmp}/flores101_dataset.tar.gz" ||
+  wget -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz"
+
+tar -xzf "${tmp}/flores101_dataset.tar.gz" -C "${tmp}" --no-same-owner
+
+source "${WORKDIR}/pipeline/setup/activate-python.sh"
+
+flores_code() {
+  code=$1
+
+  if [ "${code}" == "zh" ] || [ "${code}" == "zh-Hans" ]; then
+    flores_code="zho_simpl"
+  elif [ "${code}" == "zh-Hant" ]; then
+    flores_code="zho_trad"
+  else
+    flores_code=$(python -c "from mtdata.iso import iso3_code; print(iso3_code('${code}', fail_error=True))")
+  fi
+
+  echo "${flores_code}"
+}
+
+src_flores=$(flores_code "${src}")
+trg_flores=$(flores_code "${trg}")
+
+cp "${tmp}/flores101_dataset/${dataset}/${src_flores}.${dataset}" "${dir}/flores.${src}"
+cp "${tmp}/flores101_dataset/${dataset}/${trg_flores}.${dataset}" "${dir}/flores.${trg}"
+
+rm -rf "${tmp}"
+
+echo "###### Done: Downloading flores corpus"
diff --git a/pipeline/data/importers/mono/custom-mono.sh b/pipeline/data/importers/mono/custom-mono.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+##
+# Use custom monolingual dataset that is already downloaded to a local disk
+# Local path prefix without `.<lang_code>.gz` should be specified as a "dataset" parameter
+#
+# Usage:
+#   bash custom-mono.sh lang output_prefix dataset
+#
+
+set -x
+set -euo pipefail
+
+echo "###### Copying custom monolingual dataset"
+
+lang=$1
+output_prefix=$2
+dataset=$3
+
+cp "${dataset}.${lang}.gz" "${output_prefix}.${lang}.gz"
+
+
+echo "###### Done: Copying custom monolingual dataset"
diff --git a/pipeline/translate/translate-corpus.sh b/pipeline/translate/translate-corpus.sh
@@ -27,7 +27,7 @@ if [ -e "${output_path}" ]; then
   exit 0
 fi
 
-config="${model_dir}/model.npz.best-ce-mean-words.npz.decoder.yml"
+config="${model_dir}/model.npz.best-bleu-detok.npz.decoder.yml"
 decoder_config="${WORKDIR}/pipeline/translate/decoder.yml"
 tmp_dir=$(dirname "${output_path}")/tmp
 mkdir -p "${tmp_dir}"
@@ -44,7 +44,7 @@ test -s "${tmp_dir}/file.00.ref" ||
 
 echo "### Translating source sentences with Marian"
 # This can be parallelized across several GPU machines.
-for name in $(ls "${tmp_dir}" | grep -E "^file\.[0-9]+$" | shuf); do
+for name in $(find "${tmp_dir}" -regex '.*file\.[0-9]+' -printf "%f\n" | shuf); do
   prefix="${tmp_dir}/${name}"
   echo "### ${prefix}"
   test -e "${prefix}.nbest" ||
@@ -60,11 +60,10 @@ done
 
 echo "### Extracting the best translations from n-best lists w.r.t to the reference"
 # It is CPU-only, can be run after translation on a CPU machine.
-test -s "${tmp_dir}/file.00.nbest.out" ||
-  ls "${tmp_dir}" | grep -E "^file\.[0-9]+$" | shuf |
-  parallel --no-notice -k -j "$(nproc)" \
-    "python ${WORKDIR}/pipeline/translate/bestbleu.py -i ${tmp_dir}/{}.nbest -r ${tmp_dir}/{}.ref -m bleu > ${tmp_dir}/{}.nbest.out" \
-    2>"${tmp_dir}/debug.txt"
+find "${tmp_dir}" -regex '.*file\.[0-9]+' -printf "%f\n" | shuf |
+parallel --no-notice -k -j "$(nproc)" \
+  "test -e ${tmp_dir}/{}.nbest.out || python ${WORKDIR}/pipeline/translate/bestbleu.py -i ${tmp_dir}/{}.nbest -r ${tmp_dir}/{}.ref -m bleu > ${tmp_dir}/{}.nbest.out" \
+  2>"${tmp_dir}/debug.txt"
 
 echo "### Collecting translations"
 test -s "${output_path}" || cat "${tmp_dir}"/file.*.nbest.out | pigz >"${output_path}"

diff --git a/pipeline/translate/translate-mono.sh b/pipeline/translate/translate-mono.sh
@@ -21,7 +21,7 @@ if [ -e "${output_path}" ]; then
   exit 0
 fi
 
-config="${model_dir}/model.npz.best-ce-mean-words.npz.decoder.yml"
+config="${model_dir}/model.npz.best-bleu-detok.npz.decoder.yml"
 decoder_config="${WORKDIR}/pipeline/translate/decoder.yml"
 tmp_dir=$(dirname "${output_path}")/tmp
 
@@ -32,7 +32,7 @@ test -s "${tmp_dir}/file.00" || pigz -dc "${mono_path}" | split -d -l 2000000 -
 
 echo "### Translate source sentences with Marian"
 # This can be parallelized across several GPU machines.
-for name in $(ls "${tmp_dir}" | grep -E "^file\.[0-9]+$" | shuf); do
+for name in $(find "${tmp_dir}" -regex '.*file\.[0-9]+' -printf "%f\n" | shuf); do
   prefix="${tmp_dir}/${name}"
   echo "### ${prefix}"
   test -e "${prefix}.out" ||

diff --git a/pipeline/utils/find-corpus.py b/pipeline/utils/find-corpus.py
@@ -25,7 +25,7 @@
 names = []
 
 if type == 'opus':
-    exclude += ['OPUS100v']
+    exclude += ['OPUS100v', 'WMT-News']
     datasets = requests.get(f'https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest').json()
     names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets['corpora']]
 elif type == 'sacrebleu':
@@ -35,7 +35,7 @@
 elif type == 'mtdata':
     from mtdata.main import LangPair
     from mtdata.data import get_entries
-    exclude += ['opus', 'newstest']
+    exclude += ['opus', 'newstest', 'UNv1']
     entries = get_entries(LangPair(f'{source}-{target}'), None, None)
     names = [f'mtdata_{entry.name}' for entry in entries]
 else:

diff --git a/pipeline/utils/merge-corpus.sh b/pipeline/utils/merge-corpus.sh
@@ -20,8 +20,8 @@ res_trg=$6
 
 mkdir -p "$(dirname "${res_src}")"
 mkdir -p "$(dirname "${res_trg}")"
-test -s "${res_src}" || cat "${src1}" "${src2}" >"${res_src}"
-test -s "${res_trg}" || cat "${trg1}" "${trg2}" >"${res_trg}"
+test -s "${res_src}" || cat <(pigz -dc "${src1}") <(pigz -dc "${src2}") | shuf | pigz >"${res_src}"
+test -s "${res_trg}" || cat <(pigz -dc "${trg1}") <(pigz -dc "${trg2}") | shuf | pigz >"${res_trg}"
 
 src_len=$(pigz -dc "${res_src}" | wc -l)
 trg_len=$(pigz -dc "${res_trg}" | wc -l)