|
| 1 | +#!/bin/bash |
| 2 | +# Copyright (c) Facebook, Inc. and its affiliates. |
| 3 | +# All rights reserved. |
| 4 | +# |
| 5 | +# This source code is licensed under the license found in the |
| 6 | +# LICENSE file in the root directory of this source tree. |
| 7 | + |
# Translation directions: every language in SRCS is paired with the single
# target language TGT.
SRCS=("de" "fr")
TGT="en"

# Helper scripts are located relative to the directory holding this script.
ROOT="$(dirname "$0")"
SCRIPTS="${ROOT}/../../scripts"
SPM_TRAIN="${SCRIPTS}/spm_train.py"
SPM_ENCODE="${SCRIPTS}/spm_encode.py"

# Vocabulary size handed to the BPE trainer.
BPESIZE=16384

# ORIG holds the downloaded archives and their extracted contents;
# DATA holds everything this script generates.
ORIG="${ROOT}/iwslt17_orig"
DATA="${ROOT}/iwslt17.de_fr.en.bpe16k"
mkdir -p "${ORIG}" "${DATA}"

# Length limits applied when encoding the training data.
TRAIN_MINLEN=1    # remove sentences with <1 BPE token
TRAIN_MAXLEN=250  # remove sentences with >250 BPE tokens
| 26 | + |
# URLS, ARCHIVES and VALID_SETS are parallel arrays: index 0 describes the
# de-en pair and index 1 the fr-en pair.
URLS=(
  "https://wit3.fbk.eu/archive/2017-01-trnted/texts/de/en/de-en.tgz"
  "https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz"
)

# Each archive is named after the last path component of its URL.
ARCHIVES=()
for url in "${URLS[@]}"; do
  ARCHIVES+=("${url##*/}")
done

# Each VALID_SETS entry is one space-separated string of file stems for a pair.
VALID_SETS=()
for pair in de-en fr-en; do
  stems=()
  for part in dev2010 tst2010 tst2011 tst2012 tst2013 tst2014 tst2015; do
    stems+=("IWSLT17.TED.${part}.${pair}")
  done
  VALID_SETS+=("$(printf '%s ' "${stems[@]}" | sed -e 's/ $//')")
done

# Drop the scratch variables so only the three tables remain defined.
unset url pair part stems
| 39 | + |
# Print the directory an archive unpacks into: the archive path with its
# ".tgz" suffix removed (e.g. "orig/de-en.tgz" -> "orig/de-en").
archive_extract_dir() {
  printf '%s\n' "${1%.tgz}"
}

# download and extract data
# URLS and ARCHIVES are walked in parallel; an archive already present in
# $ORIG is not downloaded again, and one already unpacked is not re-extracted.
for ((i = 0; i < ${#URLS[@]}; ++i)); do
  ARCHIVE=$ORIG/${ARCHIVES[i]}
  if [ -f "$ARCHIVE" ]; then
    echo "$ARCHIVE already exists, skipping download"
  else
    URL=${URLS[i]}
    # Require both a zero exit status from wget and the file on disk: an
    # interrupted transfer can leave a partial archive behind.
    if wget -P "$ORIG" "$URL" && [ -f "$ARCHIVE" ]; then
      echo "$URL successfully downloaded."
    else
      echo "$URL not successfully downloaded."
      # Remove any partial file so the next run does not mistake it for a
      # complete download.
      rm -f -- "$ARCHIVE"
      exit 1
    fi
  fi
  # The later steps read "$ORIG/<src>-<tgt>/...", i.e. the archive path
  # without ".tgz". (The previous "${ARCHIVE: -4}" produced the literal
  # string ".tgz", so the check below could never match.)
  FILE=$(archive_extract_dir "$ARCHIVE")
  if [ -e "$FILE" ]; then
    echo "$FILE already exists, skipping extraction"
  else
    tar -C "$ORIG" -xzvf "$ARCHIVE"
  fi
done
| 62 | + |
echo "pre-processing train data..."
# For every source language, clean both sides of the training corpus:
# drop the metadata lines, strip the <title>/<description> tags, and trim
# leading and trailing whitespace from what remains.
# NOTE(review): LANG is also the locale environment variable; assigning it
# here may alter the locale seen by grep/sed -- confirm this is acceptable.
for SRC in "${SRCS[@]}"; do
  for LANG in "${SRC}" "${TGT}"; do
    grep -v \
      -e '<url>' \
      -e '<talkid>' \
      -e '<keywords>' \
      -e '<speaker>' \
      -e '<reviewer' \
      -e '<translator' \
      -e '<doc' \
      -e '</doc>' \
      "$ORIG/${SRC}-${TGT}/train.tags.${SRC}-${TGT}.${LANG}" \
      | sed \
        -e 's/<title>//g' \
        -e 's/<\/title>//g' \
        -e 's/<description>//g' \
        -e 's/<\/description>//g' \
        -e 's/^\s*//g' \
        -e 's/\s*$//g' \
      > "$DATA/train.${SRC}-${TGT}.${LANG}"
  done
done
| 84 | + |
# Print the sentence text of every <seg id="N"> line in the XML file $1,
# with the surrounding tag and whitespace removed and the typographic
# apostrophe replaced by a plain one.
extract_segments() {
  grep '<seg id' "$1" \
    | sed -e 's/<seg id="[0-9]*">\s*//g' \
          -e 's/\s*<\/seg>\s*//g' \
          -e "s/\’/\'/g"
}

# Build "$DATA/valid.<src>-<tgt>.<lang>" for every language pair by
# concatenating the segments of all validation sets listed for that pair.
# Globals read: SRCS, TGT, VALID_SETS, ORIG, DATA.
preprocess_valid() {
  local i src lang stem out
  local -a stems
  for ((i = 0; i < ${#SRCS[@]}; ++i)); do
    src=${SRCS[i]}
    # Each VALID_SETS entry is one space-separated string of file stems.
    read -r -a stems <<< "${VALID_SETS[i]}"
    for lang in "$src" "$TGT"; do
      out="$DATA/valid.${src}-${TGT}.${lang}"
      # Truncate once, then append each set. Redirecting with ">" inside
      # the per-set loop kept only the last validation set.
      : > "$out"
      # Both languages iterate over the stems in the same order, so the
      # source and target files stay line-aligned.
      for stem in "${stems[@]}"; do
        extract_segments "$ORIG/${src}-${TGT}/${stem}.${lang}.xml" >> "$out"
      done
    done
  done
}

echo "pre-processing valid data..."
preprocess_valid
| 99 | + |
# Join all arguments into a single comma-separated string, with no
# trailing comma; prints an empty line when called without arguments.
join_by_comma() {
  local IFS=,
  printf '%s\n' "$*"
}

# learn BPE with sentencepiece
# One joint model is trained over both sides of every language pair.
TRAIN_FILE_LIST=()
for SRC in "${SRCS[@]}"; do
  TRAIN_FILE_LIST+=(
    "$DATA/train.${SRC}-${TGT}.${SRC}"
    "$DATA/train.${SRC}-${TGT}.${TGT}"
  )
done
# Joining the array avoids the dangling "," that the newline-to-comma
# translation left at the end of the list.
TRAIN_FILES=$(join_by_comma "${TRAIN_FILE_LIST[@]}")
echo "learning joint BPE over ${TRAIN_FILES}..."
python "$SPM_TRAIN" \
  --input="$TRAIN_FILES" \
  --model_prefix="$DATA/sentencepiece.bpe" \
  --vocab_size="$BPESIZE" \
  --character_coverage=1.0 \
  --model_type=bpe
| 109 | + |
# encode train/valid
# Each invocation encodes the source and target side of one pair together,
# so a single pass per source language is enough. The former inner loop over
# languages never used its variable and simply ran every command twice.
echo "encoding train/valid with learned BPE..."
for SRC in "${SRCS[@]}"; do
  # Each path is its own argument: quoting two paths inside one string hands
  # the encoder a single file name containing a space.
  # NOTE(review): assumes spm_encode.py accepts several paths after
  # --inputs/--outputs -- confirm against the script's argument parser.
  # The length filter is applied to the training data only.
  python "$SPM_ENCODE" \
    --model "$DATA/sentencepiece.bpe.model" \
    --output_format=piece \
    --inputs "$DATA/train.${SRC}-${TGT}.${SRC}" "$DATA/train.${SRC}-${TGT}.${TGT}" \
    --outputs "$DATA/train.bpe.${SRC}-${TGT}.${SRC}" "$DATA/train.bpe.${SRC}-${TGT}.${TGT}" \
    --min-len "$TRAIN_MINLEN" --max-len "$TRAIN_MAXLEN"
  python "$SPM_ENCODE" \
    --model "$DATA/sentencepiece.bpe.model" \
    --output_format=piece \
    --inputs "$DATA/valid.${SRC}-${TGT}.${SRC}" "$DATA/valid.${SRC}-${TGT}.${TGT}" \
    --outputs "$DATA/valid.bpe.${SRC}-${TGT}.${SRC}" "$DATA/valid.bpe.${SRC}-${TGT}.${TGT}"
done
0 commit comments