Quality improvements (mozilla#29)
eu9ene authored Dec 6, 2021
1 parent a09b0ac commit 3b3f33b
Showing 67 changed files with 912 additions and 871 deletions.
Binary file added DAG.pdf
91 changes: 35 additions & 56 deletions Makefile
@@ -11,9 +11,12 @@ WORKSPACE=12000
CLUSTER_CORES=16
CONFIG=configs/config.prod.yml
CONDA_PATH=$(SHARED_ROOT)/mambaforge
SNAKEMAKE_OUTPUT_CACHE=$(SHARED_ROOT)/cache
TARGET=
###

CONDA_ACTIVATE=source $(CONDA_PATH)/etc/profile.d/conda.sh ; conda activate ; conda activate
SNAKEMAKE=export SNAKEMAKE_OUTPUT_CACHE=$(SNAKEMAKE_OUTPUT_CACHE); snakemake

### 2. setup

@@ -26,7 +29,8 @@ conda:

snakemake:
$(CONDA_ACTIVATE) base
mamba create -c conda-forge -c bioconda -n snakemake snakemake==6.9.1 --yes
mamba create -c conda-forge -c bioconda -n snakemake snakemake==6.10.0 --yes
mkdir -p "$(SNAKEMAKE_OUTPUT_CACHE)"

# build container image for cluster and run-local modes (preferred)
build:
@@ -44,64 +48,78 @@ pull:

dry-run:
$(CONDA_ACTIVATE) snakemake
snakemake \
$(SNAKEMAKE) \
--use-conda \
--cores all \
--cache \
--reason \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true \
-n
-n \
$(TARGET)

run-local:
echo "Running with config $(CONFIG)"
$(CONDA_ACTIVATE) snakemake
snakemake \
$(SNAKEMAKE) \
--use-conda \
--reason \
--cores all \
--cache \
--resources gpu=$(GPUS) \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true \
$(TARGET)

test: CONFIG=configs/config.test.yml
test: run-local

run-local-container:
$(CONDA_ACTIVATE) snakemake
module load singularity
snakemake \
$(SNAKEMAKE) \
--use-conda \
--use-singularity \
--reason \
--cores all \
--cache \
--resources gpu=$(GPUS) \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
--singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR) --nv"
--singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR) --nv" \
$(TARGET)

run-slurm:
$(CONDA_ACTIVATE) snakemake
chmod +x profiles/slurm/*
snakemake \
$(SNAKEMAKE) \
--use-conda \
--reason \
--cores $(CLUSTER_CORES) \
--cache \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
--profile=profiles/slurm
--profile=profiles/slurm \
$(TARGET)

run-slurm-container:
$(CONDA_ACTIVATE) snakemake
chmod +x profiles/slurm/*
module load singularity
snakemake \
$(SNAKEMAKE) \
--use-conda \
--use-singularity \
--reason \
--verbose \
--cores $(CLUSTER_CORES) \
--cache \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
--profile=profiles/slurm \
--singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR),/tmp --nv --containall"
--singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR),/tmp --nv --containall" \
$(TARGET)
# if CPU nodes don't have access to cuda dirs, use
# export CUDA_DIR=$(CUDA_DIR)
# export CUDA_DIR=$(CUDA_DIR); $(SNAKEMAKE) \
# --singularity-args="--bind $(SHARED_ROOT),/tmp --nv --containall"


@@ -123,25 +141,11 @@ run-file-server:
### extra

dag:
snakemake --dag | dot -Tpdf > DAG.pdf

lint:
snakemake --lint

install-monitor:
$(CONDA_ACTIVATE) base
conda create --name panoptes
conda install -c panoptes-organization panoptes-ui

run-monitor:
$(CONDA_ACTIVATE) panoptes
panoptes

run-with-monitor:
snakemake \
--use-conda \
--cores all \
--wms-monitor http://127.0.0.1:5000
--dag \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
| dot -Tpdf > DAG.pdf

install-tensorboard:
$(CONDA_ACTIVATE) base
@@ -151,29 +155,4 @@ tensorboard:
$(CONDA_ACTIVATE) tensorboard
ls -d $(SHARED_ROOT)/models/*/*/* > tb-monitored-jobs; \
tensorboard --logdir=$$MODELS --host=0.0.0.0 &; \
python utils/tb_log_parser.py --prefix=

install-snakepit-scheduler:
mkdir -p $(SHARED_ROOT)/snakepit
cd $(SHARED_ROOT)/snakepit

curl -sL https://deb.nodesource.com/setup_12.x | sudo -E bash -
sudo apt install nodejs

if [ ! -e snakepit-client ]; then
git clone https://github.com/mozilla/snakepit-client.git
fi
cd snakepit-client
npm install
sudo npm link

echo "http://10.2.224.243" > /root/.pitconnect.txt

pit status

run-snakepit:
chmod +x profiles/snakepit/*
snakemake \
--use-conda \
--cores all \
--profile=profiles/snakepit
python utils/tb_log_parser.py --prefix=
156 changes: 145 additions & 11 deletions README.md
@@ -128,15 +128,24 @@ make dry-run

### Local mode

Without containerization:
#### Without containerization

```
make run-local
```
With containerization:
To test the whole pipeline end to end (it is supposed to run quickly and does not train anything useful):

```
make test
```
Or run
#### With containerization
```
make run-local-container
```



### Cluster mode

To run on Slurm
@@ -149,6 +158,18 @@ with containerization (recommended):
```
make run-slurm-container
```
### Specific target

By default, all Snakemake rules are executed. To run the pipeline up to a specific rule, use:
```
make <run-command> TARGET=<non-wildcard-rule>
```

For example, to run the pipeline only up to corpus merging:
```
make run-local TARGET=merge_corpus
```


### Using Snakepit

@@ -209,20 +230,23 @@ Step | Description | Bottleneck | Comments
--- | --- | --- | ---
Installation | Installing dependencies and compiling | CPU | Takes ~1 hour
Data downloading | Downloads datasets, samples sentences | Network, Disk | Time depends on dataset size, sampling of huge mono datasets (100M+ sentences) is the most intensive operation.
Data cleaning | Basic preprocessing, language specific, rule based, deduplication, and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/clean_parallel.py).
Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it uses bicleaner. If there are no ones for bicleaner either, this step is skipped. Cleaning threshold is controlled by `BICLEANER_THRESHOLD` config setting.
Data cleaning | Basic preprocessing, dataset-specific, language-specific, rule-based and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/tools/clean_parallel.py).
Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it falls back to bicleaner. If there are none for bicleaner either, this step is skipped. Cleaning thresholds are configurable per dataset, see [Dataset cleaning](#dataset-cleaning).
Merge and dedupe | Merges clean datasets and applies deduplication | CPU, Disk |
Training s2s | Trains a backward shallow s2s model, which is useful for back-translations and ce-filtering | GPU | Inspired by a [marian example](https://github.com/marian-nmt/marian-examples/tree/master/training-basics-sentencepiece).
Augmentation with back-translations | Translates mono corpus combined from `MONO_DATASETS_TRG` using shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others.
Training teacher | Trains one or multiple big transformer models | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on datasets size. Inspired by [transformer](https://github.com/marian-nmt/marian-examples/tree/master/transformer) and [wmt2017-uedin](https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin) marian examples and extended with [SentencePiece](https://github.com/google/sentencepiece).
Augmentation with back-translations | Translates mono corpus combined from monolingual datasets in target language using shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others.
Training teacher | Trains an ensemble of big transformer models on augmented dataset | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) or `after-epochs` parameters depending on datasets size.
Continue training teacher | Continue training an ensemble of teachers on parallel data only | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on datasets size.
Translation by teacher | Translates a corpus and monolingual data combined from `MONO_DATASETS_SRC` using the teacher model (ensemble is not supported yet) | GPU | The slowest part of the pipeline. Can take days. It is possible to speed it up launching the same scripts ([corpus](pipeline/translate/translate-corpus.sh), [mono](pipeline/translate/translate-mono.sh)) in parallel from another machine with access to the same network directory.
Cross-entropy filtering | Scores translated corpus with backward s2s model and removes a part of the corpus with the lowest scores to reduce noise | GPU, CPU, Disk | At this point we work with huge datasets, so it utilizes copying to a local disk to make things faster.
Training alignments and shortlist | Trains alignments using [fast_align](https://github.com/clab/fast_align) and extracts lexical shortlist using [extract_lex](https://github.com/marian-nmt/extract-lex) tool | CPU, Disk | Some tools require uncompressed datasets on disk and they are huge at this point. Data is copied to a local disk to make things faster. Might take 100+GB of local disk depending on the dataset size. Good CPU parallelization.
Training student | Trains a small transformer student model on filtered data and using alignments | GPU | Run [Tensorboard](utils/tensorboard/tensorboard.sh) manually to see training visualization.
Training student | Trains a small transformer student model on filtered data and using alignments | GPU |
Fine-tuning student | Fine-tunes the student model by emulating 8-bit GEMM during training | GPU | Converges very quickly and then degrades. It's quick but you might want to reduce the early stopping threshold.
Quantization | Applies 8-bit quantization to the fine-tuned student model and evaluates on CPU | CPU | CPU threads must be set to 1 for this step.
Evaluation | Calculates metrics for all models (BLEU, chrf) using [SacreBLEU](https://github.com/mjpost/sacrebleu) | GPU | Uses `datasets.test` configuration section.
Export | Exports trained model and shortlist to [bergamot-translator](https://github.com/mozilla/bergamot-translator) format | |

## Datasets importers
## Dataset importers

Dataset importers can be used in `datasets` sections of experiment config.

@@ -256,6 +280,119 @@ Example:
Just add a shell script to [corpus](pipeline/data/importers/corpus) or [mono]() which is named as `<prefix>.sh`
and accepts the same parameters as the other scripts from the same folder.
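
A minimal sketch of such an importer is shown below. The argument layout (source language, target language, output path prefix, dataset name) and the download URL are assumptions for illustration; mirror an existing script from the same folder for the exact interface.

```
#!/bin/bash
# Hypothetical importer sketch: pipeline/data/importers/corpus/mycorpus.sh
# The argument order is an assumption; copy it from an existing importer in this folder.
set -x
set -euo pipefail

src=$1             # source language code, e.g. "ru"
trg=$2             # target language code, e.g. "en"
output_prefix=$3   # where to save the corpus, without the language suffix
dataset=$4         # dataset identifier, i.e. the part after the "mycorpus_" prefix in the config

# Download each side of the parallel corpus and store it gzip-compressed
# as <output_prefix>.<lang>.gz (placeholder URL).
wget -O "${output_prefix}.${src}.gz" "https://example.com/${dataset}.${src}.gz"
wget -O "${output_prefix}.${trg}.gz" "https://example.com/${dataset}.${trg}.gz"
```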

## Dataset fixing

Some datasets require fixes like detokenization. Dataset- and language-specific fixes are implemented in [pipeline/clean/fixes](pipeline/clean/fixes).
Naming convention:
- `<dataset_name>.sh` for parallel dataset cleaning
- `<dataset_name>.<lang>.sh` for language specific cleaning of parallel or monolingual dataset
- `/` in dataset name should be replaced with `_`
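
For example, a hypothetical fix script might look like the sketch below. It assumes, as the existing fixes in that folder do, that a fix script is a filter reading the dataset from stdin and writing the fixed version to stdout; the file name and sed rules are placeholders.

```
#!/bin/bash
# Hypothetical fix sketch: pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.en.sh
# Assumption: fix scripts are stdin -> stdout filters, one sentence per line.
set -euo pipefail

# Undo simple tokenization artifacts: "word ." -> "word." and "don ' t" -> "don't"
sed -e 's/ \([.,!?;:]\)/\1/g' \
    -e "s/ ' /'/g"
```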

## Dataset cleaning
Some parallel datasets require more aggressive filtering.
Dataset-specific Bicleaner thresholds can be set in the config. Example:

```yaml
experiment:
  ...
  bicleaner:
    default-threshold: 0.5
    dataset-thresholds:
      mtdata_neulab_tedtalksv1_train: 0.6
```

## Utilities

### Tensorboard

To see training graphs, run Tensorboard:

```
make install-tensorboard
make tensorboard
```

Then forward port 6006 from the machine running Tensorboard to your local machine.
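
For example, if the pipeline runs on a remote machine reachable over SSH (the hostname below is a placeholder):

```
# Forward the remote Tensorboard port 6006 to localhost:6006
ssh -N -L 6006:localhost:6006 user@remote-host
# then open http://localhost:6006 in a local browser
```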

## Directory structure

├ data
│ └ ru-en
│ └ test
│ ├ original
│ │ ├ corpus
│ │ │ ├ mtdata_JW300.en.gz
│ │ │ └ mtdata_JW300.ru.gz
│ │ ├ devset
│ │ │ ├ flores_dev.en.gz
│ │ │ └ flores_dev.ru.gz
│ │ ├ eval
│ │ │ ├ sacrebleu_wmt20.en.gz
│ │ │ └ sacrebleu_wmt20.ru.gz
│ │ ├ mono
│ │ │ ├ news-crawl_news.2020.ru.gz
│ │ │ └ news-crawl_news.2020.en.gz
│ │ ├ devset.ru.gz
│ │ └ devset.en.gz
│ ├ clean
│ │ ├ corpus
│ │ │ ├ mtdata_JW300.en.gz
│ │ │ └ mtdata_JW300.ru.gz
│ │ ├ mono
│ │ │ ├ news-crawl_news.2020.ru.gz
│ │ │ └ news-crawl_news.2020.en.gz
│ │ ├ mono.ru.gz
│ │ └ mono.en.gz
│ ├ biclean
│ │ ├ corpus
│ │ │ ├ mtdata_JW300.en.gz
│ │ │ └ mtdata_JW300.ru.gz
│ │ ├ corpus.ru.gz
│ │ ├ corpus.en.gz
│ ├ translated
│ │ ├ mono.ru.gz
│ │ └ mono.en.gz
│ ├ augmented
│ │ ├ corpus.ru.gz
│ │ └ corpus.en.gz
│ ├ alignment
│ │ ├ corpus.aln.gz
│ │ └ lex.s2t.pruned.gz
│ ├ merged
│ │ ├ corpus.ru.gz
│ │ └ corpus.en.gz
│ └ filtered
│ ├ corpus.ru.gz
│ └ corpus.en.gz
├ models
│ ├ ru-en
│ │ └ test
│ │ ├ teacher
│ │ ├ student
│ │ ├ student-finetuned
│ │ ├ speed
│ │ ├ evaluation
│ │ │ ├ backward
│ │ │ ├ teacher0
│ │ │ ├ teacher1
│ │ │ ├ teacher-ensemble
│ │ │ ├ student
│ │ │ ├ student-finetuned
│ │ │ └ speed
│ │ └ exported
│ ├ en-ru
│ └ test
│ └ backward
├ experiments
│ └ ru-en
│ └ test
│ └ config.sh
├ logs
│ └ ru-en
│ └ test
│ └ clean_corpus.log

## Development

### Architecture
@@ -271,9 +408,6 @@ Snakemake parallelizes steps that can be executed simultaneously. It is especially
The main snakemake process (scheduler) should be launched interactively. It runs job processes on the worker nodes in cluster mode or on a local machine in local mode.

### Conventions

- All scripts work with respect to the repo root directory.
  This avoids having to think about relative paths and execution folders.

- Scripts inside the `pipeline` directory are independent and operate only using input arguments, input files
and global envs.