diff --git a/DAG.pdf b/DAG.pdf new file mode 100644 index 000000000..d1c5a539f Binary files /dev/null and b/DAG.pdf differ diff --git a/Makefile b/Makefile index 718cbd0db..ef05855d4 100644 --- a/Makefile +++ b/Makefile @@ -11,9 +11,12 @@ WORKSPACE=12000 CLUSTER_CORES=16 CONFIG=configs/config.prod.yml CONDA_PATH=$(SHARED_ROOT)/mambaforge +SNAKEMAKE_OUTPUT_CACHE=$(SHARED_ROOT)/cache +TARGET= ### CONDA_ACTIVATE=source $(CONDA_PATH)/etc/profile.d/conda.sh ; conda activate ; conda activate +SNAKEMAKE=export SNAKEMAKE_OUTPUT_CACHE=$(SNAKEMAKE_OUTPUT_CACHE); snakemake ### 2. setup @@ -26,7 +29,8 @@ conda: snakemake: $(CONDA_ACTIVATE) base - mamba create -c conda-forge -c bioconda -n snakemake snakemake==6.9.1 --yes + mamba create -c conda-forge -c bioconda -n snakemake snakemake==6.10.0 --yes + mkdir -p "$(SNAKEMAKE_OUTPUT_CACHE)" # build container image for cluster and run-local modes (preferred) build: @@ -44,64 +48,78 @@ pull: dry-run: $(CONDA_ACTIVATE) snakemake - snakemake \ + $(SNAKEMAKE) \ --use-conda \ --cores all \ + --cache \ --reason \ --configfile $(CONFIG) \ --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true \ - -n + -n \ + $(TARGET) run-local: + echo "Running with config $(CONFIG)" $(CONDA_ACTIVATE) snakemake - snakemake \ + $(SNAKEMAKE) \ --use-conda \ --reason \ --cores all \ + --cache \ --resources gpu=$(GPUS) \ --configfile $(CONFIG) \ - --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true + --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true \ + $(TARGET) + +test: CONFIG=configs/config.test.yml +test: run-local run-local-container: $(CONDA_ACTIVATE) snakemake module load singularity - snakemake \ + $(SNAKEMAKE) \ --use-conda \ --use-singularity \ --reason \ --cores all \ + --cache \ --resources gpu=$(GPUS) \ --configfile $(CONFIG) \ --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \ - --singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR) --nv" + --singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR) --nv" \ + $(TARGET) run-slurm: $(CONDA_ACTIVATE) snakemake chmod +x profiles/slurm/* - snakemake \ + $(SNAKEMAKE) \ --use-conda \ --reason \ --cores $(CLUSTER_CORES) \ + --cache \ --configfile $(CONFIG) \ --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \ - --profile=profiles/slurm + --profile=profiles/slurm \ + $(TARGET) run-slurm-container: $(CONDA_ACTIVATE) snakemake chmod +x profiles/slurm/* module load singularity - snakemake \ + $(SNAKEMAKE) \ --use-conda \ --use-singularity \ --reason \ --verbose \ --cores $(CLUSTER_CORES) \ + --cache \ --configfile $(CONFIG) \ --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \ --profile=profiles/slurm \ - --singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR),/tmp --nv --containall" + --singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR),/tmp --nv --containall" \ + $(TARGET) # if CPU nodes don't have access to cuda dirs, use -# export CUDA_DIR=$(CUDA_DIR) +# export CUDA_DIR=$(CUDA_DIR); $(SNAKEMAKE) \ # --singularity-args="--bind $(SHARED_ROOT),/tmp --nv --containall" @@ -123,25 +141,11 @@ run-file-server: ### extra dag: - snakemake --dag | dot -Tpdf > DAG.pdf - -lint: - snakemake --lint - -install-monitor: - $(CONDA_ACTIVATE) base - conda create --name panoptes - conda install -c panoptes-organization panoptes-ui - -run-monitor: - $(CONDA_ACTIVATE) panoptes - panoptes - -run-with-monitor: snakemake \ - 
--use-conda \ - --cores all \ - --wms-monitor http://127.0.0.1:5000 + --dag \ + --configfile $(CONFIG) \ + --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \ + | dot -Tpdf > DAG.pdf install-tensorboard: $(CONDA_ACTIVATE) base @@ -151,29 +155,4 @@ tensorboard: $(CONDA_ACTIVATE) tensorboard ls -d $(SHARED_ROOT)/models/*/*/* > tb-monitored-jobs; \ tensorboard --logdir=$$MODELS --host=0.0.0.0 &; \ - python utils/tb_log_parser.py --prefix= - -install-snakepit-scheduler: - mkdir -p $(SHARED_ROOT)/snakepit - cd $(SHARED_ROOT)/snakepit - - curl -sL https://deb.nodesource.com/setup_12.x | sudo -E bash - - sudo apt install nodejs - - if [ ! -e snakepit-client ]; then - git clone https://github.com/mozilla/snakepit-client.git - fi - cd snakepit-client - npm install - sudo npm link - - echo "http://10.2.224.243" > /root/.pitconnect.txt - - pit status - -run-snakepit: - chmod +x profiles/snakepit/* - snakemake \ - --use-conda \ - --cores all \ - --profile=profiles/snakepit + python utils/tb_log_parser.py --prefix= \ No newline at end of file diff --git a/README.md b/README.md index 97dd07078..18a152a7f 100644 --- a/README.md +++ b/README.md @@ -128,15 +128,24 @@ make dry-run ### Local mode -Without containerization: +#### Without containerization + ``` make run-local ``` -With containerization: +To test the whole pipeline end to end (it supposed to run quickly and does not train anything useful): + +``` +make test +``` +Or run +#### With containerization ``` make run-local-container ``` + + ### Cluster mode To run on Slurm @@ -149,6 +158,18 @@ with containerization (recommended): ``` make run-slurm-container ``` +### Specific target + +By default, all Snakemake rules are executed. To run the pipeline up to a specific rule use: +``` +make TARGET= +``` + +For example, collect corpus first: +``` +make run-local TARGET=merge_corpus +``` + ### Using Snakepit @@ -209,20 +230,23 @@ Step | Description | Bottleneck | Comments --- | --- | --- | --- Installation | Installing dependencies and compiling | CPU | Takes ~1 hour Data downloading | Downloads datasets, samples sentences | Network, Disk | Time depends on dataset size, sampling of huge mono datasets (100M+ sentences) is the most intensive operation. -Data cleaning | Basic preprocessing, language specific, rule based, deduplication, and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/clean_parallel.py). -Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it uses bicleaner. If there are no ones for bicleaner either, this step is skipped. Cleaning threshold is controlled by `BICLEANER_THRESHOLD` config setting. +Data cleaning | Basic preprocessing, dataset specific, language specific, rule based and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/tools/clean_parallel.py). 
+Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it uses bicleaner. If there are none for bicleaner either, this step is skipped. Cleaning thresholds are configurable per dataset, see [Dataset cleaning](#dataset-cleaning). +Merge and dedupe | Merges clean datasets and applies deduplication | CPU, Disk | Training s2s | Trains a backward shallow s2s model, which is useful for back-translations and ce-filtering | GPU | Inspired by a [marian example](https://github.com/marian-nmt/marian-examples/tree/master/training-basics-sentencepiece). -Augmentation with back-translations | Translates mono corpus combined from `MONO_DATASETS_TRG` using shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others. -Training teacher | Trains one or multiple big transformer models | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on datasets size. Inspired by [transformer](https://github.com/marian-nmt/marian-examples/tree/master/transformer) and [wmt2017-uedin](https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin) marian examples and extended with [SentencePiece](https://github.com/google/sentencepiece). +Augmentation with back-translations | Translates a mono corpus combined from monolingual datasets in the target language using the shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others. +Training teacher | Trains an ensemble of big transformer models on the augmented dataset | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) or `after-epochs` parameters depending on dataset size. +Continue training teacher | Continues training an ensemble of teachers on parallel data only | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on dataset size. Translation by teacher | Translates a corpus and monolingual data combined from `MONO_DATASETS_SRC` using the teacher model (ensemble is not supported yet) | GPU | The slowest part of the pipeline. Can take days. It is possible to speed it up by launching the same scripts ([corpus](pipeline/translate/translate-corpus.sh), [mono](pipeline/translate/translate-mono.sh)) in parallel from another machine with access to the same network directory. Cross-entropy filtering | Scores translated corpus with backward s2s model and removes a part of the corpus with the lowest scores to reduce noise | GPU, CPU, Disk | At this point we work with huge datasets, so it utilizes copying to a local disk to make things faster. Training alignments and shortlist | Trains alignments using [fast_align](https://github.com/clab/fast_align) and extracts lexical shortlist using [extract_lex](https://github.com/marian-nmt/extract-lex) tool | CPU, Disk | Some tools require uncompressed datasets on disk and they are huge at this point. Data is copied to a local disk to make things faster. Might take 100+GB of local disk depending on a dataset size. Good CPU parallelization.
-Training student | Trains a small transformer student model on filtered data and using alignments | GPU | Run [Tensorboard](utils/tensorboard/tensorboard.sh) manually to see training visualization. +Training student | Trains a small transformer student model on filtered data and using alignments | GPU | Fine-tuning student | Fine-tunes the student model by emulating 8bit GEMM during training | GPU | Converges very quickly and then degrades. It's quick but you might want to reduce the early stopping threshold. Quantization | Applies 8 bit quantization to the fine-tuned student model and evaluates on CPU | CPU | CPU threads must be set to 1 for this step. +Evaluation | Calculates metrics for all models (BLEU, chrF) using [SacreBLEU](https://github.com/mjpost/sacrebleu) | GPU | Uses the `datasets.test` configuration section. Export | Exports the trained model and shortlist to [bergamot-translator](https://github.com/mozilla/bergamot-translator) format | | -## Datasets importers +## Dataset importers Dataset importers can be used in `datasets` sections of experiment config. @@ -256,6 +280,119 @@ Example: Just add a shell script to [corpus](pipeline/data/importers/corpus) or [mono]() which is named as `.sh` and accepts the same parameters as the other scripts from the same folder. +## Dataset fixing + +Some datasets require fixes like detokenization. Dataset and language specific fixes are implemented in [pipeline/clean/fixes](pipeline/clean/fixes). +Naming convention: +- `.sh` for parallel dataset cleaning +- `..sh` for language specific cleaning of parallel or monolingual dataset +- `/` in dataset name should be replaced with `_` + +## Dataset cleaning +Some parallel datasets require more aggressive filtering. +Dataset specific Bicleaner thresholds can be set in the config. Example (see also the lookup sketch after the Utilities section): + +```yaml +experiment: +... + bicleaner: + default-threshold: 0.5 + dataset-thresholds: + mtdata_neulab_tedtalksv1_train: 0.6 +``` + +## Utilities + +### Tensorboard + +To see training graphs, run Tensorboard: + +``` +make install-tensorboard +make tensorboard +``` + +Then forward port 6006.
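For illustration, here is a minimal Python sketch of how such a per-dataset threshold lookup can work: a dataset-specific value is used when present, otherwise `default-threshold` applies. The config dict and function names below are hypothetical; the Snakefile in this PR performs the equivalent lookup inside the `bicleaner` rule's `params` section.

```python
# Illustrative only: resolve a bicleaner threshold from a config fragment
# like the one shown in the "Dataset cleaning" section above.
bicleaner_config = {
    "default-threshold": 0.5,
    "dataset-thresholds": {
        "mtdata_neulab_tedtalksv1_train": 0.6,
    },
}

def dataset_norm(name: str) -> str:
    # '/' in dataset names is replaced with '_' (see "Dataset fixing" above).
    return name.replace("/", "_")

def bicleaner_threshold(dataset: str) -> float:
    thresholds = bicleaner_config.get("dataset-thresholds") or {}
    return thresholds.get(dataset, bicleaner_config["default-threshold"])

print(bicleaner_threshold("mtdata_neulab_tedtalksv1_train"))  # 0.6
print(bicleaner_threshold("opus_ada83/v1"))                   # 0.5 (default)
print(dataset_norm("opus_CCAligned/v1"))                      # opus_CCAligned_v1
```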
+ +## Directory structure + + ├ data + │ └ ru-en + │ └ test + │ ├ original + │ │ ├ corpus + │ │ │ ├ mtdata_JW300.en.gz + │ │ │ └ mtdata_JW300.ru.gz + │ │ ├ devset + │ │ │ ├ flores_dev.en.gz + │ │ │ └ flores_dev.ru.gz + │ │ ├ eval + │ │ │ ├ sacrebleu_wmt20.en.gz + │ │ │ └ sacrebleu_wmt20.ru.gz + │ │ ├ mono + │ │ │ ├ news-crawl_news.2020.ru.gz + │ │ │ └ news-crawl_news.2020.en.gz + │ │ ├ devset.ru.gz + │ │ └ devset.en.gz + │ ├ clean + │ │ ├ corpus + │ │ │ ├ mtdata_JW300.en.gz + │ │ │ └ mtdata_JW300.ru.gz + │ │ ├ mono + │ │ │ ├ news-crawl_news.2020.ru.gz + │ │ │ └ news-crawl_news.2020.en.gz + │ │ ├ mono.ru.gz + │ │ └ mono.en.gz + │ ├ biclean + │ │ ├ corpus + │ │ │ ├ mtdata_JW300.en.gz + │ │ │ └ mtdata_JW300.ru.gz + │ │ ├ corpus.ru.gz + │ │ ├ corpus.en.gz + │ ├ translated + │ │ ├ mono.ru.gz + │ │ └ mono.en.gz + │ ├ augmented + │ │ ├ corpus.ru.gz + │ │ └ corpus.en.gz + │ ├ alignment + │ │ ├ corpus.aln.gz + │ │ └ lex.s2t.pruned.gz + │ ├ merged + │ │ ├ corpus.ru.gz + │ │ └ corpus.en.gz + │ └ filtered + │ ├ corpus.ru.gz + │ └ corpus.en.gz + ├ models + │ ├ ru-en + │ │ └ test + │ │ ├ teacher + │ │ ├ student + │ │ ├ student-finetuned + │ │ ├ speed + │ │ ├ evaluation + │ │ │ ├ backward + │ │ │ ├ teacher0 + │ │ │ ├ teacher1 + │ │ │ ├ teacher-ensemble + │ │ │ ├ student + │ │ │ ├ student-finetuned + │ │ │ └ speed + │ │ └ exported + │ ├ en-ru + │ └ test + │ └ backward + │ + ├ experiments + │ └ ru-en + │ └ test + │ └ config.sh + ├ logs + │ └ ru-en + │ └ test + │ └ clean_corpus.log + ## Development ### Architecture @@ -271,9 +408,6 @@ Snakemake parallelizes steps that can be executed simultniously. It is especiall The main snakemkae process (scheduler) should be launched interactively. It runs job processes on the worker nodes in cluster mode or on a local machine in local mode. ### Conventions - -- All scripts work with respect to repo root directory. - It allows to not think about relative paths and execution folders. - Scripts inside the `pipeline` directory are independent and operate only using input arguments, input files and global envs. 
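To make the last convention concrete, here is a schematic Snakemake rule in the style of the rules in the Snakefile changed below. The rule name, script path, and file patterns are placeholders; the pattern of explicit inputs and outputs, a conda environment, and a shell call to a standalone script under `pipeline/` with its log captured mirrors the real rules.

```
# Schematic only: the rule name, the script path and the file patterns are
# placeholders; log_dir, original, clean and src are variables defined at
# the top of the Snakefile.
rule clean_example:
    message: "Cleaning an example dataset"
    log: f"{log_dir}/clean_example/{{dataset}}.log"
    conda: "envs/base.yml"
    threads: 4
    input: f"{original}/corpus/{{dataset}}.{src}.gz"
    output: f"{clean}/corpus/{{dataset}}.{src}.gz"
    shell: 'bash pipeline/clean/example.sh "{input}" "{output}" {threads} >> {log} 2>&1'
```

Because each script receives everything it needs through arguments, input files, and exported environment variables, it can also be run and debugged outside of Snakemake.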
diff --git a/Snakefile b/Snakefile index c26224b4b..37bcbbc68 100644 --- a/Snakefile +++ b/Snakefile @@ -15,75 +15,6 @@ min_version("6.6.1") container: 'Singularity.sif' - -# Directories structure -# -#├ data -#│ ├ cache -#│ │ ├ corpus -#│ │ │ └ opus -#│ │ │ ├ ada83_v1.en.gz -#│ │ │ └ ada83_v1.ru.gz -#│ │ └ mono -#│ │ └ news-crawl -#│ │ ├ news.2019.ru.gz -#│ │ └ news.2019.en.gz -#│ └ ru-en -#│ └ test -#│ ├ original -#│ │ ├ corpus.ru.gz -#│ │ ├ corpus.en.gz -#│ │ ├ mono.ru.gz -#│ │ ├ mono.en.gz -#│ │ ├ devset.ru.gz -#│ │ └ devset.en.gz -#│ ├ evaluation -#│ │ ├ wmt12.ru -#│ │ ├ wmt12.en -#│ │ ├ wmt20.ru -#│ │ ├ wmt20.en -#│ ├ clean -#│ │ ├ corpus.ru.gz -#│ │ ├ corpus.en.gz -#│ │ ├ mono.ru.gz -#│ │ └ mono.en.gz -#│ ├ biclean -#│ │ ├ corpus.ru.gz -#│ │ ├ corpus.en.gz -#│ ├ translated -#│ │ ├ mono.ru.gz -#│ │ └ mono.en.gz -#│ ├ augmented -#│ │ ├ corpus.ru.gz -#│ │ └ corpus.en.gz -#│ ├ alignment -#│ │ ├ corpus.aln.gz -#│ │ └ lex.s2t.pruned.gz -#│ ├ merged -#│ │ ├ corpus.ru.gz -#│ │ └ corpus.en.gz -#│ └ filtered -#│ ├ corpus.ru.gz -#│ └ corpus.en.gz -#├ models -#│ ├ ru-en -#│ │ └ test -#│ │ ├ teacher -#│ │ ├ student -#│ │ ├ student-finetuned -#│ │ ├ speed -#│ │ └ exported -#│ ├ en-ru -#│ └ test -#│ └ s2s -#│ -#├ experiments -#│ └ ru-en -#│ └ test -#│ └ config.sh -#├ logs - - install_deps = config['deps'] == 'true' data_root_dir = config['root'] cuda_dir = config['cuda'] @@ -97,15 +28,17 @@ experiment = config['experiment']['name'] mono_max_sent_src = config['experiment']['mono-max-sentences-src'] mono_max_sent_trg = config['experiment']['mono-max-sentences-trg'] -bicleaner_threshold = config['experiment']['bicleaner-threshold'] -backward_model = config['experiment']['backward-model'] +bicl_default_threshold = config['experiment']['bicleaner']['default-threshold'] +bicl_dataset_thresholds = config['experiment']['bicleaner']['dataset-thresholds'] +backward_pretrained = config['experiment']['backward-model'] experiment_dir=f"{data_root_dir}/experiments/{src}-{trg}/{experiment}" # training -training_args = "" +training_args = {} if 'training' in config: - training_args = ' '.join([f'--{k} {v}' for k,v in config['training'].items()]) + training_args = {name: ' '.join([f'--{k} {v}' for k,v in conf.items() ]) + for name, conf in config['training'].items()} # datasets train_datasets = config['datasets']['train'] @@ -114,6 +47,9 @@ eval_datasets = config['datasets']['test'] mono_src_datasets = config['datasets']['mono-src'] mono_trg_datasets = config['datasets']['mono-trg'] +mono_datasets = {src: mono_src_datasets, trg: mono_trg_datasets} +mono_max_sent = {src: mono_max_sent_src, trg: mono_max_sent_trg} + # parallelization gpus = ' '.join([str(n) for n in range(int(gpus_num))]) ensemble = list(range(config['experiment']['teacher-ensemble'])) @@ -137,7 +73,6 @@ clean = f"{data_dir}/clean" biclean = f"{data_dir}/biclean" cache_dir = f"{data_dir}/cache" original = f"{data_dir}/original" -evaluation = f"{data_dir}/evaluation" translated = f"{data_dir}/translated" augmented = f"{data_dir}/augmented" merged = f"{data_dir}/merged" @@ -151,13 +86,22 @@ student_dir = f"{models_dir}/student" student_finetuned_dir = f"{models_dir}/student-finetuned" speed = f"{models_dir}/speed" exported = f"{models_dir}/exported" -best_model = "model.npz.best-bleu-detok.npz" -s2s=f'{models_dir}/s2s' - +best_model = f"model.npz.best-{config['experiment']['best-model']}.npz" +backward = f'{models_dir}/backward' + +#evaluation +eval_data = f"{original}/eval" +eval_res = f"{models_dir}/evaluation" +eval_backward = 
f'{eval_res}/backward' +eval_student = f'{eval_res}/student', +eval_student_finetuned = f'{eval_res}/student-finetuned', +eval_speed = f'{eval_res}/speed', +eval_teacher_ens = f'{eval_res}/teacher-ensemble', +full_eval_datasets = expand(f'{eval_data}/{{dataset}}.{{lang}}.gz', dataset=eval_datasets, lang=[src,trg]) # set common environment variables envs = f'''SRC={src} TRG={trg} MARIAN="{marian_dir}" GPUS="{gpus}" WORKSPACE={workspace} \ -CLEAN_TOOLS=pipeline/clean/tools BIN="{bin}" DATA_ROOT_DIR="{data_root_dir}" \ +BIN="{bin}" DATA_ROOT_DIR="{data_root_dir}" \ CUDA_DIR="{cuda_dir}"''' ### workflow options @@ -166,22 +110,25 @@ results = [f'{exported}/model.{src}{trg}.intgemm.alphas.bin.gz', f'{exported}/lex.50.50.{src}{trg}.s2t.bin.gz', f'{exported}/vocab.{src}{trg}.spm.gz', f'{experiment_dir}/config.yml', - expand(f'{teacher_dir}{{ens}}/eval',ens=ensemble), - f'{student_dir}/eval', - f'{student_finetuned_dir}/eval', - f'{speed}/eval', + expand(f'{eval_res}/teacher{{ens}}',ens=ensemble), + f'{eval_res}/student', + f'{eval_res}/student-finetuned', + f'{eval_res}/speed' ] +if len(ensemble) > 1: + results.append(f'{eval_res}/teacher-ensemble') + if install_deps: results.append("/tmp/flags/setup.done") -if not backward_model: - backward_model = s2s +if not backward_pretrained: # don't evaluate pretrained model - results.append(f'{backward_model}/eval') - train_s2s=True + results.append(eval_backward) + train_backward=True else: - train_s2s = False + train_backward = False + backward = backward_pretrained # bicleaner @@ -189,25 +136,29 @@ bicleaner_type = packs.find(src, trg) bicleaner_env = "envs/bicleaner-ai.yml" if bicleaner_type == 'bicleaner-ai' else 'envs/bicleaner.yml' if bicleaner_type: - clean_corpus_src = f"{biclean}/corpus.{src}.gz" - clean_corpus_trg = f"{biclean}/corpus.{trg}.gz" + clean_corpus_prefix = f'{biclean}/corpus' teacher_corpus = f'{biclean}/corpus' use_bicleaner = True else: - clean_corpus_src = f"{clean}/corpus.{src}.gz" - clean_corpus_trg = f"{clean}/corpus.{trg}.gz" + clean_corpus_prefix = f'{clean}/corpus' teacher_corpus = f'{clean}/corpus' use_bicleaner = False +clean_corpus_src = f'{clean_corpus_prefix}.{src}.gz' +clean_corpus_trg = f'{clean_corpus_prefix}.{trg}.gz' + # augmentation if mono_trg_datasets: teacher_corpus = f'{augmented}/corpus' - augment_corpus=True + augment_corpus = True + continue_teacher = True # continue training on parallel corpus + teacher_all_output = 'model.npz' else: - augment_corpus=False - + augment_corpus = False + continue_teacher = False + teacher_all_output = best_model ### rules @@ -216,13 +167,16 @@ def find_parts(wildcards, checkpoint): checkpoint_output = checkpoint.get(**wildcards).output[0] return glob_wildcards(os.path.join(checkpoint_output,"file.{part,\d+}")).part +def dataset_norm(name: str): + return name.replace('/','_') + shell.prefix(f"{envs} ") rule all: input: results localrules: experiment -ruleorder: teacher > eval_teacher +ruleorder: teacher_all > eval_teacher rule experiment: message: "Saving experiment metadata" @@ -245,7 +199,6 @@ if install_deps: output: touch("/tmp/flags/setup.done") # specific to local machine shell: 'bash pipeline/setup/install-deps.sh >> {log} 2>&1' - rule marian: message: "Compiling marian" log: f"{log_dir}/compile-marian.log" @@ -275,71 +228,62 @@ rule extract_lex: output: protected(f"{bin}/extract_lex") shell: 'bash pipeline/setup/compile-extract-lex.sh {extract_lex_build} {threads} >> {log} 2>&1' -# data - -rule data_train: - message: "Downloading training corpus" - log: 
f"{log_dir}/data_train.log" - conda: "envs/base.yml" - threads: 1 - group: 'data' - output: src=f"{original}/corpus.{src}.gz",trg=f"{original}/corpus.{trg}.gz" - params: prefix=f"{original}/corpus" - shell: 'bash pipeline/data/download-corpus.sh "{params.prefix}" "{cache_dir}" train {train_datasets} >> {log} 2>&1' +# data downloading -rule data_val: - message: "Downloading validation corpus" - log: f"{log_dir}/data_val.log" +rule download_corpus: + message: "Downloading parallel corpus" + log: f"{log_dir}/download_corpus/{{kind}}/{{dataset}}.log" conda: "envs/base.yml" threads: 1 group: 'data' - output: src=f"{original}/devset.{src}.gz",trg=f"{original}/devset.{trg}.gz" - params: prefix=f"{original}/devset" - shell: 'bash pipeline/data/download-corpus.sh "{params.prefix}" "{cache_dir}" valid {valid_datasets} >> {log} 2>&1' - -rule data_test: - message: "Downloading test corpus" - log: f"{log_dir}/data_test.log" + cache: False # caching is broken in snakemake + wildcard_constraints: kind="corpus|devset|eval" + output: multiext(f"{original}/{{kind}}/{{dataset}}", f".{src}.gz", f".{trg}.gz") + params: prefix=f"{original}/{{kind}}/{{dataset}}", dataset="{dataset}" + shell: 'bash pipeline/data/download-corpus.sh "{params.dataset}" "{params.prefix}" >> {log} 2>&1' + +rule download_mono: + message: "Downloading monolingual dataset" + log: f"{log_dir}/download_mono/{{dataset}}.{{lang}}.log" conda: "envs/base.yml" threads: 1 group: 'data' - output: expand(f"{evaluation}/{{dataset}}.{{lng}}",dataset=eval_datasets,lng=[src, trg]) - shell: 'bash pipeline/data/download-eval.sh "{evaluation}" "{cache_dir}" {eval_datasets} >> {log} 2>&1' - -rule data_mono_src: - message: "Downloading monolingual dataset for source language" - log: f"{log_dir}/data_mono_src.log" - conda: "envs/base.yml" - threads: 1 - group: 'data' - output: f'{original}/mono.{src}.gz' + cache: False # caching is broken in snakemake + wildcard_constraints: lang=f"{src}|{trg}" + output: f'{original}/mono/{{dataset}}.{{lang}}.gz' + params: max_sent=lambda wildcards: mono_max_sent[wildcards.lang], dataset='{dataset}', lang='{lang}' shell: '''bash pipeline/data/download-mono.sh \ - "{src}" "{mono_max_sent_src}" "{original}/mono" "{cache_dir}" {mono_src_datasets} >> {log} 2>&1''' - -if mono_trg_datasets: - rule data_mono_trg: - message: "Downloading monolingual dataset for target language" - log: f"{log_dir}/data_mono_trg.log" - conda: "envs/base.yml" - threads: 1 - group: 'data' - output: f'{original}/mono.{trg}.gz' - shell: '''bash pipeline/data/download-mono.sh \ - "{trg}" "{mono_max_sent_trg}" "{original}/mono" "{cache_dir}" {mono_trg_datasets} >> {log} 2>&1''' + "{params.dataset}" {params.lang} {params.max_sent} "{output}" >> {log} 2>&1''' # cleaning rule clean_corpus: - message: "Cleaning corpus" - log: f"{log_dir}/clean_corpus.log" + message: "Cleaning dataset" + log: f"{log_dir}/clean_corpus/{{dataset}}.log" conda: "envs/base.yml" + group: "clean_corpus" threads: workflow.cores - input: rules.data_train.output.src,rules.data_train.output.trg - output: src=f"{clean}/corpus.{src}.gz",trg=f"{clean}/corpus.{trg}.gz" - params: prefix_input=f"{original}/corpus",prefix_output=f"{clean}/corpus" - shell: '''bash pipeline/clean/clean-corpus.sh "{params.prefix_input}" "{params.prefix_output}" {threads} \ + input: multiext(f"{original}/corpus/{{dataset}}", f".{src}.gz", f".{trg}.gz") + output: multiext(f"{clean}/corpus/{{dataset}}", f".{src}.gz", f".{trg}.gz") + params: 
prefix_input=f"{original}/corpus/{{dataset}}",prefix_output=f"{clean}/corpus/{{dataset}}", + dataset=lambda wildcards: dataset_norm(wildcards.dataset) + shell: '''bash pipeline/clean/clean-corpus.sh "{params.prefix_input}" "{params.prefix_output}" {threads} {params.dataset} \ >> {log} 2>&1''' +rule clean_mono: + message: "Cleaning monolingual dataset" + log: f"{log_dir}/clean_mono/{{dataset}}.{{lang}}.log" + conda: "envs/base.yml" + threads: workflow.cores + group: "clean_mono{lang}" + cache: False + wildcard_constraints: lang=f"{src}|{trg}" + input: f'{original}/mono/{{dataset}}.{{lang}}.gz' + output: f'{clean}/mono/{{dataset}}.{{lang}}.gz' + params: prefix_input=f"{original}/mono/{{dataset}}", prefix_output=f"{clean}/mono/{{dataset}}", + dataset=lambda wildcards: dataset_norm(wildcards.dataset) + shell: '''bash pipeline/clean/clean-mono.sh {wildcards.lang} "{params.prefix_input}" "{params.prefix_output}" \ + {threads} {params.dataset} >> {log} 2>&1''' if use_bicleaner: rule kenlm: @@ -351,28 +295,67 @@ if use_bicleaner: output: directory(f"{bin}/kenlm") shell: 'bash pipeline/setup/install-kenlm.sh {kenlm} {threads} >> {log} 2>&1' + rule bicleaner_pack: + message: f"Downloading language pack for bicleaner" + log: f"{log_dir}/bicleaner_pack.log" + conda: bicleaner_env + group: "clean_corpus" + threads: 1 + input: rules.kenlm.output + output: directory(f"{biclean}/pack") + shell: '''bash pipeline/bicleaner/download-pack.sh "{output}" {bicleaner_type} >> {log} 2>&1''' + rule bicleaner: message: f"Cleaning corpus using {bicleaner_type}" - log: f"{log_dir}/bicleaner.log" + log: f"{log_dir}/bicleaner/{{dataset}}.log" conda: bicleaner_env - threads: workflow.cores - input: src=rules.clean_corpus.output.src,trg=rules.clean_corpus.output.trg,kenlm=rules.kenlm.output - output: src=clean_corpus_src,trg=clean_corpus_trg - params: prefix_input=f"{clean}/corpus",prefix_output=f"{biclean}/corpus" + group: "clean_corpus" + threads: 1 + input: rules.kenlm.output, multiext(f"{clean}/corpus/{{dataset}}", f".{src}.gz", f".{trg}.gz"), + pack_dir=rules.bicleaner_pack.output + output: multiext(f"{biclean}/corpus/{{dataset}}", f".{src}.gz", f".{trg}.gz") + params: + prefix_input=f"{clean}/corpus/{{dataset}}",prefix_output=f"{biclean}/corpus/{{dataset}}", + threshold=lambda wildcards: bicl_dataset_thresholds.get(wildcards.dataset) or bicl_default_threshold shell: '''bash pipeline/bicleaner/bicleaner.sh \ - "{params.prefix_input}" "{params.prefix_output}" {bicleaner_threshold} {bicleaner_type} \ - >> {log} 2>&1''' + "{params.prefix_input}" "{params.prefix_output}" {params.threshold} {bicleaner_type} {threads} \ + "{input.pack_dir}" >> {log} 2>&1''' -rule clean_mono: - message: "Cleaning monolingual dataset" - log: f"{log_dir}/clean_mono_{{lang}}.log" +rule merge_corpus: + message: "Merging clean parallel datasets" + log: f"{log_dir}/merge_corpus.log" + conda: "envs/base.yml" + threads: workflow.cores + group: "clean_corpus" + input: expand(f"{clean_corpus_prefix}/{{dataset}}.{{lang}}.gz", dataset=train_datasets, lang=[src, trg]) + output: src=clean_corpus_src,trg=clean_corpus_trg + params: prefix_output=clean_corpus_prefix, prefixes=expand(f"{clean_corpus_prefix}/{{dataset}}", dataset=train_datasets) + shell: '''bash pipeline/clean/merge-corpus.sh "{params.prefix_output}" {params.prefixes} >> {log} 2>&1''' + +rule merge_devset: + message: "Merging devsets" + log: f"{log_dir}/merge_devset.log" + conda: "envs/base.yml" + threads: workflow.cores + group: "clean_corpus" + input: 
expand(f"{original}/devset/{{dataset}}.{{lang}}.gz", dataset=valid_datasets, lang=[src, trg]) + output: multiext(f"{original}/devset", f".{src}.gz", f".{trg}.gz") + params: prefix_output=f"{original}/devset", prefixes=expand(f"{original}/devset/{{dataset}}", dataset=valid_datasets) + shell: '''bash pipeline/clean/merge-corpus.sh "{params.prefix_output}" {params.prefixes} >> {log} 2>&1''' + +rule merge_mono: + message: "Merging clean monolingual datasets" + log: f"{log_dir}/merge_mono_{{lang}}.log" conda: "envs/base.yml" threads: workflow.cores - input: f'{original}/mono.{{lang}}.gz' + group: "clean_mono{lang}" + input: + lambda wildcards: expand(f"{clean}/mono/{{dataset}}.{{lang}}.gz", + dataset=mono_datasets[wildcards.lang], lang=wildcards.lang) output: f"{clean}/mono.{{lang}}.gz" - params: lang='{lang}' - shell: '''bash pipeline/clean/clean-mono.sh "{params.lang}" "{original}/mono" "{clean}/mono" {threads} \ - >> {log} 2>&1''' + params: max_sent=lambda wildcards: mono_max_sent[wildcards.lang] + shell: '''bash pipeline/clean/merge-mono.sh "{output}" {params.max_sent} {input} >> {log} 2>&1''' + # augmentation and teacher training @@ -385,11 +368,11 @@ rule train_vocab: bin=rules.marian.output.vocab, corpus_src=clean_corpus_src,corpus_trg=clean_corpus_trg output: f"{models_dir}/vocab/vocab.spm" - params: prefix_train=f"{biclean}/corpus",prefix_test=f"{original}/devset" + params: prefix_train=clean_corpus_prefix,prefix_test=f"{original}/devset" shell: 'bash pipeline/train/spm-vocab.sh "{input.corpus_src}" "{input.corpus_trg}" "{output}" >> {log} 2>&1' -if train_s2s: +if train_backward: rule backward: message: "Training backward model" log: f"{log_dir}/train_backward.log" @@ -398,14 +381,14 @@ if train_s2s: resources: gpu=gpus_num group: 'backward' input: - train_src=clean_corpus_src,train_trg=clean_corpus_trg, - val_src=rules.data_val.output.src,val_trg=rules.data_val.output.trg, - bin=rules.marian.output.trainer, vocab=rules.train_vocab.output - output: model=f'{backward_model}/{best_model}' - params: prefix_train=f"{biclean}/corpus",prefix_test=f"{original}/devset" - shell: '''bash pipeline/train/train-s2s.sh \ - "{backward_model}" "{params.prefix_train}" "{params.prefix_test}" "{input.vocab}" {trg} {src} \ - {training_args} >> {log} 2>&1''' + rules.merge_devset.output, train_src=clean_corpus_src,train_trg=clean_corpus_trg, + bin=rules.marian.output.trainer, vocab=rules.train_vocab.output, + output: model=f'{backward}/{best_model}' + params: prefix_train=f"{biclean}/corpus",prefix_test=f"{original}/devset", + args=training_args.get("backward") or "" + shell: '''bash pipeline/train/train.sh \ + backward train {trg} {src} "{params.prefix_train}" "{params.prefix_test}" "{backward}" \ + "{input.vocab}" {params.args} >> {log} 2>&1''' rule eval_backward: message: "Evaluating backward model" @@ -415,11 +398,13 @@ if train_s2s: resources: gpu=gpus_num group: 'backward' priority: 50 - input: model=f'{backward_model}/{best_model}', datasets=rules.data_test.output + input: + full_eval_datasets, + model=f'{backward}/{best_model}' output: - report(directory(f'{backward_model}/eval'),patterns=["{name}.bleu"], + report(directory(eval_backward),patterns=["{name}.metrics"], category='evaluation', subcategory='finetuned', caption='reports/evaluation.rst') - shell: 'bash pipeline/train/eval.sh "{backward_model}" "{evaluation}" {trg} {src} >> {log} 2>&1' + shell: 'bash pipeline/train/eval.sh "{eval_backward}" "{eval_data}" {trg} {src} {input.model} >> {log} 2>&1' @@ -441,7 +426,7 @@ if augment_corpus: 
resources: gpu=gpus_num input: rules.marian.output.trainer,file=f'{translated}/mono_trg/file.{{part}}', - vocab=rules.train_vocab.output,model=f'{backward_model}/{best_model}' + vocab=rules.train_vocab.output,model=f'{backward}/{best_model}' output: f'{translated}/mono_trg/file.{{part}}.out' shell: 'bash pipeline/translate/translate.sh "{input.file}" "{input.vocab}" {input.model} >> {log} 2>&1' @@ -472,22 +457,43 @@ if augment_corpus: "{input.src1}" "{input.src2}" "{input.trg1}" "{input.trg2}" "{output.res_src}" "{output.res_trg}" \ >> {log} 2>&1''' -rule teacher: - message: "Training teacher" - log: f"{log_dir}/train_teacher{{ens}}.log" + + +rule teacher_all: + message: "Training teacher on all data" + log: f"{log_dir}/train_teacher_all{{ens}}.log" conda: "envs/base.yml" threads: gpus_num*2 resources: gpu=gpus_num group: 'teacher{ens}' input: - train_src=f'{teacher_corpus}.{src}.gz',train_trg=f'{teacher_corpus}.{trg}.gz', - val_src=rules.data_val.output.src,val_trg=rules.data_val.output.trg, + rules.merge_devset.output, train_src=f'{teacher_corpus}.{src}.gz',train_trg=f'{teacher_corpus}.{trg}.gz', bin=rules.marian.output.trainer,vocab=rules.train_vocab.output - output: model=f'{teacher_dir}{{ens}}/{best_model}' - params: prefix_train=teacher_corpus, prefix_test=f"{original}/devset", dir=directory(f'{teacher_dir}{{ens}}') - shell: '''bash pipeline/train/train-teacher.sh \ - "{params.dir}" "{params.prefix_train}" "{params.prefix_test}" "{input.vocab}" \ - {training_args} >> {log} 2>&1''' + output: model=f'{teacher_dir}{{ens}}/{teacher_all_output}' + params: prefix_train=teacher_corpus, prefix_test=f"{original}/devset", dir=directory(f'{teacher_dir}{{ens}}'), + args=training_args.get("teacher-all") or "" + shell: '''bash pipeline/train/train.sh \ + teacher train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \ + "{input.vocab}" {params.args} >> {log} 2>&1''' + +if continue_teacher: + rule teacher_parallel: + message: "Continue training teacher on parallel corpus" + log: f"{log_dir}/train_teacher_parallel{{ens}}.log" + conda: "envs/base.yml" + threads: gpus_num * 2 + resources: gpu=gpus_num + group: 'teacher{ens}' + input: + rules.merge_devset.output, model = f'{teacher_dir}{{ens}}/model.npz', + train_src=clean_corpus_src,train_trg=clean_corpus_trg, + bin=rules.marian.output.trainer,vocab=rules.train_vocab.output + output: model=f'{teacher_dir}{{ens}}/{best_model}' + params: prefix_train=clean_corpus_prefix,prefix_test=f"{original}/devset",dir=directory(f'{teacher_dir}{{ens}}'), + args=training_args.get("teacher-parallel") or "" + shell: '''bash pipeline/train/train.sh \ + teacher continue {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \ + "{input.vocab}" {params.args} >> {log} 2>&1''' rule eval_teacher: message: "Evaluating teacher model" @@ -498,13 +504,29 @@ rule eval_teacher: group: 'teacher{ens}' priority: 50 input: - model=f'{teacher_dir}{{ens}}/{best_model}', - datasets=rules.data_test.output + full_eval_datasets, + model=f'{teacher_dir}{{ens}}/{best_model}' output: - report(directory(f'{teacher_dir}{{ens}}/eval'), patterns=["{name}.bleu"], - category='evaluation', subcategory='teacher', caption='reports/evaluation.rst') - params: dir=f'{teacher_dir}{{ens}}' - shell: 'bash pipeline/train/eval.sh "{params.dir}" "{evaluation}" {src} {trg} >> {log} 2>&1' + report(directory(f'{eval_res}/teacher{{ens}}'), patterns=["{name}.metrics"], + category='evaluation', subcategory='teacher{ens}', caption='reports/evaluation.rst') + params: 
dir=f'{eval_res}/teacher{{ens}}' + shell: 'bash pipeline/train/eval.sh "{params.dir}" "{eval_data}" {src} {trg} {input.model} >> {log} 2>&1' + + +if len(ensemble) > 1: + rule eval_teacher_ensemble: + message: "Evaluating an ensemble of teacher models" + log: f"{log_dir}/eval_teacher_ensemble.log" + conda: "envs/base.yml" + threads: gpus_num * 2 + resources: gpu=gpus_num + priority: 50 + input: + full_eval_datasets, models=[f'{teacher_dir}{ens}/{best_model}' for ens in ensemble] + output: + report(directory(eval_teacher_ens),patterns=["{name}.metrics"], + category='evaluation',subcategory='teacher_ensemble',caption='reports/evaluation.rst') + shell: 'bash pipeline/train/eval.sh "{eval_teacher_ens}" "{eval_data}" {src} {trg} {input.models} >> {log} 2>&1' ### translation with teacher @@ -640,7 +662,7 @@ rule ce_filter: output: src_corpus=f"{filtered}/corpus.{src}.gz",trg_corpus=f"{filtered}/corpus.{trg}.gz" params: input_prefix=f'{merged}/corpus',output_prefix=f'{filtered}/corpus' shell: '''bash pipeline/cefilter/ce-filter.sh \ - "{params.input_prefix}" "{params.output_prefix}" "{input.scores}" {threads} >> {log} 2>&1''' + "{params.input_prefix}" "{params.output_prefix}" "{input.scores}" >> {log} 2>&1''' rule alignments: message: 'Training word alignment and lexical shortlists' @@ -664,15 +686,15 @@ rule student: resources: gpu=gpus_num group: 'student' input: - train_src=rules.ce_filter.output.src_corpus, train_trg=rules.ce_filter.output.trg_corpus, - val_src=rules.data_val.output.src, val_trg=rules.data_val.output.trg, + rules.merge_devset.output, train_src=rules.ce_filter.output.src_corpus, train_trg=rules.ce_filter.output.trg_corpus, alignments=rules.alignments.output.alignment, bin=rules.marian.output.trainer, vocab=rules.train_vocab.output output: model=f'{student_dir}/{best_model}' - params: prefix_train=rules.ce_filter.params.output_prefix,prefix_test=f"{original}/devset" + params: prefix_train=rules.ce_filter.params.output_prefix,prefix_test=f"{original}/devset", + args=training_args.get("student") or "" shell: '''bash pipeline/train/train-student.sh \ - "{student_dir}" "{params.prefix_train}" "{params.prefix_test}" "{input.vocab}" \ - "{input.alignments}" {training_args} >> {log} 2>&1''' + "{input.alignments}" student train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" \ + "{student_dir}" "{input.vocab}" {params.args} >> {log} 2>&1''' rule eval_student: message: "Evaluating student model" @@ -682,11 +704,11 @@ rule eval_student: resources: gpu=gpus_num group: 'student' priority: 50 - input: model=rules.student.output.model, datasets=rules.data_test.output + input: full_eval_datasets, model=rules.student.output.model output: - report(directory(f'{student_dir}/eval'),patterns=["{name}.bleu"],category='evaluation', + report(directory(eval_student),patterns=["{name}.metrics"],category='evaluation', subcategory='student', caption='reports/evaluation.rst') - shell: 'bash pipeline/train/eval.sh "{student_dir}" "{evaluation}" {src} {trg} >> {log} 2>&1' + shell: 'bash pipeline/train/eval.sh "{eval_student}" "{eval_data}" {src} {trg} {input.model} >> {log} 2>&1' # quantize @@ -698,15 +720,15 @@ rule finetune_student: resources: gpu=gpus_num group: 'finetune' input: - train_src=rules.ce_filter.output.src_corpus, train_trg=rules.ce_filter.output.trg_corpus, - val_src=rules.data_val.output.src, val_trg=rules.data_val.output.trg, + rules.merge_devset.output, train_src=rules.ce_filter.output.src_corpus, train_trg=rules.ce_filter.output.trg_corpus, 
alignments=rules.alignments.output.alignment, student_model=rules.student.output.model, bin=rules.marian.output.trainer, vocab=rules.train_vocab.output output: model=f'{student_finetuned_dir}/{best_model}' - params: prefix_train=rules.ce_filter.params.output_prefix,prefix_test=f"{original}/devset" - shell: '''bash pipeline/train/finetune-student.sh \ - "{student_finetuned_dir}" "{params.prefix_train}" "{params.prefix_test}" "{input.vocab}" \ - "{input.alignments}" "{input.student_model}" {training_args} >> {log} 2>&1''' + params: prefix_train=rules.ce_filter.params.output_prefix,prefix_test=f"{original}/devset", + args=training_args.get("student-finetune") or "" + shell: '''bash pipeline/train/train-student.sh \ + "{input.alignments}" student finetune {src} {trg} "{params.prefix_train}" "{params.prefix_test}" \ + "{student_finetuned_dir}" "{input.vocab}" {params.args} >> {log} 2>&1''' rule eval_finetuned_student: message: "Evaluating fine-tuned student model" @@ -716,19 +738,18 @@ rule eval_finetuned_student: resources: gpu=gpus_num group: 'finetune' priority: 50 - input: model=rules.finetune_student.output.model, datasets=rules.data_test.output + input: full_eval_datasets, model=rules.finetune_student.output.model output: - report(directory(f'{student_finetuned_dir}/eval'),patterns=["{name}.bleu"], + report(directory(eval_student_finetuned),patterns=["{name}.metrics"], category='evaluation', subcategory='finetuned', caption='reports/evaluation.rst') - shell: 'bash pipeline/train/eval.sh "{student_finetuned_dir}" "{evaluation}" {src} {trg} >> {log} 2>&1' + shell: 'bash pipeline/train/eval.sh "{eval_student_finetuned}" "{eval_data}" {src} {trg} {input.model} \ + >> {log} 2>&1' rule quantize: message: "Quantization" log: f"{log_dir}/quntize.log" conda: "envs/base.yml" - threads: gpus_num*2 - resources: gpu=gpus_num - threads: workflow.cores + threads: 1 input: shortlist=rules.alignments.output.shortlist, model=rules.finetune_student.output.model, bin=rules.marian.output.decoder, vocab=rules.train_vocab.output, devset=f"{original}/devset.{src}.gz" @@ -741,16 +762,16 @@ rule eval_quantized: log: f"{log_dir}/eval_quantized.log" conda: "envs/base.yml" group: 'export' - threads: workflow.cores + threads: 1 priority: 50 input: + full_eval_datasets, model=rules.quantize.output.model, - datasets=rules.data_test.output, shortlist=rules.alignments.output.shortlist,vocab=rules.train_vocab.output output: - report(directory(f'{speed}/eval'),patterns=["{name}.bleu"], category='evaluation', + report(directory(eval_speed),patterns=["{name}.metrics"], category='evaluation', subcategory='quantized', caption='reports/evaluation.rst') - shell: '''bash pipeline/quantize/eval.sh "{speed}" "{input.shortlist}" "{evaluation}" "{input.vocab}" \ + shell: '''bash pipeline/quantize/eval.sh "{speed}" "{input.shortlist}" "{eval_data}" "{input.vocab}" "{eval_speed}" \ >> {log} 2>&1''' rule export: diff --git a/configs/config.prod.yml b/configs/config.prod.yml index 89f8fd28f..f298156ac 100644 --- a/configs/config.prod.yml +++ b/configs/config.prod.yml @@ -1,4 +1,12 @@ +#### +# Example of a production config +# Change language pair, experiment name, datasets and other settings if needed +# Training low resource languages might require more tuning of pipeline/training/configs +### + +# These settings depend on execution environment +# They are set in the Makefile root: "" cuda: "" deps: false @@ -7,7 +15,7 @@ workspace: "" experiment: - name: snakemake + name: prod src: ru trg: en @@ -19,15 +27,26 @@ experiment: 
mono-max-sentences-src: 100000000 mono-max-sentences-trg: 20000000 - bicleaner-threshold: 0.5 - # split corpus to parallelize translation split-length: 2000000 + best-model: chrf + bicleaner: + default-threshold: 0.5 + dataset-thresholds: + opus_CCAligned/v1: 0.7 + opus_WikiMatrix/v1: 0.7 + opus_OpenSubtitles/v2018: 0.9 + opus_bible-uedin/v1: 0.7 + mtdata_cc_aligned: 0.7 + mtdata_wiki_titles_v1: 0.7 + mtdata_WikiMatrix_v1: 0.7 + mtdata_wiki_titles_v2: 0.7 + mtdata_wmt13_commoncrawl: 0.7 datasets: - # parallel corpus + # parallel training corpus train: - opus_ada83/v1 - opus_UN/v20090831 @@ -74,12 +93,14 @@ datasets: - mtdata_news_commentary_v14 - mtdata_neulab_tedtalksv1_test - mtdata_JW300 + # datasets to merge for validation while training devtest: - flores_dev - mtdata_newstest2019_ruen - mtdata_newstest2017_ruen - mtdata_newstest2015_ruen - mtdata_newstest2014_ruen + # datasets for evaluation test: - flores_devtest - sacrebleu_wmt20 @@ -99,7 +120,7 @@ datasets: - news-crawl_news.2013 - news-crawl_news.2012 - news-crawl_news.2011 - # to be translated by the shallow backward model to augment teacher corpus with back-translations + # to be translated by the backward model to augment teacher corpus with back-translations # leave empty to skip augmentation step (high resource languages) mono-trg: - news-crawl_news.2020 diff --git a/configs/config.test.yml b/configs/config.test.yml index 9b2b4d8b2..92a714134 100644 --- a/configs/config.test.yml +++ b/configs/config.test.yml @@ -1,3 +1,7 @@ +#### +# Test config, it rus the pipeline quickly end to end +### + root: "" cuda: "" @@ -6,28 +10,38 @@ gpus: "" workspace: "" experiment: - name: snakemake + name: test src: ru trg: en teacher-ensemble: 2 - # path to a pretrained backward model (optional) backward-model: "" - # limits per downloaded dataset mono-max-sentences-src: 100000 mono-max-sentences-trg: 200000 + split-length: 100000 - bicleaner-threshold: 0.5 + best-model: chrf - split-length: 100000 + bicleaner: + default-threshold: 0.5 + dataset-thresholds: + mtdata_neulab_tedtalksv1_train: 0.6 training: - after-epochs: 1 + backward: + after: 1000u + teacher-all: + after: 2000u + teacher-parallel: + after: 1000u + student: + after: 1000u + student-finetune: + after: 1000u datasets: - # parallel corpus train: - opus_ada83/v1 - opus_GNOME/v1 @@ -39,12 +53,8 @@ datasets: - flores_devtest - sacrebleu_wmt20 - sacrebleu_wmt18 - # monolingual datasets (ex. 
paracrawl-mono_paracrawl8, commoncrawl_wmt16, news-crawl_news.2020) - # to be translated by the teacher model mono-src: - news-crawl_news.2020 - # to be translated by the shallow backward model to augment teacher corpus with back-translations - # leave empty to skip augmentation step (high resource languages) mono-trg: - news-crawl_news.2020 diff --git a/envs/base.yml b/envs/base.yml index 0734d52d7..c46b7648d 100644 --- a/envs/base.yml +++ b/envs/base.yml @@ -7,6 +7,8 @@ dependencies: - cmake=3.21.1 - pip=21.2.2 - pip: - - sacrebleu==1.5.1 + - sacrebleu==2.0.0 - mtdata==0.2.9 - - fasttext==0.9.2 \ No newline at end of file + - fasttext==0.9.2 + - regex==2019.8.19 + - sacremoses==0.0.43 \ No newline at end of file diff --git a/pipeline/alignment/generate-alignment-and-shortlist.sh b/pipeline/alignment/generate-alignment-and-shortlist.sh index 9ad4342ee..9514014bf 100644 --- a/pipeline/alignment/generate-alignment-and-shortlist.sh +++ b/pipeline/alignment/generate-alignment-and-shortlist.sh @@ -17,6 +17,7 @@ vocab_path=$2 output_dir=$3 threads=$4 +cd "$(dirname "${0}")" mkdir -p "${output_dir}" dir="${output_dir}/tmp" @@ -72,7 +73,7 @@ test -s "${dir}/vocab.txt" || test -s "${output_dir}/lex.s2t.pruned.gz" || pigz -dc "${dir}/lex.s2t.gz" | grep -v NULL | - python3 "pipeline/alignment/prune_shortlist.py" 100 "${dir}/vocab.txt" | + python3 "prune_shortlist.py" 100 "${dir}/vocab.txt" | pigz >"${output_dir}/lex.s2t.pruned.gz" echo "### Deleting tmp dir" diff --git a/pipeline/bicleaner/bicleaner.sh b/pipeline/bicleaner/bicleaner.sh index ef5a0568a..86b3b2eea 100644 --- a/pipeline/bicleaner/bicleaner.sh +++ b/pipeline/bicleaner/bicleaner.sh @@ -15,18 +15,17 @@ corpus_prefix=$1 output_prefix=$2 bicleaner_threshold=$3 type=$4 +threads=$5 +pack_dir=$6 output_dir=$(dirname "${output_prefix}") -tmp_dir="${output_dir}/tmp" -mkdir -p "${tmp_dir}" +mkdir -p "${output_dir}" if [ "${type}" == 'bicleaner-ai' ]; then echo "### Using bicleaner-ai" - bash "pipeline/bicleaner/download-pack.sh" "${tmp_dir}" "bicleaner-ai" cmd=bicleaner-ai-classify elif [ "${type}" == 'bicleaner' ]; then echo "### Using bicleaner" - bash "pipeline/bicleaner/download-pack.sh" "${tmp_dir}" "bicleaner" cmd=bicleaner-classify else echo "### Unsupported type: ${type}" @@ -34,17 +33,18 @@ else fi echo "### Classifying and filtering" -test -s "${tmp_dir}/best.gz" || +test -s "${output_prefix}.best.gz" || paste <(pigz -dc "${corpus_prefix}.${SRC}.gz") <(pigz -dc "${corpus_prefix}.${TRG}.gz") | - ${cmd} --scol 1 --tcol 1 - - "${tmp_dir}"/*.yaml | + ${cmd} --scol 1 --tcol 1 --processes "${threads}" - - "${pack_dir}"/*.yaml | awk -v threshold=${bicleaner_threshold} '{if ($3>threshold) {print $0}}' | - pigz >"${tmp_dir}/best.gz" + pigz >"${output_prefix}.best.gz" echo "### Writing output corpus" -pigz -dc "${tmp_dir}/best.gz" | cut -f1 | pigz >"${output_prefix}.${SRC}.gz" -pigz -dc "${tmp_dir}/best.gz" | cut -f2 | pigz >"${output_prefix}.${TRG}.gz" +pigz -dc "${output_prefix}.best.gz" | + tee >(cut -f1 | pigz >"${output_prefix}.${SRC}.gz") | + cut -f2 | pigz >"${output_prefix}.${TRG}.gz" echo "### Cleaning files" -rm -rf "${tmp_dir}" +rm "${output_prefix}.best.gz" echo "###### Done: Bicleaner filtering" diff --git a/pipeline/bicleaner/download-pack.sh b/pipeline/bicleaner/download-pack.sh index bbaea6337..afd33d038 100644 --- a/pipeline/bicleaner/download-pack.sh +++ b/pipeline/bicleaner/download-pack.sh @@ -13,6 +13,7 @@ test -v TRG download_path=$1 type=$2 +mkdir -p download_path invalid_url() { wget -S --spider -o - $1 | grep -q '404 Not 
Found' @@ -47,11 +48,11 @@ else lang2=$TRG fi -if ! test -s "${download_path}"/*.yaml; then - wget -P "${download_path}" "${url}/${prefix}${lang1}-${lang2}.${extension}" - tar xvf "${download_path}/${prefix}${lang1}-${lang2}.${extension}" -C "${download_path}" --no-same-owner - mv "${download_path}/${lang1}-${lang2}"/* "${download_path}/" - rm "${download_path}/${prefix}${lang1}-${lang2}.${extension}" -fi + +wget -P "${download_path}" "${url}/${prefix}${lang1}-${lang2}.${extension}" +tar xvf "${download_path}/${prefix}${lang1}-${lang2}.${extension}" -C "${download_path}" --no-same-owner +mv "${download_path}/${lang1}-${lang2}"/* "${download_path}/" +rm "${download_path}/${prefix}${lang1}-${lang2}.${extension}" + echo "### ${type} language pack ${url} is downloaded" diff --git a/pipeline/cefilter/ce-filter.sh b/pipeline/cefilter/ce-filter.sh index 51ded55ba..24b2bc9d4 100644 --- a/pipeline/cefilter/ce-filter.sh +++ b/pipeline/cefilter/ce-filter.sh @@ -13,7 +13,8 @@ test -v TRG corpus_prefix=$1 output_prefix=$2 scores=$3 -threads=$4 + +cd "$(dirname "${0}")" # Part of the data to be removed (0.05 is 5%) remove=0.05 @@ -21,21 +22,10 @@ output_dir=$(dirname "${output_prefix}") tmp="${output_dir}/tmp" mkdir -p "${tmp}" -echo "### Decompressing corpus" -test -s "${tmp}/corpus.${TRG}" || pigz -dc "${corpus_prefix}.${TRG}.gz" >"${tmp}/corpus.${TRG}" -test -s "${tmp}/corpus.${SRC}" || pigz -dc "${corpus_prefix}.${SRC}.gz" >"${tmp}/corpus.${SRC}" - - -echo "### Normalizing scores" -test -s "${tmp}/scores.nrm.txt" || - paste "${scores}" "${tmp}/corpus.${TRG}" | - parallel --no-notice --pipe -k -j "${threads}" --block 50M "python pipeline/cefilter/normalize-scores.py" | - cut -f1 >"${tmp}/scores.nrm.txt" - echo "### Sorting scores" if [ ! -s "${tmp}/sorted.gz" ]; then buffer_size="$(echo "$(grep MemTotal /proc/meminfo | awk '{print $2}')"*0.9 | bc | cut -f1 -d.)" - paste "${tmp}/scores.nrm.txt" "${tmp}/corpus.${SRC}" "${tmp}/corpus.${TRG}" | + paste "${scores}" <(pigz -dc "${corpus_prefix}.${SRC}.gz") <(pigz -dc "${corpus_prefix}.${TRG}.gz") | LC_ALL=C sort -n -k1,1 -S "${buffer_size}K" -T "${tmp}" | pigz >"${tmp}/sorted.gz" fi @@ -48,8 +38,9 @@ if [ ! 
-s "${tmp}/best.gz" ]; then fi echo "### Writing output corpus" -pigz -dc "${tmp}/best.gz" | cut -f1 | pigz >"${output_prefix}.${SRC}.gz" -pigz -dc "${tmp}/best.gz" | cut -f2 | pigz >"${output_prefix}.${TRG}.gz" +pigz -dc "${tmp}/best.gz" | + tee >(cut -f1 | pigz >"${output_prefix}.${SRC}.gz") | + cut -f2 | pigz >"${output_prefix}.${TRG}.gz" echo "### Deleting tmp dir" rm -rf "${tmp}" diff --git a/pipeline/cefilter/normalize-scores.py b/pipeline/cefilter/normalize-scores.py deleted file mode 100644 index c920ea662..000000000 --- a/pipeline/cefilter/normalize-scores.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from __future__ import print_function, unicode_literals, division - -import sys -import argparse -import math - - -def main(): - args = parse_user_args() - - for line in sys.stdin: - fields = line.strip().split("\t") - trg = fields[-1] - score = float(fields[0]) - - if not args.no_normalize: - length = len(trg.split()) - score = score / float(length + 1) - if args.exp: - score = math.exp(score) - - sys.stdout.write("{:.6f}\t{}".format(score, line)) - - -def parse_user_args(): - parser = argparse.ArgumentParser() - parser.add_argument("-n", "--no-normalize", action="store_true") - parser.add_argument("-e", "--exp", action="store_true") - return parser.parse_args() - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/pipeline/cefilter/score.sh b/pipeline/cefilter/score.sh index 78f72249b..a70ce1f24 100644 --- a/pipeline/cefilter/score.sh +++ b/pipeline/cefilter/score.sh @@ -31,6 +31,7 @@ mkdir -p "${dir}" --maxi-batch 1000 \ --max-length 250 \ --max-length-crop \ + --normalize \ -d ${GPUS} \ -w "${WORKSPACE}" \ --log "${dir}/scores.txt.log" \ diff --git a/pipeline/clean/clean-corpus.sh b/pipeline/clean/clean-corpus.sh index d2446c1d2..b24926c14 100755 --- a/pipeline/clean/clean-corpus.sh +++ b/pipeline/clean/clean-corpus.sh @@ -8,81 +8,101 @@ set -euo pipefail echo "###### Cleaning corpus" -export PYTHONPATH="${CLEAN_TOOLS}" + test -v SRC test -v TRG -test -v CLEAN_TOOLS -data=$1 -output=$2 +input_prefix=$1 +output_prefix=$2 threads=$3 +dataset=$4 + +cd "$(dirname "${0}")" +export PYTHONPATH="tools" -dir="$(dirname "${output}")" -tmp="${dir}/tmp" -mkdir -p "${tmp}" +dir="$(dirname "${output_prefix}")" +mkdir -p "${dir}" -echo "### CLeaning ${data}" +echo "### Cleaning ${input_prefix}" ###################################################################### echo "### Basic preprocessing" for lng in "${SRC}" "${TRG}"; do - test -s "${output}.${lng}.nrm.gz" || - pigz -dc "${data}.${lng}.gz" | + test -s "${output_prefix}.${lng}.nrm.gz" || + pigz -dc "${input_prefix}.${lng}.gz" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ - "perl ${CLEAN_TOOLS}/remove-non-printing-char.perl | perl ${CLEAN_TOOLS}/normalize-punctuation.perl -l ${lng}" | - pigz >"${output}.${lng}.nrm.gz" + "perl tools/remove-non-printing-char.perl" | + pigz >"${output_prefix}.${lng}.nrm.gz" +done + +##################################################################### +echo "### Apply monolingual fixes" +for lng in $SRC $TRG; do + if [[ ! 
-x fixes/${dataset}.${lng}.sh ]]; then + test -s "${output_prefix}.${lng}.monofix.gz" || + cp "${output_prefix}.${lng}.nrm.gz" "${output_prefix}.${lng}.monofix.gz" + else + test -s "${output_prefix}.${lng}.monofix.gz" || + pigz -dc "${output_prefix}.${lng}.nrm.gz" \ + | fixes/"${dataset}"."${lng}".sh \ + | pigz >"${output_prefix}.${lng}.monofix.gz" + fi done ###################################################################### -echo "### Deduplication" -test -s "${output}.${SRC}${TRG}.nrm.uniq.gz" || - paste <(pigz -dc "${output}.${SRC}.nrm.gz") <(pigz -dc "${output}.${TRG}.nrm.gz") | - LC_ALL=C sort -S 10G -T "${tmp}" | - uniq | - pigz >"${output}.${SRC}${TRG}.nrm.uniq.gz" +echo "### Apply bilingual fixes" +if [[ -x fixes/${dataset}.sh ]]; then + FIX="fixes/${dataset}.sh ${SRC} ${TRG} ${threads}" +else + FIX="cat" +fi +test -s "${output_prefix}.${SRC}${TRG}.fix.gz" || + paste <(pigz -dc "${output_prefix}.${SRC}.monofix.gz") <(pigz -dc "${output_prefix}.${TRG}.monofix.gz") \ + | $FIX \ + | pigz > "${output_prefix}.${SRC}${TRG}.fix.gz" ###################################################################### echo "### Rule-based filtering" -test -s "${output}.${SRC}${TRG}.rule-based.gz" || - pigz -dc "${output}.${SRC}${TRG}.nrm.uniq.gz" | +test -s "${output_prefix}.${SRC}${TRG}.rule-based.gz" || + pigz -dc "${output_prefix}.${SRC}${TRG}.fix.gz" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ - "python3 ${CLEAN_TOOLS}/clean_parallel.py -l1 ${SRC} -l2 ${TRG} --debug" \ - 2>"${output}.${SRC}${TRG}.clean.debug.txt" | - pigz >"${output}.${SRC}${TRG}.rule-based.gz" + "python3 tools/clean_parallel.py -l1 ${SRC} -l2 ${TRG} --debug" \ + 2>"${output_prefix}.${SRC}${TRG}.clean.debug.txt" | + pigz >"${output_prefix}.${SRC}${TRG}.rule-based.gz" ###################################################################### echo "### Language identification" -test -s "${output}.${SRC}${TRG}.langid.gz" || - pigz -dc "${output}.${SRC}${TRG}.rule-based.gz" | +test -s "${output_prefix}.${SRC}${TRG}.langid.gz" || + pigz -dc "${output_prefix}.${SRC}${TRG}.rule-based.gz" | # memory intensive parallel --no-notice --pipe -k -j "$(echo "${threads}"/4 | bc)" --block 50M \ - "python3 -Wi ${CLEAN_TOOLS}/langid_fasttext.py -f 1 | python3 -Wi ${CLEAN_TOOLS}/langid_fasttext.py -f 1" | + "python3 -Wi tools/langid_fasttext.py -f 1 | python3 -Wi tools/langid_fasttext.py -f 1" | grep -P "^${SRC}\t${TRG}\t" | cut -f3,4 | - pigz >"${output}.${SRC}${TRG}.langid.gz" + pigz >"${output_prefix}.${SRC}${TRG}.langid.gz" ###################################################################### echo "### Removing leading and repetitive white spaces" -pigz -dc "${output}.${SRC}${TRG}.langid.gz" | +pigz -dc "${output_prefix}.${SRC}${TRG}.langid.gz" | cut -f1 | sed -e 's/^[[:space:]]*//' | tr -s " " | -pigz >"${output}.${SRC}.gz" +pigz >"${output_prefix}.${SRC}.gz" -pigz -dc "${output}.${SRC}${TRG}.langid.gz" | +pigz -dc "${output_prefix}.${SRC}${TRG}.langid.gz" | cut -f2 | sed -e 's/^[[:space:]]*//' | tr -s " " | -pigz >"${output}.${TRG}.gz" +pigz >"${output_prefix}.${TRG}.gz" -test -s "${output}.${SRC}.gz" || exit 1 -test -s "${output}.${TRG}.gz" || exit 1 +test -s "${output_prefix}.${SRC}.gz" || exit 1 +test -s "${output_prefix}.${TRG}.gz" || exit 1 -echo "### Remove ${data} from intermediate steps" -rm -f "${output}".*.nrm.gz "${output}".*.nrm.uniq.gz "${output}".*.langid.gz "${output}".*.rule-based.gz -rm -rf "${tmp}" +echo "### Remove input_prefix from intermediate steps" +rm -rf "${output_prefix}".*.nrm.gz 
"${output_prefix}".*.langid.gz \ + "${output_prefix}".*.rule-based.gz "${output_prefix}".*.*fix.gz -echo "### Clean data is written to ${output}" +echo "### Clean ${input_prefix} is written to ${output_prefix}" echo "###### Done: Cleaning corpus" diff --git a/pipeline/clean/clean-mono.sh b/pipeline/clean/clean-mono.sh index 1490277a2..2c2fe7bbe 100755 --- a/pipeline/clean/clean-mono.sh +++ b/pipeline/clean/clean-mono.sh @@ -9,57 +9,63 @@ set -euo pipefail echo "###### Cleaning monolingual data" lang=$1 -input=$2 -output=$3 +input_prefix=$2 +output_prefix=$3 threads=$4 +dataset=$5 -test -v CLEAN_TOOLS +echo "### Cleaning ${input_prefix}" -echo "### CLeaning ${input}" +cd "$(dirname "${0}")" +export PYTHONPATH="tools" -dir="$(dirname "${output}")" -tmp="${dir}/tmp" -mkdir -p "${tmp}" +dir="$(dirname "${output_prefix}")" +mkdir -p "${dir}" ###################################################################### echo "### Basic preprocessing" -test -s "${output}.${lang}.nrm.gz" || - pigz -dc "${input}.${lang}.gz" | +test -s "${output_prefix}.${lang}.nrm.gz" || + pigz -dc "${input_prefix}.${lang}.gz" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ - "perl ${CLEAN_TOOLS}/remove-non-printing-char.perl | perl ${CLEAN_TOOLS}/normalize-punctuation.perl -l ${lang}" | - pigz >"${output}.${lang}.nrm.gz" + "perl tools/remove-non-printing-char.perl" | + pigz >"${output_prefix}.${lang}.nrm.gz" -###################################################################### -echo "### Deduplication" -test -s "${output}.${lang}.nrm.uniq.gz" || - pigz -dc "${output}.${lang}.nrm.gz" | - LC_ALL=C sort -S 10G -T "${tmp}" | - uniq | - pigz >"${output}.${lang}.nrm.uniq.gz" +##################################################################### +echo "### Apply monolingual fixes" +if [[ ! 
-x fixes/${dataset}.${lang}.sh ]]; then + test -s "${output_prefix}.${lang}.monofix.gz" || + cp "${output_prefix}.${lang}.nrm.gz" "${output_prefix}.${lang}.monofix.gz" +else + test -s "${output_prefix}.${lang}.monofix.gz" || + pigz -dc "${output_prefix}.${lang}.nrm.gz" \ + | fixes/"${dataset}"."${lang}".sh \ + | pigz >"${output_prefix}.${lang}.monofix.gz" +fi ###################################################################### echo "### Language identification" -test -s "${output}.${lang}.langid.gz" || - pigz -dc "${output}.${lang}.nrm.uniq.gz" | +test -s "${output_prefix}.${lang}.langid.gz" || + pigz -dc "${output_prefix}.${lang}.monofix.gz" | # memory intensive - parallel --no-notice --pipe -k -j "$(echo "${threads}"/4 | bc)" --block 50M "python ${CLEAN_TOOLS}/langid_fasttext.py" | + parallel --no-notice --pipe -k -j "$(echo "${threads}"/4 | bc)" --block 50M "python tools/langid_fasttext.py" | grep -P "^${lang}\t" | cut -f2 | - pigz >"${output}.${lang}.langid.gz" + pigz >"${output_prefix}.${lang}.langid.gz" ###################################################################### echo "### Rule-based filtering" -pigz -dc "${output}.${lang}.langid.gz" | +pigz -dc "${output_prefix}.${lang}.langid.gz" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ - "python ${CLEAN_TOOLS}/clean_mono.py -l ${lang} --debug" \ - 2>"${output}.${lang}.clean.debug.txt" | -pigz >"${output}.${lang}.gz" + "python tools/clean_mono.py -l ${lang} --debug" \ + 2>"${output_prefix}.${lang}.clean.debug.txt" | +pigz >"${output_prefix}.${lang}.gz" -test -s "${output}.${lang}.gz" || exit 1 +test -s "${output_prefix}.${lang}.gz" || exit 1 echo "### Remove data from intermediate steps" -rm -rf "${output}".*.nrm.gz "${output}".*.nrm.uniq.gz "${output}".*.langid.gz "${tmp}" +rm -rf "${output_prefix}".*.nrm.gz "${output_prefix}".*.langid.gz \ + "${output_prefix}".*.monofix.gz -echo "### Clean data is written to ${output}" +echo "### Clean data is written to ${output_prefix}" echo "###### Done: Cleaning monolingual data" diff --git a/pipeline/clean/fixes/detok.sh b/pipeline/clean/fixes/detok.sh new file mode 100755 index 000000000..0f6d792b5 --- /dev/null +++ b/pipeline/clean/fixes/detok.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +# Detokenize + +SRC=$1 +TRG=$2 +threads=$3 + +temp=$(mktemp -d) + +tee >(cut -f1 | sacremoses -j $threads -l $SRC detokenize >$temp/$SRC.detok) \ + | cut -f2 | sacremoses -j $threads -l $TRG detokenize >$temp/$TRG.detok + +paste $temp/$SRC.detok $temp/$TRG.detok + +rm -r $temp diff --git a/pipeline/clean/fixes/mtdata_JW300.mt.sh b/pipeline/clean/fixes/mtdata_JW300.mt.sh new file mode 100755 index 000000000..98e5786f1 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_JW300.mt.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Fix Maltese tokenization in JW300 that detokenizer cannot fix +sed "s/ - $(echo -ne '\u200b') /-/g" \ + | sed 's/ - /-/g' diff --git a/pipeline/clean/fixes/mtdata_JW300.sh b/pipeline/clean/fixes/mtdata_JW300.sh new file mode 100755 index 000000000..8f24e4439 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_JW300.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Detokenize JW300 +fixes/detok.sh $1 $2 $3 diff --git a/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh new file mode 100755 index 000000000..79f003315 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# Detokenize Catalan apostrophe, dates and laws, and ending period +# detokenize middle dot +sed "s/\([lndsLNDS]\) ' \([a-zA-Z1]\)/\1'\2/g" \ 
+ | sed "s#\([0-9]\) \?/ \?\([0-9]\)#\1/\2#g" \ + | sed "s/\([a-z]\) .\$/\1./g" \ + | sed "s/l · l/l·l/g" diff --git a/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.es.sh b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.es.sh new file mode 100755 index 000000000..88b6cec22 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.es.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Detokenize dates and laws, and ending period +sed "s#\([0-9]\) \?/ \?\([0-9]\)#\1/\2#g" \ + | sed "s/\([a-z]\) .\$/\1./g" diff --git a/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh new file mode 100644 index 000000000..9c258aea7 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Detokenize DOGC +fixes/detok.sh $1 $2 $3 diff --git a/pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh b/pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh new file mode 100755 index 000000000..8f24e4439 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Detokenize JW300 +fixes/detok.sh $1 $2 $3 diff --git a/pipeline/clean/fixes/mtdata_OPUS_SETIMES_v2.sh b/pipeline/clean/fixes/mtdata_OPUS_SETIMES_v2.sh new file mode 100755 index 000000000..695c69afb --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_SETIMES_v2.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +# Detokenize SETIMES + +SRC=$1 +TRG=$2 +threads=$3 + +temp=$(mktemp -d) + +tee >(cut -f1 | sacremoses -j $threads -l $SRC detokenize >$temp/$SRC.detok) \ + >(cut -f2 | sacremoses -j $threads -l $TRG detokenize >$temp/$TRG.detok) + +paste $temp/$SRC.detok $temp/$TRG.detok + +rm -r $temp diff --git a/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh b/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh new file mode 100755 index 000000000..33c194d8c --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Detokenize English possessive +sed "s/\([a-z]\) ' \([s]\)/\1'\2/g" diff --git a/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh b/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh new file mode 100755 index 000000000..0e81ef40f --- /dev/null +++ b/pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Detokenize French apostrophe +sed "s/\([lndsLNDS]\) ' \([a-zA-Z]\)/\1'\2/g" diff --git a/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.ro.sh b/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.ro.sh new file mode 100755 index 000000000..993534550 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.ro.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Detokenize Romanian hyphens +sed -E "s/(\w) - (\w)/\1-\2/g" diff --git a/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.sh b/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.sh new file mode 100755 index 000000000..8c42faa65 --- /dev/null +++ b/pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Detokenize neulabs +fixes/detok.sh $1 $2 $3 diff --git a/pipeline/clean/merge-corpus.sh b/pipeline/clean/merge-corpus.sh new file mode 100644 index 000000000..734dbd2ea --- /dev/null +++ b/pipeline/clean/merge-corpus.sh @@ -0,0 +1,35 @@ +#!/bin/bash +## +# Merges and deduplicates parallel datasets +# + +set -x +set -euo pipefail + +echo "###### Merging parallel datasets" + +test -v SRC +test -v TRG + +output_prefix=$1 +input_prefixes=( "${@:2}" ) + +tmp="${output_prefix}/merge" +mkdir -p "${tmp}" + +echo "### Merging" +cat "${input_prefixes[@]/%/.${SRC}.gz}" >"${tmp}/corpus.${SRC}.dup.gz" +cat 
"${input_prefixes[@]/%/.${TRG}.gz}" >"${tmp}/corpus.${TRG}.dup.gz" + +echo "### Deduplication" +paste <(pigz -dc "${tmp}/corpus.${SRC}.dup.gz") <(pigz -dc "${tmp}/corpus.${TRG}.dup.gz") | +LC_ALL=C sort -S 10G -T "${tmp}" | +uniq | +pigz >"${tmp}.${SRC}${TRG}.gz" + +pigz -dc "${tmp}.${SRC}${TRG}.gz" | cut -f1 | pigz > "${output_prefix}.${SRC}.gz" +pigz -dc "${tmp}.${SRC}${TRG}.gz" | cut -f2 | pigz > "${output_prefix}.${TRG}.gz" + +rm -rf "${tmp}" + +echo "###### Done: Merging parallel datasets" diff --git a/pipeline/clean/merge-mono.sh b/pipeline/clean/merge-mono.sh new file mode 100644 index 000000000..564bd084b --- /dev/null +++ b/pipeline/clean/merge-mono.sh @@ -0,0 +1,23 @@ +#!/bin/bash +## +# Merges monolingual datasets +# + +set -x +set -euo pipefail + +echo "###### Merging monolingual datasets" + +output=$1 +max_sent=$2 +datasets=( "${@:3}" ) + +dir=$(dirname "${output}") +mkdir -p "${dir}" + +pigz -dc "${datasets[@]}" | + shuf -n "${max_sent}" | + pigz >"${output}" + + +echo "###### Done: Merging monolingual datasets" diff --git a/pipeline/clean/tools/clean_parallel.py b/pipeline/clean/tools/clean_parallel.py index 115004b03..88fc17254 100755 --- a/pipeline/clean/tools/clean_parallel.py +++ b/pipeline/clean/tools/clean_parallel.py @@ -17,15 +17,38 @@ RATIO_ALPHA_CHARS = 0.5 # minimum fraction of alpha characters in a source sentence CHARS = { + 'bg': r'[АаБбВвГгДддЕеЖжЗзИиЙйКкkasЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЮюЯя]', 'cs': r'[a-zÁáČčĎďÉéěÍíŇňÓóŘřŠšŤťÚúůÝýŽž]', + 'ca': r'[a-zÀàÈèÉéÍíÒòÓóÚúÇç]', + 'da': r'[a-zÆæØøÅå]', + 'de': r'[a-zÄäÖöÜüß]', 'en': r'[a-z]', - 'es': r'[a-zÁáÉéÍíÓóÚúñÑ¡!¿?]', + 'el': r'[a-zΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω]', + 'es': r'[a-zÁáÉéÍíÓóÚúñÑ]', 'et': r'[a-zÕõÄäÖöÜü]', - 'de': r'[a-zÄäÖöÜüß]', - 'no': r'[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]', + 'eu': r'[a-zñÑ]', + 'fi': r'[a-zÅåÄäÖö]', + 'fr': r'[a-zÂâÁáÀàâÇçÉéÈèÊêÓóÒòÔôŒœÜüÛûŸÿ]', + 'ga': r'[abcdefghilmnoprstuáéíóúÁÉÍÓÚ]', + 'gl': r'[a-zÁáÉéÍíÓóÚúÑñ]', + 'hr': r'[abcčČćĆdđĐefghijklmnoprsšŠtuvzžŽ]', + 'hu': r'[a-zÁáÉéÍíÓóÖöŐőŰű]', + 'is': r'[abdefghijklmnoprstuvxyÁáðÐÉéÍíÓóÚúÝýÞþÆæÖö]', + 'it': r'[a-zàÀèÈéÉìÌíÍîÎòÒóÓùÙúÚ]', + 'lt': r'[aąbcČčdeĘęĖėfghiĮįyjklmnoprsŠštuŲųŪūvzŽž]', + 'lv': r'[aĀābcČčdeĒēfgĢģhiĪījkĶķlĻļmnŅņoprsŠštuŪūvzŽž]', + 'mt': r'[abĊċdefĠġghĦħiiejklmnopqrstuvwxŻżz]', 'nb': r'[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]', + 'nl': r'[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÚú]', + 'no': r'[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]', 'nn': r'[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]', - 'is': r'[abdefghijklmnoprstuvxyÁáðÐÉéÍíÓóÚúÝýÞþÆæÖö]', + 'pl': r'[a-zĄąĆćĘꣳŃńÓ󌜏źŻż]', + 'pt': r'[a-zÂâÁáÀàÃãÇçÉéÈèÊêÍíÌìÓóÒòÔôÕõÚúÙù]', + 'ro': r'[a-zĂăÂâÎîȘșȚț]', + 'ru': r'[а-я]', + 'sk': r'[a-záäÁÄčČďĎžéÉíÍĺĹľĽňŇóÓôÔŕŔšŠťŤúÚýÝžŽ]', + 'sl': r'[abcčČdđĐefghijklmnoprsšŠtuvzžŽ]', + 'sv': r'[a-zÅåÄäÖö]', } diff --git a/pipeline/data/download-corpus.sh b/pipeline/data/download-corpus.sh index 167f0433b..7937a228a 100644 --- a/pipeline/data/download-corpus.sh +++ b/pipeline/data/download-corpus.sh @@ -1,40 +1,27 @@ #!/bin/bash ## -# Downloads parallel corpus datasets +# Downloads parallel dataset # set -x set -euo pipefail -echo "###### Downloading corpus" - test -v SRC test -v TRG -prefix=$1 -cache=$2 -id=$3 -datasets=( "${@:4}" ) - -src_corpus="${prefix}.${SRC}.gz" -trg_corpus="${prefix}.${TRG}.gz" -dir=$(dirname "${prefix}")/${id} -mkdir -p "${dir}" +dataset=$1 +output_prefix=$2 -echo "### Downloading datasets" +echo "###### Downloading dataset ${dataset}" -for dataset in "${datasets[@]}"; do - echo "### Downloading dataset ${dataset}" - 
name=${dataset#*_} - type=${dataset%%_*} - bash "pipeline/data/importers/corpus/${type}.sh" "${SRC}" "${TRG}" "${dir}" "${name}" -done - -cat "${dir}"/*."${SRC}" | pigz >"${src_corpus}" -cat "${dir}"/*."${TRG}" | pigz >"${trg_corpus}" +cd "$(dirname "${0}")" +dir=$(dirname "${output_prefix}") +mkdir -p "${dir}" -rm -rf "${dir}" +name=${dataset#*_} +type=${dataset%%_*} +bash "importers/corpus/${type}.sh" "${SRC}" "${TRG}" "${output_prefix}" "${name}" -echo "###### Done: Downloading corpus" +echo "###### Done: Downloading dataset ${dataset}" diff --git a/pipeline/data/download-eval.sh b/pipeline/data/download-eval.sh deleted file mode 100644 index 871d5c6f3..000000000 --- a/pipeline/data/download-eval.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -## -# Downloads evaluation datasets -# - -set -x -set -euo pipefail - -echo "###### Downloading evaluation datasets" - -test -v SRC -test -v TRG - -dir=$1 -cache=$2 -datasets=( "${@:3}" ) - -for dataset in "${datasets[@]}"; do - name="${dataset//[^A-Za-z0-9_- ]/_}" - bash "pipeline/data/download-corpus.sh" "${dir}/${name}" "${cache}" eval "${dataset}" - - test -e "${dir}/${name}.${SRC}" || pigz -dk "${dir}/${name}.${SRC}.gz" - test -e "${dir}/${name}.${TRG}" || pigz -dk "${dir}/${name}.${TRG}.gz" -done - - -echo "###### Done: Downloading evaluation datasets" diff --git a/pipeline/data/download-mono.sh b/pipeline/data/download-mono.sh index 70300ff3c..e4875bf6c 100644 --- a/pipeline/data/download-mono.sh +++ b/pipeline/data/download-mono.sh @@ -6,53 +6,37 @@ set -x set -euo pipefail -echo "###### Downloading monolingual data" - -lang=$1 -max_sent=$2 -prefix=$3 -cache=$4 -datasets=( "${@:5}" ) - -file_name="${prefix}.${lang}.gz" -dir=$(dirname "${prefix}")/mono - -if [ ! -e "${file_name}" ]; then - echo "### Downloading monolingual corpus for ${lang}" - mkdir -p "${dir}" - coef=0.1 - - for dataset in "${datasets[@]}"; do - echo "### Downloading dataset ${dataset}" - source_prefix="${dir}/${dataset}.original.${lang}" - gz_path="${dir}/${dataset}.${lang}.gz" - name=${dataset#*_} - type=${dataset%%_*} - - test -s "${source_prefix}.gz" || - bash "pipeline/data/importers/mono/${type}.sh" "${lang}" "${source_prefix}" "${name}" - - echo "### Sampling dataset ${dataset}" - # temporary disable pipefail because perl operation causes SIGPIPE (141) - set +o pipefail - test -s "${gz_path}" || - pigz -dc "${source_prefix}.gz" | - shuf -n "$(bc -l <<<"${max_sent}+${max_sent}*${coef}")" | - perl -ne 'print if(split(/\s/, $_) < 100)' | - head -n "${max_sent}" | - pigz >"${gz_path}" - set -o pipefail - - rm "${source_prefix}"* - done - - pigz -dc "${dir}"/*."${lang}".gz | shuf -n "${max_sent}" | pigz >"${file_name}" - -fi - -test -s "${file_name}" - -lines=$(pigz -dc "${file_name}" | wc -l) -echo "### Number of sentences: ${lines}" +dataset=$1 +lang=$2 +max_sent=$3 +output_path=$4 +coef=0.1 + +echo "###### Downloading monolingual data for language ${lang} dataset ${dataset}" + +cd "$(dirname "${0}")" + +tmp=$(dirname "${output_path}")/original +mkdir -p "${tmp}" + +echo "### Downloading dataset" +original_prefix="${tmp}/${dataset}.original.${lang}" +name=${dataset#*_} +type=${dataset%%_*} + +test -s "${original_prefix}.gz" || + bash "importers/mono/${type}.sh" "${lang}" "${original_prefix}" "${name}" + +echo "### Sampling dataset" +# temporary disable pipefail because perl operation causes SIGPIPE (141) +set +o pipefail +pigz -dc "${original_prefix}.gz" | +shuf -n "$(bc -l <<<"${max_sent}+${max_sent}*${coef}")" | +perl -ne 'print if(split(/\s/, $_) < 100)' | +head -n 
"${max_sent}" | +pigz >"${output_path}" +set -o pipefail + +rm -rf "${original_prefix}.gz" echo "###### Done: Downloading monolingual data" diff --git a/pipeline/data/importers/corpus/custom-corpus.sh b/pipeline/data/importers/corpus/custom-corpus.sh index cde465b69..35a6a28e1 100644 --- a/pipeline/data/importers/corpus/custom-corpus.sh +++ b/pipeline/data/importers/corpus/custom-corpus.sh @@ -11,11 +11,11 @@ echo "###### Copying custom corpus" src=$1 trg=$2 -dir=$3 +output_prefix=$3 dataset=$4 -cp "${dataset}.${src}.gz" "${dir}/" -cp "${dataset}.${trg}.gz" "${dir}/" +cp "${dataset}.${src}.gz" "${output_prefix}.${src}.gz" +cp "${dataset}.${trg}.gz" "${output_prefix}.${trg}.gz" echo "###### Done: Copying custom corpus" \ No newline at end of file diff --git a/pipeline/data/importers/corpus/flores.sh b/pipeline/data/importers/corpus/flores.sh index 19e799561..e66e61ed6 100644 --- a/pipeline/data/importers/corpus/flores.sh +++ b/pipeline/data/importers/corpus/flores.sh @@ -11,15 +11,13 @@ echo "###### Downloading flores corpus" src=$1 trg=$2 -dir=$3 +output_prefix=$3 dataset=$4 -tmp="${dir}/flores" +tmp="$(dirname "${output_prefix}")/flores/${dataset}" mkdir -p "${tmp}" -test -s "${tmp}/flores101_dataset.tar.gz" || - wget -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz" - +wget -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz" tar -xzf "${tmp}/flores101_dataset.tar.gz" -C "${tmp}" --no-same-owner flores_code() { @@ -39,8 +37,8 @@ flores_code() { src_flores=$(flores_code "${src}") trg_flores=$(flores_code "${trg}") -cp "${tmp}/flores101_dataset/${dataset}/${src_flores}.${dataset}" "${dir}/flores.${src}" -cp "${tmp}/flores101_dataset/${dataset}/${trg_flores}.${dataset}" "${dir}/flores.${trg}" +pigz -c "${tmp}/flores101_dataset/${dataset}/${src_flores}.${dataset}" > "${output_prefix}.${src}.gz" +pigz -c "${tmp}/flores101_dataset/${dataset}/${trg_flores}.${dataset}" > "${output_prefix}.${trg}.gz" rm -rf "${tmp}" diff --git a/pipeline/data/importers/corpus/mtdata.sh b/pipeline/data/importers/corpus/mtdata.sh index 821f8b446..e243f14b9 100644 --- a/pipeline/data/importers/corpus/mtdata.sh +++ b/pipeline/data/importers/corpus/mtdata.sh @@ -10,24 +10,20 @@ echo "###### Downloading mtdata corpus" src=$1 trg=$2 -dir=$3 +output_prefix=$3 dataset=$4 +tmp="$(dirname "${output_prefix}")/mtdata/${dataset}" +mkdir -p "${tmp}" + src_iso=$(python -c "from mtdata.iso import iso3_code; print(iso3_code('${src}', fail_error=True))") trg_iso=$(python -c "from mtdata.iso import iso3_code; print(iso3_code('${trg}', fail_error=True))") -if [ ! 
-e "${dir}/${dataset}.${trg}" ]; then - mtdata get -l "${src}-${trg}" -tr "${dataset}" -o "${dir}" - - for f in "${dir}"/train-parts/*."${src_iso}"; do - mv "${f}" "${dir}/${dataset}.${src}" - done - for f in "${dir}"/train-parts/*."${trg_iso}"; do - mv "${f}" "${dir}/${dataset}.${trg}" - done +mtdata get -l "${src}-${trg}" -tr "${dataset}" -o "${tmp}" - rm -rf "${dir}/train-parts" -fi +pigz -c "${tmp}/train-parts/${dataset}-${src_iso}_${trg_iso}.${src_iso}" > "${output_prefix}.${src}.gz" +pigz -c "${tmp}/train-parts/${dataset}-${src_iso}_${trg_iso}.${trg_iso}" > "${output_prefix}.${trg}.gz" +rm -rf "${tmp}" echo "###### Done: Downloading mtdata corpus" diff --git a/pipeline/data/importers/corpus/opus.sh b/pipeline/data/importers/corpus/opus.sh index 3bff6eced..58172a199 100644 --- a/pipeline/data/importers/corpus/opus.sh +++ b/pipeline/data/importers/corpus/opus.sh @@ -10,23 +10,27 @@ echo "###### Downloading opus corpus" src=$1 trg=$2 -dir=$3 +output_prefix=$3 dataset=$4 name=${dataset%%/*} +name_and_version="${dataset//[^A-Za-z0-9_- ]/_}" -if [ ! -s "${dir}/${name}.${src}-${trg}.${trg}" ] && [ ! -s "${dir}/${name}.${trg}-${src}.${trg}" ]; then - mkdir -p "${dir}/opus" +tmp="$(dirname "${output_prefix}")/opus/${name_and_version}" +mkdir -p "${tmp}" - name_and_version="${dataset//[^A-Za-z0-9_- ]/_}" - archive_path="${dir}/opus/${name_and_version}.txt.zip" +archive_path="${tmp}/${name}.txt.zip" - test -s "${archive_path}" || - wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${src}-${trg}.txt.zip" || - wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${trg}-${src}.txt.zip" - unzip -o "${archive_path}" -d "${dir}" +wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${src}-${trg}.txt.zip" || + wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${trg}-${src}.txt.zip" +unzip -o "${archive_path}" -d "${tmp}" + +for lang in ${src} ${trg}; do + pigz -c "${tmp}/${name}.${src}-${trg}.${lang}" > "${output_prefix}.${lang}.gz" || + pigz -c "${tmp}/${name}.${trg}-${src}.${lang}" > "${output_prefix}.${lang}.gz" +done + +rm -rf "${tmp}" - rm -rf "${dir}/opus" -fi echo "###### Done: Downloading opus corpus" diff --git a/pipeline/data/importers/corpus/sacrebleu.sh b/pipeline/data/importers/corpus/sacrebleu.sh index dece83e21..cecacc3bf 100644 --- a/pipeline/data/importers/corpus/sacrebleu.sh +++ b/pipeline/data/importers/corpus/sacrebleu.sh @@ -10,15 +10,10 @@ echo "###### Downloading sacrebleu corpus" src=$1 trg=$2 -dir=$3 +output_prefix=$3 dataset=$4 -name="${dataset//[^A-Za-z0-9_- ]/_}" - -test -s "${dir}/${name}.${src}" || -sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo src > "${dir}/${name}.${src}" - -test -s "${dir}/${name}.${trg}" || -sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo ref > "${dir}/${name}.${trg}" +sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo src | pigz > "${output_prefix}.${src}.gz" +sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo ref | pigz > "${output_prefix}.${trg}.gz" echo "###### Done: Downloading sacrebleu corpus" diff --git a/pipeline/data/importers/mono/commoncrawl.sh b/pipeline/data/importers/mono/commoncrawl.sh index d093ebe20..917971f2c 100644 --- a/pipeline/data/importers/mono/commoncrawl.sh +++ b/pipeline/data/importers/mono/commoncrawl.sh @@ -12,8 +12,7 @@ lang=$1 output_prefix=$2 dataset=$3 -test -s "${output_prefix}.gz" || - wget -O "${output_prefix}.xz" \ +wget -O "${output_prefix}.xz" \ 
"http://web-language-models.s3-website-us-east-1.amazonaws.com/${dataset}/deduped/${lang}.xz" xzcat "${output_prefix}.xz" | pigz >"${output_prefix}.gz" diff --git a/pipeline/data/importers/mono/custom-mono.sh b/pipeline/data/importers/mono/custom-mono.sh index c326cde10..aba3e053c 100644 --- a/pipeline/data/importers/mono/custom-mono.sh +++ b/pipeline/data/importers/mono/custom-mono.sh @@ -13,7 +13,7 @@ lang=$1 output_prefix=$2 dataset=$3 -cp "${dataset}.${lang}.gz" "${output_prefix}.${lang}.gz" +cp "${dataset}.${lang}.gz" "${output_prefix}.gz" echo "###### Done: Copying custom monolingual dataset" \ No newline at end of file diff --git a/pipeline/data/importers/mono/news-crawl.sh b/pipeline/data/importers/mono/news-crawl.sh index d695243ec..39d8dc13b 100644 --- a/pipeline/data/importers/mono/news-crawl.sh +++ b/pipeline/data/importers/mono/news-crawl.sh @@ -12,8 +12,7 @@ dataset=$3 echo "###### Downloading WMT newscrawl monolingual data" -test -s "${output_prefix}.gz" || - wget -O "${output_prefix}.gz" \ +wget -O "${output_prefix}.gz" \ "http://data.statmt.org/news-crawl/${lang}/${dataset}.${lang}.shuffled.deduped.gz" echo "###### Done: Downloading WMT newscrawl monolingual data" diff --git a/pipeline/data/importers/mono/paracrawl-mono.sh b/pipeline/data/importers/mono/paracrawl-mono.sh index d86a1b852..fca5660ab 100644 --- a/pipeline/data/importers/mono/paracrawl-mono.sh +++ b/pipeline/data/importers/mono/paracrawl-mono.sh @@ -13,7 +13,6 @@ output_prefix=$2 dataset=$3 if [[ "${lang}" == "en" ]]; then - test -s "${output_prefix}.gz" || wget -O "${output_prefix}.gz" "https://neural.mt/data/${dataset}-mono/en-000.gz" else echo "Only English language is supported at this time for Paracrawl" diff --git a/pipeline/quantize/eval.sh b/pipeline/quantize/eval.sh index 602a7f7ad..07caf925e 100644 --- a/pipeline/quantize/eval.sh +++ b/pipeline/quantize/eval.sh @@ -16,32 +16,36 @@ model_dir=$1 shortlist=$2 datasets_dir=$3 vocab=$4 +eval_dir=$5 -eval_dir="${model_dir}/eval" +cd "$(dirname "${0}")" mkdir -p "${eval_dir}" echo "### Evaluating a model ${model_dir} on CPU" -for src_path in "${datasets_dir}"/*."${SRC}"; do - prefix=$(basename "${src_path}" ".${SRC}") +for src_path in "${datasets_dir}"/*."${SRC}.gz"; do + prefix=$(basename "${src_path}" ".${SRC}.gz") echo "### Evaluating ${prefix} ${SRC}-${TRG}" + pigz -dc "${datasets_dir}/${prefix}.${TRG}.gz" > "${eval_dir}/${prefix}.${TRG}.ref" + test -s "${eval_dir}/${prefix}.${TRG}.bleu" || - tee "${eval_dir}/${prefix}.${SRC}" < "${src_path}" | + pigz -dc "${src_path}" | + tee "${eval_dir}/${prefix}.${SRC}" | "${MARIAN}"/marian-decoder \ -m "${model_dir}/model.intgemm.alphas.bin" \ -v "${vocab}" "${vocab}" \ - -c "pipeline/quantize/decoder.yml" \ + -c "decoder.yml" \ --quiet \ --quiet-translation \ --log "${eval_dir}/${prefix}.log" \ --shortlist "${shortlist}" false \ --int8shiftAlphaAll | tee "${eval_dir}/${prefix}.${TRG}" | - sacrebleu -d --score-only -l "${SRC}-${TRG}" "${datasets_dir}/${prefix}.${TRG}" | - tee "${eval_dir}/${prefix}.${TRG}.bleu" + sacrebleu "${eval_dir}/${prefix}.${TRG}.ref" -d -f text --score-only -l "${SRC}-${TRG}" -m bleu chrf | + tee "${eval_dir}/${prefix}.${TRG}.metrics" - test -e "${eval_dir}/${prefix}.${TRG}.bleu" || exit 1 + test -e "${eval_dir}/${prefix}.${TRG}.metrics" || exit 1 done echo "###### Done: Evaluation of a quantized model" diff --git a/pipeline/quantize/quantize.sh b/pipeline/quantize/quantize.sh index 3d3def6d2..9a18653a4 100644 --- a/pipeline/quantize/quantize.sh +++ b/pipeline/quantize/quantize.sh @@ -19,6 
+19,8 @@ shortlist=$3 devtest_src=$4 output_dir=$5 +cd "$(dirname "${0}")" + res_model="${output_dir}/model.intgemm.alphas.bin" mkdir -p "${output_dir}" cp "${vocab}" "${output_dir}" @@ -28,7 +30,7 @@ test -s "${output_dir}/quantmults" || "${MARIAN}"/marian-decoder \ -m "${model}" \ -v "${vocab}" "${vocab}" \ - -c "pipeline/quantize/decoder.yml" \ + -c "decoder.yml" \ -i "${devtest_src}" \ -o "${output_dir}/output.${TRG}" \ --shortlist "${shortlist}" false \ diff --git a/pipeline/train/configs/model/s2s.yml b/pipeline/train/configs/model/backward.yml similarity index 100% rename from pipeline/train/configs/model/s2s.yml rename to pipeline/train/configs/model/backward.yml diff --git a/pipeline/train/configs/model/student.tiny11.yml b/pipeline/train/configs/model/student.yml similarity index 100% rename from pipeline/train/configs/model/student.tiny11.yml rename to pipeline/train/configs/model/student.yml diff --git a/pipeline/train/configs/model/teacher.transformer.yml b/pipeline/train/configs/model/teacher.transformer.yml deleted file mode 100644 index cf3a72d3b..000000000 --- a/pipeline/train/configs/model/teacher.transformer.yml +++ /dev/null @@ -1,8 +0,0 @@ -# https://github.com/marian-nmt/marian-examples/tree/master/transformer -# https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin -dec-depth: 6 -dim-vocabs: [32000, 32000] -enc-depth: 6 -tied-embeddings-all: true -transformer-dropout: 0.1 -type: transformer diff --git a/pipeline/train/configs/model/teacher.yml b/pipeline/train/configs/model/teacher.yml new file mode 100644 index 000000000..57ebc0510 --- /dev/null +++ b/pipeline/train/configs/model/teacher.yml @@ -0,0 +1,6 @@ +# https://discourse.translatelocally.com/t/marian-configuration-to-use/24 +dim-vocabs: [32000, 32000] +type: transformer +# tasks: https://github.com/marian-nmt/marian-dev/blob/master/src/common/aliases.cpp +task: transformer-big +#task: transformer-base # use smaller model for low resource (<5M sentences) \ No newline at end of file diff --git a/pipeline/train/configs/training/s2s.train.yml b/pipeline/train/configs/training/backward.train.yml similarity index 84% rename from pipeline/train/configs/training/s2s.train.yml rename to pipeline/train/configs/training/backward.train.yml index 80a74e7ec..8e66f1d46 100644 --- a/pipeline/train/configs/training/s2s.train.yml +++ b/pipeline/train/configs/training/backward.train.yml @@ -1,5 +1,5 @@ ## https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin -after-epochs: 10 +after: 10e # change based on available training data beam-size: 12 cost-type: ce-mean-words disp-freq: 1000 diff --git a/pipeline/train/configs/training/teacher.continue.yml b/pipeline/train/configs/training/teacher.continue.yml new file mode 100644 index 000000000..c752ca9d6 --- /dev/null +++ b/pipeline/train/configs/training/teacher.continue.yml @@ -0,0 +1,9 @@ +# https://discourse.translatelocally.com/t/marian-configuration-to-use/24 +disp-freq: 1000 +learn-rate: 0.0003 # Turn this down if you get a diverged model, maybe 0.0001 +no-restore-corpus: True +optimizer-delay: 1 # Roughly GPU devices * optimizer-delay = 8, but keep as an integer +save-freq: 5000 +valid-freq: 3000 +valid-max-length: 300 +valid-mini-batch: 8 \ No newline at end of file diff --git a/pipeline/train/configs/training/teacher.train.yml b/pipeline/train/configs/training/teacher.train.yml new file mode 100644 index 000000000..718b476b1 --- /dev/null +++ b/pipeline/train/configs/training/teacher.train.yml @@ -0,0 +1,9 @@ +# 
https://discourse.translatelocally.com/t/marian-configuration-to-use/24 +after: 2e # remove for low resource languages or if training without augmentation +disp-freq: 1000 +learn-rate: 0.0003 # Turn this down if you get a diverged model, maybe 0.0001 +optimizer-delay: 1 # Roughly GPU devices * optimizer-delay = 8, but keep as an integer +save-freq: 5000 +valid-freq: 3000 +valid-max-length: 300 +valid-mini-batch: 8 \ No newline at end of file diff --git a/pipeline/train/configs/training/teacher.transformer-ens.train.yml b/pipeline/train/configs/training/teacher.transformer-ens.train.yml deleted file mode 100644 index 0c3c3e79a..000000000 --- a/pipeline/train/configs/training/teacher.transformer-ens.train.yml +++ /dev/null @@ -1,22 +0,0 @@ -# https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin -after-epochs: 8 -beam-size: 12 -clip-norm: 5 -cost-type: ce-mean-words -disp-freq: 500 -early-stopping: 5 -exponential-smoothing: True -label-smoothing: 0.1 -learn-rate: 0.0003 -lr-decay-inv-sqrt: 16000 -lr-report: True -lr-warmup: 16000 -max-length: 100 -maxi-batch: 1000 -mini-batch-fit: True -mini-batch: 1000 -normalize: 1 -optimizer-params: [0.9, 0.98, 1e-09] -save-freq: 5000 -valid-freq: 5000 -valid-mini-batch: 64 \ No newline at end of file diff --git a/pipeline/train/configs/training/teacher.transformer.train.yml b/pipeline/train/configs/training/teacher.transformer.train.yml deleted file mode 100644 index 12ecc7db6..000000000 --- a/pipeline/train/configs/training/teacher.transformer.train.yml +++ /dev/null @@ -1,21 +0,0 @@ -# https://github.com/marian-nmt/marian-examples/tree/master/transformer -beam-size: 6 -clip-norm: 5 -cost-type: ce-mean-words -disp-first: 10 -disp-freq: 500 -early-stopping: 10 -exponential-smoothing: True -label-smoothing: 0.1 -learn-rate: 0.0003 -lr-decay-inv-sqrt: 16000 -lr-report: True -lr-warmup: 16000 -max-length: 100 -maxi-batch: 1000 -mini-batch-fit: True -normalize: 0.6 -optimizer-params: [0.9, 0.98, 1e-09] -save-freq: 5000 -valid-freq: 5000 -valid-mini-batch: 64 \ No newline at end of file diff --git a/pipeline/train/eval.sh b/pipeline/train/eval.sh index e51ccbb61..56f18ac03 100644 --- a/pipeline/train/eval.sh +++ b/pipeline/train/eval.sh @@ -12,38 +12,38 @@ test -v GPUS test -v MARIAN test -v WORKSPACE -model_dir=$1 +eval_dir=$1 datasets_dir=$2 -src="${3:-${SRC}}" -trg="${4:-${TRG}}" +src=$3 +trg=$4 +models=( "${@:5}" ) -config="${model_dir}/model.npz.best-bleu-detok.npz.decoder.yml" -eval_dir="${model_dir}/eval" - -echo "### Checking model files" -test -e "${config}" || exit 1 mkdir -p "${eval_dir}" -echo "### Evaluating a model ${model_dir}" -for src_path in "${datasets_dir}"/*."${src}"; do - prefix=$(basename "${src_path}" ".${src}") +echo "### Evaluating the model" +for src_path in "${datasets_dir}"/*."${src}.gz"; do + prefix=$(basename "${src_path}" ".${src}.gz") echo "### Evaluating ${prefix} ${src}-${trg}" + pigz -dc "${datasets_dir}/${prefix}.${TRG}.gz" > "${eval_dir}/${prefix}.${TRG}.ref" + test -s "${eval_dir}/${prefix}.${trg}.bleu" || - tee "${eval_dir}/${prefix}.${src}" < "${src_path}" | + pigz -dc "${src_path}" | + tee "${eval_dir}/${prefix}.${src}" | "${MARIAN}"/marian-decoder \ - -c "${config}" \ + -m "${models[@]}" \ + -c "${models[0]}.decoder.yml" \ -w "${WORKSPACE}" \ --quiet \ --quiet-translation \ --log "${eval_dir}/${prefix}.log" \ -d ${GPUS} | tee "${eval_dir}/${prefix}.${trg}" | - sacrebleu -d --score-only -l "${src}-${trg}" "${datasets_dir}/${prefix}.${trg}" | - tee "${eval_dir}/${prefix}.${trg}.bleu" + sacrebleu 
"${eval_dir}/${prefix}.${TRG}.ref" -d -f text --score-only -l "${src}-${trg}" -m bleu chrf | + tee "${eval_dir}/${prefix}.${trg}.metrics" - test -e "${eval_dir}/${prefix}.${trg}.bleu" || exit 1 + test -e "${eval_dir}/${prefix}.${trg}.metrics" || exit 1 done diff --git a/pipeline/train/finetune-student.sh b/pipeline/train/finetune-student.sh deleted file mode 100644 index bd78d8ba9..000000000 --- a/pipeline/train/finetune-student.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -## -# Finetune a student model. -# - -set -x -set -euo pipefail - -echo "###### Finetuning the student model" - -dir=$1 -corpus=$2 -devset=$3 -vocab=$4 -alignment=$5 -student=$6 -extra_params=( "${@:7}" ) - -test -v SRC -test -v TRG - - -mkdir -p "${dir}" -cp "${student}" "${dir}/model.npz" - -bash "pipeline/train/train.sh" \ - "pipeline/train/configs/model/student.tiny11.yml" \ - "pipeline/train/configs/training/student.finetune.yml" \ - "${SRC}" \ - "${TRG}" \ - "${corpus}" \ - "${devset}" \ - "${dir}" \ - "${vocab}" \ - --guided-alignment "${alignment}" \ - "${extra_params[@]}" - -echo "###### Done: Finetuning the student model" - - diff --git a/pipeline/train/train-s2s.sh b/pipeline/train/train-s2s.sh deleted file mode 100644 index 002f1d543..000000000 --- a/pipeline/train/train-s2s.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -## -# Train a shallow s2s model. -# - -set -x -set -euo pipefail - -echo "###### Training s2s model" - -dir=$1 -corpus=$2 -devset=$3 -vocab=$4 -src=$5 -trg=$6 -extra_params=( "${@:7}" ) - - -bash "pipeline/train/train.sh" \ - "pipeline/train/configs/model/s2s.yml" \ - "pipeline/train/configs/training/s2s.train.yml" \ - "${src}" \ - "${trg}" \ - "${corpus}" \ - "${devset}" \ - "${dir}" \ - "${vocab}" \ - "${extra_params[@]}" - - -echo "###### Done: Training s2s model" diff --git a/pipeline/train/train-student.sh b/pipeline/train/train-student.sh index 5b450e9f4..e0a5f9afb 100644 --- a/pipeline/train/train-student.sh +++ b/pipeline/train/train-student.sh @@ -8,27 +8,15 @@ set -euo pipefail echo "###### Training a student model" -dir=$1 -corpus=$2 -devset=$3 -vocab=$4 -alignment=$5 -extra_params=( "${@:6}" ) - -test -v SRC -test -v TRG - -bash "pipeline/train/train.sh" \ - "pipeline/train/configs/model/student.tiny11.yml" \ - "pipeline/train/configs/training/student.train.yml" \ - "${SRC}" \ - "${TRG}" \ - "${corpus}" \ - "${devset}" \ - "${dir}" \ - "${vocab}" \ +alignment=$1 +extra_params=( "${@:2}" ) + +cd "$(dirname "${0}")" + +bash "train.sh" \ + "${extra_params[@]}" \ --guided-alignment "${alignment}" \ - "${extra_params[@]}" + echo "###### Done: Training a student model" diff --git a/pipeline/train/train-teacher.sh b/pipeline/train/train-teacher.sh deleted file mode 100644 index dc3abee07..000000000 --- a/pipeline/train/train-teacher.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -## -# Train a teacher model. 
-# - -set -x -set -euo pipefail - -echo "###### Training a teacher model" - -dir=$1 -corpus=$2 -devset=$3 -vocab=$4 -extra_params=( "${@:5}" ) - -test -v SRC -test -v TRG - -bash "pipeline/train/train.sh" \ - "pipeline/train/configs/model/teacher.transformer.yml" \ - "pipeline/train/configs/training/teacher.transformer.train.yml" \ - "${SRC}" \ - "${TRG}" \ - "${corpus}" \ - "${devset}" \ - "${dir}" \ - "${vocab}" \ - "${extra_params[@]}" - -echo "###### Training a teacher model" diff --git a/pipeline/train/train.sh b/pipeline/train/train.sh index 15f67145f..b089fb8a7 100644 --- a/pipeline/train/train.sh +++ b/pipeline/train/train.sh @@ -8,10 +8,8 @@ set -euo pipefail echo "###### Training a model" -#TODO too many positional args here, replace with names args - -model_config=$1 -training_config=$2 +model_type=$1 +training_type=$2 src=$3 trg=$4 train_set_prefix=$5 @@ -24,6 +22,7 @@ test -v GPUS test -v MARIAN test -v WORKSPACE +cd "$(dirname "${0}")" mkdir -p "${model_dir}/tmp" echo "### Training ${model_dir}" @@ -32,7 +31,7 @@ echo "### Training ${model_dir}" "${MARIAN}/marian" \ --model "${model_dir}/model.npz" \ - -c "${model_config}" "${training_config}" \ + -c "configs/model/${model_type}.yml" "configs/training/${model_type}.${training_type}.yml" \ --train-sets "${train_set_prefix}".{"${src}","${trg}"}.gz \ -T "${model_dir}/tmp" \ --shuffle-in-ram \ @@ -40,7 +39,7 @@ echo "### Training ${model_dir}" -w "${WORKSPACE}" \ --devices ${GPUS} \ --sync-sgd \ - --valid-metrics bleu-detok ce-mean-words perplexity \ + --valid-metrics ce-mean-words bleu-detok chrf \ --valid-sets "${valid_set_prefix}".{"${src}","${trg}"}.gz \ --valid-translation-output "${model_dir}/devset.out" \ --quiet-translation \ diff --git a/pipeline/translate/decoder.yml b/pipeline/translate/decoder.yml index 4ebbbf520..664a9f733 100644 --- a/pipeline/translate/decoder.yml +++ b/pipeline/translate/decoder.yml @@ -1,7 +1,8 @@ normalize: 1.0 word-penalty: 0 mini-batch: 16 -mini-batch-words: 2000 +#mini-batch-words: 2000 # 1 model or 24 gb GPU +mini-batch-words: 500 # 12 Gb GPU, ensemble of 4 teachers maxi-batch: 1000 maxi-batch-sort: src max-length: 200 diff --git a/pipeline/translate/translate-nbest.sh b/pipeline/translate/translate-nbest.sh index 6099acbc3..97a66b30b 100755 --- a/pipeline/translate/translate-nbest.sh +++ b/pipeline/translate/translate-nbest.sh @@ -14,9 +14,10 @@ input=$1 vocab=$2 models=( "${@:3}" ) +cd "$(dirname "${0}")" "${MARIAN}/marian-decoder" \ - -c pipeline/translate/decoder.yml \ + -c decoder.yml \ -m "${models[@]}" \ -v "${vocab}" "${vocab}" \ -i "${input}" \ @@ -26,3 +27,4 @@ models=( "${@:3}" ) -d ${GPUS} \ -w "${WORKSPACE}" +test "$(wc -l <"${input}.nbest")" -eq "$(( $(wc -l <"${input}") * 8 ))" \ No newline at end of file diff --git a/pipeline/translate/translate.sh b/pipeline/translate/translate.sh index b5389f6e4..f046ae532 100755 --- a/pipeline/translate/translate.sh +++ b/pipeline/translate/translate.sh @@ -15,8 +15,10 @@ vocab=$2 models=( "${@:3}" ) +cd "$(dirname "${0}")" + "${MARIAN}/marian-decoder" \ - -c pipeline/translate/decoder.yml \ + -c decoder.yml \ -m "${models[@]}" \ -v "${vocab}" "${vocab}" \ -i "${input}" \ @@ -25,3 +27,4 @@ models=( "${@:3}" ) -d ${GPUS} \ -w "${WORKSPACE}" +test "$(wc -l <"${input}")" == "$(wc -l <"${input}.out")" diff --git a/profiles/snakepit/config.yaml b/profiles/snakepit/config.yaml deleted file mode 100644 index 3a27972a1..000000000 --- a/profiles/snakepit/config.yaml +++ /dev/null @@ -1,6 +0,0 @@ -cluster: "submit.py" -cluster-status: "status.py" 
-jobscript: "jobscript.sh" -jobs: 10 -immediate-submit: false -verbose: true \ No newline at end of file diff --git a/profiles/snakepit/jobscript.sh b/profiles/snakepit/jobscript.sh deleted file mode 100644 index a72f24e3a..000000000 --- a/profiles/snakepit/jobscript.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -# properties = {properties} -{exec_job} \ No newline at end of file diff --git a/profiles/snakepit/status.py b/profiles/snakepit/status.py deleted file mode 100644 index ca375ed10..000000000 --- a/profiles/snakepit/status.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 - - -import sys -import subprocess - -job_id = sys.argv[1] - -try: - cmd = f''' - unset http_proxy - unset HTTP_PROXY - pit show job:{job_id}''' - - res = subprocess.run(cmd, - check=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True) - - info = res.stdout.decode() - - if 'FIN' in info: - if 'Status code: 0' in info: - print("success") - else: - print("failed") - else: - print("running") - -except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: - print("failed") diff --git a/profiles/snakepit/submit.py b/profiles/snakepit/submit.py deleted file mode 100644 index ba396b67d..000000000 --- a/profiles/snakepit/submit.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys -import argparse -import subprocess - -from snakemake.utils import read_job_properties - -jobscript = sys.argv[-1] -job_properties = read_job_properties(jobscript) - -request = '[]' # cpu only -if "resources" in job_properties: - resources = job_properties["resources"] - - if 'gpu' in resources: - num = resources['gpu'] - # todo: find available models - request = f'[{num}:txp]' - -name = job_properties.get("rule") -cmd = f''' - unset http_proxy - unset HTTP_PROXY - mkdir -p empty - cd empty - pit run snakemake-{name} {request} -e "bash {jobscript}"''' - -try: - res = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -except subprocess.CalledProcessError as e: - raise e - -res = res.stdout.decode() -number_line = '=> job number:' -job_id = res[res.find(number_line) + len(number_line):].strip() -print(job_id) diff --git a/reports/evaluation.rst b/reports/evaluation.rst index 4aca21f56..133588b7e 100644 --- a/reports/evaluation.rst +++ b/reports/evaluation.rst @@ -1 +1 @@ -.. include:: {{ snakemake.output[0] }}/{{ snakemake.wildcards.name }}.bleu \ No newline at end of file +.. include:: {{ snakemake.output[0] }}/{{ snakemake.wildcards.name }}.metrics \ No newline at end of file