Skip to content

Commit

Permalink
Add wget to mock out downloads for importers (#410)
Browse files Browse the repository at this point in the history
* Use run_task for dataset importing

* Add a wget mock for downloads
  • Loading branch information
gregtatum authored Jan 30, 2024
1 parent cb4231e commit 3d81ca5
Show file tree
Hide file tree
Showing 13 changed files with 122 additions and 34 deletions.
Empty file modified pipeline/data/dataset_importer.py
100644 → 100755
Empty file.
3 changes: 2 additions & 1 deletion pipeline/data/importers/corpus/flores.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ dataset=$4

COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}"
ARTIFACT_EXT="${ARTIFACT_EXT:-gz}"
WGET="${WGET:-wget}" # This can be overridden by tests.

tmp="$(mktemp -d)/flores/${dataset}"
mkdir -p "${tmp}"

wget -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz"
${WGET} -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz"
tar -xzf "${tmp}/flores101_dataset.tar.gz" -C "${tmp}" --no-same-owner

flores_code() {
Expand Down
5 changes: 3 additions & 2 deletions pipeline/data/importers/corpus/opus.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dataset=$4

COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}"
ARTIFACT_EXT="${ARTIFACT_EXT:-gz}"
WGET="${WGET:-wget}" # This can be overridden by tests.

name=${dataset%%/*}
name_and_version="${dataset//[^A-Za-z0-9_- ]/_}"
Expand All @@ -24,8 +25,8 @@ mkdir -p "${tmp}"

archive_path="${tmp}/${name}.txt.zip"

wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${src}-${trg}.txt.zip" ||
wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${trg}-${src}.txt.zip"
${WGET} -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${src}-${trg}.txt.zip" ||
${WGET} -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${trg}-${src}.txt.zip"
unzip -o "${archive_path}" -d "${tmp}"

for lang in ${src} ${trg}; do
Expand Down
3 changes: 2 additions & 1 deletion pipeline/data/importers/mono/commoncrawl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ echo "###### Downloading commoncrawl monolingual data"
lang=$1
output_prefix=$2
dataset=$3
WGET="${WGET:-wget}" # This can be overridden by tests.

wget -O "${output_prefix}.xz" \
${WGET} -O "${output_prefix}.xz" \
"http://web-language-models.s3-website-us-east-1.amazonaws.com/${dataset}/deduped/${lang}.xz"
xzcat "${output_prefix}.xz" | pigz >"${output_prefix}.gz"

Expand Down
2 changes: 0 additions & 2 deletions pipeline/data/importers/mono/news-crawl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,3 @@ curl -L "http://data.statmt.org/news-crawl/${lang}/${dataset}.${lang}.shuffled.d
gunzip | ${COMPRESSION_CMD} -c > "${output_prefix}.${ARTIFACT_EXT}"

echo "###### Done: Downloading WMT newscrawl monolingual data"


12 changes: 6 additions & 6 deletions taskcluster/kinds/dataset/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ tasks:
python3 $VCS_PATH/pipeline/data/dataset_importer.py
--type corpus
--dataset {dataset}
--output_prefix /builds/worker/artifacts/{dataset_sanitized}
--output_prefix $TASK_WORKDIR/artifacts/{dataset_sanitized}
sacrebleu:
description: Fetch sacrebleu dataset
Expand All @@ -92,7 +92,7 @@ tasks:
python3 -u $VCS_PATH/pipeline/data/dataset_importer.py
--type corpus
--dataset {dataset}
--output_prefix /builds/worker/artifacts/{dataset_sanitized} 2>&1
--output_prefix $TASK_WORKDIR/artifacts/{dataset_sanitized} 2>&1
opus:
description: Fetch opus dataset
Expand All @@ -117,7 +117,7 @@ tasks:
python3 $VCS_PATH/pipeline/data/dataset_importer.py
--type corpus
--dataset {dataset}
--output_prefix /builds/worker/artifacts/{dataset_sanitized}
--output_prefix $TASK_WORKDIR/artifacts/{dataset_sanitized}
mtdata:
description: Fetch mtdata dataset
Expand All @@ -141,7 +141,7 @@ tasks:
python3 $VCS_PATH/pipeline/data/dataset_importer.py
--type corpus
--dataset {dataset}
--output_prefix /builds/worker/artifacts/{dataset_sanitized}
--output_prefix $TASK_WORKDIR/artifacts/{dataset_sanitized}
news-crawl-{src_locale}:
description: Fetch news-crawl dataset for {src_locale}
Expand Down Expand Up @@ -172,7 +172,7 @@ tasks:
{dataset}
{src_locale}
{max_sent_src}
/builds/worker/artifacts/{dataset_sanitized}.{src_locale}.zst
$TASK_WORKDIR/artifacts/{dataset_sanitized}.{src_locale}.zst
news-crawl-{trg_locale}:
description: Fetch news-crawl dataset for {trg_locale}
Expand Down Expand Up @@ -203,4 +203,4 @@ tasks:
{dataset}
{trg_locale}
{max_sent_trg}
/builds/worker/artifacts/{dataset_sanitized}.{trg_locale}.zst
$TASK_WORKDIR/artifacts/{dataset_sanitized}.{trg_locale}.zst
3 changes: 3 additions & 0 deletions tests/data/corpus_samples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Corpus Samples

These are datasets that are cut down to only a few samples, which can be used for testing purposes.
Binary file added tests/data/corpus_samples/en-ru.txt.zip
Binary file not shown.
Binary file not shown.
15 changes: 15 additions & 0 deletions tests/fixtures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,3 +300,18 @@ def get_task_command_and_env(task_name: str, script=None) -> tuple[str, dict[str

# Return the full command.
return command_parts, env


def get_mocked_downloads() -> str:
    """Build the JSON payload for the MOCKED_DOWNLOADS environment variable.

    Maps each download URL used by the importer scripts to a local fixture
    file under tests/data/corpus_samples. A value of "404" tells the mocked
    wget to simulate a failed download instead of copying a file.
    """
    samples_dir = os.path.abspath(os.path.join(FIXTURES_PATH, "../data/corpus_samples"))

    def sample(file_name: str) -> str:
        # Resolve a fixture file inside the corpus samples directory.
        return os.path.join(samples_dir, file_name)

    downloads = {
        "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz": sample(
            "flores101_dataset.tar.gz"
        ),
        "https://object.pouta.csc.fi/OPUS-ELRC-3075-wikipedia_health/v1/moses/en-ru.txt.zip": sample(
            "en-ru.txt.zip"
        ),
        "https://object.pouta.csc.fi/OPUS-ELRC-3075-wikipedia_health/v1/moses/ru-en.txt.zip": "404",
    }
    return json.dumps(downloads)
2 changes: 2 additions & 0 deletions tests/fixtures/config.pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@ datasets:
test:
- flores_devtest
- sacrebleu_wmt09
- sacrebleu_wmt19
- mtdata_Neulab-tedtalks_test-1-eng-rus
train:
- opus_Books/v1
- opus_CCAligned/v1
- opus_CCMatrix/v1
- opus_ELRC-3075-wikipedia_health/v1
mono-src:
- news-crawl_news.2021
- news-crawl_news.2020
Expand Down
56 changes: 56 additions & 0 deletions tests/fixtures/wget
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
"""
This is a test fixture which mocks wget.
Set the MOCKED_DOWNLOADS environment variable to a JSON object mapping a URL to a file path.
"""

import argparse
import json
import os
import shutil
import sys


def load_mocked_downloads() -> dict:
    """Read and validate MOCKED_DOWNLOADS: a JSON object mapping URL -> file path.

    Raises if the variable is unset or does not decode to a JSON object.
    """
    raw = os.environ.get("MOCKED_DOWNLOADS")
    if not raw:
        raise Exception(
            "The mocked_wget utility expected the MOCKED_DOWNLOADS environment variable to be set."
        )

    mocked_downloads = json.loads(raw)

    if not isinstance(mocked_downloads, dict):
        raise Exception(
            "Expected the mocked downloads to be a json object mapping the URL to file path"
        )

    return mocked_downloads


def main() -> None:
    """Mimic `wget -O <output> <url>` by copying a fixture file into place."""
    mocked_downloads = load_mocked_downloads()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-O", "--output-document", dest="output", help="The output path", required=True
    )
    parser.add_argument("url", help="The url to download")

    args = parser.parse_args()

    print("[mocked wget]", args.url)

    source_file = mocked_downloads.get(args.url)
    if not source_file:
        print("[mocked wget] MOCKED_DOWNLOADS:", mocked_downloads)
        raise Exception(f"Received a URL that was not in MOCKED_DOWNLOADS {args.url}")

    if source_file == "404":
        print("[mocked wget]: Mocking a 404")
        # 8 is what wget gives as an exit code in this case.
        # Fix: use sys.exit directly instead of the undocumented os.sys alias.
        sys.exit(8)

    if not os.path.exists(source_file):
        raise Exception(f"The source file specified did not exist {source_file}")

    print("[mocked wget] copying the file")
    print(f"[mocked wget] from: {source_file}")
    print(f"[mocked wget] to: {args.output}")

    shutil.copyfile(source_file, args.output)

    print("[mocked wget] Success")


if __name__ == "__main__":
    main()
55 changes: 33 additions & 22 deletions tests/test_data_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,20 @@
import os

import pytest
from fixtures import DataDir
from fixtures import DataDir, get_mocked_downloads

SRC = "ru"
TRG = "en"
ARTIFACT_EXT = "gz"
COMPRESSION_CMD = "pigz"
CURRENT_FOLDER = os.path.dirname(os.path.abspath(__file__))

os.environ["ARTIFACT_EXT"] = ARTIFACT_EXT
os.environ["COMPRESSION_CMD"] = "pigz"
os.environ["COMPRESSION_CMD"] = COMPRESSION_CMD
os.environ["SRC"] = SRC
os.environ["TRG"] = TRG


from pipeline.data.dataset_importer import run_import

# the augmentation is probabilistic, here is a range for 0.1 probability
Expand Down Expand Up @@ -47,36 +50,44 @@ def data_dir():


@pytest.mark.parametrize(
"dataset",
"importer,dataset",
[
"mtdata_Neulab-tedtalks_test-1-eng-rus",
"opus_ELRC-3075-wikipedia_health/v1",
"flores_dev",
"sacrebleu_wmt19",
("mtdata", "Neulab-tedtalks_test-1-eng-rus"),
("opus", "ELRC-3075-wikipedia_health_v1"),
("flores", "dev"),
("sacrebleu", "wmt19"),
],
)
def test_basic_corpus_import(dataset, data_dir):
prefix = data_dir.join(dataset)
output_src = f"{prefix}.{SRC}.{ARTIFACT_EXT}"
output_trg = f"{prefix}.{TRG}.{ARTIFACT_EXT}"

run_import("corpus", dataset, prefix)
def test_basic_corpus_import(importer, dataset, data_dir):
    """Run a dataset importer task end to end, with downloads served by the wget mock.

    The task is executed via data_dir.run_task; WGET is pointed at the
    tests/fixtures/wget script and MOCKED_DOWNLOADS supplies the URL -> fixture
    mapping so no real network access happens.
    """
    data_dir.run_task(
        f"dataset-{importer}-{dataset}-en-ru",
        env={
            "COMPRESSION_CMD": COMPRESSION_CMD,
            "ARTIFACT_EXT": ARTIFACT_EXT,
            "WGET": os.path.join(CURRENT_FOLDER, "fixtures/wget"),
            "MOCKED_DOWNLOADS": get_mocked_downloads(),
        },
    )

    prefix = data_dir.join(f"artifacts/{dataset}")
    # Use the module-level SRC/TRG/ARTIFACT_EXT constants instead of hard-coded
    # "ru"/"en"/"gz" so these assertions stay in sync with the env the task ran with.
    output_src = f"{prefix}.{SRC}.{ARTIFACT_EXT}"
    output_trg = f"{prefix}.{TRG}.{ARTIFACT_EXT}"

    assert os.path.exists(output_src)
    assert os.path.exists(output_trg)
    assert len(read_lines(output_src)) > 0
    assert len(read_lines(output_trg)) > 0


@pytest.mark.parametrize(
"params",
[
("sacrebleu_aug-upper_wmt19", is_upper_case, AUG_MIN_RATE, AUG_MAX_RATE),
("sacrebleu_aug-upper-strict_wmt19", is_upper_case, 1.0, 1.0),
("sacrebleu_aug-title_wmt19", is_title_case, AUG_MIN_RATE, AUG_MAX_RATE),
("sacrebleu_aug-title-strict_wmt19", is_title_case, 1.0, 1.0),
],
)
# Parameter tuples for test_specific_augmentation. Each entry is
# (dataset name carrying an augmentation modifier, predicate that detects the
#  augmentation on a line, minimum expected match rate, maximum expected match rate).
# The "-strict" variants expect every line augmented (rate 1.0..1.0).
augmentation_params = [
    ("sacrebleu_aug-upper_wmt19", is_upper_case, AUG_MIN_RATE, AUG_MAX_RATE),
    ("sacrebleu_aug-upper-strict_wmt19", is_upper_case, 1.0, 1.0),
    ("sacrebleu_aug-title_wmt19", is_title_case, AUG_MIN_RATE, AUG_MAX_RATE),
    ("sacrebleu_aug-title-strict_wmt19", is_title_case, 1.0, 1.0),
]


@pytest.mark.parametrize("params", augmentation_params, ids=[d[0] for d in augmentation_params])
def test_specific_augmentation(params, data_dir):
dataset, check_func, min_rate, max_rate = params
prefix = data_dir.join(dataset)
Expand Down

0 comments on commit 3d81ca5

Please sign in to comment.