Skip to content

Commit

Permalink
Add wget to mock out downloads for importers (#410)
Browse files Browse the repository at this point in the history
* Use run_task for dataset importing

* Add a wget mock for downloads
  • Loading branch information
gregtatum authored Jan 30, 2024
1 parent cb4231e commit 3d81ca5
Show file tree
Hide file tree
Showing 13 changed files with 122 additions and 34 deletions.
Empty file modified pipeline/data/dataset_importer.py
100644 → 100755
Empty file.
3 changes: 2 additions & 1 deletion pipeline/data/importers/corpus/flores.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ dataset=$4

COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}"
ARTIFACT_EXT="${ARTIFACT_EXT:-gz}"
WGET="${WGET:-wget}" # This can be overridden by tests.

tmp="$(mktemp -d)/flores/${dataset}"
mkdir -p "${tmp}"

wget -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz"
${WGET} -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz"
tar -xzf "${tmp}/flores101_dataset.tar.gz" -C "${tmp}" --no-same-owner

flores_code() {
Expand Down
5 changes: 3 additions & 2 deletions pipeline/data/importers/corpus/opus.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dataset=$4

COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}"
ARTIFACT_EXT="${ARTIFACT_EXT:-gz}"
WGET="${WGET:-wget}" # This can be overridden by tests.

name=${dataset%%/*}
name_and_version="${dataset//[^A-Za-z0-9_- ]/_}"
Expand All @@ -24,8 +25,8 @@ mkdir -p "${tmp}"

archive_path="${tmp}/${name}.txt.zip"

wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${src}-${trg}.txt.zip" ||
wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${trg}-${src}.txt.zip"
${WGET} -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${src}-${trg}.txt.zip" ||
${WGET} -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${trg}-${src}.txt.zip"
unzip -o "${archive_path}" -d "${tmp}"

for lang in ${src} ${trg}; do
Expand Down
3 changes: 2 additions & 1 deletion pipeline/data/importers/mono/commoncrawl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ echo "###### Downloading commoncrawl monolingual data"
lang=$1
output_prefix=$2
dataset=$3
WGET="${WGET:-wget}" # This can be overridden by tests.

wget -O "${output_prefix}.xz" \
${WGET} -O "${output_prefix}.xz" \
"http://web-language-models.s3-website-us-east-1.amazonaws.com/${dataset}/deduped/${lang}.xz"
xzcat "${output_prefix}.xz" | pigz >"${output_prefix}.gz"

Expand Down
2 changes: 0 additions & 2 deletions pipeline/data/importers/mono/news-crawl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,3 @@ curl -L "http://data.statmt.org/news-crawl/${lang}/${dataset}.${lang}.shuffled.d
gunzip | ${COMPRESSION_CMD} -c > "${output_prefix}.${ARTIFACT_EXT}"

echo "###### Done: Downloading WMT newscrawl monolingual data"


12 changes: 6 additions & 6 deletions taskcluster/kinds/dataset/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ tasks:
python3 $VCS_PATH/pipeline/data/dataset_importer.py
--type corpus
--dataset {dataset}
--output_prefix /builds/worker/artifacts/{dataset_sanitized}
--output_prefix $TASK_WORKDIR/artifacts/{dataset_sanitized}
sacrebleu:
description: Fetch sacrebleu dataset
Expand All @@ -92,7 +92,7 @@ tasks:
python3 -u $VCS_PATH/pipeline/data/dataset_importer.py
--type corpus
--dataset {dataset}
--output_prefix /builds/worker/artifacts/{dataset_sanitized} 2>&1
--output_prefix $TASK_WORKDIR/artifacts/{dataset_sanitized} 2>&1
opus:
description: Fetch opus dataset
Expand All @@ -117,7 +117,7 @@ tasks:
python3 $VCS_PATH/pipeline/data/dataset_importer.py
--type corpus
--dataset {dataset}
--output_prefix /builds/worker/artifacts/{dataset_sanitized}
--output_prefix $TASK_WORKDIR/artifacts/{dataset_sanitized}
mtdata:
description: Fetch mtdata dataset
Expand All @@ -141,7 +141,7 @@ tasks:
python3 $VCS_PATH/pipeline/data/dataset_importer.py
--type corpus
--dataset {dataset}
--output_prefix /builds/worker/artifacts/{dataset_sanitized}
--output_prefix $TASK_WORKDIR/artifacts/{dataset_sanitized}
news-crawl-{src_locale}:
description: Fetch news-crawl dataset for {src_locale}
Expand Down Expand Up @@ -172,7 +172,7 @@ tasks:
{dataset}
{src_locale}
{max_sent_src}
/builds/worker/artifacts/{dataset_sanitized}.{src_locale}.zst
$TASK_WORKDIR/artifacts/{dataset_sanitized}.{src_locale}.zst
news-crawl-{trg_locale}:
description: Fetch news-crawl dataset for {trg_locale}
Expand Down Expand Up @@ -203,4 +203,4 @@ tasks:
{dataset}
{trg_locale}
{max_sent_trg}
/builds/worker/artifacts/{dataset_sanitized}.{trg_locale}.zst
$TASK_WORKDIR/artifacts/{dataset_sanitized}.{trg_locale}.zst
3 changes: 3 additions & 0 deletions tests/data/corpus_samples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Corpus Samples

These are datasets that are cut down to only a few samples, which can be used for testing purposes.
Binary file added tests/data/corpus_samples/en-ru.txt.zip
Binary file not shown.
Binary file not shown.
15 changes: 15 additions & 0 deletions tests/fixtures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,3 +300,18 @@ def get_task_command_and_env(task_name: str, script=None) -> tuple[str, dict[str

# Return the full command.
return command_parts, env


def get_mocked_downloads() -> str:
    """Build the JSON payload for the MOCKED_DOWNLOADS environment variable.

    Maps each download URL used by the importer scripts to a local fixture
    file under tests/data/corpus_samples. A value of "404" tells the mocked
    wget to simulate a failed download instead of copying a file.
    """
    samples_dir = os.path.abspath(os.path.join(FIXTURES_PATH, "../data/corpus_samples"))

    def sample(file_name: str) -> str:
        # Resolve a fixture file inside the corpus samples directory.
        return os.path.join(samples_dir, file_name)

    downloads = {
        "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz": sample(
            "flores101_dataset.tar.gz"
        ),
        "https://object.pouta.csc.fi/OPUS-ELRC-3075-wikipedia_health/v1/moses/en-ru.txt.zip": sample(
            "en-ru.txt.zip"
        ),
        "https://object.pouta.csc.fi/OPUS-ELRC-3075-wikipedia_health/v1/moses/ru-en.txt.zip": "404",
    }
    return json.dumps(downloads)
2 changes: 2 additions & 0 deletions tests/fixtures/config.pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@ datasets:
test:
- flores_devtest
- sacrebleu_wmt09
- sacrebleu_wmt19
- mtdata_Neulab-tedtalks_test-1-eng-rus
train:
- opus_Books/v1
- opus_CCAligned/v1
- opus_CCMatrix/v1
- opus_ELRC-3075-wikipedia_health/v1
mono-src:
- news-crawl_news.2021
- news-crawl_news.2020
Expand Down
56 changes: 56 additions & 0 deletions tests/fixtures/wget
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
"""
This is a test fixture which mocks wget.
Set the MOCKED_DOWNLOADS environment variable to a JSON object mapping a URL to a file path.
"""

import argparse
import json
import os
import shutil
import sys


def load_mocked_downloads() -> dict:
    """Read and validate MOCKED_DOWNLOADS: a JSON object mapping URL -> file path.

    Raises if the variable is unset or does not decode to a JSON object.
    """
    raw = os.environ.get("MOCKED_DOWNLOADS")
    if not raw:
        raise Exception(
            "The mocked_wget utility expected the MOCKED_DOWNLOADS environment variable to be set."
        )

    mocked_downloads = json.loads(raw)

    if not isinstance(mocked_downloads, dict):
        raise Exception(
            "Expected the mocked downloads to be a json object mapping the URL to file path"
        )

    return mocked_downloads


def main() -> None:
    """Mimic `wget -O <output> <url>` by copying a fixture file into place."""
    mocked_downloads = load_mocked_downloads()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-O", "--output-document", dest="output", help="The output path", required=True
    )
    parser.add_argument("url", help="The url to download")

    args = parser.parse_args()

    print("[mocked wget]", args.url)

    source_file = mocked_downloads.get(args.url)
    if not source_file:
        print("[mocked wget] MOCKED_DOWNLOADS:", mocked_downloads)
        raise Exception(f"Received a URL that was not in MOCKED_DOWNLOADS {args.url}")

    if source_file == "404":
        print("[mocked wget]: Mocking a 404")
        # 8 is what wget gives as an exit code in this case.
        # Fix: use sys.exit directly instead of the undocumented os.sys alias.
        sys.exit(8)

    if not os.path.exists(source_file):
        raise Exception(f"The source file specified did not exist {source_file}")

    print("[mocked wget] copying the file")
    print(f"[mocked wget] from: {source_file}")
    print(f"[mocked wget] to: {args.output}")

    shutil.copyfile(source_file, args.output)

    print("[mocked wget] Success")


if __name__ == "__main__":
    main()
55 changes: 33 additions & 22 deletions tests/test_data_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,20 @@
import os

import pytest
from fixtures import DataDir
from fixtures import DataDir, get_mocked_downloads

SRC = "ru"
TRG = "en"
ARTIFACT_EXT = "gz"
COMPRESSION_CMD = "pigz"
CURRENT_FOLDER = os.path.dirname(os.path.abspath(__file__))

os.environ["ARTIFACT_EXT"] = ARTIFACT_EXT
os.environ["COMPRESSION_CMD"] = "pigz"
os.environ["COMPRESSION_CMD"] = COMPRESSION_CMD
os.environ["SRC"] = SRC
os.environ["TRG"] = TRG


from pipeline.data.dataset_importer import run_import

# the augmentation is probabilistic, here is a range for 0.1 probability
Expand Down Expand Up @@ -47,36 +50,44 @@ def data_dir():


@pytest.mark.parametrize(
"dataset",
"importer,dataset",
[
"mtdata_Neulab-tedtalks_test-1-eng-rus",
"opus_ELRC-3075-wikipedia_health/v1",
"flores_dev",
"sacrebleu_wmt19",
("mtdata", "Neulab-tedtalks_test-1-eng-rus"),
("opus", "ELRC-3075-wikipedia_health_v1"),
("flores", "dev"),
("sacrebleu", "wmt19"),
],
)
def test_basic_corpus_import(dataset, data_dir):
prefix = data_dir.join(dataset)
output_src = f"{prefix}.{SRC}.{ARTIFACT_EXT}"
output_trg = f"{prefix}.{TRG}.{ARTIFACT_EXT}"

run_import("corpus", dataset, prefix)
def test_basic_corpus_import(importer, dataset, data_dir):
    """Run a dataset importer task end to end, with downloads served by the wget mock.

    The task is executed via data_dir.run_task; WGET is pointed at the
    tests/fixtures/wget script and MOCKED_DOWNLOADS supplies the URL -> fixture
    mapping so no real network access happens.
    """
    data_dir.run_task(
        f"dataset-{importer}-{dataset}-en-ru",
        env={
            "COMPRESSION_CMD": COMPRESSION_CMD,
            "ARTIFACT_EXT": ARTIFACT_EXT,
            "WGET": os.path.join(CURRENT_FOLDER, "fixtures/wget"),
            "MOCKED_DOWNLOADS": get_mocked_downloads(),
        },
    )

    prefix = data_dir.join(f"artifacts/{dataset}")
    # Use the module-level SRC/TRG/ARTIFACT_EXT constants instead of hard-coded
    # "ru"/"en"/"gz" so these assertions stay in sync with the env the task ran with.
    output_src = f"{prefix}.{SRC}.{ARTIFACT_EXT}"
    output_trg = f"{prefix}.{TRG}.{ARTIFACT_EXT}"

    assert os.path.exists(output_src)
    assert os.path.exists(output_trg)
    assert len(read_lines(output_src)) > 0
    assert len(read_lines(output_trg)) > 0


@pytest.mark.parametrize(
"params",
[
("sacrebleu_aug-upper_wmt19", is_upper_case, AUG_MIN_RATE, AUG_MAX_RATE),
("sacrebleu_aug-upper-strict_wmt19", is_upper_case, 1.0, 1.0),
("sacrebleu_aug-title_wmt19", is_title_case, AUG_MIN_RATE, AUG_MAX_RATE),
("sacrebleu_aug-title-strict_wmt19", is_title_case, 1.0, 1.0),
],
)
# Parameter tuples for test_specific_augmentation. Each entry is
# (dataset name carrying an augmentation modifier, predicate that detects the
#  augmentation on a line, minimum expected match rate, maximum expected match rate).
# The "-strict" variants expect every line augmented (rate 1.0..1.0).
augmentation_params = [
    ("sacrebleu_aug-upper_wmt19", is_upper_case, AUG_MIN_RATE, AUG_MAX_RATE),
    ("sacrebleu_aug-upper-strict_wmt19", is_upper_case, 1.0, 1.0),
    ("sacrebleu_aug-title_wmt19", is_title_case, AUG_MIN_RATE, AUG_MAX_RATE),
    ("sacrebleu_aug-title-strict_wmt19", is_title_case, 1.0, 1.0),
]


@pytest.mark.parametrize("params", augmentation_params, ids=[d[0] for d in augmentation_params])
def test_specific_augmentation(params, data_dir):
dataset, check_func, min_rate, max_rate = params
prefix = data_dir.join(dataset)
Expand Down

0 comments on commit 3d81ca5

Please sign in to comment.