Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a run_task abstraction for eval-tests.py #393

Merged
merged 4 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 204 additions & 8 deletions tests/fixtures/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import json
import os
import re
import shlex
import shutil
import subprocess
import sys
from subprocess import CompletedProcess
from typing import Optional, Union

import zstandard as zstd

from utils.preflight_check import get_taskgraph_parameters, run_taskgraph

# Absolute path to this tests/fixtures directory.
FIXTURES_PATH = os.path.dirname(os.path.abspath(__file__))
# Repository-level data directory, i.e. <repo>/data.
DATA_PATH = os.path.abspath(os.path.join(FIXTURES_PATH, "../../data"))
# Root under which each DataDir creates its persistent per-test directory.
TESTS_DATA = os.path.join(DATA_PATH, "tests_data")
Expand All @@ -20,14 +27,14 @@
“I’m very sorry, indeed,” said Dorothy, who was truly frightened to see the Witch actually melting away like brown sugar before her very eyes.
"""

ca_sample = """La nena, en veure que havia perdut una de les seves boniques sabates, es va enfadar i va dir a la bruixa: "Torna'm la sabata!"
"No ho faré", va replicar la Bruixa, "perquè ara és la meva sabata, i no la teva".
"Ets una criatura dolenta!" va cridar la Dorothy. "No tens dret a treure'm la sabata".
"Me'l guardaré, igualment", va dir la Bruixa, rient-se d'ella, "i algun dia t'agafaré l'altre també".
Això va fer enfadar tant la Dorothy que va agafar la galleda d'aigua que hi havia a prop i la va llançar sobre la Bruixa, mullant-la de cap a peus.
A l'instant, la malvada dona va fer un fort crit de por, i aleshores, mentre la Dorothy la mirava meravellada, la Bruixa va començar a encongir-se i a caure.
"Mira què has fet!" ella va cridar. "D'aquí a un minut em fondreré".
"Ho sento molt, de veritat", va dir la Dorothy, que es va espantar veritablement de veure que la Bruixa es va desfer com el sucre moreno davant els seus mateixos ulls.
ru_sample = """Маленькая девочка, увидев, что потеряла одну из своих красивых туфелек, рассердилась и сказала Ведьме: «Верни мне мою туфельку!»
«Я не буду, — парировала Ведьма, — потому что теперь это моя туфля, а не твоя».
«Ты злое существо!» - воскликнула Дороти. «Ты не имеешь права забирать у меня туфлю».
«Я все равно сохраню его, — сказала Ведьма, смеясь над ней, — и когда-нибудь я получу от тебя и другой».
Это так разозлило Дороти, что она взяла стоявшее рядом ведро с водой и облила им Ведьму, обмочив ее с головы до ног.
Мгновенно злая женщина громко вскрикнула от страха, а затем, когда Дороти с удивлением посмотрела на нее, Ведьма начала сжиматься и падать.
«Посмотри, что ты наделал!» она закричала. «Через минуту я растаю».
«Мне действительно очень жаль», — сказала Дороти, которая была по-настоящему напугана, увидев, что Ведьма тает, как коричневый сахар, у нее на глазах.
"""


Expand All @@ -36,6 +43,8 @@ class DataDir:
Creates a persistent data directory in data/tests_data/{dir_name} that will be
cleaned out before a test run. This should help in persisting artifacts between test
runs to manually verify the results.

Taskcluster tasks can be run directly using the data dir via the run_task method.
"""

def __init__(self, dir_name: str) -> None:
Expand Down Expand Up @@ -96,6 +105,77 @@ def create_file(self, name: str, contents: str) -> str:

return text_path

def mkdir(self, name) -> str:
    """Ensure a subdirectory of this data dir exists and return its absolute path."""
    new_dir = self.join(name)
    os.makedirs(new_dir, exist_ok=True)
    return new_dir

def run_task(
    self,
    task_name: str,
    work_dir: Optional[str] = None,
    fetches_dir: Optional[str] = None,
    env: Optional[dict[str, str]] = None,
):
    """
    Runs a task from the taskgraph. See artifacts/full-task-graph.json after running a
    test for the full list of task names.

    Arguments:

    task_name - The full task name like "split-mono-src-en"
    or "evaluate-backward-sacrebleu-wmt09-en-ru".

    work_dir - This is the TASK_WORKDIR; defaults to this DataDir's path.

    fetches_dir - The MOZ_FETCHES_DIR; defaults to this DataDir's path.

    env - Any environment variable overrides (applied last, so they win).

    Returns the CompletedProcess of the task's command. Raises via fail_on_error
    when the command exits non-zero.
    """
    # Avoid a shared mutable default argument for the overrides dict.
    if env is None:
        env = {}

    command_parts, task_env = get_task_command_and_env(task_name)

    # There are some non-string environment variables that involve taskcluster references
    # Remove these.
    for key in task_env:
        if not isinstance(task_env[key], str):
            task_env[key] = ""

    current_folder = os.path.dirname(os.path.abspath(__file__))
    root_path = os.path.abspath(os.path.join(current_folder, "../.."))

    if not work_dir:
        work_dir = self.path
    if not fetches_dir:
        fetches_dir = self.path

    # Manually apply the environment variables, as they don't get added to the args
    # through the subprocess.run
    command_parts = [
        part.replace("$TASK_WORKDIR/$VCS_PATH", root_path)
        .replace("$TASK_WORKDIR", work_dir)
        .replace("$MOZ_FETCHES_DIR", fetches_dir)
        for part in command_parts
    ]

    result = subprocess.run(
        command_parts,
        env={
            **os.environ,
            **task_env,
            "TASK_WORKDIR": work_dir,
            "MOZ_FETCHES_DIR": fetches_dir,
            "VCS_PATH": root_path,
            **env,
        },
        cwd=root_path,
        stderr=subprocess.PIPE,
        check=False,
    )
    fail_on_error(result)
    # Surface the process result so tests can inspect output if they want to.
    return result


def fail_on_error(result: CompletedProcess[bytes]):
"""When a process fails, surface the stderr."""
Expand All @@ -104,3 +184,119 @@ def fail_on_error(result: CompletedProcess[bytes]):
print(line, file=sys.stderr)

raise Exception(f"{result.args[0]} exited with a status code: {result.returncode}")


# Only (lazily) create the full taskgraph once per test suite run as it's quite slow.
# Populated by get_full_taskgraph(); maps task name -> task definition.
_full_taskgraph: Optional[dict[str, object]] = None


def get_full_taskgraph():
    """
    Generates the full taskgraph and stores it for re-use. It uses the config.pytest.yml
    in this directory.
    """
    global _full_taskgraph  # noqa: PLW0603
    if _full_taskgraph:
        # Already generated during this test-suite run; reuse the cached graph.
        return _full_taskgraph

    here = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(here, "config.pytest.yml")
    graph_json_path = os.path.join(here, "../../artifacts/full-task-graph.json")

    # Generating the graph writes artifacts/full-task-graph.json as a side effect.
    run_taskgraph(config_path, get_taskgraph_parameters())

    with open(graph_json_path, "rb") as graph_file:
        _full_taskgraph = json.load(graph_file)
    return _full_taskgraph


# Match a pipeline script like:
#   pipeline/data/dataset_importer.py
# Group 1 captures the path under pipeline/, group 2 the extension (py or sh).
SCRIPT_REGEX = re.compile(r"\/pipeline\/([\w\/-]+)\.(py|sh)")


def find_pipeline_script(commands: Union[list[str], list[list[str]]]) -> str:
    """
    Extract the pipeline script and arguments from a command list.

    Commands take the form:
    [
       ['chmod', '+x', 'run-task'],
       ['./run-task', '--firefox_translations_training-checkout=./checkouts/vcs/', '--', 'bash', '-c', "full command"]
    ]

    or

    [
        "/usr/local/bin/run-task",
        "--firefox_translations_training-checkout=/builds/worker/checkouts/vcs/",
        "--", "bash", "-c",
        "full command"
    ]

    Returns "pipeline/<script>.<ext> <args>".
    Raises an Exception when no string command or no pipeline script can be found.
    """
    command: str
    if isinstance(commands[-1], str):
        command = commands[-1]
    elif isinstance(commands[-1][-1], str):
        command = commands[-1][-1]
    else:
        # Bug fix: `command` is unbound on this path — printing it raised a
        # NameError before the intended Exception. Print the input instead.
        print(commands)
        raise Exception("Unable to find a string in the nested command.")

    match = re.search(r"\/pipeline\/([\w\/-]+)\.(py|sh)", command)
    #                   ^^^^^^^^^^ ^^^^^
    #                   |          |
    # Match the path to the script Match the extension

    if not match:
        raise Exception(f"Could not find a pipeline script in the command: {command}")

    script = f"pipeline/{match.group(1)}.{match.group(2)}"

    # Return the parts after the script name.
    parts = command.split(script)
    if len(parts) != 2:
        raise Exception(f"Could not find {script} in: {command}")
    args = parts[1].strip()

    return f"{script} {args}"


def get_task_command_and_env(task_name: str, script=None) -> tuple[list[str], dict[str, str]]:
    """
    Extracts a task's command from the full taskgraph. This allows for testing
    the full taskcluster pipeline and the scripts that it generates.
    See artifacts/full-task-graph.json for the full list of what is generated.

    task_name - The full task name like "split-mono-src-en"
       or "evaluate-backward-sacrebleu-wmt09-en-ru".

    script - Reserved for returning only the arguments after a given script;
       currently unused. (Kept for interface compatibility.)

    Returns (command_parts, env): the shlex-split command list for the task's
    pipeline script, and the task's environment variable dict.
    Raises an Exception when the task name is not in the taskgraph.
    """
    full_taskgraph = get_full_taskgraph()
    task = full_taskgraph.get(task_name)
    if not task:
        # Help the test author find the correct name.
        print("Available tasks:")
        for key in full_taskgraph.keys():
            print(f' - "{key}"')
        raise Exception(f"Could not find the task {task_name}")

    env = task["task"]["payload"]["env"]

    commands = task["task"]["payload"]["command"]
    pipeline_script = find_pipeline_script(commands)

    print(f'Running: "{task_name}":')
    print(" > Commands:", commands)
    print(" > Running:", pipeline_script)
    print(" > Env:", env)

    command_parts = [
        part
        for part in shlex.split(pipeline_script)
        # subprocess.run doesn't understand how to redirect stderr to stdout, so ignore this
        # if it's part of the command.
        if part != "2>&1"
    ]

    # Return the full command.
    return command_parts, env
54 changes: 54 additions & 0 deletions tests/fixtures/config.pytest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
experiment:
name: pytest_en_ru
src: en
trg: ru
best-model: chrf
use-opuscleaner: "true"
bicleaner:
default-threshold: 0.5
dataset-thresholds:
opus_Books/v1: 0.0
opus_CCAligned/v1: 0.7
mono-max-sentences-src: 100_000_000
mono-max-sentences-trg: 20_000_000
spm-sample-size: 10_000_000
teacher-ensemble: 2
backward-model: NOT-YET-SUPPORTED
vocab: NOT-YET-SUPPORTED
datasets:
devtest:
- flores_dev
- sacrebleu_wmt08
- mtdata_Neulab-tedtalks_dev-1-eng-rus
test:
- flores_devtest
- sacrebleu_wmt09
- mtdata_Neulab-tedtalks_test-1-eng-rus
train:
- opus_Books/v1
- opus_CCAligned/v1
- opus_CCMatrix/v1
mono-src:
- news-crawl_news.2021
- news-crawl_news.2020
mono-trg:
- news-crawl_news.2021
- news-crawl_news.2020
marian-args:
decoding-backward:
beam-size: '12'
mini-batch-words: '2000'
decoding-teacher:
mini-batch-words: '4000'
precision: float16
training-backward:
early-stopping: '5'
training-teacher:
early-stopping: '30'
training-student:
early-stopping: '20'
training-student-finetuned:
early-stopping: '20'
target-stage: all
taskcluster:
split-chunks: 10
Loading