Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a run_task abstraction for eval-tests.py #393

Merged
merged 4 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 204 additions & 8 deletions tests/fixtures/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import json
import os
import re
import shlex
import shutil
import subprocess
import sys
from subprocess import CompletedProcess
from typing import Optional, Union

import zstandard as zstd

from utils.preflight_check import get_taskgraph_parameters, run_taskgraph

# Absolute path to this tests/fixtures directory.
FIXTURES_PATH = os.path.dirname(os.path.abspath(__file__))
# Repository-level data directory, i.e. <repo>/data.
DATA_PATH = os.path.abspath(os.path.join(FIXTURES_PATH, "../../data"))
# Root under which each DataDir creates its persistent per-test directory.
TESTS_DATA = os.path.join(DATA_PATH, "tests_data")
Expand All @@ -20,14 +27,14 @@
“I’m very sorry, indeed,” said Dorothy, who was truly frightened to see the Witch actually melting away like brown sugar before her very eyes.
"""

ca_sample = """La nena, en veure que havia perdut una de les seves boniques sabates, es va enfadar i va dir a la bruixa: "Torna'm la sabata!"
"No ho faré", va replicar la Bruixa, "perquè ara és la meva sabata, i no la teva".
"Ets una criatura dolenta!" va cridar la Dorothy. "No tens dret a treure'm la sabata".
"Me'l guardaré, igualment", va dir la Bruixa, rient-se d'ella, "i algun dia t'agafaré l'altre també".
Això va fer enfadar tant la Dorothy que va agafar la galleda d'aigua que hi havia a prop i la va llançar sobre la Bruixa, mullant-la de cap a peus.
A l'instant, la malvada dona va fer un fort crit de por, i aleshores, mentre la Dorothy la mirava meravellada, la Bruixa va començar a encongir-se i a caure.
"Mira què has fet!" ella va cridar. "D'aquí a un minut em fondreré".
"Ho sento molt, de veritat", va dir la Dorothy, que es va espantar veritablement de veure que la Bruixa es va desfer com el sucre moreno davant els seus mateixos ulls.
ru_sample = """Маленькая девочка, увидев, что потеряла одну из своих красивых туфелек, рассердилась и сказала Ведьме: «Верни мне мою туфельку!»
«Я не буду, — парировала Ведьма, — потому что теперь это моя туфля, а не твоя».
«Ты злое существо!» - воскликнула Дороти. «Ты не имеешь права забирать у меня туфлю».
«Я все равно сохраню его, — сказала Ведьма, смеясь над ней, — и когда-нибудь я получу от тебя и другой».
Это так разозлило Дороти, что она взяла стоявшее рядом ведро с водой и облила им Ведьму, обмочив ее с головы до ног.
Мгновенно злая женщина громко вскрикнула от страха, а затем, когда Дороти с удивлением посмотрела на нее, Ведьма начала сжиматься и падать.
«Посмотри, что ты наделал!» она закричала. «Через минуту я растаю».
«Мне действительно очень жаль», — сказала Дороти, которая была по-настоящему напугана, увидев, что Ведьма тает, как коричневый сахар, у нее на глазах.
"""


Expand All @@ -36,6 +43,8 @@ class DataDir:
Creates a persistent data directory in data/tests_data/{dir_name} that will be
cleaned out before a test run. This should help in persisting artifacts between test
runs to manually verify the results.

Taskcluster tasks can be run directly using the data dir via the run_task method.
"""

def __init__(self, dir_name: str) -> None:
Expand Down Expand Up @@ -96,6 +105,77 @@ def create_file(self, name: str, contents: str) -> str:

return text_path

def mkdir(self, name) -> str:
    """Ensure a subdirectory of this data dir exists and return its absolute path."""
    new_dir = self.join(name)
    os.makedirs(new_dir, exist_ok=True)
    return new_dir

def run_task(
    self,
    task_name: str,
    work_dir: Optional[str] = None,
    fetches_dir: Optional[str] = None,
    env: Optional[dict[str, str]] = None,
):
    """
    Runs a task from the taskgraph. See artifacts/full-task-graph.json after running a
    test for the full list of task names.

    Arguments:

    task_name - The full task name like "split-mono-src-en"
    or "evaluate-backward-sacrebleu-wmt09-en-ru".

    work_dir - This is the TASK_WORKDIR; defaults to this DataDir's path.

    fetches_dir - The MOZ_FETCHES_DIR; defaults to this DataDir's path.

    env - Any environment variable overrides (applied last, so they win).

    Returns the CompletedProcess of the task's command. Raises via fail_on_error
    when the command exits non-zero.
    """
    # Avoid a shared mutable default argument for the overrides dict.
    if env is None:
        env = {}

    command_parts, task_env = get_task_command_and_env(task_name)

    # There are some non-string environment variables that involve taskcluster references
    # Remove these.
    for key in task_env:
        if not isinstance(task_env[key], str):
            task_env[key] = ""

    current_folder = os.path.dirname(os.path.abspath(__file__))
    root_path = os.path.abspath(os.path.join(current_folder, "../.."))

    if not work_dir:
        work_dir = self.path
    if not fetches_dir:
        fetches_dir = self.path

    # Manually apply the environment variables, as they don't get added to the args
    # through the subprocess.run
    command_parts = [
        part.replace("$TASK_WORKDIR/$VCS_PATH", root_path)
        .replace("$TASK_WORKDIR", work_dir)
        .replace("$MOZ_FETCHES_DIR", fetches_dir)
        for part in command_parts
    ]

    result = subprocess.run(
        command_parts,
        env={
            **os.environ,
            **task_env,
            "TASK_WORKDIR": work_dir,
            "MOZ_FETCHES_DIR": fetches_dir,
            "VCS_PATH": root_path,
            **env,
        },
        cwd=root_path,
        stderr=subprocess.PIPE,
        check=False,
    )
    fail_on_error(result)
    # Surface the process result so tests can inspect output if they want to.
    return result


def fail_on_error(result: CompletedProcess[bytes]):
"""When a process fails, surface the stderr."""
Expand All @@ -104,3 +184,119 @@ def fail_on_error(result: CompletedProcess[bytes]):
print(line, file=sys.stderr)

raise Exception(f"{result.args[0]} exited with a status code: {result.returncode}")


# Only (lazily) create the full taskgraph once per test suite run as it's quite slow.
# Populated by get_full_taskgraph(); maps task name -> task definition.
_full_taskgraph: Optional[dict[str, object]] = None


def get_full_taskgraph():
    """
    Generates the full taskgraph and stores it for re-use. It uses the config.pytest.yml
    in this directory.
    """
    global _full_taskgraph  # noqa: PLW0603
    if _full_taskgraph:
        # Already generated during this test-suite run; reuse the cached graph.
        return _full_taskgraph

    here = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(here, "config.pytest.yml")
    graph_json_path = os.path.join(here, "../../artifacts/full-task-graph.json")

    # Generating the graph writes artifacts/full-task-graph.json as a side effect.
    run_taskgraph(config_path, get_taskgraph_parameters())

    with open(graph_json_path, "rb") as graph_file:
        _full_taskgraph = json.load(graph_file)
    return _full_taskgraph


# Match a pipeline script like:
#   pipeline/data/dataset_importer.py
# Group 1 captures the path under pipeline/, group 2 the extension (py or sh).
SCRIPT_REGEX = re.compile(r"\/pipeline\/([\w\/-]+)\.(py|sh)")


def find_pipeline_script(commands: Union[list[str], list[list[str]]]) -> str:
    """
    Extract the pipeline script and arguments from a command list.

    Commands take the form:
    [
       ['chmod', '+x', 'run-task'],
       ['./run-task', '--firefox_translations_training-checkout=./checkouts/vcs/', '--', 'bash', '-c', "full command"]
    ]

    or

    [
        "/usr/local/bin/run-task",
        "--firefox_translations_training-checkout=/builds/worker/checkouts/vcs/",
        "--", "bash", "-c",
        "full command"
    ]

    Returns "pipeline/<script>.<ext> <args>".
    Raises an Exception when no string command or no pipeline script can be found.
    """
    command: str
    if isinstance(commands[-1], str):
        command = commands[-1]
    elif isinstance(commands[-1][-1], str):
        command = commands[-1][-1]
    else:
        # Bug fix: `command` is unbound on this path — printing it raised a
        # NameError before the intended Exception. Print the input instead.
        print(commands)
        raise Exception("Unable to find a string in the nested command.")

    match = re.search(r"\/pipeline\/([\w\/-]+)\.(py|sh)", command)
    #                   ^^^^^^^^^^ ^^^^^
    #                   |          |
    # Match the path to the script Match the extension

    if not match:
        raise Exception(f"Could not find a pipeline script in the command: {command}")

    script = f"pipeline/{match.group(1)}.{match.group(2)}"

    # Return the parts after the script name.
    parts = command.split(script)
    if len(parts) != 2:
        raise Exception(f"Could not find {script} in: {command}")
    args = parts[1].strip()

    return f"{script} {args}"


def get_task_command_and_env(task_name: str, script=None) -> tuple[list[str], dict[str, str]]:
    """
    Extracts a task's command from the full taskgraph. This allows for testing
    the full taskcluster pipeline and the scripts that it generates.
    See artifacts/full-task-graph.json for the full list of what is generated.

    task_name - The full task name like "split-mono-src-en"
       or "evaluate-backward-sacrebleu-wmt09-en-ru".

    script - Reserved for returning only the arguments after a given script;
       currently unused. (Kept for interface compatibility.)

    Returns (command_parts, env): the shlex-split command list for the task's
    pipeline script, and the task's environment variable dict.
    Raises an Exception when the task name is not in the taskgraph.
    """
    full_taskgraph = get_full_taskgraph()
    task = full_taskgraph.get(task_name)
    if not task:
        # Help the test author find the correct name.
        print("Available tasks:")
        for key in full_taskgraph.keys():
            print(f' - "{key}"')
        raise Exception(f"Could not find the task {task_name}")

    env = task["task"]["payload"]["env"]

    commands = task["task"]["payload"]["command"]
    pipeline_script = find_pipeline_script(commands)

    print(f'Running: "{task_name}":')
    print(" > Commands:", commands)
    print(" > Running:", pipeline_script)
    print(" > Env:", env)

    command_parts = [
        part
        for part in shlex.split(pipeline_script)
        # subprocess.run doesn't understand how to redirect stderr to stdout, so ignore this
        # if it's part of the command.
        if part != "2>&1"
    ]

    # Return the full command.
    return command_parts, env
54 changes: 54 additions & 0 deletions tests/fixtures/config.pytest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
experiment:
name: pytest_en_ru
src: en
trg: ru
best-model: chrf
use-opuscleaner: "true"
bicleaner:
default-threshold: 0.5
dataset-thresholds:
opus_Books/v1: 0.0
opus_CCAligned/v1: 0.7
mono-max-sentences-src: 100_000_000
mono-max-sentences-trg: 20_000_000
spm-sample-size: 10_000_000
teacher-ensemble: 2
backward-model: NOT-YET-SUPPORTED
vocab: NOT-YET-SUPPORTED
datasets:
devtest:
- flores_dev
- sacrebleu_wmt08
- mtdata_Neulab-tedtalks_dev-1-eng-rus
test:
- flores_devtest
- sacrebleu_wmt09
- mtdata_Neulab-tedtalks_test-1-eng-rus
train:
- opus_Books/v1
- opus_CCAligned/v1
- opus_CCMatrix/v1
mono-src:
- news-crawl_news.2021
- news-crawl_news.2020
mono-trg:
- news-crawl_news.2021
- news-crawl_news.2020
marian-args:
decoding-backward:
beam-size: '12'
mini-batch-words: '2000'
decoding-teacher:
mini-batch-words: '4000'
precision: float16
training-backward:
early-stopping: '5'
training-teacher:
early-stopping: '30'
training-student:
early-stopping: '20'
training-student-finetuned:
early-stopping: '20'
target-stage: all
taskcluster:
split-chunks: 10
Loading