Add support for train-vocab #123

Merged (3 commits, May 24, 2023)
90 changes: 90 additions & 0 deletions taskcluster/ci/train-vocab/kind.yml
@@ -0,0 +1,90 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
---

loader: taskgraph.loader.transform:loader

transforms:
- translations_taskgraph.transforms.from_datasets:locales_only
- translations_taskgraph.transforms.command_context_from_params:transforms
- taskgraph.transforms.job:transforms
- translations_taskgraph.transforms.cache:transforms
- taskgraph.transforms.cached_tasks:transforms
- taskgraph.transforms.task:transforms

kind-dependencies:
- merge-corpus
- toolchain

tasks:
    "{src_locale}-{trg_locale}":
        description: train vocab for {src_locale}-{trg_locale}
        attributes:
            # TODO: is this `train` or `devset`? need to decode snakemake a bit more..
            dataset-category: train
            stage: train-vocab
        cache:
            type: train-vocab
            resources:
                - pipeline/train/spm-vocab.sh
            parameters:
                - train_vocab_sample_size
        dataset-config:
            substitution-fields:
                - description
                - name
                - treeherder.symbol
                - fetches
                - dependencies
        worker-type: b-linux-large
        worker:
            docker-image: {"in-tree": "train"}
            max-run-time: 3600
            artifacts:
                - name: public/build
                  path: /builds/worker/artifacts
                  type: directory
            env:
                COMPRESSION_CMD: zstdmt

        # Don't run unless explicitly scheduled
        run-on-tasks-for: []

        treeherder:
            symbol: "{src_locale}-{trg_locale}"
            platform: train-vocab/opt
        run:
            using: run-task
            command-context:
                from-parameters:
                    - train_vocab_sample_size
            command:
                - bash
                - -c
                # Arguments are:
                # 1) merged src corpus file
                # 2) merged trg corpus file
                # 3) output file
                # 4) sample size
                # 5) number of threads (auto = output of nproc)
                - >-
                    export MARIAN=$MOZ_FETCHES_DIR &&
                    $VCS_PATH/pipeline/train/spm-vocab.sh
                    fetches/corpus.{src_locale}.zst
                    fetches/corpus.{trg_locale}.zst
                    artifacts/vocab.spm
                    {train_vocab_sample_size}
                    auto

Review thread on artifacts/vocab.spm:

Member: Why not artifacts/<something>/vocab.spm?

Author (Collaborator): We could do that, yeah, although it might be a bit annoying for downstream steps (we'd end up with an extra directory in the artifact path). I'm good either way.

Member: nah, I see the prefix above is not a directory

        dependencies:
            merge-corpus: merge-corpus-{src_locale}-{trg_locale}

        fetches:
            toolchain:
                - marian
            merge-corpus:
                - artifact: corpus.{src_locale}.zst
                  extract: false
                - artifact: corpus.{trg_locale}.zst
                  extract: false
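The {src_locale}-style placeholders in the task definition above are filled in by the from_datasets and command-context transforms before the task is submitted. A minimal stdlib sketch of that substitution, assuming simple str.format-style replacement (this is illustrative only, not the real taskgraph transform code; the context values are made-up examples):

```python
# Hedged sketch: how the {src_locale}/{trg_locale}/{train_vocab_sample_size}
# placeholders in the command might be resolved. NOT the actual transform
# implementation; the locale pair and sample size below are examples.
task_template = (
    "$VCS_PATH/pipeline/train/spm-vocab.sh "
    "fetches/corpus.{src_locale}.zst "
    "fetches/corpus.{trg_locale}.zst "
    "artifacts/vocab.spm "
    "{train_vocab_sample_size} "
    "auto"
)

context = {
    "src_locale": "en",
    "trg_locale": "ru",
    "train_vocab_sample_size": "10000",  # supplied via from-parameters
}

# Every placeholder in the template is replaced from the context dict,
# yielding the concrete shell command the worker will run.
command = task_template.format(**context)
print(command)
```

The same substitution applies to the description, treeherder.symbol, fetches, and dependencies fields listed under substitution-fields.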
8 changes: 7 additions & 1 deletion taskcluster/translations_taskgraph/actions/train.py
@@ -42,7 +42,7 @@ def can_train(parameters):
                (any stages this choice depends on will be automatically included).""",
                "default": "",
                # TODO: this should probably be specified in ci/config.yml
-               "enum": ["clean", "bicleaner", "bicleaner-ai", "merge-corpus"],
+               "enum": ["clean", "bicleaner", "bicleaner-ai", "merge-corpus", "train-vocab"],
            },
            "datasets": {
                "type": "object",
@@ -125,6 +125,11 @@ def can_train(parameters):
                "description": "bicleaner threshold",
                "default": "1.0",
            },
+           "train_vocab_sample_size": {
+               "type": "string",
+               "description": "vocabulary training sample size",
+               "default": "10000",
+           },
        },
        "required": [
            "stage",
@@ -152,6 +157,7 @@ def train_action(parameters, graph_config, input, task_group_id, task_id):
    parameters["src_locale"] = input["src_locale"]
    parameters["trg_locale"] = input["trg_locale"]
    parameters["bicleaner_threshold"] = input["bicleaner_threshold"]
+   parameters["train_vocab_sample_size"] = input["train_vocab_sample_size"]

    parameters = Parameters(**parameters)
    taskgraph_decision({"root": graph_config.root_dir}, parameters=parameters)
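The action schema gives train_vocab_sample_size a default, so callers that omit it still produce a value for the parameters object. A minimal stdlib sketch of that flow, assuming a simple defaults-merge (apply_defaults here is a stand-in for the real jsonschema-based validation that taskgraph actions perform; all names and values are illustrative):

```python
# Hedged sketch of the action-input -> parameters flow. NOT the real
# taskgraph validation machinery; apply_defaults is a hypothetical helper.
schema_defaults = {
    "bicleaner_threshold": "1.0",
    "train_vocab_sample_size": "10000",
}

def apply_defaults(user_input, defaults):
    # Schema defaults fill in anything the caller did not supply.
    merged = dict(defaults)
    merged.update(user_input)
    return merged

# Caller triggers the train action without specifying a sample size.
action_input = {"src_locale": "en", "trg_locale": "ru"}
merged = apply_defaults(action_input, schema_defaults)

parameters = {}
parameters["train_vocab_sample_size"] = merged["train_vocab_sample_size"]
```

Because the value is threaded through parameters, the cache transform can include it in the task's cache key, so changing the sample size reruns the task.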
2 changes: 2 additions & 0 deletions taskcluster/translations_taskgraph/parameters.py
@@ -8,6 +8,7 @@
def get_defaults(repo_root):
    return {
        "bicleaner_threshold": "0.0",
+       "train_vocab_sample_size": "1000",
        # These will never be used in practice, but specifying them ensures
        # that we always generate at least one task for each kind, which helps
        # to avoid bustage that doesn't show up until we run the training action.
@@ -38,6 +39,7 @@ def get_defaults(repo_root):
extend_parameters_schema(
    {
        Optional("bicleaner_threshold"): str,
+       Optional("train_vocab_sample_size"): str,
        Optional("datasets"): {
            str: [str],
        },
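The schema entry above declares train_vocab_sample_size as an optional string-typed parameter. A minimal stand-in validator showing what that declaration enforces (this is a stdlib sketch of the idea, not the voluptuous-based machinery taskgraph actually uses; validate is a hypothetical helper):

```python
# Hedged sketch: what an Optional(key): str schema entry enforces.
# NOT taskgraph's real schema validation; for illustration only.
def validate(params, optional_str_keys):
    for key in optional_str_keys:
        # Optional: the key may be absent entirely.
        if key in params and not isinstance(params[key], str):
            raise TypeError(
                f"{key} must be a string, got {type(params[key]).__name__}"
            )
    return params

# A string value passes; omitting the key would also pass.
checked = validate(
    {"train_vocab_sample_size": "1000"},
    ["train_vocab_sample_size"],
)
```

Keeping the parameter a string (rather than an int) matches how it is spliced directly into the shell command in kind.yml.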