
Commit 3eeca8c

feat(A/B): add customizable artifacts
Update `ab_test.py` with the ability to accept custom artifacts for the A and B runs, and update `pipeline_perf.py` accordingly. The REVISION_A_ARTIFACTS and REVISION_B_ARTIFACTS environment variables now specify the custom artifacts to use in the A/B test.

Signed-off-by: Egor Lazarchuk <yegorlz@amazon.co.uk>
1 parent 3886e89 commit 3eeca8c
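
For context, a minimal sketch of the environment this commit expects when generating an A/B pipeline; the variable names come from this commit, while the revision and artifact values are made-up examples:

```python
import os

# Revisions under comparison (pre-existing behaviour).
os.environ["REVISION_A"] = "main"               # example baseline revision
os.environ["REVISION_B"] = "my-feature-branch"  # example candidate revision

# New in this commit: optional custom artifacts for each side of the A/B test.
# Each value names a directory under build/artifacts, or an S3 path to download from.
os.environ["REVISION_A_ARTIFACTS"] = "artifacts-main"     # example name
os.environ["REVISION_B_ARTIFACTS"] = "artifacts-feature"  # example name
```

Either both artifact variables are set or neither, mirroring the existing rule for REVISION_A/REVISION_B.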

3 files changed: +73 −12 lines changed

.buildkite/common.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -376,6 +376,12 @@ def to_json(self):
         """Serialize the pipeline to JSON"""
         return json.dumps(self.to_dict(), indent=4, sort_keys=True, ensure_ascii=False)
 
+    def devtool_download_artifacts(self, artifacts):
+        """Generate a `devtool download_ci_artifacts` command"""
+        parts = ["./tools/devtool -y download_ci_artifacts"]
+        parts += artifacts
+        return " ".join(parts)
+
     def devtool_test(self, devtool_opts=None, pytest_opts=None):
         """Generate a `devtool test` command"""
         cmds = []
```
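
A quick usage sketch of the new helper, assuming a `BKPipeline` instance named `pipeline` and made-up artifact names:

```python
# Build the shell command that downloads the listed CI artifacts.
cmd = pipeline.devtool_download_artifacts(["artifacts-main", "artifacts-feature"])
# cmd == "./tools/devtool -y download_ci_artifacts artifacts-main artifacts-feature"
```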

.buildkite/pipeline_perf.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -94,11 +94,16 @@
 
 REVISION_A = os.environ.get("REVISION_A")
 REVISION_B = os.environ.get("REVISION_B")
+REVISION_A_ARTIFACTS = os.environ.get("REVISION_A_ARTIFACTS")
+REVISION_B_ARTIFACTS = os.environ.get("REVISION_B_ARTIFACTS")
 
 # Either both are specified or neither. Only doing either is a bug. If you want to
 # run performance tests _on_ a specific commit, specify neither and put your commit
 # into buildkite's "commit" field.
 assert (REVISION_A and REVISION_B) or (not REVISION_A and not REVISION_B)
+assert (REVISION_A_ARTIFACTS and REVISION_B_ARTIFACTS) or (
+    not REVISION_A_ARTIFACTS and not REVISION_B_ARTIFACTS
+)
 
 BKPipeline.parser.add_argument(
     "--test",
@@ -132,17 +137,26 @@
     ab_opts = test.pop("ab_opts", "")
     devtool_opts += " --performance"
     test_script_opts = ""
+    artifacts = []
     if REVISION_A:
         devtool_opts += " --ab"
         test_script_opts = f'{ab_opts} run --binaries-a build/{REVISION_A}/ --binaries-b build/{REVISION_B} --pytest-opts "{test_selector}"'
+        if REVISION_A_ARTIFACTS:
+            artifacts.append(REVISION_A_ARTIFACTS)
+            artifacts.append(REVISION_B_ARTIFACTS)
+            test_script_opts += f" --artifacts-a {REVISION_A_ARTIFACTS} --artifacts-b {REVISION_B_ARTIFACTS}"
     else:
         # Passing `-m ''` below instructs pytest to collect tests regardless of
         # their markers (e.g. it will collect both tests marked as nonci, and
         # tests without any markers).
         test_script_opts += f" -m '' {test_selector}"
 
+    command = []
+    if artifacts:
+        command.append(pipeline.devtool_download_artifacts(artifacts))
+    command.extend(pipeline.devtool_test(devtool_opts, test_script_opts))
     pipeline.build_group(
-        command=pipeline.devtool_test(devtool_opts, test_script_opts),
+        command=command,
         # and the rest can be command arguments
         **test,
     )
```
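
For orientation, when both artifact variables are set the generated step command is roughly the list below; this is a sketch with example values, and the exact `devtool test` entries come from `BKPipeline.devtool_test`:

```python
command = [
    # Download the custom artifacts for both revisions before running the tests.
    "./tools/devtool -y download_ci_artifacts artifacts-main artifacts-feature",
    # ...followed by the `devtool test` command(s), whose ab_test.py invocation now
    # carries `--artifacts-a artifacts-main --artifacts-b artifacts-feature`.
]
```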

tools/ab_test.py

Lines changed: 52 additions & 11 deletions
```diff
@@ -24,7 +24,7 @@
 import subprocess
 from collections import defaultdict
 from pathlib import Path
-from typing import Callable, List, TypeVar
+from typing import Callable, List, Optional, TypeVar
 
 import scipy
 
```
```diff
@@ -194,16 +194,31 @@ def uninteresting_dimensions(data):
     return uninteresting
 
 
-def collect_data(tag: str, binary_dir: Path, pytest_opts: str):
+def collect_data(
+    tag: str, binary_dir: Path, artifacts: Optional[Path], pytest_opts: str
+):
     """
     Executes the specified test using the provided firecracker binaries and
     stores results into the `test_results/tag` directory
     """
     binary_dir = binary_dir.resolve()
 
-    print(f"Collecting samples with {binary_dir}")
+    print(
+        f"Collecting samples | binaries path: {binary_dir}"
+        + f" | artifacts path: {artifacts}"
+        if artifacts
+        else ""
+    )
     test_path = f"test_results/{tag}"
     test_report_path = f"{test_path}/test-report.json"
+
+    # It is not possible to just download the artifacts here: this script usually
+    # runs inside Docker, where artifact downloading does not work.
+    if artifacts:
+        subprocess.run(
+            f"./tools/devtool set_current_artifacts {artifacts}", check=True, shell=True
+        )
+
     subprocess.run(
         f"./tools/test.sh --binary-dir={binary_dir} {pytest_opts} -m '' --json-report-file=../{test_report_path}",
         env=os.environ
```
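
An illustrative call of the extended `collect_data`; the tag, binary directory, artifacts name, and pytest options are example values (the artifacts argument is passed through as a string when it comes from the new CLI flags):

```python
from pathlib import Path

# Collect the "A" samples, switching to the named artifacts first.
collect_data("A", Path("build/main"), "artifacts-main", "-k test_boottime")
```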
```diff
@@ -371,25 +386,29 @@ def analyze_data(
 
 
 def binary_ab_test(
-    test_runner: Callable[[Path, bool], T],
+    test_runner: Callable[[Path, Optional[Path], bool], T],
     comparator: Callable[[T, T], U],
     *,
     a_directory: Path,
     b_directory: Path,
+    a_artifacts: Optional[Path],
+    b_artifacts: Optional[Path],
 ):
     """
     Similar to `git_ab_test`, but instead of locally checking out different revisions, it operates on
     directories containing firecracker/jailer binaries
     """
-    result_a = test_runner(a_directory, True)
-    result_b = test_runner(b_directory, False)
+    result_a = test_runner(a_directory, a_artifacts, True)
+    result_b = test_runner(b_directory, b_artifacts, False)
 
     return result_a, result_b, comparator(result_a, result_b)
 
 
 def ab_performance_test(
-    a_revision: Path,
-    b_revision: Path,
+    a_directory: Path,
+    b_directory: Path,
+    a_artifacts: Optional[Path],
+    b_artifacts: Optional[Path],
     pytest_opts,
     p_thresh,
     strength_abs_thresh,
@@ -398,7 +417,9 @@ def ab_performance_test(
     """Does an A/B-test of the specified test with the given firecracker/jailer binaries"""
 
     return binary_ab_test(
-        lambda bin_dir, is_a: collect_data(is_a and "A" or "B", bin_dir, pytest_opts),
+        lambda bin_dir, art_dir, is_a: collect_data(
+            is_a and "A" or "B", bin_dir, art_dir, pytest_opts
+        ),
         lambda ah, be: analyze_data(
             ah,
             be,
@@ -407,8 +428,10 @@ def ab_performance_test(
             noise_threshold,
             n_resamples=int(100 / p_thresh),
         ),
-        a_directory=a_revision,
-        b_directory=b_revision,
+        a_directory=a_directory,
+        b_directory=b_directory,
+        a_artifacts=a_artifacts,
+        b_artifacts=b_artifacts,
     )
 
 
```
```diff
@@ -433,6 +456,22 @@ def ab_performance_test(
         type=Path,
         required=True,
     )
+    run_parser.add_argument(
+        "--artifacts-a",
+        help="Name of the artifacts directory under build/artifacts to use for the revision A test. If the directory does not exist, the name is treated as an S3 path and the artifacts are downloaded from there.",
+        # Type is str since the value can be an S3 path, which the `Path` constructor
+        # would incorrectly modify
+        type=str,
+        required=False,
+    )
+    run_parser.add_argument(
+        "--artifacts-b",
+        help="Name of the artifacts directory under build/artifacts to use for the revision B test. If the directory does not exist, the name is treated as an S3 path and the artifacts are downloaded from there.",
+        # Type is str since the value can be an S3 path, which the `Path` constructor
+        # would incorrectly modify
+        type=str,
+        required=False,
+    )
     run_parser.add_argument(
         "--pytest-opts",
         help="Parameters to pass through to pytest, for example for test selection",
```
```diff
@@ -476,6 +515,8 @@ def ab_performance_test(
         ab_performance_test(
             args.binaries_a,
             args.binaries_b,
+            args.artifacts_a,
+            args.artifacts_b,
             args.pytest_opts,
             args.significance,
             args.absolute_strength,
```
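
For reference, a hypothetical invocation of the updated script with the new flags; binary directories, artifact names, and pytest options are example values:

```python
import subprocess

subprocess.run(
    "./tools/ab_test.py run"
    " --binaries-a build/main/ --binaries-b build/feature/"
    " --artifacts-a artifacts-main --artifacts-b artifacts-feature"
    ' --pytest-opts "-k test_boottime"',
    check=True,
    shell=True,
)
```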
