feat(A/B): add customizable artifacts

ShadowCurse · ShadowCurse · commit 7682ec8b9e7d · 2025-12-16T16:19:31.000Z
Update `ab_test.py` with ability to accept custom artifacts for A and B
runs.

Additionally update `pipeline_perf.py` as well. Now REVISION_A_ARTIFACTS
and REVISION_B_ARTIFACTS environment variables specify custom artifacts
which will be used in A/B test

Signed-off-by: Egor Lazarchuk &lt;yegorlz@amazon.co.uk&gt;
diff --git a/.buildkite/common.py b/.buildkite/common.py
@@ -376,10 +376,11 @@ def to_json(self):
         """Serialize the pipeline to JSON"""
         return json.dumps(self.to_dict(), indent=4, sort_keys=True, ensure_ascii=False)
 
-    def devtool_download_artifacts(self, artifacts_s3_url):
+    def devtool_download_artifacts(self, artifacts):
         """Generate a `devtool download_ci_artifacts` command"""
         parts = ["./tools/devtool -y download_ci_artifacts"]
-
+        parts += artifacts
+        return " ".join(parts)
 
     def devtool_test(self, devtool_opts=None, pytest_opts=None):
         """Generate a `devtool test` command"""
diff --git a/.buildkite/pipeline_perf.py b/.buildkite/pipeline_perf.py
@@ -94,11 +94,16 @@
 
 REVISION_A = os.environ.get("REVISION_A")
 REVISION_B = os.environ.get("REVISION_B")
+REVISION_A_ARTIFACTS = os.environ.get("REVISION_A_ARTIFACTS")
+REVISION_B_ARTIFACTS = os.environ.get("REVISION_B_ARTIFACTS")
 
 # Either both are specified or neither. Only doing either is a bug. If you want to
 # run performance tests _on_ a specific commit, specify neither and put your commit
 # into buildkite's "commit" field.
 assert (REVISION_A and REVISION_B) or (not REVISION_A and not REVISION_B)
+assert (REVISION_A_ARTIFACTS and REVISION_B_ARTIFACTS) or (
+    not REVISION_A_ARTIFACTS and not REVISION_B_ARTIFACTS
+)
 
 BKPipeline.parser.add_argument(
     "--test",
@@ -132,17 +137,28 @@
     ab_opts = test.pop("ab_opts", "")
     devtool_opts += " --performance"
     test_script_opts = ""
+    artifacts = []
     if REVISION_A:
         devtool_opts += " --ab"
-        test_script_opts = f'{ab_opts} run --binaries_a build/{REVISION_A}/ --binaries_b build/{REVISION_B} --pytest-opts "{test_selector}"'
+        test_script_opts = f'{ab_opts} run --binaries-a build/{REVISION_A}/ --binaries-b build/{REVISION_B} --pytest-opts "{test_selector}"'
+        if REVISION_A_ARTIFACTS:
+            artifacts.append(REVISION_A_ARTIFACTS)
+            artifacts.append(REVISION_B_ARTIFACTS)
+            test_script_opts += f" --artifacts-a {REVISION_A_ARTIFACTS} --artifacts-b {REVISION_B_ARTIFACTS}"
     else:
         # Passing `-m ''` below instructs pytest to collect tests regardless of
         # their markers (e.g. it will collect both tests marked as nonci, and
         # tests without any markers).
         test_script_opts += f" -m '' {test_selector}"
 
+    command = []
+    if artifacts:
+        command.append(pipeline.devtool_download_artifacts(artifacts))
+    # Hack because devtool_test already returns an array, just this array always
+    # has just 1 element
+    command.append(pipeline.devtool_test(devtool_opts, test_script_opts)[0])
     pipeline.build_group(
-        command=pipeline.devtool_test(devtool_opts, test_script_opts),
+        command=command,
         # and the rest can be command arguments
         **test,
     )
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -345,13 +345,17 @@ def get(self, _netns_id):
     for netns in netns_fcty._all:
         netns._cleanup_orig()
 
+
 @pytest.fixture(scope="session")
 def current_artifacts_path():
     with open("/firecracker/build/current_artifacts", "r", encoding="utf-8") as ca:
         yield ca.read().strip()
 
+
 @pytest.fixture()
-def microvm_factory(request, record_property, results_dir, netns_factory, current_artifacts_path):
+def microvm_factory(
+    request, record_property, results_dir, netns_factory, current_artifacts_path
+):
     """Fixture to create microvms simply."""
 
     binary_dir = request.config.getoption("--binary-dir") or DEFAULT_BINARY_DIR
diff --git a/tests/integration_tests/performance/test_block.py b/tests/integration_tests/performance/test_block.py
@@ -15,10 +15,10 @@
 BLOCK_DEVICE_SIZE_MB = 2048
 
 # Time (in seconds) for which fio "warms up"
-WARMUP_SEC = 10
+WARMUP_SEC = 1
 
 # Time (in seconds) for which fio runs after warmup is done
-RUNTIME_SEC = 30
+RUNTIME_SEC = 1
 
 # VM guest memory size
 GUEST_MEM_MIB = 1024
@@ -100,10 +100,10 @@ def emit_fio_metrics(logs_dir, metrics):
 
 
 @pytest.mark.nonci
-@pytest.mark.parametrize("vcpus", [1, 2], ids=["1vcpu", "2vcpu"])
-@pytest.mark.parametrize("fio_mode", [fio.Mode.RANDREAD, fio.Mode.RANDWRITE])
+@pytest.mark.parametrize("vcpus", [1])  # , 2], ids=["1vcpu", "2vcpu"])
+@pytest.mark.parametrize("fio_mode", [fio.Mode.RANDREAD])  # , fio.Mode.RANDWRITE])
 @pytest.mark.parametrize("fio_block_size", [4096], ids=["bs4096"])
-@pytest.mark.parametrize("fio_engine", [fio.Engine.LIBAIO, fio.Engine.PSYNC])
+@pytest.mark.parametrize("fio_engine", [fio.Engine.LIBAIO])  # , fio.Engine.PSYNC])
 def test_block_performance(
     uvm_plain_acpi,
     vcpus,
@@ -150,50 +150,50 @@ def test_block_performance(
             metrics.put_metric(f"cpu_utilization_{thread_name}", value, "Percent")
 
 
-@pytest.mark.nonci
-@pytest.mark.parametrize("vcpus", [1, 2], ids=["1vcpu", "2vcpu"])
-@pytest.mark.parametrize("fio_mode", [fio.Mode.RANDREAD])
-@pytest.mark.parametrize("fio_block_size", [4096], ids=["bs4096"])
-def test_block_vhost_user_performance(
-    uvm_plain_acpi,
-    vcpus,
-    fio_mode,
-    fio_block_size,
-    metrics,
-    results_dir,
-):
-    """
-    Execute block device emulation benchmarking scenarios.
-    """
-
-    vm = uvm_plain_acpi
-    vm.spawn(log_level="Info", emit_metrics=True)
-    vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB)
-    vm.add_net_iface()
-
-    # Add a secondary block device for benchmark tests.
-    fs = drive_tools.FilesystemFile(size=BLOCK_DEVICE_SIZE_MB)
-    vm.add_vhost_user_drive("scratch", fs.path)
-    vm.start()
-
-    metrics.set_dimensions(
-        {
-            "performance_test": "test_block_performance",
-            "io_engine": "vhost-user",
-            "fio_mode": fio_mode,
-            "fio_block_size": str(fio_block_size),
-            "fio_engine": "libaio",
-            **vm.dimensions,
-        }
-    )
-
-    next_cpu = vm.pin_threads(0)
-    vm.disks_vhost_user["scratch"].pin(next_cpu)
-
-    cpu_util = run_fio(vm, fio_mode, fio_block_size, results_dir, fio.Engine.LIBAIO)
-
-    emit_fio_metrics(results_dir, metrics)
-
-    for thread_name, values in cpu_util.items():
-        for value in values:
-            metrics.put_metric(f"cpu_utilization_{thread_name}", value, "Percent")
+# @pytest.mark.nonci
+# @pytest.mark.parametrize("vcpus", [1, 2], ids=["1vcpu", "2vcpu"])
+# @pytest.mark.parametrize("fio_mode", [fio.Mode.RANDREAD])
+# @pytest.mark.parametrize("fio_block_size", [4096], ids=["bs4096"])
+# def test_block_vhost_user_performance(
+#     uvm_plain_acpi,
+#     vcpus,
+#     fio_mode,
+#     fio_block_size,
+#     metrics,
+#     results_dir,
+# ):
+#     """
+#     Execute block device emulation benchmarking scenarios.
+#     """
+#
+#     vm = uvm_plain_acpi
+#     vm.spawn(log_level="Info", emit_metrics=True)
+#     vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB)
+#     vm.add_net_iface()
+#
+#     # Add a secondary block device for benchmark tests.
+#     fs = drive_tools.FilesystemFile(size=BLOCK_DEVICE_SIZE_MB)
+#     vm.add_vhost_user_drive("scratch", fs.path)
+#     vm.start()
+#
+#     metrics.set_dimensions(
+#         {
+#             "performance_test": "test_block_performance",
+#             "io_engine": "vhost-user",
+#             "fio_mode": fio_mode,
+#             "fio_block_size": str(fio_block_size),
+#             "fio_engine": "libaio",
+#             **vm.dimensions,
+#         }
+#     )
+#
+#     next_cpu = vm.pin_threads(0)
+#     vm.disks_vhost_user["scratch"].pin(next_cpu)
+#
+#     cpu_util = run_fio(vm, fio_mode, fio_block_size, results_dir, fio.Engine.LIBAIO)
+#
+#     emit_fio_metrics(results_dir, metrics)
+#
+#     for thread_name, values in cpu_util.items():
+#         for value in values:
+#             metrics.put_metric(f"cpu_utilization_{thread_name}", value, "Percent")
diff --git a/tools/ab_test.py b/tools/ab_test.py
@@ -24,7 +24,7 @@
 import subprocess
 from collections import defaultdict
 from pathlib import Path
-from typing import Callable, List, TypeVar
+from typing import Callable, List, TypeVar, Optional
 
 import scipy
 
@@ -194,16 +194,22 @@ def uninteresting_dimensions(data):
     return uninteresting
 
 
-def collect_data(tag: str, binary_dir: Path, pytest_opts: str):
+def collect_data(tag: str, binary_dir: Path, artifacts: Optional[Path], pytest_opts: str):
     """
     Executes the specified test using the provided firecracker binaries and
     stores results into the `test_results/tag` directory
     """
     binary_dir = binary_dir.resolve()
 
-    print(f"Collecting samples with {binary_dir}")
+    print(f"Collecting samples | binaries path: {binary_dir}" + f" | artifacts path: {artifacts}" if artifacts else "")
     test_path = f"test_results/{tag}"
     test_report_path = f"{test_path}/test-report.json"
+
+    # It is not possible to just download them here this script is usually run inside docker
+    # and artifacts downloading does not work inside it.
+    if artifacts:
+        subprocess.run(f"./tools/devtool set_current_artifacts {artifacts}", check=True, shell=True)
+
     subprocess.run(
         f"./tools/test.sh --binary-dir={binary_dir} {pytest_opts} -m '' --json-report-file=../{test_report_path}",
         env=os.environ
@@ -371,25 +377,29 @@ def analyze_data(
 
 
 def binary_ab_test(
-    test_runner: Callable[[Path, bool], T],
+    test_runner: Callable[[Path, Optional[Path], bool], T],
     comparator: Callable[[T, T], U],
     *,
     a_directory: Path,
     b_directory: Path,
+    a_artifacts: Optional[Path],
+    b_artifacts: Optional[Path],
 ):
     """
     Similar to `git_ab_test`, but instead of locally checking out different revisions, it operates on
     directories containing firecracker/jailer binaries
     """
-    result_a = test_runner(a_directory, True)
-    result_b = test_runner(b_directory, False)
+    result_a = test_runner(a_directory, a_artifacts, True)
+    result_b = test_runner(b_directory, b_artifacts, False)
 
     return result_a, result_b, comparator(result_a, result_b)
 
 
 def ab_performance_test(
-    a_revision: Path,
-    b_revision: Path,
+    a_directory: Path,
+    b_directory: Path,
+    a_artifacts: Optional[Path],
+    b_artifacts: Optional[Path],
     pytest_opts,
     p_thresh,
     strength_abs_thresh,
@@ -398,7 +408,7 @@ def ab_performance_test(
     """Does an A/B-test of the specified test with the given firecracker/jailer binaries"""
 
     return binary_ab_test(
-        lambda bin_dir, is_a: collect_data(is_a and "A" or "B", bin_dir, pytest_opts),
+        lambda bin_dir, art_dir, is_a: collect_data(is_a and "A" or "B", bin_dir, art_dir, pytest_opts),
         lambda ah, be: analyze_data(
             ah,
             be,
@@ -407,8 +417,10 @@ def ab_performance_test(
             noise_threshold,
             n_resamples=int(100 / p_thresh),
         ),
-        a_directory=a_revision,
-        b_directory=b_revision,
+        a_directory=a_directory,
+        b_directory=b_directory,
+        a_artifacts=a_artifacts,
+        b_artifacts=b_artifacts,
     )
 
 
@@ -433,6 +445,18 @@ def ab_performance_test(
         type=Path,
         required=True,
     )
+    run_parser.add_argument(
+        "--artifacts-a",
+        help="Name of the artifacts directory in the build/artifacts to use for revision A test. If the directory does not exist, the name will be treated as S3 path and artifacts will be downloaded from there.",
+        type=Path,
+        required=False,
+    )
+    run_parser.add_argument(
+        "--artifacts-b",
+        help="Name of the artifacts directory in the build/artifacts to use for revision B test. If the directory does not exist, the name will be treated as S3 path and artifacts will be downloaded from there.",
+        type=Path,
+        required=False,
+    )
     run_parser.add_argument(
         "--pytest-opts",
         help="Parameters to pass through to pytest, for example for test selection",
@@ -476,6 +500,8 @@ def ab_performance_test(
         ab_performance_test(
             args.binaries_a,
             args.binaries_b,
+            args.artifacts_a,
+            args.artifacts_b,
             args.pytest_opts,
             args.significance,
             args.absolute_strength,
diff --git a/tools/devtool b/tools/devtool
@@ -606,6 +606,15 @@ cmd_download_ci_artifacts() {
     done
 }
 
+cmd_set_current_artifacts() {
+    local artifacts=$1
+    if [ -z $artifacts ]; then
+      say "No artifacts were specified"
+    fi
+    local artifacts_local_path=$(get_local_artifacts_path $artifacts)/$(uname -m)
+    echo $artifacts_local_path > $LOCAL_ARTIFACTS_CURRENT_DIR_FILE
+}
+
 ensure_ci_artifacts() {
     local artifacts=$1
 
@@ -767,6 +776,10 @@ cmd_test() {
             "--no-ci-artifacts-check")
                 do_ci_artifacts_check=0
                 ;;
+            "--use-artifacts")
+                shift
+                local artifacts="$1"
+                ;;
             "--")               { shift; break;     } ;;
             *)
                 die "Unknown argument: $1. Please use --help for help."
@@ -779,7 +792,7 @@ cmd_test() {
     [ $do_kvm_check != 0 ] && ensure_kvm
     ensure_devctr
     ensure_build_dir
-    [ $do_ci_artifacts_check != 0 ] && ensure_ci_artifacts
+    [ $do_ci_artifacts_check != 0 ] && ensure_ci_artifacts $artifacts
     if [ $do_build != 0 ]; then
       cmd_build --release
       if [ -n "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" ]; then