Skip to content

Commit 77b4b08

Browse files
authored
Merge branch 'main' into changelog-for-4261
2 parents d6277a6 + 60c3b14 commit 77b4b08

File tree

1 file changed

+24
-14
lines changed

1 file changed

+24
-14
lines changed

tools/ab_test.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -275,20 +275,25 @@ def ab_performance_test(
275275
):
276276
failures.append((dimension_set, metric, result, unit))
277277

278-
failure_report = "\n".join(
279-
f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
280-
f"\033[0;31m\033[1m{format_with_reduced_unit(result.statistic, unit)}\033[0m "
281-
f"(from {format_with_reduced_unit(statistics.mean(processed_emf_a[dimension_set][metric][0]), unit)} "
282-
f"to {format_with_reduced_unit(statistics.mean(processed_emf_b[dimension_set][metric][0]), unit)}) "
283-
f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
284-
f"This means that observing a change of this magnitude or worse, assuming that performance "
285-
f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
286-
f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2)}"
287-
for (dimension_set, metric, result, unit) in failures
278+
messages = []
279+
for dimension_set, metric, result, unit in failures:
288280
# Sanity check as described above
289-
if abs(statistics.mean(relative_changes_by_metric[metric])) > noise_threshold
290-
)
291-
assert not failure_report, "\n" + failure_report
281+
if abs(statistics.mean(relative_changes_by_metric[metric])) > noise_threshold:
282+
old_mean = statistics.mean(processed_emf_a[dimension_set][metric][0])
283+
new_mean = statistics.mean(processed_emf_b[dimension_set][metric][0])
284+
285+
msg = (
286+
f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
287+
f"{format_with_reduced_unit(result.statistic, unit)}, or {result.statistic / old_mean:.2%}, "
288+
f"(from {format_with_reduced_unit(old_mean, unit)} to {format_with_reduced_unit(new_mean, unit)}) "
289+
f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
290+
f"This means that observing a change of this magnitude or worse, assuming that performance "
291+
f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
292+
f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2)}"
293+
)
294+
messages.append(msg)
295+
296+
assert not messages, "\n" + "\n".join(messages)
292297
print("No regressions detected!")
293298

294299

@@ -322,7 +327,12 @@ def canonicalize_revision(revision):
322327
type=float,
323328
default=0.0,
324329
)
325-
parser.add_argument("--noise-threshold", type=float, default=0.05)
330+
parser.add_argument(
331+
"--noise-threshold",
332+
help="The minimal delta which a metric has to regress on average across all tests that emit it before the regressions will be considered valid.",
333+
type=float,
334+
default=0.05,
335+
)
326336
args = parser.parse_args()
327337

328338
ab_performance_test(

0 commit comments

Comments
 (0)