@@ -275,20 +275,25 @@ def ab_performance_test(
             ):
                 failures.append((dimension_set, metric, result, unit))
 
-    failure_report = "\n".join(
-        f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
-        f"\033[0;31m\033[1m{format_with_reduced_unit(result.statistic, unit)}\033[0m "
-        f"(from {format_with_reduced_unit(statistics.mean(processed_emf_a[dimension_set][metric][0]), unit)} "
-        f"to {format_with_reduced_unit(statistics.mean(processed_emf_b[dimension_set][metric][0]), unit)}) "
-        f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
-        f"This means that observing a change of this magnitude or worse, assuming that performance "
-        f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
-        f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2)}"
-        for (dimension_set, metric, result, unit) in failures
+    messages = []
+    for dimension_set, metric, result, unit in failures:
         # Sanity check as described above
-        if abs(statistics.mean(relative_changes_by_metric[metric])) > noise_threshold
-    )
-    assert not failure_report, "\n" + failure_report
+        if abs(statistics.mean(relative_changes_by_metric[metric])) > noise_threshold:
+            old_mean = statistics.mean(processed_emf_a[dimension_set][metric][0])
+            new_mean = statistics.mean(processed_emf_b[dimension_set][metric][0])
+
+            msg = (
+                f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
+                f"{format_with_reduced_unit(result.statistic, unit)}, or {result.statistic / old_mean:.2%} "
+                f"(from {format_with_reduced_unit(old_mean, unit)} to {format_with_reduced_unit(new_mean, unit)}) "
+                f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
+                f"This means that observing a change of this magnitude or worse, assuming that performance "
+                f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
+                f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2)}"
+            )
+            messages.append(msg)
+
+    assert not messages, "\n" + "\n".join(messages)
     print("No regressions detected!")
 
 
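The hunk above replaces the old single generator expression with an explicit loop, so the per-metric means are computed once and the relative change (`result.statistic / old_mean`) can be included in the failure message. Below is a minimal, runnable sketch of that loop in isolation; the sample values, the `relative_changes_by_metric` contents, and the plain-number `statistic` are hypothetical stand-ins for what the real script derives from its EMF logs and statistical test:

```python
import statistics

noise_threshold = 0.05

# Hypothetical stand-ins: each failure carries
# (dimension_set, metric, mean difference, samples from run A, samples from run B).
failures = [
    (("instance=m5d.metal",), "throughput", -11.0, [100.0, 102.0, 98.0], [88.0, 90.0, 89.0]),
]
relative_changes_by_metric = {"throughput": [-0.11, -0.10]}

messages = []
for dimension_set, metric, statistic, samples_a, samples_b in failures:
    # Sanity check, as in the diff: only report the metric if, averaged across
    # all tests that emit it, the relative change exceeds the noise threshold.
    if abs(statistics.mean(relative_changes_by_metric[metric])) > noise_threshold:
        old_mean = statistics.mean(samples_a)
        new_mean = statistics.mean(samples_b)
        messages.append(
            f"{metric} changed by {statistic} ({statistic / old_mean:.2%}), "
            f"from {old_mean:.1f} to {new_mean:.1f}, dimensions: {dimension_set}"
        )

print("\n".join(messages) or "No regressions detected!")
```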
@@ -322,7 +327,12 @@ def canonicalize_revision(revision):
         type=float,
         default=0.0,
     )
-    parser.add_argument("--noise-threshold", type=float, default=0.05)
+    parser.add_argument(
+        "--noise-threshold",
+        help="The minimal delta by which a metric has to regress on average, across all tests that emit it, before the regression is considered valid.",
+        type=float,
+        default=0.05,
+    )
     args = parser.parse_args()
 
     ab_performance_test(
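The second hunk only reflows the existing `--noise-threshold` option and adds a help string; parsing behavior is unchanged. As a quick illustration, here is a standalone argparse sketch mirroring just this flag (not the full ab_test.py CLI):

```python
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument(
    "--noise-threshold",
    help="The minimal delta by which a metric has to regress on average, "
    "across all tests that emit it, before the regression is considered valid.",
    type=float,
    default=0.05,
)

print(parser.parse_args([]).noise_threshold)  # 0.05 (the default)
print(parser.parse_args(["--noise-threshold", "0.1"]).noise_threshold)  # 0.1
```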