SPMI: Avoid duplicate example diffs in diffs summary

jakobbotsch · jakobbotsch · commit 3d5ec4499cda · 2024-12-11T17:29:10.000+01:00
This deduplicates the examples that can be picked by their method name;
this means that the examples no longer should show multiple example
diffs with the same name.
diff --git a/src/coreclr/scripts/superpmi.py b/src/coreclr/scripts/superpmi.py
@@ -1954,7 +1954,7 @@ def aggregate_diff_metrics(details_file):
     # Project out these fields for the saved diffs, to use for further
     # processing. Saving everything into memory is costly on memory when there
     # are a large number of diffs.
-    diffs_fields = ["Context", "Context size", "Base ActualCodeBytes", "Diff ActualCodeBytes", "Base PerfScore", "Diff PerfScore"]
+    diffs_fields = ["Context", "Method full name", "Context size", "Base ActualCodeBytes", "Diff ActualCodeBytes", "Base PerfScore", "Diff PerfScore"]
     diffs = []
 
     for row in read_csv(details_file):
@@ -2725,12 +2725,22 @@ def diff_pct(r):
 
             display_subset("Smallest {} zero sized diffs:", smallest_zero_size_contexts)
 
-            by_diff_size_pct_examples = [diff for diff in by_diff_size_pct if abs(int(diff['Diff ActualCodeBytes']) - int(diff['Base ActualCodeBytes'])) < 50]
-            if len(by_diff_size_pct_examples) == 0:
-                by_diff_size_pct_examples = by_diff_size_pct
+            # Prefer to show small diffs over large percentage wise diffs; sort by this additionally.
+            # sorted is stable, so for multiple small diffs this will keep them in order of percentage wise improvement/regression.
+            def is_small_diff(row):
+                if abs(int(row['Diff ActualCodeBytes']) - int(row['Base ActualCodeBytes'])) < 50:
+                    return 0
+
+                return 1
+
+            by_small_then_improvement = sorted(by_diff_size_pct, key=is_small_diff)
+            by_small_then_regression = sorted(reversed(by_diff_size_pct), key=is_small_diff)
+
+            example_improvements = by_small_then_improvement[:3]
+
+            example_improvement_contexts = set(row["Context"] for row in example_improvements)
+            example_regressions = [row for row in by_small_then_regression if row["Context"] not in example_improvement_contexts][:3]
 
-            example_improvements = by_diff_size_pct_examples[:3]
-            example_regressions = by_diff_size_pct_examples[3:][-3:]
             contexts = smallest_contexts + top_improvements + top_regressions + top_improvements_pct + top_regressions_pct + smallest_zero_size_contexts + example_improvements + example_regressions
             examples = example_improvements + example_regressions