gh-119105: difflib.py Differ.compare is too slow [for degenerate cases] (#119376)

tim-one · web-flow · commit 07df93de73b6 · 2024-05-22T18:25:08.000-05:00
Track all pairs achieving the best ratio in Differ(). This repairs the "very deep recursion and cubic time" bad cases in a way that preserves previous output.
diff --git a/Lib/difflib.py b/Lib/difflib.py
@@ -908,95 +908,105 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
         + abcdefGhijkl
         ?    ^  ^  ^
         """
-
-        # don't synch up unless the lines have a similarity score of at
-        # least cutoff; best_ratio tracks the best score seen so far
-        # best_ratio is a tuple storing the best .ratio() seen so far, and
-        # a measure of how far the indices are from their index range
-        # midpoints. The latter is used to resolve ratio ties. Favoring
-        # indices near the midpoints tends to cut the ranges in half. Else,
-        # if there are many pairs with the best ratio, recursion can grow
-        # very deep, and runtime becomes cubic. See:
+        from operator import ge, gt
+        # Don't synch up unless the lines have a similarity score of at
+        # least cutoff; best_ratio tracks the best score seen so far.
+        # Keep track of all index pairs achieving the best ratio and
+        # deal with them here. Previously only the smallest pair was
+        # handled here, and if there are many pairs with the best ratio,
+        # recursion could grow very deep, and runtime cubic. See:
         # https://github.com/python/cpython/issues/119105
-        best_ratio, cutoff = (0.74, 0), 0.75
+        best_ratio, cutoff = 0.74, 0.75
         cruncher = SequenceMatcher(self.charjunk)
         eqi, eqj = None, None   # 1st indices of equal lines (if any)
+        # List of index pairs achieving best_ratio. Strictly increasing
+        # in both index positions.
+        max_pairs = []
+        maxi = -1 # `i` index of last pair in max_pairs
 
         # search for the pair that matches best without being identical
-        # (identical lines must be junk lines, & we don't want to synch up
-        # on junk -- unless we have to)
-        amid = (alo + ahi - 1) / 2
-        bmid = (blo + bhi - 1) / 2
+        # (identical lines must be junk lines, & we don't want to synch
+        # up on junk -- unless we have to)
+        crqr = cruncher.real_quick_ratio
+        cqr = cruncher.quick_ratio
+        cr = cruncher.ratio
         for j in range(blo, bhi):
             bj = b[j]
             cruncher.set_seq2(bj)
-            weight_j = - abs(j - bmid)
+            # Find new best, if possible. Else search for the smallest i
+            # (if any) > maxi that equals the best ratio
+            search_equal = True
             for i in range(alo, ahi):
                 ai = a[i]
                 if ai == bj:
                     if eqi is None:
                         eqi, eqj = i, j
                     continue
                 cruncher.set_seq1(ai)
-                # weight is used to balance the recursion by prioritizing
-                # i and j in the middle of their ranges
-                weight = weight_j - abs(i - amid)
                 # computing similarity is expensive, so use the quick
                 # upper bounds first -- have seen this speed up messy
                 # compares by a factor of 3.
-                # note that ratio() is only expensive to compute the first
-                # time it's called on a sequence pair; the expensive part
-                # of the computation is cached by cruncher
-                if (cruncher.real_quick_ratio(), weight) > best_ratio and \
-                      (cruncher.quick_ratio(), weight) > best_ratio and \
-                      (cruncher.ratio(), weight) > best_ratio:
-                    best_ratio, best_i, best_j = (cruncher.ratio(), weight), i, j
-        best_ratio, _ = best_ratio
+                cmp = ge if search_equal and i > maxi else gt
+                if (cmp(crqr(), best_ratio)
+                      and cmp(cqr(), best_ratio)
+                      and cmp((ratio := cr()), best_ratio)):
+                    if ratio > best_ratio:
+                        best_ratio = ratio
+                        max_pairs.clear()
+                    else:
+                        assert best_ratio == ratio and search_equal
+                        assert i > maxi
+                    max_pairs.append((i, j))
+                    maxi = i
+                    search_equal = False
         if best_ratio < cutoff:
+            assert not max_pairs
             # no non-identical "pretty close" pair
             if eqi is None:
                 # no identical pair either -- treat it as a straight replace
                 yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
                 return
             # no close pair, but an identical pair -- synch up on that
-            best_i, best_j, best_ratio = eqi, eqj, 1.0
+            max_pairs = [(eqi, eqj)]
         else:
             # there's a close pair, so forget the identical pair (if any)
+            assert max_pairs
             eqi = None
 
-        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
-        # identical
-
-        # pump out diffs from before the synch point
-        yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)
-
-        # do intraline marking on the synch pair
-        aelt, belt = a[best_i], b[best_j]
-        if eqi is None:
-            # pump out a '-', '?', '+', '?' quad for the synched lines
-            atags = btags = ""
-            cruncher.set_seqs(aelt, belt)
-            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
-                la, lb = ai2 - ai1, bj2 - bj1
-                if tag == 'replace':
-                    atags += '^' * la
-                    btags += '^' * lb
-                elif tag == 'delete':
-                    atags += '-' * la
-                elif tag == 'insert':
-                    btags += '+' * lb
-                elif tag == 'equal':
-                    atags += ' ' * la
-                    btags += ' ' * lb
-                else:
-                    raise ValueError('unknown tag %r' % (tag,))
-            yield from self._qformat(aelt, belt, atags, btags)
-        else:
-            # the synch pair is identical
-            yield '  ' + aelt
+        last_i, last_j = alo, blo
+        for this_i, this_j in max_pairs:
+            # pump out diffs from before the synch point
+            yield from self._fancy_helper(a, last_i, this_i,
+                                          b, last_j, this_j)
+            # do intraline marking on the synch pair
+            aelt, belt = a[this_i], b[this_j]
+            if eqi is None:
+                # pump out a '-', '?', '+', '?' quad for the synched lines
+                atags = btags = ""
+                cruncher.set_seqs(aelt, belt)
+                for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
+                    la, lb = ai2 - ai1, bj2 - bj1
+                    if tag == 'replace':
+                        atags += '^' * la
+                        btags += '^' * lb
+                    elif tag == 'delete':
+                        atags += '-' * la
+                    elif tag == 'insert':
+                        btags += '+' * lb
+                    elif tag == 'equal':
+                        atags += ' ' * la
+                        btags += ' ' * lb
+                    else:
+                        raise ValueError('unknown tag %r' % (tag,))
+                yield from self._qformat(aelt, belt, atags, btags)
+            else:
+                # the synch pair is identical
+                yield '  ' + aelt
+            last_i, last_j = this_i + 1, this_j + 1
 
-        # pump out diffs from after the synch point
-        yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
+        # pump out diffs from after the last synch point
+        yield from self._fancy_helper(a, last_i, ahi,
+                                      b, last_j, bhi)
 
     def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
         g = []
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
@@ -272,6 +272,26 @@ def test_make_file_usascii_charset_with_nonascii_input(self):
         self.assertIn('content="text/html; charset=us-ascii"', output)
         self.assertIn('&#305;mpl&#305;c&#305;t', output)
 
+class TestDiffer(unittest.TestCase):
+    def test_close_matches_aligned(self):
+        # Of the 4 closely matching pairs, we want 1 to match with 3,
+        # and 2 with 4, to align with a "top to bottom" mental model.
+        a = ["cat\n", "dog\n", "close match 1\n", "close match 2\n"]
+        b = ["close match 3\n", "close match 4\n", "kitten\n", "puppy\n"]
+        m = difflib.Differ().compare(a, b)
+        self.assertEqual(list(m),
+                           ['- cat\n',
+                            '- dog\n',
+                            '- close match 1\n',
+                            '?             ^\n',
+                            '+ close match 3\n',
+                            '?             ^\n',
+                            '- close match 2\n',
+                            '?             ^\n',
+                            '+ close match 4\n',
+                            '?             ^\n',
+                            '+ kitten\n',
+                            '+ puppy\n'])
 
 class TestOutputFormat(unittest.TestCase):
     def test_tab_delimiter(self):