@@ -908,95 +908,105 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
908
908
+ abcdefGhijkl
909
909
? ^ ^ ^
910
910
"""
911
-
912
- # don't synch up unless the lines have a similarity score of at
913
- # least cutoff; best_ratio tracks the best score seen so far
914
- # best_ratio is a tuple storing the best .ratio() seen so far, and
915
- # a measure of how far the indices are from their index range
916
- # midpoints. The latter is used to resolve ratio ties. Favoring
917
- # indices near the midpoints tends to cut the ranges in half. Else,
918
- # if there are many pairs with the best ratio, recursion can grow
919
- # very deep, and runtime becomes cubic. See:
911
+ from operator import ge , gt
912
+ # Don't synch up unless the lines have a similarity score of at
913
+ # least cutoff; best_ratio tracks the best score seen so far.
914
+ # Keep track of all index pairs achieving the best ratio and
915
+ # deal with them here. Previously only the smallest pair was
916
+ # handled here, and if there are many pairs with the best ratio,
917
+ # recursion could grow very deep, and runtime cubic. See:
920
918
# https://github.com/python/cpython/issues/119105
921
- best_ratio , cutoff = ( 0.74 , 0 ) , 0.75
919
+ best_ratio , cutoff = 0.74 , 0.75
922
920
cruncher = SequenceMatcher (self .charjunk )
923
921
eqi , eqj = None , None # 1st indices of equal lines (if any)
922
+ # List of index pairs achieving best_ratio. Strictly increasing
923
+ # in both index positions.
924
+ max_pairs = []
925
+ maxi = - 1 # `i` index of last pair in max_pairs
924
926
925
927
# search for the pair that matches best without being identical
926
- # (identical lines must be junk lines, & we don't want to synch up
927
- # on junk -- unless we have to)
928
- amid = (alo + ahi - 1 ) / 2
929
- bmid = (blo + bhi - 1 ) / 2
928
+ # (identical lines must be junk lines, & we don't want to synch
929
+ # up on junk -- unless we have to)
930
+ crqr = cruncher .real_quick_ratio
931
+ cqr = cruncher .quick_ratio
932
+ cr = cruncher .ratio
930
933
for j in range (blo , bhi ):
931
934
bj = b [j ]
932
935
cruncher .set_seq2 (bj )
933
- weight_j = - abs (j - bmid )
936
+ # Find new best, if possible. Else search for the smallest i
937
+ # (if any) > maxi that equals the best ratio
938
+ search_equal = True
934
939
for i in range (alo , ahi ):
935
940
ai = a [i ]
936
941
if ai == bj :
937
942
if eqi is None :
938
943
eqi , eqj = i , j
939
944
continue
940
945
cruncher .set_seq1 (ai )
941
- # weight is used to balance the recursion by prioritizing
942
- # i and j in the middle of their ranges
943
- weight = weight_j - abs (i - amid )
944
946
# computing similarity is expensive, so use the quick
945
947
# upper bounds first -- have seen this speed up messy
946
948
# compares by a factor of 3.
947
- # note that ratio() is only expensive to compute the first
948
- # time it's called on a sequence pair; the expensive part
949
- # of the computation is cached by cruncher
950
- if (cruncher .real_quick_ratio (), weight ) > best_ratio and \
951
- (cruncher .quick_ratio (), weight ) > best_ratio and \
952
- (cruncher .ratio (), weight ) > best_ratio :
953
- best_ratio , best_i , best_j = (cruncher .ratio (), weight ), i , j
954
- best_ratio , _ = best_ratio
949
+ cmp = ge if search_equal and i > maxi else gt
950
+ if (cmp (crqr (), best_ratio )
951
+ and cmp (cqr (), best_ratio )
952
+ and cmp ((ratio := cr ()), best_ratio )):
953
+ if ratio > best_ratio :
954
+ best_ratio = ratio
955
+ max_pairs .clear ()
956
+ else :
957
+ assert best_ratio == ratio and search_equal
958
+ assert i > maxi
959
+ max_pairs .append ((i , j ))
960
+ maxi = i
961
+ search_equal = False
955
962
if best_ratio < cutoff :
963
+ assert not max_pairs
956
964
# no non-identical "pretty close" pair
957
965
if eqi is None :
958
966
# no identical pair either -- treat it as a straight replace
959
967
yield from self ._plain_replace (a , alo , ahi , b , blo , bhi )
960
968
return
961
969
# no close pair, but an identical pair -- synch up on that
962
- best_i , best_j , best_ratio = eqi , eqj , 1.0
970
+ max_pairs = [( eqi , eqj )]
963
971
else :
964
972
# there's a close pair, so forget the identical pair (if any)
973
+ assert max_pairs
965
974
eqi = None
966
975
967
- # a[best_i] very similar to b[best_j]; eqi is None iff they're not
968
- # identical
969
-
970
- # pump out diffs from before the synch point
971
- yield from self . _fancy_helper ( a , alo , best_i , b , blo , best_j )
972
-
973
- # do intraline marking on the synch pair
974
- aelt , belt = a [ best_i ], b [ best_j ]
975
- if eqi is None :
976
- # pump out a '-', '?', '+', '?' quad for the synched lines
977
- atags = btags = ""
978
- cruncher .set_seqs ( aelt , belt )
979
- for tag , ai1 , ai2 , bj1 , bj2 in cruncher . get_opcodes ():
980
- la , lb = ai2 - ai1 , bj2 - bj1
981
- if tag == 'replace' :
982
- atags += '^' * la
983
- btags += '^' * lb
984
- elif tag == 'delete' :
985
- atags += '-' * la
986
- elif tag == 'insert' :
987
- btags += '+' * lb
988
- elif tag == 'equal' :
989
- atags += ' ' * la
990
- btags += ' ' * lb
991
- else :
992
- raise ValueError ( 'unknown tag %r' % ( tag ,) )
993
- yield from self . _qformat ( aelt , belt , atags , btags )
994
- else :
995
- # the synch pair is identical
996
- yield ' ' + aelt
976
+ last_i , last_j = alo , blo
977
+ for this_i , this_j in max_pairs :
978
+ # pump out diffs from before the synch point
979
+ yield from self . _fancy_helper ( a , last_i , this_i ,
980
+ b , last_j , this_j )
981
+ # do intraline marking on the synch pair
982
+ aelt , belt = a [ this_i ], b [ this_j ]
983
+ if eqi is None :
984
+ # pump out a '-', '?', '+', '?' quad for the synched lines
985
+ atags = btags = ""
986
+ cruncher . set_seqs ( aelt , belt )
987
+ for tag , ai1 , ai2 , bj1 , bj2 in cruncher .get_opcodes ():
988
+ la , lb = ai2 - ai1 , bj2 - bj1
989
+ if tag == 'replace' :
990
+ atags += '^' * la
991
+ btags += '^' * lb
992
+ elif tag == 'delete' :
993
+ atags += '-' * la
994
+ elif tag == 'insert' :
995
+ btags += '+' * lb
996
+ elif tag == 'equal' :
997
+ atags += ' ' * la
998
+ btags += ' ' * lb
999
+ else :
1000
+ raise ValueError ( 'unknown tag %r' % ( tag ,))
1001
+ yield from self . _qformat ( aelt , belt , atags , btags )
1002
+ else :
1003
+ # the synch pair is identical
1004
+ yield ' ' + aelt
1005
+ last_i , last_j = this_i + 1 , this_j + 1
997
1006
998
- # pump out diffs from after the synch point
999
- yield from self ._fancy_helper (a , best_i + 1 , ahi , b , best_j + 1 , bhi )
1007
+ # pump out diffs from after the last synch point
1008
+ yield from self ._fancy_helper (a , last_i , ahi ,
1009
+ b , last_j , bhi )
1000
1010
1001
1011
def _fancy_helper (self , a , alo , ahi , b , blo , bhi ):
1002
1012
g = []
0 commit comments