Skip to content

Commit 73a85c4

Browse files
sweeneydetim-one
andauthored
bpo-41972: Use the two-way algorithm for string searching (GH-22904)
Implement an enhanced variant of Crochemore and Perrin's Two-Way string searching algorithm, which reduces worst-case time from quadratic (the product of the string and pattern lengths) to linear. This applies to forward searches (like``find``, ``index``, ``replace``); the algorithm for reverse searches (like ``rfind``) is not changed. Co-authored-by: Tim Peters <tim.peters@gmail.com>
1 parent 2183d06 commit 73a85c4

File tree

4 files changed

+941
-20
lines changed

4 files changed

+941
-20
lines changed

Lib/test/string_tests.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import unittest, string, sys, struct
66
from test import support
77
from collections import UserList
8+
import random
89

910
class Sequence:
1011
def __init__(self, seq='wxyz'): self.seq = seq
@@ -317,6 +318,44 @@ def test_rindex(self):
317318
else:
318319
self.checkraises(TypeError, 'hello', 'rindex', 42)
319320

321+
def test_find_periodic_pattern(self):
322+
"""Cover the special path for periodic patterns."""
323+
def reference_find(p, s):
324+
for i in range(len(s)):
325+
if s.startswith(p, i):
326+
return i
327+
return -1
328+
329+
rr = random.randrange
330+
choices = random.choices
331+
for _ in range(1000):
332+
p0 = ''.join(choices('abcde', k=rr(10))) * rr(10, 20)
333+
p = p0[:len(p0) - rr(10)] # pop off some characters
334+
left = ''.join(choices('abcdef', k=rr(2000)))
335+
right = ''.join(choices('abcdef', k=rr(2000)))
336+
text = left + p + right
337+
with self.subTest(p=p, text=text):
338+
self.checkequal(reference_find(p, text),
339+
text, 'find', p)
340+
341+
def test_find_shift_table_overflow(self):
342+
"""When the table of 8-bit shifts overflows."""
343+
N = 2**8 + 100
344+
345+
# first check the periodic case
346+
# here, the shift for 'b' is N + 1.
347+
pattern1 = 'a' * N + 'b' + 'a' * N
348+
text1 = 'babbaa' * N + pattern1
349+
self.checkequal(len(text1)-len(pattern1),
350+
text1, 'find', pattern1)
351+
352+
# now check the non-periodic case
353+
# here, the shift for 'd' is 3*(N+1)+1
354+
pattern2 = 'ddd' + 'abc' * N + "eee"
355+
text2 = pattern2[:-1] + "ddeede" * 2 * N + pattern2 + "de" * N
356+
self.checkequal(len(text2) - N*len("de") - len(pattern2),
357+
text2, 'find', pattern2)
358+
320359
def test_lower(self):
321360
self.checkequal('hello', 'HeLLo', 'lower')
322361
self.checkequal('hello', 'hello', 'lower')
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Substring search functions such as ``str1 in str2`` and ``str2.find(str1)`` now sometimes use the "Two-Way" string comparison algorithm to avoid quadratic behavior on long strings.

0 commit comments

Comments
 (0)