Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions pydatastructs/strings/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
'find'
]

# Default base ``p`` of the polynomial string hash (see ``_p_pow`` and
# ``_hash_str``).
PRIME_NUMBER = 257
# Default modulus ``m`` that every hash value and power is reduced by.
MOD = 1000000007

def find(text, query, algorithm):
"""
Finds occurrence of a query string within the text string.
Expand All @@ -22,6 +24,7 @@ def find(text, query, algorithm):
Currently the following algorithms are
supported,
'kmp' -> Knuth-Morris-Pratt as given in [1].
'rabin_karp' -> Rabin–Karp algorithm as given in [2].

Returns
=======
Expand Down Expand Up @@ -52,6 +55,7 @@ def find(text, query, algorithm):
==========

.. [1] https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm
.. [2] https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
"""
import pydatastructs.strings.algorithms as algorithms
func = "_" + algorithm
Expand All @@ -64,6 +68,8 @@ def find(text, query, algorithm):


def _knuth_morris_pratt(text, query):
    """
    Searches ``text`` for ``query`` with the Knuth-Morris-Pratt routine.

    When ``text`` or ``query`` has length zero an empty
    ``DynamicOneDimensionalArray`` of ints is returned straight away.
    Otherwise the table produced by ``_build_kmp_table(query)`` is passed,
    together with both strings, to ``_do_match`` and its result is
    returned unchanged.
    """
    # ``and`` stops at the first zero length, so ``query`` is only
    # measured when ``text`` is non-empty.
    if not (len(text) and len(query)):
        return DynamicOneDimensionalArray(int, 0)
    return _do_match(text, query, _build_kmp_table(query))

Expand Down Expand Up @@ -107,3 +113,40 @@ def _do_match(string, query, kmp_table):
k = k + 1

return positions

def _p_pow(length, p=PRIME_NUMBER, m=MOD):
    """
    Builds a table of successive powers of ``p`` reduced modulo ``m``.

    Parameters
    ==========

    length: int
        Number of entries in the table. Index 0 is written
        unconditionally, so the table is expected to hold at
        least one entry.
    p: int
        Base whose powers are tabulated.
    m: int
        Modulus applied to every entry after the first.

    Returns
    =======

    powers: OneDimensionalArray
        ``powers[0]`` is 1 and, for ``i >= 1``, ``powers[i]`` equals
        ``(powers[i - 1] * p) % m``.
    """
    powers = OneDimensionalArray(int, length)
    powers[0] = 1
    exponent = 1
    while exponent < length:
        # Each entry is derived from its predecessor, keeping values < m.
        powers[exponent] = (powers[exponent - 1] * p) % m
        exponent += 1
    return powers

def _hash_str(string, p=PRIME_NUMBER, m=MOD):
    """
    Computes the polynomial hash of ``string``.

    The value is the sum of ``ord(string[i]) * p**i`` over all indices
    ``i``, reduced modulo ``m``.

    Parameters
    ==========

    string: str
        The string to be hashed. An empty string hashes to 0.
    p: int
        Base of the polynomial.
    m: int
        Modulus the running sum and the running power are reduced by.

    Returns
    =======

    hash_value: int
        The hash of ``string``.
    """
    hash_value = 0
    # The current power of ``p`` is carried along instead of being read
    # from a precomputed table: the table would be walked exactly once,
    # and building it writes to index 0, which needs a non-empty string.
    power = 1
    for char in string:
        hash_value = (hash_value + ord(char) * power) % m
        power = (power * p) % m
    return hash_value

def _rabin_karp(text, query):
    """
    Finds every occurrence of ``query`` in ``text`` using prefix hashes.

    ``text_hash[k]`` holds the polynomial hash of ``text[:k]``, so the
    hash of the window starting at ``i`` is
    ``text_hash[i + q] - text_hash[i]``, which carries an extra factor of
    ``p**i``. Instead of dividing that factor out, the query hash is
    multiplied by ``p**i`` before the comparison.

    Parameters
    ==========

    text: str
        The string in which the query is searched.
    query: str
        The string to look for.

    Returns
    =======

    positions: DynamicOneDimensionalArray
        Start indices, in increasing order, of the windows of ``text``
        equal to ``query``. Empty when either string is empty or when
        ``query`` is longer than ``text``.
    """
    t = len(text)
    q = len(query)
    positions = DynamicOneDimensionalArray(int, 0)
    # No window exists when a string is empty or the query is too long,
    # so the hash tables are not worth building.
    if q == 0 or t == 0 or q > t:
        return positions

    query_hash = _hash_str(query)
    p_pow = _p_pow(t)

    text_hash = OneDimensionalArray(int, t + 1)
    text_hash.fill(0)
    for i in range(t):
        text_hash[i + 1] = (text_hash[i] + ord(text[i]) * p_pow[i]) % MOD

    for i in range(t - q + 1):
        # Adding MOD before subtracting keeps the difference non-negative.
        curr_hash = (text_hash[i + q] + MOD - text_hash[i]) % MOD
        if curr_hash != (query_hash * p_pow[i]) % MOD:
            continue
        # Equal hashes do not imply equal strings (different windows can
        # collide), so the characters are compared before reporting.
        if text[i:i + q] == query:
            positions.append(i)

    return positions


10 changes: 7 additions & 3 deletions pydatastructs/strings/tests/test_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
def test_kmp():
    """Runs the shared string-matching checks with the 'kmp' algorithm."""
    _test_common_string_matching(algorithm='kmp')

def test_rka():
    """Runs the shared string-matching checks with 'rabin_karp'."""
    _test_common_string_matching(algorithm='rabin_karp')

def _test_common_string_matching(algorithm):
true_text_pattern_dictionary = {
Expand All @@ -26,7 +28,9 @@ def _test_common_string_matching(algorithm):
"Knuth-Morris-Pratt": "-Pratt-",
"abcabcabcabdabcabdabcabca": "qwertyuiopzxcvbnm",
"aefcdfaecdaefaefcdaefeaefcdcdeae": "cdaefaefe",
"fullstringmatch": "fullstrinmatch"
"fullstringmatch": "fullstrinmatch",
"abc": "",
"": "abc"
}

for test_case_key in false_text_pattern_dictionary:
Expand All @@ -52,13 +56,13 @@ def gen_random_string(length):
if rand_str != query:
freq += 1
text += query + rand_str + query
positions = find(text, query, algorithm="kmp")
positions = find(text, query, algorithm)
assert positions._num == num_times * 2
for i in range(positions._last_pos_filled):
p = positions[i]
assert text[p:p + len(query)] == query

text = gen_random_string(len(query))
if text != query:
positions = find(text, query, algorithm="kmp")
positions = find(text, query, algorithm)
assert positions.size == 0