Skip to content

Commit d20dbfa

Browse files
committed
Use simpler faster Rabin-Karp-like search for short needle
1 parent 1898dfc commit d20dbfa

File tree

1 file changed

+83
-27
lines changed

1 file changed

+83
-27
lines changed

src/string.cr

+83-27
Original file line number | Diff line number | Diff line change
@@ -3312,15 +3312,20 @@ class String
33123312
# Update rolling hash for Rabin-Karp algorithm `String#index`.
33133313
private macro update_hash(n) # expands to n copies of the rolling-hash step below
33143314
{% for i in 1..n %}
3315-
{% if i != 1 %}
3316-
byte = head_pointer.value
3317-
{% end %}
3315+
byte = head_pointer.value # byte at head_pointer; its contribution is subtracted on the next line
33183316
hash = hash &* PRIME_RK &+ pointer.value &- pow &* byte # multiply by PRIME_RK, add the byte at pointer, subtract pow * byte (wrapping ops)
33193317
pointer += 1 # advance the pointer whose byte was just added
33203318
head_pointer += 1 # advance the pointer whose byte was just subtracted
33213319
{% end %}
33223320
end
33233321

3322+
private macro update_simplehash(n) # expands to n copies of the shift-based hash step below
3323+
{% for i in 1..n %}
3324+
hash = (hash << 8) | pointer.value # shift hash left by 8 bits and OR in the byte at pointer; the visible call site compares `hash & mask`
3325+
pointer += 1 # move on to the next byte
3326+
{% end %}
3327+
end
3328+
33243329
# Returns the index of the _first_ occurrence of *search* in the string, or `nil` if not present.
33253330
# If *offset* is present, it defines the position to start the search.
33263331
#
@@ -3360,13 +3365,6 @@ class String
33603365
# Rabin-Karp algorithm
33613366
# https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
33623367

3363-
# calculate a rolling hash of search text (needle)
3364-
search_hash = 0u32
3365-
search.each_byte do |b|
3366-
search_hash = search_hash &* PRIME_RK &+ b
3367-
end
3368-
pow = PRIME_RK &** search.bytesize
3369-
33703368
# Find start index with offset
33713369
char_index = 0
33723370
pointer = to_unsafe
@@ -3377,24 +3375,63 @@ class String
33773375
char_index += 1
33783376
end
33793377

3380-
head_pointer = pointer
3378+
return if pointer + search.bytesize > end_pointer
33813379

3382-
# calculate a rolling hash of this text (haystack)
3380+
if search.bytesize == 1
3381+
byte = search.to_unsafe[0]
3382+
while pointer < end_pointer
3383+
return char_index if pointer.value == byte
3384+
pointer += String.char_bytesize_at(pointer)
3385+
char_index += 1
3386+
end
3387+
return nil
3388+
end
3389+
3390+
head_pointer = pointer
3391+
search_hash = 0u32
33833392
hash = 0u32
3384-
hash_end_pointer = pointer + search.bytesize
3385-
return if hash_end_pointer > end_pointer
3386-
while pointer < hash_end_pointer
3393+
3394+
if search.bytesize <= 4
3395+
# simplified version with multiplier == 256
3396+
mask = 0u32
3397+
search.each_byte do |b|
3398+
search_hash = (search_hash << 8) | b
3399+
hash = (hash << 8) | pointer.value
3400+
mask = (mask << 8) | 0xff
3401+
pointer += 1
3402+
end
3403+
3404+
while true
3405+
return char_index if (hash & mask) == search_hash
3406+
3407+
char_bytesize = String.char_bytesize_at(head_pointer)
3408+
return if pointer + char_bytesize > end_pointer
3409+
case char_bytesize
3410+
when 1 then update_simplehash 1
3411+
when 2 then update_simplehash 2
3412+
when 3 then update_simplehash 3
3413+
else update_simplehash 4
3414+
end
3415+
3416+
head_pointer += char_bytesize
3417+
char_index += 1
3418+
end
3419+
end
3420+
3421+
# calculate a rolling hash of search text (needle) and this text (haystack)
3422+
search.each_byte do |b|
3423+
search_hash = search_hash &* PRIME_RK &+ b
33873424
hash = hash &* PRIME_RK &+ pointer.value
33883425
pointer += 1
33893426
end
3427+
pow = PRIME_RK &** search.bytesize
33903428

33913429
while true
33923430
# check hash equality and real string equality
33933431
if hash == search_hash && head_pointer.memcmp(search.to_unsafe, search.bytesize) == 0
33943432
return char_index
33953433
end
33963434

3397-
byte = head_pointer.value
33983435
char_bytesize = String.char_bytesize_at(head_pointer)
33993436
return if pointer + char_bytesize > end_pointer
34003437
case char_bytesize
@@ -3688,27 +3725,46 @@ class String
36883725
return if offset < 0
36893726

36903727
return bytesize < offset ? nil : offset if search.empty?
3728+
return byte_index(search.to_unsafe[0], offset) if search.bytesize == 1
36913729

36923730
# Rabin-Karp algorithm
36933731
# https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
36943732

3695-
# calculate a rolling hash of search text (needle)
3733+
pointer = to_unsafe + offset
3734+
end_pointer = to_unsafe + bytesize
3735+
return if pointer + search.bytesize > end_pointer
3736+
36963737
search_hash = 0u32
3697-
search.each_byte do |b|
3698-
search_hash = search_hash &* PRIME_RK &+ b
3738+
hash = 0u32
3739+
3740+
if search.bytesize <= 4
3741+
# simplified version with multiplier == 256
3742+
mask = 0u32
3743+
search.each_byte do |b|
3744+
search_hash = (search_hash << 8) | b
3745+
hash = (hash << 8) | pointer.value
3746+
mask = (mask << 8) | 0xff
3747+
pointer += 1
3748+
end
3749+
3750+
while true
3751+
return offset if (hash & mask) == search_hash
3752+
return if pointer >= end_pointer
3753+
hash = (hash << 8) | pointer.value
3754+
pointer += 1
3755+
offset += 1
3756+
end
36993757
end
3700-
pow = PRIME_RK &** search.bytesize
37013758

3702-
# calculate a rolling hash of this text (haystack)
3703-
pointer = head_pointer = to_unsafe + offset
3704-
hash_end_pointer = pointer + search.bytesize
3705-
end_pointer = to_unsafe + bytesize
3706-
hash = 0u32
3707-
return if hash_end_pointer > end_pointer
3708-
while pointer < hash_end_pointer
3759+
head_pointer = pointer
3760+
3761+
# calculate a rolling hash of search text (needle) and this text (haystack)
3762+
search.each_byte do |b|
3763+
search_hash = search_hash &* PRIME_RK &+ b
37093764
hash = hash &* PRIME_RK &+ pointer.value
37103765
pointer += 1
37113766
end
3767+
pow = PRIME_RK &** search.bytesize
37123768

37133769
while true
37143770
# check hash equality and real string equality

0 commit comments

Comments
 (0)