Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a simpler, faster Rabin-Karp-like search for short needles #13820

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Prev Previous commit
Next Next commit
Use a function with a type parameter and a block instead of macros
  • Loading branch information
funny-falcon committed Oct 9, 2023
commit f4b138e4a2f164a48f5147fc4e6072be25ad8f04
35 changes: 11 additions & 24 deletions src/string.cr
Original file line number Diff line number Diff line change
Expand Up @@ -3355,30 +3355,22 @@ class String
nil
end

private macro gen_index_short(int_class, by_char)
# simplified Rabin-Karp version with multiplier == 256
search_hash = {{int_class}}.new(0)
hash = {{int_class}}.new(0)
mask = {{int_class}}.new(0)
private def index_short(hash_type, offset : Int32, pointer : UInt8*, end_pointer : UInt8*, search, &)
search_hash = hash_type.new(0)
hash = hash_type.new(0)
mask = hash_type.new(0)

search.each_byte do |b|
search_hash = (search_hash << 8) | b
hash = (hash << 8) | pointer.value
mask = (mask << 8) | 0xff
pointer += 1
end
{% if by_char %}
search_bytesize = search.bytesize
{% end %}

while true
straight-shoota marked this conversation as resolved.
Show resolved Hide resolved
return offset if (hash & mask) == search_hash

{% if by_char %}
char_bytesize = String.char_bytesize_at(pointer - search_bytesize)
{% else %}
char_bytesize = 1
{% end %}
char_bytesize = yield pointer
return if pointer + char_bytesize > end_pointer
case char_bytesize
when 1 then update_simplehash 1
Expand All @@ -3391,10 +3383,6 @@ class String
end
end

private def index_2to8bytes(offset : Int32, pointer : UInt8*, end_pointer : UInt8*, search : String)
gen_index_short(UInt64, true)
end

# :ditto:
def index(search : String, offset = 0)
offset += size if offset < 0
Expand All @@ -3419,7 +3407,10 @@ class String
return if pointer + search.bytesize > end_pointer

if search.bytesize <= 8
return index_2to8bytes(char_index, pointer, end_pointer, search)
search_bytesize = search.bytesize
return index_short(UInt64, char_index, pointer, end_pointer, search) {|pointer|
String.char_bytesize_at(pointer - search_bytesize)
}
end

head_pointer = pointer
Expand Down Expand Up @@ -3734,11 +3725,7 @@ class String

pointer = to_unsafe + offset
end_pointer = to_unsafe + bytesize
gen_index_short(UInt32, false)
end

private def byte_index_2to8bytes(offset : Int32, pointer : UInt8*, end_pointer : UInt8*, search : String)
gen_index_short(UInt64, false)
index_short(UInt32, offset, pointer, end_pointer, search) { 1 }
end

# Returns the byte index of *search* in the string, or `nil` if the string is not present.
Expand Down Expand Up @@ -3772,7 +3759,7 @@ class String
hash = 0u32

if search.bytesize <= 8
return byte_index_2to8bytes(offset, pointer, end_pointer, search)
return index_short(UInt64, offset, pointer, end_pointer, search) { 1 }
end

head_pointer = pointer
Expand Down