@@ -3312,15 +3312,20 @@ class String
3312
3312
# Update rolling hash for Rabin-Karp algorithm `String#index`.
#
# Expands to *n* unrolled copies of a single rolling step. Each step reads the
# byte at `head_pointer` (the byte leaving the hashed window) into `byte`,
# folds the byte at `pointer` (the byte entering the window) into `hash`, and
# then advances both pointers by one byte.
#
# *n* is the only macro argument; `hash`, `pow`, `pointer`, `head_pointer` and
# `PRIME_RK` are referenced by name and must already exist where the macro is
# expanded.
3313
3313
private macro update_hash (n )
3314
3314
{% for i in 1 ..n % }
3315
# NOTE(review): the removed lines below only re-read `byte` for steps after the
# first (the caller pre-loaded it); the added line re-reads it on every step.
- {% if i != 1 % }
3316
- byte = head_pointer.value
3317
- {% end % }
3315
+ byte = head_pointer.value
3318
3316
# `&*`, `&+` and `&-` are the wrapping operators, so this arithmetic wraps
# around instead of raising on overflow. `pow &* byte` removes the contribution
# of the outgoing byte from the hash.
hash = hash &* PRIME_RK &+ pointer.value &- pow &* byte
3319
3317
pointer += 1
3320
3318
head_pointer += 1
3321
3319
{% end % }
3322
3320
end
3323
3321
3322
# Advances the shift-based hash used for short needles (the visible call site
# uses it when `search.bytesize <= 4`) by *n* bytes.
#
# Each expanded step shifts `hash` left by one byte and ORs in the byte at
# `pointer`, then moves `pointer` forward by one. Unlike `update_hash`, no
# outgoing byte is subtracted and `head_pointer` is not touched here: the
# visible call site masks `hash` with `mask` before comparing and advances
# `head_pointer` itself.
#
# `hash` and `pointer` are referenced by name and must already exist where the
# macro is expanded.
+ private macro update_simplehash (n )
3323
+ {% for i in 1 ..n % }
3324
# Multiplier is effectively 256: make room for one byte, then append it.
+ hash = (hash << 8 ) | pointer.value
3325
+ pointer += 1
3326
+ {% end % }
3327
+ end
3328
+
3324
3329
# Returns the index of the _first_ occurrence of *search* in the string, or `nil` if not present.
3325
3330
# If *offset* is present, it defines the position to start the search.
3326
3331
#
@@ -3360,13 +3365,6 @@ class String
3360
3365
# Rabin-Karp algorithm
3361
3366
# https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
3362
3367
3363
- # calculate a rolling hash of search text (needle)
3364
- search_hash = 0 u32
3365
- search.each_byte do |b |
3366
- search_hash = search_hash &* PRIME_RK &+ b
3367
- end
3368
- pow = PRIME_RK &** search.bytesize
3369
-
3370
3368
# Find start index with offset
3371
3369
char_index = 0
3372
3370
pointer = to_unsafe
@@ -3377,24 +3375,63 @@ class String
3377
3375
char_index += 1
3378
3376
end
3379
3377
3380
- head_pointer = pointer
3378
+ return if pointer + search.bytesize > end_pointer
3381
3379
3382
- # calculate a rolling hash of this text (haystack)
3380
+ if search.bytesize == 1
3381
+ byte = search.to_unsafe[0 ]
3382
+ while pointer < end_pointer
3383
+ return char_index if pointer.value == byte
3384
+ pointer += String .char_bytesize_at(pointer)
3385
+ char_index += 1
3386
+ end
3387
+ return nil
3388
+ end
3389
+
3390
+ head_pointer = pointer
3391
+ search_hash = 0 u32
3383
3392
hash = 0 u32
3384
- hash_end_pointer = pointer + search.bytesize
3385
- return if hash_end_pointer > end_pointer
3386
- while pointer < hash_end_pointer
3393
+
3394
+ if search.bytesize <= 4
3395
+ # simplified version with multiplier == 256
3396
+ mask = 0 u32
3397
+ search.each_byte do |b |
3398
+ search_hash = (search_hash << 8 ) | b
3399
+ hash = (hash << 8 ) | pointer.value
3400
+ mask = (mask << 8 ) | 0xff
3401
+ pointer += 1
3402
+ end
3403
+
3404
+ while true
3405
+ return char_index if (hash & mask) == search_hash
3406
+
3407
+ char_bytesize = String .char_bytesize_at(head_pointer)
3408
+ return if pointer + char_bytesize > end_pointer
3409
+ case char_bytesize
3410
+ when 1 then update_simplehash 1
3411
+ when 2 then update_simplehash 2
3412
+ when 3 then update_simplehash 3
3413
+ else update_simplehash 4
3414
+ end
3415
+
3416
+ head_pointer += char_bytesize
3417
+ char_index += 1
3418
+ end
3419
+ end
3420
+
3421
+ # calculate a rolling hash of search text (needle) and this text (haystack)
3422
+ search.each_byte do |b |
3423
+ search_hash = search_hash &* PRIME_RK &+ b
3387
3424
hash = hash &* PRIME_RK &+ pointer.value
3388
3425
pointer += 1
3389
3426
end
3427
+ pow = PRIME_RK &** search.bytesize
3390
3428
3391
3429
while true
3392
3430
# check hash equality and real string equality
3393
3431
if hash == search_hash && head_pointer.memcmp(search.to_unsafe, search.bytesize) == 0
3394
3432
return char_index
3395
3433
end
3396
3434
3397
- byte = head_pointer.value
3398
3435
char_bytesize = String .char_bytesize_at(head_pointer)
3399
3436
return if pointer + char_bytesize > end_pointer
3400
3437
case char_bytesize
@@ -3688,27 +3725,46 @@ class String
3688
3725
return if offset < 0
3689
3726
3690
3727
return bytesize < offset ? nil : offset if search.empty?
3728
+ return byte_index(search.to_unsafe[0 ], offset) if search.bytesize == 1
3691
3729
3692
3730
# Rabin-Karp algorithm
3693
3731
# https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
3694
3732
3695
- # calculate a rolling hash of search text (needle)
3733
+ pointer = to_unsafe + offset
3734
+ end_pointer = to_unsafe + bytesize
3735
+ return if pointer + search.bytesize > end_pointer
3736
+
3696
3737
search_hash = 0 u32
3697
- search.each_byte do |b |
3698
- search_hash = search_hash &* PRIME_RK &+ b
3738
+ hash = 0 u32
3739
+
3740
+ if search.bytesize <= 4
3741
+ # simplified version with multiplier == 256
3742
+ mask = 0 u32
3743
+ search.each_byte do |b |
3744
+ search_hash = (search_hash << 8 ) | b
3745
+ hash = (hash << 8 ) | pointer.value
3746
+ mask = (mask << 8 ) | 0xff
3747
+ pointer += 1
3748
+ end
3749
+
3750
+ while true
3751
+ return offset if (hash & mask) == search_hash
3752
+ return if pointer >= end_pointer
3753
+ hash = (hash << 8 ) | pointer.value
3754
+ pointer += 1
3755
+ offset += 1
3756
+ end
3699
3757
end
3700
- pow = PRIME_RK &** search.bytesize
3701
3758
3702
- # calculate a rolling hash of this text (haystack)
3703
- pointer = head_pointer = to_unsafe + offset
3704
- hash_end_pointer = pointer + search.bytesize
3705
- end_pointer = to_unsafe + bytesize
3706
- hash = 0 u32
3707
- return if hash_end_pointer > end_pointer
3708
- while pointer < hash_end_pointer
3759
+ head_pointer = pointer
3760
+
3761
+ # calculate a rolling hash of search text (needle) and this text (haystack)
3762
+ search.each_byte do |b |
3763
+ search_hash = search_hash &* PRIME_RK &+ b
3709
3764
hash = hash &* PRIME_RK &+ pointer.value
3710
3765
pointer += 1
3711
3766
end
3767
+ pow = PRIME_RK &** search.bytesize
3712
3768
3713
3769
while true
3714
3770
# check hash equality and real string equality
0 commit comments