Skip to content

Commit 30fbfc6

Browse files
ndinsmore and matthias314
authored and committed
Vectorized isascii using simple loop 25+bytes/cycle for large strings (#48568)
Co-authored-by: matthias314 <56549971+matthias314@users.noreply.github.com>
1 parent 046f4bb commit 30fbfc6

File tree

4 files changed

+61
-6
lines changed

4 files changed

+61
-6
lines changed

base/strings/basic.jl

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,38 @@ isascii(c::Char) = bswap(reinterpret(UInt32, c)) < 0x80
613613
# Generic string fallback: a string is ASCII when every one of its characters is.
function isascii(s::AbstractString)
    return all(isascii, s)
end

# Generic character fallback: a character is ASCII when its `UInt32` conversion
# is below 0x80.
function isascii(c::AbstractChar)
    return UInt32(c) < 0x80
end
615615

616+
# Return `true` when every element of `code_units[first:last]` lies in 0x00:0x7f.
#
# All elements are OR-ed into a single accumulator of the element type `CU` and the
# range test is applied once after the loop, so the loop body has no early exit.
# An empty range (`first > last`) leaves the accumulator at zero and yields `true`.
#
# Elements are read with `@inbounds`: the caller must guarantee that `first:last`
# lies within the bounds of `code_units`.
@inline function _isascii(code_units::AbstractVector{CU}, first, last) where {CU}
    r = zero(CU)
    for n = first:last
        @inbounds r |= code_units[n]
    end
    # The lower bound matters for signed element types: OR-ing in a negative value
    # makes `r` negative, which must be rejected as well.
    return 0 <= r < 0x80
end
623+
624+
# Check `cu[first:last]` chunk by chunk, `chunk_size` elements at a time, and return
# `false` as soon as one chunk contains a non-ASCII value.
#
# The chunking algorithm makes the last two chunks overlap in order to keep the
# size fixed: the final call always covers the last `chunk_size` elements of the
# range, even if part of them was already checked by the loop.
@inline function _isascii_chunks(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    # A range shorter than one chunk cannot host the fixed-size tail chunk: its start
    # index would fall before `first`, and `_isascii` reads with `@inbounds`.
    # Check such a range in a single pass instead.
    last - first + 1 < chunk_size && return _isascii(cu, first, last)
    n = first
    while n <= last - chunk_size
        _isascii(cu, n, n + chunk_size - 1) || return false
        n += chunk_size
    end
    return _isascii(cu, last - chunk_size + 1, last)
end
633+
"""
634+
isascii(cu::AbstractVector{CU}) where {CU <: Integer} -> Bool
635+
636+
Test whether all values in the vector belong to the ASCII character set (0x00 to 0x7f).
637+
This function is intended to be used by other string implementations that need a fast ASCII check.
638+
"""
639+
function isascii(cu::AbstractVector{CU}) where {CU <: Integer}
    chunk_size = 1024
    # Chunking is only used once the input is at least one and a half chunks long.
    chunk_threshold = chunk_size + (chunk_size ÷ 2)
    lo, hi = firstindex(cu), lastindex(cu)
    if hi - lo + 1 < chunk_threshold
        # Short input: a single pass over the whole index range.
        return _isascii(cu, lo, hi)
    end
    return _isascii_chunks(chunk_size, cu, lo, hi)
end
647+
616648
## string map, filter ##
617649

618650
function map(f, s::AbstractString)

base/strings/string.jl

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -326,12 +326,7 @@ end
326326

327327
# An index is valid when it is within bounds and `thisind` maps it to itself.
function isvalid(s::String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end
328328

329-
# Scan the code units of `s` in order and stop at the first one that is 0x80 or
# above; a string with no such code unit (including the empty string) is ASCII.
function isascii(s::String)
    total = ncodeunits(s)
    @inbounds for idx in 1:total
        if codeunit(s, idx) >= 0x80
            return false
        end
    end
    return true
end
329+
# Delegate to the vector method, applied to the string's code units.
function isascii(s::String)
    units = codeunits(s)
    return isascii(units)
end
335330

336331
"""
337332
repeat(c::AbstractChar, r::Integer) -> String

base/strings/substring.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ function getindex(s::SubString, i::Integer)
9292
@inbounds return getindex(s.string, s.offset + i)
9393
end
9494

95+
# Delegate to the vector method, applied to the substring's code units.
function isascii(ss::SubString{String})
    units = codeunits(ss)
    return isascii(units)
end
96+
9597
function isvalid(s::SubString, i::Integer)
9698
ib = true
9799
@boundscheck ib = checkbounds(Bool, s, i)

test/strings/basic.jl

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1125,6 +1125,32 @@ end
11251125
@test sprint(summary, "") == "empty String"
11261126
end
11271127

1128+
@testset "isascii" begin
    # `ascii_run(k)` is a pure-ASCII string of `k` characters; "λ" is the non-ASCII
    # character attached to the front or the back to make a string fail the check.
    ascii_run(k) = "S"^k

    # Shortest inputs: lengths 1, 0 and 2, then "λ" on either side of one character.
    @test isascii(ascii_run(1))
    @test isascii(ascii_run(0))
    @test isascii(ascii_run(2))

    @test !isascii("λ" * ascii_run(1))
    @test !isascii(ascii_run(1) * "λ")

    # Powers of two from 2 to 2^16 and their neighbours, so that lengths on both
    # sides of each power of two are exercised.
    for exponent in 1:16
        len = 2^exponent
        for k in (len, len - 1, len + 1)
            @test isascii(ascii_run(k))
        end

        for k in (len, len - 1)
            @test !isascii("λ" * ascii_run(k))
            @test !isascii(ascii_run(k) * "λ")
        end
        if len > 4
            @test !isascii("λ" * ascii_run(len - 3))
            @test !isascii(ascii_run(len - 3) * "λ")
        end
    end
end
1153+
11281154
@testset "Plug holes in test coverage" begin
11291155
@test_throws MethodError checkbounds(Bool, "abc", [1.0, 2.0])
11301156

0 commit comments

Comments
 (0)