Skip to content

Commit

Permalink
Vectorized isascii using simple loop 25+bytes/cycle for large strings (
Browse files Browse the repository at this point in the history
…#48568)


Co-authored-by: matthias314 <[email protected]>
  • Loading branch information
2 people authored and pull[bot] committed Nov 26, 2023
1 parent 046f4bb commit 30fbfc6
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 6 deletions.
32 changes: 32 additions & 0 deletions base/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,38 @@ isascii(c::Char) = bswap(reinterpret(UInt32, c)) < 0x80
# Generic string fallback: a string is ASCII exactly when each of its characters is.
function isascii(s::AbstractString)
    return all(isascii, s)
end

# Generic character fallback: ASCII code points are those below 0x80.
function isascii(c::AbstractChar)
    return UInt32(c) < 0x80
end

# OR-reduce the elements of `code_units` at indices `first:last` and report whether the
# combined value lies in the ASCII range (0x00 to 0x7f).
#
# The loop accumulates with `|` rather than returning early, so its body contains no
# data-dependent branch. Any element with a bit set at or above 0x80 leaves that bit set
# in the accumulator; for signed element types a negative element makes the accumulator
# negative, which the lower bound of the final comparison rejects.
#
# Element access is `@inbounds`: callers must pass indices that are valid for
# `code_units`. An empty range (`last < first`) yields `true`.
@inline function _isascii(code_units::AbstractVector{CU}, first, last) where {CU}
    r = zero(CU)
    for n = first:last
        @inbounds r |= code_units[n]
    end
    return 0 <= r < 0x80
end

# The chunking algorithm makes the last two chunks overlap in order to keep the chunk size fixed.
# Scan `cu[first:last]` in fixed-size pieces of `chunk_size` elements, stopping at the
# first piece that contains a non-ASCII value.
#
# The final piece always covers the last `chunk_size` elements, so it may overlap the
# piece before it; this keeps every call to `_isascii` the same length.
# NOTE(review): the final call starts at `last - chunk_size + 1`, so the range is assumed
# to hold at least `chunk_size` elements — the visible caller only dispatches here for
# longer inputs.
@inline function _isascii_chunks(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    start = first
    limit = last - chunk_size
    while start <= limit
        if !_isascii(cu, start, start + chunk_size - 1)
            return false
        end
        start += chunk_size
    end
    return _isascii(cu, limit + 1, last)
end
"""
    isascii(cu::AbstractVector{CU}) where {CU <: Integer} -> Bool

Return `true` if every value in the vector belongs to the ASCII character set
(0x00 to 0x7f).

This method is intended for other string implementations that need a fast ASCII check
over their code units.
"""
function isascii(cu::AbstractVector{CU}) where {CU <: Integer}
    chunk_size = 1024
    chunk_threshold = chunk_size + (chunk_size ÷ 2)
    lo, hi = firstindex(cu), lastindex(cu)
    # Inputs shorter than one and a half chunks are scanned in a single pass; longer
    # inputs go chunk by chunk so the scan can stop at the first non-ASCII chunk.
    if hi - lo + 1 < chunk_threshold
        return _isascii(cu, lo, hi)
    else
        return _isascii_chunks(chunk_size, cu, lo, hi)
    end
end

## string map, filter ##

function map(f, s::AbstractString)
Expand Down
7 changes: 1 addition & 6 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -326,12 +326,7 @@ end

# An index is valid when it is in bounds and `thisind` maps it to itself.
function isvalid(s::String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end

# Scan the code units of `s` one at a time, stopping at the first one that is 0x80 or above.
function isascii(s::String)
    i, n = 1, ncodeunits(s)
    @inbounds while i <= n
        codeunit(s, i) < 0x80 || return false
        i += 1
    end
    return true
end
# Check the code-unit view of `s` using the `isascii` method for integer vectors.
function isascii(s::String)
    return isascii(codeunits(s))
end

"""
repeat(c::AbstractChar, r::Integer) -> String
Expand Down
2 changes: 2 additions & 0 deletions base/strings/substring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ function getindex(s::SubString, i::Integer)
@inbounds return getindex(s.string, s.offset + i)
end

# Check the code-unit view of the substring using the `isascii` method for integer vectors.
function isascii(ss::SubString{String})
    return isascii(codeunits(ss))
end

function isvalid(s::SubString, i::Integer)
ib = true
@boundscheck ib = checkbounds(Bool, s, i)
Expand Down
26 changes: 26 additions & 0 deletions test/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1125,6 +1125,32 @@ end
@test sprint(summary, "") == "empty String"
end

# Exercise `isascii` on strings whose lengths sit just below, at, and just above powers
# of two, with a non-ASCII character placed at the start, at the end, and in the interior
# (so a scan that proceeds piecewise must also reject a violation that is in neither the
# first nor the last piece).
@testset "isascii" begin
    for p = 0:16
        N = 2^p
        @test isascii("S"^N)
        @test isascii("S"^(N - 1))
        @test isascii("S"^(N + 1))

        # Non-ASCII character at either end.
        @test !isascii("λ" * ("S"^N))
        @test !isascii(("S"^N) * "λ")
        @test !isascii("λ" * ("S"^(N - 1)))
        @test !isascii(("S"^(N - 1)) * "λ")
        if N > 4
            @test !isascii("λ" * ("S"^(N - 3)))
            @test !isascii(("S"^(N - 3)) * "λ")
        end

        # Non-ASCII character in the interior.
        @test !isascii(("S"^N) * "λ" * ("S"^N))

        # Substrings: the non-ASCII character is excluded in the first case, included
        # in the second.
        @test isascii(SubString(("S"^N) * "λ", 1, N))
        @test !isascii(SubString("λ" * ("S"^N), 1, N + 2))
    end

    # Lengths around 1536 code units, the point at which the vector method switches from
    # a single pass to chunked scanning (chunk size 1024 plus half a chunk).
    for N in (1535, 1536, 1537)
        @test isascii("S"^N)
        @test !isascii("λ" * ("S"^(N - 2)))
        @test !isascii(("S"^(N - 2)) * "λ")
        @test !isascii(("S"^(N ÷ 2)) * "λ" * ("S"^(N - N ÷ 2 - 2)))
    end
end

@testset "Plug holes in test coverage" begin
@test_throws MethodError checkbounds(Bool, "abc", [1.0, 2.0])

Expand Down

0 comments on commit 30fbfc6

Please sign in to comment.