From 30fbfc6e388a2d54416cb252cfd44198d54b23a1 Mon Sep 17 00:00:00 2001 From: ndinsmore <45537276+ndinsmore@users.noreply.github.com> Date: Fri, 3 Mar 2023 15:23:46 -0500 Subject: [PATCH] Vectorized isascii using simple loop 25+bytes/cycle for large strings (#48568) Co-authored-by: matthias314 <56549971+matthias314@users.noreply.github.com> --- base/strings/basic.jl | 32 ++++++++++++++++++++++++++++++++ base/strings/string.jl | 7 +------ base/strings/substring.jl | 2 ++ test/strings/basic.jl | 26 ++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 6 deletions(-) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index ebd6907d7e96c1..2609edeaaaa18b 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -613,6 +613,38 @@ isascii(c::Char) = bswap(reinterpret(UInt32, c)) < 0x80 isascii(s::AbstractString) = all(isascii, s) isascii(c::AbstractChar) = UInt32(c) < 0x80 +@inline function _isascii(code_units::AbstractVector{CU}, first, last) where {CU} + r = zero(CU) + for n = first:last + @inbounds r |= code_units[n] + end + return 0 ≤ r < 0x80 +end + +# The chunking algorithm makes the last two chunks overlap in order to keep the chunk size fixed +@inline function _isascii_chunks(chunk_size,cu::AbstractVector{CU}, first,last) where {CU} + n=first + while n <= last - chunk_size + _isascii(cu,n,n+chunk_size-1) || return false + n += chunk_size + end + return _isascii(cu,last-chunk_size+1,last) +end +""" + isascii(cu::AbstractVector{CU}) where {CU <: Integer} -> Bool + +Test whether all values in the vector belong to the ASCII character set (0x00 to 0x7f). +This function is intended to be used by other string implementations that need a fast ASCII check. 
+""" +function isascii(cu::AbstractVector{CU}) where {CU <: Integer} + chunk_size = 1024 + chunk_threshold = chunk_size + (chunk_size ÷ 2) + first = firstindex(cu); last = lastindex(cu) + l = last - first + 1 + l < chunk_threshold && return _isascii(cu,first,last) + return _isascii_chunks(chunk_size,cu,first,last) +end + ## string map, filter ## function map(f, s::AbstractString) diff --git a/base/strings/string.jl b/base/strings/string.jl index 3d8db74d7b7956..59241223f4d49d 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -326,12 +326,7 @@ end isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i -function isascii(s::String) - @inbounds for i = 1:ncodeunits(s) - codeunit(s, i) >= 0x80 && return false - end - return true -end +isascii(s::String) = isascii(codeunits(s)) """ repeat(c::AbstractChar, r::Integer) -> String diff --git a/base/strings/substring.jl b/base/strings/substring.jl index baaea038b2cfe0..76658b377c7b4f 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -92,6 +92,8 @@ function getindex(s::SubString, i::Integer) @inbounds return getindex(s.string, s.offset + i) end +isascii(ss::SubString{String}) = isascii(codeunits(ss)) + function isvalid(s::SubString, i::Integer) ib = true @boundscheck ib = checkbounds(Bool, s, i) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index ed3a0fe858051f..168a01caac2070 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -1125,6 +1125,32 @@ end @test sprint(summary, "") == "empty String" end +@testset "isascii" begin + N = 1 + @test isascii("S"^N) == true + @test isascii("S"^(N - 1)) == true + @test isascii("S"^(N + 1)) == true + + @test isascii("λ" * ("S"^(N))) == false + @test isascii(("S"^(N)) * "λ") == false + + for p = 1:16 + N = 2^p + @test isascii("S"^N) == true + @test isascii("S"^(N - 1)) == true + @test isascii("S"^(N + 1)) == true + + @test isascii("λ" * ("S"^(N))) == false + @test isascii(("S"^(N)) * "λ") == false + 
+ @test isascii("λ"*("S"^(N - 1))) == false + @test isascii(("S"^(N - 1)) * "λ") == false + if N > 4 + @test isascii("λ" * ("S"^(N - 3))) == false + @test isascii(("S"^(N - 3)) * "λ") == false + end + end +end + @testset "Plug holes in test coverage" begin @test_throws MethodError checkbounds(Bool, "abc", [1.0, 2.0])