diff --git a/NEWS.md b/NEWS.md index c280cc864979d..b2df3ca485fb5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -308,6 +308,12 @@ This section lists changes that do not have deprecation warnings. `AbstractArray` types that specialized broadcasting using the old internal API will need to switch to the new API. ([#20740]) + * The `RevString` type has been removed from the language; `reverse(::String)` returns + a `String` with code points (or fragments thereof) in reverse order. In general, + `reverse(s)` should return a string of the same type and encoding as `s` with code + points in reverse order; any string type overrides `reverse` to return a different + type of string must also override `reverseind` to compute reversed indices correctly. + Library improvements -------------------- @@ -409,6 +415,15 @@ Library improvements * The `keys` of an `Associative` are now an `AbstractSet`. `Base.KeyIterator{<:Associative}` has been changed to `KeySet{K, <:Associative{K}} <: AbstractSet{K}` ([#24580]). + * New function `ncodeunits(s::AbstractString)` gives the number of code units in a string. + The generic definition is constant time but calls `endof(s)` which may be inefficient. + Therefore custom string types may want to define direct `ncodeunits` methods. + + * `reverseind(s::AbstractString, i::Integer)` now has an efficient generic fallback, so + custom string types do not need to provide their own efficient defintions. The generic + definition relies on `ncodeunits` however, so for optimal performance you may need to + define a custom method for that function. + Compiler/Runtime improvements ----------------------------- diff --git a/base/exports.jl b/base/exports.jl index 31af9f6ce7813..9756109b9263f 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -88,7 +88,6 @@ export Rational, Regex, RegexMatch, - RevString, RoundFromZero, RoundDown, RoundingMode, @@ -756,6 +755,7 @@ export lstrip, match, matchall, + ncodeunits, ndigits, nextind, normalize_string, diff --git a/base/precompile.jl b/base/precompile.jl index c7aefb8991d6b..d191f3dac0a73 100644 --- a/base/precompile.jl +++ b/base/precompile.jl @@ -578,9 +578,6 @@ precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.LineEdit.PromptState, precompile(Tuple{typeof(Base.LineEdit.input_string_newlines_aftercursor), Base.LineEdit.PromptState}) precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.REPL.REPLCompletionProvider, Base.LineEdit.PromptState}) precompile(Tuple{getfield(Base, Symbol("#kw##parse")), Array{Any, 1}, typeof(Base.parse), String}) -precompile(Tuple{typeof(Base.isvalid), Base.RevString{String}, Int64}) -precompile(Tuple{typeof(Base.nextind), Base.RevString{String}, Int64}) -precompile(Tuple{typeof(Base.search), Base.RevString{String}, Array{Char, 1}, Int64}) precompile(Tuple{typeof(Base.rsearch), String, Array{Char, 1}, Int64}) precompile(Tuple{getfield(Base.REPLCompletions, Symbol("#kw##find_start_brace")), Array{Any, 1}, typeof(Base.REPLCompletions.find_start_brace), String}) precompile(Tuple{typeof(Core.Inference.isbits), Tuple{Void, Void, Void}}) diff --git a/base/repl/REPLCompletions.jl b/base/repl/REPLCompletions.jl index 5523ab544ca3d..3e5056d613f26 100644 --- a/base/repl/REPLCompletions.jl +++ b/base/repl/REPLCompletions.jl @@ -225,7 +225,7 @@ end # closed start brace from the end of the string. function find_start_brace(s::AbstractString; c_start='(', c_end=')') braces = 0 - r = RevString(s) + r = reverse(s) i = start(r) in_single_quotes = false in_double_quotes = false @@ -245,18 +245,21 @@ function find_start_brace(s::AbstractString; c_start='(', c_end=')') in_back_ticks = true end else - if !in_back_ticks && !in_double_quotes && c == '\'' && !done(r, i) && next(r, i)[1]!='\\' + if !in_back_ticks && !in_double_quotes && + c == '\'' && !done(r, i) && next(r, i)[1] != '\\' in_single_quotes = !in_single_quotes - elseif !in_back_ticks && !in_single_quotes && c == '"' && !done(r, i) && next(r, i)[1]!='\\' + elseif !in_back_ticks && !in_single_quotes && + c == '"' && !done(r, i) && next(r, i)[1] != '\\' in_double_quotes = !in_double_quotes - elseif !in_single_quotes && !in_double_quotes && c == '`' && !done(r, i) && next(r, i)[1]!='\\' + elseif !in_single_quotes && !in_double_quotes && + c == '`' && !done(r, i) && next(r, i)[1] != '\\' in_back_ticks = !in_back_ticks end end braces == 1 && break end braces != 1 && return 0:-1, -1 - method_name_end = reverseind(r, i) + method_name_end = reverseind(s, i) startind = nextind(s, rsearch(s, non_identifier_chars, method_name_end)) return (startind:endof(s), method_name_end) end diff --git a/base/shell.jl b/base/shell.jl index b8923ced43d04..72ffd23d9a944 100644 --- a/base/shell.jl +++ b/base/shell.jl @@ -14,7 +14,7 @@ function shell_parse(str::AbstractString, interpolate::Bool=true; special::AbstractString="") s = lstrip(str) # strips the end but respects the space when the string ends with "\\ " - r = RevString(s) + r = reverse(s) i = start(r) c_old = nothing while !done(r,i) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index cd7ea3cd8ea98..734f1cc6f9041 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -69,6 +69,10 @@ julia> 'j' * "ulia" one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "") +# generic number of code units; implementations generally know how long a string +# is though and should override this with a more efficient method +ncodeunits(s::AbstractString) = nextind(s, endof(s)) - 1 + """ length(s::AbstractString) @@ -233,11 +237,11 @@ end ## Generic indexing functions ## """ - thisind(str::AbstractString, i::Integer) + thisind(s::AbstractString, i::Integer) -Get the largest valid string index at or before `i`. -Returns `0` if there is no valid string index at or before `i`. -Returns `endof(str)` if `i≥endof(str)`. +If `i` is the index into a character in `s` then `thisind` returns the index of the +start of that character. If `i < start(s)` then it returns `start(s) - 1`. +If `i > ncodeunits(s)` then it returns `ncodeunits(s) + 1`. # Examples ```jldoctest @@ -253,15 +257,21 @@ julia> thisind("αβγdef", 3) julia> thisind("αβγdef", 4) 3 -julia> thisind("αβγdef", 20) +julia> thisind("αβγdef", 9) 9 + +julia> thisind("αβγdef", 10) +10 + +julia> thisind("αβγdef", 20) +10 """ function thisind(s::AbstractString, i::Integer) j = Int(i) isvalid(s, j) && return j j < start(s) && return 0 - e = endof(s) - j >= endof(s) && return e + n = ncodeunits(s) + j > n && return n + 1 prevind(s, j) end diff --git a/base/strings/search.jl b/base/strings/search.jl index 23f813ea28b26..43e880a26b9e5 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -194,12 +194,6 @@ end search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i) search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i) -function rsearch(s::AbstractString, c::Chars) - j = search(RevString(s), c) - j == 0 && return 0 - endof(s)-j+1 -end - """ rsearch(s::AbstractString, chars::Chars, [start::Integer]) @@ -212,44 +206,50 @@ julia> rsearch("aaabbb","b") 6:6 ``` """ -function rsearch(s::AbstractString, c::Chars, i::Integer) - e = endof(s) - j = search(RevString(s), c, e-i+1) - j == 0 && return 0 - e-j+1 +function rsearch(s::AbstractString, c::Chars, i::Integer=start(s)) + if i < 1 + return i == 0 ? 0 : throw(BoundsError(s, i)) + end + n = ncodeunits(s) + if i > n + return i == n+1 ? 0 : throw(BoundsError(s, i)) + end + # r[reverseind(r,i)] == reverse(r)[i] == s[i] + # s[reverseind(s,j)] == reverse(s)[j] == r[j] + r = reverse(s) + j = search(r, c, reverseind(r, i)) + j == 0 ? 0 : reverseind(s, j) end function _rsearchindex(s, t, i) if isempty(t) - return 1 <= i <= nextind(s,endof(s)) ? i : + return 1 <= i <= nextind(s, endof(s)) ? i : throw(BoundsError(s, i)) end - t = RevString(t) - rs = RevString(s) + t = reverse(t) + rs = reverse(s) l = endof(s) - t1, j2 = next(t,start(t)) + t1, j2 = next(t, start(t)) while true - i = rsearch(s,t1,i) - if i == 0 return 0 end - c, ii = next(rs,l-i+1) + i = rsearch(s, t1, i) + i == 0 && return 0 + c, ii = next(rs, reverseind(rs, i)) j = j2; k = ii matched = true - while !done(t,j) - if done(rs,k) + while !done(t, j) + if done(rs, k) matched = false break end - c, k = next(rs,k) - d, j = next(t,j) + c, k = next(rs, k) + d, j = next(t, j) if c != d matched = false break end end - if matched - return nextind(s,l-k+1) - end - i = l-ii+1 + matched && return nextind(s, reverseind(s, k)) + i = reverseind(s, ii) end end diff --git a/base/strings/string.jl b/base/strings/string.jl index c5c998f7f4419..e66f876a5f77d 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -87,6 +87,20 @@ codeunit(s::AbstractString, i::Integer) @gc_preserve s unsafe_load(pointer(s, i)) end +""" + ncodeunits(s::AbstractString) + +The number of code units in a string. For example, for UTF-8-like data such as +the default `String` type, the number of code units is the number of bytes in +the string, a.k.a. `sizeof(s)`. For a UTF-16 encoded string type, however, the +code unit is `UInt16` so the number of code units is the number of `UInt16` +words in the representation of the string. The expression `codeunit(s, i)` is +valid and safe for precisely the range of `i` values `1:ncodeunits(s)`. + +See also: [`codeunit`](@ref). +""" +ncodeunits(s::String) = sizeof(s) + write(io::IO, s::String) = @gc_preserve s unsafe_write(io, pointer(s), reinterpret(UInt, sizeof(s))) @@ -109,8 +123,8 @@ end function thisind(s::String, i::Integer) j = Int(i) j < 1 && return 0 - e = endof(s) - j >= e && return e + n = ncodeunits(s) + j > n && return n + 1 @inbounds while j > 0 && is_valid_continuation(codeunit(s,j)) j -= 1 end @@ -281,14 +295,6 @@ function first_utf8_byte(ch::Char) return b end -function reverseind(s::String, i::Integer) - j = sizeof(s) + 1 - i - @inbounds while is_valid_continuation(codeunit(s, j)) - j -= 1 - end - return j -end - ## overload methods for efficiency ## isvalid(s::String, i::Integer) = @@ -463,38 +469,6 @@ function string(a::Union{String,Char}...) return out end -function reverse(s::String) - dat = Vector{UInt8}(s) - n = length(dat) - n <= 1 && return s - buf = StringVector(n) - out = n - pos = 1 - @inbounds while out > 0 - ch = dat[pos] - if ch > 0xdf - if ch < 0xf0 - (out -= 3) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) - buf[out + 1], buf[out + 2], buf[out + 3] = ch, dat[pos + 1], dat[pos + 2] - pos += 3 - else - (out -= 4) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) - buf[out+1], buf[out+2], buf[out+3], buf[out+4] = ch, dat[pos+1], dat[pos+2], dat[pos+3] - pos += 4 - end - elseif ch > 0x7f - (out -= 2) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) - buf[out + 1], buf[out + 2] = ch, dat[pos + 1] - pos += 2 - else - buf[out] = ch - out -= 1 - pos += 1 - end - end - String(buf) -end - function repeat(s::String, r::Integer) r < 0 && throw(ArgumentError("can't repeat a string $r times")) n = sizeof(s) diff --git a/base/strings/strings.jl b/base/strings/strings.jl index 0acad00f79ca7..1e442a09e406f 100644 --- a/base/strings/strings.jl +++ b/base/strings/strings.jl @@ -1,7 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license include("strings/errors.jl") -include("strings/types.jl") +include("strings/substring.jl") include("strings/basic.jl") include("strings/search.jl") include("strings/util.jl") diff --git a/base/strings/types.jl b/base/strings/substring.jl similarity index 79% rename from base/strings/types.jl rename to base/strings/substring.jl index 9e45b09f19311..d1bf33e4123fb 100644 --- a/base/strings/types.jl +++ b/base/strings/substring.jl @@ -1,9 +1,5 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -# SubString and RevString types - -## substrings reference original strings ## - """ SubString(s::AbstractString, i::Integer, j::Integer=endof(s)) SubString(s::AbstractString, r::UnitRange{<:Integer}) @@ -51,6 +47,9 @@ end SubString(s::AbstractString) = SubString(s, 1, endof(s)) SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, endof(s)) +convert(::Type{SubString{S}}, s::AbstractString) where {S<:AbstractString} = + SubString(convert(S, s)) + String(p::SubString{String}) = unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1) @@ -88,16 +87,16 @@ end function thisind(s::SubString{String}, i::Integer) j = Int(i) - j < 1 && return 0 - e = endof(s) - j >= e && return e + j < start(s) && return 0 + n = ncodeunits(s) + j > n && return n + 1 offset = s.offset str = s.string j += offset @inbounds while j > offset && is_valid_continuation(codeunit(str, j)) j -= 1 end - j-offset + j - offset end nextind(s::SubString, i::Integer) = nextind(s.string, i+s.offset)-s.offset @@ -123,32 +122,18 @@ function unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where R<:Union{Int convert(Ptr{R}, pointer(s.string)) + s.offset end -## reversed strings without data movement ## - -struct RevString{T<:AbstractString} <: AbstractString - string::T -end - -endof(s::RevString) = endof(s.string) -length(s::RevString) = length(s.string) -sizeof(s::RevString) = sizeof(s.string) - -function next(s::RevString, i::Int) - n = endof(s); j = n-i+1 - (s.string[j], n-prevind(s.string,j)+1) -end - """ reverse(s::AbstractString) -> AbstractString -Reverses a string. - -Technically, this function reverses the codepoints in a string, and its +Reverses a string. Technically, this function reverses the codepoints in a string and its main utility is for reversed-order string processing, especially for reversed -regular-expression searches. See also [`reverseind`](@ref) to convert indices -in `s` to indices in `reverse(s)` and vice-versa, and [`graphemes`](@ref) -to operate on user-visible "characters" (graphemes) rather than codepoints. -See also [`Iterators.reverse`](@ref) for reverse-order iteration without making a copy. +regular-expression searches. See also [`reverseind`](@ref) to convert indices in `s` to +indices in `reverse(s)` and vice-versa, and [`graphemes`](@ref) to operate on user-visible +"characters" (graphemes) rather than codepoints. See also [`Iterators.reverse`](@ref) for +reverse-order iteration without making a copy. Custom string types must implement the +`reverse` function themselves and should typically return a string with the same type +and encoding. If they return a string with a different encoding, they must also override +`reverseind` for that string type to satisfy `s[reverseind(s,i)] == reverse(s)[i]`. # Examples ```jldoctest @@ -162,10 +147,15 @@ julia> join(reverse(collect(graphemes("ax̂e")))) # reverses graphemes "ex̂a" ``` """ -reverse(s::AbstractString) = RevString(s) -reverse(s::RevString) = s.string - -## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)] +function reverse(s::Union{String,SubString{String}})::String + sprint() do io + i, j = start(s), endof(s) + while i ≤ j + c, j = s[j], prevind(s, j) + write(io, c) + end + end +end """ reverseind(v, i) @@ -185,10 +175,7 @@ julia> for i in 1:length(r) Julia ``` """ -reverseind(s::AbstractString, i) = chr2ind(s, length(s) + 1 - ind2chr(reverse(s), i)) -reverseind(s::RevString, i::Integer) = endof(s) - i + 1 -reverseind(s::SubString{String}, i::Integer) = - reverseind(s.string, nextind(s.string, endof(s.string))-s.offset-s.endof+i-1) - s.offset +reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1) """ repeat(s::AbstractString, r::Integer) diff --git a/base/strings/util.jl b/base/strings/util.jl index ace3d93bcb24f..db230a16da0c6 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -190,16 +190,15 @@ julia> rstrip(a) ``` """ function rstrip(s::AbstractString, chars::Chars=_default_delims) - r = RevString(s) - i = start(r) - while !done(r,i) - c, j = next(r,i) - if !(c in chars) - return SubString(s, 1, endof(s)-i+1) - end + a = start(s) + i = endof(s) + while a ≤ i + c = s[i] + j = prevind(s, i) + c in chars || return SubString(s, 1:i) i = j end - SubString(s, 1, 0) + SubString(s, a, a-1) end """ diff --git a/doc/src/stdlib/strings.md b/doc/src/stdlib/strings.md index 36ed901262f92..ec561b8a810a6 100644 --- a/doc/src/stdlib/strings.md +++ b/doc/src/stdlib/strings.md @@ -39,7 +39,7 @@ Base.rsearch Base.searchindex Base.rsearchindex Base.contains(::AbstractString, ::AbstractString) -Base.reverse(::AbstractString) +Base.reverse(::Union{String,SubString{String}}) Base.replace Base.split Base.rsplit diff --git a/stdlib/Test/src/Test.jl b/stdlib/Test/src/Test.jl index ac97ef36a0e23..2bd86ddaefa00 100644 --- a/stdlib/Test/src/Test.jl +++ b/stdlib/Test/src/Test.jl @@ -1387,6 +1387,9 @@ struct GenericString <: AbstractString end Base.endof(s::GenericString) = endof(s.string) Base.next(s::GenericString, i::Int) = next(s.string, i) +Base.reverse(s::GenericString) = GenericString(reverse(s.string)) +Base.reverse(s::SubString{GenericString}) = + GenericString(typeof(s.string)(reverse(String(s)))) """ The `GenericSet` can be used to test generic set APIs that program to diff --git a/test/replcompletions.jl b/test/replcompletions.jl index a45fb05c7d28e..9698b1622fdf9 100644 --- a/test/replcompletions.jl +++ b/test/replcompletions.jl @@ -3,7 +3,7 @@ using Base.REPLCompletions let ex = quote - module CompletionFoo + module CompletionFoo mutable struct Test_y yy end diff --git a/test/strings/basic.jl b/test/strings/basic.jl index cac55ddae374a..384da5d8a70f3 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -580,13 +580,14 @@ end @test thisind(s, 6) == 6 @test thisind(s, 15) == 15 @test thisind(s, 16) == 15 - @test thisind(s, 30) == 15 + @test thisind(s, 17) == 17 + @test thisind(s, 30) == 17 end end let strs = Any["", s"", SubString("123", 2, 1), SubString(s"123", 2, 1)] for s in strs, i in -2:2 - @test thisind(s, i) == 0 + @test thisind(s, i) == (i > 0) end end end @@ -696,3 +697,11 @@ end @test String(take!(b)) == "UnicodeError: invalid character index 2 (0xba is a continuation byte)" end +@testset "ncodeunits" begin + for (s, n) in ["" => 0, "a" => 1, "abc" => 3, + "α" => 2, "abγ" => 4, "∀" => 3, + "∀x∃y" => 8, "🍕" => 4, "🍕∀" => 7] + @test ncodeunits(s) == n + @test ncodeunits(GenericString(s)) == n + end +end diff --git a/test/strings/types.jl b/test/strings/types.jl index a9ce276c02961..12dd75a1bd421 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -1,6 +1,6 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -## SubString, RevString and Cstring tests ## +## SubString and Cstring tests ## ## SubString tests ## u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" @@ -207,31 +207,13 @@ let s = "|η(α)-ϕ(κ)| < ε" @test length(SubString(s,4,11))==length(s[4:11]) end -## Reverse strings ## - -let rs = RevString("foobar") - @test length(rs) == 6 - @test sizeof(rs) == 6 - @test isascii(rs) -end - -# issue #4586 -@test rsplit(RevString("ailuj"),'l') == ["ju","ia"] -@test parse(Float64,RevString("64")) === 46.0 - -# reverseind -for T in (String, GenericString) +@testset "reverseind" for T in (String, SubString, GenericString) for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1") for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4") for c in ('X', 'δ', '\U0001d6a5') s = convert(T, string(prefix, c, suffix)) r = reverse(s) ri = search(r, c) - @test r == RevString(s) - @test c == s[reverseind(s, ri)] == r[ri] - s = RevString(s) - r = reverse(s) - ri = search(r, c) @test c == s[reverseind(s, ri)] == r[ri] s = convert(T, string(prefix, prefix, c, suffix, suffix)) pre = convert(T, prefix) @@ -244,6 +226,18 @@ for T in (String, GenericString) end end +@testset "reverseind of empty strings" begin + for s in ("", + SubString("", 1, 0), + SubString("ab", 1, 0), + SubString("ab", 2, 1), + SubString("ab", 3, 2), + GenericString("")) + @test reverseind(s, 0) == 1 + @test reverseind(s, 1) == 0 + end +end + ## Cstring tests ## # issue #13974: comparison against pointers diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl index bdb9664fc26fb..a9db6316d2fa9 100644 --- a/test/unicode/utf8.jl +++ b/test/unicode/utf8.jl @@ -27,16 +27,16 @@ end @test reverse("a") == "a" @test reverse("abc") == "cba" @test reverse("xyz\uff\u800\uffff\U10ffff") == "\U10ffff\uffff\u800\uffzyx" - for str in [ - b"xyz\xc1", - b"xyz\xd0", - b"xyz\xe0", - b"xyz\xed\x80", - b"xyz\xf0", - b"xyz\xf0\x80", - b"xyz\xf0\x80\x80" + for (s, r) in [ + b"xyz\xc1" => b"\xc1zyx", + b"xyz\xd0" => b"\xd0zyx", + b"xyz\xe0" => b"\xe0zyx", + b"xyz\xed\x80" => b"\xed\x80zyx", + b"xyz\xf0" => b"\xf0zyx", + b"xyz\xf0\x80" => b"\xf0\x80zyx", + b"xyz\xf0\x80\x80" => b"\xf0\x80\x80zyx", ] - @test_throws UnicodeError reverse(String(str)) + @test_broken reverse(String(s)) == String(r) end end