From aba86a16a2cab1156aec1a9c2dc22927fb788fc2 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Mon, 7 Jun 2021 19:01:59 -0400 Subject: [PATCH] implement replace on String for multiple patterns (#40484) This has been attempted before, sometimes fairly similar to this, but the attempts seemed to be either too simple or too complicated. This aims to be simple, and even beats one of the "handwritten" benchmark cases. Past issues (e.g. #25396) have proposed that using Regex may be faster, but in my tests, this handily bests even simplified regexes. There can be slow Regex patterns that can cause this to exhibit O(n^2) behavior, but only if one of the earlier patterns is a partial match for a later Regex pattern and that Regex always matches O(n) of the input stream. This is a case that is hopefully usually avoidable in practice. fixes #35327 fixes #39061 fixes #35414 fixes #29849 fixes #30457 fixes #25396 --- NEWS.md | 4 + base/regex.jl | 20 +++-- base/set.jl | 1 - base/strings/util.jl | 84 ++++++++++++++------- test/strings/util.jl | 172 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 245 insertions(+), 36 deletions(-) diff --git a/NEWS.md b/NEWS.md index cc488312bbc70d..fc90e8a45594e7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -121,6 +121,10 @@ Standard library changes * Some degree trigonometric functions, `sind`, `cosd`, `tand`, `asind`, `acosd`, `asecd`, `acscd`, `acotd`, `atand` now accept an square matrix ([#39758]). * A backslash before a newline in command literals now always removes the newline, similar to standard string literals, whereas the result was not well-defined before. ([#40753]) +* `replace(::String)` now allows multiple patterns to be specified, and they + will be applied left-to-right simultaneously, so only one pattern will be + applied to any character, and the patterns will only be applied to the input + text, not the replacements. 
([#40484]) #### Package Manager diff --git a/base/regex.jl b/base/regex.jl index 82e2042552ee43..15744fe14ce47d 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -589,7 +589,7 @@ _free_pat_replacer(r::RegexAndMatchData) = PCRE.free_match_data(r.match_data) replace_err(repl) = error("Bad replacement string: $repl") -function _write_capture(io, re::RegexAndMatchData, group) +function _write_capture(io::IO, group::Int, str, r, re::RegexAndMatchData) len = PCRE.substring_length_bynumber(re.match_data, group) # in the case of an optional group that doesn't match, len == 0 len == 0 && return @@ -598,6 +598,11 @@ function _write_capture(io, re::RegexAndMatchData, group) pointer(io.data, io.ptr), len+1) io.ptr += len io.size = max(io.size, io.ptr - 1) + nothing +end +function _write_capture(io::IO, group::Int, str, r, re) + group == 0 || replace_err("pattern is not a Regex") + return print(io, SubString(str, r)) end @@ -605,7 +610,7 @@ const SUB_CHAR = '\\' const GROUP_CHAR = 'g' const KEEP_ESC = [SUB_CHAR, GROUP_CHAR, '0':'9'...] 
-function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData) +function _replace(io, repl_s::SubstitutionString, str, r, re) LBRACKET = '<' RBRACKET = '>' repl = unescape_string(repl_s.string, KEEP_ESC) @@ -629,7 +634,7 @@ function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData) break end end - _write_capture(io, re, group) + _write_capture(io, group, str, r, re) elseif repl[next_i] == GROUP_CHAR i = nextind(repl, next_i) if i > e || repl[i] != LBRACKET @@ -642,15 +647,16 @@ function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData) i = nextind(repl, i) i > e && replace_err(repl) end - # TODO: avoid this allocation groupname = SubString(repl, groupstart, prevind(repl, i)) if all(isdigit, groupname) - _write_capture(io, re, parse(Int, groupname)) - else + group = parse(Int, groupname) + elseif re isa RegexAndMatchData group = PCRE.substring_number_from_name(re.re.regex, groupname) group < 0 && replace_err("Group $groupname not found in regex $(re.re)") - _write_capture(io, re, group) + else + group = -1 end + _write_capture(io, group, str, r, re) i = nextind(repl, i) else replace_err(repl) diff --git a/base/set.jl b/base/set.jl index 0c8a8b95b10cea..5a744c556432c3 100644 --- a/base/set.jl +++ b/base/set.jl @@ -621,7 +621,6 @@ replace!(a::Callable, b::Pair; count::Integer=-1) = throw(MethodError(replace!, replace!(a::Callable, b::Pair, c::Pair; count::Integer=-1) = throw(MethodError(replace!, (a, b, c))) replace(a::Callable, b::Pair; count::Integer=-1) = throw(MethodError(replace, (a, b))) replace(a::Callable, b::Pair, c::Pair; count::Integer=-1) = throw(MethodError(replace, (a, b, c))) -replace(a::AbstractString, b::Pair, c::Pair) = throw(MethodError(replace, (a, b, c))) ### replace! 
for AbstractDict/AbstractSet diff --git a/base/strings/util.jl b/base/strings/util.jl index 411961d8221e75..c6dad5f34bafb1 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -526,56 +526,74 @@ _replace(io, repl::Function, str, r, pattern) = _replace(io, repl::Function, str, r, pattern::Function) = print(io, repl(str[first(r)])) -replace(str::String, pat_repl::Pair{<:AbstractChar}; count::Integer=typemax(Int)) = - replace(str, isequal(first(pat_repl)) => last(pat_repl); count=count) - -replace(str::String, pat_repl::Pair{<:Union{Tuple{Vararg{AbstractChar}}, - AbstractVector{<:AbstractChar},Set{<:AbstractChar}}}; - count::Integer=typemax(Int)) = - replace(str, in(first(pat_repl)) => last(pat_repl), count=count) - _pat_replacer(x) = x _free_pat_replacer(x) = nothing -function replace(str::String, pat_repl::Pair; count::Integer=typemax(Int)) - pattern, repl = pat_repl +_pat_replacer(x::AbstractChar) = isequal(x) +_pat_replacer(x::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}}) = in(x) + +function replace(str::String, pat_repl::Vararg{Pair,N}; count::Integer=typemax(Int)) where N count == 0 && return str count < 0 && throw(DomainError(count, "`count` must be non-negative.")) n = 1 - e = lastindex(str) + e1 = nextind(str, lastindex(str)) # sizeof(str) i = a = firstindex(str) - pattern = _pat_replacer(pattern) - r = something(findnext(pattern,str,i), 0) - j, k = first(r), last(r) - if j == 0 - _free_pat_replacer(pattern) + patterns = map(p -> _pat_replacer(first(p)), pat_repl) + replaces = map(last, pat_repl) + rs = map(patterns) do p + r = findnext(p, str, a) + if r === nothing || first(r) == 0 + return e1+1:0 + end + r isa Int && (r = r:r) # findnext / performance fix + return r + end + if all(>(e1), map(first, rs)) + foreach(_free_pat_replacer, patterns) return str end out = IOBuffer(sizehint=floor(Int, 1.2sizeof(str))) - while j != 0 + while true + p = argmin(map(first, rs)) # TODO: or argmin(rs), to pick the 
shortest first match ? + r = rs[p] + j, k = first(r), last(r) + j > e1 && break if i == a || i <= k + # copy out preserved portion GC.@preserve str unsafe_write(out, pointer(str, i), UInt(j-i)) - _replace(out, repl, str, r, pattern) + # copy out replacement string + _replace(out, replaces[p], str, r, patterns[p]) end if k < j i = j - j > e && break + j == e1 && break k = nextind(str, j) else i = k = nextind(str, k) end - r = something(findnext(pattern,str,k), 0) - r === 0:-1 || n == count && break - j, k = first(r), last(r) + n == count && break + let k = k + rs = map(patterns, rs) do p, r + if first(r) < k + r = findnext(p, str, k) + if r === nothing || first(r) == 0 + return e1+1:0 + end + r isa Int && (r = r:r) # findnext / performance fix + end + return r + end + end n += 1 end - _free_pat_replacer(pattern) - write(out, SubString(str,i)) - String(take!(out)) + foreach(_free_pat_replacer, patterns) + write(out, SubString(str, i)) + return String(take!(out)) end + """ - replace(s::AbstractString, pat=>r; [count::Integer]) + replace(s::AbstractString, pat=>r, [pat2=>r2, ...]; [count::Integer]) Search for the given pattern `pat` in `s`, and replace each occurrence with `r`. If `count` is provided, replace at most `count` occurrences. @@ -588,6 +606,13 @@ If `pat` is a regular expression and `r` is a [`SubstitutionString`](@ref), then references in `r` are replaced with the corresponding matched text. To remove instances of `pat` from `string`, set `r` to the empty `String` (`""`). +Multiple patterns can be specified, and they will be applied left-to-right +simultaneously, so only one pattern will be applied to any character, and the +patterns will only be applied to the input text, not the replacements. + +!!! compat "Julia 1.7" + Support for multiple patterns requires version 1.7. 
+ # Examples ```jldoctest julia> replace("Python is a programming language.", "Python" => "Julia") @@ -601,10 +626,13 @@ julia> replace("The quick foxes run quickly.", "quick" => "", count=1) julia> replace("The quick foxes run quickly.", r"fox(es)?" => s"bus\\1") "The quick buses run quickly." + +julia> replace("abcabc", "a" => "b", "b" => "c", r".+" => "a") +"bca" ``` """ -replace(s::AbstractString, pat_f::Pair; count=typemax(Int)) = - replace(String(s), pat_f, count=count) +replace(s::AbstractString, pat_f::Pair...; count=typemax(Int)) = + replace(String(s), pat_f..., count=count) # TODO: allow transform as the first argument to replace? diff --git a/test/strings/util.jl b/test/strings/util.jl index b0b5a0c36b8ef6..3cf434feab113a 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -312,6 +312,178 @@ end end +@testset "replace many" begin + # PR 35414 Francesco Alemanno + @test replace("foobarbaz", "oo" => "zz", "ar" => "zz", "z" => "m") == "fzzbzzbam" + substmp=["z" => "m", "oo" => "zz", "ar" => "zz"] + for perm in [[1, 2, 3], [2, 1, 3], [3, 2, 1], [2, 3, 1], [1, 3, 2], [3, 1, 2]] + @test replace("foobarbaz", substmp[perm]...) 
== "fzzbzzbam" + @test replace("foobarbaz", substmp[perm]..., count=2) == "fzzbzzbaz" + @test replace("foobarbaz", substmp[perm]..., count=1) == "fzzbarbaz" + end + @test replace("foobarbaz", "z" => "m", r"a.*a" => uppercase) == "foobARBAm" + @test replace("foobarbaz", 'o' => 'z', 'a' => 'q', 'z' => 'm') == "fzzbqrbqm" + + + # PR #25732 Klaus Crusius + @test replace("\u2202", '*' => '\0', "" => "") == "\u2202" + + @test replace("foobar", 'o' => '0', "" => "") == "f00bar" + @test replace("foobar", 'o' => '0', count=1, "" => "") == "foobar" + @test replace("foobar", 'o' => '0', count=2, "" => "") == "f0obar" + @test replace("foobar", 'o' => "", "" => "") == "fbar" + @test replace("foobar", 'o' => "", count=1, "" => "") == "foobar" + @test replace("foobar", 'o' => "", count=2, "" => "") == "fobar" + @test replace("foobar", 'f' => 'F', "" => "") == "Foobar" + @test replace("foobar", 'r' => 'R', "" => "") == "foobaR" + + @test replace("foofoofoo", "foo" => "bar", "" => "") == "barbarbar" + @test replace("foobarfoo", "foo" => "baz", "" => "") == "bazbarbaz" + @test replace("barfoofoo", "foo" => "baz", "" => "") == "barbazbaz" + + @test replace("", "" => "", "" => "") == "" + @test replace("", "" => "x", "" => "") == "x" + @test replace("", "x" => "y", "" => "") == "" + + @test replace("abcd", "" => "^", "" => "") == "^a^b^c^d^" + @test replace("abcd", "b" => "^", "" => "") == "a^cd" + @test replace("abcd", r"b?" => "^", "" => "") == "^a^c^d^" + @test replace("abcd", r"b+" => "^", "" => "") == "a^cd" + @test replace("abcd", r"b?c?" => "^", "" => "") == "^a^d^" + @test replace("abcd", r"[bc]?" 
=> "^", "" => "") == "^a^^d^" + + @test replace("foobarfoo", r"(fo|ba)" => "xx", "" => "") == "xxoxxrxxo" + @test replace("foobarfoo", r"(foo|ba)" => "bar", "" => "") == "barbarrbar" + + @test replace("foobar", 'o' => 'ø', "" => "") == "føøbar" + @test replace("foobar", 'o' => 'ø', count=2, "" => "") == "føobar" + @test replace("føøbar", 'ø' => 'o', "" => "") == "foobar" + @test replace("føøbar", 'ø' => 'o', count=2, "" => "") == "foøbar" + @test replace("føøbar", 'ø' => 'ö', "" => "") == "fööbar" + @test replace("føøbar", 'ø' => 'ö', count=2, "" => "") == "föøbar" + @test replace("føøbar", 'ø' => "", "" => "") == "fbar" + @test replace("føøbar", 'ø' => "", count=2, "" => "") == "føbar" + @test replace("føøbar", 'f' => 'F', "" => "") == "Føøbar" + @test replace("ḟøøbar", 'ḟ' => 'F', "" => "") == "Føøbar" + @test replace("føøbar", 'f' => 'Ḟ', "" => "") == "Ḟøøbar" + @test replace("ḟøøbar", 'ḟ' => 'Ḟ', "" => "") == "Ḟøøbar" + @test replace("føøbar", 'r' => 'R', "" => "") == "føøbaR" + @test replace("føøbaṙ", 'ṙ' => 'R', "" => "") == "føøbaR" + @test replace("føøbar", 'r' => 'Ṙ', "" => "") == "føøbaṘ" + @test replace("føøbaṙ", 'ṙ' => 'Ṙ', "" => "") == "føøbaṘ" + + @test replace("ḟøøḟøøḟøø", "ḟøø" => "bar", "" => "") == "barbarbar" + @test replace("ḟøøbarḟøø", "ḟøø" => "baz", "" => "") == "bazbarbaz" + @test replace("barḟøøḟøø", "ḟøø" => "baz", "" => "") == "barbazbaz" + + @test replace("foofoofoo", "foo" => "ƀäṙ", "" => "") == "ƀäṙƀäṙƀäṙ" + @test replace("fooƀäṙfoo", "foo" => "baz", "" => "") == "bazƀäṙbaz" + @test replace("ƀäṙfoofoo", "foo" => "baz", "" => "") == "ƀäṙbazbaz" + + @test replace("foofoofoo", "foo" => "bar", "" => "") == "barbarbar" + @test replace("foobarfoo", "foo" => "ƀäż", "" => "") == "ƀäżbarƀäż" + @test replace("barfoofoo", "foo" => "ƀäż", "" => "") == "barƀäżƀäż" + + @test replace("ḟøøḟøøḟøø", "ḟøø" => "ƀäṙ", "" => "") == "ƀäṙƀäṙƀäṙ" + @test replace("ḟøøƀäṙḟøø", "ḟøø" => "baz", "" => "") == "bazƀäṙbaz" + @test replace("ƀäṙḟøøḟøø", "ḟøø" => "baz", 
"" => "") == "ƀäṙbazbaz" + + @test replace("ḟøøḟøøḟøø", "ḟøø" => "bar", "" => "") == "barbarbar" + @test replace("ḟøøbarḟøø", "ḟøø" => "ƀäż", "" => "") == "ƀäżbarƀäż" + @test replace("barḟøøḟøø", "ḟøø" => "ƀäż", "" => "") == "barƀäżƀäż" + + @test replace("ḟøøḟøøḟøø", "ḟøø" => "ƀäṙ", "" => "") == "ƀäṙƀäṙƀäṙ" + @test replace("ḟøøƀäṙḟøø", "ḟøø" => "ƀäż", "" => "") == "ƀäżƀäṙƀäż" + @test replace("ƀäṙḟøøḟøø", "ḟøø" => "ƀäż", "" => "") == "ƀäṙƀäżƀäż" + + @test replace("", "" => "ẍ", "" => "") == "ẍ" + @test replace("", "ẍ" => "ÿ", "" => "") == "" + + @test replace("äƀçđ", "" => "π", "" => "") == "πäπƀπçπđπ" + @test replace("äƀçđ", "ƀ" => "π", "" => "") == "äπçđ" + @test replace("äƀçđ", r"ƀ?" => "π", "" => "") == "πäπçπđπ" + @test replace("äƀçđ", r"ƀ+" => "π", "" => "") == "äπçđ" + @test replace("äƀçđ", r"ƀ?ç?" => "π", "" => "") == "πäπđπ" + @test replace("äƀçđ", r"[ƀç]?" => "π", "" => "") == "πäππđπ" + + @test replace("foobarfoo", r"(fo|ba)" => "ẍẍ", "" => "") == "ẍẍoẍẍrẍẍo" + + @test replace("ḟøøbarḟøø", r"(ḟø|ba)" => "xx", "" => "") == "xxøxxrxxø" + @test replace("ḟøøbarḟøø", r"(ḟøø|ba)" => "bar", "" => "") == "barbarrbar" + + @test replace("fooƀäṙfoo", r"(fo|ƀä)" => "xx", "" => "") == "xxoxxṙxxo" + @test replace("fooƀäṙfoo", r"(foo|ƀä)" => "ƀäṙ", "" => "") == "ƀäṙƀäṙṙƀäṙ" + + @test replace("ḟøøƀäṙḟøø", r"(ḟø|ƀä)" => "xx", "" => "") == "xxøxxṙxxø" + @test replace("ḟøøƀäṙḟøø", r"(ḟøø|ƀä)" => "ƀäṙ", "" => "") == "ƀäṙƀäṙṙƀäṙ" + + @test replace("foo", "oo" => uppercase, "" => "") == "fOO" + + # Issue 13332 + @test replace("abc", 'b' => 2.1, "" => "") == "a2.1c" + + # test replace with a count for String and GenericString + # check that replace is a no-op if count==0 + for s in ["aaa", Test.GenericString("aaa")] + @test_throws DomainError replace(s, 'a' => "", count = -1, "" => "") + @test replace(s, 'a' => 'z', count=0, "" => "")::String == s + @test replace(s, 'a' => 'z', count=1, "" => "") == "zaa" + @test replace(s, 'a' => 'z', count=2, "" => "") == "zza" + @test 
replace(s, 'a' => 'z', count=3, "" => "") == "zzz" + @test replace(s, 'a' => 'z', count=4, "" => "") == "zzz" + @test replace(s, 'a' => 'z', count=typemax(Int), "" => "") == "zzz" + @test replace(s, 'a' => 'z', "" => "") == "zzz" + end + + let s = "abc" + @test replace(s) === s + @test replace(s, 'a' => 'z', "" => "") === "zbc" + @test replace(s, 'a' => 'z', 'b' => 'y') == "zyc" + @test replace(s, 'a' => 'z', 'c' => 'x', "b" => 'y') == "zyx" + @test replace(s, '1' => 'z', "" => "") == s + @test replace(s, 'b' => "BbB", "" => "", count=2) == "aBbBc" + end + + let s = "quick quicker quickest" + @test replace(s) === s + @test replace(s, "quickest" => 'z', "quicker" => uppercase, "quick" => 'a') == "a QUICKER z" + @test replace(s, "quick" => 'a', "quicker" => uppercase, "quickest" => 'z') == "a aer aest" + @test replace(s, "quickest" => "lame", "quicker" => "is", "quick" => "Duck", count=2) == "Duck is quickest" + @test "1q1u1i1c1k1 1q1u1i1c1k1e1r1 1q1u1i1c1k1e1s1t1" == + replace(s, "" => '1', "" => "") == + replace(s, "" => '1', "" => '2') + @test replace(s, "qu" => "QU", "qu" => "never happens", "ick" => "") == "QU QUer QUest" + @test replace(s, " " => '_', "r " => "r-") == "quick_quicker-quickest" + @test replace(s, r"[aeiou]" => "ä", "ui" => "ki", "i" => "I") == "qääck qääckär qääckäst" + @test replace(s, "i" => "I", "ui" => "ki", r"[aeiou]" => "ä") == "qkick qkickär qkickäst" + @test replace(s, r"[^ ]+" => "word", "quicker " => "X", count=big"99") == "word word word" + @test replace(s, "quicker " => "X", r"[^ ]+" => "word", count=big"99") == "word Xword" + + @test replace(s, r"(quick)(e)" => s"\2-\1", "x" => "X") == "quick e-quickr e-quickst" + + @test replace(s, 'q' => 'Q', 'u' => 'U') == "QUick QUicker QUickest" + @test replace(s, 'q' => 'Q', r"u" => 'U') == "QUick QUicker QUickest" + @test replace(s, 'q' => 'Q', ==('u') => uppercase) == "QUick QUicker QUickest" + @test replace(s, 'q' => 'Q', islowercase => '-') == "Q---- Q------ Q-------" + @test replace(s, 
['q', 'u'] => 'K') == "KKick KKicker KKickest" + @test replace(s, occursin("uq") => 'K') == "KKick KKicker KKickest" + @test replace(s, ==('q') => "B") == "Buick Buicker Buickest" + + @test replace(s, "qui" => "A", 'r' => 'R') == "Ack AckeR Ackest" + @test replace(s, 'r' => 'x', islowercase => uppercase) == "QUICK QUICKEx QUICKEST" + @test replace(s, islowercase => uppercase, 'r' => 'x') == "QUICK QUICKER QUICKEST" + @test replace(s, "q" => "z", islowercase => uppercase, 'r' => 'x') == "zUICK zUICKER zUICKEST" + @test replace(s, "qui" => "A", 'r' => 'x', islowercase => uppercase) == "ACK ACKEx ACKEST" + @test replace(s, "qui" => "A", 'r' => 'x', islowercase => uppercase) == "ACK ACKEx ACKEST" + @test replace(s, r"q" => "z", islowercase => uppercase, 'r' => 'x') == "zUICK zUICKER zUICKEST" + + @test replace(s, "q" => s"a\0b") == "aqbuick aqbuicker aqbuickest" + @test replace(s, "q" => s"a\0b\n\\\g<0>") == "aqb\n\\quick aqb\n\\quicker aqb\n\\quickest" + @test_throws ErrorException("PCRE error: unknown substring") replace(s, r"q" => s"a\1b") + @test_throws ErrorException("Bad replacement string: pattern is not a Regex") replace(s, "q" => s"a\1b") + end +end + @testset "chomp/chop" begin @test chomp("foo\n") == "foo" @test chomp("fo∀\n") == "fo∀"