From 03bd792208797f941d50f418f81251d60435c0d4 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 16 Feb 2018 22:34:53 +0100 Subject: [PATCH 1/2] Always resize input to zero length in String(::Vector{UInt8}) This makes the behavior more predictable than only resizing Vector{UInt8} inputs when they have been allocated via StringVector, as the caller may have obtained them from other code without knowing how they were created. This ensures code will not rely on the fact that a copy is made in many common cases. The behavior is also simpler to document. --- NEWS.md | 6 ++++++ base/strings/string.jl | 29 +++++++++++++++-------------- src/array.c | 12 +++++++++--- test/strings/basic.jl | 4 +++- 4 files changed, 33 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index efcef4228c83f..0347dd7f42456 100644 --- a/NEWS.md +++ b/NEWS.md @@ -461,6 +461,11 @@ Library improvements * `Char` is now a subtype of `AbstractChar`, and most of the functions that take character arguments now accept any `AbstractChar` ([#26286]). + * `String(array)` now accepts an arbitrary `AbstractVector{UInt8}`. For `Vector` + inputs, it "steals" the memory buffer, leaving them with an empty buffer which + is guaranteed not to be shared with the `String` object. For other types of vectors + (in particular immutable vectors), a copy is made and the input is not truncated ([#26093]). + * `Irrational` is now a subtype of `AbstractIrrational` ([#24245]). * Introduced the `empty` function, the functional pair to `empty!` which returns a new, @@ -1398,6 +1403,7 @@ Command-line option changes [#26009]: https://github.com/JuliaLang/julia/issues/26009 [#26071]: https://github.com/JuliaLang/julia/issues/26071 [#26080]: https://github.com/JuliaLang/julia/issues/26080 +[#26093]: https://github.com/JuliaLang/julia/issues/26093 [#26149]: https://github.com/JuliaLang/julia/issues/26149 [#26154]: https://github.com/JuliaLang/julia/issues/26154 [#26156]: https://github.com/JuliaLang/julia/issues/26156 diff --git a/base/strings/string.jl b/base/strings/string.jl index 100364b0f8207..88a5f264c5118 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -16,19 +16,22 @@ const ByteArray = Union{Vector{UInt8},Vector{Int8}} # String constructor docstring from boot.jl, workaround for #16730 # and the unavailability of @doc in boot.jl context. """ - String(v::Vector{UInt8}) - -Create a new `String` from a vector `v` of bytes containing -UTF-8 encoded characters. This function takes "ownership" of -the array, which means that you should not subsequently modify -`v` (since strings are supposed to be immutable in Julia) for -as long as the string exists. - -If you need to subsequently modify `v`, use `String(copy(v))` instead. + String(v::AbstractVector{UInt8}) + +Create a new `String` object from a byte vector `v` containing UTF-8 encoded +characters. If `v` is `Vector{UInt8}` it will be truncated to zero length and +future modification of `v` cannot affect the contents of the resulting string. +To avoid truncation use `String(copy(v))`. + +When possible, the memory of `v` will be used without copying when the `String` +object is created. This is guaranteed to be the case for byte vectors returned +by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to +[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings. +In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway +to guarantee consistent behavior. """ -function String(v::Array{UInt8,1}) - ccall(:jl_array_to_string, Ref{String}, (Any,), v) -end +String(v::AbstractVector{UInt8}) = String(copyto!(StringVector(length(v)), v)) +String(v::Vector{UInt8}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v) """ unsafe_string(p::Ptr{UInt8}, [length::Integer]) @@ -64,8 +67,6 @@ unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{V (::Type{Vector{UInt8}})(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s) -String(a::AbstractVector{UInt8}) = String(copyto!(StringVector(length(a)), a)) - String(s::CodeUnits{UInt8,String}) = s.s ## low-level functions ## diff --git a/src/array.c b/src/array.c index b5f21b98372ef..54574dbe55f0a 100644 --- a/src/array.c +++ b/src/array.c @@ -434,13 +434,14 @@ JL_DLLEXPORT jl_array_t *jl_pchar_to_array(const char *str, size_t len) JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a) { + size_t len = jl_array_len(a); if (a->flags.how == 3 && a->offset == 0 && a->elsize == 1 && (jl_array_ndims(a) != 1 || - ((a->maxsize + sizeof(void*) + 1 <= GC_MAX_SZCLASS) == (jl_array_len(a) + sizeof(void*) + 1 <= GC_MAX_SZCLASS)))) { + ((a->maxsize + sizeof(void*) + 1 <= GC_MAX_SZCLASS) == (len + sizeof(void*) + 1 <= GC_MAX_SZCLASS)))) { jl_value_t *o = jl_array_data_owner(a); if (jl_is_string(o)) { a->flags.isshared = 1; - *(size_t*)o = jl_array_len(a); + *(size_t*)o = len; a->nrows = 0; #ifdef STORE_ARRAY_LEN a->length = 0; @@ -449,7 +450,12 @@ JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a) return o; } } - return jl_pchar_to_string((const char*)jl_array_data(a), jl_array_len(a)); + a->nrows = 0; +#ifdef STORE_ARRAY_LEN + a->length = 0; +#endif + a->maxsize = 0; + return jl_pchar_to_string((const char*)jl_array_data(a), len); } JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index a63d618c2633c..67cdf5e0f92c1 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -3,8 +3,10 @@ using Random @testset "constructors" begin - @test String([0x61,0x62,0x63,0x21]) == "abc!" + v = [0x61,0x62,0x63,0x21] + @test String(v) == "abc!" && isempty(v) @test String("abc!") == "abc!" + @test String(0x61:0x63) == "abc" @test isempty(string()) @test eltype(GenericString) == Char From 63a72d1499b7c216f8361cf6cf76b35a6d0343e7 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 16 Feb 2018 23:36:39 +0100 Subject: [PATCH 2/2] Make resize! a no-op when size does not change Else we set the last element to zero for one-byte element types like UInt8, which means that resizing a vector allocated with StringVector corrupts it. Also add redundant checks in Julia code to avoid a function call. --- base/array.jl | 2 +- src/array.c | 4 ++++ test/strings/basic.jl | 11 +++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/base/array.jl b/base/array.jl index 53cb97ed7fe17..3bb6c89d00e7a 100644 --- a/base/array.jl +++ b/base/array.jl @@ -942,7 +942,7 @@ function resize!(a::Vector, nl::Integer) l = length(a) if nl > l ccall(:jl_array_grow_end, Cvoid, (Any, UInt), a, nl-l) - else + elseif nl != l if nl < 0 throw(ArgumentError("new length must be ≥ 0")) end diff --git a/src/array.c b/src/array.c index 54574dbe55f0a..b139b0935429f 100644 --- a/src/array.c +++ b/src/array.c @@ -1022,6 +1022,8 @@ JL_DLLEXPORT void jl_array_del_beg(jl_array_t *a, size_t dec) jl_bounds_error_int((jl_value_t*)a, dec); if (__unlikely(a->flags.isshared)) array_try_unshare(a); + if (dec == 0) + return; jl_array_del_at_beg(a, 0, dec, n); } @@ -1032,6 +1034,8 @@ JL_DLLEXPORT void jl_array_del_end(jl_array_t *a, size_t dec) jl_bounds_error_int((jl_value_t*)a, 0); if (__unlikely(a->flags.isshared)) array_try_unshare(a); + if (dec == 0) + return; jl_array_del_at_end(a, n - dec, dec, n); } diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 67cdf5e0f92c1..030fa3be66f06 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -8,6 +8,17 @@ using Random @test String("abc!") == "abc!" @test String(0x61:0x63) == "abc" + # Check that resizing empty source vector does not corrupt string + b = IOBuffer() + write(b, "ab") + x = take!(b) + s = String(x) + resize!(x, 0) + empty!(x) # Another method which must be tested + @test s == "ab" + resize!(x, 1) + @test s == "ab" + @test isempty(string()) @test eltype(GenericString) == Char @test firstindex("abc") == 1