diff --git a/NEWS.md b/NEWS.md index efcef4228c83f..0347dd7f42456 100644 --- a/NEWS.md +++ b/NEWS.md @@ -461,6 +461,11 @@ Library improvements * `Char` is now a subtype of `AbstractChar`, and most of the functions that take character arguments now accept any `AbstractChar` ([#26286]). + * `String(array)` now accepts an arbitrary `AbstractVector{UInt8}`. For `Vector` + inputs, it "steals" the memory buffer, leaving them with an empty buffer which + is guaranteed not to be shared with the `String` object. For other types of vectors + (in particular immutable vectors), a copy is made and the input is not truncated ([#26093]). + * `Irrational` is now a subtype of `AbstractIrrational` ([#24245]). * Introduced the `empty` function, the functional pair to `empty!` which returns a new, @@ -1398,6 +1403,7 @@ Command-line option changes [#26009]: https://github.com/JuliaLang/julia/issues/26009 [#26071]: https://github.com/JuliaLang/julia/issues/26071 [#26080]: https://github.com/JuliaLang/julia/issues/26080 +[#26093]: https://github.com/JuliaLang/julia/issues/26093 [#26149]: https://github.com/JuliaLang/julia/issues/26149 [#26154]: https://github.com/JuliaLang/julia/issues/26154 [#26156]: https://github.com/JuliaLang/julia/issues/26156 diff --git a/base/array.jl b/base/array.jl index 53cb97ed7fe17..3bb6c89d00e7a 100644 --- a/base/array.jl +++ b/base/array.jl @@ -942,7 +942,7 @@ function resize!(a::Vector, nl::Integer) l = length(a) if nl > l ccall(:jl_array_grow_end, Cvoid, (Any, UInt), a, nl-l) - else + elseif nl != l if nl < 0 throw(ArgumentError("new length must be ≥ 0")) end diff --git a/base/strings/string.jl b/base/strings/string.jl index 100364b0f8207..88a5f264c5118 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -16,19 +16,22 @@ const ByteArray = Union{Vector{UInt8},Vector{Int8}} # String constructor docstring from boot.jl, workaround for #16730 # and the unavailability of @doc in boot.jl context. """ - String(v::Vector{UInt8}) - -Create a new `String` from a vector `v` of bytes containing -UTF-8 encoded characters. This function takes "ownership" of -the array, which means that you should not subsequently modify -`v` (since strings are supposed to be immutable in Julia) for -as long as the string exists. - -If you need to subsequently modify `v`, use `String(copy(v))` instead. + String(v::AbstractVector{UInt8}) + +Create a new `String` object from a byte vector `v` containing UTF-8 encoded +characters. If `v` is `Vector{UInt8}` it will be truncated to zero length and +future modification of `v` cannot affect the contents of the resulting string. +To avoid truncation use `String(copy(v))`. + +When possible, the memory of `v` will be used without copying when the `String` +object is created. This is guaranteed to be the case for byte vectors returned +by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to +[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings. +In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway +to guarantee consistent behavior. """ -function String(v::Array{UInt8,1}) - ccall(:jl_array_to_string, Ref{String}, (Any,), v) -end +String(v::AbstractVector{UInt8}) = String(copyto!(StringVector(length(v)), v)) +String(v::Vector{UInt8}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v) """ unsafe_string(p::Ptr{UInt8}, [length::Integer]) @@ -64,8 +67,6 @@ unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{V (::Type{Vector{UInt8}})(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s) -String(a::AbstractVector{UInt8}) = String(copyto!(StringVector(length(a)), a)) - String(s::CodeUnits{UInt8,String}) = s.s ## low-level functions ## diff --git a/src/array.c b/src/array.c index b5f21b98372ef..b139b0935429f 100644 --- a/src/array.c +++ b/src/array.c @@ -434,13 +434,14 @@ JL_DLLEXPORT jl_array_t *jl_pchar_to_array(const char *str, size_t len) JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a) { + size_t len = jl_array_len(a); if (a->flags.how == 3 && a->offset == 0 && a->elsize == 1 && (jl_array_ndims(a) != 1 || - ((a->maxsize + sizeof(void*) + 1 <= GC_MAX_SZCLASS) == (jl_array_len(a) + sizeof(void*) + 1 <= GC_MAX_SZCLASS)))) { + ((a->maxsize + sizeof(void*) + 1 <= GC_MAX_SZCLASS) == (len + sizeof(void*) + 1 <= GC_MAX_SZCLASS)))) { jl_value_t *o = jl_array_data_owner(a); if (jl_is_string(o)) { a->flags.isshared = 1; - *(size_t*)o = jl_array_len(a); + *(size_t*)o = len; a->nrows = 0; #ifdef STORE_ARRAY_LEN a->length = 0; @@ -449,7 +450,12 @@ JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a) return o; } } - return jl_pchar_to_string((const char*)jl_array_data(a), jl_array_len(a)); + a->nrows = 0; +#ifdef STORE_ARRAY_LEN + a->length = 0; +#endif + a->maxsize = 0; + return jl_pchar_to_string((const char*)jl_array_data(a), len); } JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len) @@ -1016,6 +1022,8 @@ JL_DLLEXPORT void jl_array_del_beg(jl_array_t *a, size_t dec) jl_bounds_error_int((jl_value_t*)a, dec); if (__unlikely(a->flags.isshared)) array_try_unshare(a); + if (dec == 0) + return; jl_array_del_at_beg(a, 0, dec, n); } @@ -1026,6 +1034,8 @@ JL_DLLEXPORT void jl_array_del_end(jl_array_t *a, size_t dec) jl_bounds_error_int((jl_value_t*)a, 0); if (__unlikely(a->flags.isshared)) array_try_unshare(a); + if (dec == 0) + return; jl_array_del_at_end(a, n - dec, dec, n); } diff --git a/test/strings/basic.jl b/test/strings/basic.jl index a63d618c2633c..030fa3be66f06 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -3,8 +3,21 @@ using Random @testset "constructors" begin - @test String([0x61,0x62,0x63,0x21]) == "abc!" + v = [0x61,0x62,0x63,0x21] + @test String(v) == "abc!" && isempty(v) @test String("abc!") == "abc!" + @test String(0x61:0x63) == "abc" + + # Check that resizing empty source vector does not corrupt string + b = IOBuffer() + write(b, "ab") + x = take!(b) + s = String(x) + resize!(x, 0) + empty!(x) # Another method which must be tested + @test s == "ab" + resize!(x, 1) + @test s == "ab" @test isempty(string()) @test eltype(GenericString) == Char