diff --git a/base/utf16.jl b/base/utf16.jl
index 8caa3fd14f19d..134e8b8ddaaf7 100644
--- a/base/utf16.jl
+++ b/base/utf16.jl
@@ -143,3 +143,41 @@ function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}})
     while unsafe_load(p, len+1) != 0; len += 1; end
     utf16(p, len)
 end
+
+# map(fun, str::UTF16String) -> UTF16String
+#
+# Apply `fun` to every character of `str` and re-encode the results as UTF-16,
+# so that mapping over a UTF16String yields a UTF16String again.
+#
+# Throws a UnicodeError when `fun` returns something that is not a `Char`
+# (UTF_ERR_MAP_CHAR), or a `Char` that is rejected by `utf16_is_surrogate` or
+# lies above 0x10ffff (UTF_ERR_INVALID_CHAR).
+function map(fun, str::UTF16String)
+    buf = UInt16[]
+    # Reserve room for as many code units as the input data holds.
+    sizehint!(buf, length(str.data))
+    for ch in str
+        c2 = fun(ch)
+        if !isa(c2, Char)
+            throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
+        end
+        uc = reinterpret(UInt32, c2)
+        if uc < 0x10000
+            # A surrogate value must not be emitted as a lone code unit.
+            if utf16_is_surrogate(UInt16(uc))
+                throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc))
+            end
+            push!(buf, UInt16(uc))
+        elseif uc <= 0x10ffff
+            # Encode as a surrogate pair; 0xd7c0 == 0xd800 - (0x10000 >> 10),
+            # which folds the 0x10000 offset into the lead surrogate.
+            push!(buf, UInt16(0xd7c0 + (uc >> 10)))
+            push!(buf, UInt16(0xdc00 + (uc & 0x3ff)))
+        else
+            throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc))
+        end
+    end
+    # Append the terminating 0 code unit before wrapping the buffer.
+    push!(buf, 0)
+    UTF16String(buf)
+end
diff --git a/test/strings.jl b/test/strings.jl
index 3701567bae68e..855634722e0b6 100644
--- a/test/strings.jl
+++ b/test/strings.jl
@@ -1897,3 +1897,26 @@ end
 @test [c for c in "ḟøøƀäṙ"] == ['ḟ', 'ø', 'ø', 'ƀ', 'ä', 'ṙ']
 @test [i for i in eachindex("ḟøøƀäṙ")] == [1, 4, 6, 8, 10, 12]
 @test [x for x in enumerate("ḟøøƀäṙ")] == [(1, 'ḟ'), (2, 'ø'), (3, 'ø'), (4, 'ƀ'), (5, 'ä'), (6, 'ṙ')]
+
+# issue # 11464: uppercase/lowercase of UTF16String becomes a UTF8String
+# `\U` is required for U+10FFFF: `\u` consumes at most 4 hex digits, so
+# "\u10ffff" would be U+10FF followed by "ff" and skip the surrogate-pair path.
+str = "abcdef\uff\uffff\U10ffffABCDEF"
+@test typeof(uppercase("abcdef")) == ASCIIString
+@test typeof(uppercase(utf8(str))) == UTF8String
+@test typeof(uppercase(utf16(str))) == UTF16String
+@test typeof(uppercase(utf32(str))) == UTF32String
+@test typeof(lowercase("ABCDEF")) == ASCIIString
+@test typeof(lowercase(utf8(str))) == UTF8String
+@test typeof(lowercase(utf16(str))) == UTF16String
+@test typeof(lowercase(utf32(str))) == UTF32String
+@test map(identity, utf16(str)) == utf16(str)
+
+foomap(ch) = (ch > 65)
+foobar(ch) = Char(0xd800)
+foobaz(ch) = Char(0x200000)
+@test_throws UnicodeError map(foomap, utf16(str))
+@test_throws UnicodeError map(foobar, utf16(str))
+@test_throws UnicodeError map(foobaz, utf16(str))
+
+