From 1def7738aecdfc98e191085b7dbf46f63043bbcc Mon Sep 17 00:00:00 2001 From: Ary Borenszweig Date: Tue, 22 Nov 2016 07:45:44 -0300 Subject: [PATCH] =?UTF-8?q?Unicode:=20consider=20special=20case=20conversi?= =?UTF-8?q?ons=20like=20the=20uppercase=20of=20"=EF=AC=84"=20and=20the=20d?= =?UTF-8?q?owncase=20of=20"=C4=B0"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/generate_unicode_data.cr | 34 ++++++++ scripts/unicode_data.ecr | 34 +++++++- spec/std/string_spec.cr | 4 + src/char.cr | 37 +++++++++ src/string.cr | 12 ++- src/unicode/data.cr | 133 ++++++++++++++++++++++++++++++- src/unicode/unicode.cr | 77 ++++++++++++++++++ 7 files changed, 325 insertions(+), 6 deletions(-) diff --git a/scripts/generate_unicode_data.cr b/scripts/generate_unicode_data.cr index 4cf7e4348945..6475647d1a59 100644 --- a/scripts/generate_unicode_data.cr +++ b/scripts/generate_unicode_data.cr @@ -15,6 +15,10 @@ record Entry, upcase : Int32?, downcase : Int32? +record SpecialCase, + codepoint : Int32, + value : Array(Int32) + record CaseRange, low : Int32, high : Int32, delta : Int32 record AlternateRange, low : Int32, high : Int32 record Stride, low : Int32, high : Int32, stride : Int32 @@ -123,6 +127,8 @@ def strides(entries, targets) end entries = [] of Entry +special_cases_downcase = [] of SpecialCase +special_cases_upcase = [] of SpecialCase url = "http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt" body = HTTP::Client.get(url).body @@ -139,6 +145,34 @@ body.each_line do |line| entries << Entry.new(codepoint, name, general_category, upcase, downcase) end +url = "http://www.unicode.org/Public/9.0.0/ucd/SpecialCasing.txt" +body = HTTP::Client.get(url).body +body.each_line do |line| + line = line.strip + next if line.empty? 
+ break if line.starts_with?("# Conditional Mappings") + next if line.starts_with?('#') + + pieces = line.split(';') + codepoint = pieces[0].to_i(16) + downcase = pieces[1].split.map(&.to_i(16)) + upcase = pieces[3].split.map(&.to_i(16)) + downcase = nil if downcase.size == 1 + upcase = nil if upcase.size == 1 + if downcase + while downcase.size < 3 + downcase << 0 + end + special_cases_downcase << SpecialCase.new(codepoint, downcase) + end + if upcase + while upcase.size < 3 + upcase << 0 + end + special_cases_upcase << SpecialCase.new(codepoint, upcase) + end +end + downcase_ranges = case_ranges entries, &.downcase downcase_one_ranges, downcase_ranges = downcase_ranges.partition { |r| r.delta == 1 } diff --git a/scripts/unicode_data.ecr b/scripts/unicode_data.ecr index 0b9c98adfa3d..0a20e698a05f 100644 --- a/scripts/unicode_data.ecr +++ b/scripts/unicode_data.ecr @@ -68,10 +68,42 @@ module Unicode end <%- end %> + # Special downcase transformation that involve mapping a codepoint + # to multiple codepoints. The maximum transformation is always 3 + # codepoints, so we store them all as 3 codepoints and 0 means end. + @@special_cases_downcase : Hash(Int32, {Int32, Int32, Int32})? + private def self.special_cases_downcase + @@special_cases_downcase ||= begin + data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_downcase.size %>) + <%- special_cases_downcase.each do |a_case| -%> + put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>) + <%- end %> + data + end + end + + # Special upcase transformation that involve mapping a codepoint + # to multiple codepoints. The maximum transformation is always 3 + # codepoints, so we store them all as 3 codepoints and 0 means end. + @@special_cases_upcase : Hash(Int32, {Int32, Int32, Int32})? 
+ private def self.special_cases_upcase + @@special_cases_upcase ||= begin + data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_upcase.size %>) + <%- special_cases_upcase.each do |a_case| -%> + put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>) + <%- end %> + data + end + end + # TODO: this is needed to avoid generating lots of allocas # in LLVM, which makes LLVM really slow. The compiler should # try to avoid/reuse temporary allocas. - private def self.put(array, *values) : Nil + private def self.put(array : Array, *values) : Nil array << values end + + private def self.put(hash : Hash, key, *values) : Nil + hash[key] = values + end end diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr index 459b83a6d97f..0e796dc4856e 100644 --- a/spec/std/string_spec.cr +++ b/spec/std/string_spec.cr @@ -461,6 +461,7 @@ describe "String" do assert { "ÁÉÍÓÚĀ".downcase.should eq("áéíóúā") } assert { "AEIİOU".downcase(Unicode::CaseOptions::Turkic).should eq("aeıiou") } assert { "ÁEÍOÚ".downcase(Unicode::CaseOptions::ASCII).should eq("ÁeÍoÚ") } + assert { "İ".downcase.should eq("i̇") } end describe "upcase" do @@ -469,12 +470,15 @@ describe "String" do assert { "áéíóúā".upcase.should eq("ÁÉÍÓÚĀ") } assert { "aeıiou".upcase(Unicode::CaseOptions::Turkic).should eq("AEIİOU") } assert { "áeíoú".upcase(Unicode::CaseOptions::ASCII).should eq("áEíOú") } + assert { "baffle".upcase.should eq("BAFFLE") } + assert { "ff".upcase.should eq("FF") } end describe "capitalize" do assert { "HELLO!".capitalize.should eq("Hello!") } assert { "HELLO MAN!".capitalize.should eq("Hello man!") } assert { "".capitalize.should eq("") } + assert { "fflİ".capitalize.should eq("FFLi̇") } end describe "chomp" do diff --git a/src/char.cr b/src/char.cr index 3f1b4f4adf4d..990c44015529 100644 --- a/src/char.cr +++ b/src/char.cr @@ -366,6 +366,13 @@ struct Char # Returns the downcase equivalent of this char. 
# + # Note that this only works for characters whose downcase + # equivalent yields a single codepoint. There are a few + # characters, like 'İ', that when downcased result in multiple + # characters (in this case: 'i' and the dot mark). + # + # For a more correct method see the method that receives a block. + # # ``` # 'Z'.downcase # => 'z' # 'x'.downcase # => 'x' @@ -375,8 +382,24 @@ struct Char Unicode.downcase(self, options) end + # Yields each char for the downcase equivalent of this char. + # + # This method takes into account the possibility that a downcase + # version of a char might result in multiple chars, like for + # 'İ', which results in 'i' and a dot mark. + def downcase(options = Unicode::CaseOptions::None) + Unicode.downcase(self, options) { |char| yield char } + end + # Returns the upcase equivalent of this char. # + # Note that this only works for characters whose upcase + # equivalent yields a single codepoint. There are a few + # characters, like 'ﬄ', that when upcased result in multiple + # characters (in this case: 'F', 'F', 'L'). + # + # For a more correct method see the method that receives a block. + # # ``` # 'z'.upcase # => 'Z' # 'X'.upcase # => 'X' @@ -386,6 +409,20 @@ struct Char Unicode.upcase(self, options) end + # Yields each char for the upcase equivalent of this char. + # + # This method takes into account the possibility that an upcase + # version of a char might result in multiple chars, like for + # 'ﬄ', which results in 'F', 'F' and 'L'. + # + # ``` + # 'z'.upcase { |v| puts v } # prints 'Z' + # 'ﬄ'.upcase { |v| puts v } # prints 'F', 'F', 'L' + # ``` + def upcase(options = Unicode::CaseOptions::None) + Unicode.upcase(self, options) { |char| yield char } + end + # Returns this char's codepoint. 
def hash ord diff --git a/src/string.cr b/src/string.cr index 875d06fc7e6c..ac4b75075833 100644 --- a/src/string.cr +++ b/src/string.cr @@ -877,7 +877,9 @@ class String def downcase(options = Unicode::CaseOptions::None) String.build(bytesize) do |io| each_char do |char| - io << char.downcase(options) + char.downcase(options) do |res| + io << res + end end end end @@ -891,7 +893,9 @@ class String def upcase(options = Unicode::CaseOptions::None) String.build(bytesize) do |io| each_char do |char| - io << char.upcase(options) + char.upcase(options) do |res| + io << res + end end end end @@ -908,9 +912,9 @@ class String String.build(bytesize) do |io| each_char_with_index do |char, i| if i == 0 - io << char.upcase + char.upcase { |c| io << c } else - io << char.downcase + char.downcase { |c| io << c } end end end diff --git a/src/unicode/data.cr b/src/unicode/data.cr index 0d54e5857b32..486077b5be63 100644 --- a/src/unicode/data.cr +++ b/src/unicode/data.cr @@ -1309,10 +1309,141 @@ module Unicode end end + # Special downcase transformation that involve mapping a codepoint + # to multiple codepoints. The maximum transformation is always 3 + # codepoints, so we store them all as 3 codepoints and 0 means end. + @@special_cases_downcase : Hash(Int32, {Int32, Int32, Int32})? + private def self.special_cases_downcase + @@special_cases_downcase ||= begin + data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: 1) + put(data, 304, 105, 775, 0) + + data + end + end + + # Special upcase transformation that involve mapping a codepoint + # to multiple codepoints. The maximum transformation is always 3 + # codepoints, so we store them all as 3 codepoints and 0 means end. + @@special_cases_upcase : Hash(Int32, {Int32, Int32, Int32})? 
+ private def self.special_cases_upcase + @@special_cases_upcase ||= begin + data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: 102) + put(data, 223, 83, 83, 0) + put(data, 64256, 70, 70, 0) + put(data, 64257, 70, 73, 0) + put(data, 64258, 70, 76, 0) + put(data, 64259, 70, 70, 73) + put(data, 64260, 70, 70, 76) + put(data, 64261, 83, 84, 0) + put(data, 64262, 83, 84, 0) + put(data, 1415, 1333, 1362, 0) + put(data, 64275, 1348, 1350, 0) + put(data, 64276, 1348, 1333, 0) + put(data, 64277, 1348, 1339, 0) + put(data, 64278, 1358, 1350, 0) + put(data, 64279, 1348, 1341, 0) + put(data, 329, 700, 78, 0) + put(data, 912, 921, 776, 769) + put(data, 944, 933, 776, 769) + put(data, 496, 74, 780, 0) + put(data, 7830, 72, 817, 0) + put(data, 7831, 84, 776, 0) + put(data, 7832, 87, 778, 0) + put(data, 7833, 89, 778, 0) + put(data, 7834, 65, 702, 0) + put(data, 8016, 933, 787, 0) + put(data, 8018, 933, 787, 768) + put(data, 8020, 933, 787, 769) + put(data, 8022, 933, 787, 834) + put(data, 8118, 913, 834, 0) + put(data, 8134, 919, 834, 0) + put(data, 8146, 921, 776, 768) + put(data, 8147, 921, 776, 769) + put(data, 8150, 921, 834, 0) + put(data, 8151, 921, 776, 834) + put(data, 8162, 933, 776, 768) + put(data, 8163, 933, 776, 769) + put(data, 8164, 929, 787, 0) + put(data, 8166, 933, 834, 0) + put(data, 8167, 933, 776, 834) + put(data, 8182, 937, 834, 0) + put(data, 8064, 7944, 921, 0) + put(data, 8065, 7945, 921, 0) + put(data, 8066, 7946, 921, 0) + put(data, 8067, 7947, 921, 0) + put(data, 8068, 7948, 921, 0) + put(data, 8069, 7949, 921, 0) + put(data, 8070, 7950, 921, 0) + put(data, 8071, 7951, 921, 0) + put(data, 8072, 7944, 921, 0) + put(data, 8073, 7945, 921, 0) + put(data, 8074, 7946, 921, 0) + put(data, 8075, 7947, 921, 0) + put(data, 8076, 7948, 921, 0) + put(data, 8077, 7949, 921, 0) + put(data, 8078, 7950, 921, 0) + put(data, 8079, 7951, 921, 0) + put(data, 8080, 7976, 921, 0) + put(data, 8081, 7977, 921, 0) + put(data, 8082, 7978, 921, 0) + put(data, 
8083, 7979, 921, 0) + put(data, 8084, 7980, 921, 0) + put(data, 8085, 7981, 921, 0) + put(data, 8086, 7982, 921, 0) + put(data, 8087, 7983, 921, 0) + put(data, 8088, 7976, 921, 0) + put(data, 8089, 7977, 921, 0) + put(data, 8090, 7978, 921, 0) + put(data, 8091, 7979, 921, 0) + put(data, 8092, 7980, 921, 0) + put(data, 8093, 7981, 921, 0) + put(data, 8094, 7982, 921, 0) + put(data, 8095, 7983, 921, 0) + put(data, 8096, 8040, 921, 0) + put(data, 8097, 8041, 921, 0) + put(data, 8098, 8042, 921, 0) + put(data, 8099, 8043, 921, 0) + put(data, 8100, 8044, 921, 0) + put(data, 8101, 8045, 921, 0) + put(data, 8102, 8046, 921, 0) + put(data, 8103, 8047, 921, 0) + put(data, 8104, 8040, 921, 0) + put(data, 8105, 8041, 921, 0) + put(data, 8106, 8042, 921, 0) + put(data, 8107, 8043, 921, 0) + put(data, 8108, 8044, 921, 0) + put(data, 8109, 8045, 921, 0) + put(data, 8110, 8046, 921, 0) + put(data, 8111, 8047, 921, 0) + put(data, 8115, 913, 921, 0) + put(data, 8124, 913, 921, 0) + put(data, 8131, 919, 921, 0) + put(data, 8140, 919, 921, 0) + put(data, 8179, 937, 921, 0) + put(data, 8188, 937, 921, 0) + put(data, 8114, 8122, 921, 0) + put(data, 8116, 902, 921, 0) + put(data, 8130, 8138, 921, 0) + put(data, 8132, 905, 921, 0) + put(data, 8178, 8186, 921, 0) + put(data, 8180, 911, 921, 0) + put(data, 8119, 913, 834, 921) + put(data, 8135, 919, 834, 921) + put(data, 8183, 937, 834, 921) + + data + end + end + # TODO: this is needed to avoid generating lots of allocas # in LLVM, which makes LLVM really slow. The compiler should # try to avoid/reuse temporary allocas. 
- private def self.put(array, *values) : Nil + private def self.put(array : Array, *values) : Nil array << values end + + private def self.put(hash : Hash, key, *values) : Nil + hash[key] = values + end end diff --git a/src/unicode/unicode.cr b/src/unicode/unicode.cr index 97bcddb44f7d..3d0f17abd0e3 100644 --- a/src/unicode/unicode.cr +++ b/src/unicode/unicode.cr @@ -25,6 +25,38 @@ module Unicode end def self.upcase(char : Char, options : CaseOptions) + result = check_upcase_ascii(char, options) + return result if result + + result = check_upcase_turkic(char, options) + return result if result + + check_upcase_ranges(char) + end + + def self.upcase(char : Char, options : CaseOptions) + result = check_upcase_ascii(char, options) + if result + yield result + return + end + + result = check_upcase_turkic(char, options) + if result + yield result + return + end + + result = special_cases_upcase[char.ord]? + if result + result.each { |c| yield c.unsafe_chr if c != 0 } + return + end + + yield check_upcase_ranges(char) + end + + private def self.check_upcase_ascii(char, options) if (char.ascii? && options == Unicode::CaseOptions::None) || options.ascii? if char.ascii_lowercase? return (char.ord - 32).unsafe_chr @@ -32,14 +64,20 @@ module Unicode return char end end + nil + end + private def self.check_upcase_turkic(char, options) if options.turkic? 
case char when 'ı'; return 'I' when 'i'; return 'İ' end end + nil + end + private def self.check_upcase_ranges(char) result = search_ranges(upcase_ranges, char.ord) return char + result if result @@ -50,6 +88,38 @@ module Unicode end def self.downcase(char : Char, options : CaseOptions) + result = check_downcase_ascii(char, options) + return result if result + + result = check_downcase_turkic(char, options) + return result if result + + check_downcase_ranges(char) + end + + def self.downcase(char : Char, options : CaseOptions) + result = check_downcase_ascii(char, options) + if result + yield result + return + end + + result = check_downcase_turkic(char, options) + if result + yield result + return + end + + result = special_cases_downcase[char.ord]? + if result + result.each { |c| yield c.unsafe_chr if c != 0 } + return + end + + yield check_downcase_ranges(char) + end + + private def self.check_downcase_ascii(char, options) if (char.ascii? && options == Unicode::CaseOptions::None) || options.ascii? if char.ascii_uppercase? return (char.ord + 32).unsafe_chr @@ -58,13 +128,20 @@ module Unicode end end + nil + end + + private def self.check_downcase_turkic(char, options) if options.turkic? case char when 'I'; return 'ı' when 'İ'; return 'i' end end + nil + end + private def self.check_downcase_ranges(char) result = search_ranges(downcase_ranges, char.ord) return char + result if result