Unicode: consider special case conversions like the uppercase of "ﬄ" …

…and the downcase of "İ"
ezrast · Nov 22, 2016 · 1def773 · 1def773
1 parent da21f0b
commit 1def773
Show file tree

Hide file tree

Showing 7 changed files with 325 additions and 6 deletions.
diff --git a/scripts/generate_unicode_data.cr b/scripts/generate_unicode_data.cr
@@ -15,6 +15,10 @@ record Entry,
   upcase : Int32?,
   downcase : Int32?
 
+record SpecialCase,
+  codepoint : Int32,
+  value : Array(Int32)
+
 record CaseRange, low : Int32, high : Int32, delta : Int32
 record AlternateRange, low : Int32, high : Int32
 record Stride, low : Int32, high : Int32, stride : Int32
@@ -123,6 +127,8 @@ def strides(entries, targets)
 end
 
 entries = [] of Entry
+special_cases_downcase = [] of SpecialCase
+special_cases_upcase = [] of SpecialCase
 
 url = "http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt"
 body = HTTP::Client.get(url).body
@@ -139,6 +145,34 @@ body.each_line do |line|
   entries << Entry.new(codepoint, name, general_category, upcase, downcase)
 end
 
+url = "http://www.unicode.org/Public/9.0.0/ucd/SpecialCasing.txt"
+body = HTTP::Client.get(url).body
+body.each_line do |line|
+  line = line.strip
+  next if line.empty?
+  break if line.starts_with?("# Conditional Mappings")
+  next if line.starts_with?('#')
+
+  pieces = line.split(';')
+  codepoint = pieces[0].to_i(16)
+  downcase = pieces[1].split.map(&.to_i(16))
+  upcase = pieces[3].split.map(&.to_i(16))
+  downcase = nil if downcase.size == 1
+  upcase = nil if upcase.size == 1
+  if downcase
+    while downcase.size < 3
+      downcase << 0
+    end
+    special_cases_downcase << SpecialCase.new(codepoint, downcase)
+  end
+  if upcase
+    while upcase.size < 3
+      upcase << 0
+    end
+    special_cases_upcase << SpecialCase.new(codepoint, upcase)
+  end
+end
+
 downcase_ranges = case_ranges entries, &.downcase
 downcase_one_ranges, downcase_ranges = downcase_ranges.partition { |r| r.delta == 1 }
 

diff --git a/scripts/unicode_data.ecr b/scripts/unicode_data.ecr
@@ -68,10 +68,42 @@ module Unicode
     end
   <%- end %>
 
+  # Special downcase transformations that involve mapping a codepoint
+  # to multiple codepoints. A transformation yields at most 3
+  # codepoints, so each is stored as 3 codepoints, where 0 means end.
+  @@special_cases_downcase : Hash(Int32, {Int32, Int32, Int32})?
+  private def self.special_cases_downcase
+    @@special_cases_downcase ||= begin
+      data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_downcase.size %>)
+      <%- special_cases_downcase.each do |a_case| -%>
+        put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>)
+      <%- end %>
+      data
+    end
+  end
+
+  # Special upcase transformations that involve mapping a codepoint
+  # to multiple codepoints. A transformation yields at most 3
+  # codepoints, so each is stored as 3 codepoints, where 0 means end.
+  @@special_cases_upcase : Hash(Int32, {Int32, Int32, Int32})?
+  private def self.special_cases_upcase
+    @@special_cases_upcase ||= begin
+      data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_upcase.size %>)
+      <%- special_cases_upcase.each do |a_case| -%>
+        put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>)
+      <%- end %>
+      data
+    end
+  end
+
   # TODO: this is needed to avoid generating lots of allocas
   # in LLVM, which makes LLVM really slow. The compiler should
   # try to avoid/reuse temporary allocas.
-  private def self.put(array, *values) : Nil
+  private def self.put(array : Array, *values) : Nil
     array << values
   end
+
+  private def self.put(hash : Hash, key, *values) : Nil
+    hash[key] = values
+  end
 end
diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr
@@ -461,6 +461,7 @@ describe "String" do
     assert { "ÁÉÍÓÚĀ".downcase.should eq("áéíóúā") }
     assert { "AEIİOU".downcase(Unicode::CaseOptions::Turkic).should eq("aeıiou") }
     assert { "ÁEÍOÚ".downcase(Unicode::CaseOptions::ASCII).should eq("ÁeÍoÚ") }
+    assert { "İ".downcase.should eq("i̇") }
   end
 
   describe "upcase" do
@@ -469,12 +470,15 @@ describe "String" do
     assert { "áéíóúā".upcase.should eq("ÁÉÍÓÚĀ") }
     assert { "aeıiou".upcase(Unicode::CaseOptions::Turkic).should eq("AEIİOU") }
     assert { "áeíoú".upcase(Unicode::CaseOptions::ASCII).should eq("áEíOú") }
+    assert { "baﬄe".upcase.should eq("BAFFLE") }
+    assert { "ﬀ".upcase.should eq("FF") }
   end
 
   describe "capitalize" do
     assert { "HELLO!".capitalize.should eq("Hello!") }
     assert { "HELLO MAN!".capitalize.should eq("Hello man!") }
     assert { "".capitalize.should eq("") }
+    assert { "ﬄİ".capitalize.should eq("FFLi̇") }
   end
 
   describe "chomp" do

diff --git a/src/char.cr b/src/char.cr
@@ -366,6 +366,13 @@ struct Char
 
   # Returns the downcase equivalent of this char.
   #
+  # Note that this only works for characters whose downcase
+  # equivalent yields a single codepoint. There are a few
+  # characters, like 'İ', that when downcased result in multiple
+  # characters (in this case: 'i' and the dot mark).
+  #
+  # For a more correct method see the method that receives a block.
+  #
   # ```
   # 'Z'.downcase # => 'z'
   # 'x'.downcase # => 'x'
@@ -375,8 +382,24 @@ struct Char
     Unicode.downcase(self, options)
   end
 
+  # Yields each char for the downcase equivalent of this char.
+  #
+  # This method takes into account the possibility that a downcase
+  # version of a char might result in multiple chars, like for
+  # 'İ', which results in 'i' and a dot mark.
+  def downcase(options = Unicode::CaseOptions::None)
+    Unicode.downcase(self, options) { |char| yield char }
+  end
+
   # Returns the upcase equivalent of this char.
   #
+  # Note that this only works for characters whose upcase
+  # equivalent yields a single codepoint. There are a few
+  # characters, like 'ﬄ', that when upcased result in multiple
+  # characters (in this case: 'F', 'F', 'L').
+  #
+  # For a more correct method see the method that receives a block.
+  #
   # ```
   # 'z'.upcase # => 'Z'
   # 'X'.upcase # => 'X'
@@ -386,6 +409,20 @@ struct Char
     Unicode.upcase(self, options)
   end
 
+  # Yields each char for the upcase equivalent of this char.
+  #
+  # This method takes into account the possibility that an upcase
+  # version of a char might result in multiple chars, like for
+  # 'ﬄ', which results in 'F', 'F' and 'L'.
+  #
+  # ```
+  # 'z'.upcase { |v| puts v } # prints 'Z'
+  # 'ﬄ'.upcase { |v| puts v } # prints 'F', 'F', 'L'
+  # ```
+  def upcase(options = Unicode::CaseOptions::None)
+    Unicode.upcase(self, options) { |char| yield char }
+  end
+
   # Returns this char's codepoint.
   def hash
     ord

diff --git a/src/string.cr b/src/string.cr
@@ -877,7 +877,9 @@ class String
   def downcase(options = Unicode::CaseOptions::None)
     String.build(bytesize) do |io|
       each_char do |char|
-        io << char.downcase(options)
+        char.downcase(options) do |res|
+          io << res
+        end
       end
     end
   end
@@ -891,7 +893,9 @@ class String
   def upcase(options = Unicode::CaseOptions::None)
     String.build(bytesize) do |io|
       each_char do |char|
-        io << char.upcase(options)
+        char.upcase(options) do |res|
+          io << res
+        end
       end
     end
   end
@@ -908,9 +912,9 @@ class String
     String.build(bytesize) do |io|
       each_char_with_index do |char, i|
         if i == 0
-          io << char.upcase
+          char.upcase { |c| io << c }
         else
-          io << char.downcase
+          char.downcase { |c| io << c }
         end
       end
     end

diff --git a/src/unicode/data.cr b/src/unicode/data.cr
@@ -1309,10 +1309,141 @@ module Unicode
     end
   end
 
+  # Special downcase transformations that involve mapping a codepoint
+  # to multiple codepoints. A transformation yields at most 3
+  # codepoints, so each is stored as 3 codepoints, where 0 means end.
+  @@special_cases_downcase : Hash(Int32, {Int32, Int32, Int32})?
+  private def self.special_cases_downcase
+    @@special_cases_downcase ||= begin
+      data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: 1)
+      put(data, 304, 105, 775, 0)
+
+      data
+    end
+  end
+
+  # Special upcase transformations that involve mapping a codepoint
+  # to multiple codepoints. A transformation yields at most 3
+  # codepoints, so each is stored as 3 codepoints, where 0 means end.
+  @@special_cases_upcase : Hash(Int32, {Int32, Int32, Int32})?
+  private def self.special_cases_upcase
+    @@special_cases_upcase ||= begin
+      data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: 102)
+      put(data, 223, 83, 83, 0)
+      put(data, 64256, 70, 70, 0)
+      put(data, 64257, 70, 73, 0)
+      put(data, 64258, 70, 76, 0)
+      put(data, 64259, 70, 70, 73)
+      put(data, 64260, 70, 70, 76)
+      put(data, 64261, 83, 84, 0)
+      put(data, 64262, 83, 84, 0)
+      put(data, 1415, 1333, 1362, 0)
+      put(data, 64275, 1348, 1350, 0)
+      put(data, 64276, 1348, 1333, 0)
+      put(data, 64277, 1348, 1339, 0)
+      put(data, 64278, 1358, 1350, 0)
+      put(data, 64279, 1348, 1341, 0)
+      put(data, 329, 700, 78, 0)
+      put(data, 912, 921, 776, 769)
+      put(data, 944, 933, 776, 769)
+      put(data, 496, 74, 780, 0)
+      put(data, 7830, 72, 817, 0)
+      put(data, 7831, 84, 776, 0)
+      put(data, 7832, 87, 778, 0)
+      put(data, 7833, 89, 778, 0)
+      put(data, 7834, 65, 702, 0)
+      put(data, 8016, 933, 787, 0)
+      put(data, 8018, 933, 787, 768)
+      put(data, 8020, 933, 787, 769)
+      put(data, 8022, 933, 787, 834)
+      put(data, 8118, 913, 834, 0)
+      put(data, 8134, 919, 834, 0)
+      put(data, 8146, 921, 776, 768)
+      put(data, 8147, 921, 776, 769)
+      put(data, 8150, 921, 834, 0)
+      put(data, 8151, 921, 776, 834)
+      put(data, 8162, 933, 776, 768)
+      put(data, 8163, 933, 776, 769)
+      put(data, 8164, 929, 787, 0)
+      put(data, 8166, 933, 834, 0)
+      put(data, 8167, 933, 776, 834)
+      put(data, 8182, 937, 834, 0)
+      put(data, 8064, 7944, 921, 0)
+      put(data, 8065, 7945, 921, 0)
+      put(data, 8066, 7946, 921, 0)
+      put(data, 8067, 7947, 921, 0)
+      put(data, 8068, 7948, 921, 0)
+      put(data, 8069, 7949, 921, 0)
+      put(data, 8070, 7950, 921, 0)
+      put(data, 8071, 7951, 921, 0)
+      put(data, 8072, 7944, 921, 0)
+      put(data, 8073, 7945, 921, 0)
+      put(data, 8074, 7946, 921, 0)
+      put(data, 8075, 7947, 921, 0)
+      put(data, 8076, 7948, 921, 0)
+      put(data, 8077, 7949, 921, 0)
+      put(data, 8078, 7950, 921, 0)
+      put(data, 8079, 7951, 921, 0)
+      put(data, 8080, 7976, 921, 0)
+      put(data, 8081, 7977, 921, 0)
+      put(data, 8082, 7978, 921, 0)
+      put(data, 8083, 7979, 921, 0)
+      put(data, 8084, 7980, 921, 0)
+      put(data, 8085, 7981, 921, 0)
+      put(data, 8086, 7982, 921, 0)
+      put(data, 8087, 7983, 921, 0)
+      put(data, 8088, 7976, 921, 0)
+      put(data, 8089, 7977, 921, 0)
+      put(data, 8090, 7978, 921, 0)
+      put(data, 8091, 7979, 921, 0)
+      put(data, 8092, 7980, 921, 0)
+      put(data, 8093, 7981, 921, 0)
+      put(data, 8094, 7982, 921, 0)
+      put(data, 8095, 7983, 921, 0)
+      put(data, 8096, 8040, 921, 0)
+      put(data, 8097, 8041, 921, 0)
+      put(data, 8098, 8042, 921, 0)
+      put(data, 8099, 8043, 921, 0)
+      put(data, 8100, 8044, 921, 0)
+      put(data, 8101, 8045, 921, 0)
+      put(data, 8102, 8046, 921, 0)
+      put(data, 8103, 8047, 921, 0)
+      put(data, 8104, 8040, 921, 0)
+      put(data, 8105, 8041, 921, 0)
+      put(data, 8106, 8042, 921, 0)
+      put(data, 8107, 8043, 921, 0)
+      put(data, 8108, 8044, 921, 0)
+      put(data, 8109, 8045, 921, 0)
+      put(data, 8110, 8046, 921, 0)
+      put(data, 8111, 8047, 921, 0)
+      put(data, 8115, 913, 921, 0)
+      put(data, 8124, 913, 921, 0)
+      put(data, 8131, 919, 921, 0)
+      put(data, 8140, 919, 921, 0)
+      put(data, 8179, 937, 921, 0)
+      put(data, 8188, 937, 921, 0)
+      put(data, 8114, 8122, 921, 0)
+      put(data, 8116, 902, 921, 0)
+      put(data, 8130, 8138, 921, 0)
+      put(data, 8132, 905, 921, 0)
+      put(data, 8178, 8186, 921, 0)
+      put(data, 8180, 911, 921, 0)
+      put(data, 8119, 913, 834, 921)
+      put(data, 8135, 919, 834, 921)
+      put(data, 8183, 937, 834, 921)
+
+      data
+    end
+  end
+
   # TODO: this is needed to avoid generating lots of allocas
   # in LLVM, which makes LLVM really slow. The compiler should
   # try to avoid/reuse temporary allocas.
-  private def self.put(array, *values) : Nil
+  private def self.put(array : Array, *values) : Nil
     array << values
   end
+
+  private def self.put(hash : Hash, key, *values) : Nil
+    hash[key] = values
+  end
 end