Skip to content

Commit

Permalink
Unicode: consider special case conversions like the uppercase of "ffl" …
Browse files Browse the repository at this point in the history
…and the downcase of "İ"
  • Loading branch information
Ary Borenszweig committed Nov 22, 2016
1 parent da21f0b commit 1def773
Show file tree
Hide file tree
Showing 7 changed files with 325 additions and 6 deletions.
34 changes: 34 additions & 0 deletions scripts/generate_unicode_data.cr
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ record Entry,
upcase : Int32?,
downcase : Int32?

# A case mapping from a single `codepoint` to the list of codepoints in `value`.
record SpecialCase, codepoint : Int32, value : Array(Int32)

record CaseRange, low : Int32, high : Int32, delta : Int32
record AlternateRange, low : Int32, high : Int32
record Stride, low : Int32, high : Int32, stride : Int32
Expand Down Expand Up @@ -123,6 +127,8 @@ def strides(entries, targets)
end

entries = [] of Entry
special_cases_downcase = [] of SpecialCase
special_cases_upcase = [] of SpecialCase

url = "http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt"
body = HTTP::Client.get(url).body
Expand All @@ -139,6 +145,34 @@ body.each_line do |line|
entries << Entry.new(codepoint, name, general_category, upcase, downcase)
end

# Collect the multi-codepoint case mappings listed in SpecialCasing.txt.
url = "http://www.unicode.org/Public/9.0.0/ucd/SpecialCasing.txt"
body = HTTP::Client.get(url).body
body.each_line do |raw_line|
  row = raw_line.strip

  # Stop at the conditional mappings section; nothing after it is read.
  break if row.starts_with?("# Conditional Mappings")
  # Blank lines and comment lines carry no mapping.
  next if row.empty? || row.starts_with?('#')

  # Field 0 is the source codepoint, field 1 the downcase mapping and
  # field 3 the upcase mapping, all as hexadecimal numbers.
  fields = row.split(';')
  source = fields[0].to_i(16)
  lower = fields[1].split.map(&.to_i(16))
  upper = fields[3].split.map(&.to_i(16))

  # A mapping to exactly one codepoint is not recorded here. Any other
  # mapping is padded with zeros up to three codepoints and stored.
  unless lower.size == 1
    (3 - lower.size).times { lower << 0 }
    special_cases_downcase << SpecialCase.new(source, lower)
  end

  unless upper.size == 1
    (3 - upper.size).times { upper << 0 }
    special_cases_upcase << SpecialCase.new(source, upper)
  end
end

downcase_ranges = case_ranges entries, &.downcase
downcase_one_ranges, downcase_ranges = downcase_ranges.partition { |r| r.delta == 1 }

Expand Down
34 changes: 33 additions & 1 deletion scripts/unicode_data.ecr
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,42 @@ module Unicode
end
<%- end %>

# Special downcase transformations that map a single codepoint to
# multiple codepoints. No transformation is longer than 3 codepoints,
# so every value is stored as a 3-tuple, padded with 0 to mark the end.
#
# The hash is keyed by the source codepoint. It is built on first
# access and memoized in `@@special_cases_downcase`.
@@special_cases_downcase : Hash(Int32, {Int32, Int32, Int32})?
private def self.special_cases_downcase
@@special_cases_downcase ||= begin
data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_downcase.size %>)
<%- special_cases_downcase.each do |a_case| -%>
put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>)
<%- end %>
data
end
end

# Special upcase transformations that map a single codepoint to
# multiple codepoints. No transformation is longer than 3 codepoints,
# so every value is stored as a 3-tuple, padded with 0 to mark the end.
#
# The hash is keyed by the source codepoint. It is built on first
# access and memoized in `@@special_cases_upcase`.
@@special_cases_upcase : Hash(Int32, {Int32, Int32, Int32})?
private def self.special_cases_upcase
@@special_cases_upcase ||= begin
data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_upcase.size %>)
<%- special_cases_upcase.each do |a_case| -%>
put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>)
<%- end %>
data
end
end

# TODO: this is needed to avoid generating lots of allocas
# in LLVM, which makes LLVM really slow. The compiler should
# try to avoid/reuse temporary allocas.
#
# Appends *values*, as a single tuple, to *array*.
private def self.put(array : Array, *values) : Nil
  array << values
end

# Stores *values*, as a single tuple, in *hash* under *key*.
private def self.put(hash : Hash, key, *values) : Nil
  hash[key] = values
end
end
4 changes: 4 additions & 0 deletions spec/std/string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ describe "String" do
assert { "ÁÉÍÓÚĀ".downcase.should eq("áéíóúā") }
assert { "AEIİOU".downcase(Unicode::CaseOptions::Turkic).should eq("aeıiou") }
assert { "ÁEÍOÚ".downcase(Unicode::CaseOptions::ASCII).should eq("ÁeÍoÚ") }
# U+0130 downcases to 'i' followed by U+0307 (combining dot above).
assert { "İ".downcase.should eq("i\u0307") }
end

describe "upcase" do
Expand All @@ -469,12 +470,15 @@ describe "String" do
assert { "áéíóúā".upcase.should eq("ÁÉÍÓÚĀ") }
assert { "aeıiou".upcase(Unicode::CaseOptions::Turkic).should eq("AEIİOU") }
assert { "áeíoú".upcase(Unicode::CaseOptions::ASCII).should eq("áEíOú") }
# U+FB04 (ffl ligature) upcases to "FFL"; U+FB00 (ff ligature) upcases to "FF".
assert { "ba\uFB04e".upcase.should eq("BAFFLE") }
assert { "\uFB00".upcase.should eq("FF") }
end

describe "capitalize" do
assert { "HELLO!".capitalize.should eq("Hello!") }
assert { "HELLO MAN!".capitalize.should eq("Hello man!") }
assert { "".capitalize.should eq("") }
# The leading U+FB04 (ffl ligature) upcases to "FFL"; the trailing U+0130
# downcases to 'i' followed by U+0307 (combining dot above).
assert { "\uFB04\u0130".capitalize.should eq("FFLi\u0307") }
end

describe "chomp" do
Expand Down
37 changes: 37 additions & 0 deletions src/char.cr
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,13 @@ struct Char

# Returns the downcase equivalent of this char.
#
# Note that this only works for characters whose downcase
# equivalent yields a single codepoint. There are a few
# characters, like 'İ', that when downcased result in multiple
# characters (in this case: 'i' and the dot mark).
#
# For a more correct method see the method that receives a block.
#
# ```
# 'Z'.downcase # => 'z'
# 'x'.downcase # => 'x'
Expand All @@ -375,8 +382,24 @@ struct Char
Unicode.downcase(self, options)
end

# Yields each char of the downcase equivalent of this char.
#
# The downcase version of a single char may consist of several chars,
# and this method yields all of them in order. For example 'İ'
# yields 'i' followed by a dot mark.
def downcase(options = Unicode::CaseOptions::None)
  Unicode.downcase(self, options) do |lowered|
    yield lowered
  end
end

# Returns the upcase equivalent of this char.
#
# Note that this only works for characters whose upcase
# equivalent yields a single codepoint. There are a few
# characters, like 'ﬄ', that when upcased result in multiple
# characters (in this case: 'F', 'F', 'L').
#
# For a more correct method see the method that receives a block.
#
# ```
# 'z'.upcase # => 'Z'
# 'X'.upcase # => 'X'
Expand All @@ -386,6 +409,20 @@ struct Char
Unicode.upcase(self, options)
end

# Yields each char of the upcase equivalent of this char.
#
# The upcase version of a single char may consist of several chars,
# and this method yields all of them in order. For example 'ﬄ'
# yields 'F', 'F' and 'L'.
#
# ```
# 'z'.upcase { |v| puts v } # prints 'Z'
# 'ﬄ'.upcase { |v| puts v } # prints 'F', 'F', 'L'
# ```
def upcase(options = Unicode::CaseOptions::None)
  Unicode.upcase(self, options) do |raised|
    yield raised
  end
end

# Returns this char's codepoint.
def hash
ord
Expand Down
12 changes: 8 additions & 4 deletions src/string.cr
Original file line number Diff line number Diff line change
Expand Up @@ -877,7 +877,9 @@ class String
def downcase(options = Unicode::CaseOptions::None)
  String.build(bytesize) do |io|
    each_char do |char|
      # The block form yields every char of the downcase equivalent, so
      # a char that maps to several chars is written out completely.
      char.downcase(options) do |res|
        io << res
      end
    end
  end
end
Expand All @@ -891,7 +893,9 @@ class String
def upcase(options = Unicode::CaseOptions::None)
  String.build(bytesize) do |io|
    each_char do |char|
      # The block form yields every char of the upcase equivalent, so
      # a char that maps to several chars is written out completely.
      char.upcase(options) do |res|
        io << res
      end
    end
  end
end
Expand All @@ -908,9 +912,9 @@ class String
String.build(bytesize) do |io|
each_char_with_index do |char, i|
if i == 0
io << char.upcase
char.upcase { |c| io << c }
else
io << char.downcase
char.downcase { |c| io << c }
end
end
end
Expand Down
133 changes: 132 additions & 1 deletion src/unicode/data.cr
Original file line number Diff line number Diff line change
Expand Up @@ -1309,10 +1309,141 @@ module Unicode
end
end

# Special downcase transformations that map a single codepoint to
# multiple codepoints. No transformation is longer than 3 codepoints,
# so every value is stored as a 3-tuple, padded with 0 to mark the end.
#
# The hash is keyed by the source codepoint. It is built on first
# access and memoized in `@@special_cases_downcase`.
@@special_cases_downcase : Hash(Int32, {Int32, Int32, Int32})?
private def self.special_cases_downcase
@@special_cases_downcase ||= begin
data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: 1)
# The first argument after `data` is the key; the remaining three are the value.
put(data, 304, 105, 775, 0)

data
end
end

# Special upcase transformations that map a single codepoint to
# multiple codepoints. No transformation is longer than 3 codepoints,
# so every value is stored as a 3-tuple, padded with 0 to mark the end.
#
# The hash is keyed by the source codepoint. It is built on first
# access and memoized in `@@special_cases_upcase`.
@@special_cases_upcase : Hash(Int32, {Int32, Int32, Int32})?
private def self.special_cases_upcase
@@special_cases_upcase ||= begin
data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: 102)
# The first argument after `data` is the key; the remaining three are the value.
put(data, 223, 83, 83, 0)
put(data, 64256, 70, 70, 0)
put(data, 64257, 70, 73, 0)
put(data, 64258, 70, 76, 0)
put(data, 64259, 70, 70, 73)
put(data, 64260, 70, 70, 76)
put(data, 64261, 83, 84, 0)
put(data, 64262, 83, 84, 0)
put(data, 1415, 1333, 1362, 0)
put(data, 64275, 1348, 1350, 0)
put(data, 64276, 1348, 1333, 0)
put(data, 64277, 1348, 1339, 0)
put(data, 64278, 1358, 1350, 0)
put(data, 64279, 1348, 1341, 0)
put(data, 329, 700, 78, 0)
put(data, 912, 921, 776, 769)
put(data, 944, 933, 776, 769)
put(data, 496, 74, 780, 0)
put(data, 7830, 72, 817, 0)
put(data, 7831, 84, 776, 0)
put(data, 7832, 87, 778, 0)
put(data, 7833, 89, 778, 0)
put(data, 7834, 65, 702, 0)
put(data, 8016, 933, 787, 0)
put(data, 8018, 933, 787, 768)
put(data, 8020, 933, 787, 769)
put(data, 8022, 933, 787, 834)
put(data, 8118, 913, 834, 0)
put(data, 8134, 919, 834, 0)
put(data, 8146, 921, 776, 768)
put(data, 8147, 921, 776, 769)
put(data, 8150, 921, 834, 0)
put(data, 8151, 921, 776, 834)
put(data, 8162, 933, 776, 768)
put(data, 8163, 933, 776, 769)
put(data, 8164, 929, 787, 0)
put(data, 8166, 933, 834, 0)
put(data, 8167, 933, 776, 834)
put(data, 8182, 937, 834, 0)
put(data, 8064, 7944, 921, 0)
put(data, 8065, 7945, 921, 0)
put(data, 8066, 7946, 921, 0)
put(data, 8067, 7947, 921, 0)
put(data, 8068, 7948, 921, 0)
put(data, 8069, 7949, 921, 0)
put(data, 8070, 7950, 921, 0)
put(data, 8071, 7951, 921, 0)
put(data, 8072, 7944, 921, 0)
put(data, 8073, 7945, 921, 0)
put(data, 8074, 7946, 921, 0)
put(data, 8075, 7947, 921, 0)
put(data, 8076, 7948, 921, 0)
put(data, 8077, 7949, 921, 0)
put(data, 8078, 7950, 921, 0)
put(data, 8079, 7951, 921, 0)
put(data, 8080, 7976, 921, 0)
put(data, 8081, 7977, 921, 0)
put(data, 8082, 7978, 921, 0)
put(data, 8083, 7979, 921, 0)
put(data, 8084, 7980, 921, 0)
put(data, 8085, 7981, 921, 0)
put(data, 8086, 7982, 921, 0)
put(data, 8087, 7983, 921, 0)
put(data, 8088, 7976, 921, 0)
put(data, 8089, 7977, 921, 0)
put(data, 8090, 7978, 921, 0)
put(data, 8091, 7979, 921, 0)
put(data, 8092, 7980, 921, 0)
put(data, 8093, 7981, 921, 0)
put(data, 8094, 7982, 921, 0)
put(data, 8095, 7983, 921, 0)
put(data, 8096, 8040, 921, 0)
put(data, 8097, 8041, 921, 0)
put(data, 8098, 8042, 921, 0)
put(data, 8099, 8043, 921, 0)
put(data, 8100, 8044, 921, 0)
put(data, 8101, 8045, 921, 0)
put(data, 8102, 8046, 921, 0)
put(data, 8103, 8047, 921, 0)
put(data, 8104, 8040, 921, 0)
put(data, 8105, 8041, 921, 0)
put(data, 8106, 8042, 921, 0)
put(data, 8107, 8043, 921, 0)
put(data, 8108, 8044, 921, 0)
put(data, 8109, 8045, 921, 0)
put(data, 8110, 8046, 921, 0)
put(data, 8111, 8047, 921, 0)
put(data, 8115, 913, 921, 0)
put(data, 8124, 913, 921, 0)
put(data, 8131, 919, 921, 0)
put(data, 8140, 919, 921, 0)
put(data, 8179, 937, 921, 0)
put(data, 8188, 937, 921, 0)
put(data, 8114, 8122, 921, 0)
put(data, 8116, 902, 921, 0)
put(data, 8130, 8138, 921, 0)
put(data, 8132, 905, 921, 0)
put(data, 8178, 8186, 921, 0)
put(data, 8180, 911, 921, 0)
put(data, 8119, 913, 834, 921)
put(data, 8135, 919, 834, 921)
put(data, 8183, 937, 834, 921)

data
end
end

# TODO: this is needed to avoid generating lots of allocas
# in LLVM, which makes LLVM really slow. The compiler should
# try to avoid/reuse temporary allocas.
#
# Appends *values*, as a single tuple, to *array*.
private def self.put(array : Array, *values) : Nil
  array << values
end

# Stores *values*, as a single tuple, in *hash* under *key*.
private def self.put(hash : Hash, key, *values) : Nil
  hash[key] = values
end
end
Loading

0 comments on commit 1def773

Please sign in to comment.