forked from crystal-lang/crystal
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_unicode_data.cr
195 lines (165 loc) · 5.11 KB
/
generate_unicode_data.cr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
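# This script generates the file src/unicode/data.cr,
# which contains compact representations of the UnicodeData.txt
# file from the Unicode specification. It also reads SpecialCasing.txt
# to collect the case mappings that expand to more than one codepoint.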
require "http/client"
require "ecr"
require "../src/compiler/crystal/formatter"
# One parsed line of UnicodeData.txt
# (some info is missing but we don't use it yet).
# `upcase` / `downcase` are nil when the codepoint has no such mapping.
record Entry,
  codepoint : Int32,
  name : String,
  general_category : String,
  upcase : Int32?,
  downcase : Int32?

# A codepoint together with the list of codepoints it maps to.
record SpecialCase,
  codepoint : Int32,
  value : Array(Int32)

# Codepoints `low..high` (inclusive) whose mapped codepoint is
# `codepoint + delta`.
record CaseRange,
  low : Int32,
  high : Int32,
  delta : Int32

# `low` and `high` are the first and last `low` of a run of ranges
# whose `low` values are exactly two apart.
record AlternateRange,
  low : Int32,
  high : Int32

# Codepoints from `low` to `high`, spaced `stride` apart.
record Stride,
  low : Int32,
  high : Int32,
  stride : Int32
# Collapses consecutive case mappings into `CaseRange`s.
#
# Every entry is yielded to the block, which returns the codepoint the entry
# maps to, or nil when the entry has no mapping. A streak is a run of entries
# whose codepoints AND mapped codepoints both grow by exactly one from one
# entry to the next. Each streak becomes `CaseRange.new(low, high, delta)`,
# where `low`/`high` are the first/last codepoint of the streak and `delta`
# is the first mapped codepoint minus the first codepoint.
#
# Returns the ranges in the order the streaks were encountered. A streak that
# is still open when the entries run out is emitted as well.
def case_ranges(entries, &block)
  ranges = [] of CaseRange
  first_codepoint = nil
  last_codepoint = nil
  first_match = nil
  last_match = nil
  entries.each do |entry|
    codepoint = entry.codepoint
    match = yield entry
    if match
      if last_codepoint == codepoint - 1 && last_match == match - 1
        # Continue streak
      else
        # This entry starts a new streak: close the previous one if the
        # previous entry was mapped (i.e. a streak was open).
        if last_codepoint && last_match
          ranges << CaseRange.new(first_codepoint.not_nil!, last_codepoint, first_match.not_nil! - first_codepoint.not_nil!)
        end
        first_codepoint = codepoint
        first_match = match
      end
    else
      # An unmapped entry ends whatever streak was open.
      if last_codepoint && last_match
        ranges << CaseRange.new(first_codepoint.not_nil!, last_codepoint, first_match.not_nil! - first_codepoint.not_nil!)
      end
    end

    last_codepoint = codepoint
    last_match = match
  end

  # `last_match` is only non-nil here when the final entry was mapped, which
  # means its streak has not been emitted yet. Without this flush the last
  # range would be silently dropped.
  if last_codepoint && last_match
    ranges << CaseRange.new(first_codepoint.not_nil!, last_codepoint, first_match.not_nil! - first_codepoint.not_nil!)
  end

  ranges
end
# Groups ranges whose `low` values are exactly two apart into
# `AlternateRange`s.
#
# Walks `ranges` in order, looking only at each range's `low`. As long as the
# current `low` equals the previous one plus two, the run continues; otherwise
# the run collected so far is emitted as `AlternateRange.new(first_low,
# last_low)` and a new run starts. The run still open after the last range is
# emitted too. Returns an empty array for empty input.
def alternate_ranges(ranges)
  result = [] of AlternateRange
  run_start = nil
  previous = nil
  ranges.each do |range|
    low = range.low
    unless previous == low - 2
      # The run is broken (or this is the very first range): emit what we
      # have and begin a new run at this codepoint.
      if open_low = run_start
        result << AlternateRange.new(open_low, previous.not_nil!)
      end
      run_start = low
    end
    previous = low
  end
  # Emit the run that was still being collected when the input ended.
  if tail_low = run_start
    result << AlternateRange.new(tail_low, previous.not_nil!)
  end
  result
end
# Compresses the codepoints of the selected entries into `Stride`s.
#
# An entry is selected when `targets.includes?` the value the block yields
# for it. The selected entries are then walked in order: the distance between
# the first two entries of a run fixes the run's stride, and the run goes on
# for as long as every following entry is that same distance away from its
# predecessor. When the distance changes, the run is emitted as
# `Stride.new(first.codepoint, last.codepoint, stride)` and the entry that
# broke it starts the next run.
#
# A run whose first entry's name ends with "First>" and whose last entry's
# name ends with "Last>" is always emitted with a stride of 1. A run made of
# a single entry is emitted with a stride of 1 as well.
def strides(entries, targets)
  result = [] of Stride
  selected = entries.select { |entry| targets.includes?(yield entry) }
  head = nil
  tail = nil
  step = nil
  selected.each do |entry|
    if first = head
      if last = tail
        gap = entry.codepoint - last.codepoint
        unless gap == step
          if first == last
            # Only one entry in the run so far: this gap defines its stride.
            step = gap
          else
            # The gap changed: close the current run and restart at `entry`.
            step = 1 if first.name.ends_with?("First>") && last.name.ends_with?("Last>")
            result << Stride.new(first.codepoint, last.codepoint, step.not_nil!)
            head = entry
            step = nil
          end
        end
      end
    else
      head = entry
    end
    tail = entry
  end
  # Emit the run that was still open when the entries ran out.
  if (first = head) && (last = tail)
    if step
      step = 1 if first.name.ends_with?("First>") && last.name.ends_with?("Last>")
      result << Stride.new(first.codepoint, last.codepoint, step)
    else
      result << Stride.new(first.codepoint, last.codepoint, 1)
    end
  end
  result
end
# --- Main script -------------------------------------------------------------
# Downloads the Unicode 9.0.0 data files, parses them, computes the compact
# tables and writes the generated source file.

# Accumulators filled while parsing the downloaded files.
entries = [] of Entry
special_cases_downcase = [] of SpecialCase
special_cases_upcase = [] of SpecialCase
# Fetch UnicodeData.txt and turn every non-empty line into an Entry.
url = "http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
line = line.strip
next if line.empty?
# Fields are separated by ';'. Fields 0, 1, 2, 12 and 13 are the only ones read.
pieces = line.split(';')
codepoint = pieces[0].to_i(16)
name = pieces[1]
general_category = pieces[2]
# to_i? returns nil when the field is not a hexadecimal number (for example
# when it is empty), so codepoints without a mapping end up with nil here.
upcase = pieces[12].to_i?(16)
downcase = pieces[13].to_i?(16)
entries << Entry.new(codepoint, name, general_category, upcase, downcase)
end
# Fetch SpecialCasing.txt and collect the mappings that expand to more than
# one codepoint.
url = "http://www.unicode.org/Public/9.0.0/ucd/SpecialCasing.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
line = line.strip
next if line.empty?
# Everything from the "# Conditional Mappings" header onwards is ignored.
break if line.starts_with?("# Conditional Mappings")
# Skip comment lines.
next if line.starts_with?('#')
pieces = line.split(';')
codepoint = pieces[0].to_i(16)
# Field 1 is used as the downcase mapping and field 3 as the upcase mapping;
# each is a whitespace-separated list of hexadecimal codepoints.
downcase = pieces[1].split.map(&.to_i(16))
upcase = pieces[3].split.map(&.to_i(16))
# A mapping made of a single codepoint is discarded here.
downcase = nil if downcase.size == 1
upcase = nil if upcase.size == 1
if downcase
# Pad with zeros so that every stored mapping has at least 3 elements.
while downcase.size < 3
downcase << 0
end
special_cases_downcase << SpecialCase.new(codepoint, downcase)
end
if upcase
# Pad with zeros so that every stored mapping has at least 3 elements.
while upcase.size < 3
upcase << 0
end
special_cases_upcase << SpecialCase.new(codepoint, upcase)
end
end
# Downcase ranges, split in two groups: those with delta == 1 and all the others.
downcase_ranges = case_ranges entries, &.downcase
downcase_one_ranges, downcase_ranges = downcase_ranges.partition { |r| r.delta == 1 }
# Upcase ranges, dropping the ones with delta == -1.
upcase_ranges = case_ranges entries, &.upcase
upcase_ranges.select! { |r| r.delta != -1 }
# The delta == 1 downcase ranges are represented as alternate ranges instead.
# Note that this local variable has the same name as the method it calls.
alternate_ranges = alternate_ranges(downcase_one_ranges)
# One list of strides per general category, keyed by the category name.
all_strides = {} of String => Array(Stride)
categories = %w(Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Cn)
categories.each do |category|
all_strides[category] = strides entries, category, &.general_category
end
# Render the ECR template that sits next to this script into a string.
# NOTE(review): the template is not visible here; it presumably reads the
# local variables defined above, so keep their names stable - confirm against
# unicode_data.ecr before renaming anything.
output = String.build do |str|
ECR.embed "#{__DIR__}/unicode_data.ecr", str
end
# Run the generated code through the Crystal formatter and write it out.
output = Crystal.format(output)
File.write("#{__DIR__}/../src/unicode/data.cr", output)