-
Notifications
You must be signed in to change notification settings - Fork 13.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support to libcore for encoded-in-rust unicode character properti…
…es, at least. Add script to compute them from unicode.org.
- Loading branch information
Showing
4 changed files
with
4,919 additions
and
82 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
#!/usr/bin/env python | ||
|
||
# This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust | ||
# code covering the core properties. Since this is a pretty rare event we | ||
# just store this out-of-line and check the unicode.rs file into git. | ||
# | ||
# The emitted code is "the minimum we think is necessary for libcore", that | ||
# is, to support basic operations of the compiler and "most nontrivial rust | ||
# programs". It is not meant to be a complete implementation of unicode. | ||
# For that we recommend you use a proper binding to libicu. | ||
|
||
import fileinput, re, os, sys | ||
|
||
|
||
def fetch(f): | ||
if not os.path.exists(f): | ||
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s" | ||
% f) | ||
|
||
if not os.path.exists(f): | ||
sys.stderr.write("cannot load %s" % f) | ||
exit(1) | ||
|
||
|
||
def load_general_categories(f): | ||
fetch(f) | ||
gencats = {} | ||
curr_cat = "" | ||
c_lo = 0 | ||
c_hi = 0 | ||
for line in fileinput.input(f): | ||
fields = line.split(";") | ||
if len(fields) != 15: | ||
continue | ||
[code, name, gencat, combine, bidi, | ||
decomp, deci, digit, num, mirror, | ||
old, iso, upcase, lowcsae, titlecase ] = fields | ||
|
||
code = int(code, 16) | ||
|
||
if curr_cat == "": | ||
curr_cat = gencat | ||
c_lo = code | ||
c_hi = code | ||
|
||
if curr_cat == gencat: | ||
c_hi = code | ||
else: | ||
if curr_cat not in gencats: | ||
gencats[curr_cat] = [] | ||
|
||
gencats[curr_cat].append((c_lo, c_hi)) | ||
curr_cat = gencat | ||
c_lo = code | ||
c_hi = code | ||
return gencats | ||
|
||
|
||
def load_derived_core_properties(f): | ||
fetch(f) | ||
derivedprops = {} | ||
interestingprops = ["XID_Start", "XID_Continue", "Alphabetic"] | ||
re1 = re.compile("^([0-9A-F]+) +; (\w+)") | ||
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)") | ||
|
||
for line in fileinput.input(f): | ||
prop = None | ||
d_lo = 0 | ||
d_hi = 0 | ||
m = re1.match(line) | ||
if m: | ||
d_lo = m.group(1) | ||
d_hi = m.group(1) | ||
prop = m.group(2) | ||
else: | ||
m = re2.match(line) | ||
if m: | ||
d_lo = m.group(1) | ||
d_hi = m.group(2) | ||
prop = m.group(3) | ||
else: | ||
continue | ||
if prop not in interestingprops: | ||
continue | ||
d_lo = int(d_lo, 16) | ||
d_hi = int(d_hi, 16) | ||
if prop not in derivedprops: | ||
derivedprops[prop] = [] | ||
derivedprops[prop].append((d_lo, d_hi)) | ||
return derivedprops | ||
|
||
def escape_char(c): | ||
if c <= 0xff: | ||
return "'\\x%2.2x'" % c | ||
if c <= 0xffff: | ||
return "'\\u%4.4x'" % c | ||
return "'\\U%8.8x'" % c | ||
|
||
def emit_rust_module(f, mod, tbl): | ||
f.write("mod %s {\n" % mod) | ||
keys = tbl.keys() | ||
keys.sort() | ||
for cat in keys: | ||
f.write(" pure fn %s(c: char) -> bool {\n" % cat) | ||
f.write(" ret alt c {\n") | ||
prefix = ' ' | ||
for pair in tbl[cat]: | ||
if pair[0] == pair[1]: | ||
f.write(" %c %s\n" % | ||
(prefix, escape_char(pair[0]))) | ||
else: | ||
f.write(" %c %s to %s\n" % | ||
(prefix, | ||
escape_char(pair[0]), | ||
escape_char(pair[1]))) | ||
prefix = '|' | ||
f.write(" { true }\n") | ||
f.write(" _ { false }\n") | ||
f.write(" };\n") | ||
f.write(" }\n\n") | ||
f.write("}\n") | ||
|
||
|
||
def emit_cpp_module(f, mod, tbl): | ||
keys = tbl.keys() | ||
keys.sort() | ||
|
||
for cat in keys: | ||
|
||
singles = [] | ||
ranges = [] | ||
|
||
for pair in tbl[cat]: | ||
if pair[0] == pair[1]: | ||
singles.append(pair[0]) | ||
else: | ||
ranges.append(pair) | ||
|
||
f.write("bool %s_%s(unsigned c) {\n" % (mod, cat)) | ||
for pair in ranges: | ||
f.write(" if (0x%x <= c && c <= 0x%x) { return true; }\n" | ||
% pair) | ||
if len(singles) > 0: | ||
f.write(" switch (c) {\n"); | ||
for single in singles: | ||
f.write(" case 0x%x:\n" % single) | ||
f.write(" return true;\n"); | ||
f.write(" default:\n"); | ||
f.write(" return false;\n"); | ||
f.write(" }\n") | ||
f.write("return false;\n") | ||
f.write("}\n\n") | ||
|
||
|
||
def emit_module(rf, cf, mod, tbl): | ||
emit_rust_module(rf, mod, tbl) | ||
emit_cpp_module(cf, mod, tbl) | ||
|
||
r = "unicode.rs" | ||
c = "unicode.cpp" | ||
for i in [r, c]: | ||
if os.path.exists(i): | ||
os.remove(i); | ||
|
||
rf = open(r, "w") | ||
cf = open(c, "w") | ||
|
||
emit_module(rf, cf, "general_category", | ||
load_general_categories("UnicodeData.txt")) | ||
|
||
emit_module(rf, cf, "derived_property", | ||
load_derived_core_properties("DerivedCoreProperties.txt")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.